提交 caef102d authored 作者: Razvan Pascanu's avatar Razvan Pascanu

Adding the cython version of scan

While we agreed that there might be a more principial way of solving this, this solution was fast to add and it is pretty efficient for now.
上级 95e010f5
......@@ -357,6 +357,10 @@ class Scan(Op):
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
# Setting up all my variables in what I believe is a more Cython
# friendly form
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs]
......@@ -384,7 +388,73 @@ class Scan(Op):
name = self.name,
profile = profile)
p = self.execute
try:
cython_mintaps = numpy.asarray(self.mintaps, dtype = 'int32')
cython_tap_array_len = \
numpy.asarray([ len(x) for x in self.tap_array],
dtype='int32')
if len(self.tap_array) == 0:
d1 = 0
else:
d1 = numpy.max(cython_tap_array_len)
d0 = len(self.tap_array)
cython_tap_array = numpy.zeros((d0,d1), dtype='int32')
for _d0 in range(d0):
for _d1 in range(cython_tap_array_len[_d0]):
cython_tap_array[_d0,_d1] = self.tap_array[_d0][_d1]
cython_mit_mot_out_nslices = \
numpy.asarray([ len(x) for x in self.mit_mot_out_slices],
dtype='int32')
if len(self.mit_mot_out_slices) == 0:
d1 = 0
else:
d1 = numpy.max(cython_mit_mot_out_nslices)
d0 = len(self.mit_mot_out_slices)
cython_mit_mot_out_slices = numpy.zeros((d0,d1),
dtype='int32')
for _d0 in range(d0):
for _d1 in range(cython_mit_mot_out_nslices[_d0]):
cython_mit_mot_out_slices[_d0,_d1] = \
self.mit_mot_out_slices[_d0][_d1]
vector_seqs = [ seq.ndim == 1 for seq in
self.inputs[1:1+self.n_seqs ] ]
vector_outs = [ arg.ndim ==1 for arg in
self.inputs[1+self.n_seqs: (1+self.n_seqs +
self.n_outs)] ]
vector_outs += [ False]*self.n_nit_sot
cython_vector_seqs = numpy.asarray(self.vector_seqs,
dtype='int32')
cython_vector_outs = numpy.asarray(self.vector_outs,
dtype='int32')
import scan_perform_ext
p = lambda node, args, outs:\
scan_perform_ext.perform(
self.n_shared_outs,
self.n_mit_mot_outs,
self.n_seqs,
self.n_mit_mot,
self.n_mit_sot,
self.n_sit_sot,
self.n_nit_sot,
args[0],
self.as_while,
cython_mintaps,
cython_tap_array,
cython_tap_array_len,
cython_vector_seqs,
cython_vector_outs,
cython_mit_mot_out_slices,
cython_mit_mot_out_nslices,
self.fn.fn,
self.fn,
self.inplace,
args,
outs,
self)
except ImportError:
p = self.execute
# default arguments are stored in the closure of `rval`
def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
r = p(n, [x[0] for x in i], o)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
"""
This code implements the operations that scan has to carry on when called
as a stand alone function.
IF anything this is the entire code that needs to be transported to C.
Short description of how this code works:
Scan divides its inputs ( Op's inputs) into different classes of inputs
as follows:
i) sequences : inputs over which scan loops to get data. Nothing is
written into them ( they are readonly, loop over)
ii) mit_mot : multiple input taps multiple output taps arguments.
These are inputs over which scan loops and gets data but into which
scan also writes data. The shorthand mit_mot describes how scan
deal with them at each step : at each step take several slices as
input and produce sevaral slices as outputs
iii) mit_sot : multiple input taps single output tap arguments.
As before scan reads from these but also writes. At each step scan
uses several slices as input but produces only one as output
iv) sit_sot : single input tap single output tap arguments.
At each step use only the previous slice as input, produce only one
slice as output
v) nit_sot: no input tap single output tap arguments.
At each step don't use any previous values, only produce new onese
vi) shared_outs: arguments corresponding to shared variables with
updates.
At each step use its value as input, and afterwards replace it with
a new value.
vii) other_args: arguments that are passed to every call of the
inner function as they are ( no slicing is perfomed)
All these outputs are one after the other in the inputs list (named in
this code as args) in a given order ( namely the one described above
with little discrepencies depending if we are talking about the outputs
of the Scan op or the inputs of the Scan op Node, and if we are talking
about the inputs of the inner function of scan or of the scan op).
Because of this, all we need to be able to separate and tell arguments
apart is how many of which we have as well as how many taps and which
ones (where applicable). All this information is desribed (more or less)
by describing the arguments of this function)
"""
__authors__ = "Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal"
__contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import cython
import numpy
cimport numpy
import time
import copy
from theano.sandbox import cuda
def get_version():
return 0.1
@cython.boundscheck(False)
def perform(
unsigned int n_shared_outs,
unsigned int n_mit_mot_outs,
unsigned int n_seqs,
unsigned int n_mit_mot,
unsigned int n_mit_sot,
unsigned int n_sit_sot,
unsigned int n_nit_sot,
int n_steps,
bint as_while,
numpy.ndarray[numpy.int32_t,ndim=1] mintaps,
numpy.ndarray[numpy.int32_t,ndim=2] tap_array,
numpy.ndarray[numpy.int32_t,ndim=1] tap_array_len,
numpy.ndarray[numpy.int32_t,ndim=1] vector_seqs,
numpy.ndarray[numpy.int32_t,ndim=1] vector_outs,
numpy.ndarray[numpy.int32_t,ndim=2] mit_mot_out_slices,
numpy.ndarray[numpy.int32_t,ndim=1] mit_mot_out_nslices,
fn,
fnct,
bint inplace,
args,
outs,
self):
"""
Parameters
----------
n_shared_outs: unsigned int
Number of arugments that correspond to shared variables with
updates
n_mit_mot_outs: unsigned int
Sum over the number of output taps for each mit_mot sequence
n_seqs: unsigned int
Number of sequences provided as input
n_mit_mot : unsigned int
Number of mit_mot arguemnts
n_mit_sot: unsigned int
Number of mit_sot arguments
n_sit_sot: unsigned int
Number of sit sot arguemnts
n_nit_sot: unsigned int
Number of nit_sot arguments
n_steps: unsigned int
Number of steps to loop over
mintaps: int32 ndarray (can also be a simple python list if that is better !)
For any of the mit_mot, mit_sot, sit_sot says which is the furtherst
away input tap from current position. For example, if the taps where [-2,
-5, -9], the mintap would be -9. For sit_sot this is always -1 since
is the only allowed tap.
tap_array: int32 ndarray( can be replaced by a list of list in python if better)
For each of the mit_mot, mit_sot, sit_sot (the first dimension) says
which are the corresponding input taps. While this is a matrix, not all
values in a row are needed and tap_array_len is there to say up to
which entry we are dealing with valid taps ( afterwards there are
just 0s to ensure the fix format)
tap_array_len: int32 ndarray( can be replaced by a list if better)
For each of the mit_mot, mit_sot, sit_sot says how many input taps
each has. For sit_sot this will always be 1.
vector_seqs: int32 ndarray (can be replaced by a list of bools if better)
For each sequence the corresponding entry is either a 1, is the
sequence is a vector or 0 if it has more then 1 dimension
vector_outs: int32 ndarray( can be replaced by list of bools if better)
For each output ( mit_mot, mit_sot, sit_sot, nit_sot in this order)
the entry is 1 if the corresponding argument is a 1 dimensional
tensor, 0 otherwise.
mit_mot_out_slices : int32 ndarray( can be replaced by list of lists)
Same as tap_array, but for the output taps of mit_mot sequences
mit_mot_out_nslices: int32 ndarray (Can be replaced by a list)
Same as tap_array_len, but is the number of output taps of the
mit_mot sequences (i.e. it corresponds to mit_mot_out_slices)
fn: callable
This is the linker, i.e. the function that will loop over the
computational graph and call the perform of each operation. For this
linker there is a c version in gof/lazy_linker.c that will be the
starting point of implementing this funciton in C ( we need to take
all the code around the call of this function and put in C inside
that code)
fnct: python object
Only used to attach some timings for the profile mode ( can be
skiped if we don't care about Theano's profile mode)
inplace
Boolean that says if things should be computed inplace or if they
should not.
args: list of ndarrays (and random states)
The inputs of scan in a given order ( n_steps, sequences, mit_mot,
mit_sot, sit_sot, nit_sot, shared_outs, other_args)
outs: list of 1 element list ( or storage objects?)
This is where we need to copy our outputs ( we don't return the
results, though we can change the code such that we return, and
figure things out on the outside - python)
self: python object
The scan op itself. I only use it to attach to it some timing
informations .. but I don;t need to.
"""
# 1. Unzip the number of steps and sequences. If number of steps is
# negative flip sequences around, and make n_steps positive
t0_call = time.time()
t_fn = 0
cdef unsigned int n_outs = n_mit_mot + n_mit_sot + n_sit_sot
cdef unsigned int seqs_arg_offset = n_seqs + 1
cdef unsigned int shared_arg_offset = ( 1 + n_seqs + n_mit_mot +
n_mit_sot + n_sit_sot)
cdef unsigned int nit_sot_arg_offset = ( shared_arg_offset +
n_shared_outs)
cdef unsigned int offset_out
cdef unsigned int lenpos = n_outs + n_nit_sot
cdef int pos[500] # put a maximum of 500 outputs
cdef unsigned int len_store_steps = n_mit_mot + n_mit_sot + n_sit_sot + n_nit_sot
cdef int store_steps[500]
cdef unsigned int l
cdef unsigned int offset
cdef int tap
cdef int _idx
cdef unsigned int a_offset
cdef unsigned int o_offset
cdef unsigned int idx
cdef unsigned int i
cdef unsigned int j
cdef unsigned int k
cdef unsigned int kdx
cdef unsigned int tdx
cdef unsigned int pdx
cdef unsigned int jout
cdef unsigned int begin
cdef unsigned int end
cdef int cond
if n_steps < 0:
n_steps = -n_steps
for idx in range(n_seqs):
if args[<unsigned int>(1+idx)].shape[0] < n_steps:
raise ValueError(('Sequence is shorter then the required '
'number of steps : (n_steps, seq, '
'seq.shape):'), n_steps,
args[1+idx],
args[1+idx].shape)
args[<unsigned int>(1+idx)] = args[<unsigned int>(1+idx)][::-1]
else:
for idx in range(n_seqs):
if args[<unsigned int>(1+idx)].shape[0] < n_steps:
raise ValueError(('Sequence is shorter then the required '
'number of steps : (n_steps, seq, '
'seq.shape):'), n_steps,
args[1+idx],
args[1+idx].shape)
# 2. Allocate memory for the outputs. Construct the list:
# store_steps -- map containting the length of each output
# pos -- map containing the current position of each output
for idx in range(n_mit_mot + n_mit_sot + n_sit_sot):
store_steps[<unsigned int>idx] = args[<unsigned int>(idx+n_seqs+1)].shape[0]
for idx in range(n_nit_sot):
store_steps[<unsigned int>(idx + n_mit_mot + n_mit_sot + n_sit_sot)]=\
args[<unsigned int>(idx + n_mit_mot + n_mit_sot + n_sit_sot
+ n_shared_outs + n_seqs+1)]
for idx in range(n_outs + n_nit_sot):
pos[idx] = (-mintaps[idx])%store_steps[idx]
# 2.1 Create storage space for outputs
for idx in range(n_outs):
if inplace:
# ^ Case 1. Outputs should be computed inplace of their
# initial state
outs[idx][0] = args[ <unsigned int>(1+ n_seqs + idx)]
elif ( outs[idx][0] is not None and
outs[idx][0].shape[1:] == args[<unsigned int>(1+ n_seqs + idx)].shape[1:]
and outs[idx][0].shape[0] >= store_steps[idx] ):
# Put in the values of the initial state
outs[idx][0] = outs[idx][0][:store_steps[idx]]
if idx > n_mit_mot:
l = - mintaps[idx]
outs[idx][0][:l] = args[<unsigned int>(seqs_arg_offset +
idx)][:l]
else:
outs[idx][0][:] = args[<unsigned int>(seqs_arg_offset + idx)]
else:
outs[idx][0] = args[<unsigned int>(seqs_arg_offset + idx)].copy()
offset = nit_sot_arg_offset + n_nit_sot
other_args = args[offset:]
input_storage = fnct.input_storage
output_storage = fnct.output_storage
offset = n_seqs
for idx in range(n_outs):
offset += tap_array_len[idx]
offset += n_shared_outs
for idx in range(len(other_args)):
input_storage[<unsigned int>(idx+offset)].storage[0] = other_args[idx]
i = 0
cond = 1
############## THE MAIN LOOP #########################
#for i in range(n_steps):
while (i < n_steps) and cond == 1:
# sequences over which scan iterates
# 3. collect input slices
for idx in range(n_seqs):
if vector_seqs[idx] == 1:
input_storage[idx].storage[0] = args[\
<unsigned int>(1+idx)][i:<unsigned int>(i+1)].reshape(())
else:
input_storage[idx].storage[0] = \
args[<unsigned int>(idx+1)][i]
offset = n_seqs
for idx in range(n_outs):
if vector_outs[idx] == 1:
for tdx in range(tap_array_len[idx]):
tap = tap_array[idx,tdx]
_idx = (pos[idx]+tap)%store_steps[idx]
input_storage[offset].storage[0] =\
outs[idx][0][_idx:<unsigned int>(_idx+1)].reshape(())
offset += 1
else:
for tdx in range(tap_array_len[idx]):
tap = tap_array[idx,tdx]
_idx = (pos[idx]+tap)%store_steps[idx]
input_storage[offset].storage[0] = outs[idx][0][_idx]
offset += 1
a_offset = shared_arg_offset
o_offset = n_outs + n_nit_sot
if i == 0:
for j in range(n_shared_outs):
input_storage[offset].storage[0] = args[<unsigned int>(a_offset+j)]
offset += 1
else:
for j in range(n_shared_outs):
input_storage[offset].storage[0] = outs[<unsigned int>(o_offset+j)][0]
offset += 1
# 4. collecting slices where the output should be stored
for idx in range(n_mit_mot_outs):
output_storage[idx].storage[0] = None
offset = n_mit_mot_outs
if i !=0 and n_nit_sot >0:
for idx in range(n_outs + n_nit_sot - n_mit_mot):
if ( store_steps[<unsigned int>(idx+n_mit_mot)] == 1 or
vector_outs[<unsigned int>(idx+n_mit_mot)] == 1):
output_storage[<unsigned int>(idx+offset)].storage[0] = None
else:
output_storage[<unsigned int>(idx+offset)].storage[0] =\
outs[<unsigned int>(idx+n_mit_mot)][0][pos[\
<unsigned int>(idx+n_mit_mot)]]
else:
for idx in range(n_outs + n_nit_sot - n_mit_mot):
output_storage[<unsigned int>(idx+offset)].storage[0] = None
offset += n_outs+n_nit_sot - n_mit_mot
for idx in range(n_shared_outs):
output_storage[<unsigned int>(idx+offset)].storage[0] = None
if as_while:
pdx = offset + n_shared_outs
output_storage[<unsigned int>pdx].storage[0] = None
# 5. compute outputs
t0_fn = time.time()
fn()
dt_fn = time.time() - t0_fn
t_fn += dt_fn
if self.as_while:
pdx = offset + n_shared_outs
cond = output_storage[pdx].storage[0] == 0
offset_out = 0
# 5.1 Copy over the values for mit_mot outputs
for j in range(n_mit_mot):
for kdx in range(mit_mot_out_nslices[j]):
k = mit_mot_out_slices[j,kdx]
outs[j][0][<unsigned int>(k+pos[j])] = output_storage[offset_out].storage[0]
offset_out += 1
# 5.2 Copy over the values for mit_sot/sit_sot outputs
begin = n_mit_mot
end = n_outs
offset_out -= n_mit_mot
for j in range(begin, end):
if ( store_steps[j] == 1 or vector_outs[j] ==1 or
outs[j][0][pos[j]] is not output_storage[<unsigned int>(offset_out+j)].storage[0]):
outs[j][0][pos[j]] = output_storage[<unsigned int>(offset_out+j)].storage[0]
# 5.3 Copy over the values for nit_sot outputs
begin = end
end += n_nit_sot
for j in range(begin,end):
if i == 0:
jout = j+offset_out
shape = (store_steps[j],) + output_storage[jout].storage[0].shape
if len(output_storage[jout].storage[0].shape) == 0:
vector_outs[j] = 1
dtype = output_storage[jout].storage[0].dtype
if (outs[j][0] is None or
outs[j][0].shape[0] < store_steps[j] or
outs[j][0].shape[1:] != shape[1:] or
outs[j][0].dtype != dtype ):
if self.gpu:
outs[j][0] = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
else:
outs[j][0] = numpy.zeros(shape, dtype)
elif outs[j][0].shape[0] != store_steps[j]:
outs[j][0] = outs[j][0][:store_steps[j]]
outs[j][0][pos[j]] = output_storage[jout].storage[0]
elif (store_steps[j] == 1 or vector_outs[j] == 1 or
outs[j][0][pos[j]] is not output_storage[j+offset_out].storage[0]):
outs[j][0][pos[j]] = output_storage[j+offset_out].storage[0]
# 5.4 Copy over the values for outputs corresponding to shared
# variables
begin = end
end += n_shared_outs
for j in range(begin,end):
jout = j +offset_out
outs[j][0] = output_storage[jout].storage[0]
for idx in range(lenpos):
pos[idx] = (pos[idx]+1)%store_steps[idx]
i = i + 1
# 6. Check if you need to re-order output buffers
begin = n_mit_mot
end = n_outs + n_nit_sot
for idx in range(begin, end):
if ( store_steps[idx] < i-mintaps[idx] and
pos[idx] < store_steps[idx] ):
pdx = pos[idx]
if pdx < store_steps[idx]//2 :
shape = (pdx,)+ outs[idx][0].shape[1:]
if cuda.cuda_available and isinstance( outs[idx][0],
cuda.CudaNdarray):
tmp = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
else:
tmp = numpy.empty(shape, outs[idx][0].dtype)
tmp[:] = outs[idx][0][:pdx]
outs[idx][0][:store_steps[idx]-pdx] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx]-pdx:] = tmp
else:
shape = (store_steps[idx]-pdx,) + outs[idx][0].shape[1:]
if cuda.cuda_available and isinstance( outs[idx][0],
cuda.CudaNdarray):
tmp = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
else:
tmp = numpy.empty(shape, outs[idx][0].dtype)
tmp[:] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx]-pdx:] = outs[idx][0][:pdx]
outs[idx][0][:store_steps[idx]-pdx] = tmp
# This would normally happen only when doing truncated
# backpropagation through time. In such a scenarion Scan is
# expected to return 0 for all entries for which the gradient is
# not actually computed
elif store_steps[idx] > i - self.mintaps[idx]:
outs[idx][0][i+1-self.mintaps[idx]:] = 0
t_call = time.time() - t0_call
if hasattr(fnct.maker, 'profile'):
profile = fnct.maker.profile
if type(profile) is not bool and profile:
profile.vm_call_time += t_fn
profile.callcount += 1
profile.nbsteps += n_steps
profile.call_time += t_call
if hasattr(fn, 'update_profile'):
fn.update_profile(profile)
### Old Profile Mode
#if hasattr(fnct.maker.mode,'fct_call_time'):
# fnct.maker.mode.fct_call_time[fnct] += t_fn
# fnct.maker.mode.fct_call[fnct] += n_steps
#fnct.maker.mode.call_time += t_fn
#fnct.maker.mode.fn_time += t_fn
# DEBUG PRINT :
self.t_call = t_call
self.t_fn = t_fn
# print 'Cython > timing', t_call, t_fn, 'in percentage', 100.*t_fn/t_call
import os, logging, sys
import theano
from theano import config
from theano.gof.compilelock import get_lock, release_lock
from theano.gof import cmodule
_logger = logging.getLogger('theano.scan_module.scan_perform')
logging.basicConfig(level=logging.DEBUG)
# Ensure the compiledir is in `sys.path` to be able to reload an existing
# precompiled library.
if config.compiledir not in sys.path:
sys.path.append(config.compiledir)
version = 0.1 # must match constant returned in function get_version()
need_reload = False
try:
import scan_perform
need_reload = True
if version != getattr(scan_perform, '_version', None):
raise ImportError()
except ImportError:
get_lock()
try:
# Maybe someone else already finished compiling it while we were
# waiting for the lock?
try:
if need_reload:
# The module was successfully imported earlier: we need to
# reload it to check if the version was updated.
reload(scan_perform)
else:
import scan_perform
need_reload = True
if version != getattr(scan_perform, '_version', None):
raise ImportError()
except ImportError:
_logger.info("Compiling C code for scan")
dirname = 'scan_perform'
cfile = os.path.join(theano.__path__[0], 'scan_module',
'scan_perform.c')
code = open(cfile).read()
loc = os.path.join(config.compiledir, dirname)
if not os.path.exists(loc):
os.mkdir(loc)
cmodule.gcc_module_compile_str(dirname, code, location=loc,
preargs = ['-pthread','-fwrapv',
'-O2',
'-fno-strict-aliasing'])
# Save version into the __init__.py file.
init_py = os.path.join(loc, '__init__.py')
open(init_py, 'w').write('_version = %s\n' % version)
# If we just compiled the module for the first time, then it was
# imported at the same time: we need to make sure we do not
# reload the now outdated __init__.pyc below.
init_pyc = os.path.join(loc, '__init__.pyc')
if os.path.isfile(init_pyc):
os.remove(init_pyc)
import scan_perform
reload(scan_perform)
from scan_perform import scan_perform as scan_c
assert (scan_perform._version ==
scan_c.get_version())
_logger.info("New version %s", scan_perform._version)
finally:
# Release lock on compilation directory.
release_lock()
from scan_perform.scan_perform import *
assert version == get_version()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论