提交 65af9781 authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #5073 from nouiz/Faruk-Ahmed-use_cxx_flag

Removing _op_use_c_code attribute
...@@ -99,7 +99,7 @@ possibilities you may encounter or need. For that refer to ...@@ -99,7 +99,7 @@ possibilities you may encounter or need. For that refer to
pass pass
# Other implementations (pycuda, ...): # Other implementations (pycuda, ...):
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
pass pass
# optional: # optional:
...@@ -190,11 +190,12 @@ or :func:`make_thunk`. ...@@ -190,11 +190,12 @@ or :func:`make_thunk`.
valid, but shouldn't be required anymore for this call. valid, but shouldn't be required anymore for this call.
The returned function must ensure that it sets the computed The returned function must ensure that it sets the computed
variables as computed in the `compute_map`. variables as computed in the `compute_map`.
- ``impl`` allows selecting between multiple implementations.
It should have a default value of None.
:func:`make_thunk` is useful if you want to generate code and compile :func:`make_thunk` is useful if you want to generate code and compile
it yourself. For example, this allows you to use PyCUDA to compile GPU it yourself. For example, this allows you to use PyCUDA to compile GPU
code. code and keep state in the thunk.
If :func:`make_thunk()` is defined by an op, it will be used by Theano If :func:`make_thunk()` is defined by an op, it will be used by Theano
to obtain the op's implementation. to obtain the op's implementation.
......
...@@ -171,7 +171,7 @@ Optional methods or attributes ...@@ -171,7 +171,7 @@ Optional methods or attributes
returned, unless it is of length 1, where the single element will be returned, unless it is of length 1, where the single element will be
returned by itself. returned by itself.
.. function:: make_thunk(node, storage_map, compute_map, no_recycling) .. function:: make_thunk(node, storage_map, compute_map, no_recycling, impl=None)
This function must return a thunk, that is a zero-arguments This function must return a thunk, that is a zero-arguments
function that encapsulates the computation to be performed by this function that encapsulates the computation to be performed by this
...@@ -192,6 +192,8 @@ Optional methods or attributes ...@@ -192,6 +192,8 @@ Optional methods or attributes
valid, but shouldn't be required anymore for this call. valid, but shouldn't be required anymore for this call.
:param no_recycling: WRITEME :param no_recycling: WRITEME
WRITEME WRITEME
:param impl: None, 'c' or 'py'
Which implementation to use.
The returned function must ensure that it sets the computed The returned function must ensure that it sets the computed
variables as computed in the `compute_map`. variables as computed in the `compute_map`.
......
...@@ -92,7 +92,7 @@ You can use a GPU function compiled with PyCUDA in a Theano op: ...@@ -92,7 +92,7 @@ You can use a GPU function compiled with PyCUDA in a Theano op:
cuda.basic_ops.as_cuda_ndarray_variable(inp)) cuda.basic_ops.as_cuda_ndarray_variable(inp))
assert inp.dtype == "float32" assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()]) return theano.Apply(self, [inp], [inp.type()])
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
mod = SourceModule(""" mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) { __global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
......
...@@ -586,7 +586,7 @@ Modify and execute to work for a matrix of shape (20, 10). ...@@ -586,7 +586,7 @@ Modify and execute to work for a matrix of shape (20, 10).
assert inp.dtype == "float32" assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()]) return theano.Apply(self, [inp], [inp.type()])
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl):
mod = SourceModule(""" mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) { __global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x*blockDim.x + threadIdx.x; int i = blockIdx.x*blockDim.x + threadIdx.x;
......
...@@ -124,14 +124,11 @@ class OpFromGraph(gof.Op): ...@@ -124,14 +124,11 @@ class OpFromGraph(gof.Op):
list(inputs) + self.shared_inputs, list(inputs) + self.shared_inputs,
[type() for type in self.output_types]) [type() for type in self.output_types])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def prepare_node(self, node, storage_map, compute_map, impl):
ret = super(OpFromGraph, self).make_thunk(node, storage_map, if not hasattr(self, "fn") and impl == 'py':
compute_map, no_recycling)
if not hasattr(self, "fn"):
self.fn = orig_function(self.new_inputs, self.fn = orig_function(self.new_inputs,
self.new_outputs, self.new_outputs,
**self.kwargs) **self.kwargs)
return ret
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
variables = self.fn(*inputs) variables = self.fn(*inputs)
......
...@@ -1837,8 +1837,6 @@ class _Linker(gof.link.LocalLinker): ...@@ -1837,8 +1837,6 @@ class _Linker(gof.link.LocalLinker):
thunk.inputs = [storage_map[v] for v in node.inputs] thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs] thunk.outputs = [storage_map[v] for v in node.outputs]
thunk_other = thunk thunk_other = thunk
else:
node.op.prepare_node(node, storage_map, compute_map)
debug = hasattr(node.op, 'debug_perform') debug = hasattr(node.op, 'debug_perform')
...@@ -1852,6 +1850,7 @@ class _Linker(gof.link.LocalLinker): ...@@ -1852,6 +1850,7 @@ class _Linker(gof.link.LocalLinker):
if not isinstance(node.op, gof.op.Op): if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined() raise utils.MethodNotDefined()
node.op.prepare_node(node, storage_map, compute_map, 'c')
thunk = node.op.make_c_thunk(node, storage_map, compute_map, thunk = node.op.make_c_thunk(node, storage_map, compute_map,
no_recycling) no_recycling)
thunks_c.append(thunk) thunks_c.append(thunk)
...@@ -1864,6 +1863,7 @@ class _Linker(gof.link.LocalLinker): ...@@ -1864,6 +1863,7 @@ class _Linker(gof.link.LocalLinker):
if (((self.maker.mode.check_py_code or thunks_c[-1] is None) and if (((self.maker.mode.check_py_code or thunks_c[-1] is None) and
node.op.perform.__code__ != gof.op.PureOp.perform.__code__) or node.op.perform.__code__ != gof.op.PureOp.perform.__code__) or
debug): debug):
node.op.prepare_node(node, storage_map, compute_map, 'py')
thunk = node.op.make_py_thunk(node, storage_map, compute_map, thunk = node.op.make_py_thunk(node, storage_map, compute_map,
no_recycling, debug=debug) no_recycling, debug=debug)
thunks_py.append(thunk) thunks_py.append(thunk)
...@@ -1873,6 +1873,7 @@ class _Linker(gof.link.LocalLinker): ...@@ -1873,6 +1873,7 @@ class _Linker(gof.link.LocalLinker):
if not self.maker.mode.check_c_code and thunks_py[-1] is None: if not self.maker.mode.check_c_code and thunks_py[-1] is None:
_logger.warn("Op %s doesn't have a perform, " _logger.warn("Op %s doesn't have a perform, "
"forcing check of the C code" % node.op) "forcing check of the C code" % node.op)
node.op.prepare_node(node, storage_map, compute_map, 'c')
thunk = node.op.make_c_thunk(node, storage_map, compute_map, thunk = node.op.make_c_thunk(node, storage_map, compute_map,
no_recycling) no_recycling)
thunks_c[-1] = thunk thunks_c[-1] = thunk
......
...@@ -233,6 +233,7 @@ class PyDotFormatter(object): ...@@ -233,6 +233,7 @@ class PyDotFormatter(object):
gf = PyDotFormatter() gf = PyDotFormatter()
# Use different node prefix for sub-graphs # Use different node prefix for sub-graphs
gf.__node_prefix = __node_id gf.__node_prefix = __node_id
node.op.prepare_node(node, None, None, 'py')
gf(node.op.fn, subgraph) gf(node.op.fn, subgraph)
graph.add_subgraph(subgraph) graph.add_subgraph(subgraph)
pd_node.get_attributes()['subg'] = subgraph.get_name() pd_node.get_attributes()['subg'] = subgraph.get_name()
......
...@@ -1584,7 +1584,7 @@ class CLinker(link.Linker): ...@@ -1584,7 +1584,7 @@ class CLinker(link.Linker):
else: else:
# Set compute_map as None as clinker do not support lazy evaluation # Set compute_map as None as clinker do not support lazy evaluation
for node in self.node_order: for node in self.node_order:
node.op.prepare_node(node, storage_map, None) node.op.prepare_node(node, storage_map, None, 'c')
module = get_module_cache().module_from_key( module = get_module_cache().module_from_key(
key=key, lnk=self, keep_lock=keep_lock) key=key, lnk=self, keep_lock=keep_lock)
...@@ -1787,24 +1787,14 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -1787,24 +1787,14 @@ class OpWiseCLinker(link.LocalLinker):
thunks = [] thunks = []
for node in order: for node in order:
# Maker sure we use the C version of the code whenever # make_thunk will try by default C code, otherwise
# possible # it falls back to python.
# There are ops that don't have _op_use_c_code property thunks += [node.op.make_thunk(node,
# for example ifelse (or any ops that come with their own storage_map,
# make_thunk compute_map,
old_value = getattr(node.op, '_op_use_c_code', False) no_recycling)]
try: thunks[-1].inputs = [storage_map[v] for v in node.inputs]
if theano.config.cxx: thunks[-1].outputs = [storage_map[v] for v in node.outputs]
node.op._op_use_c_code = True
thunks += [node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally:
node.op._op_use_c_code = old_value
for node in order: for node in order:
if self.allow_gc: if self.allow_gc:
......
...@@ -823,17 +823,13 @@ class PerformLinker(LocalLinker): ...@@ -823,17 +823,13 @@ class PerformLinker(LocalLinker):
# the python version # the python version
# Note : ops that implement their own make thunk don't usually # Note : ops that implement their own make thunk don't usually
# have this attribute defined !! # have this attribute defined !!
old_value = getattr(node.op, '_op_use_c_code', False) thunks += [node.op.make_thunk(node,
try: storage_map,
node.op._op_use_c_code = False compute_map,
thunks += [node.op.make_thunk(node, no_recycling,
storage_map, 'py')]
compute_map, thunks[-1].inputs = [storage_map[v] for v in node.inputs]
no_recycling)] thunks[-1].outputs = [storage_map[v] for v in node.outputs]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally:
node.op._op_use_c_code = old_value
computed, last_user = gc_helper(order) computed, last_user = gc_helper(order)
if self.allow_gc: if self.allow_gc:
......
...@@ -32,6 +32,8 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>" ...@@ -32,6 +32,8 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
_logger = logging.getLogger('theano.gof.op.Op')
class CLinkerObject(object): class CLinkerObject(object):
""" """
...@@ -779,34 +781,24 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -779,34 +781,24 @@ class Op(utils.object2, PureOp, CLinkerOp):
Convenience class to bundle `PureOp` and `CLinkerOp`. Convenience class to bundle `PureOp` and `CLinkerOp`.
""" """
def __new__(cls, *args, **kwargs): def prepare_node(self, node, storage_map, compute_map, impl):
# this function exists to silently and transparently ensure that all
# existing Ops get a _op_use_c_code attribute
obj = object.__new__(cls)
if not hasattr(obj, '_op_use_c_code'):
obj._op_use_c_code = theano.config.cxx
return obj
def __init__(self, use_c_code=theano.config.cxx):
self._op_use_c_code = use_c_code
def prepare_node(self, node, storage_map, compute_map):
""" """
Make any special modifications that the Op needs before doing Make any special modifications that the Op needs before doing
make_thunk(). make_thunk().
This can modify the node inplace and should return nothing. This can modify the node inplace and should return nothing.
It can be called multiple times with different impl. It is the
op's responsibility not to re-prepare the node when it isn't
good to do so.
""" """
pass pass
def make_c_thunk(self, node, storage_map, compute_map, no_recycling): def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
""" """Like make_thunk, but will only try to make a C thunk.
Like make_thunk, but will only try to make a C thunk.
""" """
logger = logging.getLogger('theano.gof.op.Op')
node_input_storage = [storage_map[r] for r in node.inputs] node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs] node_output_storage = [storage_map[r] for r in node.outputs]
...@@ -828,7 +820,7 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -828,7 +820,7 @@ class Op(utils.object2, PureOp, CLinkerOp):
cl = theano.gof.cc.CLinker().accept(e, cl = theano.gof.cc.CLinker().accept(e,
no_recycling=e_no_recycling) no_recycling=e_no_recycling)
logger.debug('Trying CLinker.make_thunk') _logger.debug('Trying CLinker.make_thunk')
outputs = cl.make_thunk(input_storage=node_input_storage, outputs = cl.make_thunk(input_storage=node_input_storage,
output_storage=node_output_storage) output_storage=node_output_storage)
fill_storage, node_input_filters, node_output_filters = outputs fill_storage, node_input_filters, node_output_filters = outputs
...@@ -883,7 +875,8 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -883,7 +875,8 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.lazy = False rval.lazy = False
return rval return rval
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling,
impl=None):
""" """
This function must return a thunk, that is a zero-arguments This function must return a thunk, that is a zero-arguments
function that encapsulates the computation to be performed function that encapsulates the computation to be performed
...@@ -904,6 +897,9 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -904,6 +897,9 @@ class Op(utils.object2, PureOp, CLinkerOp):
no_recycling no_recycling
List of variables for which it is forbidden to reuse memory List of variables for which it is forbidden to reuse memory
allocated by a previous call. allocated by a previous call.
impl
Currently, None, 'c' or 'py'. If 'c' or 'py' we will only try
that version of the code.
Notes Notes
----- -----
...@@ -913,27 +909,26 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -913,27 +909,26 @@ class Op(utils.object2, PureOp, CLinkerOp):
the thunk can potentially cache return values (like CLinker does), the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list. then it must not do so for variables in the no_recycling list.
self.prepare_node(node, ...) is always called. If we try 'c' and it
fails and we try again with 'py', prepare_node will be called twice.
""" """
logger = logging.getLogger('theano.gof.op.Op')
self.prepare_node(node, storage_map=storage_map, if impl is None or impl == 'c':
compute_map=compute_map) self.prepare_node(node, storage_map=storage_map,
compute_map=compute_map, impl='c')
if not hasattr(self, '_op_use_c_code'):
warnings.warn(
"The __getstate__ method of '%s' is not implemented correctly."
" It should keep the attributes added by the base class."
" To implement it correctly, it should keep all attributes"
" and only remove those it does not want." % (self),
stacklevel=2)
if getattr(self, '_op_use_c_code', theano.config.cxx):
try: try:
return self.make_c_thunk(node, storage_map, compute_map, return self.make_c_thunk(node, storage_map, compute_map,
no_recycling) no_recycling)
except (NotImplementedError, utils.MethodNotDefined): except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform') # We requested the c code, so don't catch the error.
if impl == 'c':
raise
_logger.debug('Falling back on perform')
# condition: either there was no c_code, or it failed # condition: either there was no c_code, or it failed or
# python code was requested.
self.prepare_node(node, storage_map=storage_map,
compute_map=compute_map, impl='py')
return self.make_py_thunk(node, storage_map, compute_map, no_recycling) return self.make_py_thunk(node, storage_map, compute_map, no_recycling)
def make_node(self, *inputs): def make_node(self, *inputs):
...@@ -1196,9 +1191,9 @@ int main( int argc, const char* argv[] ) ...@@ -1196,9 +1191,9 @@ int main( int argc, const char* argv[] )
self.openmp = False self.openmp = False
theano.config.openmp = False theano.config.openmp = False
def prepare_node(self, node, storage_map, def prepare_node(self, node, storage_map, compute_map, impl):
compute_map): if impl == 'c':
self.update_self_openmp() self.update_self_openmp()
def simple_meth(tag): def simple_meth(tag):
......
...@@ -25,7 +25,7 @@ class IfElseIfElseIf(PureOp): ...@@ -25,7 +25,7 @@ class IfElseIfElseIf(PureOp):
assert t3.type == f3.type assert t3.type == f3.type
return Apply(self, [c1, t1, c2, t2, c3, t3, f3], [t1.type()]) return Apply(self, [c1, t1, c2, t2, c3, t3, f3], [t1.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
input_computed = [compute_map[v] for v in node.inputs] input_computed = [compute_map[v] for v in node.inputs]
output_computed = [compute_map[v] for v in node.outputs] output_computed = [compute_map[v] for v in node.outputs]
...@@ -93,7 +93,7 @@ class NotImplementedOp(PureOp): ...@@ -93,7 +93,7 @@ class NotImplementedOp(PureOp):
def make_node(self, x): def make_node(self, x):
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
def thunk(): def thunk():
raise self.E() raise self.E()
thunk.lazy = False thunk.lazy = False
......
...@@ -1043,12 +1043,14 @@ class VM_Linker(link.LocalLinker): ...@@ -1043,12 +1043,14 @@ class VM_Linker(link.LocalLinker):
t0 = time.time() t0 = time.time()
for node in order: for node in order:
try: try:
impl = None
if self.c_thunks is False: if self.c_thunks is False:
node.op._op_use_c_code = False impl = 'py'
thunks.append(node.op.make_thunk(node, thunks.append(node.op.make_thunk(node,
storage_map, storage_map,
compute_map, compute_map,
no_recycling)) no_recycling,
impl=impl))
if not hasattr(thunks[-1], 'lazy'): if not hasattr(thunks[-1], 'lazy'):
# We don't want all ops maker to think about lazy Ops. # We don't want all ops maker to think about lazy Ops.
# So if they didn't specify that its lazy or not, it isn't. # So if they didn't specify that its lazy or not, it isn't.
......
...@@ -2620,11 +2620,9 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2620,11 +2620,9 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def get_params(self, node): def get_params(self, node):
return node.outputs[0].type.context return node.outputs[0].type.context
def make_thunk(self, node, storage_map, compute_map, no_recycling): def prepare_node(self, node, storage_map, compute_map, impl):
# cache the kernel object # cache the kernel object
self.get_kernel_cache(node) self.get_kernel_cache(node)
return super(GpuCAReduceCPY, self).make_thunk(
node, storage_map, compute_map, no_recycling)
def get_kernel_cache(self, node): def get_kernel_cache(self, node):
attr = '@cache_reduction_k' attr = '@cache_reduction_k'
......
...@@ -73,7 +73,7 @@ class CuRFFTOp(Op): ...@@ -73,7 +73,7 @@ class CuRFFTOp(Op):
return theano.Apply(self, [inp, s], [self.output_type(inp)()]) return theano.Apply(self, [inp, s], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
inputs = [storage_map[v] for v in node.inputs] inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs] outputs = [storage_map[v] for v in node.outputs]
...@@ -198,7 +198,7 @@ class CuIRFFTOp(Op): ...@@ -198,7 +198,7 @@ class CuIRFFTOp(Op):
return theano.Apply(self, [inp, s], [self.output_type(inp)()]) return theano.Apply(self, [inp, s], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
inputs = [storage_map[v] for v in node.inputs] inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs] outputs = [storage_map[v] for v in node.outputs]
......
...@@ -20,7 +20,7 @@ import numpy ...@@ -20,7 +20,7 @@ import numpy
import theano.tensor import theano.tensor
from theano.tensor import TensorType from theano.tensor import TensorType
from theano import gof from theano import gof
from theano.gof import PureOp, Apply from theano.gof import Op, Apply
from six import iteritems from six import iteritems
from six.moves import xrange from six.moves import xrange
...@@ -41,7 +41,7 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>" ...@@ -41,7 +41,7 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
_logger = logging.getLogger('theano.ifelse') _logger = logging.getLogger('theano.ifelse')
class IfElse(PureOp): class IfElse(Op):
""" """
Op that provides conditional graph evaluation if used with the CVM/VM Op that provides conditional graph evaluation if used with the CVM/VM
linkers. Note that there exist a helpful function `ifelse` that should linkers. Note that there exist a helpful function `ifelse` that should
...@@ -235,7 +235,7 @@ class IfElse(PureOp): ...@@ -235,7 +235,7 @@ class IfElse(PureOp):
if_true_op(*if_true, **dict(return_list=True)) + if_true_op(*if_true, **dict(return_list=True)) +
if_false_op(*if_false, **dict(return_list=True))) if_false_op(*if_false, **dict(return_list=True)))
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
cond = node.inputs[0] cond = node.inputs[0]
ts = node.inputs[1:][:self.n_outs] ts = node.inputs[1:][:self.n_outs]
fs = node.inputs[1:][self.n_outs:] fs = node.inputs[1:][self.n_outs:]
......
...@@ -320,7 +320,7 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op): ...@@ -320,7 +320,7 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op):
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
return out_node return out_node
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
# TODO support broadcast! # TODO support broadcast!
# TODO assert all input have the same shape # TODO assert all input have the same shape
fct_name = "pycuda_elemwise_%s" % str(self.scalar_op) fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
......
...@@ -246,18 +246,14 @@ class GpuOp(theano.gof.Op): ...@@ -246,18 +246,14 @@ class GpuOp(theano.gof.Op):
""" """
def make_thunk(self, node, storage_map, compute_map, no_recycling): def prepare_node(self, node, storage_map, compute_map, impl):
if use.device_number is None: if use.device_number is None:
use("gpu", use("gpu",
force=True, force=True,
default_to_move_computation_to_gpu=False, default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False, move_shared_float32_to_gpu=False,
enable_cuda=False) enable_cuda=False)
return super(GpuOp, self).make_thunk(node, storage_map,
compute_map, no_recycling)
theano.compile.debugmode.default_make_thunk.append(
get_unbound_function(GpuOp.make_thunk))
# We must do those import to be able to create the full doc when # We must do those import to be able to create the full doc when
# nvcc is not available # nvcc is not available
......
...@@ -541,10 +541,8 @@ class GpuGemm(GpuOp): ...@@ -541,10 +541,8 @@ class GpuGemm(GpuOp):
def __setstate__(self, dct): def __setstate__(self, dct):
self.__dict__.update(dct) self.__dict__.update(dct)
# Correctly reload older pickles where _op_use_c_code and # Correctly reload older pickles where destroy_map were not
# destroy_map were not saved # saved
if '_op_use_c_code' not in self.__dict__:
self._op_use_c_code = theano.config.cxx
if 'destroy_map' not in self.__dict__ and self.inplace: if 'destroy_map' not in self.__dict__ and self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
...@@ -661,10 +659,8 @@ class GpuGemv(GpuOp): ...@@ -661,10 +659,8 @@ class GpuGemv(GpuOp):
def __setstate__(self, dct): def __setstate__(self, dct):
self.__dict__.update(dct) self.__dict__.update(dct)
# Correctly reload older pickles where _op_use_c_code and # Correctly reload older pickles where destroy_map were not
# destroy_map were not saved # saved
if '_op_use_c_code' not in self.__dict__:
self._op_use_c_code = theano.config.cxx
if 'destroy_map' not in self.__dict__ and self.inplace: if 'destroy_map' not in self.__dict__ and self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
...@@ -761,10 +757,8 @@ class GpuGer(GpuOp): ...@@ -761,10 +757,8 @@ class GpuGer(GpuOp):
def __setstate__(self, dct): def __setstate__(self, dct):
self.__dict__.update(dct) self.__dict__.update(dct)
# Correctly reload older pickles where _op_use_c_code and # Correctly reload older pickles where destroy_map were not
# destroy_map were not saved # saved
if '_op_use_c_code' not in self.__dict__:
self._op_use_c_code = theano.config.cxx
if 'destroy_map' not in self.__dict__ and self.inplace: if 'destroy_map' not in self.__dict__ and self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
...@@ -2187,7 +2181,9 @@ class GpuConv(GpuOp): ...@@ -2187,7 +2181,9 @@ class GpuConv(GpuOp):
images[2] * images[3] * 2) images[2] * images[3] * 2)
return flops return flops
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
super(GpuConv, self).prepare_node(node, storage_map, compute_map, impl)
if node.op.max_threads_dim0 is None: if node.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda cuda = theano.sandbox.cuda
device_id = cuda.use.device_number device_id = cuda.use.device_number
...@@ -2240,8 +2236,8 @@ class GpuConv(GpuOp): ...@@ -2240,8 +2236,8 @@ class GpuConv(GpuOp):
bmode = 0 bmode = 0
if max_threads_dim0 is None: if max_threads_dim0 is None:
raise NotImplementedError("GpuConv.c_code should not be called " raise NotImplementedError("GpuConv.c_code should not be called "
"directly. It should be called by " "directly. It should be called after "
"make_thunk() that add some information " "prepare_node() that add some information "
"related to the selected GPU.") "related to the selected GPU.")
sub.update(locals()) sub.update(locals())
return """ return """
......
...@@ -51,10 +51,7 @@ class GpuSolve(GpuOp): ...@@ -51,10 +51,7 @@ class GpuSolve(GpuOp):
assert inp2.ndim == 2 assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()]) return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_thunk(self, def make_thunk(self, node, storage_map, _, no_recycling, impl=None):
node,
storage_map, _,
no_recycling=[]):
# Initialize CULA the first time it is needed # Initialize CULA the first time it is needed
global cula_initialized global cula_initialized
......
...@@ -1567,7 +1567,10 @@ class GpuDnnPool(DnnBase): ...@@ -1567,7 +1567,10 @@ class GpuDnnPool(DnnBase):
assert mode in ('max', 'average_inc_pad', 'average_exc_pad') assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode self.mode = mode
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
super(GpuDnnPool, self).prepare_node(
node, storage_map, compute_map, impl)
if len(node.inputs) == 2: if len(node.inputs) == 2:
warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3) warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
# Old interface # Old interface
...@@ -1803,7 +1806,7 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1803,7 +1806,7 @@ class GpuDnnPoolGrad(DnnBase):
assert mode in ('max', 'average_inc_pad', 'average_exc_pad') assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode self.mode = mode
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
if len(node.inputs) == 4: if len(node.inputs) == 4:
warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3) warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
# Old interface # Old interface
......
...@@ -49,7 +49,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -49,7 +49,7 @@ class GpuCumsum(CumsumOp, GpuOp):
return theano.Apply(self, [x], [x.type()]) return theano.Apply(self, [x], [x.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
node_ = copy.copy(node) node_ = copy.copy(node)
assert node.op is node_.op assert node.op is node_.op
if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None: if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None:
...@@ -70,7 +70,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -70,7 +70,7 @@ class GpuCumsum(CumsumOp, GpuOp):
node_.op.max_grid_size2 = prop['maxGridSize2'] node_.op.max_grid_size2 = prop['maxGridSize2']
return super(GpuCumsum, node_.op).make_thunk(node_, storage_map, return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling) compute_map, no_recycling, impl)
def __str__(self): def __str__(self):
return "%s{%s}" % (self.__class__.__name__, self.axis) return "%s{%s}" % (self.__class__.__name__, self.axis)
......
...@@ -48,7 +48,7 @@ class ScikitsCudaOp(GpuOp): ...@@ -48,7 +48,7 @@ class ScikitsCudaOp(GpuOp):
return theano.Apply(self, [inp], [self.output_type(inp)()]) return theano.Apply(self, [inp], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
if not scikits_cuda_available: if not scikits_cuda_available:
raise RuntimeError( raise RuntimeError(
"scikits.cuda is needed for all GPU fft implementation," "scikits.cuda is needed for all GPU fft implementation,"
...@@ -61,7 +61,7 @@ class CuFFTOp(ScikitsCudaOp): ...@@ -61,7 +61,7 @@ class CuFFTOp(ScikitsCudaOp):
return CudaNdarrayType( return CudaNdarrayType(
broadcastable=[False] * (inp.type.ndim + 1)) broadcastable=[False] * (inp.type.ndim + 1))
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
super(CuFFTOp, self).make_thunk(node, storage_map, _, _2) super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)
from theano.misc.pycuda_utils import to_gpuarray from theano.misc.pycuda_utils import to_gpuarray
...@@ -118,7 +118,7 @@ class CuIFFTOp(ScikitsCudaOp): ...@@ -118,7 +118,7 @@ class CuIFFTOp(ScikitsCudaOp):
return CudaNdarrayType( return CudaNdarrayType(
broadcastable=[False] * (inp.type.ndim - 1)) broadcastable=[False] * (inp.type.ndim - 1))
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2) super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)
from theano.misc.pycuda_utils import to_gpuarray from theano.misc.pycuda_utils import to_gpuarray
...@@ -314,7 +314,7 @@ class BatchedComplexDotOp(ScikitsCudaOp): ...@@ -314,7 +314,7 @@ class BatchedComplexDotOp(ScikitsCudaOp):
def output_type(self, inp): def output_type(self, inp):
return CudaNdarrayType(broadcastable=[False] * inp.type.ndim) return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2, impl=None):
super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2) super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)
inputs = [storage_map[v] for v in node.inputs] inputs = [storage_map[v] for v in node.inputs]
......
...@@ -3064,7 +3064,7 @@ arctan = ArcTan(upgrade_to_float, name='arctan') ...@@ -3064,7 +3064,7 @@ arctan = ArcTan(upgrade_to_float, name='arctan')
class ArcTan2(BinaryScalarOp): class ArcTan2(BinaryScalarOp):
nfunc_spec = ('arctan2', 1, 1) nfunc_spec = ('arctan2', 2, 1)
def impl(self, y, x): def impl(self, y, x):
# If x and y are int8 or uint8, numpy.arctan2 will compute the result # If x and y are int8 or uint8, numpy.arctan2 will compute the result
...@@ -3663,11 +3663,15 @@ class Composite(ScalarOp): ...@@ -3663,11 +3663,15 @@ class Composite(ScalarOp):
# Postpone the creation in case it isn't needed. # Postpone the creation in case it isn't needed.
# self.init_name() # self.name # self.init_name() # self.name
self.name = None self.name = None
self.prepare_node_called = set()
def prepare_node(self, node, storage_map, compute_map):
self.init_py_impls() # self._impls def prepare_node(self, node, storage_map, compute_map, impl):
for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs): if impl == 'py':
n.op.prepare_node(n, None, None) self.init_py_impls() # self._impls
if impl not in self.prepare_node_called:
for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
n.op.prepare_node(n, None, None, impl)
self.prepare_node_called.add(impl)
def output_types(self, input_types): def output_types(self, input_types):
if tuple(input_types) != self.inputs_type: if tuple(input_types) != self.inputs_type:
......
...@@ -125,7 +125,7 @@ class Scan(PureOp): ...@@ -125,7 +125,7 @@ class Scan(PureOp):
outputs, outputs,
info, info,
typeConstructor=None, typeConstructor=None,
): ):
if 'gpua' not in info: if 'gpua' not in info:
info['gpua'] = False info['gpua'] = False
# adding properties into self # adding properties into self
...@@ -346,8 +346,8 @@ class Scan(PureOp): ...@@ -346,8 +346,8 @@ class Scan(PureOp):
len(self.inner_shared(self.inputs)) + len(self.inner_shared(self.inputs)) +
len(self.inner_non_seqs(self.inputs))) len(self.inner_non_seqs(self.inputs)))
assert n_outer_ins == n_inner_ins, \ assert n_outer_ins == n_inner_ins, \
("The number of inputs given to the inner function of scan" ("The number of inputs given to the inner function of scan"
" does not match the number of inputs given to scan.") " does not match the number of inputs given to scan.")
new_inputs = [inputs[0]] new_inputs = [inputs[0]]
# assert dtype is consistent # assert dtype is consistent
err_msg1 = ('When compiling the inner function of scan (the ' err_msg1 = ('When compiling the inner function of scan (the '
...@@ -372,7 +372,7 @@ class Scan(PureOp): ...@@ -372,7 +372,7 @@ class Scan(PureOp):
'have the same dimensionality, you can increase the ' 'have the same dimensionality, you can increase the '
'dimensionality of the varialbe in the initial state of scan ' 'dimensionality of the varialbe in the initial state of scan '
'by using dimshuffle or shape_padleft. ' 'by using dimshuffle or shape_padleft. '
) )
err_msg2 = ('When compiling the inner function of scan the ' err_msg2 = ('When compiling the inner function of scan the '
'following error has been encountered: The ' 'following error has been encountered: The '
'initial state (`outputs_info` in scan nomenclature) ' 'initial state (`outputs_info` in scan nomenclature) '
...@@ -399,7 +399,7 @@ class Scan(PureOp): ...@@ -399,7 +399,7 @@ class Scan(PureOp):
'have the same dimensionality, you can increase the ' 'have the same dimensionality, you can increase the '
'dimensionality of the variable in the initial state of scan ' 'dimensionality of the variable in the initial state of scan '
'by using dimshuffle or shape_padleft. ' 'by using dimshuffle or shape_padleft. '
) )
def format(var, as_var): def format(var, as_var):
""" """
...@@ -440,9 +440,9 @@ class Scan(PureOp): ...@@ -440,9 +440,9 @@ class Scan(PureOp):
inner_mitmot = self.inner_mitmot(self.inputs) inner_mitmot = self.inner_mitmot(self.inputs)
inner_mitmot_outs = self.inner_mitmot_outs(self.outputs) inner_mitmot_outs = self.inner_mitmot_outs(self.outputs)
for idx, (itaps, otaps, _outer_mitmot) in enumerate( for idx, (itaps, otaps, _outer_mitmot) in enumerate(
zip(self.mitmot_taps(), zip(self.mitmot_taps(),
self.mitmot_out_taps(), self.mitmot_out_taps(),
self.outer_mitmot(inputs))): self.outer_mitmot(inputs))):
outer_mitmot = format(_outer_mitmot, as_var=inner_mitmot[ipos]) outer_mitmot = format(_outer_mitmot, as_var=inner_mitmot[ipos])
new_inputs.append(outer_mitmot) new_inputs.append(outer_mitmot)
for k in xrange(len(itaps)): for k in xrange(len(itaps)):
...@@ -450,15 +450,15 @@ class Scan(PureOp): ...@@ -450,15 +450,15 @@ class Scan(PureOp):
outer_mitmot.type.dtype or outer_mitmot.type.dtype or
inner_mitmot[ipos + k].ndim != outer_mitmot.ndim - 1): inner_mitmot[ipos + k].ndim != outer_mitmot.ndim - 1):
raise ValueError(err_msg1 % ('initial state (outputs_info' raise ValueError(err_msg1 % ('initial state (outputs_info'
' in scan nomenclature) ', ' in scan nomenclature) ',
str(outer_mitmot), str(outer_mitmot),
argoffset + idx, argoffset + idx,
outer_mitmot.type.dtype, outer_mitmot.type.dtype,
outer_mitmot.type.ndim, outer_mitmot.type.ndim,
str(inner_mitmot[ipos + k]), str(inner_mitmot[ipos + k]),
inner_mitmot[ipos + inner_mitmot[ipos +
k].type.dtype, k].type.dtype,
inner_mitmot[ipos + k].type.ndim)) inner_mitmot[ipos + k].type.ndim))
ipos += len(itaps) ipos += len(itaps)
for k in xrange(len(otaps)): for k in xrange(len(otaps)):
if (inner_mitmot_outs[opos + k].type.dtype != if (inner_mitmot_outs[opos + k].type.dtype !=
...@@ -491,14 +491,14 @@ class Scan(PureOp): ...@@ -491,14 +491,14 @@ class Scan(PureOp):
outer_mitsot.type.dtype or outer_mitsot.type.dtype or
inner_mitsots[ipos + k].ndim != outer_mitsot.ndim - 1): inner_mitsots[ipos + k].ndim != outer_mitsot.ndim - 1):
raise ValueError(err_msg1 % ('initial state (outputs_info' raise ValueError(err_msg1 % ('initial state (outputs_info'
' in scan nomenclature) ', ' in scan nomenclature) ',
str(outer_mitsot), str(outer_mitsot),
argoffset + idx, argoffset + idx,
outer_mitsot.type.dtype, outer_mitsot.type.dtype,
outer_mitsot.type.ndim, outer_mitsot.type.ndim,
str(inner_mitsots[ipos + k]), str(inner_mitsots[ipos + k]),
inner_mitsots[ipos + k].type.dtype, inner_mitsots[ipos + k].type.dtype,
inner_mitsots[ipos + k].type.ndim)) inner_mitsots[ipos + k].type.ndim))
ipos += len(itaps) ipos += len(itaps)
if inner_mitsot_out.type.dtype != outer_mitsot.type.dtype: if inner_mitsot_out.type.dtype != outer_mitsot.type.dtype:
raise ValueError(err_msg2 % raise ValueError(err_msg2 %
...@@ -523,14 +523,14 @@ class Scan(PureOp): ...@@ -523,14 +523,14 @@ class Scan(PureOp):
new_inputs.append(outer_sitsot) new_inputs.append(outer_sitsot)
if (inner_sitsot.ndim != outer_sitsot.ndim - 1): if (inner_sitsot.ndim != outer_sitsot.ndim - 1):
raise ValueError(err_msg1 % ('initial state (outputs_info' raise ValueError(err_msg1 % ('initial state (outputs_info'
' in scan nomenclature) ', ' in scan nomenclature) ',
str(outer_sitsot), str(outer_sitsot),
argoffset + idx, argoffset + idx,
outer_sitsot.type.dtype, outer_sitsot.type.dtype,
outer_sitsot.type.ndim, outer_sitsot.type.ndim,
str(inner_sitsot), str(inner_sitsot),
inner_sitsot.type.dtype, inner_sitsot.type.dtype,
inner_sitsot.type.ndim)) inner_sitsot.type.ndim))
if inner_sitsot_out.type.dtype != outer_sitsot.type.dtype: if inner_sitsot_out.type.dtype != outer_sitsot.type.dtype:
raise ValueError(err_msg2 % raise ValueError(err_msg2 %
(str(outer_sitsot), (str(outer_sitsot),
...@@ -570,14 +570,14 @@ class Scan(PureOp): ...@@ -570,14 +570,14 @@ class Scan(PureOp):
(outer_shared.dtype != inner_shared.dtype or (outer_shared.dtype != inner_shared.dtype or
outer_shared.ndim != inner_shared.ndim)): outer_shared.ndim != inner_shared.ndim)):
raise ValueError(err_msg1 % ('initial state (outputs_info' raise ValueError(err_msg1 % ('initial state (outputs_info'
' in scan nomenclature) ', ' in scan nomenclature) ',
str(outer_shared), str(outer_shared),
argoffset + idx, argoffset + idx,
outer_shared.dtype, outer_shared.dtype,
outer_shared.ndim, outer_shared.ndim,
str(inner_shared), str(inner_shared),
inner_shared.dtype, inner_shared.dtype,
inner_shared.ndim)) inner_shared.ndim))
# We do not need to call `format` on outer_nisot arguments. # We do not need to call `format` on outer_nisot arguments.
# outer_nitsot stands for no input tap single output tap. This means # outer_nitsot stands for no input tap single output tap. This means
# these are states that do not feed anything back in the recurrent # these are states that do not feed anything back in the recurrent
...@@ -595,7 +595,7 @@ class Scan(PureOp): ...@@ -595,7 +595,7 @@ class Scan(PureOp):
if inner_nonseq.type != outer_nonseq.type: if inner_nonseq.type != outer_nonseq.type:
raise ValueError(('Argument %s given to scan node does not' raise ValueError(('Argument %s given to scan node does not'
' match its correspondance %s') % ' match its correspondance %s') %
(str(outer_nonseq), str(inner_nonseq))) (str(outer_nonseq), str(inner_nonseq)))
for outer_nitsot in self.outer_nitsot(inputs): for outer_nitsot in self.outer_nitsot(inputs):
# For every nit_sot input we get as input a int/uint that # For every nit_sot input we get as input a int/uint that
...@@ -697,7 +697,8 @@ class Scan(PureOp): ...@@ -697,7 +697,8 @@ class Scan(PureOp):
self._hash_inner_graph, self._hash_inner_graph,
scan_utils.hash_listsDictsTuples(self.info))) scan_utils.hash_listsDictsTuples(self.info)))
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling,
impl=None):
""" """
Parameters Parameters
...@@ -715,7 +716,8 @@ class Scan(PureOp): ...@@ -715,7 +716,8 @@ class Scan(PureOp):
no_recycling no_recycling
List of variables for which it is forbidden to reuse memory List of variables for which it is forbidden to reuse memory
allocated by a previous call. allocated by a previous call.
impl
Use 'py' if we want python execution.
Notes Notes
----- -----
If the thunk consults the storage_map on every call, it is safe If the thunk consults the storage_map on every call, it is safe
...@@ -786,7 +788,7 @@ class Scan(PureOp): ...@@ -786,7 +788,7 @@ class Scan(PureOp):
# Wrap the corresponding input as usual. Leave the # Wrap the corresponding input as usual. Leave the
# output as-is. # output as-is.
wrapped_inputs.append(In(self.inputs[input_idx], wrapped_inputs.append(In(self.inputs[input_idx],
borrow=False)) borrow=False))
input_idx += 1 input_idx += 1
# Wrap the inputs not associated to mitmots and wrap the remaining # Wrap the inputs not associated to mitmots and wrap the remaining
...@@ -839,7 +841,7 @@ class Scan(PureOp): ...@@ -839,7 +841,7 @@ class Scan(PureOp):
profile = None profile = None
if (theano.config.profile or if (theano.config.profile or
(isinstance(self.profile, (string_types, bool, integer_types)) (isinstance(self.profile, (string_types, bool, integer_types))
and self.profile)): and self.profile)):
if isinstance(self.profile, string_types): if isinstance(self.profile, string_types):
profile = ScanProfileStats(name=self.profile) profile = ScanProfileStats(name=self.profile)
else: else:
...@@ -864,6 +866,8 @@ class Scan(PureOp): ...@@ -864,6 +866,8 @@ class Scan(PureOp):
for out in self.fn.maker.fgraph.outputs] for out in self.fn.maker.fgraph.outputs]
try: try:
if impl == 'py':
raise theano.gof.cmodule.MissingGXX
cython_mintaps = numpy.asarray(self.mintaps, dtype='int32') cython_mintaps = numpy.asarray(self.mintaps, dtype='int32')
cython_tap_array_len = \ cython_tap_array_len = \
numpy.asarray([len(x) for x in self.tap_array], numpy.asarray([len(x) for x in self.tap_array],
...@@ -886,16 +890,16 @@ class Scan(PureOp): ...@@ -886,16 +890,16 @@ class Scan(PureOp):
d1 = numpy.max(cython_mit_mot_out_nslices) d1 = numpy.max(cython_mit_mot_out_nslices)
d0 = len(self.mit_mot_out_slices) d0 = len(self.mit_mot_out_slices)
cython_mit_mot_out_slices = numpy.zeros((d0, d1), cython_mit_mot_out_slices = numpy.zeros((d0, d1),
dtype='int32') dtype='int32')
for _d0 in xrange(d0): for _d0 in xrange(d0):
for _d1 in xrange(cython_mit_mot_out_nslices[_d0]): for _d1 in xrange(cython_mit_mot_out_nslices[_d0]):
cython_mit_mot_out_slices[_d0, _d1] = \ cython_mit_mot_out_slices[_d0, _d1] = \
self.mit_mot_out_slices[_d0][_d1] self.mit_mot_out_slices[_d0][_d1]
cython_vector_seqs = numpy.asarray(self.vector_seqs, cython_vector_seqs = numpy.asarray(self.vector_seqs,
dtype='int32') dtype='int32')
cython_vector_outs = numpy.asarray(self.vector_outs, cython_vector_outs = numpy.asarray(self.vector_outs,
dtype='int32') dtype='int32')
cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated, cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated,
dtype='int32') dtype='int32')
...@@ -906,39 +910,38 @@ class Scan(PureOp): ...@@ -906,39 +910,38 @@ class Scan(PureOp):
if hasattr(self, 'destroy_map'): if hasattr(self, 'destroy_map'):
cython_destroy_map = [x in self.destroy_map cython_destroy_map = [x in self.destroy_map
for x in xrange(len(node.outputs))] for x in xrange(len(node.outputs))]
else: else:
cython_destroy_map = [0 for x in xrange(len(node.outputs))] cython_destroy_map = [0 for x in xrange(len(node.outputs))]
cython_destroy_map = numpy.asarray(cython_destroy_map, cython_destroy_map = numpy.asarray(cython_destroy_map,
dtype='int32') dtype='int32')
from . import scan_perform_ext from . import scan_perform_ext
p = lambda node, args, outs:\ p = lambda node, args, outs:\
scan_perform_ext.perform( scan_perform_ext.perform(self.n_shared_outs,
self.n_shared_outs, self.n_mit_mot_outs,
self.n_mit_mot_outs, self.n_seqs,
self.n_seqs, self.n_mit_mot,
self.n_mit_mot, self.n_mit_sot,
self.n_mit_sot, self.n_sit_sot,
self.n_sit_sot, self.n_nit_sot,
self.n_nit_sot, args[0],
args[0], self.as_while,
self.as_while, cython_mintaps,
cython_mintaps, cython_tap_array,
cython_tap_array, cython_tap_array_len,
cython_tap_array_len, cython_vector_seqs,
cython_vector_seqs, cython_vector_outs,
cython_vector_outs, cython_mit_mot_out_slices,
cython_mit_mot_out_slices, cython_mit_mot_out_nslices,
cython_mit_mot_out_nslices, cython_mitmots_preallocated,
cython_mitmots_preallocated, cython_inps_is_tensor,
cython_inps_is_tensor, cython_outs_is_tensor,
cython_outs_is_tensor, self.fn.fn,
self.fn.fn, self.fn,
self.fn, cython_destroy_map,
cython_destroy_map, args,
args, outs,
outs, self, node)
self, node)
except (ImportError, theano.gof.cmodule.MissingGXX): except (ImportError, theano.gof.cmodule.MissingGXX):
p = self.execute p = self.execute
# default arguments are stored in the closure of `rval` # default arguments are stored in the closure of `rval`
...@@ -1000,8 +1003,8 @@ class Scan(PureOp): ...@@ -1000,8 +1003,8 @@ class Scan(PureOp):
def inner_mitsot(self, list_inputs): def inner_mitsot(self, list_inputs):
n_mitmot_taps = sum(len(x) for x in self.tap_array[:self.n_mit_mot]) n_mitmot_taps = sum(len(x) for x in self.tap_array[:self.n_mit_mot])
ntaps_upto_sit_sot = sum(len(x) for x in ntaps_upto_sit_sot = sum(len(x) for x in
self.tap_array[:(self.n_mit_mot + self.tap_array[:(self.n_mit_mot +
self.n_mit_sot)]) self.n_mit_sot)])
return list_inputs[self.n_seqs + n_mitmot_taps: return list_inputs[self.n_seqs + n_mitmot_taps:
self.n_seqs + ntaps_upto_sit_sot] self.n_seqs + ntaps_upto_sit_sot]
...@@ -1090,7 +1093,7 @@ class Scan(PureOp): ...@@ -1090,7 +1093,7 @@ class Scan(PureOp):
if isinstance(list_outputs, Apply): if isinstance(list_outputs, Apply):
list_outputs = list_outputs.outputs list_outputs = list_outputs.outputs
offset = (self.n_mit_mot + self.n_mit_sot + self.n_sit_sot + offset = (self.n_mit_mot + self.n_mit_sot + self.n_sit_sot +
self.n_nit_sot) self.n_nit_sot)
return list_outputs[offset:offset + self.n_shared_outs] return list_outputs[offset:offset + self.n_shared_outs]
def inner_non_seqs(self, list_inputs): def inner_non_seqs(self, list_inputs):
...@@ -1149,10 +1152,10 @@ class Scan(PureOp): ...@@ -1149,10 +1152,10 @@ class Scan(PureOp):
for idx, seq in enumerate(args[1:self.seqs_arg_offset]): for idx, seq in enumerate(args[1:self.seqs_arg_offset]):
if seq.shape[0] < n_steps: if seq.shape[0] < n_steps:
raise ValueError(('Sequence is shorter then the required ' raise ValueError(('Sequence is shorter then the required '
'number of steps : (n_steps, seq, ' 'number of steps : (n_steps, seq, '
'seq.shape):'), n_steps, 'seq.shape):'), n_steps,
node.inputs[1 + idx], node.inputs[1 + idx],
seq.shape) seq.shape)
seqs.append(seq) seqs.append(seq)
# 2. Allocate memory for the outputs. Construct the list: # 2. Allocate memory for the outputs. Construct the list:
...@@ -1161,15 +1164,15 @@ class Scan(PureOp): ...@@ -1161,15 +1164,15 @@ class Scan(PureOp):
# output # output
store_steps = [arg.shape[0] for arg store_steps = [arg.shape[0] for arg
in args[self.seqs_arg_offset: in args[self.seqs_arg_offset:
self.shared_arg_offset]] self.shared_arg_offset]]
store_steps += [arg for arg in store_steps += [arg for arg in
args[self.nit_sot_arg_offset: args[self.nit_sot_arg_offset:
self.nit_sot_arg_offset + self.n_nit_sot] self.nit_sot_arg_offset + self.n_nit_sot]
] ]
pos = [(-self.mintaps[idx]) % store_steps[idx] for idx pos = [(-self.mintaps[idx]) % store_steps[idx] for idx
in xrange(self.n_outs + self.n_nit_sot)] in xrange(self.n_outs + self.n_nit_sot)]
if not getattr(self, 'destroy_map', None): if not getattr(self, 'destroy_map', None):
self.destroy_map = OrderedDict() self.destroy_map = OrderedDict()
# 2.1 Create storage space for outputs # 2.1 Create storage space for outputs
...@@ -1203,7 +1206,7 @@ class Scan(PureOp): ...@@ -1203,7 +1206,7 @@ class Scan(PureOp):
old_output_data = [None] * len(output_storage) old_output_data = [None] * len(output_storage)
fn = self.fn.fn fn = self.fn.fn
offset = (self.n_seqs + sum(map(len, self.tap_array[:self.n_outs])) + offset = (self.n_seqs + sum(map(len, self.tap_array[:self.n_outs])) +
self.n_shared_outs) self.n_shared_outs)
for idx in xrange(len(other_args)): for idx in xrange(len(other_args)):
input_storage[idx + offset].storage[0] = other_args[idx] input_storage[idx + offset].storage[0] = other_args[idx]
...@@ -1217,7 +1220,7 @@ class Scan(PureOp): ...@@ -1217,7 +1220,7 @@ class Scan(PureOp):
for idx in xrange(self.n_seqs): for idx in xrange(self.n_seqs):
if self.vector_seqs[idx]: if self.vector_seqs[idx]:
input_storage[idx].storage[0] = \ input_storage[idx].storage[0] = \
seqs[idx][i:i + 1].reshape(()) seqs[idx][i:i + 1].reshape(())
else: else:
input_storage[idx].storage[0] = seqs[idx][i] input_storage[idx].storage[0] = seqs[idx][i]
...@@ -1227,7 +1230,7 @@ class Scan(PureOp): ...@@ -1227,7 +1230,7 @@ class Scan(PureOp):
for tap in self.tap_array[idx]: for tap in self.tap_array[idx]:
_idx = (pos[idx] + tap) % store_steps[idx] _idx = (pos[idx] + tap) % store_steps[idx]
input_storage[offset].storage[0] =\ input_storage[offset].storage[0] =\
outs[idx][0][_idx:_idx + 1].reshape(()) outs[idx][0][_idx:_idx + 1].reshape(())
offset += 1 offset += 1
else: else:
for tap in self.tap_array[idx]: for tap in self.tap_array[idx]:
...@@ -1396,7 +1399,7 @@ class Scan(PureOp): ...@@ -1396,7 +1399,7 @@ class Scan(PureOp):
# This output tap has not been preallocated, recover # This output tap has not been preallocated, recover
# its value as usual # its value as usual
outs[j][0][k + pos[j]] = \ outs[j][0][k + pos[j]] = \
output_storage[offset_out].storage[0] output_storage[offset_out].storage[0]
offset_out += 1 offset_out += 1
mitmot_out_idx += 1 mitmot_out_idx += 1
...@@ -1413,7 +1416,7 @@ class Scan(PureOp): ...@@ -1413,7 +1416,7 @@ class Scan(PureOp):
# Copy the output value to `outs`, if necessary # Copy the output value to `outs`, if necessary
if store_steps[j] == 1 or self.vector_outs[j]: if store_steps[j] == 1 or self.vector_outs[j]:
outs[j][0][pos[j]] = \ outs[j][0][pos[j]] = \
output_storage[offset_out + j].storage[0] output_storage[offset_out + j].storage[0]
else: else:
# Check whether the initialization of the output storage # Check whether the initialization of the output storage
# map for this output has been reused. # map for this output has been reused.
...@@ -1442,7 +1445,7 @@ class Scan(PureOp): ...@@ -1442,7 +1445,7 @@ class Scan(PureOp):
if i == 0: if i == 0:
jout = j + offset_out jout = j + offset_out
shape = (store_steps[j],) + \ shape = (store_steps[j],) + \
output_storage[jout].storage[0].shape output_storage[jout].storage[0].shape
if len(output_storage[jout].storage[0].shape) == 0: if len(output_storage[jout].storage[0].shape) == 0:
self.vector_outs[j] = True self.vector_outs[j] = True
dtype = output_storage[jout].storage[0].dtype dtype = output_storage[jout].storage[0].dtype
...@@ -1486,7 +1489,7 @@ class Scan(PureOp): ...@@ -1486,7 +1489,7 @@ class Scan(PureOp):
outs[j][0] = output_storage[jout].storage[0] outs[j][0] = output_storage[jout].storage[0]
pos = [(idx + 1) % store for idx, store in pos = [(idx + 1) % store for idx, store in
izip(pos, store_steps)] izip(pos, store_steps)]
i = i + 1 i = i + 1
# 6. Check if you need to re-order output buffers # 6. Check if you need to re-order output buffers
...@@ -1642,17 +1645,15 @@ class Scan(PureOp): ...@@ -1642,17 +1645,15 @@ class Scan(PureOp):
self_outs = self.outputs[:-1] self_outs = self.outputs[:-1]
else: else:
self_outs = self.outputs self_outs = self.outputs
outs_shape = scan_utils.infer_shape( outs_shape = scan_utils.infer_shape(outs=self_outs,
outs=self_outs, inputs=self.inputs,
inputs=self.inputs, input_shapes=inner_ins_shapes)
input_shapes=inner_ins_shapes)
# Will be used to check if outs_shape can be expressed without using # Will be used to check if outs_shape can be expressed without using
# variables in self.inputs. # variables in self.inputs.
# The shapes of node.inputs are valid. # The shapes of node.inputs are valid.
validator = scan_utils.Validator( validator = scan_utils.Validator(valid=input_shapes,
valid=input_shapes, invalid=self.inputs,
invalid=self.inputs, valid_equivalent=out_equivalent)
valid_equivalent=out_equivalent)
offset = 1 + self.n_seqs offset = 1 + self.n_seqs
scan_outs = [x for x in input_shapes[offset:offset + n_outs]] scan_outs = [x for x in input_shapes[offset:offset + n_outs]]
...@@ -1687,7 +1688,7 @@ class Scan(PureOp): ...@@ -1687,7 +1688,7 @@ class Scan(PureOp):
scan_outs.append(tuple(shp)) scan_outs.append(tuple(shp))
scan_outs += [x for x in scan_outs += [x for x in
input_shapes[offset:offset + self.n_shared_outs]] input_shapes[offset:offset + self.n_shared_outs]]
# if we are dealing with a repeat-until, then we do not know the # if we are dealing with a repeat-until, then we do not know the
# leading dimension so we replace it for every entry with Shape_i # leading dimension so we replace it for every entry with Shape_i
if self.as_while: if self.as_while:
...@@ -1751,7 +1752,7 @@ class Scan(PureOp): ...@@ -1751,7 +1752,7 @@ class Scan(PureOp):
j_inp_idx = self.var_mappings["outer_inp_from_outer_out"][jidx] j_inp_idx = self.var_mappings["outer_inp_from_outer_out"][jidx]
if j_inp_idx != -1: if j_inp_idx != -1:
if connection_pattern[j_inp_idx][iidx] == True: if connection_pattern[j_inp_idx][iidx] == True:
for k in xrange(len(connection_pattern)): for k in xrange(len(connection_pattern)):
if connection_pattern[k][jidx]: if connection_pattern[k][jidx]:
connection_pattern[k][iidx] = True connection_pattern[k][iidx] = True
...@@ -1875,18 +1876,18 @@ class Scan(PureOp): ...@@ -1875,18 +1876,18 @@ class Scan(PureOp):
# With the global mapping inferred, the individual mappings # With the global mapping inferred, the individual mappings
# can be produced # can be produced
mappings = {"outer_inp_from_outer_out" : {}, mappings = {"outer_inp_from_outer_out": {},
"inner_inp_from_outer_out" : {}, "inner_inp_from_outer_out": {},
"inner_out_from_outer_out" : {}, "inner_out_from_outer_out": {},
"inner_inp_from_outer_inp" : {}, "inner_inp_from_outer_inp": {},
"inner_out_from_outer_inp" : {}, "inner_out_from_outer_inp": {},
"outer_out_from_outer_inp" : {}, "outer_out_from_outer_inp": {},
"outer_inp_from_inner_inp" : {}, "outer_inp_from_inner_inp": {},
"inner_out_from_inner_inp" : {}, "inner_out_from_inner_inp": {},
"outer_out_from_inner_inp" : {}, "outer_out_from_inner_inp": {},
"outer_inp_from_inner_out" : {}, "outer_inp_from_inner_out": {},
"inner_inp_from_inner_out" : {}, "inner_inp_from_inner_out": {},
"outer_out_from_inner_out" : {}} "outer_out_from_inner_out": {}}
for (oinp, iinp, iout, oout) in izip(outer_input_indices, for (oinp, iinp, iout, oout) in izip(outer_input_indices,
inner_input_indices, inner_input_indices,
...@@ -1932,7 +1933,7 @@ class Scan(PureOp): ...@@ -1932,7 +1933,7 @@ class Scan(PureOp):
grad_steps = self.outer_sitsot_outs(outs)[0].shape[0] - 1 grad_steps = self.outer_sitsot_outs(outs)[0].shape[0] - 1
elif self.n_mit_sot > 0: elif self.n_mit_sot > 0:
grad_steps = self.outer_mitsot_outs(outs)[0].shape[0] +\ grad_steps = self.outer_mitsot_outs(outs)[0].shape[0] +\
self.mintaps[self.n_mit_mot] self.mintaps[self.n_mit_mot]
else: else:
grad_steps = inputs[0] grad_steps = inputs[0]
...@@ -2019,14 +2020,13 @@ class Scan(PureOp): ...@@ -2019,14 +2020,13 @@ class Scan(PureOp):
# to X. # to X.
known_grads = OrderedDict([(k.copy(), v) for (k, v) in known_grads.items()]) known_grads = OrderedDict([(k.copy(), v) for (k, v) in known_grads.items()])
grads = gradient.grad( grads = gradient.grad(cost=None,
cost=None, known_grads=known_grads,
known_grads=known_grads, wrt=wrt,
wrt=wrt, consider_constant=wrt,
consider_constant=wrt, disconnected_inputs='ignore',
disconnected_inputs='ignore', return_disconnected='None',
return_disconnected='None', null_gradients='return')
null_gradients='return')
for i in range(len(wrt)): for i in range(len(wrt)):
gmp[wrt[i]] = grads[i] gmp[wrt[i]] = grads[i]
...@@ -2086,7 +2086,6 @@ class Scan(PureOp): ...@@ -2086,7 +2086,6 @@ class Scan(PureOp):
dC_dXt = safe_new(dC_douts[idx][0]) dC_dXt = safe_new(dC_douts[idx][0])
dC_dXts.append(dC_dXt) dC_dXts.append(dC_dXt)
known_grads = OrderedDict() known_grads = OrderedDict()
dc_dxts_idx = 0 dc_dxts_idx = 0
for i in range(len(diff_outputs)): for i in range(len(diff_outputs)):
...@@ -2141,7 +2140,7 @@ class Scan(PureOp): ...@@ -2141,7 +2140,7 @@ class Scan(PureOp):
dC_dXtm1s.append(safe_new(dC_dXts[opos])) dC_dXtm1s.append(safe_new(dC_dXts[opos]))
if hasattr(x, 'dtype') and x.dtype != dC_dXts[opos].dtype: if hasattr(x, 'dtype') and x.dtype != dC_dXts[opos].dtype:
dC_dinps_t[pos + self.n_seqs] = \ dC_dinps_t[pos + self.n_seqs] = \
x.astype(dC_dXts[opos].dtype) x.astype(dC_dXts[opos].dtype)
else: else:
dC_dXtm1s.append(safe_new(x)) dC_dXtm1s.append(safe_new(x))
...@@ -2168,7 +2167,7 @@ class Scan(PureOp): ...@@ -2168,7 +2167,7 @@ class Scan(PureOp):
seq = outs[idx] seq = outs[idx]
for k in self.tap_array[idx]: for k in self.tap_array[idx]:
if outmaxtap - k != 0: if outmaxtap - k != 0:
nw_seq = seq[k - mintap: -(outmaxtap-k)][::-1] nw_seq = seq[k - mintap: -(outmaxtap - k)][::-1]
else: else:
nw_seq = seq[k - mintap:][::-1] nw_seq = seq[k - mintap:][::-1]
outer_inp_seqs.append(nw_seq) outer_inp_seqs.append(nw_seq)
...@@ -2276,7 +2275,6 @@ class Scan(PureOp): ...@@ -2276,7 +2275,6 @@ class Scan(PureOp):
new_inner_out_mitmot = theano.clone(new_inner_out_mitmot, new_inner_out_mitmot = theano.clone(new_inner_out_mitmot,
replace=[(to_replace, replacement)]) replace=[(to_replace, replacement)])
inner_out_mitmot.append(new_inner_out_mitmot) inner_out_mitmot.append(new_inner_out_mitmot)
if not disconnected_dC_dinps_t[ins_pos]: if not disconnected_dC_dinps_t[ins_pos]:
...@@ -2541,8 +2539,7 @@ class Scan(PureOp): ...@@ -2541,8 +2539,7 @@ class Scan(PureOp):
gradients.append(NullType(t)()) gradients.append(NullType(t)())
end = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot end = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
for p, (x, t) in enumerate( for p, (x, t) in enumerate(zip(outputs[:end], type_outs[:end])):
zip(outputs[:end], type_outs[:end])):
if t == 'connected': if t == 'connected':
gradients.append(x[::-1]) gradients.append(x[::-1])
elif t == 'disconnected': elif t == 'disconnected':
...@@ -2575,12 +2572,11 @@ class Scan(PureOp): ...@@ -2575,12 +2572,11 @@ class Scan(PureOp):
start = len(gradients) start = len(gradients)
gradients += [DisconnectedType()() gradients += [DisconnectedType()()
for x in xrange(self.n_nit_sot)] for x in xrange(self.n_nit_sot)]
begin = end begin = end
end = begin + n_sitsot_outs end = begin + n_sitsot_outs
for p, (x, t) in enumerate( for p, (x, t) in enumerate(zip(outputs[begin:end], type_outs[begin:end])):
zip(outputs[begin:end], type_outs[begin:end])):
if t == 'connected': if t == 'connected':
gradients.append(x[-1]) gradients.append(x[-1])
elif t == 'disconnected': elif t == 'disconnected':
...@@ -2617,7 +2613,7 @@ class Scan(PureOp): ...@@ -2617,7 +2613,7 @@ class Scan(PureOp):
self.outputs, '_rop') self.outputs, '_rop')
self_inputs = rval[0] self_inputs = rval[0]
rop_of_inputs = rval[0][:self.n_seqs + self.n_outs] + \ rop_of_inputs = rval[0][:self.n_seqs + self.n_outs] + \
rval[0][self.n_seqs + self.n_outs + self.n_shared_outs:] rval[0][self.n_seqs + self.n_outs + self.n_shared_outs:]
self_outputs = rval[1] self_outputs = rval[1]
# Step 1. Compute the R_op of the inner function # Step 1. Compute the R_op of the inner function
inner_eval_points = [scan_utils.safe_new(x, '_evalpoint') inner_eval_points = [scan_utils.safe_new(x, '_evalpoint')
...@@ -2628,8 +2624,7 @@ class Scan(PureOp): ...@@ -2628,8 +2624,7 @@ class Scan(PureOp):
rop_self_outputs = self_outputs rop_self_outputs = self_outputs
if self.info['n_shared_outs'] > 0: if self.info['n_shared_outs'] > 0:
rop_self_outputs = rop_self_outputs[:-self.info['n_shared_outs']] rop_self_outputs = rop_self_outputs[:-self.info['n_shared_outs']]
rop_outs = tensor.Rop(rop_self_outputs, rop_of_inputs, rop_outs = tensor.Rop(rop_self_outputs, rop_of_inputs, inner_eval_points)
inner_eval_points)
if type(rop_outs) not in (list, tuple): if type(rop_outs) not in (list, tuple):
rop_outs = [rop_outs] rop_outs = [rop_outs]
# Step 2. Figure out what corresponds to what in the scan # Step 2. Figure out what corresponds to what in the scan
...@@ -2709,8 +2704,8 @@ class Scan(PureOp): ...@@ -2709,8 +2704,8 @@ class Scan(PureOp):
e = e + self.n_mit_sot e = e + self.n_mit_sot
ib = ie ib = ie
ie = ie + int(numpy.sum([len(x) for x in ie = ie + int(numpy.sum([len(x) for x in
self.tap_array[self.n_mit_mot:\ self.tap_array[self.n_mit_mot: \
self.n_mit_mot + self.n_mit_sot]])) self.n_mit_mot + self.n_mit_sot]]))
clean_eval_points = [] clean_eval_points = []
for inp, evp in zip(inputs[b:e], eval_points[b:e]): for inp, evp in zip(inputs[b:e], eval_points[b:e]):
if evp is not None: if evp is not None:
......
...@@ -1015,7 +1015,7 @@ class GetItemList(gof.op.Op): ...@@ -1015,7 +1015,7 @@ class GetItemList(gof.op.Op):
def grad(self, inputs, g_outputs): def grad(self, inputs, g_outputs):
x, indices = inputs x, indices = inputs
gout, = g_outputs gout, = g_outputs
return [GetItemListGrad(self)(x, indices, gout), return [get_item_list_grad(x, indices, gout),
grad_undefined(self, 1, indices, "No gradient for this input")] grad_undefined(self, 1, indices, "No gradient for this input")]
get_item_list = GetItemList() get_item_list = GetItemList()
...@@ -1110,7 +1110,7 @@ class GetItem2Lists(gof.op.Op): ...@@ -1110,7 +1110,7 @@ class GetItem2Lists(gof.op.Op):
def grad(self, inputs, g_outputs): def grad(self, inputs, g_outputs):
x, ind1, ind2 = inputs x, ind1, ind2 = inputs
gout, = g_outputs gout, = g_outputs
return [GetItem2ListsGrad(self)(x, ind1, ind2, gout), return [get_item_2lists_grad(x, ind1, ind2, gout),
grad_undefined(self, 1, ind1, "No gradient for this input"), grad_undefined(self, 1, ind1, "No gradient for this input"),
grad_undefined(self, 1, ind2, "No gradient for this input")] grad_undefined(self, 1, ind2, "No gradient for this input")]
......
...@@ -297,9 +297,6 @@ class Ger(Op): ...@@ -297,9 +297,6 @@ class Ger(Op):
This interface to GER allows non-destructive operation on A via the This interface to GER allows non-destructive operation on A via the
`destructive` argument to the constructor. `destructive` argument to the constructor.
:TODO: Create better classes ScipyGer and CGer that inherit from this class
and override the make_thunk() method to use Scipy and C respectively.
""" """
__props__ = ("destructive",) __props__ = ("destructive",)
...@@ -837,10 +834,8 @@ class Gemm(GemmRelated): ...@@ -837,10 +834,8 @@ class Gemm(GemmRelated):
else: else:
self.setup_z_Nz_Sz = self.setup_z_Nz_Sz_outplace self.setup_z_Nz_Sz = self.setup_z_Nz_Sz_outplace
# Correctly reload older pickles where _op_use_c_code and # Correctly reload older pickles where destroy_map were not
# destroy_map were not saved # saved
if '_op_use_c_code' not in self.__dict__:
self._op_use_c_code = theano.config.cxx
if 'destroy_map' not in self.__dict__ and self.inplace: if 'destroy_map' not in self.__dict__ and self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
......
...@@ -22,46 +22,34 @@ if have_fblas: ...@@ -22,46 +22,34 @@ if have_fblas:
class ScipyGer(Ger): class ScipyGer(Ger):
# keep everything else, but override the make_thunk def prepare_node(self, node, storage_map, compute_map, impl):
def make_thunk(self, node, storage_map, compute_map, no_recycling): if impl == 'py':
node.tag.local_ger = _blas_ger_fns[numpy.dtype(
node_input_storage = [storage_map[r] for r in node.inputs] node.inputs[0].type.dtype)]
node_output_storage = [storage_map[r] for r in node.outputs]
node_output_compute = [compute_map[r] for r in node.outputs] def perform(self, node, inputs, output_storage):
cA, calpha, cx, cy = inputs
# get vars for containers cZ, = output_storage
cA, calpha, cx, cy = node_input_storage # N.B. some versions of scipy (e.g. mine) don't actually work
cZ, = node_output_storage # in-place on a, even when I tell it to.
local_ger = _blas_ger_fns[numpy.dtype(node.inputs[0].type.dtype)] A = cA
local_ger = node.tag.local_ger
def rval(): if A.size == 0:
# N.B. some versions of scipy (e.g. mine) don't actually work # We don't have to compute anything, A is empty.
# in-place on a, even when I tell it to. # We need this special case because Numpy considers it
A = cA[0] # C-contiguous, which is confusing.
if A.size == 0: if not self.destructive:
# We don't have to compute anything, A is empty. # Sometimes numpy thinks empty matrices can share memory,
# We need this special case because Numpy considers it # so here to stop DebugMode from complaining.
# C-contiguous, which is confusing. A = A.copy()
if not self.destructive: elif A.flags['C_CONTIGUOUS']:
# Sometimes numpy thinks empty matrices can share memory, A = local_ger(calpha, cy, cx, a=A.T,
# so here to stop DebugMode from complaining. overwrite_a=int(self.destructive)).T
A = A.copy() else:
elif A.flags['C_CONTIGUOUS']: A = local_ger(calpha, cx, cy, a=A,
A = local_ger(calpha[0], cy[0], cx[0], a=A.T, overwrite_a=int(self.destructive))
overwrite_a=int(self.destructive)).T cZ[0] = A
else:
A = local_ger(calpha[0], cx[0], cy[0], a=A,
overwrite_a=int(self.destructive))
cZ[0] = A
for o in node_output_compute:
o[0] = True
# TODO: If this is currently an unofficial part of the thunk API,
# then maybe it should be documented and made official?
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.lazy = False
return rval
scipy_ger_no_inplace = ScipyGer(False) scipy_ger_no_inplace = ScipyGer(False)
scipy_ger_inplace = ScipyGer(True) scipy_ger_inplace = ScipyGer(True)
......
...@@ -787,14 +787,15 @@ second dimension ...@@ -787,14 +787,15 @@ second dimension
return ret return ret
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
# Postpone the ufunc building to the last minute # Postpone the ufunc building to the last minute
# NumPy ufuncs support only up to 31 inputs. # NumPy ufuncs support only up to 31 inputs.
# But our C code supports more. # But our C code supports more.
if (len(node.inputs) < 32 and if (len(node.inputs) < 32 and
(self.nfunc is None or (self.nfunc is None or
self.scalar_op.nin != len(node.inputs)) and self.scalar_op.nin != len(node.inputs)) and
self.ufunc is None): self.ufunc is None and
impl == 'py'):
ufunc = numpy.frompyfunc(self.scalar_op.impl, ufunc = numpy.frompyfunc(self.scalar_op.impl,
len(node.inputs), len(node.inputs),
...@@ -830,7 +831,7 @@ second dimension ...@@ -830,7 +831,7 @@ second dimension
[get_scalar_type(dtype=output.type.dtype).make_variable() [get_scalar_type(dtype=output.type.dtype).make_variable()
for output in node.outputs]) for output in node.outputs])
self.scalar_op.prepare_node(node.tag.fake_node, None, None) self.scalar_op.prepare_node(node.tag.fake_node, None, None, impl)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
if len(node.inputs) >= 32: if len(node.inputs) >= 32:
...@@ -890,14 +891,18 @@ second dimension ...@@ -890,14 +891,18 @@ second dimension
# numpy the first (faster) version leads to segfaults # numpy the first (faster) version leads to segfaults
if self.ufunc: if self.ufunc:
ufunc = self.ufunc ufunc = self.ufunc
elif not hasattr(node.tag, 'ufunc'):
# It happens that make_thunk isn't called, like in
# get_scalar_constant_value
self.prepare_node(node, None, None, 'py')
# prepare_node will add ufunc to self or the tag
# depending if we can reuse it or not. So we need to
# test both again.
if self.ufunc:
ufunc = self.ufunc
else:
ufunc = node.tag.ufunc
else: else:
if not hasattr(node.tag, 'ufunc'):
# It happens that make_thunk isn't called, like in
# get_scalar_constant_value
node.tag.ufunc = numpy.frompyfunc(self.scalar_op.impl,
len(node.inputs),
self.scalar_op.nout)
ufunc = node.tag.ufunc ufunc = node.tag.ufunc
nout = ufunc.nout nout = ufunc.nout
...@@ -977,7 +982,7 @@ second dimension ...@@ -977,7 +982,7 @@ second dimension
# To not request all of them to call prepare_node(), do it here. # To not request all of them to call prepare_node(), do it here.
# There is no harm if it gets called multiple times. # There is no harm if it gets called multiple times.
if not hasattr(node.tag, 'fake_node'): if not hasattr(node.tag, 'fake_node'):
self.prepare_node(node, None, None) self.prepare_node(node, None, None, 'c')
_inames = inames _inames = inames
_onames = onames _onames = onames
......
...@@ -6299,20 +6299,12 @@ def constant_folding(node): ...@@ -6299,20 +6299,12 @@ def constant_folding(node):
for o in node.outputs: for o in node.outputs:
storage_map[o] = [None] storage_map[o] = [None]
compute_map[o] = [False] compute_map[o] = [False]
impl = None
if (hasattr(node.op, 'python_constant_folding') and if (hasattr(node.op, 'python_constant_folding') and
node.op.python_constant_folding(node)): node.op.python_constant_folding(node)):
old_value = getattr(node.op, '_op_use_c_code', False) impl = 'py'
try: thunk = node.op.make_thunk(node, storage_map, compute_map,
node.op._op_use_c_code = False no_recycling=[], impl=impl)
thunk = node.op.make_thunk(node,
storage_map,
compute_map,
[])
finally:
node.op._op_use_c_code = old_value
else:
thunk = node.op.make_thunk(node, storage_map, compute_map,
no_recycling=[])
required = thunk() required = thunk()
assert not required # a node whose inputs are all provided should always assert not required # a node whose inputs are all provided should always
......
...@@ -263,7 +263,7 @@ class Pool(OpenMPOp): ...@@ -263,7 +263,7 @@ class Pool(OpenMPOp):
" 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode) " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
self.mode = mode self.mode = mode
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
if len(node.inputs) == 1: if len(node.inputs) == 1:
# Old interface # Old interface
self.ndim = len(node.op.ds) self.ndim = len(node.op.ds)
...@@ -796,7 +796,7 @@ class PoolGrad(OpenMPOp): ...@@ -796,7 +796,7 @@ class PoolGrad(OpenMPOp):
self.mode = mode self.mode = mode
super(PoolGrad, self).__init__(openmp=openmp) super(PoolGrad, self).__init__(openmp=openmp)
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
if len(node.inputs) < 5: # 5 for AveragePoolGrad, 6 for MaxPoolGrad if len(node.inputs) < 5: # 5 for AveragePoolGrad, 6 for MaxPoolGrad
# Old interface # Old interface
self.ndim = len(node.op.ds) self.ndim = len(node.op.ds)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论