testgroup / pytensor · Commits · c190056b

Commit c190056b, authored Dec 16, 2014 by Frédéric Bastien

    Merge pull request #2342 from abergeron/multi_fixes

    Multi fixes (pieces of the multi-gpu code)

Parents: 567ceab6, 7a5cb506

Showing 17 changed files with 82 additions and 116 deletions.
doc/extending/cop.txt                        +7   -19
doc/extending/type.txt                       +10  -0
theano/compile/ops.py                        +4   -5
theano/gof/cc.py                             +3   -4
theano/gof/graph.py                          +2   -1
theano/gof/op.py                             +9   -15
theano/gof/tests/test_op.py                  +8   -8
theano/sandbox/cuda/opt.py                   +3   -13
theano/sandbox/cuda/type.py                  +5   -0
theano/sandbox/gpuarray/basic_ops.py         +0   -1
theano/sandbox/gpuarray/opt.py               +2   -4
theano/sandbox/gpuarray/tests/test_scan.py   +1   -0
theano/sandbox/gpuarray/type.py              +8   -0
theano/scan_module/scan_op.py                +4   -29
theano/scan_module/scan_opt.py               +3   -6
theano/tensor/basic.py                       +1   -11
theano/tensor/type.py                        +12  -0
doc/extending/cop.txt

@@ -43,9 +43,6 @@ There are less methods to define for an Op than for a Type:
         that a python exception is set) if your C code needs to
         raise an exception.

-    ``sub['struct_id']``
-        The integer id passed to the various _struct methods.
-
 .. method:: c_code_cleanup(node, name, input_names, output_names, sub)

@@ -99,15 +96,12 @@ There are less methods to define for an Op than for a Type:
     module is initialized, before anything else is executed and is
     specialized for a particular apply of an :ref:`op`.

-.. method:: c_init_code_struct(node, struct_id, sub)
+.. method:: c_init_code_struct(node, name, sub)

     Allows you to specify code that will be inserted in the struct
     constructor of the Op. This is for code which should be
     executed once per thunk (Apply node, more or less).

-    `struct_id` is an integer guaranteed to be unique inside the
-    struct.
-
     `sub` is a dictionary of extras parameters to the
     c_code_init_code_struct method. It contains the following
     values:

@@ -131,31 +125,25 @@ There are less methods to define for an Op than for a Type:
     if the code is the same for each apply of an op. It will be
     inserted at global scope.

-.. method:: c_support_code_struct(node, struct_id)
+.. method:: c_support_code_struct(node, name)

     Allows you to specify helper functions of variables that will
     be specific to one particular thunk. These are inserted at
     struct scope.

-    `struct_id` is an integer guaranteed to be unique inside the
-    struct.
-
     :note:
-        You cannot specify kernels in the code returned by this since
-        that isn't supported by CUDA. You should place your kernels
-        in :meth:`c_support_code()` or :meth:`c_support_code_apply()`
-        and call them from this code.
+        You cannot specify CUDA kernels in the code returned by this
+        since that isn't supported by CUDA. You should place your
+        kernels in :meth:`c_support_code()` or
+        :meth:`c_support_code_apply()` and call them from this code.

-.. method:: c_cleanup_code_struct(node, struct_id)
+.. method:: c_cleanup_code_struct(node, name)

     Allows you to specify code that will be inserted in the struct
     destructor of the Op. This is for cleaninp up allocations and
     stuff like this when the thunk is released (when you "free" a
     compiled function using this op).

-    `struct_id` is an integer guaranteed to be unique inside the
-    struct.
-
 .. method:: infer_shape(node, (i0_shapes,i1_shapes,...))

     Allow optimizations to lift the Shape op over this op. An
...
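As a rough illustration of the renamed argument (a sketch, not Theano's actual linker; the `node0`/`node1` names are invented here), the `name` string is interpolated into per-thunk C declarations so that two Apply nodes never collide:

```python
# Sketch only: stand-ins for the c_*_struct hooks documented above.
# The names "node0"/"node1" are hypothetical; the real linker supplies them.

def c_support_code_struct(name):
    # one counter variable per thunk, suffixed with the node's unique name
    return "npy_uint64 counter%s;" % (name,)

def c_init_code_struct(name):
    return "counter%s = 0;" % (name,)

decls = [c_support_code_struct(n) for n in ("node0", "node1")]
print(decls)  # ['npy_uint64 counternode0;', 'npy_uint64 counternode1;']
```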
doc/extending/type.txt

@@ -138,6 +138,16 @@ default values.
     :return: the number of bytes taken by the object described by
         ``shape_info``.

+.. method:: clone(dtype=None, broadcastable=None)
+
+    Optional, for TensorType-alikes.
+
+    Return a copy of the type with a possibly changed value for
+    dtype and broadcastable (if they aren't `None`).
+
+    :param dtype: New dtype for the copy.
+    :param broadcastable: New broadcastable tuple for the copy.
+
 .. method:: may_share_memory(a, b)

     Optional to run, but mandatory for DebugMode. Return True if the Python
...
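A minimal stand-in for the clone() contract documented above (illustrative only, not Theano's actual TensorType):

```python
class ToyType(object):
    """Toy type with the clone() semantics documented above."""

    def __init__(self, dtype, broadcastable):
        self.dtype = dtype
        self.broadcastable = tuple(broadcastable)

    def clone(self, dtype=None, broadcastable=None):
        # arguments left as None keep the current value
        if dtype is None:
            dtype = self.dtype
        if broadcastable is None:
            broadcastable = self.broadcastable
        return self.__class__(dtype, broadcastable)

t = ToyType('float64', (False, True))
t2 = t.clone(dtype='float32')
print(t2.dtype, t2.broadcastable)  # float32 (False, True)
```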
theano/compile/ops.py

@@ -570,7 +570,7 @@ class Rebroadcast(gof.Op):
     def __hash__(self):
         items = sorted(self.axis.iteritems())  # no ambiguity because each item key is unique
-        return hash(type(self)) ^ hash(tuple(items))
+        return hash((type(self), tuple(items)))

     def __str__(self):
         if len(self.axis) == 0:

@@ -586,10 +586,9 @@ class Rebroadcast(gof.Op):
     def make_node(self, x):
         if self.axis.keys() and (x.ndim <= numpy.max(self.axis.keys())):
             raise ValueError('Trying to rebroadcast non-existent dimension')
-        t = x.type.__class__(dtype=x.type.dtype,
-                             broadcastable=[self.axis.get(i, b)
-                                            for i, b in enumerate(
-                                                x.type.broadcastable)])
+        t = x.type.clone(broadcastable=[self.axis.get(i, b)
+                                        for i, b in enumerate(
+                                            x.type.broadcastable)])
         return gof.Apply(self, [x], [t()])

     def perform(self, node, inp, out_):
...
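The `__hash__` change above swaps XOR-combining for hashing a tuple. A small aside on why that is generally safer (an illustration, not taken from the patch): XOR is order-insensitive and self-cancelling, whereas a tuple hash preserves position:

```python
# XOR mixes hashes symmetrically, so different orderings collide and
# identical components cancel to zero; a tuple hash does neither.
assert hash(1) ^ hash(2) == hash(2) ^ hash(1)  # order is lost under XOR
assert hash(1) ^ hash(1) == 0                  # equal components cancel
assert hash((1, 2)) != hash((2, 1))            # tuple hash keeps order
print("xor collisions demonstrated")
```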
theano/gof/cc.py

@@ -658,7 +658,6 @@ class CLinker(link.Linker):
         # Make the CodeBlock for c_code
         sub['id'] = id
-        sub['struct_id'] = id + 1
         sub['fail'] = failure_code(sub)

         sub_struct = dict()

@@ -692,7 +691,7 @@ class CLinker(link.Linker):
                 " didn't return a string for c_init_code_apply")
         try:
-            struct_init = op.c_init_code_struct(node, id + 1, sub_struct)
+            struct_init = op.c_init_code_struct(node, name, sub_struct)
             assert isinstance(struct_init, basestring), (
                 str(node.op) +
                 " didn't return a string for c_init_code_struct")

@@ -700,7 +699,7 @@ class CLinker(link.Linker):
             pass
         try:
-            struct_support = op.c_support_code_struct(node, id + 1)
+            struct_support = op.c_support_code_struct(node, name)
             assert isinstance(struct_support, basestring), (
                 str(node.op) +
                 " didn't return a string for c_support_code_struct")

@@ -708,7 +707,7 @@ class CLinker(link.Linker):
             pass
         try:
-            struct_cleanup = op.c_cleanup_code_struct(node, id + 1)
+            struct_cleanup = op.c_cleanup_code_struct(node, name)
             assert isinstance(struct_cleanup, basestring), (
                 str(node.op) +
                 " didn't return a string for c_cleanup_code_struct")
...
theano/gof/graph.py

@@ -184,7 +184,8 @@ class Apply(Node):
         :note:
             tags are copied from self to the returned instance.
         """
-        cp = self.__class__(self.op, self.inputs, [output.clone() for output in self.outputs])
+        cp = self.__class__(self.op, self.inputs,
+                            [output.clone() for output in self.outputs])
         cp.tag = copy(self.tag)
         return cp
...
theano/gof/op.py

@@ -322,17 +322,15 @@ class CLinkerOp(CLinkerObject):
         raise utils.MethodNotDefined("c_init_code_apply", type(self),
                                      self.__class__.__name__)

-    def c_init_code_struct(self, node, struct_id, sub):
+    def c_init_code_struct(self, node, name, sub):
         """
         Optional: return a code string specific to the apply
         to be inserted in the struct initialization code.

         :param node: an Apply instance in the graph being compiled
-        :param struct_id: a number that serves to uniquely identify
-                          this code. The c_code will receive another
-                          sub parameter named struct_id that will
-                          contain this name.
+        :param name: a unique name to distinguish you variables from
+                     those of other nodes.
         :param sub: a dictionary of values to substitute in the code.
                     Most notably it contains a 'fail' entry that you

@@ -345,17 +343,15 @@ class CLinkerOp(CLinkerObject):
         raise utils.MethodNotDefined("c_init_code_apply", type(self),
                                      self.__class__.__name__)

-    def c_support_code_struct(self, node, struct_id):
+    def c_support_code_struct(self, node, name):
         """Optional: Return utility code for use by an `Op` that will be
         inserted at struct scope, that can be specialized for the
         support of a particular `Apply` node.

         :param node: an Apply instance in the graph being compiled
-        :param struct_id: a number that serves to uniquely identify
-                          this code. The c_code will receive another
-                          sub parameter named struct_id that will
-                          contain this name.
+        :param name: a unique name to distinguish you variables from
+                     those of other nodes.

         :Exceptions:
          - `MethodNotDefined`: Subclass does not implement this method

@@ -364,17 +360,15 @@ class CLinkerOp(CLinkerObject):
         raise utils.MethodNotDefined("c_support_code_struct",
                                      type(self), self.__class__.__name__)

-    def c_cleanup_code_struct(self, node, struct_id):
+    def c_cleanup_code_struct(self, node, name):
         """
         Optional: return a code string specific to the apply to be
         inserted in the struct cleanup code.

         :param node: an Apply instance in the graph being compiled
-        :param struct_id: a number that serves to uniquely identify
-                          this code. The c_code will receive another
-                          sub parameter named struct_id that will
-                          contain this name.
+        :param name: a unique name to distinguish you variables from
+                     those of other nodes.

         :Exceptions:
          - `MethodNotDefined`: the subclass does not override this method
...
theano/gof/tests/test_op.py

@@ -94,20 +94,20 @@ class StructOp(Op):
     def make_node(self, i):
         return Apply(self, [i], [scalar.uint64()])

-    def c_support_code_struct(self, node, struct_id):
-        return "npy_uint64 counter%d;" % (struct_id,)
+    def c_support_code_struct(self, node, name):
+        return "npy_uint64 counter%s;" % (name,)

-    def c_init_code_struct(self, node, struct_id, sub):
-        return "counter%d = 0;" % (struct_id,)
+    def c_init_code_struct(self, node, name, sub):
+        return "counter%s = 0;" % (name,)

     def c_code(self, node, name, input_names, outputs_names, sub):
         return """
-%(out)s = counter%(sid)s;
-counter%(sid)s++;
-""" % dict(out=outputs_names[0], sid=sub['struct_id'])
+%(out)s = counter%(name)s;
+counter%(name)s++;
+""" % dict(out=outputs_names[0], name=name)

     def c_code_cache_version(self):
-        return (0,)
+        return (1,)


 class TestOp:
...
theano/sandbox/cuda/opt.py

@@ -2000,12 +2000,6 @@ def local_gpu_extract_diagonal(node):
                 gpu_from_host(diag_node.inputs[0]))]
     return False

-def typeConstructor(broadcastable, dtype):
-    if dtype == 'float32':
-        return CudaNdarrayType(broadcastable=broadcastable)
-    else:
-        return tensor.TensorType(broadcastable=broadcastable, dtype=dtype)
-
 @register_opt('scan')
 @local_optimizer([gpu_from_host, scan_op.Scan])
 def gpuScanOptimization(node):

@@ -2065,9 +2059,7 @@ def gpuScanOptimization(node):
             nw_op = scan_op.Scan(scan_ins,
                                  scan_outs,
-                                 info,
-                                 typeConstructor=typeConstructor
-                                 ).make_node(*nw_ins)
+                                 info).make_node(*nw_ins)
             _outputs = nw_op.outputs
             return _outputs

@@ -2113,8 +2105,7 @@ def gpuScanOptimization(node):
             _outputs = scan_op.Scan(
                 scan_ins,
                 scan_outs,
-                info,
-                typeConstructor=typeConstructor).make_node(*nw_ins).outputs
+                info).make_node(*nw_ins).outputs
             outputs = []
             for x, y in zip(_outputs, node.outputs):
                 if isinstance(y.type, CudaNdarrayType):

@@ -2126,8 +2117,7 @@ def gpuScanOptimization(node):
 optdb.register('gpu_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
-                                             gpu_flag=True),
+               scan_opt.ScanInplaceOptimizer(gpu_flag=True),
                75,
                'gpu',
                'fast_run',
...
theano/sandbox/cuda/type.py

@@ -71,6 +71,11 @@ class CudaNdarrayType(Type):
         self.name = name
         self.dtype_specs()  # error checking is done there

+    def clone(self, dtype=None, broadcastable=None):
+        if broadcastable is None:
+            broadcastable = self.broadcastable
+        return self.__class__(broadcastable, name=self.name, dtype=dtype)
+
     def filter(self, data, strict=False, allow_downcast=None):
         return self.filter_inplace(data, None, strict=strict,
                                    allow_downcast=allow_downcast)
...
theano/sandbox/gpuarray/basic_ops.py

@@ -607,7 +607,6 @@ class GpuAlloc(HideC, Alloc):
     def __init__(self, memset_0=False):
         """memset_0 is only an optimized version. True, it mean the
         value is always 0, so the c code call memset as it is faster.
         """
         self.memset_0 = memset_0
...
theano/sandbox/gpuarray/opt.py

@@ -716,13 +716,11 @@ def local_scan_to_gpua(node):
     _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
     info['gpu_hash'] = hash(_cmodule_key)

-    nw_op = scan_op.Scan(scan_ins, scan_outs, info,
-                         typeConstructor=GpuArrayType).make_node(*nw_ins)
+    nw_op = scan_op.Scan(scan_ins, scan_outs, info).make_node(*nw_ins)
     return nw_op.outputs

 optdb.register('gpua_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType,
-                                             gpua_flag=True),
+               scan_opt.ScanInplaceOptimizer(gpua_flag=True),
                75,
                'gpua',
                'fast_run',
...
...
theano/sandbox/gpuarray/tests/test_scan.py
浏览文件 @
c190056b
...
@@ -15,6 +15,7 @@ from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
...
@@ -15,6 +15,7 @@ from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
class
T_Scan
(
TestCase
):
class
T_Scan
(
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
utt
.
seed_rng
()
utt
.
seed_rng
()
super
(
T_Scan
,
self
)
.
setUp
()
def
test_one_sequence_one_output_weights_gpu1
(
self
):
def
test_one_sequence_one_output_weights_gpu1
(
self
):
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
...
...
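The added super(...).setUp() call follows the usual cooperative-setUp pattern: without it, a parent class's setup is silently skipped. A generic sketch (plain classes, not the actual test case):

```python
class Base(object):
    def setUp(self):
        self.ready = True  # parent-level initialization

class Child(Base):
    def setUp(self):
        # local setup would go here (e.g. seeding an RNG),
        # then defer to the parent so its setup still runs
        super(Child, self).setUp()

c = Child()
c.setUp()
print(c.ready)  # True
```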
theano/sandbox/gpuarray/type.py

@@ -28,6 +28,14 @@ class GpuArrayType(Type):
             raise TypeError("Unsupported dtype for %s: %s" %
                             (self.__class__.__name__, self.dtype))

+    def clone(self, dtype=None, broadcastable=None):
+        if dtype is None:
+            dtype = self.dtype
+        if broadcastable is None:
+            broadcastable = self.broadcastable
+        return self.__class__(dtype=dtype, broadcastable=broadcastable,
+                              name=self.name)
+
     def __str__(self):
         return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
...
theano/scan_module/scan_op.py

@@ -47,7 +47,6 @@ class Scan(PureOp):
                  inputs,
                  outputs,
                  info,
-                 typeConstructor=None,
                  ):
         """
         :param inputs: inputs of the inner function of scan

@@ -56,21 +55,6 @@ class Scan(PureOp):
                      the scan op (like number of different types of
                      arguments, name, mode, if it should run on GPU or
                      not, etc.)
-        :param typeConstructor: function that constructs an equivalent
-                                to Theano TensorType
-
-        Note: ``typeConstructor`` had been added to refactor how
-        Theano deals with the GPU. If it runs on the GPU, scan needs
-        to construct certain outputs (those who reside in the GPU
-        memory) as the GPU-specific type. However we can not import
-        gpu code in this file (as it is in sandbox, and not available
-        on each machine) so the workaround is that the GPU
-        optimization passes to the constructor of this class a
-        function that is able to construct a GPU type. This way the
-        class Scan does not need to be aware of the details for the
-        GPU, it just constructs any tensor using this function (which
-        by default constructs normal tensors).
         """
         if 'gpua' not in info:
             info['gpua'] = False

@@ -87,19 +71,13 @@ class Scan(PureOp):
         self.output_types = []
         idx = 0
         jdx = 0
-        tensorConstructor = lambda broadcastable, dtype: TensorType(
-            broadcastable=broadcastable, dtype=dtype)
-        if typeConstructor is None:
-            typeConstructor = tensorConstructor
-
         while idx < self.n_mit_mot_outs:
             # Not that for mit_mot there are several output slices per
             # output sequence
             o = outputs[idx]
             self.output_types.append(
-                typeConstructor(
-                    broadcastable=(False,) + o.type.broadcastable,
-                    dtype=o.type.dtype))
+                o.type.clone(
+                    broadcastable=(False,) + o.type.broadcastable))
             idx += len(self.mit_mot_out_slices[jdx])
             jdx += 1

@@ -109,9 +87,7 @@ class Scan(PureOp):
         for o in outputs[idx:end]:
             self.output_types.append(
-                typeConstructor(
-                    broadcastable=(False,) + o.type.broadcastable,
-                    dtype=o.type.dtype))
+                o.type.clone(
+                    broadcastable=(False,) + o.type.broadcastable))

         # shared outputs + possibly the ending condition
         for o in outputs[end:]:

@@ -232,10 +208,9 @@ class Scan(PureOp):
         if rval.ndim == as_var.ndim:
             rval = as_var.type.filter_variable(rval)
         else:
-            tmp = as_var.type.__class__(
-                broadcastable=tuple(var.broadcastable[:1]) + \
-                              tuple(as_var.broadcastable),
-                dtype=as_var.dtype)
+            tmp = as_var.type.clone(
+                broadcastable=tuple(var.broadcastable[:1]) + \
+                              tuple(as_var.broadcastable))
             rval = tmp.filter_variable(rval)
         return rval
...
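The refactor above replaces the injected `typeConstructor` callback with `o.type.clone(...)`. A toy illustration of why this works (hypothetical classes, not Theano's): clone() dispatches on the actual type of the inner output, so a GPU subclass is preserved without Scan importing any GPU code:

```python
class CPUType(object):
    def __init__(self, broadcastable):
        self.broadcastable = tuple(broadcastable)

    def clone(self, broadcastable=None):
        # self.__class__ keeps the subclass, whatever it is
        if broadcastable is None:
            broadcastable = self.broadcastable
        return self.__class__(broadcastable)

class GPUType(CPUType):
    """Hypothetical GPU-resident variant."""

o = GPUType((False,))
# Scan prepends a time dimension to each output's broadcastable pattern:
out = o.clone(broadcastable=(False,) + o.broadcastable)
print(type(out).__name__, out.broadcastable)  # GPUType (False, False)
```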
theano/scan_module/scan_opt.py

@@ -916,9 +916,8 @@ class PushOutScanOutput(gof.Optimizer):
 class ScanInplaceOptimizer(Optimizer):
     """Graph optimizer for Scan(makes it run inplace)"""
-    def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False):
+    def __init__(self, gpu_flag=False, gpua_flag=False):
         Optimizer.__init__(self)
-        self.typeConstructor = typeConstructor
         self.gpu_flag = gpu_flag
         self.gpua_flag = gpua_flag

@@ -960,8 +959,7 @@ class ScanInplaceOptimizer(Optimizer):
         inputs = ls_begin + ls + ls_end
         new_op = scan_op.Scan(op.inputs,
                               op.outputs,
-                              info,
-                              typeConstructor=self.typeConstructor)
+                              info)
         # Do not call make_node for test_value
         new_outs = new_op(*inputs, **dict(return_list=True))

@@ -2086,8 +2084,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
 optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan')
 optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan')
 optdb.register('scanOp_make_inplace',
-               ScanInplaceOptimizer(typeConstructor=None,
-                                    gpu_flag=False),
+               ScanInplaceOptimizer(),
                75,
                'fast_run',
                'inplace',
...
theano/tensor/basic.py

@@ -2403,17 +2403,7 @@ class Alloc(gof.Op):
     This Op is used to replace fill() during optimizations because after shapes
     are lifted, the first argument to fill can often be pruned from the graph.
     """
-    def __init__(self):
-        pass
-
-    def __eq__(self, other):
-        return type(self) == type(other)
-
-    def __hash__(self):
-        return hash(type(self))
-
-    def __str__(self):
-        return self.__class__.__name__
+    __props__ = ()

     def make_node(self, value, *shape):
         v = as_tensor_variable(value)
...
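Replacing the hand-written `__eq__`/`__hash__`/`__str__` with `__props__ = ()` relies on props-based machinery in Theano's Op base class. A simplified sketch of what that machinery does (an approximation written for this note, not Theano's actual implementation):

```python
class PropsMixin(object):
    """Approximation of props-based equality: ops compare equal when they
    are the same class and agree on every attribute named in __props__."""
    __props__ = ()

    def _props(self):
        return tuple(getattr(self, p) for p in self.__props__)

    def __eq__(self, other):
        return type(self) == type(other) and self._props() == other._props()

    def __hash__(self):
        return hash((type(self), self._props()))

    def __str__(self):
        return self.__class__.__name__

class Alloc(PropsMixin):
    __props__ = ()  # stateless: all instances are interchangeable

print(Alloc() == Alloc(), str(Alloc()))  # True Alloc
```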
theano/tensor/type.py

@@ -52,6 +52,18 @@ class TensorType(Type):
                 " AdvancedSubtensor1 sparse_grad. Now use"
                 " theano.sparse_grad(a_tensor[an_int_vector]).")

+    def clone(self, dtype=None, broadcastable=None):
+        """
+        Return a copy of the type optionally with a new dtype or
+        broadcastable pattern.
+        """
+        if dtype is None:
+            dtype = self.dtype
+        if broadcastable is None:
+            broadcastable = self.broadcastable
+        return self.__class__(dtype, broadcastable, name=self.name,
+                              sparse_grad=self.sparse_grad)
+
     def filter(self, data, strict=False, allow_downcast=None):
         """Convert `data` to something which can be associated to a
         `TensorVariable`.
...