This should get scan working with non-float32 inputs/outputs in gpuarray.

It should also not break the old cuda backend.

This should get scan working with non-float32 inputs/outputs in gpuarray.
4ed010d8 · Arnaud Bergeron · 484ee1e0 · 4ed010d8 · 4ed010d8 · 4ed010d8
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node):
                gpu_from_host(diag_node.inputs[0]))]
    return False
+def typeConstructor(broadcastable, dtype):
+    if dtype == 'float32':
+        return CudaNdarrayType(broadcastable=broadcastable)
+    else:
+        return TensorType(broadcastable=broadcastable, dtype=dtype)
 @register_opt('scan')
 @local_optimizer([gpu_from_host, scan_op.Scan])
@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node):
            _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
            info['gpu_hash'] = hash(_cmodule_key)
-            typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
-                    broadcastable=broadcastable)
            nw_op = scan_op.Scan(scan_ins,
                                 scan_outs,
                                 info,
@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node):
            _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
            info['gpu_hash'] = hash(_cmodule_key)
-            def typeConstructor(broadcastable, dtype):
-                assert dtype == 'float32'
-                return CudaNdarrayType(broadcastable=broadcastable)
            _outputs = scan_op.Scan(
                scan_ins,
                scan_outs,
@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node):
 optdb.register('gpu_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType,
+               scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
                                            gpu_flag=True),
               75,
               'gpu',

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -518,7 +518,6 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
 @op_lifter([scan_op.Scan])
 def local_scan_to_gpua(node):
    info = copy.deepcopy(node.op.info)
-    info['gpu'] = True
    info['gpua'] = True
    nw_ins = [node.inputs[0]]
    e = (1 +
@@ -540,8 +539,8 @@ def local_scan_to_gpua(node):
                    [safe_to_cpu(x) for x in scan_ins]))
    # We need to construct the hash here, because scan
-    # __init__ does not know about cuda ndarray and can not
+    # __init__ does not know about the gpu and can not
-    # handle graphs with inputs being Cuda Ndarrays
+    # handle graphs with inputs being on the gpu
    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
    local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -56,22 +56,21 @@ class Scan(PureOp):
            the scan op (like number of different types of
            arguments, name, mode, if it should run on GPU or
            not, etc.)
-        :param typeConstructor: function that constructs a Theano TensorType
+        :param typeConstructor: function that constructs an equivalent
-            able to represent a float32 ndarray.
+            of a Theano TensorType
-        Note: ``typeConstructor`` had been added to refactor how Theano
-        deals with the GPU. If it runs on the GPU, scan needs to construct
+        Note: ``typeConstructor`` has been added to refactor how
-        certain outputs (those who reside in the GPU memory) as CudaNdarray.
+        Theano deals with the GPU. If it runs on the GPU, scan needs
-        However we can not import cuda in this file (as it is in sandbox,
+        to construct certain outputs (those that reside in the GPU
-        and not available on each machine) so the workaround is that the GPU
+        memory) as the GPU-specific type.  However we can not import
-        optimization (which is aware of cuda types) passes to the
+        gpu code in this file (as it is in sandbox, and not available
-        constructor of this class a function that is able to construct
+        on each machine) so the workaround is that the GPU
-        CudaNdarray. This way the class Scan does not need to be aware of
+        optimization passes to the constructor of this class a
-        CudaNdarray, it just constructs any float32 tensor using this
+        function that is able to construct a GPU type. This way the
-        function (which by default constructs normal tensors). Note that the
+        class Scan does not need to be aware of the details for the
-        second assumption in this code is that any float32 output or input
+        GPU, it just constructs any tensor using this function (which
-        will be moved on the GPU if the optimization gets applied (following
+        by default constructs normal tensors).
-        Theano's philosophy of moving as much as possible on gpu).
        """
        if 'gpua' not in info:
            info['gpua'] = False
@@ -97,23 +96,10 @@ class Scan(PureOp):
            # Not that for mit_mot there are several output slices per
            # output sequence
            o = outputs[idx]
-            # Scan assumes that only variables of dtype float32 might need a
+            self.output_types.append(
-            # special constructor (i.e. CudaNdarray constructor) when the
+                typeConstructor(
-            # code is running on GPU, as it is the only type supported by
+                    broadcastable=(False,) + o.type.broadcastable,
-            # Theano yet. Therefore only for dtype float32 we use the passed
+                    dtype=o.type.dtype))
-            # type constructor ``typeConstructor``. For anything else we
-            # know that even if we run it on the GPU we still construct
-            # normal Theano tensors.
-            if o.type.dtype in ['float32']:
-                self.output_types.append(
-                    typeConstructor(
-                        broadcastable=(False,) + o.type.broadcastable,
-                        dtype=o.type.dtype))
-            else:
-                self.output_types.append(
-                    tensorConstructor(
-                        broadcastable=(False,) + o.type.broadcastable,
-                        dtype=o.type.dtype))
            idx += len(self.mit_mot_out_slices[jdx])
            jdx += 1
@@ -122,23 +108,11 @@ class Scan(PureOp):
        end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
        for o in outputs[idx:end]:
-            # Scan assumes that only variables of dtype float32 might need a
+            self.output_types.append(
-            # special constructor (i.e. CudaNdarray constructor) when the
+                typeConstructor(
-            # code is running on GPU, as it is the only type supported by
+                    broadcastable=(False,) + o.type.broadcastable,
-            # Theano yet. Therefore only for dtype float32 we use the passed
+                    dtype=o.type.dtype))
-            # type constructor ``typeConstructor``. For anything else we
-            # know that even if we run it on the GPU we still construct
-            # normal Theano tensors.
-            if o.type.dtype in ['float32']:
-                self.output_types.append(
-                    typeConstructor(
-                        broadcastable=(False,) + o.type.broadcastable,
-                        dtype=o.type.dtype))
-            else:
-                self.output_types.append(
-                    tensorConstructor(
-                        broadcastable=(False,) + o.type.broadcastable,
-                        dtype=o.type.dtype))
        # shared outputs + possibly the ending condition
        for o in outputs[end:]:
            self.output_types.append(o.type)
@@ -184,14 +158,14 @@ class Scan(PureOp):
                                   self.n_shared_outs)
        self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
        self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
-        if not self.info['gpu']:
+        if self.info['gpu'] or self.info['gpua']:
+            self._hash_inner_graph = self.info['gpu_hash']
+        else:
            tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
                                                           self.outputs)
            local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
            self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
            self._hash_inner_graph = hash(self._cmodule_key)
-        else:
-            self._hash_inner_graph = self.info['gpu_hash']
    def make_node(self, *inputs):
        """