Merge pull request #906 from pascanur/gpu_opt_scan_outputs_different_dtypes_rebased

Gpu opt scan outputs different dtypes rebased

Merge pull request #906 from pascanur/gpu_opt_scan_outputs_different_dtypes_rebased
edfc726f · nouiz · d95e876d · 553afd8a · edfc726f · edfc726f
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1525,8 +1525,9 @@ def gpuScanOptimization(node):
            local_fgraph = gof.FunctionGraph(tmp_in, tmp_out)
            _cmodule_key = gof.CLinker.cmodule_key_(local_fgraph, [])
            info['gpu_hash'] = hash(_cmodule_key)
-            typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
+            def typeConstructor(broadcastable, dtype):
-                    broadcastable=broadcastable)
+                assert dtype == 'float32'
+                return CudaNdarrayType(broadcastable=broadcastable)
            _outputs = scan_op.Scan(
                scan_ins,
                scan_outs,

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -31,7 +31,6 @@ from theano import tensor
 from theano.tensor.opt import Shape_i
 from theano.gradient import grad_undefined
 from theano.gradient import DisconnectedType
-#from theano.sandbox import cuda
 from theano.compile.profiling import ScanProfileStats
 import scan_utils
@@ -51,8 +50,26 @@ class Scan(PureOp):
        """
        :param inputs: inputs of the inner function of scan
        :param outputs: outputs of the inner function of scan
-        :param properties: dictionary containing different properties of
+        :param info: dictionary containing different properties of
-                        the scan op.
+            the scan op (like number of different types of
+            arguments, name, mode, if it should run on GPU or
+            not, etc.)
+        :param typeConstructor: function that constructs a Theano TensorType
+            able to represent a float32 ndarray.
+        Note: ``typeConstructor`` has been added to refactor how Theano
+        deals with the GPU. If it runs on the GPU, scan needs to construct
+        certain outputs (those that reside in GPU memory) as CudaNdarray.
+        However, we cannot import cuda in this file (as it is in sandbox,
+        and not available on every machine), so the workaround is that the
+        GPU optimization (which is aware of cuda types) passes to the
+        constructor of this class a function that is able to construct
+        CudaNdarray. This way the class Scan does not need to be aware of
+        CudaNdarray; it just constructs any float32 tensor using this
+        function (which by default constructs normal tensors). Note that the
+        second assumption in this code is that any float32 output or input
+        will be moved to the GPU if the optimization gets applied (following
+        Theano's philosophy of moving as much as possible to the GPU).
        """
        # adding properties into self
        self.inputs = inputs
@@ -67,29 +84,57 @@ class Scan(PureOp):
        self.output_types = []
        idx = 0
        jdx = 0
+        tensorConstructor = lambda broadcastable, dtype: TensorType(
+            broadcastable=broadcastable, dtype=dtype)
        if typeConstructor is None:
-            typeConstructor = lambda broadcastable, dtype: TensorType(
+            typeConstructor = tensorConstructor
-                broadcastable=broadcastable, dtype=dtype)
        while idx < self.n_mit_mot_outs:
            # Not that for mit_mot there are several output slices per
            # output sequence
            o = outputs[idx]
-            self.output_types.append(
+            # Scan assumes that only variables of dtype float32 might need a
-                typeConstructor(
+            # special constructor (i.e. CudaNdarray constructor) when the
-                    broadcastable=(False,) + o.type.broadcastable,
+            # code is running on GPU, as it is the only dtype supported by
-                    dtype=o.type.dtype)
+            # Theano so far. Therefore only for dtype float32 we use the passed
-                        )
+            # type constructor ``typeConstructor``. For anything else we
+            # know that even if we run it on the GPU we still construct
+            # normal Theano tensors.
+            if o.type.dtype in ['float32']:
+                self.output_types.append(
+                    typeConstructor(
+                        broadcastable=(False,) + o.type.broadcastable,
+                        dtype=o.type.dtype))
+            else:
+                self.output_types.append(
+                    tensorConstructor(
+                        broadcastable=(False,) + o.type.broadcastable,
+                        dtype=o.type.dtype))
            idx += len(self.mit_mot_out_slices[jdx])
            jdx += 1
        # mit_sot / sit_sot / nit_sot
        end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
        for o in outputs[idx:end]:
-            self.output_types.append(
+            # Scan assumes that only variables of dtype float32 might need a
-                typeConstructor(
+            # special constructor (i.e. CudaNdarray constructor) when the
-                    broadcastable=(False,) + o.type.broadcastable,
+            # code is running on GPU, as it is the only dtype supported by
-                    dtype=o.type.dtype))
+            # Theano so far. Therefore only for dtype float32 we use the passed
+            # type constructor ``typeConstructor``. For anything else we
+            # know that even if we run it on the GPU we still construct
+            # normal Theano tensors.
+            if o.type.dtype in ['float32']:
+                self.output_types.append(
+                    typeConstructor(
+                        broadcastable=(False,) + o.type.broadcastable,
+                        dtype=o.type.dtype))
+            else:
+                self.output_types.append(
+                    tensorConstructor(
+                        broadcastable=(False,) + o.type.broadcastable,
+                        dtype=o.type.dtype))
        # shared outputs + possibly the ending condition
        for o in outputs[end:]:
            self.output_types.append(o.type)
@@ -572,7 +617,7 @@ class Scan(PureOp):
                        cython_destroy_map,
                        args,
                        outs,
-                        self)
+                        self, node)
        except (ImportError, theano.gof.cmodule.MissingGXX):
            p = self.execute
        # default arguments are stored in the closure of `rval`
@@ -757,8 +802,6 @@ class Scan(PureOp):
            Y sequence outputs y_1, y_2, ... y_<self.n_outs>
        """
-        # In order to be able to allocate cuda ndarrays if needed
-        from theano.sandbox import cuda
        # 1. Unzip the number of steps and sequences. If number of steps is
        # negative flip sequences around, and make n_steps positive
        t0_call = time.time()
@@ -949,14 +992,10 @@ class Scan(PureOp):
                        self.vector_outs[j] = True
                    dtype = output_storage[jout].storage[0].dtype
                    if (outs[j][0] is None or
-                        outs[j][0].shape[0] < store_steps[j] or
+                            outs[j][0].shape[0] < store_steps[j] or
-                        outs[j][0].shape[1:] != shape[1:] or
+                            outs[j][0].shape[1:] != shape[1:] or
-                        outs[j][0].dtype != dtype):
+                            outs[j][0].dtype != dtype):
-                        if self.gpu:
+                        outs[j][0] = node.outputs[j].type.value_zeros(shape)
-                            _cuda = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray
-                            outs[j][0] = _cuda.zeros(shape)
-                        else:
-                            outs[j][0] = numpy.zeros(shape, dtype)
                    elif outs[j][0].shape[0] != store_steps[j]:
                        outs[j][0] = outs[j][0][:store_steps[j]]
                    outs[j][0][pos[j]] = output_storage[jout].storage[0]
@@ -994,24 +1033,14 @@ class Scan(PureOp):
                    # This way, there will be no information overwritten
                    # before it is read (as it used to happen).
                    shape = (pdx,) + outs[idx][0].shape[1:]
-                    if cuda.cuda_available and isinstance(outs[idx][0],
+                    tmp = node.outputs[idx].type.value_zeros(shape)
-                                                          cuda.CudaNdarray):
-                        _cuda = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray
-                        tmp = _cuda.zeros(shape)
-                    else:
-                        tmp = numpy.empty(shape)
                    tmp[:] = outs[idx][0][:pdx]
                    outs[idx][0][:store_steps[idx] - pdx] = outs[idx][0][pdx:]
                    outs[idx][0][store_steps[idx] - pdx:] = tmp
                    del tmp
                else:
                    shape = (store_steps[idx] - pdx,) + outs[idx][0].shape[1:]
-                    if cuda.cuda_available and isinstance(outs[idx][0],
+                    tmp = node.outputs[idx].type.value_zeros(shape)
-                                                          cuda.CudaNdarray):
-                        _cuda = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray
-                        tmp = _cuda.zeros(shape)
-                    else:
-                        tmp = numpy.empty(shape)
                    tmp[:] = outs[idx][0][pdx:]
                    outs[idx][0][store_steps[idx] - pdx:] = outs[idx][0][:pdx]
                    outs[idx][0][:store_steps[idx] - pdx] = tmp

--- a/theano/scan_module/scan_perform.c.txt
+++ b/theano/scan_module/scan_perform.c.txt
--- a/theano/scan_module/scan_perform.pyx
+++ b/theano/scan_module/scan_perform.pyx
@@ -59,11 +59,10 @@ cimport numpy
 from theano import gof
 import time
 import copy
-from theano.sandbox import cuda
 def get_version():
-    return 0.276
+    return 0.278
 @cython.boundscheck(False)
 def perform(
@@ -88,7 +87,8 @@ def perform(
            numpy.ndarray[numpy.int32_t,ndim=1] destroy_map,
            args,
            outs,
-            self):
+            self,
+            node):
    """
    Parameters
    ----------
@@ -383,10 +383,7 @@ def perform(
                        outs[j][0].shape[0] < store_steps[j] or
                        outs[j][0].shape[1:] != shape[1:] or
                        outs[j][0].dtype != dtype ):
-                    if self.gpu:
+                    outs[j][0] = node.outputs[j].type.value_zeros(shape)
-                        outs[j][0] = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
-                    else:
-                        outs[j][0] = numpy.zeros(shape, dtype)
                elif outs[j][0].shape[0] != store_steps[j]:
                    outs[j][0] = outs[j][0][:store_steps[j]]
                outs[j][0][pos[j]] = output_storage[jout].storage[0]
@@ -426,22 +423,13 @@ def perform(
                # before it is read (as it used to happen).
                shape = (pdx,)+ outs[idx][0].shape[1:]
-                if cuda.cuda_available and isinstance( outs[idx][0],
+                tmp = node.outputs[idx].type.value_zeros(shape)
-                                                      cuda.CudaNdarray):
-                    tmp = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
-                else:
-                    tmp = numpy.empty(shape, outs[idx][0].dtype)
                tmp[:] = outs[idx][0][:pdx]
                outs[idx][0][:store_steps[idx]-pdx] = outs[idx][0][pdx:]
                outs[idx][0][store_steps[idx]-pdx:] = tmp
            else:
                shape = (store_steps[idx]-pdx,) + outs[idx][0].shape[1:]
+                tmp = node.outputs[idx].type.value_zeros(shape)
-                if cuda.cuda_available and isinstance( outs[idx][0],
-                                                      cuda.CudaNdarray):
-                    tmp = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
-                else:
-                    tmp = numpy.empty(shape, outs[idx][0].dtype)
                tmp[:] = outs[idx][0][pdx:]
                outs[idx][0][store_steps[idx]-pdx:] = outs[idx][0][:pdx]
                outs[idx][0][:store_steps[idx]-pdx] = tmp

--- a/theano/scan_module/scan_perform_ext.py
+++ b/theano/scan_module/scan_perform_ext.py
@@ -14,7 +14,7 @@ logging.basicConfig(level=logging.DEBUG)
 if config.compiledir not in sys.path:
    sys.path.append(config.compiledir)
-version = 0.276  # must match constant returned in function get_version()
+version = 0.278  # must match constant returned in function get_version()
 need_reload = False
 try:

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -504,6 +504,64 @@ class T_Scan(unittest.TestCase):
        assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
                        for node in scan_node_topo])
+    # This third test checks that scan can deal with a mixture of dtypes as
+    # outputs when it is running on the GPU
+    def test_gpu3_mixture_dtype_outputs(self):
+        from theano.sandbox import cuda
+        if cuda.cuda_available == False:
+            raise SkipTest('Optional package cuda disabled')
+        def f_rnn(u_t, x_tm1, W_in, W):
+            return (u_t * W_in + x_tm1 * W,
+                    tensor.cast(u_t+x_tm1, 'int64'))
+        u = theano.tensor.fvector('u')
+        x0 = theano.tensor.fscalar('x0')
+        W_in = theano.tensor.fscalar('win')
+        W = theano.tensor.fscalar('w')
+        output, updates = theano.scan(f_rnn,
+                                      u,
+                                      [x0, None],
+                                      [W_in, W],
+                                      n_steps=None,
+                                      truncate_gradient=-1,
+                                      go_backwards=False,
+                                      mode=mode_with_gpu)
+        f2 = theano.function([u, x0, W_in, W],
+                             output,
+                             updates=updates,
+                             allow_input_downcast=True,
+                             mode=mode_with_gpu)
+        # get random initial values
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        v_u = rng.uniform(size=(4,), low=-5., high=5.)
+        v_x0 = rng.uniform()
+        W = rng.uniform()
+        W_in = rng.uniform()
+        # compute the output in numpy
+        v_out1 = numpy.zeros((4,))
+        v_out2 = numpy.zeros((4,), dtype='int64')
+        v_out1[0] = v_u[0] * W_in + v_x0 * W
+        v_out2[0] = v_u[0] + v_x0
+        for step in xrange(1, 4):
+            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
+            v_out2[step] = numpy.int64(v_u[step] + v_out1[step - 1])
+        theano_out1, theano_out2 = f2(v_u, v_x0, W_in, W)
+        assert numpy.allclose(theano_out1, v_out1)
+        assert numpy.allclose(theano_out2, v_out2)
+        topo = f2.maker.fgraph.toposort()
+        scan_node = [node for node in topo
+                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
+        assert len(scan_node) == 1
+        scan_node = scan_node[0]
+        assert scan_node.op.gpu
    # simple rnn, one input, one state, weights for each; input/state
    # are vectors, weights are scalars; using shared variables
    def test_one_sequence_one_output_weights_shared(self):