提交 500601a2 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1807 from abergeron/gpuarray_scan

Make scan work with new backend.
...@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node): ...@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node):
gpu_from_host(diag_node.inputs[0]))] gpu_from_host(diag_node.inputs[0]))]
return False return False
def typeConstructor(broadcastable, dtype):
    """Build the output type for scan: CudaNdarrayType for float32,
    a plain TensorType for every other dtype."""
    if dtype != 'float32':
        return tensor.TensorType(broadcastable=broadcastable, dtype=dtype)
    return CudaNdarrayType(broadcastable=broadcastable)
@register_opt('scan') @register_opt('scan')
@local_optimizer([gpu_from_host, scan_op.Scan]) @local_optimizer([gpu_from_host, scan_op.Scan])
...@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node): ...@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable=broadcastable)
nw_op = scan_op.Scan(scan_ins, nw_op = scan_op.Scan(scan_ins,
scan_outs, scan_outs,
info, info,
...@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node): ...@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
def typeConstructor(broadcastable, dtype):
assert dtype == 'float32'
return CudaNdarrayType(broadcastable=broadcastable)
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins, scan_ins,
scan_outs, scan_outs,
...@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node): ...@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node):
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType, scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
gpu_flag=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
......
...@@ -161,7 +161,7 @@ class HostFromGpu(Op): ...@@ -161,7 +161,7 @@ class HostFromGpu(Op):
raise TypeError(x) raise TypeError(x)
return Apply(self, [x], return Apply(self, [x],
[tensor.TensorType(dtype=x.dtype, [tensor.TensorType(dtype=x.dtype,
broadcastable=x.broadcastable,)()]) broadcastable=x.broadcastable)()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, = inp x, = inp
......
import copy import copy
import theano import theano
import numpy import numpy
from theano import tensor, scalar from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB, SequenceDB, ProxyDB,
Optimizer, toolbox, Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, from theano.sandbox.gpuarray.basic_ops import (
gpu_from_host, host_from_gpu, gpu_from_host, HostFromGpu,
gpu_alloc, gpu_alloc, GpuAlloc, GpuReshape, GpuEye
GpuAlloc, )
GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv from theano.sandbox.gpuarray.conv import GpuConv
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from theano.sandbox.gpuarray.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuSoftmaxWithBias, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax
)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda) GpuDimShuffle, GpuCAReduceCuda)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
...@@ -54,6 +55,20 @@ def register_opt(*tags, **kwargs): ...@@ -54,6 +55,20 @@ def register_opt(*tags, **kwargs):
register_opt()(theano.tensor.opt.local_track_shape_i) register_opt()(theano.tensor.opt.local_track_shape_i)
def safe_to_gpu(x):
    """Transfer x to the GPU when it is a host TensorType; otherwise
    return it unchanged."""
    if not isinstance(x.type, tensor.TensorType):
        return x
    return gpu_from_host(x)
def safe_to_cpu(x):
    """Transfer x back to the host when it is a GpuArrayType; otherwise
    return it unchanged."""
    if not isinstance(x.type, GpuArrayType):
        return x
    return host_from_gpu(x)
def op_lifter(OP): def op_lifter(OP):
""" """
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...)) OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
...@@ -73,10 +88,10 @@ def op_lifter(OP): ...@@ -73,10 +88,10 @@ def op_lifter(OP):
# This is needed as sometimes new_op inherit from OP. # This is needed as sometimes new_op inherit from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
return [host_from_gpu(o) for o in return [safe_to_cpu(o) for o in
new_op(*node.inputs, return_list=True)] new_op(*node.inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)): elif isinstance(new_op, (tuple, list)):
return [host_from_gpu(o) for o in new_op] return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)] return [host_from_gpu(new_op)]
return False return False
...@@ -132,7 +147,17 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -132,7 +147,17 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt() @register_opt()
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpualloc(node): def local_gpualloc(node):
return gpu_alloc new_out = gpu_alloc(*node.inputs)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if new_out.broadcastable != node.outputs[0].broadcastable:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for b_old, b_new in zip(node.outputs[0].broadcastable,
new_out.broadcastable):
assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out,
node.outputs[0].broadcastable)
return (new_out,)
@register_opt() @register_opt()
...@@ -158,6 +183,13 @@ def local_gpureshape(node): ...@@ -158,6 +183,13 @@ def local_gpureshape(node):
return res return res
@register_opt()
@op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node):
    """Apply Rebroadcast directly on the GPU variable when its input is a
    host transfer of a GPU value."""
    host_input = node.inputs[0]
    if not isinstance(host_input.owner.op, HostFromGpu):
        return
    return node.op(host_input.owner.inputs[0])
@register_opt() @register_opt()
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node): def local_gpuflatten(node):
...@@ -176,8 +208,6 @@ def local_gpuflatten(node): ...@@ -176,8 +208,6 @@ def local_gpuflatten(node):
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
name = op.name name = op.name
if node.outputs[0].ndim == 0:
return
if name: if name:
name = 'Gpu'+name name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name, res = GpuElemwise(op.scalar_op, name=name,
...@@ -432,3 +462,97 @@ def local_gpu_conv(node): ...@@ -432,3 +462,97 @@ def local_gpu_conv(node):
out = gpu_from_host(out) out = gpu_from_host(out)
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
def tensor_to_gpu(x):
    """Return a fresh GpuArrayType variable mirroring the host tensor x
    (same dtype and broadcastable pattern); non-tensor variables pass
    through untouched."""
    if not isinstance(x.type, tensor.TensorType):
        return x
    gpu_var = GpuArrayType(broadcastable=x.type.broadcastable,
                           dtype=x.type.dtype)()
    if x.name:
        gpu_var.name = x.name + '[Gpua]'
    return gpu_var
def gpu_safe_new(x, tag=''):
    """Construct a new variable with the same type as *x* but a tagged
    name (old name + tag).

    Used when rebuilding the inner scan graph so the clone does not
    interfere with the original graph. Constants are cloned as-is.
    """
    new_name = None
    if getattr(x, 'name', None) is not None:
        new_name = x.name + tag
    if isinstance(x, theano.Constant):
        # Constants carry their value; a clone is enough.
        return x.clone()
    new_var = x.type()
    new_var.name = new_name
    return new_var
def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """Clone (inputs, outputs) after replacing every input by a fresh
    variable of the same type (name suffixed with *tag*).

    Returns the new inputs (same order as the originals) and the cloned
    outputs expressed on them.
    """
    tag = tag or ''
    new_inputs = [gpu_safe_new(x, tag) for x in inputs]
    replacements = dict(zip(inputs, new_inputs))
    new_outputs = scan_utils.clone(outputs, replace=replacements)
    return (new_inputs, new_outputs)
@register_opt('scan')
@op_lifter([scan_op.Scan])
def local_scan_to_gpua(node):
    # Lift a CPU Scan onto the new gpuarray back-end: transfer the outer
    # inputs, rebuild the inner graph on GPU types, and re-create the op
    # with GpuArrayType as the output-type constructor.
    info = copy.deepcopy(node.op.info)
    info['gpua'] = True
    # node.inputs[0] (presumably the step count) is kept on the host.
    nw_ins = [node.inputs[0]]
    e = (1 +
         node.op.n_seqs +
         node.op.n_mit_mot +
         node.op.n_mit_sot +
         node.op.n_sit_sot +
         node.op.n_shared_outs)
    # Sequences, tap initial states and shared outputs go to the GPU.
    nw_ins += [safe_to_gpu(x) for x in node.inputs[1:e]]
    b = e
    e = e + node.op.n_nit_sot
    # nit_sot inputs are left on the host (they are not transferred).
    nw_ins += node.inputs[b:e]
    nw_ins += [safe_to_gpu(x) for x in node.inputs[e:]]
    # Rebuild the inner graph: GPU-typed inner inputs, and inner outputs
    # re-expressed on host views of those inputs.
    scan_ins = [tensor_to_gpu(x) for x in node.op.inputs]
    scan_outs = [safe_to_gpu(x) for x in node.op.outputs]
    scan_outs = scan_utils.clone(
        scan_outs,
        replace=zip(node.op.inputs,
                    [safe_to_cpu(x) for x in scan_ins]))
    # We need to construct the hash here, because scan
    # __init__ does not know about the gpu and can not
    # handle graphs with inputs being on the gpu
    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
    local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
    info['gpu_hash'] = hash(_cmodule_key)
    nw_op = scan_op.Scan(scan_ins, scan_outs, info,
                         typeConstructor=GpuArrayType).make_node(*nw_ins)
    return nw_op.outputs
# Register the inplace optimization for gpuarray scans. It runs at
# priority 75 and is tagged so it is picked up in fast_run mode for the
# 'gpua' back-end only (gpua_flag selects the matching scan nodes).
optdb.register('gpua_scanOp_make_inplace',
               scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType,
                                             gpua_flag=True),
               75,
               'gpua',
               'fast_run',
               'inplace',
               'scan')
...@@ -7,6 +7,7 @@ import theano ...@@ -7,6 +7,7 @@ import theano
from theano import tensor, gof from theano import tensor, gof
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
......
import numpy import numpy
import theano import theano
from theano import tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda
import theano.sandbox.gpuarray from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu
)
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
# Skip the whole test module when pygpu is not installed.
if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")

# If the old cuda back-end is available but pygpu was not activated,
# initialize the device through the old back-end first, then hand the
# cuda context over to the gpuarray back-end.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
    if not cuda_ndarray.use.device_number:
        cuda_ndarray.use('gpu')
    theano.sandbox.gpuarray.init_dev('cuda')

if not theano.sandbox.gpuarray.pygpu_activated:
    raise SkipTest("pygpu disabled")

# Compilation modes used by the tests: with the gpuarray optimizations
# enabled (and the old 'gpu' tag excluded), and without them.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def test_flatten(): def test_flatten():
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], m.flatten(), mode=mode_with_gpu) f = theano.function([m], m.flatten(), mode=mode_with_gpu)
...@@ -104,3 +89,20 @@ def test_local_gpualloc_memset_0(): ...@@ -104,3 +89,20 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0 assert not topo[0].op.memset_0
assert (numpy.asarray(f(2)) == 1).all() assert (numpy.asarray(f(2)) == 1).all()
def test_rebroadcast():
    """Rebroadcast should operate directly on GPU-typed variables."""
    data = numpy.random.rand(10, 10).astype('float32')
    v = theano.tensor.fmatrix()
    up = tensor.unbroadcast(v.sum().dimshuffle('x', 'x'), 0, 1)
    f = theano.function([v], [up], mode=mode_with_gpu)
    f(data)
    nodes = f.maker.fgraph.toposort()
    rebroadcasts = [n for n in nodes
                    if isinstance(n.op, tensor.Rebroadcast)]
    assert len(rebroadcasts) == 1
    node = rebroadcasts[0]
    # Both sides of the Rebroadcast must live on the GPU.
    assert isinstance(node.inputs[0].type, GpuArrayType)
    assert isinstance(node.outputs[0].type, GpuArrayType)
from unittest import TestCase
import numpy
import theano
from theano.tests import unittest_tools as utt
import theano.sandbox.rng_mrg
from theano.sandbox.gpuarray.basic_ops import (
gpu_from_host, GpuFromHost, HostFromGpu
)
from theano.sandbox.gpuarray.elemwise import GpuElemwise
from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
class T_Scan(TestCase):
    """Tests that theano.scan is lifted to the new gpuarray back-end and
    that no host<->GPU transfers remain inside the inner loop."""

    def setUp(self):
        utt.seed_rng()

    def test_one_sequence_one_output_weights_gpu1(self):
        # Simple RNN whose outer output is explicitly moved to the GPU;
        # after optimization no HostFromGpu should remain in the graph.
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        # InputToGpuOptimizer is excluded so inputs start on the host.
        mode = mode_with_gpu.excluding('InputToGpuOptimizer')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      x0,
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode)

        output = gpu_from_host(output)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode)

        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        v_u = numpy.asarray(v_u, dtype='float32')
        v_x0 = numpy.asarray(v_x0, dtype='float32')
        W = numpy.asarray(W, dtype='float32')
        W_in = numpy.asarray(W_in, dtype='float32')

        # compute the output in numpy
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in xrange(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        theano_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(theano_values, v_out)

        # TO DEL
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]

        topo = f2.maker.fgraph.toposort()
        # Output stays on the GPU, so no transfer back to the host; the
        # four inputs are each transferred once.
        assert sum([isinstance(node.op, HostFromGpu)
                    for node in topo]) == 0
        assert sum([isinstance(node.op, GpuFromHost)
                    for node in topo]) == 4

        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any([isinstance(node.op, GpuElemwise)
                    for node in scan_node_topo])
        assert not any([isinstance(node.op, HostFromGpu)
                        for node in scan_node_topo])
        assert not any([isinstance(node.op, GpuFromHost)
                        for node in scan_node_topo])

    # This second version test the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      x0,
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode_with_gpu)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode_with_gpu)

        # get random initial values
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in xrange(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        theano_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(theano_values, v_out)

        topo = f2.maker.fgraph.toposort()
        # Here the output returns to the host, so exactly one HostFromGpu.
        assert sum([isinstance(node.op, HostFromGpu)
                    for node in topo]) == 1
        assert sum([isinstance(node.op, GpuFromHost)
                    for node in topo]) == 4

        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any([isinstance(node.op, GpuElemwise)
                    for node in scan_node_topo])
        assert not any([isinstance(node.op, HostFromGpu)
                        for node in scan_node_topo])
        assert not any([isinstance(node.op, GpuFromHost)
                        for node in scan_node_topo])

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W,
                    theano.tensor.cast(u_t + x_tm1, 'int64'))

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      [x0, None],
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode_with_gpu)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode_with_gpu)

        # get random initial values
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out1 = numpy.zeros((4,))
        v_out2 = numpy.zeros((4,), dtype='int64')
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in xrange(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = numpy.int64(v_u[step] + v_out1[step - 1])

        theano_out1, theano_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(theano_out1, v_out1)
        utt.assert_allclose(theano_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        # The scan must have been lifted to the gpuarray back-end.
        assert scan_node.op.gpua

        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert not any([isinstance(node.op, HostFromGpu)
                        for node in scan_node_topo])
        assert not any([isinstance(node.op, GpuFromHost)
                        for node in scan_node_topo])

    def test_gpu4_gibbs_chain(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_vsample = numpy.array(rng.binomial(1, .5, size=(3, 20),),
                                dtype='float32')
        vsample = theano.shared(v_vsample)
        trng = theano.sandbox.rng_mrg.MRG_RandomStreams(
            utt.fetch_seed())

        def f(vsample_tm1):
            return trng.binomial(vsample_tm1.shape, n=1, p=0.3,
                                 dtype='float32') * vsample_tm1

        theano_vsamples, updates = theano.scan(f,
                                               [],
                                               vsample,
                                               [],
                                               n_steps=10,
                                               truncate_gradient=-1,
                                               go_backwards=False,
                                               mode=mode_with_gpu)
        my_f = theano.function([],
                               theano_vsamples[-1],
                               updates=updates,
                               allow_input_downcast=True,
                               mode=mode_with_gpu)

        # I leave this to tested by debugmode, this test was anyway
        # more of does the graph compile kind of test
        t_result = my_f()
...@@ -56,23 +56,24 @@ class Scan(PureOp): ...@@ -56,23 +56,24 @@ class Scan(PureOp):
the scan op (like number of different types of the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or arguments, name, mode, if it should run on GPU or
not, etc.) not, etc.)
:param typeConstructor: function that constructs a Theano TensorType :param typeConstructor: function that constructs an equivalent
able to represent a float32 ndarray. to Theano TensorType
Note: ``typeConstructor`` had been added to refactor how Theano
deals with the GPU. If it runs on the GPU, scan needs to construct Note: ``typeConstructor`` had been added to refactor how
certain outputs (those who reside in the GPU memory) as CudaNdarray. Theano deals with the GPU. If it runs on the GPU, scan needs
However we can not import cuda in this file (as it is in sandbox, to construct certain outputs (those who reside in the GPU
and not available on each machine) so the workaround is that the GPU memory) as the GPU-specific type. However we can not import
optimization (which is aware of cuda types) passes to the gpu code in this file (as it is in sandbox, and not available
constructor of this class a function that is able to construct on each machine) so the workaround is that the GPU
CudaNdarray. This way the class Scan does not need to be aware of optimization passes to the constructor of this class a
CudaNdarray, it just constructs any float32 tensor using this function that is able to construct a GPU type. This way the
function (which by default constructs normal tensors). Note that the class Scan does not need to be aware of the details for the
second assumption in this code is that any float32 output or input GPU, it just constructs any tensor using this function (which
will be moved on the GPU if the optimization gets applied (following by default constructs normal tensors).
Theano's philosophy of moving as much as possible on gpu).
""" """
if 'gpua' not in info:
info['gpua'] = False
# adding properties into self # adding properties into self
self.inputs = inputs self.inputs = inputs
self.outputs = outputs self.outputs = outputs
...@@ -95,23 +96,10 @@ class Scan(PureOp): ...@@ -95,23 +96,10 @@ class Scan(PureOp):
# Not that for mit_mot there are several output slices per # Not that for mit_mot there are several output slices per
# output sequence # output sequence
o = outputs[idx] o = outputs[idx]
# Scan assumes that only variables of dtype float32 might need a self.output_types.append(
# special constructor (i.e. CudaNdarray constructor) when the typeConstructor(
# code is running on GPU, as it is the only type supported by broadcastable=(False,) + o.type.broadcastable,
# Theano yet. Therefore only for dtype float32 we use the passed dtype=o.type.dtype))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
idx += len(self.mit_mot_out_slices[jdx]) idx += len(self.mit_mot_out_slices[jdx])
jdx += 1 jdx += 1
...@@ -120,23 +108,11 @@ class Scan(PureOp): ...@@ -120,23 +108,11 @@ class Scan(PureOp):
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
for o in outputs[idx:end]: for o in outputs[idx:end]:
# Scan assumes that only variables of dtype float32 might need a self.output_types.append(
# special constructor (i.e. CudaNdarray constructor) when the typeConstructor(
# code is running on GPU, as it is the only type supported by broadcastable=(False,) + o.type.broadcastable,
# Theano yet. Therefore only for dtype float32 we use the passed dtype=o.type.dtype))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
# shared outputs + possibly the ending condition # shared outputs + possibly the ending condition
for o in outputs[end:]: for o in outputs[end:]:
self.output_types.append(o.type) self.output_types.append(o.type)
...@@ -182,14 +158,14 @@ class Scan(PureOp): ...@@ -182,14 +158,14 @@ class Scan(PureOp):
self.n_shared_outs) self.n_shared_outs)
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
if not self.info['gpu']: if self.info['gpu'] or self.info['gpua']:
self._hash_inner_graph = self.info['gpu_hash']
else:
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs, tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
self.outputs) self.outputs)
local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False) local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
self._hash_inner_graph = hash(self._cmodule_key) self._hash_inner_graph = hash(self._cmodule_key)
else:
self._hash_inner_graph = self.info['gpu_hash']
def make_node(self, *inputs): def make_node(self, *inputs):
""" """
......
...@@ -537,10 +537,11 @@ class PushOutSeqScan(gof.Optimizer): ...@@ -537,10 +537,11 @@ class PushOutSeqScan(gof.Optimizer):
class ScanInplaceOptimizer(Optimizer): class ScanInplaceOptimizer(Optimizer):
"""Graph optimizer for Scan(makes it run inplace)""" """Graph optimizer for Scan(makes it run inplace)"""
def __init__(self, typeConstructor=None, gpu_flag=False): def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False):
Optimizer.__init__(self) Optimizer.__init__(self)
self.typeConstructor = typeConstructor self.typeConstructor = typeConstructor
self.gpu_flag = gpu_flag self.gpu_flag = gpu_flag
self.gpua_flag = gpua_flag
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
...@@ -551,7 +552,8 @@ class ScanInplaceOptimizer(Optimizer): ...@@ -551,7 +552,8 @@ class ScanInplaceOptimizer(Optimizer):
nodes = fgraph.toposort() nodes = fgraph.toposort()
scan_nodes = [x for x in nodes scan_nodes = [x for x in nodes
if (isinstance(x.op, scan_op.Scan) and if (isinstance(x.op, scan_op.Scan) and
x.op.info['gpu'] == self.gpu_flag)] x.op.info['gpu'] == self.gpu_flag and
x.op.info['gpua'] == self.gpua_flag)]
for scan_idx in xrange(len(scan_nodes)): for scan_idx in xrange(len(scan_nodes)):
node = scan_nodes[scan_idx] node = scan_nodes[scan_idx]
op = node.op op = node.op
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论