提交 e836fef3 authored 作者: Razvan Pascanu's avatar Razvan Pascanu

merge; no conflicts

...@@ -4,6 +4,7 @@ _logger = logging.getLogger('theano.sandbox.cuda.opt') ...@@ -4,6 +4,7 @@ _logger = logging.getLogger('theano.sandbox.cuda.opt')
import sys import sys
import theano import theano
import numpy import numpy
from theano.scan_module import scan_utils, scan_op
from theano import scalar as scal from theano import scalar as scal
from theano import tensor, compile, gof from theano import tensor, compile, gof
...@@ -1030,3 +1031,226 @@ def local_gpualloc(node): ...@@ -1030,3 +1031,226 @@ def local_gpualloc(node):
#if old_out.type != new_out.type: #if old_out.type != new_out.type:
#import pdb; pdb.set_trace() #import pdb; pdb.set_trace()
return [new_out] return [new_out]
def safe_to_gpu(x):
    """Move ``x`` to the GPU when it is a float32 tensor; any other
    variable is returned untouched."""
    is_f32_tensor = (isinstance(x.type, tensor.TensorType)
                     and x.type.dtype == 'float32')
    return gpu_from_host(x) if is_f32_tensor else x
def safe_to_cpu(x):
    """Bring ``x`` back to the host when it lives on the GPU; any other
    variable is returned untouched."""
    if not isinstance(x.type, CudaNdarrayType):
        return x
    return host_from_gpu(x)
def gpu_safe_new(x, tag=''):
    """
    Build a fresh variable of the same type as ``x``, named
    ``x.name + tag`` (or unnamed when ``x`` has no name).  Constants are
    simply cloned.  Used by gradient / R-op code to create new inputs
    for the inner graph so the freshly built graph does not interfere
    with the original one.
    """
    new_name = None
    if getattr(x, 'name', None) is not None:
        new_name = x.name + tag
    if isinstance(x, theano.Constant):
        return x.clone()
    fresh = x.type()
    fresh.name = new_name
    return fresh
def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """
    Clone ``outputs`` while replacing every variable of ``inputs`` by a
    brand new variable of the same type (name suffixed with ``tag``).
    Returns the new inputs, in the same order as the originals, together
    with the cloned outputs.
    """
    if tag is None:
        tag = ''
    fresh_inputs = [gpu_safe_new(inp, tag) for inp in inputs]
    replacements = dict(zip(inputs, fresh_inputs))
    fresh_outputs = scan_utils.clone(outputs, replace=replacements)
    return (fresh_inputs, fresh_outputs)
def tensor_to_cuda(x):
    """
    For a float32 tensor, return a fresh CudaNdarray variable with the
    same broadcastable pattern, named ``<name>[cuda]`` when ``x`` has a
    name.  Anything else is returned unchanged.
    """
    if not (isinstance(x.type, tensor.TensorType)
            and x.type.dtype == 'float32'):
        return x
    cuda_var = CudaNdarrayType(broadcastable=x.type.broadcastable)()
    if x.name:
        cuda_var.name = x.name + '[cuda]'
    return cuda_var
def _gpu_scan_outputs(thescan, inputs):
    """Build a GPU version of the scan op ``thescan`` applied to the
    outer ``inputs`` and return the outputs of the new node.

    Float32 outer inputs are moved to the GPU (except the nit_sot
    entries, which are shapes and stay on the host), the inner graph is
    rebuilt on CudaNdarray variables, and the hash of the new inner
    graph is computed here because scan's ``__init__`` does not know
    about cuda ndarrays and can not handle graphs whose inputs are
    Cuda Ndarrays.
    """
    info = thescan.info.copy()
    info['gpu'] = True
    # inputs[0] is n_steps; it stays on the host.
    nw_ins = [inputs[0]]
    e = (1 + thescan.n_seqs
         + thescan.n_mit_mot
         + thescan.n_mit_sot
         + thescan.n_sit_sot
         + thescan.n_shared_outs)
    # Sequences, tap initial states and shared outs go to the GPU ...
    nw_ins += [safe_to_gpu(x) for x in inputs[1:e]]
    b = e
    e = e + thescan.n_nit_sot
    # ... the nit_sot inputs (requested output lengths) stay as is ...
    nw_ins += inputs[b:e]
    # ... and the remaining non-sequences go to the GPU as well.
    nw_ins += [safe_to_gpu(x) for x in inputs[e:]]
    scan_ins = [tensor_to_cuda(x) for x in thescan.inputs]
    scan_outs = [safe_to_gpu(x) for x in thescan.outputs]
    scan_outs = scan_utils.clone(
        scan_outs
        , replace=zip(thescan.inputs,
                      [safe_to_cpu(x) for x in scan_ins]))
    # We need to construct the hash here, because scan
    # __init__ does not know about cuda ndarray and can not
    # handle graphs with inputs being Cuda Ndarrays
    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
    local_env = gof.Env(tmp_in, tmp_out)
    _cmodule_key = gof.CLinker.cmodule_key_(local_env, [])
    info['gpu_hash'] = hash(_cmodule_key)
    typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
        broadcastable=broadcastable)
    return scan_op.Scan(scan_ins
                        , scan_outs
                        , info
                        , typeConstructor=typeConstructor
                        ).make_node(*nw_ins).outputs


@register_opt('scan')
@local_optimizer([])
def gpuScanOptimization(node):
    """
    scan(host_from_gpu) -> host_from_gpu(GPUscan)
    gpu_from_host(scan) -> GPUscan(gpu_from_host)
    """
    # gpu_from_host(scan) -> GPUscan(gpu_from_host)
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, scan_op.Scan) and
            not host_input.owner.op.info['gpu'] and
            len(host_input.owner.outputs) == 1):
            # Note that we are not doing the right thing here !!
            # This is because the local optimizer expects only one
            # output that corresponds to the input of ``node``
            # If we do this for each output separately we will have
            # multiple scan ops in the graph ( as many as outputs )
            # and I'm not sure they will get merged into one again
            # So for now I will just cover a limited case when there
            # is only one output and the local optimizer can be used
            # TODO (fix) : either make sure the different scans get
            #              merged or implement this optimization as a
            #              global optimization
            return _gpu_scan_outputs(host_input.owner.op,
                                     host_input.owner.inputs)
    # scan(host_from_gpu) -> host_from_gpu(GPUscan)
    if (type(node.op) == scan_op.Scan
            and not node.op.info['gpu']):
        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
                      for i in node.inputs]):
            # Move the scan to the GPU and bring its outputs back to
            # the host so the callers of ``node`` still see host data.
            gpu_outs = _gpu_scan_outputs(node.op, node.inputs)
            return [safe_to_cpu(x) for x in gpu_outs]
    return False
@gof.local_optimizer([None])
def gpu_scan_make_inplace(node):
    """Replace a GPU scan node that is not yet inplace by its inplace
    version, duplicating any output buffer that is used more than once
    so the inplace operation stays safe."""
    op = node.op
    if not (isinstance(op, scan_op.Scan)
            and op.info['gpu']
            and not op.info['inplace']):
        return False
    info = op.info.copy()
    info['inplace'] = True
    # Inputs corresponding to n_steps and the sequences.
    head = node.inputs[:1 + op.n_seqs]
    # Outputs whose storage can be reused inplace.
    middle = op.outer_mitmot(node)
    middle += op.outer_mitsot(node)
    middle += op.outer_sitsot(node)
    # Remaining inputs: shared, nit_sot and non-sequences.
    tail = op.outer_shared(node)
    tail += op.outer_nitsot(node)
    tail += op.outer_non_seqs(node)
    # A buffer appearing several times must be copied before being
    # destroyed inplace.
    for pos in xrange(len(middle)):
        if middle[pos] in middle[:pos]:
            middle[pos] = deep_copy_op(middle[pos])
    typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
        broadcastable=broadcastable)
    new_op = scan_op.Scan(op.inputs
                          , op.outputs
                          , info
                          , typeConstructor=typeConstructor)
    return new_op.make_node(*(head + middle + tail)).outputs
# Register the inplace rewrite at priority 75, tagged so it runs under
# the 'gpu', 'fast_run', 'inplace' and 'scan' optimization modes;
# ignore_newtrees avoids re-visiting nodes this pass itself creates.
optdb.register( 'gpu_scanOp_make_inplace'
               , theano.tensor.opt.in2out(gpu_scan_make_inplace,ignore_newtrees=True)
               , 75
               , 'gpu'
               , 'fast_run'
               , 'inplace'
               , 'scan')
...@@ -28,7 +28,7 @@ from theano import gof ...@@ -28,7 +28,7 @@ from theano import gof
from theano.tensor import TensorType from theano.tensor import TensorType
from theano import tensor from theano import tensor
from theano.tensor.opt import Shape_i from theano.tensor.opt import Shape_i
from theano.sandbox import cuda #from theano.sandbox import cuda
from theano.compile.profiling import ScanProfileStats from theano.compile.profiling import ScanProfileStats
import scan_utils import scan_utils
...@@ -46,7 +46,9 @@ class Scan(Op): ...@@ -46,7 +46,9 @@ class Scan(Op):
def __init__( self def __init__( self
, inputs , inputs
, outputs , outputs
, info ): , info
, typeConstructor = None
):
""" """
:param inputs: inputs of the inner function of scan :param inputs: inputs of the inner function of scan
:param outputs: outputs of the inner function of scan :param outputs: outputs of the inner function of scan
...@@ -66,60 +68,31 @@ class Scan(Op): ...@@ -66,60 +68,31 @@ class Scan(Op):
self.output_types = [] self.output_types = []
idx = 0 idx = 0
jdx = 0 jdx = 0
if self.gpu: if typeConstructor is None:
# mit_mot typeConstructor = lambda broadcastable, dtype: TensorType(
while idx < self.n_mit_mot_outs: broadcastable = broadcastable, dtype = dtype)
# Not that for mit_mot there are several output slices per
# output sequence while idx < self.n_mit_mot_outs:
o = outputs[idx] # Not that for mit_mot there are several output slices per
self.output_types.append( # output sequence
cuda.CudaNdarrayType( o = outputs[idx]
broadcastable = (False,) + o.type.broadcastable)) self.output_types.append(
idx += len(self.mit_mot_out_slices[jdx]) typeConstructor( broadcastable = (False,) + o.type.broadcastable
jdx += 1 , dtype = o.type.dtype)
)
# mit_sot / sit_sot / nit_sot idx += len(self.mit_mot_out_slices[jdx])
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot jdx += 1
for o in outputs[idx:end]:
self.output_types.append( # mit_sot / sit_sot / nit_sot
cuda.CudaNdarrayType( broadcastable = (False,) + end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
o.type.broadcastable)) for o in outputs[idx:end]:
# shared outputs self.output_types.append(
for o in outputs[end:]: typeConstructor(
if isinstance(o.type, TensorType): broadcastable = (False,) + o.type.broadcastable
self.output_types.append(cuda.CudaNdarrayType( , dtype = o.type.dtype ))
broadcastable = o.type.broadcastable)) # shared outputs + possibly the ending condition
else: for o in outputs[end:]:
self.output_types.append( o.type ) self.output_types.append( o.type )
else:
while idx < self.n_mit_mot_outs:
# Not that for mit_mot there are several output slices per
# output sequence
o = outputs[idx]
self.output_types.append(
TensorType(
broadcastable = (False,) + o.type.broadcastable
, dtype = o.type.dtype)
)
idx += len(self.mit_mot_out_slices[jdx])
jdx += 1
# mit_sot / sit_sot / nit_sot
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
for o in outputs[idx:end]:
self.output_types.append(
TensorType(
broadcastable = (False,) + o.type.broadcastable
, dtype = o.type.dtype ))
# shared outputs + possibly the ending condition
for o in outputs[end:]:
if cuda.cuda_available and isinstance(o.type,
cuda.CudaNdarrayType):
self.output_types.append( TensorType(
broadcastable = o.type.broadcastable
, dtype = theano.config.floatX) )
else:
self.output_types.append( o.type )
if self.as_while: if self.as_while:
self.output_types = self.output_types[:-1] self.output_types = self.output_types[:-1]
...@@ -168,11 +141,14 @@ class Scan(Op): ...@@ -168,11 +141,14 @@ class Scan(Op):
self.n_shared_outs ) self.n_shared_outs )
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs, if not self.info['gpu']:
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
self.outputs) self.outputs)
local_env = gof.Env(tmp_in, tmp_out) local_env = gof.Env(tmp_in, tmp_out)
self._cmodule_key = gof.CLinker.cmodule_key_(local_env,[]) self._cmodule_key = gof.CLinker.cmodule_key_(local_env,[])
self._hash_inner_graph = hash(self._cmodule_key) self._hash_inner_graph = hash(self._cmodule_key)
else:
self._hash_inner_graph = self.info['gpu_hash']
def make_node(self, *inputs): def make_node(self, *inputs):
...@@ -419,9 +395,9 @@ class Scan(Op): ...@@ -419,9 +395,9 @@ class Scan(Op):
cython_mit_mot_out_slices[_d0,_d1] = \ cython_mit_mot_out_slices[_d0,_d1] = \
self.mit_mot_out_slices[_d0][_d1] self.mit_mot_out_slices[_d0][_d1]
vector_seqs = [ seq.ndim == 1 for seq in vector_seqs = [ seq.ndim == 1 for seq in
self.inputs[1:1+self.n_seqs ] ] node.inputs[1:1+self.n_seqs ] ]
vector_outs = [ arg.ndim ==1 for arg in vector_outs = [ arg.ndim ==1 for arg in
self.inputs[1+self.n_seqs: (1+self.n_seqs + node.inputs[1+self.n_seqs: (1+self.n_seqs +
self.n_outs)] ] self.n_outs)] ]
vector_outs += [ False]*self.n_nit_sot vector_outs += [ False]*self.n_nit_sot
...@@ -610,6 +586,8 @@ class Scan(Op): ...@@ -610,6 +586,8 @@ class Scan(Op):
Y sequence outputs y_1, y_2, ... y_<self.n_outs> Y sequence outputs y_1, y_2, ... y_<self.n_outs>
""" """
# In order to be able to allocate cuda ndarrays if needed
from theano.sandbox import cuda
# 1. Unzip the number of steps and sequences. If number of steps is # 1. Unzip the number of steps and sequences. If number of steps is
# negative flip sequences around, and make n_steps positive # negative flip sequences around, and make n_steps positive
t0_call = time.time() t0_call = time.time()
......
...@@ -289,7 +289,8 @@ optdb.register('scanOp_pushout_nonseqs_ops', ...@@ -289,7 +289,8 @@ optdb.register('scanOp_pushout_nonseqs_ops',
def scan_make_inplace(node): def scan_make_inplace(node):
op = node.op op = node.op
if ( isinstance(op, scan_op.Scan) and if ( isinstance(op, scan_op.Scan) and
(not op.info['inplace']) ): (not op.info['inplace']) and
(not op.info['gpu'])):
info = op.info.copy() info = op.info.copy()
info['inplace'] = True info['inplace'] = True
# inputs corresponding to sequences and n_steps # inputs corresponding to sequences and n_steps
...@@ -1122,122 +1123,4 @@ optdb.register('scanOp_merge_inouts' ...@@ -1122,122 +1123,4 @@ optdb.register('scanOp_merge_inouts'
, 'fast_run' , 'fast_run'
, 'scan') , 'scan')
from theano.sandbox import cuda
if cuda.cuda_available:
from theano.sandbox.cuda.basic_ops import gpu_from_host, host_from_gpu
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.opt import register_opt, local_optimizer
def safe_to_gpu(x):
if (isinstance(x.type, TensorType) and
x.type.dtype == 'float32'):
return gpu_from_host(x)
else:
return x
def safe_to_cpu(x):
if isinstance(x.type, CudaNdarrayType):
return host_from_gpu(x)
else:
return x
def tensor_to_cuda(x):
if (isinstance(x.type, TensorType) and
x.type.dtype == 'float32'):
y = CudaNdarrayType( broadcastable = x.type.broadcastable)()
if x.name :
y.name = x.name +'[cuda]'
return y
else:
return x
@register_opt('scan')
@local_optimizer([])
def gpuScanOptimization(node):
"""
scan(host_from_gpu) -> host_from_gpu(GPUscan)
gpu_from_host(scan) -> GPUscan(gpu_from_host)
"""
#gpu_from_host(scan) -> GPUscan(gpu_from_host)
if node.op == gpu_from_host:
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1 ):
# Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one
# output that corresponds to the input of ``node``
# If we do this for each output seperately we will have
# multiple scan ops in the graph ( as many as outputs )
# and I'm not sure they will get merged into one again
# So for now I will just cover a limited case when there
# is only one output and the local optimizer can be used
# TODO (fix) : either make sure the different scans get
# merged or implement this optimization as a global
# optimization
thescan = host_input.owner.op
info = thescan.info.copy()
info['gpu'] = True
inputs = host_input.owner.inputs
nw_ins = [ inputs[0]]
e = ( 1+ thescan.n_seqs
+ thescan.n_mit_mot
+ thescan.n_mit_sot
+ thescan.n_sit_sot
+ thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ]
b = e
e = e + thescan.n_nit_sot
nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs]
scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ]
scan_outs = scan_utils.clone(
scan_outs
, replace = zip(thescan.inputs,
[safe_to_cpu(x) for x in scan_ins]))
nw_op = scan_op.Scan( scan_ins
, scan_outs
, info).make_node(*nw_ins)
_outputs = nw_op.outputs
return _outputs
#scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan
and not node.op.info['gpu']):
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
thescan = node.op
info = thescan.info.copy()
info['gpu'] = True
inputs = node.inputs
nw_ins = [ inputs[0]]
e = ( 1+ thescan.n_seqs
+ thescan.n_mit_mot
+ thescan.n_mit_sot
+ thescan.n_sit_sot
+ thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ]
b = e
e = e + thescan.n_nit_sot
nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs]
scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ]
scan_outs = scan_utils.clone(
scan_outs
, replace = zip(thescan.inputs
,[safe_to_cpu(x) for x in scan_ins]))
_outputs = scan_op.Scan(
scan_ins
, scan_outs
, info).make_node(*nw_ins).outputs
outputs = [safe_to_cpu(x) for x in _outputs]
return outputs
return False
...@@ -22,7 +22,6 @@ from theano import gof ...@@ -22,7 +22,6 @@ from theano import gof
from theano import tensor, scalar from theano import tensor, scalar
from theano.tensor.basic import get_constant_value from theano.tensor.basic import get_constant_value
from theano.sandbox import cuda
import theano import theano
...@@ -43,8 +42,7 @@ def safe_new(x, tag = ''): ...@@ -43,8 +42,7 @@ def safe_new(x, tag = ''):
nw_name = x.name + tag nw_name = x.name + tag
else: else:
nw_name = None nw_name = None
# Should it be theano.Constant? What is the difference between the two? if isinstance(x, theano.Constant):
if isinstance(x, tensor.Constant):
return x.clone() return x.clone()
# Note, as_tensor_variable will convert the Scalar into a # Note, as_tensor_variable will convert the Scalar into a
# TensorScalar that will require a ScalarFromTensor op, # TensorScalar that will require a ScalarFromTensor op,
...@@ -93,14 +91,11 @@ def traverse(out, x,x_copy, d): ...@@ -93,14 +91,11 @@ def traverse(out, x,x_copy, d):
fine for the main computational graph but confuses things a bit for the fine for the main computational graph but confuses things a bit for the
inner graph of scan ''' inner graph of scan '''
if out == x: if out == x:
d[out] = cuda.gpu_from_host(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
elif out.owner is None: elif out.owner is None:
return d return d
elif (out.owner.op == cuda.host_from_gpu
and out.owner.inputs == [x] ):
d[out] = x_copy
return d
else: else:
for inp in out.owner.inputs: for inp in out.owner.inputs:
d = traverse(inp, x, x_copy, d) d = traverse(inp, x, x_copy, d)
......
...@@ -2282,7 +2282,7 @@ class T_Scan(unittest.TestCase): ...@@ -2282,7 +2282,7 @@ class T_Scan(unittest.TestCase):
return x_t+1, theano.scan_module.until( x_t > 3) return x_t+1, theano.scan_module.until( x_t > 3)
o, _ = theano.scan(lambda_fn, x) o, _ = theano.scan(lambda_fn, x)
f = theano.function([x], o) f = theano.function([x], o)
vx = numpy.zeros((50,)) vx = numpy.zeros((50,), dtype = theano.config.floatX)
vx[23] = 4 vx[23] = 4
out = f(vx) out = f(vx)
assert numpy.sum(out[24:]) == 0 assert numpy.sum(out[24:]) == 0
...@@ -2296,7 +2296,7 @@ class T_Scan(unittest.TestCase): ...@@ -2296,7 +2296,7 @@ class T_Scan(unittest.TestCase):
x) x)
f = theano.function([x], [o,o2]) f = theano.function([x], [o,o2])
vx = numpy.zeros((50,)) vx = numpy.zeros((50,), dtype = theano.config.floatX)
vx[23] = 4 vx[23] = 4
out, out2 = f(vx) out, out2 = f(vx)
assert numpy.sum(out[24:]) == 0 assert numpy.sum(out[24:]) == 0
...@@ -2315,7 +2315,7 @@ class T_Scan(unittest.TestCase): ...@@ -2315,7 +2315,7 @@ class T_Scan(unittest.TestCase):
x) x)
f = theano.function([x], [o,o2]) f = theano.function([x], [o,o2])
vx = numpy.zeros((50,)) vx = numpy.zeros((50,), dtype = theano.config.floatX)
vx[23] = 4 vx[23] = 4
out, out2 = f(vx) out, out2 = f(vx)
assert numpy.sum(out[24:]) == 0 assert numpy.sum(out[24:]) == 0
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论