Fix and add a test for the GPU scan optimization when only the output is moved to the GPU.

9810017d · Frederic Bastien · 21f930ce · 9810017d · 9810017d
--- a/theano/scan_module/__init__.py
+++ b/theano/scan_module/__init__.py
@@ -624,23 +624,22 @@ if cuda.cuda_available:
    @local_optimizer([])
    def gpuScanOptimization(node):
        """
-        gpu_from_host(scan) -> GPUscan(gpu_from_host)
        scan(host_from_gpu) -> host_from_gpu(GPUscan)
+        gpu_from_host(scan) -> GPUscan(gpu_from_host)
        """
+        #gpu_from_host(scan) -> GPUscan(gpu_from_host)
        if node.op == gpu_from_host:
            host_input = node.inputs[0]
-            if ( host_input.owner
+            if (host_input.owner and
-                and host_input.owner.op == scan_op.Scan
+                isinstance(host_input.owner.op, scan_op.Scan) and
-                and not host_input.owner.op.info['gpu']):
+                not host_input.owner.op.info['gpu']):
-                # NOT TESTED!!!!
                thescan = host_input.owner.op
-                inputs = host_input.owner.inputs
-                # I need to cast thescan.inputs to gpuhost stuff
                info = thescan.info.copy()
                info['gpu'] = True
+                inputs = host_input.owner.inputs
                nw_ins = [ inputs[0]]
-                e = ( thescan.n_seqs
+                e = ( 1+ thescan.n_seqs
                     + thescan.n_mit_mot
                     + thescan.n_mit_sot
                     + thescan.n_sit_sot
@@ -649,20 +648,20 @@ if cuda.cuda_available:
                b = e
                e = e + thescan.n_nit_sot + thescan.n_other_ignore
                nw_ins += inputs[b:e]
-                nw_ins += [safe_to_gpu(x) for x in inptus[e:] ]
+                nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
                scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs]
                scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ]
                scan_outs = scan_utils.clone(
                    scan_outs
-                    , replace = zip(thescan.inputs
+                    , replace = zip(thescan.inputs,
-                                    ,[safe_to_cpu(x) for x in  scan_ins]))
+                                    [safe_to_cpu(x) for x in  scan_ins]))
                nw_op = scan_op.Scan( scan_ins
                                     , scan_outs
                                     , info).make_node(*nw_ins)
                _outputs = nw_op.outputs
-                outputs = [safe_to_cpu(x) for x in _outputs]
+                return _outputs
-                return outputs
+        #scan(host_from_gpu) -> host_from_gpu(GPUscan)
        if (type(node.op) == scan_op.Scan
            and not node.op.info['gpu']):
            if numpy.any([(i.owner and i.owner.op == host_from_gpu)
@@ -689,7 +688,6 @@ if cuda.cuda_available:
                    scan_outs
                    , replace = zip(thescan.inputs
                                    ,[safe_to_cpu(x) for x in  scan_ins]))
-                info['gpu'] = True
                _outputs = scan_op.Scan(
                        scan_ins
                        , scan_outs

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -227,7 +227,8 @@ class T_Scan(unittest.TestCase):
        assert numpy.allclose(theano_values, v_out)
    # as test_one_sequence_one_output_weights, but on the gpu
-    def test_one_sequence_one_output_weights_gpu(self):
+    # This first version tests the first case of the GPU scan optimizer.
+    def test_one_sequence_one_output_weights_gpu1(self):
        def f_rnn(u_t,x_tm1,W_in, W):
            return u_t*W_in+x_tm1*W
@@ -235,13 +236,18 @@ class T_Scan(unittest.TestCase):
        x0   = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W    = theano.tensor.fscalar('w')
        mode = theano.compile.mode.get_default_mode().including('gpu')
+        # The following line is needed so that the first case is exercised;
+        # otherwise, it is the second case that is tested.
+        mode = mode.excluding('InputToGpuOptimizer')
        output, updates = theano.scan(f_rnn, u,x0,[W_in,W]
                                      , n_steps           = None
                                      , truncate_gradient = -1
                                      , go_backwards      = False
                                      , mode = mode)
+        output = theano.sandbox.cuda.gpu_from_host(output)
        f2   = theano.function([u,x0,W_in,W], output, updates = updates,
                               allow_input_downcast = True,
                               mode = mode)
@@ -253,6 +259,11 @@ class T_Scan(unittest.TestCase):
        W    = rng.uniform()
        W_in = rng.uniform()
+        v_u = numpy.asarray(v_u, dtype='float32')
+        v_x0 = numpy.asarray(v_x0, dtype='float32')
+        W = numpy.asarray(W, dtype='float32')
+        W_in = numpy.asarray(W_in, dtype='float32')
        # compute the output in numpy
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0]*W_in + v_x0 * W
@@ -261,8 +272,17 @@ class T_Scan(unittest.TestCase):
        theano_values = f2(v_u,v_x0, W_in, W)
        assert numpy.allclose(theano_values, v_out)
+        # TO DEL
        topo = f2.maker.env.toposort()
-        assert sum([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in topo]) == 1
+        scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)]
+        assert len(scan_node) == 1
+        scan_node = scan_node[0]
+        #theano.printing.pydotprint(f2, outfile='out1.png', high_contrast=True)
+        #theano.printing.pydotprint(scan_node.op.fn,
+        #                           outfile='inner1.png', high_contrast=True)
+        topo = f2.maker.env.toposort()
+        assert sum([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in topo]) == 0
        assert sum([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in topo]) == 4
        scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)]
@@ -270,11 +290,59 @@ class T_Scan(unittest.TestCase):
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.env.toposort()
-        theano.printing.pydotprint(f2, outfile='out.png', high_contrast=True)
+        # check that there is no gpu transfer in the inner loop.
-        theano.printing.pydotprint(scan_node.op.fn,
+        assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in scan_node_topo])
-                                   outfile='inner.png', high_contrast=True)
+        assert not any([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in scan_node_topo])
+        assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in scan_node_topo])
+    # This second version tests the second case of the GPU scan optimizer.
+    def test_one_sequence_one_output_weights_gpu2(self):
+        def f_rnn(u_t,x_tm1,W_in, W):
+            return u_t*W_in+x_tm1*W
+        u    = theano.tensor.fvector('u')
+        x0   = theano.tensor.fscalar('x0')
+        W_in = theano.tensor.fscalar('win')
+        W    = theano.tensor.fscalar('w')
+        mode = theano.compile.mode.get_default_mode().including('gpu')
+        output, updates = theano.scan(f_rnn, u,x0,[W_in,W]
+                                      , n_steps           = None
+                                      , truncate_gradient = -1
+                                      , go_backwards      = False
+                                      , mode = mode)
+        f2   = theano.function([u,x0,W_in,W], output, updates = updates,
+                               allow_input_downcast = True,
+                               mode = mode)
+        # get random initial values
+        rng  = numpy.random.RandomState(utt.fetch_seed())
+        v_u  = rng.uniform( size = (4,), low = -5., high = 5.)
+        v_x0 = rng.uniform()
+        W    = rng.uniform()
+        W_in = rng.uniform()
+        # compute the output in numpy
+        v_out = numpy.zeros((4,))
+        v_out[0] = v_u[0]*W_in + v_x0 * W
+        for step in xrange(1,4):
+            v_out[step] = v_u[step]*W_in + v_out[step-1] * W
+        theano_values = f2(v_u,v_x0, W_in, W)
+        assert numpy.allclose(theano_values, v_out)
+        topo = f2.maker.env.toposort()
+        assert sum([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in topo]) == 1
+        assert sum([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in topo]) == 4
+        scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)]
+        assert len(scan_node) == 1
+        scan_node = scan_node[0]
+        scan_node_topo = scan_node.op.fn.maker.env.toposort()
+        #theano.printing.pydotprint(f2, outfile='out2.png', high_contrast=True)
+        #theano.printing.pydotprint(scan_node.op.fn,
+        #                           outfile='inner2.png', high_contrast=True)
-        #check that there is less gpu transfer
+        # check that there is no gpu transfer in the inner loop.
        assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in scan_node_topo])
        assert not any([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in scan_node_topo])
        assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in scan_node_topo])