Merge pull request #1782 from nouiz/alloc_cst_fold

Alloc cst fold

Merge pull request #1782 from nouiz/alloc_cst_fold
19c0dbcb · abergeron · e93c61d1 · 1efbcce4 · 19c0dbcb · 19c0dbcb
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -3198,14 +3198,28 @@ class GpuAlloc(GpuOp):
                # If the output is a constant, it will have to be deepcopied
                # each time the function is called.  So we do not fold.
                return False
-            elif (not isinstance(client[0], basestring)
+            elif (#The following ops work inplace of their input id 0.
-                    and isinstance(client[0].op, (
+                  client[1] == 0 and
-                        tensor.IncSubtensor,
+                  isinstance(client[0].op, (
-                        tensor.AdvancedIncSubtensor1,
+                    #Ops that will work inplace on the Alloc. So if they
+                    #get constant_folded, they would copy the
+                    #constant and this is less efficient.
+                    #Not doing the constant folding could also lower
+                    #the peak memory usage, as the "constant" won't
+                    #always exist.
+                      #theano.tensor.subtensor.AdvancedIncSubtensor,
                      GpuIncSubtensor,
-                        GpuAdvancedIncSubtensor1
+                      GpuAdvancedIncSubtensor1,
+                      theano.sandbox.cuda.blas.GpuGemm,
+                      theano.sandbox.cuda.blas.GpuGemv,
+                      theano.sandbox.cuda.blas.GpuGer,
                  ))):
                return False
+            #If the client is a transfer, we don't want to fold. We
+            #let the moving opt finish before deciding what to do.
+            elif isinstance(client[0].op, HostFromGpu):
+                return False
        return True
 gpu_alloc = GpuAlloc()

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -2625,13 +2625,31 @@ class Alloc(gof.Op):
                # If the output is a constant, it will have to be deepcopied
                # each time the function is called.  So we do not fold.
                return False
-            elif (not isinstance(client[0], basestring)
+            elif (#The following ops work inplace of their input id 0.
-                    and isinstance(client[0].op, (
+                  client[1] == 0 and
+                  isinstance(client[0].op, (
+                    #Ops that will work inplace on the Alloc. So if they
+                    #get constant_folded, they would copy the
+                    #constant and this is less efficient.
+                    #Not doing the constant folding could also lower
+                    #the peak memory usage, as the "constant" won't
+                    #always exist.
                        theano.tensor.subtensor.IncSubtensor,
                        theano.tensor.subtensor.AdvancedIncSubtensor1,
                        theano.tensor.subtensor.AdvancedIncSubtensor,
+                        theano.tensor.blas.Gemv,
+                        theano.tensor.blas_c.CGemv,
+                        theano.tensor.blas.Ger,
+                        theano.tensor.blas_c.CGer,
+                        theano.tensor.blas_scipy.ScipyGer
                        ))):
                return False
+            #If the client is a transfer to the GPU, we don't want to
+            #fold. We let the Alloc be moved to the GPU, then we
+            #let the GPU algo decide if it needs to fold it or not.
+            elif client[0].op.__class__.__name__.lower().startswith("gpu"):
+                return False
        return True

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -1928,7 +1928,8 @@ class TestAlloc(unittest.TestCase):
                #AdvancedIncSubtensor1
                (some_matrix[arange(60)], 2),
                #AdvancedIncSubtensor
-                (some_matrix[idx, idx], 1)]):
+                (some_matrix[idx, idx], 1)
+        ]):
            derp = sum(dot(subtensor, variables))
            fobj = theano.function([some_vector], derp, mode=self.mode)
@@ -1936,14 +1937,18 @@ class TestAlloc(unittest.TestCase):
            fgrad = theano.function([some_vector], grad_derp,
                                    mode=self.mode)
            topo_obj = fobj.maker.fgraph.toposort()
+            #<= is needed as the GPU currently doesn't implement
+            #AdvancedIncSubtensor. Once it does, this check can be
+            #replaced with == 0.
            assert numpy.sum([isinstance(node.op, alloc)
-                              for node in topo_obj]) == 0
+                              for node in topo_obj]) <= 1
            topo_grad = fgrad.maker.fgraph.toposort()
            #print subtensor
            #theano.printing.debugprint(fgrad)
            assert numpy.sum([isinstance(node.op, alloc)
-                              for node in topo_grad]) == n_alloc
+                              for node in topo_grad]) == n_alloc, (
+                                  alloc, subtensor, n_alloc, topo_grad)
            fobj(test_params)
            fgrad(test_params)

--- a/theano/tensor/tests/test_blas.py
+++ b/theano/tensor/tests/test_blas.py
@@ -1091,7 +1091,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
        # Assert that the dot was optimized somehow
        self.assertFunctionContains0(f, T.dot)
-        self.assertFunctionContains1(f, Gemv(False))
+        self.assertFunctionContains1(f, Gemv(True))
        # Assert they produce the same output
        assert numpy.allclose(f(), numpy.dot(v.get_value(), w.get_value()))