提交 e764e8c2 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge pull request #434 from nouiz/alloc

Make a mechanism so that Alloc and GpuAlloc are not constant folded in som...
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
Since 0.5rc2 Since 0.5rc2
* Fixed a memory leak with shared variable (we kept a pointer to the original value) * Fixed a memory leak with shared variable (we kept a pointer to the original value)
* Alloc, GpuAlloc are not always pre-computed (constant_folding optimization) at compile time if all their inputs are constant
* The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays. * The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays.
* 'theano-cache list' lists key files bigger than 1M * 'theano-cache list' lists key files bigger than 1M
* 'theano-cache list' prints an histogram of the number of keys per compiled module * 'theano-cache list' prints an histogram of the number of keys per compiled module
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
Since 0.5rc2 Since 0.5rc2
* Fixed a memory leak with shared variable (we kept a pointer to the original value) * Fixed a memory leak with shared variable (we kept a pointer to the original value)
* Alloc, GpuAlloc are not always pre-computed (constant_folding optimization) at compile time if all their inputs are constant
* The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays. * The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays.
* 'theano-cache list' lists key files bigger than 1M * 'theano-cache list' lists key files bigger than 1M
* 'theano-cache list' prints an histogram of the number of keys per compiled module * 'theano-cache list' prints an histogram of the number of keys per compiled module
......
...@@ -217,6 +217,20 @@ following methods: ...@@ -217,6 +217,20 @@ following methods:
``__str__`` method include the name of the op and the Op's parameters' ``__str__`` method include the name of the op and the Op's parameters'
values. values.
.. function:: do_constant_folding(node)
*Default:* Return True
By default, when optimizations are enabled, Apply nodes whose
inputs are all constants are removed during function compilation:
each such Apply node is replaced by a Theano constant variable, so
the Apply node is not executed at each function call. If you want
to force an op to be executed at function-call time instead, make
do_constant_folding return False.
As done in the Alloc op, you can return False only in some cases, by
analyzing the graph reachable from the node parameter.
At a bare minimum, a new Op must define ``make_node`` and ``perform``, which At a bare minimum, a new Op must define ``make_node`` and ``perform``, which
have no defaults. have no defaults.
......
...@@ -509,6 +509,16 @@ class PureOp(object): ...@@ -509,6 +509,16 @@ class PureOp(object):
""" """
raise utils.MethodNotDefined("perform", type(self), self.__class__.__name__) raise utils.MethodNotDefined("perform", type(self), self.__class__.__name__)
def do_constant_folding(self, node):
    """Decide whether this node may be replaced by a precomputed constant.

    Ops override this hook to opt out of the constant_folding
    optimization even when every input is constant -- for instance to
    choose a different memory/speed trade-off, or because a folded
    constant could not be used for inplace operations
    (see *IncSubtensor).
    """
    return True
class Op(utils.object2, PureOp, CLinkerOp): class Op(utils.object2, PureOp, CLinkerOp):
"""Convenience class to bundle `PureOp` and `CLinkerOp`""" """Convenience class to bundle `PureOp` and `CLinkerOp`"""
......
...@@ -2004,6 +2004,17 @@ class GpuAlloc(Op): ...@@ -2004,6 +2004,17 @@ class GpuAlloc(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (3,)
def do_constant_folding(self, node):
    """Refuse constant folding when a client increments our output.

    If any consumer of this GpuAlloc's output is an (Advanced)IncSubtensor
    op -- CPU or GPU variant -- keep the Alloc node alive at runtime
    instead of folding it into a constant, so that those ops can still
    work on freshly allocated memory (constants cannot be used inplace).
    """
    inc_ops = (tensor.IncSubtensor,
               tensor.AdvancedIncSubtensor1,
               GpuIncSubtensor,
               GpuAdvancedIncSubtensor1)
    for client in node.outputs[0].clients:
        if isinstance(client[0].op, inc_ops):
            return False
    return True
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
......
...@@ -728,7 +728,16 @@ def test_gpualloc_output_to_gpu(): ...@@ -728,7 +728,16 @@ def test_gpualloc_output_to_gpu():
assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9)) assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9))
assert numpy.allclose(f(5),f_gpu(5)) assert numpy.allclose(f(5),f_gpu(5))
import theano.tensor.tests.test_basic import theano.tensor.tests.test_basic
# GPU counterpart of theano.tensor.tests.test_basic.TestAlloc: re-runs
# the Alloc constant-folding tests on the GPU by overriding the dtype,
# compilation mode, shared constructor, and expected alloc op classes.
class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
    dtype = "float32"  # CUDA ops in this era support float32 only
    mode = mode_with_gpu
    shared = staticmethod(cuda.shared_constructor)
    # NOTE(review): the third case stays tensor.Alloc — presumably the
    # AdvancedIncSubtensor graph is not transferred to the GPU; confirm.
    allocs = [B.GpuAlloc, B.GpuAlloc, tensor.Alloc]
class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split): class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
def setUp(self): def setUp(self):
utt.seed_rng() utt.seed_rng()
......
...@@ -2616,6 +2616,16 @@ class Alloc(gof.Op): ...@@ -2616,6 +2616,16 @@ class Alloc(gof.Op):
return [None] return [None]
return self.make_node(eval_points[0], *inputs[1:]).outputs return self.make_node(eval_points[0], *inputs[1:]).outputs
def do_constant_folding(self, node):
    """Refuse constant folding when a client increments our output.

    If any consumer of this Alloc's output is an (Advanced)IncSubtensor
    op, keep the Alloc node at runtime rather than folding it into a
    constant, so those ops can still operate on freshly allocated
    memory (constants cannot be used inplace).
    """
    for client in node.outputs[0].clients:
        if isinstance(client[0].op, (IncSubtensor,
                                     AdvancedIncSubtensor1,
                                     AdvancedIncSubtensor)):
            return False
    return True
alloc = Alloc() alloc = Alloc()
pprint.assign(alloc, printing.FunctionPrinter('alloc')) pprint.assign(alloc, printing.FunctionPrinter('alloc'))
......
...@@ -3767,6 +3767,9 @@ def constant_folding(node): ...@@ -3767,6 +3767,9 @@ def constant_folding(node):
if not isinstance(input, Constant): if not isinstance(input, Constant):
return False return False
#condition: all inputs are constant #condition: all inputs are constant
if not node.op.do_constant_folding(node):
# The op asks not to be constant folded.
return False
storage_map = dict([(i, [i.data]) for i in node.inputs]) storage_map = dict([(i, [i.data]) for i in node.inputs])
compute_map = dict([(i, [True]) for i in node.inputs]) compute_map = dict([(i, [True]) for i in node.inputs])
......
...@@ -48,6 +48,11 @@ except ImportError: ...@@ -48,6 +48,11 @@ except ImportError:
mode_no_scipy = "FAST_RUN" mode_no_scipy = "FAST_RUN"
floatX = config.floatX floatX = config.floatX
# The constant-folding checks below only make sense in an optimizing
# mode, so when the suite runs under FAST_COMPILE we substitute
# FAST_RUN; otherwise we keep the user's configured default mode.
mode_opt = "FAST_RUN" if config.mode == "FAST_COMPILE" else get_default_mode()
### seed random number generator so that unittests are deterministic ### ### seed random number generator so that unittests are deterministic ###
utt.seed_rng() utt.seed_rng()
...@@ -1266,6 +1271,48 @@ Alloc13GradTester = makeBroadcastTester( ...@@ -1266,6 +1271,48 @@ Alloc13GradTester = makeBroadcastTester(
), ),
) )
class TestAlloc(unittest.TestCase):
    """Check that Alloc nodes are constant-folded in the forward graph
    but kept (not folded) in gradient graphs whose *IncSubtensor
    clients need them.

    Subclasses (e.g. the GPU variant) override these class attributes
    to re-run the same checks with a different dtype/mode/op set.
    """
    dtype = config.floatX
    mode = mode_opt
    shared = staticmethod(theano.shared)
    # One expected alloc op class per subtensor case in the loop below.
    allocs = [tensor.Alloc] * 3

    def test_alloc_constant_folding(self):
        test_params = numpy.asarray(numpy.random.randn(50 * 60),
                                    self.dtype)

        some_vector = vector('some_vector', dtype=self.dtype)
        some_matrix = some_vector.reshape((60, 50))
        variables = self.shared(numpy.ones((50,), dtype=self.dtype))
        idx = tensor.constant(numpy.arange(50))

        # Each pair is (subtensor expression, number of alloc nodes
        # expected to survive in the gradient graph).
        for alloc, (subtensor, n_alloc) in zip(self.allocs, [
                # IncSubtensor
                (some_matrix[:60], 2),
                # AdvancedIncSubtensor1
                (some_matrix[arange(60)], 2),
                # AdvancedIncSubtensor
                (some_matrix[idx, idx], 1)]):
            derp = sum(dot(subtensor, variables))

            fobj = theano.function([some_vector], derp, mode=self.mode)
            grad_derp = theano.grad(derp, some_vector)
            fgrad = theano.function([some_vector], grad_derp,
                                    mode=self.mode)

            # Forward graph: every Alloc should have been folded away.
            topo_obj = fobj.maker.env.toposort()
            assert numpy.sum([isinstance(node.op, alloc)
                              for node in topo_obj]) == 0

            # Gradient graph: the *IncSubtensor clients must prevent
            # folding, so exactly n_alloc Alloc nodes remain.
            topo_grad = fgrad.maker.env.toposort()
            assert numpy.sum([isinstance(node.op, alloc)
                              for node in topo_grad]) == n_alloc

            # Smoke-test both compiled functions on real data.
            fobj(test_params)
            fgrad(test_params)
def test_eye(): def test_eye():
def check(dtype, N, M_=None, k=0): def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor. # Theano does not accept None as a tensor.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论