提交 394e8cf0 authored 作者: abergeron's avatar abergeron

Merge pull request #1794 from nouiz/recursion_limit

Recursion limit
...@@ -20,7 +20,9 @@ since 2007. But it is also approachable enough to be used in the classroom ...@@ -20,7 +20,9 @@ since 2007. But it is also approachable enough to be used in the classroom
News News
==== ====
* Theano 0.6rc3 was released. Everybody is encouraged to update. * Ian Goodfellow did a `12h class with exercises on Theano <https://github.com/goodfeli/theano_exercises>`_.
* Theano 0.6 was released. Everybody is encouraged to update.
* New technical report on Theano: `Theano: new features and speed improvements <http://arxiv.org/abs/1211.5590>`_. * New technical report on Theano: `Theano: new features and speed improvements <http://arxiv.org/abs/1211.5590>`_.
However, please keep citing the other paper below in scientific work involving Theano. However, please keep citing the other paper below in scientific work involving Theano.
......
...@@ -2974,7 +2974,40 @@ class Composite(ScalarOp): ...@@ -2974,7 +2974,40 @@ class Composite(ScalarOp):
# We need to clone the graph as sometimes its nodes already # We need to clone the graph as sometimes its nodes already
# contain a reference to an fgraph. As we want the Composite # contain a reference to an fgraph. As we want the Composite
# to be picklable, we can't have a reference to fgraph. # to be picklable, we can't have a reference to fgraph.
inputs, outputs = gof.graph.clone(inputs, outputs)
# Also, if there is Composite in the inner graph, we want to
# remove them. In that case, we do a more complicated clone
# that will flatten Composite. We don't need to do this
# recursively, as the way the fusion optimizer works, we have
# only 1 new Composite each time at the output.
if len(outputs) > 1 or not any([isinstance(var.owner.op, Composite)
for var in outputs]):
# No inner Composite
inputs, outputs = gof.graph.clone(inputs, outputs)
else:
# Inner Composite that we need to flatten
assert len(outputs) == 1
# 1. Create a new graph from inputs up to the
# Composite
res = theano.compile.rebuild_collect_shared(
inputs=inputs,
outputs=outputs[0].owner.inputs,
copy_inputs_over=False) # Clone also the inputs
# 2. We continue this partial clone with the graph in
# the inner Composite
res2 = theano.compile.rebuild_collect_shared(
inputs=outputs[0].owner.op.inputs,
outputs=outputs[0].owner.op.outputs,
replace=dict(zip(outputs[0].owner.op.inputs, res[1]))
)
assert len(res2[1]) == len(outputs)
assert len(res[0]) == len(inputs)
assert res[0] != inputs
inputs, outputs = res[0], res2[1]
# Next assert comment just for speed
#assert not any([isinstance(node.op, Composite) for node in
# theano.gof.graph.ops(inputs, outputs)])
self.inputs = copy(inputs) self.inputs = copy(inputs)
self.outputs = copy(outputs) self.outputs = copy(outputs)
self.inputs_type = tuple([input.type for input in inputs]) self.inputs_type = tuple([input.type for input in inputs])
......
...@@ -68,19 +68,17 @@ class test_composite(unittest.TestCase): ...@@ -68,19 +68,17 @@ class test_composite(unittest.TestCase):
fn = gof.DualLinker().accept(g).make_function() fn = gof.DualLinker().accept(g).make_function()
assert fn(1.0, 2.0) == 1.5 assert fn(1.0, 2.0) == 1.5
# def test_sin(self): def test_flatten(self):
# x = inputs() #Test that we flatten multiple Composite.
# e = sin(x) x, y, z = inputs()
# C = Composite([x], [e]) C = Composite([x, y], [x + y])
# c = C.make_node(x) CC = Composite([x, y], [C(x * y, y)])
# # print c.c_code(['x'], ['z'], dict(id = 0)) assert not isinstance(CC.outputs[0].owner.op, Composite)
# g = FunctionGraph([x], [c.out])
# fn = gof.DualLinker().accept(g).make_function() # Test with multiple outputs
# assert fn(0) == 0 CC = Composite([x, y, z], [C(x * y, y), C(x * z, y)])
# assert fn(3.14159265358/2) == 1 #We don't flatten that case.
# assert fn(3.14159265358) == 0 assert isinstance(CC.outputs[0].owner.op, Composite)
# WRITEME: Test for sin, pow, and other scalar ops.
def test_with_constants(self): def test_with_constants(self):
x, y, z = inputs() x, y, z = inputs()
......
...@@ -508,6 +508,12 @@ class EmptyConstantError(NotScalarConstantError): ...@@ -508,6 +508,12 @@ class EmptyConstantError(NotScalarConstantError):
""" """
get_scalar_constant_value_elemwises = (
scal.Cast, scal.Switch,
scal.NEQ, scal.EQ,
scal.LT, scal.GT, scal.LE, scal.GE,
scal.Sub, scal.Add, scal.Mod, scal.Mul,
scal.IntDiv, scal.TrueDiv)
def get_scalar_constant_value(v): def get_scalar_constant_value(v):
"""return the constant scalar(0-D) value underlying variable `v` """return the constant scalar(0-D) value underlying variable `v`
...@@ -562,7 +568,7 @@ def get_scalar_constant_value(v): ...@@ -562,7 +568,7 @@ def get_scalar_constant_value(v):
compile.ops.OutputGuard, compile.ops.OutputGuard,
compile.DeepCopyOp)): compile.DeepCopyOp)):
return get_scalar_constant_value(v.owner.inputs[0]) return get_scalar_constant_value(v.owner.inputs[0])
if (isinstance(v.owner.op, theano.compile.ops.Shape_i) and elif (isinstance(v.owner.op, theano.compile.ops.Shape_i) and
isinstance(v.owner.inputs[0], Constant)): isinstance(v.owner.inputs[0], Constant)):
return v.owner.inputs[0].data.shape[v.owner.op.i] return v.owner.inputs[0].data.shape[v.owner.op.i]
# Don't act as the constant_folding optimization here as this # Don't act as the constant_folding optimization here as this
...@@ -570,26 +576,29 @@ def get_scalar_constant_value(v): ...@@ -570,26 +576,29 @@ def get_scalar_constant_value(v):
# mess with the stabilization optimization and be too slow. # mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice() # We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly. # to allow it to determine the broadcast pattern correctly.
if ((isinstance(v.owner.op, Elemwise) and elif isinstance(v.owner.op, scal.ScalarOp):
isinstance(v.owner.op.scalar_op, scal.Second)) or if isinstance(v.owner.op, scal.Second):
isinstance(v.owner.op, scal.Second)): # We don't need both input to be constant for second
# We don't need both input to be constant for second shape, val = v.owner.inputs
shape, val = v.owner.inputs return get_scalar_constant_value(val)
return get_scalar_constant_value(val) if isinstance(v.owner.op, get_scalar_constant_value_elemwises):
elemwises = (scal.Cast, scal.Switch, const = [get_scalar_constant_value(i)
scal.NEQ, scal.EQ, for i in v.owner.inputs]
scal.LT, scal.GT, scal.LE, scal.GE, ret = [[None]]
scal.Sub, scal.Add, scal.Mod, scal.Mul, v.owner.op.perform(v.owner, const, ret)
scal.IntDiv, scal.TrueDiv) return ret[0][0]
if (isinstance(v.owner.op, Elemwise) and elif isinstance(v.owner.op, Elemwise):
len(v.owner.outputs) == 1 and if isinstance(v.owner.op.scalar_op, scal.Second):
(isinstance(v.owner.op.scalar_op, elemwises) or # We don't need both input to be constant for second
isinstance(v.owner.op, elemwises))): shape, val = v.owner.inputs
const = [get_scalar_constant_value(i) for i in v.owner.inputs] return get_scalar_constant_value(val)
ret = [[None]] elif isinstance(v.owner.op.scalar_op,
v.owner.op.perform(v.owner, const, ret) get_scalar_constant_value_elemwises):
return ret[0][0] const = [get_scalar_constant_value(i) for i in v.owner.inputs]
if isinstance(v.owner.op, theano.tensor.subtensor.Subtensor) and v.ndim == 0: ret = [[None]]
v.owner.op.perform(v.owner, const, ret)
return ret[0][0]
elif isinstance(v.owner.op, theano.tensor.subtensor.Subtensor) and v.ndim == 0:
if isinstance(v.owner.inputs[0], TensorConstant): if isinstance(v.owner.inputs[0], TensorConstant):
cdata = tuple(v.owner.op.get_constant_idx(v.owner.inputs)) cdata = tuple(v.owner.op.get_constant_idx(v.owner.inputs))
try: try:
...@@ -626,7 +635,7 @@ def get_scalar_constant_value(v): ...@@ -626,7 +635,7 @@ def get_scalar_constant_value(v):
# join can cast implicitly its input in some case. # join can cast implicitly its input in some case.
return theano._asarray(ret, dtype=v.type.dtype) return theano._asarray(ret, dtype=v.type.dtype)
if (v.owner.inputs[0].owner and elif (v.owner.inputs[0].owner and
isinstance(v.owner.inputs[0].owner.op, isinstance(v.owner.inputs[0].owner.op,
theano.tensor.opt.MakeVector) and theano.tensor.opt.MakeVector) and
# MakeVector normally accept only scalar as input. # MakeVector normally accept only scalar as input.
......
...@@ -774,8 +774,7 @@ class Elemwise(OpenMPOp): ...@@ -774,8 +774,7 @@ class Elemwise(OpenMPOp):
super(Elemwise, self).perform(node, inputs, output_storage) super(Elemwise, self).perform(node, inputs, output_storage)
maxsize = max(len(input.shape) for input in inputs) maxsize = max(len(input.shape) for input in inputs)
for dims in izip(*[([(1, True)] * (maxsize - len(input.shape)) for dims in izip(*[zip(input.shape, sinput.type.broadcastable)
+ zip(input.shape, sinput.type.broadcastable))
for input, sinput in zip(inputs, node.inputs)]): for input, sinput in zip(inputs, node.inputs)]):
if max(d for d, b in dims) != 1 and (1, False) in dims: if max(d for d, b in dims) != 1 and (1, False) in dims:
# yes there may be more compact ways to write this code, # yes there may be more compact ways to write this code,
...@@ -808,34 +807,36 @@ class Elemwise(OpenMPOp): ...@@ -808,34 +807,36 @@ class Elemwise(OpenMPOp):
out_shape.append(max(values)) out_shape.append(max(values))
out_shape = tuple(out_shape) out_shape = tuple(out_shape)
if not self.inplace_pattern: # Commented as we don't reuse outputs now.
for output, storage in izip(node.outputs, output_storage): #
odat = storage[0] # if not self.inplace_pattern:
if odat is not None: # for output, storage in izip(node.outputs, output_storage):
if odat.shape != out_shape: # odat = storage[0]
# It is unsafe to try to resize odat, # if odat is not None:
# we have to allocate output storage. # if odat.shape != out_shape:
odat = None # # It is unsafe to try to resize odat,
if odat is None: # # we have to allocate output storage.
odat = numpy.ndarray(out_shape, dtype=output.type.dtype) # odat = None
storage[0] = odat # if odat is None:
else: # odat = numpy.ndarray(out_shape, dtype=output.type.dtype)
for i, (output, storage) in enumerate( # storage[0] = odat
izip(node.outputs, output_storage)): # else:
#i is an output idx # for i, (output, storage) in enumerate(
if i in self.inplace_pattern: # izip(node.outputs, output_storage)):
odat = inputs[self.inplace_pattern[i]] # #i is an output idx
else: # if i in self.inplace_pattern:
odat = storage[0] # odat = inputs[self.inplace_pattern[i]]
if odat is not None: # else:
if odat.shape != out_shape: # odat = storage[0]
# It is unsafe to try to resize odat, # if odat is not None:
# we have to allocate output storage. # if odat.shape != out_shape:
odat = None # # It is unsafe to try to resize odat,
if odat is None: # # we have to allocate output storage.
odat = numpy.ndarray(out_shape, # odat = None
dtype=output.type.dtype) # if odat is None:
storage[0] = odat # odat = numpy.ndarray(out_shape,
# dtype=output.type.dtype)
# storage[0] = odat
ufunc_args = inputs # + output_storage ufunc_args = inputs # + output_storage
if self.nfunc and len(inputs) == self.nfunc_spec[1]: if self.nfunc and len(inputs) == self.nfunc_spec[1]:
...@@ -860,26 +861,25 @@ class Elemwise(OpenMPOp): ...@@ -860,26 +861,25 @@ class Elemwise(OpenMPOp):
if nout == 1: if nout == 1:
variables = [variables] variables = [variables]
i = 0
for variable, storage, nout in izip(variables, output_storage, for variable, storage, nout in izip(variables, output_storage,
node.outputs): node.outputs):
if str(getattr(variable, "dtype", "")) == 'object': if getattr(variable, "dtype", "") == 'object':
# Since numpy 1.6, function created with numpy.frompyfunc # Since numpy 1.6, function created with numpy.frompyfunc
# always return an ndarray with dtype object # always return an ndarray with dtype object
variable = numpy.asarray(variable, dtype=nout.dtype) variable = numpy.asarray(variable, dtype=nout.dtype)
# The storage has been resized earlier. if i in self.inplace_pattern:
if hasattr(variable, 'shape'): odat = inputs[self.inplace_pattern[i]]
assert storage[0].shape == variable.shape odat[...] = variable
storage[0] = odat
# Sometimes NumPy return a Python type.
elif not isinstance(variable, numpy.ndarray):
variable = numpy.asarray(variable, nout.dtype)
storage[0] = variable
else: else:
# If variable has not shape, then it is a scalar. storage[0] = variable
assert numpy.prod(storage[0].shape) == 1 i += 1
storage[0][...] = variable
assert str(storage[0].dtype) != 'object'
# the following should be used instead of the previous loop,
# unfortunately it tends to segfault
# self.ufunc(*(ufunc_args+[s[0] for s in output_storage]))
def infer_shape(self, node, i_shapes): def infer_shape(self, node, i_shapes):
rval = [] rval = []
......
...@@ -4888,11 +4888,40 @@ class FusionOptimizer(Optimizer): ...@@ -4888,11 +4888,40 @@ class FusionOptimizer(Optimizer):
print >> stream, blanc, " time_toposort", prof[7] print >> stream, blanc, " time_toposort", prof[7]
def local_add_mul_fusion(node):
"""Fuse consecutive add or mul in one such node with more inputs.
    It is better to fuse add/mul that way than in a Composite node as
    this makes the inner graph of the Composite smaller. This allows
    putting more computation in a Composite before hitting the max
    recursion limit when pickling a Composite.
"""
if (not isinstance(node.op, Elemwise) or
not isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul))):
return False
s_op = node.op.scalar_op.__class__
for inp in node.inputs:
if (inp.owner and
isinstance(inp.owner.op, Elemwise) and
isinstance(inp.owner.op.scalar_op, s_op)):
l = list(node.inputs)
l.remove(inp)
return [node.op(*(l + inp.owner.inputs))]
if config.tensor.local_elemwise_fusion: if config.tensor.local_elemwise_fusion:
_logger.debug("enabling optimization fusion elemwise in fast_run") _logger.debug("enabling optimization fusion elemwise in fast_run")
#Must be after gpu(48.5) and before AddDestroyHandler(49.5) #Must be after gpu(48.5) and before AddDestroyHandler(49.5)
fuse_seqopt = gof.SequenceDB()
fuse_seqopt.register('local_add_mul_fusion',
FusionOptimizer(local_add_mul_fusion),
0, 'fast_run', 'fusion')
fuse_seqopt.register('composite_elemwise_fusion',
FusionOptimizer(local_elemwise_fusion),
1, 'fast_run', 'fusion')
compile.optdb.register('elemwise_fusion', compile.optdb.register('elemwise_fusion',
FusionOptimizer(local_elemwise_fusion), 49, fuse_seqopt, 49,
'fast_run', 'fusion', 'local_elemwise_fusion', 'fast_run', 'fusion', 'local_elemwise_fusion',
'FusionOptimizer') 'FusionOptimizer')
else: else:
......
...@@ -1207,6 +1207,36 @@ class test_fusion(unittest.TestCase): ...@@ -1207,6 +1207,36 @@ class test_fusion(unittest.TestCase):
# Test it on some dummy values # Test it on some dummy values
f(*[range(i, 4 + i) for i in range(35)]) f(*[range(i, 4 + i) for i in range(35)])
def test_pickle_big_fusion(self):
        """In the past, pickling a Composite generated in that case
        crashed with the max recursion limit. So we were not able to
        generate C code in that case.
"""
factors = []
sd = tensor.dscalar()
means = tensor.dvector()
cst_05 = theano.tensor.constant(.5)
cst_m05 = theano.tensor.constant(-.5)
cst_2 = theano.tensor.constant(2)
cst_m2 = theano.tensor.constant(-2)
ones = theano.tensor.constant(numpy.ones(10))
n = 85
if theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
n = 10
for i in range(n):
f = (cst_m05 * sd ** cst_m2 * (ones - means[i]) ** cst_2 +
cst_05 * tensor.log(cst_05 * (sd ** cst_m2) / numpy.pi))
factors.append(tensor.sum(f))
logp = tensor.add(*factors)
vars = [sd, means]
dlogp = function(vars, [theano.grad(logp, v) for v in vars])
dlogp(2, numpy.random.rand(n))
def speed_fusion(self, shared_fn=shared, gpu=False, s=None): def speed_fusion(self, shared_fn=shared, gpu=False, s=None):
""" """
param type s: a slice object param type s: a slice object
...@@ -1676,8 +1706,8 @@ class test_local_subtensor_lift(unittest.TestCase): ...@@ -1676,8 +1706,8 @@ class test_local_subtensor_lift(unittest.TestCase):
f = function([x, y, z], tensor.exp(x + y + z)[0], mode=mode_opt) f = function([x, y, z], tensor.exp(x + y + z)[0], mode=mode_opt)
prog = f.maker.fgraph.toposort() prog = f.maker.fgraph.toposort()
assert isinstance(prog[1].op, tensor.DimShuffle) assert isinstance(prog[0].op, tensor.DimShuffle)
assert isinstance(prog[0].op, tensor.Subtensor) # first subtensor assert isinstance(prog[1].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[2].op, tensor.Subtensor) # first subtensor assert isinstance(prog[2].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[3].op.scalar_op, theano.scalar. assert isinstance(prog[3].op.scalar_op, theano.scalar.
Composite) # Composite{add,add} Composite) # Composite{add,add}
...@@ -1693,8 +1723,8 @@ class test_local_subtensor_lift(unittest.TestCase): ...@@ -1693,8 +1723,8 @@ class test_local_subtensor_lift(unittest.TestCase):
f = function([x, y, z], tensor.exp(x + y + z)[0:2], mode=mode_opt) f = function([x, y, z], tensor.exp(x + y + z)[0:2], mode=mode_opt)
prog = f.maker.fgraph.toposort() prog = f.maker.fgraph.toposort()
assert isinstance(prog[1].op, tensor.DimShuffle) assert isinstance(prog[0].op, tensor.DimShuffle)
assert isinstance(prog[0].op, tensor.Subtensor) # first subtensor assert isinstance(prog[1].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[2].op, tensor.Subtensor) # first subtensor assert isinstance(prog[2].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[3].op.scalar_op, theano.scalar. assert isinstance(prog[3].op.scalar_op, theano.scalar.
Composite) # Composite{add,add} Composite) # Composite{add,add}
...@@ -3402,7 +3432,7 @@ class T_local_erfc(unittest.TestCase): ...@@ -3402,7 +3432,7 @@ class T_local_erfc(unittest.TestCase):
assert len(f.maker.fgraph.apply_nodes) == 1, len(f.maker.fgraph.apply_nodes) assert len(f.maker.fgraph.apply_nodes) == 1, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert len(f.maker.fgraph.toposort()[0].fgraph.toposort()[ assert len(f.maker.fgraph.toposort()[0].fgraph.toposort()[
0].op.scalar_op.fgraph.apply_nodes)==2,len(f.maker.fgraph.toposort()[0].fgraph.toposort()[0].op.scalar_op.fgraph.apply_nodes) 0].op.scalar_op.fgraph.apply_nodes)==22,len(f.maker.fgraph.toposort()[0].fgraph.toposort()[0].op.scalar_op.fgraph.apply_nodes)
#TODO: fix this problem #TODO: fix this problem
if theano.config.floatX=="float32" and theano.config.mode in ["DebugMode", "DEBUG_MODE"]: if theano.config.floatX=="float32" and theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
raise KnownFailureTest( raise KnownFailureTest(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论