Merge pull request #176 from jaberg/Composite_fixes

Composite fixes

Merge pull request #176 from jaberg/Composite_fixes
a87e9bb0 · goodfeli · d63ae441 · 2b95f6a2 · a87e9bb0 · a87e9bb0
--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -994,6 +994,8 @@ class FunctionMaker(object):
        try:
            theano.config.compute_test_value = "off"
            start_optimizer = time.time()
+            add_stack_trace_on_call = gof.Op.add_stack_trace_on_call
+            gof.Op.add_stack_trace_on_call = False
            optimizer(env)
            end_optimizer = time.time()

@@ -1007,6 +1009,7 @@ class FunctionMaker(object):
            insert_deepcopy(env, inputs, outputs+additional_outputs)
        finally:
            theano.config.compute_test_value = compute_test_value_orig
+            gof.Op.add_stack_trace_on_call = add_stack_trace_on_call

        # initialize the linker
        if not hasattr(linker, 'accept'):

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -853,7 +853,8 @@ class CLinker(link.Linker):
                          libraries=self.libraries()
                          )
    @staticmethod
-    def cmodule_key_(env, no_recycling, compile_args=[], libraries=[]):
+    def cmodule_key_(env, no_recycling, compile_args=[], libraries=[],
+            insert_config_md5=True):
        """
        Do the actual computation of cmodule_key in a static method
        to allow it to be reused in scalar.Composite.__eq__
@@ -871,11 +872,15 @@ class CLinker(link.Linker):
        sig = ['CLinker.cmodule_key'] # will be cast to tuple on return
        if compile_args is not None: sig.append(tuple(compile_args))
        if libraries is not None: sig.append(tuple(libraries))
+
        # IMPORTANT: The 'md5' prefix is used to isolate the compilation
        # parameters from the rest of the key. If you want to add more key
        # elements, they should be before this md5 hash if and only if they
        # can lead to a different compiled file with the same source code.
-        sig.append('md5:' + theano.configparser.get_config_md5())
+        if insert_config_md5:
+            sig.append('md5:' + theano.configparser.get_config_md5())
+        else:
+            sig.append('md5: <omitted>')

        # technically this should only be appended for gcc-compiled Ops
        # and the flags of other compilers should be inserted here... but it's not clear how to

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -955,9 +955,10 @@ class ModuleCache(object):
        if found == 0:
            msg = 'Key not found in unpickled KeyData file'
            if key_data.keys:
-                # This is only to make debugging in pdb easier, by providing
-                # the offending key in the local context.
-                other_key = key_data.keys.__iter__().next()
+                # This is to make debugging in pdb easier, by providing
+                # the offending keys in the local context.
+                key_data_keys = list(key_data.keys)
+                ## import pdb; pdb.set_trace()
        elif found > 1:
            msg = 'Multiple equal keys found in unpickled KeyData file'
        if msg:

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -310,6 +310,13 @@ class PureOp(object):

    """

+    add_stack_trace_on_call = True
+    """This class variable governs whether __call__ adds a stack trace to the node it creates.
+    
+    The tag trace is meant to connect a node to the line a user typed. It is nice for
+    debugging. It does not make as much sense during optimizations to store this information.
+    """
+
    #############
    # make_node #
    #############
@@ -367,7 +374,8 @@ class PureOp(object):

        """
        node = self.make_node(*inputs, **kwargs)
-        self.add_tag_trace(node)
+        if self.add_stack_trace_on_call:
+            self.add_tag_trace(node)

        if config.compute_test_value != 'off':
            run_perform = True

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -38,6 +38,10 @@ class NaiveAlgo(object):
        :param scalar_op: the scalar operation to execute on each element.
        :param sync: if True, will wait after the kernel launch and check for error call.
        """
+        if scalar_op.c_support_code_apply(node=None, nodename="nodename"):
+            raise ValueError(('It is currently not possible to auto-generate'
+                    ' a GPU implementation for an elementwise Op with support'
+                    ' code'), scalar_op)
        self.scalar_op = scalar_op
        self.sync = sync
        self.inplace_pattern = inplace_pattern
@@ -799,12 +803,15 @@ nd_collapse_[i]=0;

    def c_support_code_apply(self, node, nodename):
        nd = node.outputs[0].type.ndim
-        return "".join(
-            [self.c_src_kernel(node, nodename,x) for x in xrange(1,nd+1)]+
-            [
-            self.c_src_kernel_Ccontiguous(node, nodename),
-            self.c_src_callkernel(node, nodename),
-            ])
+        defines =  """
+#define INTDIV_POW2(a, b) (a >> b)
+#define INTMOD_POW2(a, b) (a & ((1<<b)-1))
+        """
+        kernels = "".join(  # single list: c_src_kernel for x in 1..nd, then the two below
+            [self.c_src_kernel(node, nodename, x) for x in xrange(1, nd + 1)]
+            + [self.c_src_kernel_Ccontiguous(node, nodename)]  # no trailing comma: join() takes one iterable
+            + [self.c_src_callkernel(node, nodename)])
+        return defines + kernels  # macros are emitted ahead of the kernel source

    def c_code(self, node, nodename, inputs, outputs, sub):
        d = dict(sub)
@@ -951,8 +958,3 @@ nd_collapse_[i]=0;
        #print sio.getvalue()
        return sio.getvalue()

-    def c_support_code(self):
-        return """
-        #define INTDIV_POW2(a, b) (a >> b)
-        #define INTMOD_POW2(a, b) (a & ((1<<b)-1))
-        """
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -37,13 +37,13 @@ gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
 optdb.register('gpu_opt',
               gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
-               'gpu')
+               'gpu', 'fast_run')
 # This second pass is needed as the fusion can put all the non float32 code
 # inside the elemwise. When it there is no float64 op, this is working.
 optdb.register('gpu_after_fusion',
               ProxyDB(gpu_seqopt),
               optdb.__position__.get('elemwise_fusion', 71) + .1,
-               'gpu')
+               'gpu', 'fast_run')

 def register_opt(*tags, **kwargs):
    def f(local_opt):
@@ -144,7 +144,11 @@ def local_gpu_elemwise_0(node):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                # Don't set any inplace pattern.
                # gpu_inplace_elemwise_optimizer will do it later
-                new_op = GpuElemwise(node.op.scalar_op)
+                try:
+                    new_op = GpuElemwise(node.op.scalar_op)
+                except ValueError:
+                    # This happens when scalar_op requires support code
+                    return False

                #   first establish that float32 can store all inputs
                upcastable = set(['float32', 'int8', 'int16', 'uint8', 'uint16'])
@@ -188,7 +192,11 @@ def local_gpu_elemwise_1(node):
            elemwise_node = host_i.owner
            # Don't set any inplace pattern.
            # gpu_inplace_elemwise_optimizer will do it later
-            new_op = GpuElemwise(elemwise_node.op.scalar_op)
+            try:
+                new_op = GpuElemwise(elemwise_node.op.scalar_op)
+            except ValueError:
+                # This happens when scalar_op requires support code
+                return False
            if all([i.dtype=='float32' for i in elemwise_node.inputs]):
                gpu_elemwise = new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])
                gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
--- a/theano/scalar/tests/test_basic.py
+++ b/theano/scalar/tests/test_basic.py
@@ -208,5 +208,9 @@ class test_div(unittest.TestCase):
        assert isinstance((a/c).owner.op, TrueDiv)


+# Testing of Composite is done in tensor/tests/test_opt.py
+# in test_fusion, TestCompositeCodegen
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -1081,6 +1081,7 @@ def tensor4(name=None, dtype=None):
 tensor4s, ftensor4s, dtensor4s, itensor4s, ltensor4s = _multi(tensor4, ftensor4, dtensor4,
        itensor4, ltensor4)

+
 class _tensor_py_operators:
    #UNARY
    def __abs__(self): return abs_(self)
@@ -1370,10 +1371,14 @@ class _tensor_py_operators:
    def get_constant_value(self):
        return get_constant_value(self)

+
 class TensorVariable(_tensor_py_operators, Variable):
    """Subclass to add the tensor operators to the basic `Variable` class."""
+
+
 TensorType.Variable = TensorVariable

+
 class TensorConstantSignature(tuple):
    """A Signature object for comparing TensorConstant instances

@@ -1497,7 +1502,8 @@ class TensorValue(_tensor_py_operators, Value):

 Tensor = TensorType

-#QUESTION: why are we doing this!?
+
+# This bizarre push-import avoids a circular dependency.
 elemwise.as_tensor_variable = as_tensor_variable
 elemwise.TensorType = TensorType
 elemwise.TensorVariable = TensorVariable
@@ -1505,29 +1511,10 @@ elemwise.TensorConstant = TensorConstant
 elemwise.TensorValue = TensorValue


-
 #########################
 # Utilities
 #########################

-def _elemwise(scalar_op, name, doc_prefix=''):
-    straight = elemwise.Elemwise(scalar_op, name = name)
-    inplace_scalar_op = scalar_op.__class__(scal.transfer_type(0))
-    inplace = elemwise.Elemwise(inplace_scalar_op, {0: 0}, name = name+"_inplace")
-
-    # don't add the inplace versions, they aren't supposed to be part of the user interface
-    _constructor_list.append(straight)
-
-    # This is here so that gen_oplist can detect which module declared these variables.
-
-    straight.__module__ = 'tensor'
-    inplace.__module__ = 'tensor'
-
-    if doc_prefix:
-        straight.__doc__ = doc_prefix + '\n' + straight.__doc__
-
-    return straight, inplace
-
 def _redefine(real_symbol_value, module='tensor'):
    """Replace the value associated with a function symbol.

@@ -1538,12 +1525,14 @@ def _redefine(real_symbol_value, module='tensor'):
        return real_symbol_value
    return decorator

+
 def _redefine_asRoutine(real_symbol_value):
    real_symbol_value.__epydoc_asRoutine = True
    def decorator(f):
        return real_symbol_value
    return decorator

+
 def _scal_elemwise_with_nfunc(nfunc, nin, nout):
    """
    Replace a symbol definition with an elementwise version of the

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -793,7 +793,7 @@ class Elemwise(Op):
            rval.append(tuple(oshp))
        return rval

-    def _c_all(self, node, name, inames, onames, sub):
+    def _c_all(self, node, nodename, inames, onames, sub):
        _inames = inames
        _onames = onames

@@ -901,7 +901,7 @@ class Elemwise(Op):
                Apply(self.scalar_op,
                      [Scalar(dtype = input.type.dtype)() for input in node.inputs],
                      [Scalar(dtype = output.type.dtype)() for output in node.outputs]),
-                name + '_scalar_',
+                nodename + '_scalar_',
                ["%s_i" % s for s in _inames],
                ["%s_i" % s for s in onames],
                sub)
@@ -922,19 +922,20 @@ class Elemwise(Op):
                sub = sub)
        return decl, checks, alloc, loop

-    def c_code(self, node, name, inames, onames, sub):
-        code = "\n".join(self._c_all(node, name, inames, onames, sub))
+    def c_code(self, node, nodename, inames, onames, sub):
+        code = "\n".join(self._c_all(node, nodename, inames, onames, sub))
        return code

    def c_headers(self):
        return ['<vector>', '<algorithm>']

-    def c_support_code(self):
-        support_code = self.scalar_op.c_support_code()
+    def c_support_code_apply(self, node, nodename):
+        support_code = self.scalar_op.c_support_code_apply(node,
+                nodename + '_scalar_')
        return support_code

    def c_code_cache_version_apply(self, node):
-        version = [5] # the version corresponding to the c code in this Op
+        version = [6] # the version corresponding to the c code in this Op

        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -629,18 +629,23 @@ class ShapeFeature(object):
    """

    def shape_ir(self, i, r):
-        #TODO: Write a doc string for this method
-
+        """Return symbolic r.shape[i] for tensor variable r, int i"""
        if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
            return self.lscalar_one
        else:
            return Shape_i(i).make_node(r).outputs[0]

    def shape_tuple(self, r):
-        #TODO: Write a doc string for this method
+        """Return a tuple of symbolic shape vars for tensor variable r"""
        return tuple([self.shape_ir(i,r) for i in xrange(r.ndim)])

    def default_infer_shape(self, node, i_shapes):
+        """Return a list of shape tuple or None for the outputs of node.
+        
+        This function is used for Ops that don't implement infer_shape.
+        Ops that do implement infer_shape should use the i_shapes parameter,
+        but this default implementation ignores it.
+        """
        rval = []
        for r in node.outputs:
            try:
@@ -650,16 +655,21 @@ class ShapeFeature(object):
        return rval

    def unpack(self, s_i):
+        """Return a symbolic integer scalar for the shape element s_i.
+        
+        The s_i argument was produced by the infer_shape() of an Op subclass.
+        """
        # unpack the s_i that the Op returned
        assert s_i is not None
        if s_i == 1:
            # don't make the optimizer merge a zillion ones together
+            # by always returning the same object to represent 1
            return self.lscalar_one
        if type(s_i) in (int,long) or isinstance(s_i, numpy.integer):
            # this shape is a constant
            assert s_i >= 0
            return T.constant(s_i, dtype='int64')
-        if type(s_i) in (tuple,list):
+        if type(s_i) in (tuple, list):
            # this dimension is the same as many of the inputs
            # which tells us that if one of the inputs is known,
            # the others all become known.
@@ -676,11 +686,19 @@ class ShapeFeature(object):
                    s_i, type(s_i), getattr(s_i, 'type', None))

    def set_shape(self, r, s):
+        """Assign the shape `s` to previously un-shaped variable `r`.
+        
+        :type r: a variable
+        :type s: None or a tuple of symbolic integers
+        """
        assert r not in self.shape_of, 'r already in shape_of'
        if s is None:
            self.shape_of[r] = s
        else:
-            self.shape_of[r] = tuple([self.unpack(s_i) for s_i in s])
+            shape_vars = [self.unpack(s_i) for s_i in s]
+            self.shape_of[r] = tuple(shape_vars)
+            for sv in shape_vars:
+                self.shape_of_reverse_index.setdefault(sv, set()).add(r)

    def update_shape(self, r, other_r):
        '''Replace shape of r by shape of other_r.
@@ -692,16 +710,17 @@ class ShapeFeature(object):
        assert other_r in self.shape_of, ('other_r not in shape_of', other_r)
        other_shape = self.shape_of[other_r]

+        # If other_shape has no information, call is pointless.
+        if other_shape is None:
+            return
+
        if r in self.shape_of:
            r_shape = self.shape_of[r]
        else:
            # If no info is known on r's shape, use other_shape
            self.shape_of[r] = other_shape
-            return
-
-        # If other_shape has no information, use r_shape
-        if other_shape is None:
-            self.shape_of[r] = r_shape
+            for sv in other_shape:
+                self.shape_of_reverse_index.setdefault(sv, set()).add(r)
            return

        # Merge other_shape with r_shape, giving the priority to other_shape
@@ -711,14 +730,16 @@ class ShapeFeature(object):
            # For now, we consider 2 cases of uninformative other_shape[i]:
            #  - Shape_i(i)(other_r);
            #  - Shape_i(i)(r).
-            if (ps.owner and
-                    isinstance(getattr(ps.owner,'op',None), Shape_i) and
-                    ps.owner.op.i == i and
-                    ps.owner.inputs[0] in (r, other_r)):
+            if (ps.owner
+                    and isinstance(getattr(ps.owner, 'op', None), Shape_i)
+                    and ps.owner.op.i == i
+                    and ps.owner.inputs[0] in (r, other_r)):
                merged_shape.append(r_shape[i])
            else:
                merged_shape.append(other_shape[i])
        self.shape_of[r] = tuple(merged_shape)
+        for sv in self.shape_of[r]:
+            self.shape_of_reverse_index.setdefault(sv, set()).add(r)

    def set_shape_i(self, r, i, s_i):
        '''Replace element i of shape_of[r] by s_i'''
@@ -733,14 +754,16 @@ class ShapeFeature(object):
            else:
                new_shape.append(s_j)
        self.shape_of[r] = tuple(new_shape)
+        for sv in self.shape_of[r]:
+            self.shape_of_reverse_index.setdefault(sv, set()).add(r)

    def init_r(self, r):
        '''Register r's shape in the shape_of dictionary.'''
        if r not in self.shape_of:
            try:
                self.set_shape(r, self.shape_tuple(r))
-            except AttributeError:
-                self.set_shape(r,None)
+            except AttributeError: #XXX: where would this come from?
+                self.set_shape(r, None)

    def make_vector_shape(self, r):
        return make_vector(*self.shape_of[r])
@@ -757,8 +780,15 @@ class ShapeFeature(object):
        self.lscalar_one = T.constant(1, dtype='int64')
        assert self.lscalar_one.type == T.lscalar

-        self.shape_of = {} # Variable -> tuple(scalars) or None  (All tensor vars map to tuple)
-        self.scheduled = {} # Variable ->
+        self.shape_of = {}
+        # Variable -> tuple(scalars) or None  (All tensor vars map to tuple)
+
+        self.scheduled = {}
+        # Shape_i Apply node -> variable whose shape should replace that node's output
+
+        self.shape_of_reverse_index = {}
+        # shape var -> set of variables whose shape_of tuple contains it (approximate)
+
        for node in env.toposort():
            self.on_import(env, node)

@@ -798,9 +828,11 @@ class ShapeFeature(object):
        # this is packed information
        # an element of o_shapes is either None or a tuple
        #   elements of the tuple can be either strings, or ints
-
        if len(o_shapes) != len(node.outputs):
-            raise Exception('len(o_shapes) = '+str(len(o_shapes))+' != len(node.outputs) = '+str(len(node.outputs)))
+            raise Exception('len(o_shapes) = '
+                    + str(len(o_shapes))
+                    + ' != len(node.outputs) = '
+                    + str(len(node.outputs)))

        for r, s in zip(node.outputs, o_shapes):
            self.set_shape(r, s)
@@ -818,23 +850,28 @@ class ShapeFeature(object):
        # the shape of new_r.  Say that r is *scheduled*.
        # At that point, node is no longer a client of r, but of new_r
        for (shpnode, idx) in (r.clients + [(node, i)]):
-            if isinstance(getattr(shpnode,'op', None), Shape_i):
+            if isinstance(getattr(shpnode, 'op', None), Shape_i):
                self.scheduled[shpnode] = new_r
        # In case 2, if r is a variable that we've scheduled for shape update, then we
        # should cancel it.
-        # TODO: store some kind of reverse index?
-        for k,v in self.scheduled.items():
-            if v == r:
-                del self.scheduled[k]
+        unscheduled = [k for k, v in self.scheduled.items() if v == r]
+        for k in unscheduled:
+            del self.scheduled[k]

        # In either case, r could be in shape_of.values(), that is, r itself
        # is the shape of  something. In that case, we want to update
        # the value in shape_of, to keep it up-to-date.
-        for k,v in self.shape_of.iteritems():
-            if v is not None:
-                for ii, vi in enumerate(v):
-                    if vi == r:
-                        self.set_shape_i(k, ii, new_r)
+        for v in self.shape_of_reverse_index.get(r, []):
+            # The reverse index is only approximate. It is not updated on
+            # deletion of variables, or on change_input, so it might
+            # contain a few extra `v`'s that no longer have r in their
+            # shape, or that have been deleted from shape_of entirely.
+            # The important thing is that it lets us recall every
+            # variable that has r in its shape.
+            for ii, svi in enumerate(self.shape_of.get(v, [])):
+                if svi == r:
+                    self.set_shape_i(v, ii, new_r)
+        self.shape_of_reverse_index[r] = set()

 class ShapeOptimizer(Optimizer):
    """Optimizer that serves to add ShapeFeature as an env feature.
@@ -926,6 +963,7 @@ def local_track_shape_i(node):
    if node in shape_feature.scheduled:
        assert isinstance(node.op, Shape_i)
        replacement = shape_feature.scheduled[node]
+        # XXX: what the heck is up with node.op.i ???
        return [shape_feature.shape_of[replacement][node.op.i]]

 @register_specialize

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -767,8 +767,8 @@ class test_fusion(unittest.TestCase):
        cases = [
            (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#0
            (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),#1
-            (fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),
-            (fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),
+            (fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),#2
+            (fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),#3
            (fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
            ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#5
            (((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
@@ -891,11 +891,19 @@ class test_fusion(unittest.TestCase):
                t1=time.time()
                out=out.get_value()

+            #print "CASE2/3", f.maker.env.toposort()
+            #print 'CASE2/3', f.maker.env
+            #print 'CASE2/3', f.maker.env.toposort()[3].op.scalar_op.env
+
            times[id]=t1-t0
            atol=1e-8
            if out_dtype=='float32':atol=1e-6
            if not numpy.allclose(out,answer*nb_repeat,atol=atol):
                fail1.append(id)
+                print val_inputs
+                print out
+                print answer*nb_repeat
+                #assert 0
            topo=f.maker.env.toposort()
            if gpu:
                import theano.sandbox.cuda as cuda
@@ -1109,6 +1117,70 @@ class test_fusion(unittest.TestCase):
 #            cases[id]=None #to remove g, that link to out that link to the ndarray!
            #g.owner.inputs[0] is out... make owner a weakref?

+class TestCompositeCodegen(unittest.TestCase):
+    """
+    Test the Composite op's code generation in a case where there are
+    multiple scalar ops with support code.
+    """
+    def setUp(self):
+        class TimesN(theano.scalar.basic.UnaryScalarOp):
+            def __init__(self, n, *args, **kwargs):
+                self.n = n  # constant multiplier; also baked into the C helper below
+                theano.scalar.basic.UnaryScalarOp.__init__(self, *args, **kwargs)
+
+            def impl(self, x):
+                return x * self.n
+
+            def c_support_code_apply(self, node, nodename):
+                n = str(self.n)  # consumed by the "% locals()" substitution
+                return """
+                float %(nodename)s_timesn(float x) { return x * %(n)s; }
+                """ % locals()
+
+            def c_code(self, node, name, (x, ), (z, ), sub):
+                return "%(z)s = %(name)s_timesn(%(x)s);" % locals()
+
+        upgrade_to_float = theano.scalar.basic.upgrade_to_float
+
+        self.scal_times_2 = TimesN(2, upgrade_to_float, name='times_2')
+        self.times_2 = theano.tensor.elemwise.Elemwise(
+                self.scal_times_2,
+                name='times_2')
+
+        self.scal_times_3 = TimesN(3, upgrade_to_float, name='times_3')
+        self.times_3 = theano.tensor.elemwise.Elemwise(
+                self.scal_times_3,
+                name='times_3')
+
+        self.x = fvector()
+
+    def test_nested_composite(self):
+        y = self.times_2(self.x)
+        z = self.times_3(y)
+        f = function([self.x], z)
+        assert len(f.maker.env.toposort()) == 1  # the two elemwise ops must end up in a single node
+        fval = f([1, 2, 3])
+        assert numpy.all(fval == [6, 12, 18])
+
+    def test_nested_gpu(self):
+        import theano.sandbox.cuda as cuda
+        if not cuda.cuda_available:
+            raise SkipTest("cuda not available")
+
+        import theano.sandbox.cuda.opt
+
+        y = self.times_2(self.x)
+        z = self.times_3(y)
+        f = theano.function([self.x], cuda.gpu_from_host(z))
+        topo = f.maker.env.toposort()
+        assert len(topo) == 2  # compute node followed by the transfer checked below
+        assert topo[1].op == cuda.gpu_from_host
+        # topo[0] is doing the composite work on the CPU. Auto-generation of
+        # GPU code for ops with support code is not possible.
+        fval = numpy.asarray(f([1, 2, 3]))
+        assert numpy.all(fval == [6, 12, 18]), fval
+
+
 def test_log1p():
    m = theano.config.mode
    if m == 'FAST_COMPILE':