提交 e4c6880e authored 作者: Razvan Pascanu's avatar Razvan Pascanu

merge; no conflicts

...@@ -50,7 +50,20 @@ class HostFromGpu(Op): ...@@ -50,7 +50,20 @@ class HostFromGpu(Op):
z[0] = numpy.asarray(x) z[0] = numpy.asarray(x)
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
if isinstance(gz, tensor.TensorType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [gpu_from_host(gz)] return [gpu_from_host(gz)]
else:
return [gz]
def R_op(self, inputs, eval_points):
    """R-operator: forward-propagate the evaluation point through the
    GPU->host transfer.

    The output of this op lives on the host, so a GPU-typed eval point
    must be moved with ``host_from_gpu``; a host-typed eval point (which
    can happen when Rop/Lop is fed plain tensors) passes through as-is.
    """
    ev, = eval_points
    # FIX: the original tested `isinstance(ev, tensor.TensorType)`, which
    # is always False -- `ev` is a Variable, never an instance of the Type
    # class; the variable's `.type` must be inspected instead.  It also
    # applied `gpu_from_host`, which would produce a GPU value for this
    # op's host-typed output (the direction was copy-pasted from grad,
    # where the reversal is correct).
    if isinstance(ev.type, tensor.TensorType):
        # Already a host tensor: matches this op's output type.
        return [ev]
    else:
        return [host_from_gpu(ev)]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
host_from_gpu = HostFromGpu() host_from_gpu = HostFromGpu()
...@@ -72,7 +85,21 @@ class GpuFromHost(Op): ...@@ -72,7 +85,21 @@ class GpuFromHost(Op):
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0]) z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0])
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
if isinstance(gz,CudaNdarrayType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [host_from_gpu(gz)] return [host_from_gpu(gz)]
else:
return [gz]
def R_op(self, inputs, eval_points):
    """R-operator: forward-propagate the evaluation point through the
    host->GPU transfer.

    The output of this op lives on the GPU, so a host-typed eval point
    must be moved with ``gpu_from_host``; an eval point that is already a
    CudaNdarray passes through as-is.
    """
    ev, = eval_points
    # FIX: the original tested `isinstance(ev, CudaNdarrayType)`, which is
    # always False -- `ev` is a Variable, never an instance of the Type
    # class; test `ev.type` instead.  It also applied `host_from_gpu`,
    # producing a host value for this op's GPU-typed output (direction
    # copy-pasted from grad, where the reversal is correct).
    if isinstance(ev.type, CudaNdarrayType):
        # Already on the GPU: matches this op's output type.
        return [ev]
    else:
        return [gpu_from_host(ev)]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
gpu_from_host = GpuFromHost() gpu_from_host = GpuFromHost()
......
...@@ -24,5 +24,6 @@ from sharedvar import tensor_constructor as shared ...@@ -24,5 +24,6 @@ from sharedvar import tensor_constructor as shared
import nnet # used for softmax, sigmoid, etc. import nnet # used for softmax, sigmoid, etc.
from tensor_grad import Rop, Lop, grad, numeric_grad, verify_grad
差异被折叠。
...@@ -218,6 +218,13 @@ class DimShuffle(Op): ...@@ -218,6 +218,13 @@ class DimShuffle(Op):
rval.insert(augm, 1) rval.insert(augm, 1)
return [rval] return [rval]
def R_op(self, inputs, eval_points):
    """R-operator: DimShuffle is linear, so applying the op itself to the
    evaluation point gives the directional derivative directly."""
    pt, = eval_points
    if pt is None:
        # Nothing to propagate for an undefined evaluation point.
        return [None]
    return self.make_node(pt).outputs
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
input, = inp input, = inp
res, = out res, = out
...@@ -534,7 +541,78 @@ class Elemwise(Op): ...@@ -534,7 +541,78 @@ class Elemwise(Op):
else: else:
return self.name return self.name
def R_op(self, inputs, eval_points):
# R-operator for an elementwise op: for each output `o`, compute
# sum_j (d o / d input_j) * eval_point_j, reusing the symbolic partial
# derivatives that _bgrad produces for the gradient.
outs = self.make_node(*inputs).outputs
# One R-op result per output; entries stay None when no differentiable
# input had an evaluation point.
rval = [None for x in outs]
# For each output
for idx, out in enumerate(outs):
# make such that _bgrads computes only the gradients of the
# current output on the inputs ( and not all outputs)
# (zeros for every output gradient except a ones_like on `out`)
ograds = [ theano.tensor.zeros_like(x) for x in outs]
ograds[idx] = theano.tensor.ones_like(out)
bgrads = self._bgrad(inputs, ograds)
# Accumulator for sum_j bgrads[j] * eval_points[j]; kept as None until
# the first differentiable term so we avoid a spurious leading zero.
rop_out = None
for jdx, (inp, eval_point) in enumerate(zip(inputs,
eval_points)):
# if None, then we can just ignore this branch ..
# what we do is to assume that for any non-differentiable
# branch, the gradient is actually 0, which I think is not
# the right thing to do .. have to talk to Ian and James
# about it
if bgrads[jdx] is None:
pass
elif eval_point is not None:
if rop_out is None:
rop_out = bgrads[jdx]*eval_point
else:
rop_out = rop_out + bgrads[jdx]*eval_point
rval[idx] = rop_out
return rval
def grad(self, inputs, ograds): def grad(self, inputs, ograds):
#compute grad with respect to broadcasted input
rval = self._bgrad(inputs,ograds)
#sum out the broadcasted dimensions
for i, ipt in enumerate(inputs):
if rval[i] is None:
continue
# list of all the dimensions that are broadcastable for input[i] so we
# can sum over them
# todo: only count dimensions that were effectively broadcasted
to_sum = [j for j, bcast in enumerate(ipt.type.broadcastable) if bcast]
if to_sum:
shuffle = []
j = 0
for bcast in ipt.type.broadcastable:
if bcast == 1:
shuffle.append('x')
else:
shuffle.append(j)
j += 1
#close if
#close for
sr = Sum(axis = to_sum)(rval[i])
sr = sr.dimshuffle(shuffle)
#sr = DimShuffle(sr.type.broadcastable, shuffle)(sr)
rval[i] = sr
#close if
#close for
return rval
def _bgrad(self, inputs, ograds):
# returns grad, with respect to broadcasted versions of inputs
# Gradients (especially on the final costs) don't have to be symbolic # Gradients (especially on the final costs) don't have to be symbolic
# e.g., ograds will be [ 1. ] if your objective is c and the output # e.g., ograds will be [ 1. ] if your objective is c and the output
# of the current apply node is c # of the current apply node is c
...@@ -558,35 +636,17 @@ class Elemwise(Op): ...@@ -558,35 +636,17 @@ class Elemwise(Op):
broadcastable = ()), broadcastable = ()),
numpy.asarray(r.data)) # .reshape(b) numpy.asarray(r.data)) # .reshape(b)
return DimShuffle((), ['x']*nd, inplace = True)(res) return DimShuffle((), ['x']*nd, inplace = True)(res)
new_r = Elemwise(node.op, {})(*[transform(input) for input in node.inputs]) new_r = Elemwise(node.op, {})(*[transform(ipt) for ipt in node.inputs])
return new_r return new_r
ret = [] ret = []
for scalar_igrad, input in zip(scalar_igrads, inputs): for scalar_igrad, ipt in zip(scalar_igrads, inputs):
if scalar_igrad is None: if scalar_igrad is None:
# undefined gradient # undefined gradient
ret.append(None) ret.append(None)
continue continue
r = transform(scalar_igrad) ret.append( transform(scalar_igrad))
# list of all the dimensions that are broadcastable for that input so we
# can sum over them
# todo: only count dimensions that were effectively broadcasted
to_sum = [i for i, bcast in enumerate(input.type.broadcastable) if bcast]
if to_sum:
shuffle = []
j = 0
for bcast in input.type.broadcastable:
if bcast == 1:
shuffle.append('x')
else:
shuffle.append(j)
j += 1
sr = Sum(axis = to_sum)(r)
sr = DimShuffle(sr.type.broadcastable, shuffle)(sr)
ret.append(sr)
else:
ret.append(r)
return ret return ret
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
...@@ -1180,6 +1240,11 @@ class Sum(CAReduce): ...@@ -1180,6 +1240,11 @@ class Sum(CAReduce):
i += 1 i += 1
return Elemwise(scalar.second)(x, DimShuffle(gz.type.broadcastable, new_dims)(gz)), return Elemwise(scalar.second)(x, DimShuffle(gz.type.broadcastable, new_dims)(gz)),
def R_op(self, inputs, eval_points):
    """R-operator: summation is linear, so the op applied to the
    evaluation point is exactly the directional derivative."""
    pt, = eval_points
    if pt is None:
        return [None]
    return self.make_node(pt).outputs
def __str__(self): def __str__(self):
if self.axis is None: if self.axis is None:
return "Sum" return "Sum"
......
...@@ -347,6 +347,13 @@ class Softmax(gof.Op): ...@@ -347,6 +347,13 @@ class Softmax(gof.Op):
sm = softmax(x) sm = softmax(x)
return [softmax_grad(g_sm, sm)] return [softmax_grad(g_sm, sm)]
def R_op(self, inputs, eval_points):
    """R-operator computed via grad: the softmax Jacobian is symmetric
    (diag(s) - s s^T for row softmax), so J*v equals J^T*v and the
    L-operator implementation can be reused unchanged."""
    pt, = eval_points
    if pt is None:
        return [None]
    return self.grad(inputs, [pt])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return shape return shape
......
...@@ -469,6 +469,11 @@ class MakeVector(T.Op): ...@@ -469,6 +469,11 @@ class MakeVector(T.Op):
grads.append(output_gradients[0][i]) grads.append(output_gradients[0][i])
return grads return grads
def R_op(self, inputs, eval_points):
    """R-operator: MakeVector is linear in each input, so applying the op
    to the evaluation points yields the directional derivative."""
    for pt in eval_points:
        if pt is None:
            # Any missing eval point makes the whole result undefined.
            return [None]
    return self.make_node(*eval_points).outputs
make_vector = MakeVector() make_vector = MakeVector()
class MakeVectorPrinter: class MakeVectorPrinter:
......
差异被折叠。
...@@ -2,6 +2,48 @@ ...@@ -2,6 +2,48 @@
WRITE ME WRITE ME
Tests for the R operator / L operator Tests for the R operator / L operator
Ops that do not yet support the R-op:
PermuteRowElements
Tile
AdvancedSubtensor
TensorDot
Outer
Prod
MulwithoutZeros
ProdWithoutZeros
list of ops that support R-op:
* Alloc
* Split
* ARange
* ScalarFromTensor
* Shape
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor
* Rebroadcast
* Join
* Reshape
* Flatten
* AdvancedSubtensor1
* AdvancedIncSubtensor1
* AdvancedIncSubtensor
* Dot
* DimShuffle
* Elemwise
* Sum
* Softmax
* Scan
""" """
import unittest import unittest
...@@ -10,52 +52,208 @@ from theano import function ...@@ -10,52 +52,208 @@ from theano import function
import theano import theano
import theano.tensor as TT import theano.tensor as TT
import numpy import numpy
from theano.gof import Op, Apply
class test_rop(unittest.TestCase): '''
Special Op created to test what happens when you have one op that is not
differentiable in the computational graph
'''
class BreakRop(Op):
    """Identity op whose grad and R_op are deliberately undefined.

    The tests insert this op into a graph to verify that ``Rop`` raises
    when a non-differentiable node is encountered.
    """
    def __eq__(self, other):
        # All instances are interchangeable: equality is by type.
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def make_node(self, x):
        return Apply(self, [x], [x.type()])
    def perform(self, node, inp, out_):
        # Pure pass-through of the single input.
        x, = inp
        out, = out_
        out[0] = x
    def grad(self, inp, grads):
        # Intentionally undefined gradient.
        return [None]
    def R_op(self, inputs, eval_points):
        # Intentionally undefined R-operator.
        return [None]
def test_specifyshape(self): break_op = BreakRop()
rng = numpy.random.RandomState(utt.fetch_seed())
vx = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX)
vv = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX) class test_RopLop(unittest.TestCase):
x = TT.vector('x') def setUp(self):
v = TT.vector('v') # Using vectors make things a lot simpler for generating the same
y = TT.specify_shape(x, (5,)) # computations using scan
yv = TT.Rop(y,x,v) self.x = TT.vector('x')
rop_f = function([x,v], yv) self.v = TT.vector('v')
J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x), self.rng = numpy.random.RandomState(utt.fetch_seed())
sequences = TT.arange(x.shape[0]), self.in_shape = ( 5+self.rng.randint(30),)
non_sequences = [y,x]) self.mx = TT.matrix('mx')
sy = TT.dot(J, v) self.mv = TT.matrix('mv')
self.mat_in_shape = ( 5 + self.rng.randint(30),
5+self.rng.randint(30))
def check_nondiff_rop(self, y):
    """Assert that building the R-op of `y` raises ValueError.

    `y` is expected to contain a non-differentiable op, so ``TT.Rop``
    must refuse to construct the expression.
    """
    # FIX: dropped the unused `tmp` binding and the manual raised-flag in
    # favour of try/return; also corrected the grammar of the failure
    # message ("did not raised" -> "did not raise").
    try:
        TT.Rop(y, self.x, self.v)
    except ValueError:
        return
    self.fail('Op did not raise an error even though the function'
              ' is not differentiable')
def check_mat_rop_lop(self, y, out_shape):
vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
yv = TT.Rop(y, self.mx, self.mv)
rop_f = function([self.mx, self.mv], yv)
sy, _ = theano.scan( lambda i,y,x,v: (TT.grad(y[i],x)*v).sum(),
sequences = TT.arange(y.shape[0]),
non_sequences = [y,self.mx,self.mv])
scan_f = function([self.mx,self.mv], sy)
scan_f = function([x,v], sy)
v1 = rop_f(vx,vv) v1 = rop_f(vx,vv)
v2 = scan_f(vx,vv) v2 = scan_f(vx,vv)
assert numpy.allclose(v1,v2) assert numpy.allclose(v1,v2)
self.check_nondiff_rop( theano.clone(y,
replace={self.mx:break_op(self.mx)}))
vv = numpy.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)
yv = TT.Lop(y, self.mx, self.v)
lop_f = function([self.mx, self.v], yv)
sy, _ = theano.scan( lambda i,y,x,v: (TT.grad(y[i]*v[i],x))[i],
sequences = TT.arange(y.shape[0]),
non_sequences = [y,self.mx,self.v])
scan_f = function([self.mx, self.v], sy)
class test_lop(unittest.TestCase):
def test_specifyshape(self): v1 = lop_f(vx,vv)
rng = numpy.random.RandomState(utt.fetch_seed()) v2 = scan_f(vx,vv)
vx = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX) assert numpy.allclose(v1,v2)
vv = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX)
x = TT.vector('x') def check_rop_lop(self, y, out_shape):
v = TT.vector('v') # TEST ROP
y = TT.specify_shape(x, (5,)) vx = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
yv = TT.Lop(y,x,v) vv = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
rop_f = function([x,v], yv)
yv = TT.Rop(y,self.x,self.v)
rop_f = function([self.x,self.v], yv)
J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x), J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x),
sequences = TT.arange(x.shape[0]), sequences = TT.arange(y.shape[0]),
non_sequences = [y,x]) non_sequences = [y,self.x])
sy = TT.dot(v, J) sy = TT.dot(J, self.v)
scan_f = function([x,v], sy) scan_f = function([self.x,self.v], sy)
v1 = rop_f(vx,vv) v1 = rop_f(vx,vv)
v2 = scan_f(vx,vv) v2 = scan_f(vx,vv)
assert numpy.allclose(v1,v2) assert numpy.allclose(v1,v2)
self.check_nondiff_rop( theano.clone(y,
replace={self.x:break_op(self.x)}))
# TEST LOP
vx = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
vv = numpy.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)
yv = TT.Lop(y,self.x,self.v)
lop_f = function([self.x,self.v], yv)
J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x),
sequences = TT.arange(y.shape[0]),
non_sequences = [y,self.x])
sy = TT.dot(self.v, J)
scan_f = function([self.x,self.v], sy)
v1 = lop_f(vx,vv)
v2 = scan_f(vx,vv)
assert numpy.allclose(v1,v2)
def test_shape(self):
# Shape is not differentiable, so Rop on it must raise.
self.check_nondiff_rop( self.x.shape[0])
def test_specifyshape(self):
# SpecifyShape is an identity op with a shape assertion; R-op and L-op
# should both match the scan-based reference.
self.check_rop_lop(TT.specify_shape(self.x, self.in_shape),
self.in_shape)
def test_max_argmax(self):
    """max along axis 1 is differentiable: check R-op and L-op."""
    # FIX: the original called `check_map_rop_lop`, a typo for
    # `check_mat_rop_lop` that would raise AttributeError if run.
    self.check_mat_rop_lop(TT.max(self.mx, axis=1),
                           (self.mat_in_shape[0],))

def test_argmax(self):
    """argmax is not differentiable: Rop must raise."""
    # FIX: renamed from a second `test_max_argmax` definition that
    # silently shadowed the method above, so only one test ever ran.
    self.check_nondiff_rop(TT.argmax(self.mx, axis=1))
def test_subtensor(self):
# Basic slicing: output has 4 of the input's elements.
self.check_rop_lop(self.x[:4], (4,))
def test_incsubtensor1(self):
    """inc_subtensor where the differentiable input is the target."""
    tv = numpy.asarray(self.rng.uniform(size=(3,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.inc_subtensor(self.x[:3], t)
    self.check_rop_lop(out, self.in_shape)

def test_incsubtensor2(self):
    """inc_subtensor where the differentiable input is the increment."""
    # FIX: renamed from a duplicate `test_incsubtensor1` definition that
    # shadowed the method above, so only one of the two cases ever ran.
    tv = numpy.asarray(self.rng.uniform(size=(10,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.inc_subtensor(t[:4], self.x[:4])
    self.check_rop_lop(out, (10,))
def test_setsubtensor1(self):
    """set_subtensor where the differentiable input is the target."""
    tv = numpy.asarray(self.rng.uniform(size=(3,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.set_subtensor(self.x[:3], t)
    self.check_rop_lop(out, self.in_shape)

def test_setsubtensor2(self):
    """set_subtensor where the differentiable input is the new value."""
    # FIX: renamed from a duplicate `test_setsubtensor1` definition that
    # shadowed the method above, so only one of the two cases ever ran.
    tv = numpy.asarray(self.rng.uniform(size=(10,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.set_subtensor(t[:4], self.x[:4])
    self.check_rop_lop(out, (10,))
def test_join(self):
# Join with a constant (shared) second operand; output length is the
# sum of the two operand lengths.
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
out = TT.join(0, self.x, t)
self.check_rop_lop(out, (self.in_shape[0]+10,))
def test_dot(self):
# Vector-matrix dot with a fixed (shared) square weight matrix.
insh = self.in_shape[0]
vW = numpy.asarray(self.rng.uniform(size=(insh,insh)),
theano.config.floatX)
W = theano.shared(vW)
self.check_rop_lop( TT.dot(self.x, W), self.in_shape)
def test_elemwise0(self):
# Simple differentiable elemwise chain: add then square.
self.check_rop_lop( (self.x+1)**2, self.in_shape)
def test_elemwise1(self):
# Elemwise with a mixed-dtype operand: the int32 cast branch is
# non-differentiable and should contribute nothing.
self.check_rop_lop( self.x+TT.cast(self.x, 'int32'),
self.in_shape)
def test_sum(self):
# Row-wise sum of a matrix; result is a vector of row sums.
self.check_mat_rop_lop(self.mx.sum(axis=1), (self.mat_in_shape[0],))
def test_softmax(self):
# Softmax adds an extra dimension (it returns a 2-D result for a
# vector input), hence the [0] indexing.
self.check_rop_lop( TT.nnet.softmax(self.x)[0], self.in_shape)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论