Commit 5f75d4a0 authored by lamblin

Merge pull request #1019 from lamblin/grad_downcast

Re-add part of the dtype constraint on out grads
@@ -465,9 +465,41 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
 # build a dict mapping var to the gradient of cost with respect to var
 grad_dict = {}
 
-# by default, the gradient of the cost is 1
-if g_cost is None:
-    g_cost = _float_ones_like(cost)
+# The gradient of the cost should default to 1 if the cost is of a
+# continuous dtype (float, for the moment, as complex are unsupported),
+# and should always be 0 if the cost is of discrete (integer) dtype.
+if getattr(cost.type, 'dtype', None) not in tensor.float_dtypes:
+    if g_cost is not None:
+        try:
+            cval = theano.get_constant_value(g_cost)
+            if cval == 0:
+                g_cost_is_zero = True
+            else:
+                g_cost_is_zero = False
+        except TypeError:
+            g_cost_is_zero = False
+        if not g_cost_is_zero:
+            raise ValueError("The gradient of a cost of non-continuous "
+                    "dtype (here, %s), if it is defined, should be 0. "
+                    "However, a value of %s was provided in the 'g_cost' "
+                    "argument of theano.grad(). To remove this error, "
+                    "you can simply omit the 'g_cost' argument, or "
+                    "give it the default value of None." % (
+                        getattr(g_cost.type, 'dtype', 'no dtype defined'),
+                        g_cost))
+    g_cost = tensor.zeros_like(cost)
+elif g_cost is None:
+    # cost.type.dtype is in tensor.float_dtypes at that point
+    g_cost = tensor.ones_like(cost)
+else:
+    # Cast the provided gradient so that it has the same dtype
+    # as the cost.
+    g_cost = g_cost.astype(cost.type.dtype)
 
 grad_dict[cost] = g_cost
 
 # the gradient of the constants is 0
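
A minimal sketch of what this branch means for callers (it mirrors the test_output_grad_on_int test added further down; variable names are illustrative only):

    import theano
    import theano.tensor as T

    x = T.iscalar('x')
    y = x * 2                        # cost with a discrete (integer) dtype
    # Omitting g_cost, or passing a constant zero, is accepted: the cost's
    # own gradient is forced to zero because the cost dtype is discrete.
    theano.grad(y, x)
    theano.grad(y, x, g_cost=T.constant(0))
    # Any non-zero g_cost on an integer cost is rejected with ValueError.
    try:
        theano.grad(y, x, g_cost=T.constant(1))
    except ValueError:
        pass
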
@@ -501,10 +533,12 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
 cost_name = cost.name
 
 # Make sure we didn't initialize the grad_dict with any ints
+# for non-int outputs
 for var in grad_dict:
     g = grad_dict[var]
-    if hasattr(g.type, 'dtype'):
-        assert g.type.dtype.find('float') != -1
+    if (hasattr(g.type, 'dtype') and
+            getattr(var.type, 'dtype', '') in tensor.float_dtypes):
+        assert g.type.dtype in tensor.float_dtypes
 
 rval = _populate_grad_dict(var_to_node_to_idx,
                            grad_dict, wrt, cost_name)
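
The membership tests above use the dtype lists defined in theano.tensor; a quick sketch of how they replace the older string searches (the exact list contents depend on the Theano version):

    import theano.tensor as tensor

    # Lists of dtype names grouped by kind, e.g. ['float32', 'float64'] and
    # the signed/unsigned integer dtypes.
    print(tensor.float_dtypes)
    print(tensor.discrete_dtypes)

    # Membership tests replace string searches such as
    # dtype.find('float') != -1 used by the previous code.
    assert 'float32' in tensor.float_dtypes
    assert 'int64' in tensor.discrete_dtypes
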
@@ -739,7 +773,40 @@ def _populate_grad_dict(var_to_node_to_idx,
 inputs = [try_to_copy_if_needed(ipt) for ipt in inputs]
 
-input_grads = node.op.grad(inputs, output_grads)
+# Build a list of output gradients with the same dtype as
+# the corresponding output variable.
+# If an output is of a float dtype, we want to cast the
+# output gradient into the same dtype, to avoid having a
+# gradient graph with double precision (taking more memory,
+# and more computation).
+# If an output is of an integer dtype, then we ensure the
+# output gradient is zero, and that zero can be represented
+# in the same int dtype.
+# If an output gradient is a NullType or DisconnectedType,
+# then it will not have a dtype, and it will not be changed.
+new_output_grads = []
+for o, og in zip(node.outputs, output_grads):
+    o_dt = getattr(o.type, 'dtype', None)
+    og_dt = getattr(og.type, 'dtype', None)
+    if og_dt and o_dt in theano.tensor.discrete_dtypes:
+        new_output_grads.append(o.zeros_like())
+    elif o_dt and og_dt and o_dt != og_dt:
+        new_output_grads.append(og.astype(o_dt))
+    else:
+        new_output_grads.append(og)
+
+# Make sure that, if new_output_grads[i] has a dtype:
+# - it is the same dtype as outputs[i]
+# - if the dtype is an int, then new_output_grads[i] is 0.
+for o, ng in zip(node.outputs, new_output_grads):
+    o_dt = getattr(o.type, 'dtype', None)
+    ng_dt = getattr(ng.type, 'dtype', None)
+    if ng_dt:
+        assert ng_dt == o_dt
+        if ng_dt in theano.tensor.discrete_dtypes:
+            assert theano.get_constant_value(ng) == 0
+
+input_grads = node.op.grad(inputs, new_output_grads)
 
 if input_grads is None:
     raise TypeError("%s.grad returned NoneType, "
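
A small sketch of the intended effect on a user graph (it mirrors the test_downcast_dtype test added below): the output gradient handed to each op is cast to that output's dtype, so a float32 path no longer produces a float64 gradient.

    import theano
    import theano.tensor as T

    x = T.fscalar('x')     # float32, regardless of floatX
    y = x * 2
    z = T.lscalar('z')     # int64
    c = y + z
    dc_dx = theano.grad(c, x)
    # The gradient passed to the mul op is downcast to float32,
    # so the gradient with respect to x keeps that dtype.
    assert dc_dx.dtype == 'float32'
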
@@ -764,7 +831,7 @@ def _populate_grad_dict(var_to_node_to_idx,
 #List of bools indicating if each output is an integer dtype
 output_is_int = [hasattr(output.type, 'dtype') and
-                 output.type.dtype.find('int') != -1
+                 output.type.dtype in theano.tensor.discrete_dtypes
                  for output in node.outputs]
 
 #List of bools indicating if each input only has integer outputs
...@@ -792,7 +859,7 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -792,7 +859,7 @@ def _populate_grad_dict(var_to_node_to_idx,
if not isinstance(term.type, if not isinstance(term.type,
(NullType, DisconnectedType)): (NullType, DisconnectedType)):
if term.type.dtype.find('float') == -1: if term.type.dtype not in theano.tensor.float_dtypes:
raise TypeError(str(node.op) + '.grad illegally ' raise TypeError(str(node.op) + '.grad illegally '
' returned an integer-valued variable.' ' returned an integer-valued variable.'
' (Input index %d, dtype %s)' % (i, ' (Input index %d, dtype %s)' % (i,
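
For illustration, this is the kind of Op.grad implementation that the check above rejects; DummyIntGradOp is a made-up example, not part of the patch:

    import theano
    import theano.tensor as T
    from theano import gof

    class DummyIntGradOp(gof.Op):
        def make_node(self, x):
            return gof.Apply(self, [x], [x.type()])

        def grad(self, inputs, output_grads):
            # Illegally return an integer-valued gradient for a float input.
            return [inputs[0].zeros_like().astype('int64')]

    x = T.vector()
    cost = DummyIntGradOp()(x).sum()
    try:
        theano.grad(cost, x)
    except TypeError:
        # ".grad illegally returned an integer-valued variable"
        pass
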
@@ -997,8 +1064,18 @@ def grad_sources_inputs(sources, graph_inputs):
 # build a dict mapping var to the gradient of cost with respect to var
 grad_dict = {}
 
-# by default, the gradient of the cost is 1
 for output, output_grad in sources:
+    # The gradient of the cost should always be 0 if the cost is of
+    # discrete (integer) dtype.
+    if getattr(output.type, 'dtype', '') not in theano.tensor.float_dtypes:
+        output_grad = output.zeros_like()
+    else:
+        # Cast the provided gradient so that it has the same dtype
+        # as the cost.
+        output_grad = output_grad.astype(output.type.dtype)
+
     grad_dict[output] = output_grad
 
 # variables that do not influence the cost have zero gradient.
@@ -1369,12 +1446,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
 cost_fn = function(tensor_pt, cost)
 
-# todo-- determine if this is actually needed
-g_cost = as_tensor_variable(1.0, name='g_cost')
-if cast_to_output_type:
-    g_cost = cast(g_cost, o_output.dtype)
-symbolic_grad = grad(cost, tensor_pt, g_cost,
+symbolic_grad = grad(cost, tensor_pt,
                      disconnected_inputs='ignore')
 
 grad_fn = function(tensor_pt, symbolic_grad)
......
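
With the hand-built g_cost gone, verify_grad simply calls grad() and lets it pick consistent dtypes; a typical use (taken from the tests further down) looks like this:

    import numpy as np
    import theano.tensor
    import theano.tests.unittest_tools

    def cost(x, A):
        return theano.tensor.dot(x, theano.tensor.dot(A, x))

    rng = np.random.RandomState([2012, 8, 28])
    vx = rng.randn(2)
    vA = rng.randn(2, 2)

    # Checks the symbolic gradient of `cost` against a finite-difference
    # estimate at the given test point.
    theano.tests.unittest_tools.verify_grad(cost, [vx, vA])
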
@@ -1966,10 +1966,18 @@ class TensorFromScalar(Op):
     def grad(self, inp, grads):
         s, = inp
         dt, = grads
-        assert dt.type.dtype.find('float') != -1
-        if s.type.dtype.find('int') != -1:
+        if s.type.dtype in float_dtypes:
+            assert dt.type.dtype in float_dtypes
+            return [scalar_from_tensor(dt)]
+
+        # If the input dtype is an integer, then so is the output dtype,
+        # and the "zero" gradient can be represented in that int dtype.
+        # Currently, theano.grad insists that the dtype of the returned
+        # gradient has a float dtype, so we use floatX.
+        if s.type.dtype in discrete_dtypes:
             return [s.zeros_like().astype(theano.config.floatX)]
-        return [scalar_from_tensor(dt)]
+
+        raise NotImplementedError("grad not implemented for complex dtypes")
 
     def __str__(self):
         return self.__class__.__name__
......
...@@ -11,7 +11,6 @@ from theano import gradient ...@@ -11,7 +11,6 @@ from theano import gradient
from theano.tensor.nnet.Conv3D import conv3D from theano.tensor.nnet.Conv3D import conv3D
from theano import config from theano import config
import numpy as np import numpy as np
from theano.gradient import DisconnectedType
from theano.gof.null_type import NullType from theano.gof.null_type import NullType
one = theano.tensor.as_tensor_variable(1.) one = theano.tensor.as_tensor_variable(1.)
@@ -32,14 +31,11 @@ class testgrad_sources_inputs(unittest.TestCase):
                 gz, = grads
                 pass
         a = retNone().make_node()
-        try:
-            grad_sources_inputs([(a.out, one)], None)
-        except TypeError, e:
-            return
-        self.fail()
+        self.assertRaises(TypeError, grad_sources_inputs, [(a.out, one)], None)
 
     def test_wrong_rval_len1(self):
-        """Test that it is not ok to return the wrong number of gradient terms"""
+        """Test that it is not ok to return the wrong number of gradient terms
+        """
         class retOne(gof.op.Op):
             def make_node(self, *inputs):
                 outputs = [theano.tensor.vector()]
@@ -51,13 +47,10 @@ class testgrad_sources_inputs(unittest.TestCase):
         i = theano.tensor.vector()
         j = theano.tensor.vector()
         a1 = retOne().make_node(i)
-        g = grad_sources_inputs([(a1.out, one)], None)
+        grad_sources_inputs([(a1.out, one)], None)
         a2 = retOne().make_node(i, j)
-        try:
-            g = grad_sources_inputs([(a2.out, one)], None)
-        except ValueError, e:
-            return
-        self.fail()
+        self.assertRaises(ValueError, grad_sources_inputs,
+                          [(a2.out, one)], None)
 
     def test_1in_1out(self):
         """Test grad is called correctly for a 1-to-1 op"""
@@ -132,281 +125,299 @@ class testgrad_sources_inputs(unittest.TestCase):
        self.assertTrue(g[a1.inputs[1]] is gval1)


class test_grad(unittest.TestCase):

    def test_unimplemented_grad_func(self):
        # tests that function compilation catches unimplemented grads
        # in the graph
        a = theano.tensor.vector()
        b = theano.gradient.grad_not_implemented(theano.tensor.add, 0, a)
        self.assertRaises(TypeError, theano.function,
                          [a], b, on_unused_input='ignore')

    def test_undefined_grad_func(self):
        #tests that function compilation catches undefined grads in the graph
        a = theano.tensor.vector()
        b = theano.gradient.grad_undefined(theano.tensor.add, 0, a)
        self.assertRaises(TypeError, theano.function,
                          [a], b, on_unused_input='ignore')

    def test_unimplemented_grad_grad(self):
        #tests that unimplemented grads are caught in the grad method

        class DummyOp(gof.Op):
            def make_node(self, x):
                return gof.Apply(self, [x], [x.type()])

            def grad(self, inputs, output_grads):
                return [theano.gradient.grad_not_implemented(
                    self, 0, inputs[0])]

        a = theano.tensor.scalar()
        b = DummyOp()(a)

        self.assertRaises(TypeError, theano.gradient.grad, b, a)

    def test_undefined_grad_grad(self):
        #tests that undefined grads are caught in the grad method

        V = theano.tensor.TensorType(dtype=config.floatX,
                broadcastable=(False, False, False, False, False))()
        W = theano.tensor.TensorType(dtype=config.floatX,
                broadcastable=(False, False, False, False, False))()
        b = theano.tensor.vector()
        d = theano.tensor.ivector()

        Z = conv3D(V, W, b, d)

        self.assertRaises(TypeError, theano.gradient.grad, Z.sum(), d)

    def test_grad_name(self):
        A = theano.tensor.matrix('A')
        x = theano.tensor.vector('x')
        f = theano.tensor.dot(x, theano.tensor.dot(A, x))
        f.name = 'f'
        g = theano.tensor.grad(f, x)
        assert g.name == '(df/dx)'

    def test_grad_duplicate_input(self):
        #test that the grad works when a variable
        #appears in more than one place in a node's input list

        def output(x):
            return (x * x)

        rng = np.random.RandomState([2012, 8, 28])
        vx = rng.randn(2)

        theano.tests.unittest_tools.verify_grad(output, [vx])

    def test_grad_quadratic(self):
        #test the gradient on a tiny graph

        def cost(x, A):
            return theano.tensor.dot(x, theano.tensor.dot(A, x))

        rng = np.random.RandomState([2012, 8, 28])
        vx = rng.randn(2)
        vA = rng.randn(2, 2)

        theano.tests.unittest_tools.verify_grad(cost, [vx, vA])

    def test_grad_quadratic_vector(self):
        #test the gradient on a small graph

        def output(x, A):
            return theano.tensor.dot(x * x, A)

        rng = np.random.RandomState([2012, 8, 28])
        vx = rng.randn(2)
        vA = rng.randn(2, 2)

        theano.tests.unittest_tools.verify_grad(output, [vx, vA])

    def test_grad_cubic(self):
        #test the gradient on a bigger graph

        def cost(x, A):
            return theano.tensor.dot(x * x, theano.tensor.dot(A, x))

        rng = np.random.RandomState([2012, 8, 28])
        vx = rng.randn(2)
        vA = rng.randn(2, 2)

        theano.tests.unittest_tools.verify_grad(cost, [vx, vA])

    def test_grad_grad_quadratic(self):
        #test the gradient on a graph constructed using the gradient

        def output(x, A):
            orig_cost = theano.tensor.dot(x, theano.tensor.dot(A, x))
            return theano.gradient.grad(orig_cost, x)

        rng = np.random.RandomState([2012, 8, 28])
        vx = rng.randn(2)
        vA = rng.randn(2, 2)

        theano.tests.unittest_tools.verify_grad(output, [vx, vA])

    def test_grad_grad_cubic(self):
        #test the gradient on a bigger graph constructed using the gradient

        def output(x, A):
            orig_cost = theano.tensor.dot(x * x, theano.tensor.dot(A, x))
            return theano.gradient.grad(orig_cost, x)

        rng = np.random.RandomState([2012, 8, 28])
        vx = rng.randn(2)
        vA = rng.randn(2, 2)

        theano.tests.unittest_tools.verify_grad(output, [vx, vA])

    def test_grad_int(self):
        # tests that the gradient with respect to an integer
        # is the same as the gradient with respect to a float

        W = theano.tensor.matrix()
        b = theano.tensor.vector()

        def make_grad_func(X):
            Z = theano.tensor.dot(X, W) + b
            H = theano.tensor.nnet.sigmoid(Z)
            cost = H.sum()
            g = gradient.grad(cost, X)
            return theano.function([X, W, b], g, on_unused_input='ignore')

        int_func = make_grad_func(theano.tensor.imatrix())
        #we have to use float64 as the float type to get the results to match
        #using an integer for the input makes all the later functions use
        #float64
        float_func = make_grad_func(theano.tensor.matrix(dtype='float64'))

        m = 5
        d = 3
        n = 4
        rng = np.random.RandomState([2012, 9, 5])

        int_type = theano.tensor.imatrix().dtype
        float_type = 'float64'

        X = np.cast[int_type](rng.randn(m, d) * 127.)
        W = np.cast[W.dtype](rng.randn(d, n))
        b = np.cast[b.dtype](rng.randn(n))

        int_result = int_func(X, W, b)
        float_result = float_func(np.cast[float_type](X), W, b)

        assert np.allclose(int_result, float_result), (
                int_result, float_result)

    def test_grad_disconnected(self):
        #tests corner cases of gradient for shape and alloc

        x = theano.tensor.vector(name='x')
        total = x.sum()
        total.name = 'total'
        num_elements = x.shape[0]
        num_elements.name = 'num_elements'
        silly_vector = theano.tensor.alloc(total / num_elements, num_elements)
        silly_vector.name = 'silly_vector'

        cost = silly_vector.sum()
        cost.name = 'cost'

        #note that cost simplifies to be the same as "total"
        g = gradient.grad(cost, x, add_names=False)

        #we still need to pass in x because it determines the shape of
        #the output
        f = theano.function([x], g)

        rng = np.random.RandomState([2012, 9, 5])
        x = np.cast[x.dtype](rng.randn(3))
        g = f(x)

        assert np.allclose(g, np.ones(x.shape, dtype=x.dtype))

    def test_disconnected_nan(self):
        # test that connection_pattern can prevent getting NaN

        # Op1 has two outputs, f and g
        # x is connected to f but not to g
        class Op1(theano.gof.Op):
            def make_node(self, x):
                return theano.Apply(self, inputs=[x],
                        outputs=[x.type(), theano.tensor.scalar()])

            def connection_pattern(self, node):
                return [[True, False]]

            def grad(self, inputs, output_grads):
                return [inputs[0].zeros_like()]

        # Op2 has two inputs, f and g
        # Its gradient with respect to g is not defined
        class Op2(theano.gof.Op):
            def make_node(self, f, g):
                return theano.Apply(self, inputs=[f, g],
                        outputs=[theano.tensor.scalar()])

            def grad(self, inputs, output_grads):
                return [inputs[0].zeros_like(), NullType()()]

        x = theano.tensor.vector()
        f, g = Op1()(x)
        cost = Op2()(f, g)

        # cost is differentiable wrt x
        # but we can't tell that without using Op1's connection pattern
        # looking at the theano graph alone, g is an ancestor of cost
        # and has x as an ancestor, so we must compute its gradient

        g = gradient.grad(cost, x)

        # If we made it to here without an exception, then the
        # connection_pattern functionality worked correctly

    def test_sum_disconnected(self):
        # Tests that we can add DisconnectedType to other terms correctly

        x = theano.tensor.scalar()
        y = x * 2.
        z = x + 1.

        cost = y + z

        theano.tensor.grad(cost, x, consider_constant=[y, z])

        # In an earlier version of theano, the above line would have failed
        # while trying to add two DisconnectedTypes

    def test_output_grad_on_int(self):
        # If the g_cost argument is specified when x has a discrete dtype,
        # g_cost should be equivalent to 0.
        x = theano.tensor.iscalar('x')
        y = x * 2

        # Should work:
        c0 = theano.tensor.constant(0)
        theano.grad(y, x, g_cost=c0)
        theano.grad(y, x, g_cost=y.zeros_like())
        theano.grad(y, x, g_cost=y.zeros_like().astype('float64'))

        # Should raise ValueError
        c1 = theano.tensor.constant(1)
        self.assertRaises(ValueError, theano.grad, y, x, g_cost=c1)
        s0 = theano.shared(np.zeros((), dtype='int8'))
        self.assertRaises(ValueError, theano.grad, y, x, g_cost=s0)

    def test_downcast_dtype(self):
        # Test that the gradient of a cost wrt a float32 variable does not
        # get upcasted to float64.
        # x has dtype float32, regardless of the value of floatX
        x = theano.tensor.fscalar('x')
        y = x * 2
        z = theano.tensor.lscalar('z')
        c = y + z
        dc_dx, dc_dy, dc_dz, dc_dc = theano.grad(c, [x, y, z, c])
        # The dtype of dc_dy and dc_dz can be either float32 or float64,
        # that might depend on floatX, but is not specified.
        assert dc_dc.dtype in ('float32', 'float64')
        assert dc_dz.dtype in ('float32', 'float64')
        assert dc_dy.dtype in ('float32', 'float64')

        # When the output gradient of y is passed to op.grad, it should
        # be downcasted to float32, so dc_dx should also be float32
        assert dc_dx.dtype == 'float32'


if __name__ == '__main__':
    unittest.main()