Merge pull request #2521 from sisp/local_useless_incsubtensor_alloc

[WIP] New optimization: useless incsubtensor alloc

Merge pull request #2521 from sisp/local_useless_incsubtensor_alloc
f276201c · Frédéric Bastien · c8537817 · 9b9129c1 · f276201c · f276201c
--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -1045,7 +1045,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        return code_template % dict(locals(), **sub)
-class CrossentropySoftmax1HotWithBiasDx (gof.Op):
+class CrossentropySoftmax1HotWithBiasDx(gof.Op):
    nin = 3
    nout = 1
    """Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op"""
@@ -1065,9 +1065,9 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
        dy = tensor.as_tensor_variable(dy)
        sm = tensor.as_tensor_variable(sm)
        y_idx = tensor.as_tensor_variable(y_idx)
-        if (dy.type.ndim != 1 or
+        if (dy.type.ndim > 1 or
            dy.type.dtype not in tensor.float_dtypes):
-            raise ValueError('dy must be 1-d tensor of floats', dy.type)
+            raise ValueError('dy must be {0,1}-d tensor of floats', dy.type)
        if (sm.type.ndim != 2 or
            sm.type.dtype not in tensor.float_dtypes):
            raise ValueError('sm must be 2-d tensor of floats', sm.type)
@@ -1079,9 +1079,13 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
    def perform(self, node, input_storage, output_storage):
        dy, sm, y_idx = input_storage
        dx = numpy.zeros_like(sm)
+        if dy.ndim == 0:
+            dy = dy[None]
+        incr = int(dy.shape[0] > 1)
        for i in xrange(sm.shape[0]):
-            dx[i] = dy[i] * sm[i]  # vector scale
+            dy_i = dy[i * incr]
-            dx[i, y_idx[i]] -= dy[i]  # scalar decrement
+            dx[i] = dy_i * sm[i]  # vector scale
+            dx[i, y_idx[i]] -= dy_i  # scalar decrement
        output_storage[0][0] = dx
    def infer_shape(self, node, shapes):
@@ -1104,14 +1108,13 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
        return [g_dy, g_sm, g_y_idx]
    def c_code_cache_version(self):
-        return (3,)
+        return (4,)
    def c_code(self, node, name, inp, out, sub):
        dnll, sm, y_idx = inp
        dx, = out
        y_idx_type = node.inputs[2].type.dtype_specs()[1]
        return """
        if ((PyArray_TYPE(%(dnll)s) != NPY_DOUBLE) &&
            (PyArray_TYPE(%(dnll)s) != NPY_FLOAT))
        {
@@ -1126,26 +1129,41 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
                 "sm type should be float32 or float64");
            %(fail)s;
        }
-        if ((PyArray_NDIM(%(dnll)s) != 1)
+        // new scope because of variable declaration
+        // TODO: proper indentation, but the diff will get messy
+        {
+        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
+        const npy_intp %(dnll)s_dims0 = (PyArray_NDIM(%(dnll)s) > 0 ?
+                                         PyArray_DIMS(%(dnll)s)[0] :
+                                         (npy_intp) 0);
+        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
+        // or a vector with just one element.
+        const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
+                                            PyArray_STRIDES(%(dnll)s)[0] :
+                                            (npy_intp) 0);
+        if ((PyArray_NDIM(%(dnll)s) > 1)
            || (PyArray_NDIM(%(sm)s) != 2)
            || (PyArray_NDIM(%(y_idx)s) != 1))
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
-        if (PyArray_DIMS(%(dnll)s)[0] != PyArray_DIMS(%(sm)s)[0])
+        if (%(dnll)s_dims0 != PyArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] (%%ld) != sm.shape[0] (%%ld)",
-                         (long int)PyArray_DIMS(%(dnll)s)[0],
+                         (long int)%(dnll)s_dims0,
                         (long int)PyArray_DIMS(%(sm)s)[0]);
            %(fail)s;
        }
-        if (PyArray_DIMS(%(dnll)s)[0] != PyArray_DIMS(%(y_idx)s)[0])
+        if (%(dnll)s_dims0 != PyArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] (%%ld) != y_idx.shape[0] (%%ld)",
-                         (long int)PyArray_DIMS(%(dnll)s)[0],
+                         (long int)%(dnll)s_dims0,
                         (long int)PyArray_DIMS(%(y_idx)s)[0]);
            %(fail)s;
        }
@@ -1166,7 +1184,7 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
        for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
        {
-            const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_BYTES(%(dnll)s) + PyArray_STRIDES(%(dnll)s)[0] * i))[0];
+            const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_BYTES(%(dnll)s) + %(dnll)s_strides0 * i))[0];
            const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
@@ -1187,6 +1205,7 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
            }
            dx_i[y_i * Sdx] -= dnll_i;
        }
+        }
        """ % dict(locals(), **sub)
 crossentropy_softmax_argmax_1hot_with_bias = \
@@ -1741,7 +1760,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
            # if the graph is valid, they have the same shape, so we
            # also know that z has the right shape.
-            if incr.type not in (dvector, fvector):
+            if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes:
                return
            # here we know that we are incrementing some part of matrix z by a vector
@@ -1789,6 +1808,65 @@ def graph_merge_softmax_with_crossentropy_softmax(node):
                    return [mergeable_client[1]]
+@opt.register_specialize
+@opt.register_stabilize
+@opt.register_canonicalize
+@gof.local_optimizer([CrossentropySoftmax1HotWithBiasDx])
+def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
+    """
+    Replaces a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
+    an `alloc` of a scalar variable or one that has either broadcastable or
+    matching dimensions with the output variable, by one that skips the
+    intermediate `alloc`.
+    """
+    if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
+        dy, sm, y_idx = node.inputs
+        # Those cases are directly handled by the internal broadcasting of the
+        # `CrossentropySoftmax1HotWithBiasDx` op.
+        if dy.ndim == 0:
+            return False
+        if dy.ndim == 1 and dy.broadcastable[0]:
+            return False
+        assert dy.ndim == 1
+        if dy.owner is not None and isinstance(dy.owner.op, tensor.Alloc):
+            # dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
+            dz = dy.owner.inputs[0]
+            try:
+                shape_feature = node.fgraph.shape_feature
+            except AttributeError:
+                # The shape feature may not be available in some modes, but we
+                # need it for this optimization, so don't continue.
+                return False
+            shape_of = shape_feature.shape_of
+            same_shape = shape_feature.same_shape
+            # Build `dz_broad` explicitly to include extra implicit dimensions.
+            dz_broad = (True,) * (dy.ndim - dz.ndim) + dz.broadcastable
+            # If we can infer statically that the shapes of `sm` and `dy` are
+            # the same in dimension 0, or that the shape of `dy` is equal to 1
+            # in that dimension (which triggers the internal broadcasting in
+            # `CrossentropySoftmax1HotWithBiasDx`), we do not need to check it
+            # at runtime.
+            if (dz_broad[0] and
+                not same_shape(sm, dy, dim_x=0, dim_y=0) and
+                shape_of[dy][0] != 1):
+                # If `dz` is broadcastable, we need to check whether the shapes
+                # of `dy` and `sm` are the same or whether the shape of `dy` is
+                # equal to 1.
+                cond = tensor.or_(tensor.eq(dy.shape[0], 1),
+                                  tensor.eq(dy.shape[0], sm.shape[0]))
+                msg = '`sm` and `dy` do not have the same shape.'
+                dz = opt.Assert(msg)(dz, cond)
+            return [node.op(dz, sm, y_idx)]
 def binary_crossentropy(output, target):
    """
    Compute the crossentropy of binary random variables

--- a/theano/tensor/nnet/tests/test_nnet.py
+++ b/theano/tensor/nnet/tests/test_nnet.py
@@ -590,7 +590,6 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
                T.sum(-T.log(softmax(x))[T.arange(y.shape[0]), y])
                ]
        for expr in expressions:
            # Verify the optimizer worked on the expressions
            f = theano.function([x, y], expr, mode=mode)
            if verbose:
@@ -612,7 +611,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
                theano.printing.debugprint(g)
            try:
                ops = [node.op for node in g.maker.fgraph.toposort()]
-                assert len(ops) == 4
+                assert len(ops) == 2
                assert crossentropy_softmax_1hot_with_bias_dx in ops
                assert softmax in ops
                assert softmax_grad not in ops
@@ -645,7 +644,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
                theano.printing.debugprint(g)
            try:
                ops = [node.op for node in g.maker.fgraph.toposort()]
-                assert len(ops) == 4
+                assert len(ops) == 2
                assert crossentropy_softmax_1hot_with_bias_dx in ops
                assert softmax_with_bias in ops
                assert softmax_grad not in ops
@@ -681,8 +680,8 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
                theano.printing.debugprint(g)
            try:
                ops = [node.op for node in g.maker.fgraph.toposort()]
-                assert len(ops) in (6, 7)
+                assert len(ops) == 5
-                # there's an extra dimshuffle in there
+                #there's an extra dimshuffle in there
                # but I can't think of a good rule to get rid of it
                assert crossentropy_softmax_1hot_with_bias_dx in ops
                assert softmax in ops
@@ -716,7 +715,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
                theano.printing.debugprint(g)
            try:
                ops = [node.op for node in g.maker.fgraph.toposort()]
-                assert len(ops) in (6, 7)
+                assert len(ops) == 5
                assert crossentropy_softmax_1hot_with_bias_dx in ops
                assert softmax_with_bias in ops
                assert softmax_grad not in ops
@@ -765,7 +764,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
                theano.printing.debugprint(g)
            try:
                ops = [node.op for node in g.maker.fgraph.toposort()]
-                assert len(ops) == 5
+                assert len(ops) == 3
                assert crossentropy_softmax_1hot_with_bias_dx in ops
                assert softmax in ops
                assert softmax_grad not in ops
@@ -1079,7 +1078,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
            # Verify the gradient wrt x
            g = theano.function([x, y, a], T.grad(expr, x), mode=mode)
            try:
-                assert 5 <= len(g.maker.fgraph.toposort()) <= 12
+                assert 3 <= len(g.maker.fgraph.toposort()) <= 6
                validate_grad_graph(g)
                g(x_val, y_val, 0.1)
            except Exception:
@@ -1090,7 +1089,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
            h = theano.function([x, y, a],
                    T.grad(expr, x, known_grads={expr: a * x.sum()}), mode=mode)
            try:
-                assert 8 <= len(h.maker.fgraph.toposort()) <= 17
+                assert 6 <= len(h.maker.fgraph.toposort()) <= 8
                validate_grad_graph(h)
                h(x_val, y_val, 0.1)
            except Exception:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -31,8 +31,12 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
                                     Subtensor, IncSubtensor, make_constant,
                                     AdvancedIncSubtensor1,
                                     AdvancedIncSubtensor,
+                                     AdvancedSubtensor,
                                     AdvancedSubtensor1,
-                                     advanced_inc_subtensor1)
+                                     advanced_subtensor,
+                                     advanced_subtensor1,
+                                     advanced_inc_subtensor1,
+                                     inc_subtensor)
 from theano import scalar
 from theano.scalar import basic
 from theano.tensor import basic as T
@@ -2730,6 +2734,107 @@ def local_adv_sub1_adv_inc_sub1(node):
    return [T.cast(y, node.outputs[0].dtype)]
+@register_specialize
+@register_stabilize
+@register_canonicalize
+@gof.local_optimizer([IncSubtensor,
+                      AdvancedIncSubtensor,
+                      AdvancedIncSubtensor1])
+def local_useless_inc_subtensor_alloc(node):
+    """
+    Replaces an [Advanced]IncSubtensor[1], whose increment is an `alloc` of
+    a fully or partially broadcastable variable, by one that skips the
+    intermediate `alloc` where possible.
+    """
+    if isinstance(node.op, (IncSubtensor,
+                            AdvancedIncSubtensor,
+                            AdvancedIncSubtensor1)):
+        x = node.inputs[0]
+        y = node.inputs[1]
+        i = node.inputs[2:]
+        if y.owner is not None and isinstance(y.owner.op, T.Alloc):
+            # `z` is the input of the Alloc op, i.e. T.alloc(z, <shape>)
+            z = y.owner.inputs[0]
+            try:
+                shape_feature = node.fgraph.shape_feature
+            except AttributeError:
+                # The shape feature may not be available in some modes, but we
+                # need it for this optimization, so don't continue.
+                return False
+            shape_of = shape_feature.shape_of
+            same_shape = shape_feature.same_shape
+            # Get the subtensor of `x` indexed by `i` in order to compare
+            # shapes later.
+            if isinstance(node.op, IncSubtensor):
+                xi = Subtensor(node.op.idx_list)(x, *i)
+            elif isinstance(node.op, AdvancedIncSubtensor):
+                xi = advanced_subtensor(x, *i)
+            elif isinstance(node.op, AdvancedIncSubtensor1):
+                xi = advanced_subtensor1(x, *i)
+            else:
+                raise Exception('Should never happen!')
+            reason = 'local_useless_incsubtensor_alloc'
+            # Add `xi` to the shape feature of `fgraph`. This is important for
+            # shape inference later because the variable must be part of the
+            # function graph in order to call `same_shape` on it.
+            if xi not in shape_of:
+                shape_feature.on_import(node.fgraph, xi.owner,
+                                        '%s: add `xi`' % reason)
+            # `xi` may have more dimensions than `y` since the subtensor ops
+            # do automatic broadcasting of the increment internally. Thus, we
+            # need to make the leading implicitly broadcasted dimensions
+            # explicit for shape comparison later.
+            if xi.ndim > y.ndim:
+                y = T.shape_padleft(y, xi.ndim - y.ndim)
+                if y not in shape_of:
+                    shape_feature.on_import(node.fgraph, y.owner,
+                                            '%s: add `y`' % reason)
+            # Build `z_broad` explicitly to include extra implicit dimensions.
+            z_broad = ((True,) * (xi.ndim - z.ndim) + z.broadcastable)
+            cond = [# In dimension `k`, the shapes of `y` and `xi` must agree,
+                    # or `y` must have shape 1 there, which the subtensor op
+                    # may treat as a broadcastable dimension.
+                    T.or_(T.eq(y.shape[k], 1), T.eq(y.shape[k], xi.shape[k]))
+                    # Loop over all dimensions.
+                    for k in xrange(xi.ndim)
+                    # We need to check the above shapes, if
+                    # * the pre-alloc increment `z` is broadcastable in
+                    #   dimension `k` (if it isn't, then the shapes of `z` and
+                    #   `y` are the same by the definition of the `Alloc` op in
+                    #   this dimension and replacing `y` by `z` will not hide a
+                    #   shape error), and
+                    # * `xi` and `y` do not have the same shape in dimension
+                    #   `k` or we cannot infer the shape statically (if the
+                    #   shapes of `xi` and `y` are not the same, then replacing
+                    #   `y` by `z` will hide the shape error of `y`), and
+                    # * the shape of `y` is not equal to 1 or we cannot infer
+                    #   the shape statically (if the shape of `y` is equal to
+                    #   1, then `y` is broadcasted by the inc_subtensor op
+                    #   internally, so the shapes of `xi` and `y` do not need
+                    #   to match in dimension `k`; else we need to check at
+                    #   runtime that the shape of `y` is either 1 or the same
+                    #   as `xi` or otherwise replacing `y` by `z` will hide a
+                    #   shape error).
+                    if (z_broad[k] and
+                        not same_shape(xi, y, dim_x=k, dim_y=k) and
+                        shape_of[y][k] != 1)]
+            if len(cond) > 0:
+                msg = '`x[i]` and `y` do not have the same shape.'
+                z = Assert(msg)(z, *cond)
+            return [node.op(x, z, *i)]
 ####################
 # Rebroadcast opts #
 ####################

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -3171,6 +3171,102 @@ class Test_local_useless_alloc(unittest.TestCase):
        assert tensor.Alloc in op_classes
+class Test_local_useless_inc_subtensor_alloc(unittest.TestCase):
+    opt_name = 'local_useless_inc_subtensor_alloc'
+    def setUp(self):
+        # The optimization requires the shape feature so we need to compile in
+        # FAST_RUN mode.
+        mode = theano.config.mode
+        if mode == 'FAST_COMPILE':
+            mode = 'FAST_RUN'
+        self.mode = compile.mode.get_mode(mode)
+    def test_advanced_inc_subtensor(self):
+        if tensor.inplace_increment is None:
+            raise SkipTest('NumPy version >= 1.8 not available')
+        x = tensor.vector('x')
+        y = tensor.scalar('y')
+        i = tensor.matrix('i', dtype='int64')
+        z = tensor.advanced_inc_subtensor(x, T.alloc(y, *i.shape), i)
+        mode1 = self.mode.excluding(self.opt_name)
+        mode2 = self.mode.including(self.opt_name)
+        f1 = theano.function([x, i, y], z, mode=mode1)
+        f2 = theano.function([x, i, y], z, mode=mode2)
+        # the alloc op should still be there
+        assert (len([n for n in f1.maker.fgraph.toposort()
+                     if isinstance(n.op, tensor.Alloc)]) == 1)
+        # the alloc op should have been removed
+        assert (len([n for n in f2.maker.fgraph.toposort()
+                     if isinstance(n.op, tensor.Alloc)]) == 0)
+        x_value = numpy.random.randn(5).astype(config.floatX)
+        y_value = numpy.random.randn()
+        i_value = numpy.random.randint(0, 3, size=(2, 3))
+        r1 = f1(x_value, i_value, y_value)
+        r2 = f2(x_value, i_value, y_value)
+        utt.assert_allclose(r1, r2)
+    def test_advanced_inc_subtensor1(self):
+        if tensor.inplace_increment is None:
+            raise SkipTest('NumPy version >= 1.8 not available')
+        x = tensor.vector('x')
+        y = tensor.scalar('y')
+        i = tensor.vector('i', dtype='int64')
+        z = tensor.advanced_inc_subtensor1(x, T.alloc(y, *i.shape), i)
+        mode1 = self.mode.excluding(self.opt_name)
+        mode2 = self.mode.including(self.opt_name)
+        f1 = theano.function([x, i, y], z, mode=mode1)
+        f2 = theano.function([x, i, y], z, mode=mode2)
+        # the alloc op should still be there
+        assert (len([n for n in f1.maker.fgraph.toposort()
+                     if isinstance(n.op, tensor.Alloc)]) == 1)
+        # the alloc op should have been removed
+        assert (len([n for n in f2.maker.fgraph.toposort()
+                     if isinstance(n.op, tensor.Alloc)]) == 0)
+        x_value = numpy.random.randn(5).astype(config.floatX)
+        y_value = numpy.random.randn()
+        i_value = numpy.random.randint(0, 3, size=2)
+        r1 = f1(x_value, i_value, y_value)
+        r2 = f2(x_value, i_value, y_value)
+        utt.assert_allclose(r1, r2)
+    def test_incsubtensor(self):
+        x = tensor.vector('x')
+        y = tensor.scalar('y')
+        i = tensor.scalar('i', dtype='int64')
+        z = tensor.inc_subtensor(x[:i], T.alloc(y, i))
+        mode1 = self.mode.excluding(self.opt_name)
+        mode2 = self.mode.including(self.opt_name)
+        f1 = theano.function([x, i, y], z, mode=mode1)
+        f2 = theano.function([x, i, y], z, mode=mode2)
+        # the alloc op should still be there
+        assert (len([n for n in f1.maker.fgraph.toposort()
+                     if isinstance(n.op, tensor.Alloc)]) == 1)
+        # the alloc op should have been removed
+        assert (len([n for n in f2.maker.fgraph.toposort()
+                     if isinstance(n.op, tensor.Alloc)]) == 0)
+        x_value = numpy.random.randn(5).astype(config.floatX)
+        y_value = numpy.random.randn()
+        i_value = 3
+        r1 = f1(x_value, i_value, y_value)
+        r2 = f2(x_value, i_value, y_value)
+        utt.assert_allclose(r1, r2)
 class test_shapeoptimizer(unittest.TestCase):
    def setUp(self):
        utt.seed_rng()