Commit d020e83b, authored by James Bergstra

Replaced tensor.basic.SetSubtensor with tensor.basic.IncSubtensor.

This was done in order to greatly simplify a new optimization to serialize multiple IncSubtensors of a common base. This is a case that happens if you compute more than one subtensor from a single variable and then take a gradient. Serialization helps because then all the IncSubtensors can be done inplace.
Parent commit: bc8e89aa
...@@ -1750,7 +1750,7 @@ class Subtensor(Op): ...@@ -1750,7 +1750,7 @@ class Subtensor(Op):
def grad(self, inputs, grads):
    """Return the gradient of each input of this Subtensor op.

    The gradient w.r.t. x (inputs[0]) is an IncSubtensor that adds gz
    into a zeros_like(x) at the positions selected by self.idx_list.
    The remaining inputs are index/slice arguments and are not
    differentiable, so their gradients are None.

    NOTE(review): reconstructed from a garbled side-by-side diff
    rendering; the original used Python 2 tuple-parameter syntax
    ``def grad(self, inputs, (gz,))``, which is invalid in Python 3.
    The call interface (three positional arguments) is unchanged.
    """
    (gz,) = grads
    x = inputs[0]
    rest = inputs[1:]
    return [IncSubtensor(self.idx_list)(zeros_like(x), gz, *rest)] \
        + [None] * len(rest)
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and self.idx_list == other.idx_list return type(self) == type(other) and self.idx_list == other.idx_list
...@@ -1837,13 +1837,14 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor), S ...@@ -1837,13 +1837,14 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor), S
class SetSubtensor(Op): class IncSubtensor(Op):
"""Set just some elements of a larger TensorType. """Increment a subtensor.
This is like numpy's This is like numpy's
z[i,j,k] = <something> z[i,j,k] += <something>
It is used internally to implement the gradient on SubTensor.
""" """
def __init__(self, idx_list, inplace=False): def __init__(self, idx_list, inplace=False):
...@@ -1901,7 +1902,7 @@ class SetSubtensor(Op): ...@@ -1901,7 +1902,7 @@ class SetSubtensor(Op):
broadcastable = [bc for p, bc in zip(padded, x.type.broadcastable) if isinstance(p, slice)] broadcastable = [bc for p, bc in zip(padded, x.type.broadcastable) if isinstance(p, slice)]
if y.type.broadcastable != tuple(broadcastable): if y.type.broadcastable != tuple(broadcastable):
raise TypeError("Invalid broadcastable pattern for y in SetSubtensor.make_node") raise TypeError("Invalid broadcastable pattern for y in IncSubtensor.make_node")
input_types = Subtensor.collapse(idx_list, lambda entry: isinstance(entry, gof.Type)) input_types = Subtensor.collapse(idx_list, lambda entry: isinstance(entry, gof.Type))
if len(inputs) != len(input_types): if len(inputs) != len(input_types):
...@@ -1933,7 +1934,8 @@ class SetSubtensor(Op): ...@@ -1933,7 +1934,8 @@ class SetSubtensor(Op):
cdata = cdata[0] cdata = cdata[0]
if not self.inplace: if not self.inplace:
x = x.copy() x = x.copy()
x.__setitem__(cdata, y) sub_x = x.__getitem__(cdata)
sub_x += y
out[0] = x out[0] = x
def split(x, splits_size, n_splits, axis=0): def split(x, splits_size, n_splits, axis=0):
......
...@@ -291,13 +291,73 @@ def local_subtensor_make_vector(node): ...@@ -291,13 +291,73 @@ def local_subtensor_make_vector(node):
register_canonicalize(local_subtensor_make_vector) register_canonicalize(local_subtensor_make_vector)
@register_canonicalize
@gof.local_optimizer([None])
def local_IncSubtensor_serialize(node):
    """
    When using Subtensor, gradient graphs can be ugly.

    If we ask for grad(f(a[0]), a), we are going to get something like

        IncSubtensor(Elemwise{second}(a, 0), g(f(a[0])), [0])

    This might be ugly, but at least it's as fast as you could want. If we ask for
    grad(f(a[0], a[1], a[2]), a), it's much worse...

        Elemwise{Add}
            IncSubtensor(Elemwise{second}(a, 0), g(f(a[0])), [0])
            IncSubtensor(Elemwise{second}(a, 0), g(f(a[1])), [1])
            IncSubtensor(Elemwise{second}(a, 0), g(f(a[2])), [2])

    This is much worse because this time we have to produce 3 matrices the size
    of 'a', just so we can add them together.

    This Op rearranges IncSubtensor's that all work on the same initial argument
    (here, Elemwise{second}(a,0)) into a chain.  The advantage of the chain
    structure is that each one can be optimized later in the pipeline to operate
    inplace.

    Ideally, the op will do something like this:

    #
    #  add(x, incsubtensor(b, c), incsubtensor(b, d))
    #  -> incsubtensor(incsubtensor(add(x,b), c), d)
    """
    def movable(i):
        # Return True iff `i` is an IncSubtensor output that we can lift out of
        # the add: its type must match the add's output type, and it must have
        # exactly one client (this add) — otherwise re-chaining it would change
        # what its other consumers see.
        return i.owner \
                and isinstance(i.owner.op, T.IncSubtensor) \
                and i.type == o_type \
                and len(i.clients) == 1

    if node.op == T.add:
        o_type = node.outputs[0].type  # read by `movable` via closure

        movable_inputs = [i for i in node.inputs if movable(i)]

        if movable_inputs:
            # Sum the non-movable inputs together with the *bases* of the
            # movable IncSubtensors (each one's first input), then re-apply
            # each IncSubtensor on top of that sum, forming a serial chain.
            new_inputs = [i for i in node.inputs if not movable(i)] \
                    + [mi.owner.inputs[0] for mi in movable_inputs]
            new_add = T.add(*new_inputs)

            # stack up the new incsubtensors
            tip = new_add
            for mi in movable_inputs:
                assert tip.type == o_type
                assert tip.type == mi.owner.inputs[0].type
                tip = mi.owner.op(tip, *mi.owner.inputs[1:])
            return [tip]

    # print incsub_inputs, [id(i.owner.inputs[0]) for i in incsub_inputs]
#after priority 50 Destructive inplace operations #after priority 50 Destructive inplace operations
#gemm is the first one now, at priority 70 #gemm is the first one now, at priority 70
@gof.local_optimizer([None])
def local_inplace_setsubtensor(node):
    """Replace a non-inplace IncSubtensor with its inplace counterpart.

    Per the surrounding comments, this runs after priority 50 (the
    destructive/inplace phase), so correctness-preserving rewrites have
    already been applied and the destructive version is safe to insert.

    :returns: a one-element list holding the replacement node's output,
        or False when `node` is not a candidate.

    NOTE(review): reconstructed from a garbled side-by-side diff
    rendering. The function name still says "setsubtensor" although it
    now matches IncSubtensor; it is kept unchanged because the name may
    be referenced by the optimizer registration elsewhere in the file.
    """
    if isinstance(node.op, T.IncSubtensor) and not node.op.inplace:
        new_op = T.IncSubtensor(node.op.idx_list, inplace=True)
        return [new_op(*node.inputs)]
    return False
......
Markdown format supported
0%
You are adding 0 people to this discussion. Proceed carefully.
Please finish editing this comment first!
Sign in or register to post a comment