提交 e2bd5b94 authored 作者: Matthew Koichi Grimes's avatar Matthew Koichi Grimes

Merge branch 'master' into sparse_stack_fix

.. _opfromgraph:
===========
OpFromGraph
===========
This page describes :class:`theano.OpFromGraph
<theano.compile.builders.OpFromGraph>`, an Op that allows to
encapsulate a Theano graph in an Op.
This can be used to encapsulate some functionality in one block. It is
useful to scale Theano compilation for regular bigger graphs when we
reuse that encapsulated functionality with different inputs many
times. Due to this encapsulation, it can make the Theano compilation
phase faster for graphs with many nodes.
Using this for small graphs isn't recommended, as it disables
optimization between what is inside the encapsulation and what is
outside it.
.. note::

    This wasn't used widely up to now. If you have any
    questions/comments, don't hesitate to contact us on the mailing list.
.. autoclass:: theano.compile.builders.OpFromGraph
...@@ -9,8 +9,6 @@ from theano.compile.mode import * ...@@ -9,8 +9,6 @@ from theano.compile.mode import *
from theano.compile.io import * from theano.compile.io import *
from theano.compile.builders import *
from theano.compile.module import * from theano.compile.module import *
from theano.compile.debugmode import DebugMode from theano.compile.debugmode import DebugMode
...@@ -25,4 +23,6 @@ from theano.compile.sharedvalue import (shared, shared_constructor, ...@@ -25,4 +23,6 @@ from theano.compile.sharedvalue import (shared, shared_constructor,
SharedVariable) SharedVariable)
from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from theano.compile.builders import *
from theano.compile.function import function from theano.compile.function import function
from theano import gof from theano import gof
from theano import gradient as G from theano import gradient as G
from theano.compile.function_module import orig_function from theano.compile.function_module import orig_function
from theano.compile import SharedVariable, rebuild_collect_shared
from theano.gof import ops_with_inner_function from theano.gof import ops_with_inner_function
class OpFromGraph(gof.Op): class OpFromGraph(gof.Op):
""" """This create an `Op` from inputs and outputs list of variables.
This create an L{Op} from a list of input variables and a list of output
variables. The signature is similar to theano.function() and the resulting
`Op` perform will do the same operation as::
The signature is the same as the signature of L{FunctionFactory}
and/or function and the resulting L{Op}'s perform will do the same orig_function(inputs, outputs, **kwargs)
operation as::
function(inputs, outputs, **kwargs) TODO:
- examples for a multi-layer mlp. where?
Take note that the following options, if provided, must take the - __hash__, __eq__ otherwise won't merge, try gof.opt.is_same_graph_with_merge(op1.new_outputs, op2, new_outputs)
value(s) listed below: - c_code() to remove the double overhead?
unpack_single = False - opt to unfold it, work inplace on inputs
borrow_outputs = False - grad() make it support DisconnectedType and the new interface
- check how it work with updates.
OpFromGraph takes an additional input, grad_depth. If grad_depth - add test with constant as input or inside the inner graph.
is n, OpFromGraph will make special Ops for gradients up to the - Add support for the GPU? Probably just need an opt to remove transfer
nth level, allowing the user to differentiate this op up to n - Add support to pickle this Op.
times. The parameter defaults to 1. If grad_depth == 0, the op - Add support/test with random generator
will not be differentiable. :note:
- We support shared variable in the inner graph. This is automatic and
Example: invisible to the user. They can be as input to the node or in the
inner graph.
- We support unused inputs. This is needed for the grad.
Example 1:
.. code-block:: python
from theano import function, OpFromGraph, tensor
x, y, z = tensor.scalars('xyz') x, y, z = tensor.scalars('xyz')
e = x + y * z e = x + y * z
op = OpFromGraph([x, y, z], [e], linker='c') op = OpFromGraph([x, y, z], [e])
# op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2])
Example 2 with shared variable:
.. code-block:: python
import numpy
import theano
from theano import config, function, OpFromGraph, tensor
x, y, z = tensor.scalars('xyz')
s = theano.shared(numpy.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e])
# op behaves like a normal theano op # op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x) e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2]) fn = function([x, y, z], [e2])
""" """
def __init__(self, inputs, outputs, grad_depth=1, **kwargs): def __init__(self, inputs, outputs, **kwargs):
if not isinstance(outputs, list): if not isinstance(outputs, list):
raise TypeError('outputs must be list', outputs) raise TypeError('outputs must be list', outputs)
for i in inputs + outputs: for i in inputs + outputs:
...@@ -44,34 +71,33 @@ class OpFromGraph(gof.Op): ...@@ -44,34 +71,33 @@ class OpFromGraph(gof.Op):
if 'updates' in kwargs: if 'updates' in kwargs:
raise TypeError('updates are not allowed in kwargs') raise TypeError('updates are not allowed in kwargs')
# TODO: the graph may have implicit inputs like # To support correctly shared variables the inner fct should
# SharedVariable instances. # not see them. Otherwise their is problem with the gradient.
# what impact to they have on the validity of this Op? self.shared_inputs = [var for var in gof.graph.inputs(outputs)
self.fn = orig_function(inputs, outputs, **kwargs) if isinstance(var, SharedVariable)]
used_inputs = [var for var in gof.graph.inputs(outputs)
if not isinstance(var, gof.Constant)]
shared_vars = [var.type() for var in self.shared_inputs]
new = rebuild_collect_shared(outputs, inputs=inputs + shared_vars,
replace=dict(zip(self.shared_inputs,
shared_vars)),
copy_inputs_over=False)
(new_inputs, new_outputs,
[clone_d, update_d, update_expr, shared_inputs]) = new
assert len(new_inputs) == len(inputs) + len(self.shared_inputs)
assert len(new_outputs) == len(outputs)
assert not update_d
assert not update_expr
assert not shared_inputs
self.new_inputs = new_inputs
self.new_outputs = new_outputs
self.inputs = inputs self.inputs = inputs
self.outputs = outputs self.outputs = outputs
self.kwargs = kwargs
self.input_types = [input.type for input in inputs] self.input_types = [input.type for input in inputs]
self.output_types = [output.type for output in outputs] self.output_types = [output.type for output in outputs]
if grad_depth > 0:
output_grads = [t() for t in self.output_types]
# OpFromGraph doesn't implement a connection_pattern, so for now we regard
# all inputs and outputs as connected. This will compute the right numerical
# value for the gradients but could fail to raise the disconnected inputs error
# in some cases.
gs = G.grad(cost=None, known_grads=dict(zip(self.outputs, output_grads)),
wrt=self.inputs, disconnected_inputs='ignore')
self.grad_ops = []
for g in gs:
if g is None:
self.grad_ops.append(lambda *args: None)
else:
# It is normal if some inputs are not needed in order
# to compute the gradient, so we ignore them.
self.grad_ops.append(OpFromGraph(inputs + output_grads,
[g],
grad_depth=grad_depth - 1,
on_unused_input='ignore'))
def __eq__(self, other): def __eq__(self, other):
#TODO: recognize a copy #TODO: recognize a copy
...@@ -87,9 +113,18 @@ class OpFromGraph(gof.Op): ...@@ -87,9 +113,18 @@ class OpFromGraph(gof.Op):
raise TypeError("Wrong type, expected %s but got %s" raise TypeError("Wrong type, expected %s but got %s"
% (type, input.type)) % (type, input.type))
return gof.Apply(self, return gof.Apply(self,
inputs, list(inputs) + self.shared_inputs,
[type() for type in self.output_types]) [type() for type in self.output_types])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
ret = super(OpFromGraph, self).make_thunk(node, storage_map,
compute_map, no_recycling)
if not hasattr(self, "fn"):
self.fn = orig_function(self.new_inputs,
self.new_outputs,
**self.kwargs)
return ret
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
variables = self.fn(*inputs) variables = self.fn(*inputs)
assert len(variables) == len(outputs) assert len(variables) == len(outputs)
...@@ -99,10 +134,32 @@ class OpFromGraph(gof.Op): ...@@ -99,10 +134,32 @@ class OpFromGraph(gof.Op):
output[0] = variable.copy() output[0] = variable.copy()
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
if hasattr(self, 'grad_ops'): # OpFromGraph doesn't implement a connection_pattern, so for
return [go(*(inputs + output_grads)) for go in self.grad_ops] # now we regard all inputs and outputs as connected. This will
# compute the right numerical value for the gradients but
# could fail to raise the disconnected inputs error in some
# cases.
if hasattr(self, "grad_ops"):
grad_ops = self.grad_ops
else: else:
raise NotImplementedError gs = G.grad(cost=None,
known_grads=dict(zip(self.new_outputs, output_grads)),
wrt=self.new_inputs,
disconnected_inputs='ignore')
grad_ops = []
for g in gs:
if g is None:
grad_ops.append(lambda *args: None)
else:
# It is normal if some inputs are not needed in order
# to compute the gradient, so we ignore them.
grad_ops.append(OpFromGraph(self.new_inputs + output_grads,
[g],
on_unused_input='ignore'))
self.grad_ops = grad_ops
return [go(*(inputs + output_grads)) for go in grad_ops]
# Since OpFromGraph contains a Theano compiled function, we should let # Since OpFromGraph contains a Theano compiled function, we should let
# DebugMode know about it # DebugMode know about it
......
...@@ -1036,7 +1036,7 @@ class FunctionMaker(object): ...@@ -1036,7 +1036,7 @@ class FunctionMaker(object):
# initialize the linker # initialize the linker
if not hasattr(linker, 'accept'): if not hasattr(linker, 'accept'):
raise ValueError("'linker' parameter of FunctionFactory should be a Linker with an accept method " \ raise ValueError("'linker' parameter of FunctionMaker should be a Linker with an accept method " \
"or one of %s" % theano.compile.mode.predefined_linkers.keys()) "or one of %s" % theano.compile.mode.predefined_linkers.keys())
#the 'no_borrow' outputs are the ones for which that we can't return the internal storage pointer. #the 'no_borrow' outputs are the ones for which that we can't return the internal storage pointer.
......
import numpy import numpy
import unittest import unittest
from theano import config from theano import config, shared
from theano.compile import function from theano.compile import function
...@@ -17,7 +17,9 @@ class T_OpFromGraph(unittest.TestCase): ...@@ -17,7 +17,9 @@ class T_OpFromGraph(unittest.TestCase):
x, y, z = T.matrices('xyz') x, y, z = T.matrices('xyz')
e = x + y * z e = x + y * z
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN') op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z) - op(y, z, x) # (1+3*5=array of 16) - (3+1*5=array of 8) # (1+3*5=array of 16) - (3+1*5=array of 8)
f = op(x, y, z) - op(y, z, x)
fn = function([x, y, z], f) fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX) xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX)*3 yv = numpy.ones((2, 2), dtype=config.floatX)*3
...@@ -47,7 +49,7 @@ class T_OpFromGraph(unittest.TestCase): ...@@ -47,7 +49,7 @@ class T_OpFromGraph(unittest.TestCase):
def test_grad(self): def test_grad(self):
x, y, z = T.matrices('xyz') x, y, z = T.matrices('xyz')
e = x + y * z e = x + y * z
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN', grad_depth=2) op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z) f = op(x, y, z)
f = f - T.grad(T.sum(f), y) f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f) fn = function([x, y, z], f)
...@@ -56,6 +58,56 @@ class T_OpFromGraph(unittest.TestCase): ...@@ -56,6 +58,56 @@ class T_OpFromGraph(unittest.TestCase):
zv = numpy.ones((2, 2), dtype=config.floatX)*5 zv = numpy.ones((2, 2), dtype=config.floatX)*5
assert numpy.all(11.0 == fn(xv, yv, zv)) assert numpy.all(11.0 == fn(xv, yv, zv))
def test_grad_grad(self):
x, y, z = T.matrices('xyz')
e = x + y * z
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z)
f = f - T.grad(T.sum(f), y)
f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX)*3
zv = numpy.ones((2, 2), dtype=config.floatX)*5
assert numpy.allclose(6.0, fn(xv, yv, zv))
def test_shared(self):
x, y, z = T.matrices('xyz')
s = shared(numpy.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
# (1+3*5=array of 16) - (3+1*5=array of 8)
f = op(x, y, z) - op(y, z, x)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX)*3
zv = numpy.ones((2, 2), dtype=config.floatX)*5
#print function, function.__module__
#print fn.maker.fgraph.toposort()
assert numpy.allclose(8.0, fn(xv, yv, zv))
assert numpy.allclose(8.0, fn(xv, yv, zv))
def test_shared_grad(self):
x, y, z = T.matrices('xyz')
s = shared(numpy.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z)
f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX) * 3
zv = numpy.ones((2, 2), dtype=config.floatX) * 5
assert numpy.allclose(11.0 + s.get_value(), fn(xv, yv, zv))
# grad again the shared variable
f = op(x, y, z)
f = f - T.grad(T.sum(f), s)
fn = function([x, y, z], f)
assert numpy.allclose(15.0 + s.get_value(),
fn(xv, yv, zv))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals() return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
def set_view_base(self, x, fail): def add_to_zview(self, name, x, fail):
return """
//Set the base only now
if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(zview);
%(fail)s;
}""" % locals()
def add_to_zview(self, x, fail):
return """ return """
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview, PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
......
...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise): ...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise):
#define ga_double double #define ga_double double
#define ga_half uint16_t #define ga_half uint16_t
#include <Python.h>
#include <numpy/npy_common.h>
""" """
for npy, ga in [("npy_uint8", "ga_ubyte"),
("npy_uint16", "ga_ushort"),
("npy_uin32", "ga_uint"),
("npy_uin64", "ga_ulong"),
("npy_int8", "ga_byte"),
("npy_int16", "ga_short"),
("npy_int32", "ga_int"),
("npy_int64", "ga_long"),
("npy_float32", "ga_float"),
("npy_float64", "ga_double"),
]:
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self): def c_headers(self):
...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise): ...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise):
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim nd = node.inputs[0].type.ndim
import pycuda._cluda CLUDA_PREAMBLE = """
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);", res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);",
pycuda._cluda.CLUDA_PREAMBLE] CLUDA_PREAMBLE]
for i in range(0, nd + 1): for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';') res.append(k.contig_src + ';')
...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise): ...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise):
node.inputs + node.outputs)): node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern: if (n - len(inputs)) in self.inplace_pattern:
continue continue
dtype = var.dtype dtype = dtype_to_ctype(var.dtype)
param.append("(npy_%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals()) param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals()) param.append("%(name)s->ga.offset" % locals())
for i in range(nd): for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals()) param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
......
...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi ...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi
GpuCrossentropySoftmax1HotWithBiasDx) GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node): ...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node):
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt()
@op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased)
@register_opt() @register_opt()
@op_lifter([tensor.CAReduce, tensor.Sum]) @op_lifter([tensor.CAReduce, tensor.Sum])
def local_gpua_careduce(node): def local_gpua_careduce(node):
......
import copy
import StringIO import StringIO
import numpy import numpy
import theano import theano
from theano import tensor, gof from theano import tensor, gof
from theano.tensor.subtensor import Subtensor, get_idx_list
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
...@@ -16,6 +17,7 @@ except ImportError: ...@@ -16,6 +17,7 @@ except ImportError:
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
from theano.sandbox.gpuarray.elemwise import GpuElemwise
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor):
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (5,)
class GpuIncSubtensor(IncSubtensor):
"""
Implement IncSubtensor on the gpu.
Note: The optimization to make this inplace is in tensor/opt.
The same optimization handles IncSubtensor and GpuIncSubtensor.
This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
"""
def c_headers(self):
return self.iadd_node.op.c_headers()
def c_compiler(self):
return self.iadd_node.op.c_compiler()
def c_init_code(self):
return self.iadd_node.op.c_init_code()
def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
op = copy.copy(self)
ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
op.create_iadd_node(ret)
return ret
def create_iadd_node(self, node):
# We store a iadd_node in the op that contain the info needed
# for the inplace add.
cop = theano.tensor.inplace.add_inplace
gop = GpuElemwise(cop.scalar_op, copy.copy(cop.inplace_pattern),
"Gpu" + cop.name, cop.nfunc_spec)
y = node.inputs[1]
xview = y.type()
iadd_node = gop(xview, y).owner
self.iadd_node = iadd_node
def perform(self, node, inputs, out_):
out, = out_
x, y = inputs[:2]
indices = list(reversed(inputs[2:]))
def convert(entry):
if isinstance(entry, gof.Type):
rval = indices.pop()
return rval
elif isinstance(entry, slice):
return slice(convert(entry.start),
convert(entry.stop),
convert(entry.step))
else:
return entry
cdata = tuple(map(convert, self.idx_list))
if len(cdata) == 1:
cdata = cdata[0]
if not self.inplace:
x = x.copy()
sub_x = x.__getitem__(cdata)
if sub_x.shape:
# we've sliced out an N-D tensor with N > 0
if not self.set_instead_of_inc:
#sub_x += y
pygpu.elemwise.ielemwise2(sub_x, '+', y, broadcast=False)
else:
#sub_x += -sub_x + y
x.__setitem__(cdata, y)
else:
# scalar case
if not self.set_instead_of_inc:
#x.__setitem__(cdata, sub_x + y)
tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x, broadcast=False)
x.__setitem__(cdata, tmp)
else:
x.__setitem__(cdata, y)
out[0] = x
def __setstate__(self, d):
self.__dict__.update(d)
owner = getattr(self.__dict__, "owner", None)
if owner:
op.create_iadd_node(owner)
def __getstate__(self):
d = copy.copy(self.__dict__)
if "iadd_node" in d:
d.pop('iadd_node')
return d
def do_type_checking(self, node):
""" Should raise NotImplementedError if c_code does not support
the types involved in this node.
"""
if not isinstance(node.inputs[0].type, GpuArrayType):
raise NotImplementedError()
def copy_of_x(self, x):
"""
:param x: a string giving the name of a C variable
pointing to an array
:return: C code expression to make a copy of x
Base class uses `PyArrayObject *`, subclasses may override for
different types of arrays.
"""
return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()
def decl_view(self):
return "PyGpuArrayObject* zview = NULL;"
def make_view_array(self, x, view_ndim):
"""//TODO
:param x: a string identifying an array to be viewed
:param view_ndim: a string specifying the number of dimensions
to have in the view
This doesn't need to actually set up the view with the
right indexing; we'll do that manually later.
"""
ret = """
size_t dims[%(view_ndim)s];
for(int i=0; i<%(view_ndim)s; i++)
dims[i] = xview_dims[i];
zview = pygpu_fromgpudata(%(x)s->ga.data,
xview_offset,
%(x)s->ga.typecode,
%(view_ndim)s,
dims,
xview_strides,
pygpu_default_context(),
1,
(PyObject *)%(x)s,
(PyObject *)&PyGpuArrayType);
""" % locals()
return ret
def get_helper_c_code_args(self):
""" Return a dictionary of arguments to use with helper_c_code"""
return {'c_prefix': 'PyGpuArray',
'strides_mul': 1
}
def copy_into(self, view, source):
"""
view: string, C code expression for an array
source: string, C code expression for an array
returns a C code expression to copy source into view, and
return 0 on success
"""
return """GpuArray_move(&%(view)s->ga, &%(source)s->ga)""" % locals()
def c_support_code_apply(self, node, nodename):
gop = self.iadd_node.op
sub_name = nodename + "_add_to_zview"
ret = gop.c_support_code_apply(self.iadd_node, sub_name)
ret += """
PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL;
""" % locals()
#def c_code(self, node, name, inputs, outputs, sub):
inputs = ["dst", "src"]
outputs = ["ret"]
sub = {"fail": "return NULL;"}
ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
ret += """
return dst;
}
"""
return ret
def add_to_zview(self, nodename, x, fail):
#TODO
return """
PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
if (! add_result )
{
Py_DECREF(zview);
%(fail)s;
}
else
{
Py_DECREF(add_result);
}
""" % locals()
def c_code_cache_version(self):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (0,)
from theano.tensor.tests.test_subtensor import T_subtensor from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost) from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp ...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp
from theano import tensor from theano import tensor
class G_subtensor(T_subtensor): class G_subtensor(T_subtensor):
def shortDescription(self): def shortDescription(self):
return None return None
...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor): ...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor):
T_subtensor.__init__(self, name, T_subtensor.__init__(self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', dtype='float32',
ignore_topo=(HostFromGpu,GpuFromHost,DeepCopyOp)) ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op): ...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op):
copy_into = self.copy_into("zview", y) copy_into = self.copy_into("zview", y)
add_to_zview = self.add_to_zview(y, fail) add_to_zview = self.add_to_zview(name, y, fail)
make_modification = """ make_modification = """
if (%(op_is_set)s) if (%(op_is_set)s)
...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op): ...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op):
""" """
return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals() return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def add_to_zview(self, x, fail): def add_to_zview(self, name, x, fail):
""" Return C code to add x to zview. Should DECREF zview if the """ Return C code to add x to zview. Should DECREF zview if the
add fails.""" add fails."""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论