Commit 818bdf4b, authored by nouiz

Merge pull request #985 from goodfeli/rebase_gpu_incsub

C code for GpuIncsub, and a new CUDA kernel for one special case
......@@ -232,8 +232,9 @@ def rebuild_collect_shared(outputs,
copy_inputs_over)
cloned_outputs.append(Out(cloned_v, borrow=v.borrow))
else:
raise TypeError('outputs must be theano Variable or '
'Out instances', v)
raise TypeError('Outputs must be theano Variable or '
'Out instances. Received ' + str(v)\
+ ' of type '+str(type(v)))
#computed_list.append(cloned_v)
else:
if isinstance(outputs, Variable):
......
......@@ -589,6 +589,10 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.outputs = node_output_storage
rval.lazy = False
return rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del rval
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
......
......@@ -2175,6 +2175,12 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp))
# C Code shared by GpuSubtensor and GpuIncSubtensor
# Wraps CudaNdarray_set_device_data in a macro that casts the data pointer
# to float*, so the shared Subtensor.helper_c_code template (which passes a
# char* "ptr") can be reused unchanged for CudaNdarray objects.
_define_set_data = """
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""
class GpuSubtensor(GpuOp, tensor.Subtensor):
"""
Implement subtensor on the gpu.
......@@ -2240,10 +2246,10 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
%(fail)s;
}
cnda_mark_dev_structure_dirty(xview);
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
""" % locals()
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
""" % locals()
get_xview = _define_set_data + \
self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
c_prefix='CudaNdarray',
set_data='CudaNdarray_set_device_data2',
......@@ -2251,6 +2257,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
set_stride='CudaNdarray_set_stride',
update_flags="", strides_mul=4)
finish_view = """
//Set the base only now
......@@ -2408,13 +2415,128 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    """
    Implement IncSubtensor on the gpu.

    Note: The optimization to make this inplace is in tensor/opt.
    The same optimization handles IncSubtensor and GpuIncSubtensor.

    This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
    The helper methods like do_type_checking, copy_of_x, etc.
    specialize the c_code for this Op.
    """

    def make_node(self, x, y, *inputs):
        """Build the Apply node with x and y typed for the GPU.

        x: the CudaNdarray variable to increment
        y: the CudaNdarray value to increment by
        inputs: the scalar index variables, as in tensor.IncSubtensor
        """
        assert isinstance(x.type, CudaNdarrayType)
        assert isinstance(y.type, CudaNdarrayType)
        x = as_cuda_ndarray_variable(x)
        y = as_cuda_ndarray_variable(y)
        # Reuse the parent's make_node to validate and canonicalize the
        # index inputs, then rebuild the Apply with the GPU-typed x and y.
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        return Apply(self, [x, y] + rval.inputs[2:], [x.type()])

    def do_type_checking(self, node):
        """Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, CudaNdarrayType):
            raise NotImplementedError()

    def copy_of_x(self, x):
        """
        x: a string giving the name of a C variable pointing to an array

        Returns C code expression to make a copy of x.

        Base class uses PyArrayObject *; this subclass uses
        CudaNdarray * instead.
        """
        return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()

    def make_view_array(self, x, view_ndim):
        """
        x: a string identifying an array to be viewed
        view_ndim: a string specifying the number of dimensions
            to have in the view

        This doesn't need to actually set up the view with the
        right indexing; we'll do that manually later.
        """
        return """CudaNdarray* zview = (CudaNdarray*)
                CudaNdarray_New(%(view_ndim)s)""" % locals()

    def get_helper_c_code_args(self):
        """Return a dictionary of arguments to use with helper_c_code."""
        # Bug fix: the original dict literal listed 'update_flags' twice;
        # duplicate keys in a dict display silently overwrite each other,
        # so only a single entry is kept here.
        return {'c_prefix': 'CudaNdarray',
                'set_data': 'CudaNdarray_set_device_data2',
                'set_dim': 'CudaNdarray_set_dim',
                'set_stride': 'CudaNdarray_set_stride',
                'update_flags': "",
                'strides_mul': 4,
                }

    def copy_into(self, view, source):
        """
        view: string, C code expression for an array
        source: string, C code expression for an array

        returns a C code expression to copy source into view, and
        return 0 on success
        """
        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()

    def define_set_data(self):
        """Return the C macro definition required by the 'set_data'
        helper argument (CudaNdarray_set_device_data2)."""
        return _define_set_data

    def link_view_array(self, x, fail):
        """Return C code that makes zview share device data with x."""
        # Bug fix: the error message previously said "GpuSubtensor",
        # which misattributed failures happening in this Op.
        return """
        if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
                                        (PyObject*) NULL))
        {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuIncSubtensor is not able to set the"
                         " devdata field of the view");
            Py_XDECREF(zview);
            %(fail)s;
        }
        cnda_mark_dev_structure_dirty(zview);
        """ % locals()

    def set_view_base(self, x, fail):
        """Return C code installing x as the base of zview, after
        helper_c_code has finished adjusting the view."""
        # Bug fix: the error message previously said "GpuSubtensor".
        return """
        //Set the base only now
        if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
                                       %(x)s)){
            PyErr_Format(PyExc_RuntimeError,
                         "GpuIncSubtensor is not able to set"
                         " the base of the view array");
            Py_XDECREF(zview);
            %(fail)s;
        }""" % locals()

    def add_to_zview(self, x, fail):
        """Return C code to add x to zview in place; DECREFs zview
        if the add fails."""
        return """
        PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
                                                        (PyObject *) py_%(x)s);

        if (! add_result )
        {
            Py_DECREF(zview);
            %(fail)s;
        }
        else
        {
            Py_DECREF(add_result);
        }
        """ % locals()

    def c_code_cache_version(self):
        """Version tuple for the generated C code, derived from the
        parent's version."""
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if parent_version:
            # Bumped from (0,) to (1,): the error messages emitted by
            # link_view_array/set_view_base changed, so previously cached
            # compiled modules must be invalidated.
            return parent_version + (1,)
        return ()
class GpuFlatten(tensor.Flatten, GpuOp):
"""
......
......@@ -14,6 +14,17 @@
//If true, we fill with NAN allocated device memory.
#define ALLOC_MEMSET 0
//If true, we print out when we free a device pointer, uninitialize a
//CudaNdarray, or allocate a device pointer
#define PRINT_FREE_MALLOC 0
//If true, we do error checking at the start of functions, to make sure there
//is not a pre-existing error when the function is called.
//You probably need to set the environment variable
//CUDA_LAUNCH_BLOCKING=1
//if you want this to work.
#define PRECHECK_ERROR 0
/////////////////////////
// Alloc and Free
/////////////////////////
......@@ -53,36 +64,53 @@ void * device_malloc(size_t size)
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
#if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error allocating %li bytes of device memory (%s). new total bytes allocated: %d\n", (long)size, cudaGetErrorString(err),_allocated_size);
#else
fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
#if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error allocating %li bytes of device memory (%s). new total bytes allocated: %d\n", (long)size, cudaGetErrorString(err),_allocated_size);
#else
fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError,
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL;
}
_outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
_alloc_size_table[i].size=size;
break;
#if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
_alloc_size_table[i].size=size;
break;
}
}
}
_allocated_size += size;
#endif
//fprintf(stderr, "allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n", (long)size, cudaGetErrorString(err),_allocated_size,rval);
_allocated_size += size;
#endif
//fprintf(stderr,
//"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval);
if(ALLOC_MEMSET){
//We init them to nan to make sure we catch more debug case.
cudaMemset(rval, 0xFF, size);
//printf("MEMSET\n");
}
#if PRINT_FREE_MALLOC
fprintf(stderr, "device malloc %p\n",rval);
#endif
return rval;
}
int device_free(void *ptr)
{
#if PRINT_FREE_MALLOC
fprintf(stderr, "device_free %p\n",ptr);
#endif
#if PRECHECK_ERROR
cudaError_t prevError = cudaGetLastError();
if (cudaSuccess != prevError)
{
fprintf(stderr, "Error existed before calling device_free.\n");
}
#endif
// if there is no gpu context, the call to cudaFree will fail; skip it entirely
if(!g_gpu_context_active) {
......@@ -101,33 +129,42 @@ int device_free(void *ptr)
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
#if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error freeing device pointer %p (%s).%d byte already allocated\n", ptr, cudaGetErrorString(err), _allocated_size);
#else
fprintf(stderr, "Error freeing device pointer %p (%s).\n", ptr, cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p (%s)", ptr, cudaGetErrorString(err));
#if COMPUTE_GPU_MEM_USED
fprintf(stderr,
"Error freeing device pointer %p (%s).%d byte already allocated\n",
ptr, cudaGetErrorString(err), _allocated_size);
#else
fprintf(stderr,
"Error freeing device pointer %p (%s).\n",
ptr,
cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError,
"error freeing device pointer %p (%s)",
ptr,
cudaGetErrorString(err));
return -1;
}
_outstanding_mallocs[0] -= (ptr != NULL);
#if COMPUTE_GPU_MEM_USED
int i=0;
size_t total_freed = 0;
for(;i<TABLE_SIZE;i++)
if(_alloc_size_table[i].ptr==ptr){
_allocated_size -= _alloc_size_table[i].size;
total_freed += _alloc_size_table[i].size;
_alloc_size_table[i].ptr=0;
_alloc_size_table[i].size=0;
#if COMPUTE_GPU_MEM_USED
int i=0;
size_t total_freed = 0;
for(;i<TABLE_SIZE;i++)
if(_alloc_size_table[i].ptr==ptr){
_allocated_size -= _alloc_size_table[i].size;
total_freed += _alloc_size_table[i].size;
_alloc_size_table[i].ptr=0;
_alloc_size_table[i].size=0;
break;
}
//if(i==TABLE_SIZE)
// printf("Unallocated unknow size!\n");
//fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
#endif
break;
}
//if(i==TABLE_SIZE)
// printf("Unallocated unknow size!\n");
//fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
#endif
return 0;
}
static PyObject *
outstanding_mallocs(PyObject* self, PyObject * args)
{
......@@ -153,14 +190,17 @@ CudaNdarray_null_init(CudaNdarray*self)
static int
CudaNdarray_uninit(CudaNdarray*self)
{
#if PRINT_FREE_MALLOC
fprintf(stderr, "CudaNdarray_uninit %p\n", self);
#endif
int rval = 0;
if (self->data_allocated) {
assert(self->devdata);
if (device_free(self->devdata))
{
fprintf(stderr,
"!!!! error freeing device memory %p (self=%p)\n",
self->devdata, self);
"CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
self, self->devdata);
rval = -1;
}
self->devdata = NULL;
......@@ -171,7 +211,7 @@ CudaNdarray_uninit(CudaNdarray*self)
if (device_free(self->dev_structure))
{
fprintf(stderr,
"!!!! error freeing dev_structure memory %p (self=%p)\n",
"CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
self->dev_structure, self);
rval = -1;
}
......@@ -1164,6 +1204,7 @@ CudaNdarray_exp(CudaNdarray* self)
return (PyObject*)rval;
}
static PyMethodDef CudaNdarray_methods[] =
{
{"__array__",
......@@ -1740,7 +1781,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
* It returns py_self on success with an additional reference. Else NULL.
*/
// Will be called by __iadd__ in Python
static PyObject *
PyObject *
CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
{
if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
......@@ -2127,7 +2168,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
}
PyObject * intobj = NULL;
if(CudaNdarray_Check(o) && PyArray_Check(value)){
if (CudaNdarray_Check(o) && PyArray_Check(value)){
if (verbose)
fprintf(stderr,
"CudaNdarray_setitem dest is a CudaNdarray and"
......@@ -2137,7 +2178,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
{
return -1;
}
if(CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
{
Py_XDECREF(new_value);
Py_XDECREF(rval);
......@@ -2214,7 +2255,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
PyObject *baseSavedForComparison = rval->base;
if(CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
{
Py_DECREF((PyObject*)rval);
Py_XDECREF(new_value);
......@@ -3190,6 +3231,28 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
}
}
// Copy a strided 4-tensor x into y. N1 through N4 are the size of y.
static __global__ void k_copy_4d(const int N1,
        const int N2, const int N3, const int N4,
        const float * x, const int sx1, const int sx2, const int sx3,
        const int sx4, float * y, const int sy1, const int sy2,
        const int sy3, const int sy4)
{
    // These must be made int instead of unsigned int due to a bug in nvcc
    const int block0 = blockIdx.x;
    const int block1 = blockIdx.y;

    // N1 and N2 are currently unused: one block handles one (dim0, dim1)
    // pair. They are kept so a future implementation can loop over the
    // first two dimensions if there are not enough blocks.
    // Hoist the per-block base offsets out of the loops.
    const float * x_plane = x + block0 * sx1 + block1 * sx2;
    float * y_plane = y + block0 * sy1 + block1 * sy2;

    for (int j = threadIdx.y; j < (int) N4; j += (int) blockDim.y)
    {
        for (int i = threadIdx.x; i < N3; i += (int) blockDim.x)
        {
            y_plane[i * sy3 + j * sy4] = x_plane[i * sx3 + j * sx4];
        }
    }
}
//copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other,
......@@ -3304,6 +3367,61 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
return -1;
}
}; break;
case 4: // 4-tensor
{
if (verbose)
{
if (0 != fprint_CudaNdarray(stderr, other))
{
Py_XDECREF(new_other);
return -1;
}
}
// The blocks implement the looping over the first two axes so
// this needs to be (N1, N2)
dim3 n_blocks( (unsigned int) CudaNdarray_HOST_DIMS(self)[0],
(unsigned int) CudaNdarray_HOST_DIMS(self)[1]);
// For the threads, just make as many as possible
dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
k_copy_4d<<<n_blocks, n_threads>>>(
// size of y
(unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
(unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
(unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
(unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
CudaNdarray_DEV_DATA(other), // x
// x strides
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3],
CudaNdarray_DEV_DATA(self), // y
// y strides
CudaNdarray_HOST_STRIDES(self)[0],
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3]
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"k_copy_4d",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}; break;
default:
{
assert (cudaSuccess == cudaGetLastError());
......@@ -4486,8 +4604,17 @@ PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
}
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"fprint_CudaNdarray was called with an uncleared error",
cudaGetErrorString(err));
return -1;
}
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: ");
......@@ -4526,6 +4653,17 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
fprintf(fd, "\n\tdev_structure not allocated\n");
}
err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"fprint_CudaNdarray",
cudaGetErrorString(err));
return -1;
}
return 0;
}
/*
......
......@@ -75,15 +75,16 @@ struct CudaNdarray
/* Type-specific fields go here. */
//GpuTensorType::VoidTensor * vt;
int nd; //the number of dimensions of the tensor
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES functions
int * host_structure; //dim0, dim1, ... stride0, stride1, ...
int data_allocated; //the number of bytes allocated for devdata
//device pointers (allocated by cudaMalloc)
mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be
//synchronized. The macro will allocate it when needed.
//dev_structure should be accessed via the functions like
//CudaNdarray_DEV_DIMS, otherwise may not be
//synchronized with host_structure. The accessor functions will allocate it when needed.
mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
};
......@@ -118,6 +119,12 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self);
*/
DllExport int cnda_structure_size(int nd);
/*
* This describes the shape of the ndarray. The array
* of dimensions is itself stored on the host.
* If you need to access the dimensions array from inside
* a kernel, use CudaNdarray_DEVICE_DIMS.
*/
DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self);
......@@ -144,7 +151,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
/****
* Set the idx'th dimension to value d.
*
* Updates the log2dim shaddow array.
* Updates the log2dim shadow array.
*
* Does not sync structure to host.
*/
......@@ -188,6 +195,10 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
*/
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
/* CudaNdarray_DEV_DIMS gives the same information as CudaNdarray_HOST_DIMS,
* but stored on the GPU. Use this pointer when it needs to be accessed
* from inside a CUDA kernel.
*/
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
......@@ -389,8 +400,21 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
* Transfer the contents of CudaNdarray `other` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
* TODO: WRITEME: what does "if necessary" mean?
* TODO: we use this to implement set/inc subtensor, where self is a view of
* the original tensor so that we write only to the subtensor. How
* do we ensure that self is not reallocated in this case?
*
* unbroadcast: if true, this means that other is broadcastable in some
* dimensions, and the result, self, is not.
* ie, if unbroadcast=false, we must do the broadcasting
* operation as part of the copy.
* e.g. suppose self and other are 2D matrices and other
* has only one row. Then we need to copy this row several
* times when copying to self.
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
......@@ -437,7 +461,12 @@ DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
PyObject * CudaNdarray_View(const CudaNdarray * self);
PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
#endif
/*
......
......@@ -904,6 +904,12 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
# This is to don't duplicate test.
class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# This prevents nose from printing method docstrings instead of method
# names when listing or running the inherited tests.
def shortDescription(self):
    # unittest/nose call this to get a human-readable test description;
    # returning None makes them fall back to the test method name.
    return None
shared = staticmethod(cuda.shared_constructor)
sub = cuda.GpuSubtensor
inc_sub = cuda.GpuIncSubtensor
......@@ -921,6 +927,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
self).__init__(name)
def test_adv_sub1_fast(self):
"""We check that the special cases of advanced indexing that
use CudaNdarrayTakeFrom are handled correctly
......
......@@ -3761,21 +3761,28 @@ class AdvancedIndexingError(TypeError):
class Subtensor(Op):
"""Return a subtensor view
This class uses a relatively complex internal representation of the inputs
to remember how the input tensor x should be sliced. The instance variable
idx_list is a list whose elements are either integers, or slices. The
integers are indexes into the inputs array, and the start/stop/step members
of each slice are also integer indexes into the inputs array (or None).
The inputs array is the tensor x, followed by scalar integer variables.
TODO: WRITEME: how are the scalar integer variables formatted?
This class uses a relatively complex internal representation of the inputs
to remember how the input tensor x should be sliced.
idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
(old docstring gives two conflicting
descriptions)
elements are either integers, theano scalars, or slices.
one element per "explicitly named dimension"
TODO: WRITEME: what is an "explicitly named dimension" ?
if integer:
indexes into the inputs array
if slice:
start/stop/step members of each slice are integer indices
into the inputs array or None
integer indices be actual integers or theano scalars
@todo: add support for advanced tensor indexing (in Subtensor_dx too).
The idx_list is a tuple similar in structure to the sort of key you might
expect in numpy's basic indexing mode. It has one element for each
explicitly named dimension. In numpy, the elements can be either integers
or slices containing integers and None. In Subtensor, each element can
additionally be a Scalar instance, and slice components can also be Scalar
instances too.
"""
e_invalid = ('The index list is longer (size %d) than the number of '
'dimensions of the tensor(namely %d). You are asking for '
......@@ -3873,6 +3880,10 @@ class Subtensor(Op):
return scal.as_scalar(a)
def make_node(self, x, *inputs):
"""
x: the tensor to take a subtensor of
inputs: a list of theano Scalars
"""
x = as_tensor_variable(x)
inputs = tuple(self.my_as_scalar(a) for a in inputs)
......@@ -4015,22 +4026,67 @@ class Subtensor(Op):
indices.append(str(entry))
return "%s{%s}" % (self.__class__.__name__, ", ".join(indices))
@staticmethod
def default_helper_c_code_args():
    """Build the default keyword arguments for helper_c_code.

    These defaults target numpy (PyArrayObject *) code; GPU
    subclasses replace individual entries to generate CudaNdarray
    code instead.
    """
    flags_code = ("PyArray_UpdateFlags(%(view_name)s,"
                  " NPY_ARRAY_C_CONTIGUOUS|"
                  "NPY_ARRAY_F_CONTIGUOUS);")
    defaults = dict()
    defaults["c_prefix"] = "PyArray"
    defaults["update_flags"] = flags_code
    defaults["set_data"] = "PyArray_set_data"
    defaults["set_dim"] = "PyArray_set_dim"
    defaults["set_stride"] = "PyArray_set_stride"
    defaults["strides_mul"] = 1
    defaults["view_name"] = "xview"
    return defaults
@staticmethod
def helper_c_code(node, name, inputs, outputs, sub, idx_list,
c_prefix="PyArray",
update_flags=("PyArray_UpdateFlags(xview,"
" NPY_ARRAY_C_CONTIGUOUS|"
"NPY_ARRAY_F_CONTIGUOUS);"),
set_data='PyArray_set_data',
set_dim='PyArray_set_dim',
set_stride='PyArray_set_stride',
strides_mul=1,
c_prefix=None,
update_flags=None,
set_data=None,
set_dim=None,
set_stride=None,
strides_mul=None,
view_name=None
):
"""The parameters c_prefix, update_flags, set_data, set_dim,
"""
The parameters c_prefix, update_flags, set_data, set_dim,
set_stride and strides_mul are there to allow reusing this
function on PyArray and CudaNdarray object.
"""
default_args = Subtensor.default_helper_c_code_args()
if update_flags is None:
update_flags = default_args['update_flags']
if set_data is None:
set_data = default_args['set_data']
if set_dim is None:
set_dim = default_args['set_dim']
if set_stride is None:
set_stride = default_args['set_stride']
if strides_mul is None:
strides_mul = default_args['strides_mul']
if c_prefix is None:
c_prefix = default_args['c_prefix']
if view_name is None:
view_name = default_args['view_name']
#update_flags may depend on view_name
update_flags = update_flags % locals()
#
# two arrays are created in C code:
# is_slice: len == ndim, 0 means int, 1 means slice
......@@ -4104,6 +4160,8 @@ class Subtensor(Op):
x, = inputs[:1]
z, = outputs
xview = view_name
rval = """
#define PyArray_set_dim(obj, idx, d) PyArray_DIMS(obj)[idx]=d
#define PyArray_set_stride(obj, idx, d) PyArray_STRIDES(obj)[idx]=d
......@@ -4119,30 +4177,30 @@ class Subtensor(Op):
int inner_ii = 0; // the current dimension of zview
int outer_ii = 0; // current dimension of z
char* ptr = (char*) %(c_prefix)s_BYTES(xview);
char* ptr = (char*) %(c_prefix)s_BYTES(%(xview)s);
if ((%(c_prefix)s_DIMS(xview) == %(c_prefix)s_DIMS(%(x)s))
if ((%(c_prefix)s_DIMS(%(xview)s) == %(c_prefix)s_DIMS(%(x)s))
&& (%(c_prefix)s_DIMS(%(x)s) != NULL))
{
PyErr_Format(PyExc_ValueError, "x and xview"
PyErr_Format(PyExc_ValueError, "x and %(xview)s"
"(with %%d dims) have the same dimensions"
" pointers: %%p and %%p",
%(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_DIMS(xview),
%(c_prefix)s_DIMS(%(xview)s),
%(c_prefix)s_DIMS(%(x)s));
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
if (%(c_prefix)s_STRIDES(xview) == %(c_prefix)s_STRIDES(%(x)s)
if (%(c_prefix)s_STRIDES(%(xview)s) == %(c_prefix)s_STRIDES(%(x)s)
&& (%(c_prefix)s_DIMS(%(x)s) != NULL))
{
PyErr_Format(PyExc_ValueError, "x and xview"
PyErr_Format(PyExc_ValueError, "x and %(xview)s"
"(with %%d dims) have the same strides"
" pointers: %%p and %%p",
%(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_STRIDES(xview),
%(c_prefix)s_STRIDES(%(xview)s),
%(c_prefix)s_STRIDES(%(x)s));
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
......@@ -4164,10 +4222,10 @@ class Subtensor(Op):
// PySlice_GetIndicesEx in python source
if (!step)
{
Py_DECREF(xview);
Py_DECREF(%(xview)s);
PyErr_Format(PyExc_ValueError,
"slice step cannot be zero");
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
......@@ -4218,8 +4276,8 @@ class Subtensor(Op):
ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
%(strides_mul)s;
%(set_dim)s(xview, inner_ii, slicelength);
%(set_stride)s(xview, inner_ii,
%(set_dim)s(%(xview)s, inner_ii, slicelength);
%(set_stride)s(%(xview)s, inner_ii,
%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step);
inner_ii += 1;
......@@ -4239,27 +4297,27 @@ class Subtensor(Op):
else
{
PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
}
else
{
PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
spec_pos += 1;
}
}
%(set_data)s(xview, ptr, (PyObject*)NULL);
assert (inner_ii <= %(c_prefix)s_NDIM(xview));
while (inner_ii < %(c_prefix)s_NDIM(xview))
%(set_data)s(%(xview)s, ptr, (PyObject*)NULL);
assert (inner_ii <= %(c_prefix)s_NDIM(%(xview)s));
while (inner_ii < %(c_prefix)s_NDIM(%(xview)s))
{
assert (outer_ii < %(c_prefix)s_NDIM(%(x)s));
%(set_dim)s(xview, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
%(set_stride)s(xview, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
%(set_dim)s(%(xview)s, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
%(set_stride)s(%(xview)s, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
inner_ii += 1;
outer_ii += 1;
}
......@@ -4454,7 +4512,6 @@ def inc_subtensor(x, y, inplace=False, set_instead_of_inc=False,
else:
raise TypeError('x must be result of a subtensor operation')
class IncSubtensor(Op):
"""Increment a subtensor.
......@@ -4524,6 +4581,11 @@ class IncSubtensor(Op):
", ".join(indices))
def make_node(self, x, y, *inputs):
"""
x: the tensor to increment
y: the value to increment by
inputs: TODO WRITEME
"""
x, y = map(as_tensor_variable, [x, y])
if y.ndim > x.ndim:
raise ValueError(("Trying to increment a %d-dimensional "
......@@ -4602,9 +4664,16 @@ class IncSubtensor(Op):
x.__setitem__(cdata, y)
out[0] = x
def c_code(self, node, name, inputs, outputs, sub): # DEBUG
if not isinstance(node.inputs[0].type, TensorType):
raise NotImplementedError()
def c_code(self, node, name, inputs, outputs, sub):
# This method delegates much of the work to helper
# methods. This method implements the main logic
# but subclasses may override the helper methods
# to change the particulars, e.g. GpuIncSubtensor
# turns the view/copy operations on numpy arrays
# into the same operations on cuda arrays.
self.do_type_checking(node)
if self.inplace: # convert bool to int
inplace = 1
......@@ -4621,12 +4690,15 @@ class IncSubtensor(Op):
view_ndim = (node.inputs[0].ndim -
numpy.sum([not isinstance(idx, slice)
for idx in self.idx_list]))
copy_of_x = self.copy_of_x(x)
copy_input_if_necessary = """
if (%(inplace)s)
{
if (%(x)s != %(z)s)
{
if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s);
Py_INCREF(%(x)s);
%(z)s = %(x)s;
}
......@@ -4634,69 +4706,75 @@ class IncSubtensor(Op):
else
{
if (%(z)s) Py_DECREF(%(z)s);
%(z)s = (PyArrayObject*)PyArray_FromAny(py_%(x)s, NULL, 0, 0,
NPY_ARRAY_ENSURECOPY, NULL);
%(z)s = %(copy_of_x)s;
}
""" % locals()
alloc_zview = self.make_view_array(z, view_ndim)
# On GPU, it takes two steps to make a view
link_zview = self.link_view_array(z, fail);
#Make a first view on the output, as we will write into it.
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
Py_INCREF(PyArray_DESCR(%(z)s));
PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
&PyArray_Type,
PyArray_DESCR(%(z)s),
%(view_ndim)s,
PyArray_DIMS(%(z)s),
PyArray_STRIDES(%(z)s),
PyArray_DATA(%(z)s),
%(z)s->flags,
NULL);
if (!xview)
%(alloc_zview)s;
if (!zview)
{
%(fail)s;
}
%(link_zview)s;
""" % locals()
# make xview actually a view of %(z)s
get_xview = Subtensor.helper_c_code(node, name,
outputs[:1] + inputs[2:],
outputs, sub, self.idx_list)
# make zview actually a view of %(z)s
helper_args = self.get_helper_c_code_args()
helper_args['view_name'] = 'zview'
get_zview = self.define_set_data() + \
Subtensor.helper_c_code(
node=node,
name=name,
inputs=outputs[:1] + inputs[2:],
outputs=outputs,
sub=sub,
idx_list=self.idx_list,
** helper_args
)
copy_into = self.copy_into("zview", y)
add_to_zview = self.add_to_zview(y, fail)
make_modification = """
if (%(op_is_set)s)
{
if (PyArray_CopyInto(xview, %(y)s)) // does broadcasting
if (%(copy_into)s) // does broadcasting
{
Py_DECREF(xview);
Py_DECREF(zview);
%(fail)s;
}
}
else
{
PyArrayObject * add_rval = (PyArrayObject*)PyNumber_InPlaceAdd(
(PyObject*)xview, py_%(y)s);
if (add_rval)
{
assert (PyArray_Check((PyObject*)add_rval));
assert (PyArray_DATA(add_rval) == PyArray_DATA(xview));
Py_DECREF(add_rval);
}
else
{
Py_DECREF(xview);
%(fail)s;
}
%(add_to_zview)s
}
""" % locals()
return (copy_input_if_necessary
+ build_view
+ "{" + get_xview + "}"
+ "{" + get_zview + "}"
+ make_modification
+ "Py_DECREF(xview);"
+ "Py_DECREF(zview);"
)
def do_type_checking(self, node):
    """Raise NotImplementedError unless c_code supports the node's types.

    The base implementation only accepts a plain TensorType first
    input; subclasses (e.g. the GPU version) override this check.
    """
    input_type = node.inputs[0].type
    if not isinstance(input_type, TensorType):
        raise NotImplementedError()
def c_code_cache_version(self):
hv = Subtensor.helper_c_code_cache_version()
if hv:
......@@ -4704,6 +4782,99 @@ class IncSubtensor(Op):
else:
return ()
def copy_of_x(self, x):
    """Return a C expression making a fresh copy of ``x``.

    x: a string giving the name of a C variable pointing to an array

    Base class uses PyArrayObject *; subclasses may override for
    different types of arrays (e.g. CudaNdarray in GpuIncSubtensor).
    """
    # Parameters of PyArray_FromAny are:
    # array
    # dtype: we pass NULL to say any dtype is acceptable, so the existing
    #        dtype will be copied
    # min_depth: we pass 0 to have this parameter ignored
    # max_depth: we pass 0 to have this parameter ignored
    # requirements: here we pass NPY_ARRAY_ENSURECOPY to force a copy
    # context: this is almost always NULL, I'm not sure what it's used for
    return """(PyArrayObject*)PyArray_FromAny(py_%(x)s, NULL, 0, 0,
        NPY_ARRAY_ENSURECOPY, NULL)""" % locals()
def make_view_array(self, x, view_ndim):
    """Return C code declaring ``zview``, a new array viewing x's data.

    x: a string identifying an array to be viewed
    view_ndim: a string specifying the number of dimensions
        to have in the view

    This doesn't need to actually set up the view with the
    right indexing; we'll do that manually later (helper_c_code
    fills in zview's dims and strides).
    """
    # The new array borrows x's descr and initially shares its dims,
    # strides and data pointer, i.e. it starts as a full view of x.
    return """Py_INCREF(PyArray_DESCR(%(x)s));
        PyArrayObject * zview =
            (PyArrayObject*)PyArray_NewFromDescr(
                &PyArray_Type,
                PyArray_DESCR(%(x)s),
                %(view_ndim)s,
                PyArray_DIMS(%(x)s),
                PyArray_STRIDES(%(x)s),
                PyArray_DATA(%(x)s),
                %(x)s->flags,
                NULL)""" % locals()
def get_helper_c_code_args(self):
    """Return the keyword arguments to pass to helper_c_code."""
    # The CPU implementation simply uses the PyArray defaults.
    return Subtensor.default_helper_c_code_args()
def copy_into(self, view, source):
    """Return a C expression copying ``source`` into ``view``.

    view: string, C code expression for an array
    source: string, C code expression for an array

    returns a C code expression to copy source into view, and
    return 0 on success; PyArray_CopyInto also performs broadcasting.
    """
    return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def define_set_data(self):
    """Return C code defining any macros used by the set_data
    argument of helper_c_code.

    Nothing is needed for numpy arrays; GPU subclasses override this.
    """
    return ''
def link_view_array(self, x, fail):
    """Return C code completing the setup of zview as a view of x.

    On the CPU, make_view_array already produced a full view, so no
    extra code is needed here; GPU subclasses override this.
    """
    return ''
def set_view_base(self, x, fail):
    """Return C code making zview a correct view of x after
    helper_c_code has finished adjusting it.

    Nothing to do on the CPU; GPU subclasses override this.
    """
    return ''
def add_to_zview(self, x, fail):
    """Return C code to add x to zview in place. Should DECREF zview
    if the add fails.
    """
    # The asserts check that the in-place add really wrote into zview's
    # buffer rather than allocating a new array.
    return """
        PyArrayObject * add_rval = (PyArrayObject*)PyNumber_InPlaceAdd(
            (PyObject*)zview, py_%(x)s);
        if (add_rval)
        {
            assert (PyArray_Check((PyObject*)add_rval));
            assert (PyArray_DATA(add_rval) == PyArray_DATA(zview));
            Py_DECREF(add_rval);
        }
        else
        {
            Py_DECREF(zview);
            %(fail)s;
        }""" % locals()
def infer_shape(self, node, shapes):
    # IncSubtensor writes into (a copy of) x, so the output shape is
    # exactly the shape of the first input.
    return [shapes[0]]
......
......@@ -1056,7 +1056,8 @@ class test_fusion(unittest.TestCase):
if gpu:
import theano.sandbox.cuda as cuda
topo_ = [x for x in topo if not isinstance(
x.op,cuda.basic_ops.GpuFromHost) and not isinstance(x.op,cuda.basic_ops.HostFromGpu)]
x.op, (cuda.basic_ops.GpuFromHost, cuda.basic_ops.HostFromGpu))]
gpu_ = [x for x in topo if isinstance(x.op,
cuda.basic_ops.GpuFromHost)]
if not len(gpu_) == len(sym_inputs):
......@@ -1067,13 +1068,16 @@ class test_fusion(unittest.TestCase):
if not len(topo_) == nb_elemwise:
fail3.append((id, topo_, nb_elemwise))
if nb_elemwise == 1:
# check that the number of input to the Composite Elemwise is ok
# when there is not variable that appear multiple time the in input
# of g
assert ((numpy.sum([not isinstance(x, theano.gof.Constant)
for x in topo_[0].inputs]) ==
len(sym_inputs)) or
len(set(g.owner.inputs)) != len(g.owner.inputs))
# if no variable appears multiple times in the
# input of g,
# check that the number of input to the Composite
# Elemwise is ok
if len(set(g.owner.inputs)) == len(g.owner.inputs):
expected_len_sym_inputs = numpy.sum(
[not isinstance(x, theano.gof.Constant)
for x in topo_[0].inputs])
assert expected_len_sym_inputs == len(sym_inputs)
if not out_dtype == out.dtype:
fail4.append((id, out_dtype, out.dtype))
......
Markdown formatting supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment