Commit 818bdf4b authored by nouiz

Merge pull request #985 from goodfeli/rebase_gpu_incsub

C code for GpuIncsub, and a new CUDA kernel for one special case
@@ -232,8 +232,9 @@ def rebuild_collect_shared(outputs,
                                                  copy_inputs_over)
                cloned_outputs.append(Out(cloned_v, borrow=v.borrow))
            else:
-               raise TypeError('outputs must be theano Variable or '
-                               'Out instances', v)
+               raise TypeError('Outputs must be theano Variable or '
+                               'Out instances. Received ' + str(v)
+                               + ' of type ' + str(type(v)))
            #computed_list.append(cloned_v)
    else:
        if isinstance(outputs, Variable):
...
@@ -589,6 +589,10 @@ class Op(utils.object2, PureOp, CLinkerOp):
                rval.outputs = node_output_storage
                rval.lazy = False
                return rval
+               # the next line does nothing, but pyflakes is too
+               # stupid to realize the def rval below is not a
+               # redefinition unless I include this
+               del rval
            except (NotImplementedError, utils.MethodNotDefined):
                logger.debug('Falling back on perform')
...
@@ -2175,6 +2175,12 @@ class GpuReshape(tensor.Reshape, GpuOp):
        out[0] = x.reshape(tuple(shp))

+# C code shared by GpuSubtensor and GpuIncSubtensor
+_define_set_data = """
+        #define CudaNdarray_set_device_data2(obj, ptr, base) \
+            CudaNdarray_set_device_data(obj, (float *)ptr, base)
+        """
+
class GpuSubtensor(GpuOp, tensor.Subtensor):
    """
    Implement subtensor on the gpu.
@@ -2240,10 +2246,10 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
            %(fail)s;
        }
        cnda_mark_dev_structure_dirty(xview);
-        #define CudaNdarray_set_device_data2(obj, ptr, base) \
-            CudaNdarray_set_device_data(obj, (float *)ptr, base)
        """ % locals()
-        get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
+
+        get_xview = _define_set_data + \
+            self.helper_c_code(node, name, inputs, outputs, sub,
                                       self.idx_list,
                                       c_prefix='CudaNdarray',
                                       set_data='CudaNdarray_set_device_data2',
@@ -2251,6 +2257,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
                                       set_stride='CudaNdarray_set_stride',
                                       update_flags="", strides_mul=4)
+
        finish_view = """
        //Set the base only now
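The CudaNdarray_set_device_data2 wrapper exists only for the cast: the generated subtensor code computes the view's data pointer with byte arithmetic (strides_mul=4 converts float-counted strides to bytes), so the pointer it ends up holding is not a float *. A minimal sketch of the pattern, where offset_in_bytes is a hypothetical stand-in for the offset the generated code computes:

    /* Sketch only; offset_in_bytes is not a real variable in the
     * generated code, just a stand-in for the computed byte offset. */
    char * ptr = (char *) CudaNdarray_DEV_DATA(x);  /* element (0, ..., 0) */
    ptr += offset_in_bytes;                         /* byte arithmetic */
    CudaNdarray_set_device_data2(xview, ptr, (PyObject *) NULL);
    /* expands to CudaNdarray_set_device_data(xview, (float *) ptr, NULL) */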
@@ -2408,13 +2415,128 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    """
    Implement IncSubtensor on the gpu.
+
+    Note: the optimization that makes this Op inplace lives in tensor/opt;
+    the same optimization handles IncSubtensor and GpuIncSubtensor.
+
+    This Op has c_code too: it inherits tensor.IncSubtensor's c_code, and
+    the helper methods like do_type_checking, copy_of_x, etc. specialize
+    that c_code for this Op.
    """
    def make_node(self, x, y, *inputs):
-        assert isinstance(x.type, CudaNdarrayType)
-        assert isinstance(y.type, CudaNdarrayType)
+        x = as_cuda_ndarray_variable(x)
+        y = as_cuda_ndarray_variable(y)
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        return Apply(self, [x, y] + rval.inputs[2:], [x.type()])
+
+    def do_type_checking(self, node):
+        """Raise NotImplementedError if c_code does not support
+        the types involved in this node.
+        """
+        if not isinstance(node.inputs[0].type, CudaNdarrayType):
+            raise NotImplementedError()
+
+    def copy_of_x(self, x):
+        """
+        x: a string giving the name of a C variable pointing to an array.
+
+        Returns a C code expression that makes a copy of x. The base class
+        works on PyArrayObject *; this subclass overrides it for CudaNdarray.
+        """
+        return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()
+
+    def make_view_array(self, x, view_ndim):
+        """
+        x: a string identifying the array to be viewed.
+        view_ndim: a string specifying the number of dimensions
+            the view should have.
+
+        This does not need to set up the view with the right indexing;
+        that is done manually later.
+        """
+        return """CudaNdarray* zview = (CudaNdarray*)
+                CudaNdarray_New(%(view_ndim)s)""" % locals()
+
+    def get_helper_c_code_args(self):
+        """Return a dictionary of arguments to use with helper_c_code."""
+        return {'c_prefix': 'CudaNdarray',
+                'set_data': 'CudaNdarray_set_device_data2',
+                'set_dim': 'CudaNdarray_set_dim',
+                'set_stride': 'CudaNdarray_set_stride',
+                'update_flags': "",
+                'strides_mul': 4}
+
+    def copy_into(self, view, source):
+        """
+        view: string, C code expression for an array.
+        source: string, C code expression for an array.
+
+        Returns a C code expression that copies source into view and
+        returns 0 on success.
+        """
+        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
+
+    def define_set_data(self):
+        return _define_set_data
+
+    def link_view_array(self, x, fail):
+        return """
+        if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
+                                        (PyObject*) NULL))
+        {
+            PyErr_Format(PyExc_RuntimeError,
+                         "GpuIncSubtensor is not able to set the"
+                         " devdata field of the view");
+            Py_XDECREF(zview);
+            %(fail)s;
+        }
+        cnda_mark_dev_structure_dirty(zview);
+        """ % locals()
+
+    def set_view_base(self, x, fail):
+        return """
+        //Set the base only now
+        if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
+                                       %(x)s)){
+            PyErr_Format(PyExc_RuntimeError,
+                         "GpuIncSubtensor is not able to set"
+                         " the base of the view array");
+            Py_XDECREF(zview);
+            %(fail)s;
+        }""" % locals()
+
+    def add_to_zview(self, x, fail):
+        return """
+        PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
+                                                        (PyObject *) py_%(x)s);
+        if (! add_result)
+        {
+            Py_DECREF(zview);
+            %(fail)s;
+        }
+        else
+        {
+            Py_DECREF(add_result);
+        }
+        """ % locals()
+
+    def c_code_cache_version(self):
+        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
+        if parent_version:
+            return parent_version + (0,)
+        return ()

class GpuFlatten(tensor.Flatten, GpuOp):
    """
...
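Taken together, these helpers let GpuIncSubtensor reuse the C code generated by tensor.IncSubtensor. A hand-written sketch of the code shape that results (not the literal generated code; z, x and y stand for the node's output and inputs, and the error checks are elided):

    CudaNdarray * z = (CudaNdarray *) CudaNdarray_Copy(x);       /* copy_of_x */
    CudaNdarray * zview = (CudaNdarray *)
            CudaNdarray_New(view_ndim);                          /* make_view_array */
    /* helper_c_code then sets zview's dims and strides from the index list */
    CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(z),
                                (PyObject *) NULL);              /* link_view_array */
    cnda_mark_dev_structure_dirty(zview);
    CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
                                (PyObject *) z);                 /* set_view_base */
    PyObject * add_result =
            CudaNdarray_inplace_add((PyObject *) zview,
                                    (PyObject *) py_y);          /* add_to_zview */
    Py_XDECREF(add_result);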
@@ -75,15 +75,16 @@ struct CudaNdarray
    /* Type-specific fields go here. */
    //GpuTensorType::VoidTensor * vt;
    int nd; //the number of dimensions of the tensor
-   // Clients should access host_structure via the CudaNdarray_HOST_DIMS /
-   // CudaNdarray_HOST_STRIDES macros
+   // Clients should access host_structure via the CudaNdarray_HOST_DIMS /
+   // CudaNdarray_HOST_STRIDES functions
    int * host_structure; //dim0, dim1, ... stride0, stride1, ...
    int data_allocated; //the number of bytes allocated for devdata
    //device pointers (allocated by cudaMalloc)
    mutable int dev_structure_fresh;
-   //dev_structure should be accessed via macros, otherwise it may not be
-   //synchronized. The macro will allocate it when needed.
+   //dev_structure should be accessed via the functions like
+   //CudaNdarray_DEV_DIMS, otherwise it may not be synchronized with
+   //host_structure. The accessor functions will allocate it when needed.
    mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
};
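A small host-side sketch of the intended access pattern (assuming a valid CudaNdarray * a and <cstdio>; only the accessors come from this header):

    const int * dims    = CudaNdarray_HOST_DIMS(a);
    const int * strides = CudaNdarray_HOST_STRIDES(a);
    for (int i = 0; i < a->nd; ++i)
        printf("dim %d: size=%d stride=%d\n", i, dims[i], strides[i]);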
@@ -118,6 +119,12 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self);
 */
DllExport int cnda_structure_size(int nd);

+/*
+ * This describes the shape of the ndarray. The array
+ * of dimensions is itself stored on the host.
+ * If you need to access the dimensions array from inside
+ * a kernel, use CudaNdarray_DEV_DIMS.
+ */
DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self);
@@ -144,7 +151,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
/****
 * Set the idx'th dimension to value d.
 *
- * Updates the log2dim shaddow array.
+ * Updates the log2dim shadow array.
 *
 * Does not sync structure to host.
 */
@@ -188,6 +195,10 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
 */
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);

+/* CudaNdarray_DEV_DIMS gives the same information as CudaNdarray_HOST_DIMS,
+ * but stored on the GPU. Use this pointer when it needs to be accessed
+ * from inside a CUDA kernel.
+ */
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
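A sketch of the intended use, with a hypothetical kernel that multiplies the dimensions together; only CudaNdarray_DEV_DIMS and the nd field come from this header:

    __global__ void k_count_elems(int nd, const int * dims, int * out)
    {
        int n = 1;
        for (int i = 0; i < nd; ++i)
            n *= dims[i];   /* dims lives in GPU global memory */
        *out = n;
    }

    void count_elems(const CudaNdarray * a, int * dev_out)
    {
        /* DEV_DIMS returns a device pointer, so it can be passed straight
         * to a kernel; CudaNdarray_HOST_DIMS cannot be dereferenced there. */
        k_count_elems<<<1, 1>>>(a->nd, CudaNdarray_DEV_DIMS(a), dev_out);
    }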
@@ -389,8 +400,21 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
 * Transfer the contents of CudaNdarray `other` to `self`.
 *
 * self is reallocated to have the correct dimensions if necessary.
+ * TODO: WRITEME: what does "if necessary" mean?
+ * TODO: we use this to implement set/inc subtensor, where self is a view of
+ *       the original tensor so that we write only to the subtensor. How
+ *       do we ensure that self is not reallocated in this case?
+ *
+ * unbroadcast: if true, this means that other is broadcastable in some
+ *       dimensions while the result, self, is not; i.e., with
+ *       unbroadcast=true we must do the broadcasting operation as part
+ *       of the copy. E.g., suppose self and other are 2D matrices and
+ *       other has only one row; then we need to copy that row several
+ *       times when copying to self.
 */
-DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
+DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
+        const CudaNdarray * other, bool unbroadcast = false);
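A short usage sketch of the broadcasting case described above (hypothetical shapes; error handling reduced to the return code):

    /* self is 5x3, other is 1x3: with unbroadcast=true the single row of
     * `other` is copied into each of the 5 rows of `self`; with
     * unbroadcast=false the shape mismatch would be rejected. */
    if (CudaNdarray_CopyFromCudaNdarray(self, other, true))
    {
        /* a non-zero return signals failure */
    }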
/**
 * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
@@ -437,7 +461,12 @@ DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);

-static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
+int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
+
+PyObject * CudaNdarray_View(const CudaNdarray * self);
+
+PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);

#endif

/*
...
@@ -904,6 +904,12 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
    # This reuses the parent tests instead of duplicating them.

class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
+    # This prevents nose from printing method docstrings instead of method
+    # names.
+    def shortDescription(self):
+        return None
+
    shared = staticmethod(cuda.shared_constructor)
    sub = cuda.GpuSubtensor
    inc_sub = cuda.GpuIncSubtensor
@@ -921,6 +927,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
                     self).__init__(name)

    def test_adv_sub1_fast(self):
        """We check that the special cases of advanced indexing that
        use CudaNdarray_TakeFrom are handled correctly
...
[Diff collapsed.]
@@ -1056,7 +1056,8 @@ class test_fusion(unittest.TestCase):
        if gpu:
            import theano.sandbox.cuda as cuda
            topo_ = [x for x in topo if not isinstance(
-                x.op,cuda.basic_ops.GpuFromHost) and not isinstance(x.op,cuda.basic_ops.HostFromGpu)]
+                x.op, (cuda.basic_ops.GpuFromHost, cuda.basic_ops.HostFromGpu))]
            gpu_ = [x for x in topo if isinstance(x.op,
                                                  cuda.basic_ops.GpuFromHost)]
            if not len(gpu_) == len(sym_inputs):
@@ -1067,13 +1068,16 @@ class test_fusion(unittest.TestCase):
            if not len(topo_) == nb_elemwise:
                fail3.append((id, topo_, nb_elemwise))
            if nb_elemwise == 1:
-                # check that the number of input to the Composite Elemwise is ok
-                # when there is not variable that appear multiple time the in input
-                # of g
-                assert ((numpy.sum([not isinstance(x, theano.gof.Constant)
-                                    for x in topo_[0].inputs]) ==
-                         len(sym_inputs)) or
-                        len(set(g.owner.inputs)) != len(g.owner.inputs))
+                # If no variable appears multiple times in the inputs of g,
+                # check that the number of inputs to the Composite Elemwise
+                # is what we expect.
+                if len(set(g.owner.inputs)) == len(g.owner.inputs):
+                    expected_len_sym_inputs = numpy.sum(
+                        [not isinstance(x, theano.gof.Constant)
+                         for x in topo_[0].inputs])
+                    assert expected_len_sym_inputs == len(sym_inputs)
            if not out_dtype == out.dtype:
                fail4.append((id, out_dtype, out.dtype))
...