提交 e05e801f authored 作者: abergeron's avatar abergeron

Merge pull request #1664 from nouiz/gpu_inc_sub

GpuIncSubtensor
...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals() return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
def set_view_base(self, x, fail): def add_to_zview(self, name, x, fail):
return """
//Set the base only now
if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(zview);
%(fail)s;
}""" % locals()
def add_to_zview(self, x, fail):
return """ return """
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview, PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
......
...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise): ...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise):
#define ga_double double #define ga_double double
#define ga_half uint16_t #define ga_half uint16_t
#include <Python.h>
#include <numpy/npy_common.h>
""" """
# Map NumPy dtype names appearing in the kernel source to libgpuarray's
# ga_* typedefs so the generated code compiles against the GPU headers.
# BUG FIX: "npy_uin32"/"npy_uin64" were typos (missing 't'), so the
# unsigned 32/64-bit integer types were never substituted and the
# kernel source kept unresolved npy_* names for those dtypes.
for npy, ga in [("npy_uint8", "ga_ubyte"),
                ("npy_uint16", "ga_ushort"),
                ("npy_uint32", "ga_uint"),
                ("npy_uint64", "ga_ulong"),
                ("npy_int8", "ga_byte"),
                ("npy_int16", "ga_short"),
                ("npy_int32", "ga_int"),
                ("npy_int64", "ga_long"),
                ("npy_float32", "ga_float"),
                ("npy_float64", "ga_double"),
                ]:
    kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self): def c_headers(self):
...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise): ...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise):
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim nd = node.inputs[0].type.ndim
import pycuda._cluda CLUDA_PREAMBLE = """
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);", res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);",
pycuda._cluda.CLUDA_PREAMBLE] CLUDA_PREAMBLE]
for i in range(0, nd + 1): for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';') res.append(k.contig_src + ';')
...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise): ...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise):
node.inputs + node.outputs)): node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern: if (n - len(inputs)) in self.inplace_pattern:
continue continue
dtype = var.dtype dtype = dtype_to_ctype(var.dtype)
param.append("(npy_%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals()) param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals()) param.append("%(name)s->ga.offset" % locals())
for i in range(nd): for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals()) param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
......
...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi ...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi
GpuCrossentropySoftmax1HotWithBiasDx) GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node): ...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node):
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt()
@op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node):
    """Lift a host IncSubtensor Op to its GPU counterpart.

    Rebuilds a GpuIncSubtensor carrying over the original op's index
    list and inplace/set/alias-tolerance configuration unchanged.
    """
    host_op = node.op
    return GpuIncSubtensor(host_op.idx_list,
                           host_op.inplace,
                           host_op.set_instead_of_inc,
                           host_op.destroyhandler_tolerate_aliased)
@register_opt() @register_opt()
@op_lifter([tensor.CAReduce, tensor.Sum]) @op_lifter([tensor.CAReduce, tensor.Sum])
def local_gpua_careduce(node): def local_gpua_careduce(node):
......
import copy
import StringIO import StringIO
import numpy import numpy
import theano import theano
from theano import tensor, gof from theano import tensor, gof
from theano.tensor.subtensor import Subtensor, get_idx_list
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
...@@ -16,6 +17,7 @@ except ImportError: ...@@ -16,6 +17,7 @@ except ImportError:
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
from theano.sandbox.gpuarray.elemwise import GpuElemwise
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor):
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (5,)
class GpuIncSubtensor(IncSubtensor):
    """Implement IncSubtensor on the gpu.

    Note: The optimization to make this inplace is in tensor/opt.
    The same optimization handles IncSubtensor and GpuIncSubtensor.
    This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
    The helper methods like do_type_checking, copy_of_x, etc. specialize
    the c_code for this Op.

    The inplace-add part of the generated C code is delegated to an
    internal GpuElemwise Apply node stored as ``self.iadd_node`` (built
    by `create_iadd_node`), so several c_* methods forward to that op.
    """
    def c_headers(self):
        # Forward to the embedded GpuElemwise op: the generated C code
        # includes its kernel, so it needs the same headers.
        return self.iadd_node.op.c_headers()

    def c_compiler(self):
        return self.iadd_node.op.c_compiler()

    def c_init_code(self):
        return self.iadd_node.op.c_init_code()

    def make_node(self, x, y, *inputs):
        """Build the Apply node, moving x and y to GPU variables.

        A copy of the op is stored on the Apply so that the per-node
        iadd_node (which depends on y's type) does not leak between
        graphs.
        """
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        op = copy.copy(self)
        ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
        op.create_iadd_node(ret)
        return ret

    def create_iadd_node(self, node):
        """Build and cache the GpuElemwise add_inplace Apply node.

        We store an iadd_node in the op that contains the info needed
        for the inplace add; its C code is reused in
        `c_support_code_apply`.
        """
        cop = theano.tensor.inplace.add_inplace
        gop = GpuElemwise(cop.scalar_op, copy.copy(cop.inplace_pattern),
                          "Gpu" + cop.name, cop.nfunc_spec)
        y = node.inputs[1]
        # The view of x that will be incremented has y's type
        # (same dtype/broadcastable pattern).
        xview = y.type()
        iadd_node = gop(xview, y).owner
        self.iadd_node = iadd_node

    def perform(self, node, inputs, out_):
        """Python fallback: x[idx] += y (or = y) on pygpu arrays.

        inputs is [x, y, *index_scalars]; the scalar inputs fill the
        Type placeholders of self.idx_list in order.
        """
        out, = out_
        x, y = inputs[:2]
        # Reversed so convert() can pop the next index in O(1).
        indices = list(reversed(inputs[2:]))

        def convert(entry):
            # idx_list entries are constants, slices, or Type markers;
            # Type markers are replaced by the runtime index inputs.
            if isinstance(entry, gof.Type):
                rval = indices.pop()
                return rval
            elif isinstance(entry, slice):
                return slice(convert(entry.start),
                             convert(entry.stop),
                             convert(entry.step))
            else:
                return entry

        cdata = tuple(map(convert, self.idx_list))
        if len(cdata) == 1:
            cdata = cdata[0]
        if not self.inplace:
            x = x.copy()
        sub_x = x.__getitem__(cdata)
        if sub_x.shape:
            # we've sliced out an N-D tensor with N > 0
            if not self.set_instead_of_inc:
                # sub_x += y
                pygpu.elemwise.ielemwise2(sub_x, '+', y, broadcast=False)
            else:
                # sub_x[...] = y
                x.__setitem__(cdata, y)
        else:
            # scalar case
            if not self.set_instead_of_inc:
                # x[cdata] = sub_x + y
                tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x,
                                               broadcast=False)
                x.__setitem__(cdata, tmp)
            else:
                x.__setitem__(cdata, y)
        out[0] = x

    def __setstate__(self, d):
        """Restore state and rebuild iadd_node (dropped by __getstate__).

        BUG FIX: the original used ``getattr(self.__dict__, "owner",
        None)`` — an attribute lookup on a dict, which always returns
        None — and then called the undefined name ``op``, so the
        rebuild could never run (and would NameError if it did).
        """
        self.__dict__.update(d)
        # NOTE(review): "owner" is expected to be the Apply node this
        # op copy belongs to; nothing in this file is seen to set it —
        # confirm against the pickling path.
        owner = self.__dict__.get("owner", None)
        if owner:
            self.create_iadd_node(owner)

    def __getstate__(self):
        # iadd_node holds graph objects that should not be pickled;
        # it is rebuilt lazily in __setstate__.
        d = copy.copy(self.__dict__)
        if "iadd_node" in d:
            d.pop('iadd_node')
        return d

    def do_type_checking(self, node):
        """ Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, GpuArrayType):
            raise NotImplementedError()

    def copy_of_x(self, x):
        """
        :param x: a string giving the name of a C variable
            pointing to an array

        :return: C code expression to make a copy of x

        Base class uses `PyArrayObject *`, subclasses may override for
        different types of arrays.
        """
        return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()

    def decl_view(self):
        # C declaration for the view variable used by helper_c_code.
        return "PyGpuArrayObject* zview = NULL;"

    def make_view_array(self, x, view_ndim):
        """Return C code building a view of x as a PyGpuArrayObject.

        :param x: a string identifying an array to be viewed
        :param view_ndim: a string specifying the number of dimensions
            to have in the view

        This doesn't need to actually set up the view with the
        right indexing; we'll do that manually later.  It relies on
        xview_dims/xview_strides/xview_offset computed by the caller's
        generated code.
        """
        ret = """
        size_t dims[%(view_ndim)s];
        for(int i=0; i<%(view_ndim)s; i++)
            dims[i] = xview_dims[i];
        zview = pygpu_fromgpudata(%(x)s->ga.data,
                                  xview_offset,
                                  %(x)s->ga.typecode,
                                  %(view_ndim)s,
                                  dims,
                                  xview_strides,
                                  pygpu_default_context(),
                                  1,
                                  (PyObject *)%(x)s,
                                  (PyObject *)&PyGpuArrayType);
        """ % locals()
        return ret

    def get_helper_c_code_args(self):
        """ Return a dictionary of arguments to use with helper_c_code"""
        return {'c_prefix': 'PyGpuArray',
                'strides_mul': 1
                }

    def copy_into(self, view, source):
        """
        view: string, C code expression for an array
        source: string, C code expression for an array

        returns a C code expression to copy source into view, and
        return 0 on success
        """
        return """GpuArray_move(&%(view)s->ga, &%(source)s->ga)""" % locals()

    def c_support_code_apply(self, node, nodename):
        """Emit the GpuElemwise support code plus a helper C function
        ``inc_sub_iadd_<nodename>`` doing dst += src via the embedded
        add_inplace node's c_code."""
        gop = self.iadd_node.op
        sub_name = nodename + "_add_to_zview"
        ret = gop.c_support_code_apply(self.iadd_node, sub_name)
        ret += """
        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
                                                    PyGpuArrayObject* src){
            PyGpuArrayObject* ret = NULL;
        """ % locals()
        # Splice in the elemwise c_code with our local variable names;
        # failure inside it returns NULL to the caller.
        inputs = ["dst", "src"]
        outputs = ["ret"]
        sub = {"fail": "return NULL;"}
        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
        ret += """
            return dst;
        }
        """
        return ret

    def add_to_zview(self, nodename, x, fail):
        """Return C code adding x to zview via the generated helper;
        DECREFs zview on failure as the base class requires."""
        return """
        PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);

        if (! add_result )
        {
            Py_DECREF(zview);
            %(fail)s;
        }
        else
        {
            Py_DECREF(add_result);
        }
        """ % locals()

    def c_code_cache_version(self):
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        # BUG FIX: c_code_cache_version lives on the op, not on the
        # Apply node; the original called it on self.iadd_node (an
        # Apply), which would raise AttributeError.
        elemwise_version = self.iadd_node.op.c_code_cache_version()
        if not parent_version or not elemwise_version:
            return
        return parent_version + elemwise_version + (0,)
from theano.tensor.tests.test_subtensor import T_subtensor from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost) from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp ...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp
from theano import tensor from theano import tensor
class G_subtensor(T_subtensor): class G_subtensor(T_subtensor):
def shortDescription(self): def shortDescription(self):
return None return None
...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor): ...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor):
T_subtensor.__init__(self, name, T_subtensor.__init__(self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', dtype='float32',
ignore_topo=(HostFromGpu,GpuFromHost,DeepCopyOp)) ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op): ...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op):
copy_into = self.copy_into("zview", y) copy_into = self.copy_into("zview", y)
add_to_zview = self.add_to_zview(y, fail) add_to_zview = self.add_to_zview(name, y, fail)
make_modification = """ make_modification = """
if (%(op_is_set)s) if (%(op_is_set)s)
...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op): ...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op):
""" """
return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals() return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def add_to_zview(self, x, fail): def add_to_zview(self, name, x, fail):
""" Return C code to add x to zview. Should DECREF zview if the """ Return C code to add x to zview. Should DECREF zview if the
add fails.""" add fails."""
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论