Commit f85085d0, authored by Frédéric Bastien

Merge pull request #1646 from abergeron/c_red

c_code() for GpuCAReduce (in gpuarray)
......@@ -62,6 +62,13 @@ There are less methods to define for an Op than for a Type:
Allows you to specify code that will be executed once when the
module is initialized, before anything else is executed.
.. method:: c_init_code_apply(self, node, name)
Allows you to specify code that will be executed once when the
module is initialized, before anything else is executed and is
specialized for a particular apply of an :ref:`op`. Use
`c_init_code` if the code is the same for each apply of an op.
.. method:: c_support_code()
Allows you to specify helper functions/structs that the
......
......@@ -484,6 +484,7 @@ class CLinker(link.Linker):
self.consts = []
c_support_code_apply = []
c_init_code_apply = []
symbol = {}
......@@ -630,7 +631,15 @@ class CLinker(link.Linker):
# The following will be executed if the "try" block succeeds
assert isinstance(c_support_code_apply[-1], basestring), (
str(node.op) +
" didn't returned a string for c_support_code_apply")
" didn't return a string for c_support_code_apply")
try:
c_init_code_apply.append(op.c_init_code_apply(node, name))
except utils.MethodNotDefined:
pass
else:
assert isinstance(c_init_code_apply[-1], basestring), (
str(node.op) +
" didn't return a string for c_init_code_apply")
# emit c_code
try:
......@@ -638,7 +647,7 @@ class CLinker(link.Linker):
except utils.MethodNotDefined:
raise NotImplementedError("%s cannot produce C code" % op)
assert isinstance(behavior, basestring), (
str(node.op) + " didn't returned a string for c_code")
str(node.op) + " didn't return a string for c_code")
try:
cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub)
......@@ -677,6 +686,7 @@ class CLinker(link.Linker):
self.tasks = tasks
all_info = self.inputs + self.outputs + self.orphans
self.c_support_code_apply = c_support_code_apply
self.c_init_code_apply = c_init_code_apply
if (self.init_tasks, self.tasks) != self.get_init_tasks():
print >> sys.stderr, "init_tasks\n", self.init_tasks
......@@ -1292,7 +1302,7 @@ class CLinker(link.Linker):
mod.add_function(instantiate)
for header in self.headers():
mod.add_include(header)
for init_code_block in self.init_code():
for init_code_block in self.init_code() + self.c_init_code_apply:
mod.add_init_code(init_code_block)
return mod
......
......@@ -187,6 +187,18 @@ class CLinkerObject(object):
self.__class__.__name__)
def c_init_code_apply(self, node, name):
"""
Optional: return a list of code snippets specific to the apply
to be inserted in module initialization.
:Exceptions:
- `MethodNotDefined`: the subclass does not override this method
"""
raise utils.MethodNotDefined("c_init_code_apply", type(self),
self.__class__.__name__)
class CLinkerOp(CLinkerObject):
"""
Interface definition for `Op` subclasses compiled by `CLinker`.
......
......@@ -52,6 +52,7 @@ class HideC(object):
c_compile_args = __hide
c_no_compile_args = __hide
c_init_code = __hide
c_init_code_apply = __hide
def c_code_cache_version(self):
return ()
......@@ -63,13 +64,13 @@ class HideC(object):
class GpuKernelBase(object):
GpuKernelBase_version = 0
def c_kernel_code(self):
def c_kernel_code(self, node):
"""
Return the source code of the kernel.
"""
raise AttributeError("c_kernel_code", type(self))
def c_kernel_params(self):
def c_kernel_params(self, node):
"""
Return the list of typecodes for kernel parameters.
......@@ -83,7 +84,7 @@ class GpuKernelBase(object):
"""
raise AttributeError("c_kernel_name", type(self))
def c_kernel_flags(self):
def c_kernel_flags(self, node):
"""
Return a string representing the C flags for the kernel.
......@@ -95,11 +96,11 @@ class GpuKernelBase(object):
"""
raise AttributeError("c_kernel_flags", type(self))
def c_kernel_codevar(self):
return 'kcode_' + type(self).__name__ + '_' + hex(hash(self))[2:]
def c_kernel_codevar(self, name):
return 'kcode_' + name
def c_kernel_obj(self):
return 'k_' + type(self).__name__ + '_' + hex(hash(self))[2:]
def c_kernel_obj(self, name):
return 'k_' + name
def _get_kernel_flags(self, *dtypes):
dtypes = [numpy.dtype(d) for d in dtypes]
......@@ -113,35 +114,36 @@ class GpuKernelBase(object):
def c_headers(self):
return ['compyte/types.h']
def c_support_code(self):
kcode = self.c_kernel_code()
vname = self.c_kernel_codevar()
kname = self.c_kernel_obj()
def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code(node)
vname = self.c_kernel_codevar(name)
kname = self.c_kernel_obj(name)
code = '\\n'.join(l for l in kcode.split('\n'))
code = code.replace('"', '\\"')
return """static const char *%(vname)s = "%(code)s";
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname,code=code)
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, code=code)
def c_init_code(self):
types = self.c_kernel_params()
def c_init_code_apply(self, node, name):
types = self.c_kernel_params(node)
numargs = len(types)
name = self.c_kernel_name()
vname = self.c_kernel_codevar()
kname = self.c_kernel_obj()
flags = self.c_kernel_flags()
kname = self.c_kernel_name()
vname = self.c_kernel_codevar(name)
oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded
error_out = ""
if PY3:
error_out = "NULL"
return ["""
int types[%(numargs)u] = {%(types)s};
if (GpuKernel_init(&%(kname)s, pygpu_default_context()->ops,
return """
int types_%(name)s[%(numargs)u] = {%(types)s};
if (GpuKernel_init(&%(oname)s, pygpu_default_context()->ops,
pygpu_default_context()->ctx, 1, &%(vname)s, NULL,
"%(name)s", %(numargs)s, types, %(flags)s) != GA_NO_ERROR) {
"%(kname)s", %(numargs)s, types_%(name)s, %(flags)s) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error initializing kernel");
return %(error_out)s;
}
""" % dict(types=','.join(types), numargs=numargs, kname=kname, name=name,
vname=vname, flags=flags, error_out=error_out)]
""" % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname,
vname=vname, flags=flags, error_out=error_out, name=name)
class HostFromGpu(Op):
......@@ -726,7 +728,7 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self):
return hash(self.dtype) ^ hash(type(self))
def c_kernel_code(self):
def c_kernel_code(self, node):
return """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m;
......@@ -735,13 +737,13 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype))
def c_kernel_params(self):
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"]
def c_kernel_name(self):
return "k"
def c_kernel_flags(self):
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype)
def c_code(self, node, name, inp, out, sub):
......@@ -750,7 +752,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj()
kname = self.c_kernel_obj(name)
s = """
size_t dims[2] = {0, 0};
void *args[3];
......
......@@ -11,11 +11,12 @@ try:
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel
from pygpu.gpuarray import dtype_to_typecode
from pygpu.gpuarray import dtype_to_typecode, dtype_to_ctype
except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined
......@@ -480,7 +481,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (3,)
class GpuCAReduce(HideC, CAReduceDtype):
class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
if not hasattr(scalar_op, 'identity'):
raise ValueError("No identity on scalar op")
......@@ -510,18 +511,218 @@ class GpuCAReduce(HideC, CAReduceDtype):
return Apply(res.op, [input], [otype()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
# cache the kernel object
self.get_kernel_cache(node)
return super(GpuCAReduce, self).make_thunk(node, storage_map,
compute_map, no_recycling)
def get_kernel_cache(self, node):
attr = '@cache_reduction_k'
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
redux = self.redux
if not hasattr(node, attr):
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
if any(redux):
setattr(node, attr, self.generate_kernel(node, acc_dtype,
redux))
if any(redux):
return getattr(node, attr)
def c_kernel_code(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
else:
k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
if any(redux):
node._cache_reduction_k = self.generate_kernel(node, acc_dtype,
redux)
return super(GpuCAReduce, self).make_thunk(node, storage_map,
compute_map, no_recycling)
return self._get_kernel_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype)
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# We special case the no-reduction case since the gpu
# kernel has trouble handling it.
return """
Py_XDECREF(%(out)s);
%(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
if (!%(out)s) {
%(fail)s
}
if (%(sync)d)
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
redux = self.redux
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
input = inp[0]
output = out[0]
nd_out = node.outputs[0].ndim
code = """
size_t gs = 1;
unsigned int n = 1;
unsigned int proxy_dim[%(nd_in)s];
unsigned int proxy_off;
int proxy_str[%(nd_in)s];
void *args[%(n_args)s];
PyGpuArrayObject *tmp;
int err;
""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)
if nd_out != 0:
code += """
size_t out_dims[%(nd_out)s];
int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
""" % dict(nd_out=nd_out, output=output)
j = 0
for i in range(node.inputs[0].ndim):
if not self.redux[i]:
code += """
out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
if (!need_out)
need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
""" % dict(j=j, i=i, input=input, output=output)
j += 1
code += """
if (need_out) {
%(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else:
code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) {
Py_XDECREF(%(output)s);
%(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype:
code += """
tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
%(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
else:
code += """
tmp = %(output)s;
Py_INCREF(tmp);
""" % dict(output=output)
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
# data in the proper type.
code += """
args[0] = &n;
args[1] = &tmp->ga;
""" % dict(output=output)
p = 2
for i in range(node.inputs[0].ndim):
code += """
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
p += 1
if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
code += """
args[%(p)s] = &%(input)s->ga;
proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
p += 2
for i in range(node.inputs[0].ndim):
code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
p += 1
code += """
if (gs == 0) gs = 1;
n /= gs;
err = GpuKernel_call(&%(k_var)s, 0, %(ls)s, gs, args);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
if (%(cast_out)d) {
err = GpuArray_move(&%(output)s->ga, &tmp->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce [cast]: %%s.",
GpuArray_error(&tmp->ga, err));
%(fail)s
}
} else {
Py_XDECREF(%(output)s);
%(output)s = tmp;
}
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (0,)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......@@ -533,8 +734,7 @@ class GpuCAReduce(HideC, CAReduceDtype):
return ReductionKernel(pygpu.get_default_context(), odtype,
self.scalar_op.identity, reduce_expr, redux,
arguments=[make_argument(node.inputs[0], 'a')],
init_nd=node.inputs[0].ndim
)
init_nd=node.inputs[0].ndim)
def perform(self, node, inp, out):
input, = inp
......@@ -546,7 +746,7 @@ class GpuCAReduce(HideC, CAReduceDtype):
redux = self.redux
if any(redux):
output[0] = node._cache_reduction_k(input).astype(copy=False,
output[0] = self.get_kernel_cache(node)(input).astype(copy=False,
dtype=node.outputs[0].type.dtype)
else:
output[0] = pygpu.gpuarray.array(input, copy=True,
......
......@@ -55,7 +55,12 @@ class test_GpuCAReduce(test_CAReduce):
test_nan=True)
def test_c(self):
raise SkipTest("no C code")
for dtype in self.dtypes + self.bin_dtypes:
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype)
def test_c_nan(self):
raise SkipTest("no C code")
for dtype in self.dtypes:
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype,
test_nan=True)
Markdown formatting is supported.
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment.