Commit c587ca18 authored by Frédéric Bastien

Merge pull request #1833 from abergeron/kernel_precomp

Kernel precomp
@@ -61,89 +61,146 @@ class HideC(object):
return self.c_code_cache_version()
class GpuKernelBase(object):
GpuKernelBase_version = 0
def c_kernel_code(self, node):
"""
Return the source code of the kernel.
"""
raise AttributeError("c_kernel_code", type(self))
def c_kernel_params(self, node):
"""
Return the list of typecodes for kernel parameters.
The list can contain strings ("GA_BUFFER") or direct int values.
"""
raise AttributeError("c_kernel_params", type(self))
class Kernel(object):
"""
This class groups together all the attributes of a gpu kernel.
"""
def __init__(self, code, params, name, flags,
codevar=None, binvar=None, objvar=None):
self.code = code
self.params = params
self.name = name
self.flags = flags
if codevar is None:
codevar = 'kcode_' + name
self.codevar = codevar
if binvar is None:
binvar = 'kbin_' + name
self.binvar = binvar
if objvar is None:
objvar = 'k_' + name
self.objvar = objvar
@staticmethod
def get_flags(*types):
def get_dtype(t):
if isinstance(t, (str, unicode)):
return numpy.dtype(t)
elif isinstance(t, Type):
return t.dtype
elif isinstance(t, Variable):
return t.type.dtype
else:
raise TypeError("can't get a dtype from %s" % (type(t),))
dtypes = [get_dtype(t) for t in types]
flags = dict(cluda=True)
if any(d == numpy.float64 for d in dtypes):
flags['have_double'] = True
if any(d.itemsize < 4 for d in dtypes):
flags['have_small'] = True
if any(d.kind == 'c' for d in dtypes):
flags['have_complex'] = True
if any(d == numpy.float16 for d in dtypes):
flags['have_half'] = True
return flags
def _get_c_flags(self):
res = []
if self.flags.get('cluda', False):
res.append('GA_USE_CLUDA')
if self.flags.get('have_double', False):
res.append('GA_USE_DOUBLE')
if self.flags.get('have_small', False):
res.append('GA_USE_SMALL')
if self.flags.get('have_complex', False):
res.append('GA_USE_COMPLEX')
if self.flags.get('have_half', False):
res.append('GA_USE_HALF')
return '|'.join(res)
def _get_c_types(self):
def m(t):
if t == gpuarray.GpuArray:
return "GA_BUFFER"
else:
return str(gpuarray.dtype_to_typecode(t))
return ', '.join(m(t) for t in self.params)
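As a quick usage sketch of the Kernel class above (assuming Theano's gpuarray backend and pygpu import cleanly under this diff's Python 2 era; the kernel source and all names here are illustrative, not part of this change):

```python
from theano.sandbox.gpuarray.basic_ops import Kernel

# float32 alone only requests the CLUDA dialect:
print(Kernel.get_flags('float32'))          # {'cluda': True}
# float64 adds have_double; int8 (itemsize < 4) adds have_small:
print(Kernel.get_flags('float64', 'int8'))
# The generated C identifiers default to kcode_/kbin_/k_ plus the name:
k = Kernel(code="KERNEL void f(GLOBAL_MEM float *a) { a[0] = 1.0f; }",
           name="f", params=['float32'],
           flags=Kernel.get_flags('float32'))
print(k.codevar)  # kcode_f
print(k.binvar)   # kbin_f
print(k.objvar)   # k_f
```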
def c_kernel_name(self):
"""
Return the name of the kernel in the source.
"""
raise AttributeError("c_kernel_name", type(self))
def c_kernel_flags(self, node):
class GpuKernelBase(object):
def gpu_kernels(self, node, name):
"""
Return a string representing the C flags for the kernel.
Example:
"GA_USE_CLUDA|GA_USE_DOUBLE"
self._get_kernel_flags(*dtypes) returns an appropriate string
for the result of this function.
This is the method to override. It should return an
iterable of Kernel objects that describe the kernels this op
will need (a minimal sketch follows this method).
"""
raise AttributeError("c_kernel_flags", type(self))
def c_kernel_codevar(self, name):
return 'kcode_' + name
def c_kernel_obj(self, name):
return 'k_' + name
def _get_kernel_flags(self, *dtypes):
dtypes = [numpy.dtype(d) for d in dtypes]
flags = ['GA_USE_CLUDA']
if any(d == numpy.float64 for d in dtypes):
flags.append('GA_USE_DOUBLE')
if any(d.itemsize < 4 for d in dtypes):
flags.append('GA_USE_SMALL')
return '|'.join(flags)
raise MethodNotDefined('gpu_kernels')
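The updated ops later in this diff (GpuEye, GpuCAReduceCPY, GPUA_mrg_uniform) are the real overrides; as a minimal sketch of the protocol with a hypothetical op (all names illustrative):

```python
from pygpu import gpuarray
from theano.gof import Op
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel

class GpuOnes(GpuKernelBase, Op):  # hypothetical op, for illustration only
    def gpu_kernels(self, node, name):
        # One kernel that fills a float32 buffer with ones.
        code = """KERNEL void ones(GLOBAL_MEM float *a, ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    a[i] = 1.0f;
  }
}"""
        return [Kernel(code=code, name="ones",
                       params=[gpuarray.GpuArray, gpuarray.SIZE],
                       flags=Kernel.get_flags('float32'),
                       objvar='k_ones_' + name)]
```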
def c_headers(self):
return ['gpuarray/types.h']
try:
o = super(GpuKernelBase, self).c_headers()
except MethodNotDefined:
o = []
return o + ['gpuarray/types.h']
def _generate_kernel_bin(self, k):
gk = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags)
bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin)
return ("""static const char %(bname)s[] = { %(bcode)s };""" %
dict(bname=k.binvar, bcode=bcode))
def _generate_kernel_code(self, k):
code = '\\n'.join(l for l in k.code.split('\n'))
code = code.replace('"', '\\"')
return ("""static const char *%(cname)s = "%(code)s";""" %
dict(cname=k.codevar, code=code))
def _generate_kernel_vars(self, k):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code(node)
vname = self.c_kernel_codevar(name)
kname = self.c_kernel_obj(name)
code = '\\n'.join(l for l in kcode.split('\n'))
code = code.replace('"', '\\"')
return """static const char *%(vname)s = "%(code)s";
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, code=code)
kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
return '\n'.join([bins, codes, vars])
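To make the escaping in _generate_kernel_code concrete, here is the same transformation in isolation (plain Python; the kernel source is illustrative): newlines become literal \n escapes and double quotes are escaped, so the whole kernel source fits in a single C string literal.

```python
src = 'KERNEL void k(GLOBAL_MEM float *a) {\n  a[0] = 1.0f; /* "one" */\n}'
escaped = '\\n'.join(l for l in src.split('\n'))  # embed newlines as \n
escaped = escaped.replace('"', '\\"')             # escape quotes for C
print('static const char *kcode_k = "%s";' % escaped)
# -> static const char *kcode_k = "KERNEL void k(...) {\n  ... \"one\" ... }";
```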
def c_init_code_apply(self, node, name):
types = self.c_kernel_params(node)
numargs = len(types)
kname = self.c_kernel_name()
vname = self.c_kernel_codevar(name)
oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded
error_out = ""
def _generate_kernel_init(self, k, err):
if PY3:
error_out = "NULL"
return """
int types_%(name)s[%(numargs)u] = {%(types)s};
if (GpuKernel_init(&%(oname)s, pygpu_default_context()->ops,
pygpu_default_context()->ctx, 1, &%(vname)s, NULL,
"%(kname)s", %(numargs)s, types_%(name)s, %(flags)s) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error initializing kernel");
return %(error_out)s;
}
""" % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname,
vname=vname, flags=flags, error_out=error_out, name=name)
else:
error_out = ""
return """{
int types[%(numargs)u] = {%(types)s};
const char *bcode = %(bvar)s;
size_t sz = sizeof(%(bvar)s);
PyGpuContextObject *c = pygpu_default_context();
if (GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s",
%(numargs)u, types, GA_USE_BINARY) != GA_NO_ERROR) {
if ((%(err)s = GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &%(cname)s,
NULL, "%(kname)s", %(numargs)u, types,
%(flags)s)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
%(err)s, Gpu_error(c->ops, c->ctx, %(err)s));
return %(error_out)s;
}
}
}""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
ovar=k.objvar, kname=k.name, err=err, cname=k.codevar,
flags=k._get_c_flags(), error_out=error_out)
def c_init_code_apply(self, node, name):
err = 'err_' + name
kernels = self.gpu_kernels(node, name)
inits = '\n'.join(self._generate_kernel_init(k, err) for k in kernels)
return ("int %(err)s;\n" % dict(err=err)) + inits
def _GpuKernelBase_version(self):
ctx = gpuarray.get_default_context()
return (2, ctx.kind, ctx.devname)
GpuKernelBase_version = property(_GpuKernelBase_version)
class HostFromGpu(Op):
@@ -815,23 +872,20 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self):
return hash(self.dtype) ^ hash(type(self))
def c_kernel_code(self, node):
return """
def gpu_kernels(self, node, name):
code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1;
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype))
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"]
def c_kernel_name(self):
return "k"
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype)
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name)
return [Kernel(
code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype),
objvar='k_eye_'+name,
)]
def c_code(self, node, name, inp, out, sub):
n, m = inp
@@ -839,7 +893,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj(name)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
size_t dims[2] = {0, 0};
void *args[3];
@@ -14,6 +14,7 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try:
import pygpu
from pygpu import gpuarray
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel
@@ -22,7 +23,7 @@ except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
GpuKernelBase, Kernel)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined
@@ -2406,40 +2407,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if any(redux):
return getattr(node, attr)
def c_kernel_code(self, node):
def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
params = ['float32']
else:
k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
params = ['uint32', gpuarray.GpuArray]
params.extend('uint32' for _ in range(nd))
params.append(gpuarray.GpuArray)
params.append('uint32')
params.extend('int32' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return self._get_kernel_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype)
return [Kernel(code=src, name="reduk", params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='k_reduk_'+name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
@@ -2458,7 +2448,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
node.inputs[0].ndim)
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
@@ -2588,14 +2578,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (0,)
return (0, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
@@ -26,7 +26,7 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
return GpuKernelBase.c_headers(self) + ['numpy_compat.h']
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def c_kernel_code(self, node):
def gpu_kernels(self, node, name):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
@@ -783,10 +783,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else:
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
code = """
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
GLOBAL_MEM %(otype)s *sample_data,
GLOBAL_MEM ga_int *state_data,
const ga_uint Nsamples,
const ga_uint Nstreams_used)
{
@@ -809,7 +809,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
const ga_int MASK2 = 65535; //2^16 - 1
const ga_int MULT2 = 21069;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
const ga_uint idx = GID_0 * LDIM_0 + LID_0;
ga_int y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used)
@@ -821,7 +821,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += Nstreams_used)
for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
{
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
@@ -864,14 +864,14 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"]
def c_kernel_name(self):
return "mrg_uniform"
# we shouldn't get to this line if it's about to fail
from pygpu import gpuarray
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.output_type.dtype, 'int32')
return [Kernel(code=code, name="mrg_uniform",
params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
]
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
@@ -880,7 +880,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.c_kernel_obj(nodename)
kname = self.gpu_kernels(node, nodename)[0].objvar
if self.output_type.dtype == 'float32':
otype = 'float'