提交 a8b3b329 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Rework the interface to GpuKernelBase to accept a list of kernel object.

Each item will be precompiled separately and embedded into the c_code of the Op. This allows ops that need multiple kernels, or that will choose between alternatives at runtime, to use this interface. It also groups all kernel-related parameters under one object. This change also saves the source of the kernel code so that source compilation can be re-attempted in case the binary is rejected for some reason (some implementations do not support reloading from a pre-compiled kernel). There may still be more changes to how stuff works under the hood (most notably a blacklist of bad runtimes/drivers that crash when attempting to create a kernel from a binary), but the visible interface should not change anymore, so now is the time to start using it more.
上级 4e350322
......@@ -20,7 +20,7 @@ except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
GpuKernelBase, Kernel)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined
......@@ -2373,40 +2373,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if any(redux):
return getattr(node, attr)
def c_kernel_code(self, node):
def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
params = ['float32']
else:
k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
params = ['uint32', gpuarray.GpuArray]
params.extend('uint32' for _ in range(nd))
params.append(gpuarray.GpuArray)
params.append('uint32')
params.extend('int32' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return self._get_kernel_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype)
return [Kernel(code=src, name="reduk", params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objname='k_reduk_'+name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
......@@ -2555,14 +2544,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (0,)
return (0, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......
......@@ -26,7 +26,7 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
......@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
return GpuKernelBase.c_headers(self) + ['numpy_compat.h']
return super(GPUA_mrg_uniform, self) + ['numpy_compat.h']
def c_kernel_code(self, node):
def gpu_kernels(self, node):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
......@@ -783,7 +783,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else:
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
code = """
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
......@@ -864,14 +864,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"]
def c_kernel_name(self):
return "mrg_uniform"
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.output_type.dtype, 'int32')
return [Kernel(code=code, name="mrg_uniform",
params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
flags=Kernel.get_flags(self.output_type.dtype, 'int32'),
objname='k_mrg_uniform')]
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
......@@ -880,7 +877,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.c_kernel_obj(nodename)
kname = 'k_mrg_uniform'
if self.output_type.dtype == 'float32':
otype = 'float'
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论