Commit bd9e0243, authored by Arnaud Bergeron

Change GPUA_mrg over to use GpuKernelBase. (I couldn't get it to work otherwise).

Parent commit: c7c2a019
......@@ -25,6 +25,9 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
def matVecModM(A, s, m):
# return (A * s) % m
......@@ -608,7 +611,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
return (7,)
class GPUA_mrg_uniform(mrg_uniform_base):
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
#GpuArray version
@classmethod
......@@ -620,12 +623,9 @@ class GPUA_mrg_uniform(mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
return ["<compyte/ext_cuda.h>"]
return GpuKernelBase.c_headers(self) + ['numpy_compat.h']
def c_init_code(self):
return ["setup_ext_cuda();"]
def c_support_code_apply(self, node, nodename):
def c_kernel_code(self, node):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
......@@ -635,32 +635,28 @@ class GPUA_mrg_uniform(mrg_uniform_base):
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
static int %(nodename)s_printed_warning = 0;
static __global__ void %(nodename)s_mrg_uniform(
%(otype)s*sample_data,
npy_int32*state_data,
const int Nsamples,
const int Nstreams_used)
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
const ga_uint Nsamples,
const ga_uint Nstreams_used)
{
const npy_int32 i0 = 0;
const npy_int32 i7 = 7;
const npy_int32 i9 = 9;
const npy_int32 i15 = 15;
const npy_int32 i16 = 16;
const npy_int32 i22 = 22;
const npy_int32 i24 = 24;
const npy_int32 M1 = 2147483647; //2^31 - 1
const npy_int32 M2 = 2147462579; //2^31 - 21069
const npy_int32 MASK12 = 511; //2^9 - 1
const npy_int32 MASK13 = 16777215; //2^24 - 1
const npy_int32 MASK2 = 65535; //2^16 - 1
const npy_int32 MULT2 = 21069;
const ga_int i7 = 7;
const ga_int i9 = 9;
const ga_int i15 = 15;
const ga_int i16 = 16;
const ga_int i22 = 22;
const ga_int i24 = 24;
const ga_int M1 = 2147483647; //2^31 - 1
const ga_int M2 = 2147462579; //2^31 - 21069
const ga_int MASK12 = 511; //2^9 - 1
const ga_int MASK13 = 16777215; //2^24 - 1
const ga_int MASK2 = 65535; //2^16 - 1
const ga_int MULT2 = 21069;
const unsigned int numThreads = blockDim.x * gridDim.x;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;
ga_int y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used)
{
......@@ -714,6 +710,15 @@ class GPUA_mrg_uniform(mrg_uniform_base):
""" % locals()
def c_kernel_params(self, node):
    """Return the libgpuarray typecodes for the kernel's arguments.

    The mrg_uniform kernel takes, in order: the output sample buffer,
    the RNG state buffer, then two unsigned-int scalars (number of
    samples requested and number of streams used in this call).
    """
    # Two device buffers followed by two unsigned integer scalars.
    buffer_params = ["GA_BUFFER"] * 2  # sample_data, state_data
    scalar_params = ["GA_UINT"] * 2    # Nsamples, Nstreams_used
    return buffer_params + scalar_params
def c_kernel_name(self):
    """Return the name of the GPU kernel emitted by ``c_kernel_code``.

    Must match the ``KERNEL void mrg_uniform(...)`` declaration in the
    generated kernel source so GpuKernelBase can locate it.
    """
    kernel_name = "mrg_uniform"
    return kernel_name
def c_kernel_flags(self, node):
    """Return compilation flags for the kernel.

    Delegates to ``_get_kernel_flags`` (provided by GpuKernelBase) with
    the dtypes the kernel touches: the output sample dtype and the
    'int32' dtype of the RNG state buffer.
    """
    kernel_dtypes = (self.output_type.dtype, 'int32')
    return self._get_kernel_flags(*kernel_dtypes)
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
o_rstate, o_sample = out
......@@ -721,18 +726,21 @@ class GPUA_mrg_uniform(mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.c_kernel_obj(nodename)
if self.output_type.dtype == 'float32':
otype = 'float'
otypecode = 'GA_FLOAT'
else:
otype = 'double'
otypecode = 'GA_DOUBLE'
return """
//////// <code generated by mrg_uniform>
size_t odims[%(ndim)s];
unsigned int n_elements = 1;
unsigned int n_streams, n_streams_used_in_this_call;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check(py_%(o_sample)s)
|| !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
......@@ -745,7 +753,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
}
if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
%(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s
}
......@@ -756,7 +764,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
odims[i] = ((npy_int32 *)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
......@@ -764,7 +772,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
%(o_sample)s = pygpu_empty(%(ndim)s, odims, GA_FLOAT, GA_C_ORDER,
%(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
if(!%(o_sample)s)
{
......@@ -785,7 +793,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
}
else
{
%(o_rstate)s = pygpu_copy(%(rstate)s);
%(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
}
if (PyGpuArray_NDIM(%(o_rstate)s) != 1)
......@@ -799,44 +807,28 @@ class GPUA_mrg_uniform(mrg_uniform_base):
%(fail)s;
}
n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0]/6;
n_streams_used_in_this_call = std::min(n_streams, n_elements);
if (n_streams > n_elements)
n_streams = n_elements;
{
unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
if (threads_per_block * n_blocks < n_streams)
{
if (! %(nodename)s_printed_warning)
fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
%(nodename)s_printed_warning = 1;
}
cuda_enter(pygpu_default_context()->ctx);
%(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
cuda_get_ptr(%(o_sample)s),
cuda_get_ptr(%(o_rstate)s),
n_elements, n_streams_used_in_this_call);
/* We need the full sync since we just modified libgpu
objects without informing it */
cudaDeviceSynchronize();
void *args[4];
args[0] = &%(o_sample)s->ga;
args[1] = &%(o_rstate)s->ga;
args[2] = &n_elements;
args[3] = &n_streams;
int err = GpuKernel_call(&%(kname)s, n_elements, 0, 0, args);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_call: %%s\\n",
GpuKernel_error(&%(kname)s, err));
%(fail)s
}
}
cudaError_t err = cudaGetLastError();
cuda_exit(pygpu_default_context()->ctx);
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
%(fail)s;
}
//////// </ code generated by mrg_uniform>
""" % locals()
def c_code_cache_version(self):
return (0,)
return (1, self.GpuKernelBase_version)
def guess_n_streams(size, warn=True):
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment