Commit ed9c389c authored by Arnaud Bergeron

Make the GpuCrossentropySoftmaxArgmax1HotWithBias c_code more readable.

Parent: dfe973bc
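The main readability win is in output allocation. Previously each of the three outputs (nll, sm, am) was prepared by a hand-rolled block: a NULL/shape check, Py_XDECREF, pygpu_empty, and a per-output error message. The new code replaces each block with a single theano_prep_output call from gpuarray_helper.h, which is why c_headers gains that header and the new c_header_dirs points the compiler at the module's own directory via os.path.dirname(__file__). A minimal sketch of the before/after pattern, with the %(...)s template substitutions written as plain C names (nll, y_idx, x, ctx, typecode_x are stand-ins, so this fragment is illustrative rather than compilable on its own):

    /* Before: one verbose (re)allocation block per output. */
    if (NULL == nll ||
        PyGpuArray_DIMS(nll)[0] != PyGpuArray_DIMS(y_idx)[0])
    {
        Py_XDECREF(nll);
        nll = pygpu_empty(1, PyGpuArray_DIMS(y_idx),
                          typecode_x, GA_C_ORDER, ctx, Py_None);
        if (!nll) {
            /* fail */
        }
    }

    /* After: the helper performs the same check/decref/allocate
       dance and returns non-zero on failure. */
    if (theano_prep_output(&nll, 1, PyGpuArray_DIMS(y_idx),
                           x->ga.typecode, GA_C_ORDER, ctx))
    {
        /* fail */
    }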
 from __future__ import absolute_import, print_function, division
+import os
 import numpy
 from theano import Op, Apply, config
@@ -45,7 +46,10 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         return node.inputs[0].type.context

     def c_headers(self):
-        return ['<numpy_compat.h>', '<gpuarray/types.h>']
+        return ['<numpy_compat.h>', '<gpuarray/types.h>', 'gpuarray_helper.h']
+
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]

     def gpu_kernels(self, node, nodename):
         dtype_x = node.inputs[0].dtype
@@ -191,9 +195,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
     def c_code(self, node, nodename, inp, out, sub):
         if node.inputs[0].type.context.kind != b'cuda':
             raise NotImplementedError('cuda only')
-        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
-        typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
-        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
         itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
         worksize_x = numpy.dtype(work_dtype(node.inputs[0].dtype)).itemsize
         itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
@@ -203,13 +204,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
         x, b, y_idx = inp
         nll, sm, am = out
-        dtype_x = node.inputs[0].dtype
-        dtype_b = node.inputs[1].dtype
-        dtype_y_idx = node.inputs[2].dtype
-        dtype_nll = node.outputs[0].dtype
-        dtype_sm = node.outputs[1].dtype
-        dtype_am = node.outputs[2].dtype
-        classname = self.__class__.__name__
         fail = sub['fail']
         ctx = sub['params']
         k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
@@ -229,21 +223,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         """ % locals()
         sio = StringIO()
         print("""
-        if (PyGpuArray_NDIM(%(y_idx)s) != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
-            %(fail)s;
-        }
-        if (PyGpuArray_NDIM(%(x)s) != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
-            %(fail)s;
-        }
-        if (PyGpuArray_NDIM(%(b)s) != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
-            %(fail)s;
-        }
         if (PyGpuArray_DIMS(%(x)s)[0] !=
             PyGpuArray_DIMS(%(y_idx)s)[0])
         {
@@ -257,82 +236,32 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                 "dimension mismatch in x,b arguments");
             %(fail)s;
         }
-        if ((NULL == %(nll)s) //initial condition
-            || (PyGpuArray_DIMS(%(nll)s)[0] !=
-                PyGpuArray_DIMS(%(y_idx)s)[0]))
-        {
-            Py_XDECREF(%(nll)s);
-            %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                  %(typecode_x)s, GA_C_ORDER, %(ctx)s,
-                                  Py_None);
-            if (!%(nll)s) {
-                %(fail)s
-            }
-        }
-        if ((NULL == %(sm)s)
-            || (PyGpuArray_DIMS(%(sm)s)[0] !=
-                PyGpuArray_DIMS(%(x)s)[0])
-            || (PyGpuArray_DIMS(%(sm)s)[1] !=
-                PyGpuArray_DIMS(%(x)s)[1]))
-        {
-            Py_XDECREF(%(sm)s);
-            %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                 %(typecode_b)s, GA_C_ORDER,
-                                 %(ctx)s, Py_None);
-            if(!%(sm)s)
-            {
-                PyErr_SetString(PyExc_MemoryError,
-                                "failed to alloc sm output");
-                // no need to decref cnda_nll, the cleanup code should do it up
-                %(fail)s;
-            }
-        }
-        if ((NULL == %(am)s)
-            || (PyGpuArray_DIMS(%(am)s)[0] !=
-                PyGpuArray_DIMS(%(y_idx)s)[0]))
-        {
-            Py_XDECREF(%(am)s);
-            %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                 %(typecode_y_idx)s, GA_C_ORDER,
-                                 %(ctx)s, Py_None);
-            if(!%(am)s)
-            {
-                PyErr_SetString(PyExc_MemoryError,
-                                "failed to alloc am output");
-                // no need to decref nll and sm,
-                // the cleanup code should do it up
-                %(fail)s;
-            }
-        }
+        if (theano_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
+        if (theano_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
+        if (theano_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
         {
             size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
             size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
             size_t n_shared = n_threads * %(worksize_x)s;
-            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
-            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
-            ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
-            ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
-            ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
-            ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
-            ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
-            ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
             //TODO: launch more threads per row and do parallel sum and max reductions
-            void *kernel_params[] = {
-                (void *)&PyGpuArray_DIMS(%(x)s)[0],
-                (void *)&PyGpuArray_DIMS(%(x)s)[1],
-                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
-                (void *)&stride_X0, (void *)&stride_X1,
-                (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
-                (void *)&stride_B0,
-                (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
-                (void *)&stride_YIDX0,
-                (void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
-                (void *)&stride_NLL0,
-                (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
-                (void *)&stride_SM0, (void *)&stride_SM1,
-                (void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
-                (void *)&stride_AM0};
-            int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
+            int err = k_xent_sm_1hot_bias_call(
+                1, &n_blocks, &n_threads, n_shared,
+                PyGpuArray_DIMS(%(x)s)[0],
+                PyGpuArray_DIMS(%(x)s)[1],
+                %(x)s->ga.data, %(x)s->ga.offset,
+                PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
+                PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
+                %(b)s->ga.data, %(b)s->ga.offset,
+                PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
+                %(y_idx)s->ga.data, %(y_idx)s->ga.offset,
+                PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
+                %(nll)s->ga.data, %(nll)s->ga.offset,
+                PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
+                %(sm)s->ga.data, %(sm)s->ga.offset,
+                PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
+                PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
+                %(am)s->ga.data, %(am)s->ga.offset,
+                PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
             %(err_check)s
             %(sync)s
         }
@@ -340,7 +269,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
         return sio.getvalue()

     def c_code_cache_version(self):
-        return (10,)
+        return (12,)

 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
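The kernel launch is simplified the same way. The old code packed every dimension, data pointer, offset, and stride into an untyped void *kernel_params[] array for GpuKernel_call, where a misordered or mistyped entry compiles silently; the new code calls k_xent_sm_1hot_bias_call, a wrapper with one typed parameter per kernel argument (presumably emitted by the GpuKernelBase machinery from the gpu_kernels specification), which also lets the intermediate stride_* locals disappear. A reduced sketch of the contrast for a hypothetical one-input kernel (my_kernel_call, n_rows, x_data, x_offset, and stride_x0 are stand-ins, not names from the diff):

    /* Before: untyped parameter array; the compiler cannot check
       argument order or types against the kernel signature. */
    void *kernel_params[] = {
        (void *)&n_rows,
        (void *)x_data, (void *)&x_offset,
        (void *)&stride_x0};
    int err = GpuKernel_call(&k, 1, &n_blocks, &n_threads, n_shared,
                             kernel_params);

    /* After: a typed wrapper, so each argument is checked at the
       call site and passed directly. */
    int err = my_kernel_call(1, &n_blocks, &n_threads, n_shared,
                             n_rows, x_data, x_offset, stride_x0);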