added gpu implementation of Crossentropy gradient op

eb84fcb3 · James Bergstra · 85908603 · eb84fcb3 · eb84fcb3 · eb84fcb3
--- a/blas.py
+++ b/blas.py
@@ -43,7 +43,8 @@ class GpuDot22(Op):
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(y)s)[1]))
        {
-            if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
+            //if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
+            Py_XDECREF(cnda_%(z)s);
            npy_intp dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(cnda_%(y)s)[1];
@@ -183,168 +184,6 @@ class GpuConv(Op):
                subsample=self.subsample,
                logical_img_shape=self.logical_img_hw,
                logical_kern_shape=self.logical_kern_hw,
-                kern_align=self.logical_kern_align_top)
+                kern_align=self.logical_kern_align_top,
+                verbose=1)
-class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
-    nin=3
-    nout=3
-    def __eq__(self, other):
-        return type(self) == type(other)
-    def __hash__(self):
-        return hash(type(self))
-    def make_node(self, x, b, y_idx):
-        nll = y_idx.type() #N.B. won't work when we don't cast y_idx to float anymore
-        sm = x.type()
-        am = y_idx.type()
-        return Apply(self, [x, b, y_idx], [nll, sm, am])
-    def c_support_code(self):
-        return """
-        __global__ void k_xent_sm_1hot_bias(int M, int N,
-            const float * x_data, int xs0, int xs1,
-            const float * b, int bs0,
-            const float * y_idx_data, int y_idxs0,
-            float * nll_data, int nlls0,
-            float * sm_data, int sms0, int sms1,
-            float * am_data, int ams0)
-        {
-            const int row = blockIdx.x;
-            const float * x = x_data + xs0 * row;
-            const int y_idx = (int)y_idx_data[row * y_idxs0];
-            float * sm = sm_data + sms0 * row;
-            float sum = 0.0;
-            int row_max_j = 0;
-            float row_max = x[0] + b[0];
-            for (int j = 1; j < N; ++j)
-            {
-                float row_ij = x[j*xs1] + b[j*bs0];
-                //todo: store to shared memory
-                row_max_j = (row_ij > row_max) ? j : row_max_j;
-                row_max   = (row_ij > row_max) ? row_ij : row_max;
-            }
-            //compute the exp
-            for (int j = 0; j < N; ++j)
-            {
-                float row_ij = x[j*xs1] + b[j*bs0];
-                float sm_ij = exp(row_ij - row_max);
-                sum += sm_ij;
-                sm[j * sms1] = sm_ij;
-            }
-            float sum_inv = 1.0 / sum;
-            for (int j = 0; j < N; ++j)
-            {
-                sm[j * sms1] *= sum_inv;
-            }
-            if ((y_idx >= N) || (y_idx < 0))
-            {
-                //TODO: set raise an error bit in a global var?
-                nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
-            }
-            else
-            {
-                nll_data[row*nlls0] = - x[y_idx*xs1]
-                           - b[y_idx*bs0]
-                           + row_max
-                           + log(sum);
-            }
-            am_data[row*ams0] = row_max_j;
-        }
-        """
-    def c_code(self, node, nodename, (x, b, y_idx), (nll, sm, am), sub):
-        classname=self.__class__.__name__
-        fail = sub['fail']
-        sio = StringIO.StringIO()
-        print >> sio, """
-        if (cnda_%(y_idx)s->nd != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
-            %(fail)s;
-        }
-        if (cnda_%(x)s->nd != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
-            %(fail)s;
-        }
-        if (cnda_%(b)s->nd != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
-            %(fail)s;
-        }
-        if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0])
-        {
-            PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments");
-            %(fail)s;
-        }
-        if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(b)s)[0])
-        {
-            PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments");
-            %(fail)s;
-        }
-        if ((NULL == cnda_%(nll)s) //initial condition
-            || (CudaNdarray_HOST_DIMS(cnda_%(nll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
-        {
-            Py_XDECREF(cnda_%(nll)s);
-            cnda_%(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
-            if(!cnda_%(nll)s)
-            {
-                %(fail)s;
-            }
-        }
-        if ((NULL == cnda_%(sm)s)
-            || (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
-            || (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]))
-        {
-            Py_XDECREF(cnda_%(sm)s);
-            cnda_%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(cnda_%(x)s));
-            if(!cnda_%(sm)s)
-            {
-                PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output");
-                // no need to decref cnda_nll, the cleanup code should pick it up.
-                %(fail)s;
-            }
-        }
-        if ((NULL == cnda_%(am)s)
-            || (CudaNdarray_HOST_DIMS(cnda_%(am)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
-        {
-            Py_XDECREF(cnda_%(am)s);
-            cnda_%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
-            if(!cnda_%(am)s)
-            {
-                PyErr_SetString(PyExc_MemoryError, "failed to alloc am output");
-                // no need to decref nll amd sm, the cleanup code should pick it up.
-                %(fail)s;
-            }
-        }
-        {
-            int n_blocks = CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0];
-            int n_threads = 1; //TODO: launch more threads per row and do parallel sum and max reductions.
-            int n_shared_bytes = 0; //n_threads * sizeof(float);
-            k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
-                CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
-                CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
-                CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], 
-                CudaNdarray_DEV_DATA(cnda_%(b)s), CudaNdarray_HOST_STRIDES(cnda_%(b)s)[0], 
-                CudaNdarray_DEV_DATA(cnda_%(y_idx)s), CudaNdarray_HOST_STRIDES(cnda_%(y_idx)s)[0], 
-                CudaNdarray_DEV_DATA(cnda_%(nll)s), CudaNdarray_HOST_STRIDES(cnda_%(nll)s)[0], 
-                CudaNdarray_DEV_DATA(cnda_%(sm)s), CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[1], 
-                CudaNdarray_DEV_DATA(cnda_%(am)s), CudaNdarray_HOST_STRIDES(cnda_%(am)s)[0]);
-            CNDA_THREAD_SYNC;
-            cudaError_t err = cudaGetLastError();
-            if (cudaSuccess != err) 
-            {
-                PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err));
-                // no need to decref output vars the cleanup code should pick them up.
-                %(fail)s;
-            }
-        }
-        """ % locals()
-        return sio.getvalue()
-    def c_code_cache_version(self):
-        return (1,0)
--- a/nnet.py
+++ b/nnet.py
--- a/opt.py
+++ b/opt.py
@@ -3,8 +3,10 @@ from theano import tensor, scalar, compile
 from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
 from theano_cuda_ndarray.basic_ops import *
-from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv, GpuCrossentropySoftmaxArgmax1HotWithBias
+from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv
+from theano_cuda_ndarray.nnet import (
+        GpuCrossentropySoftmaxArgmax1HotWithBias,
+        GpuCrossentropySoftmax1HotWithBiasDx)
 from theano.compile import optdb
 #optdb.print_summary()  # this shows what is currently registered (in a so-far crude way...)
@@ -249,3 +251,19 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
                    host_from_gpu(gpu_sm), 
                    cast(host_from_gpu(gpu_am), am_dtype)]
    return False
+@register_opt()
+@local_optimizer([])
+def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
+    print 'REPLACING ', node, '??'
+    if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
+        dnll,sm,yidx = node.inputs
+        if sm.owner and sm.owner.op == host_from_gpu:
+            gpu_sm, = sm.owner.inputs
+            gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
+                gpu_from_host(dnll),
+                gpu_sm,
+                gpu_from_host(cast(yidx, 'float32')))
+            print 'YEP ', node
+            return [host_from_gpu(gpu_dx)]
+    return False