Merge pull request #1634 from vdumoulin/new_backend

New backend: GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmax1HotWithBiasDx

Merge pull request #1634 from vdumoulin/new_backend
51a6bbc6 · abergeron · 5249876b · 9b7d22e7 · 51a6bbc6 · 51a6bbc6
--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
+import numpy
+from theano import Op, Apply
+from theano.compat.six import StringIO
+from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
+try:
+    import pygpu
+    from pygpu import gpuarray, elemwise
+except ImportError:
+    pass
+from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable
+from theano.sandbox.gpuarray.type import GpuArrayType
+class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
+    """
+    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
+
+    Inputs are `x` (2d), `b` (1d) and `y_idx` (1d); the generated C code
+    raises ValueError when the ranks or the matching dimensions disagree.
+    For every row of `x` the kernel produces three outputs:
+      - nll: -(x[row, y] + b[y]) + row_max
+             + log(sum_j exp(x[row, j] + b[j] - row_max)) with
+             y = y_idx[row], or 0.0 when y is outside [0, x.shape[1]);
+      - sm: the softmax of x[row] + b;
+      - am: the index of the (first) largest entry of x[row] + b.
+    """
+    nin = 3
+    nout = 3
+    # Equality and hashing depend on the type only: two instances of this
+    # op are interchangeable.
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __hash__(self):
+        return hash(type(self))
+    def __str__(self):
+        return self.__class__.__name__
+    def make_node(self, x, b, y_idx):
+        """
+        Build the Apply node for inputs (x, b, y_idx).
+
+        The inputs are passed through as_gpuarray_variable. Outputs are
+        `nll` (dtype of x, broadcastable pattern of y_idx), `sm` (same
+        type as x) and `am` (same type as y_idx).
+        """
+        #N.B. won't work when we don't cast y_idx to float anymore
+        x = as_gpuarray_variable(x)
+        b = as_gpuarray_variable(b)
+        y_idx = as_gpuarray_variable(y_idx)
+        nll = GpuArrayType(x.type.dtype,
+                           y_idx.type.broadcastable)()
+        sm = x.type()
+        am = y_idx.type()
+        return Apply(self, [x, b, y_idx], [nll, sm, am])
+    def c_headers(self):
+        """Return the headers the generated code includes."""
+        return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']
+    def c_support_code_apply(self, node, nodename):
+        """
+        Return the CUDA kernel source for this node.
+
+        The kernel is specialised on the dtypes of the three inputs and
+        named after `nodename`. Each block handles the rows blockIdx.x,
+        blockIdx.x + gridDim.x, ... and processes a row serially. The
+        returned code also declares the `cuda_get_ptr` function pointer
+        that c_init_code fills in.
+        """
+        dtype_x = node.inputs[0].dtype
+        dtype_b = node.inputs[1].dtype
+        dtype_y_idx = node.inputs[2].dtype
+        return """
+        __global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
+            const npy_%(dtype_x)s* x_data, int xs0, int xs1,
+            const npy_%(dtype_b)s* b, int bs0,
+            const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0,
+            npy_%(dtype_x)s* nll_data, int nlls0,
+            npy_%(dtype_x)s* sm_data, int sms0, int sms1,
+            npy_%(dtype_y_idx)s* am_data, int ams0)
+        {
+          for (int row = blockIdx.x; row < M; row += gridDim.x){
+            const npy_%(dtype_x)s* x = x_data + xs0 * row;
+            const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
+            npy_%(dtype_x)s* sm = sm_data + sms0 * row;
+            npy_%(dtype_x)s sum = 0.0;
+            int row_max_j = 0;
+            npy_%(dtype_x)s row_max = x[0] + b[0];
+            for (int j = 1; j < N; ++j)
+            {
+                npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
+                //todo: store to shared memory
+                row_max_j = (row_ij > row_max) ? j : row_max_j;
+                row_max   = (row_ij > row_max) ? row_ij : row_max;
+            }
+            //compute the exp
+            for (int j = 0; j < N; ++j)
+            {
+                npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
+                npy_%(dtype_x)s sm_ij = exp(row_ij - row_max);
+                sum += sm_ij;
+                sm[j * sms1] = sm_ij;
+            }
+            npy_%(dtype_x)s sum_inv = 1.0 / sum;
+            for (int j = 0; j < N; ++j)
+            {
+                sm[j * sms1] *= sum_inv;
+            }
+            if ((y_idx >= N) || (y_idx < 0))
+            {
+                //TODO: set raise an error bit in a global var?
+                nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
+            }
+            else
+            {
+                nll_data[row*nlls0] = - x[y_idx*xs1]
+                           - b[y_idx*bs0]
+                           + row_max
+                           + log(sum);
+            }
+            am_data[row*ams0] = row_max_j;
+          }
+        }
+        CUdeviceptr (*cuda_get_ptr)(gpudata *g);
+        """ % locals()
+    def c_init_code(self):
+        """
+        Return module-init C code that looks up the "cuda_get_ptr"
+        extension and stores it in the `cuda_get_ptr` function pointer.
+        """
+        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
+    def c_code(self, node, nodename, inp, out, sub):
+        """
+        Return the C code that runs the op for one Apply node.
+
+        The code checks the ranks and shapes of x, b and y_idx (ValueError
+        on mismatch), (re)allocates nll, sm and am when they are missing
+        or have the wrong shape, then launches the kernel with at most
+        256 blocks of one thread and reports launch errors as
+        RuntimeError.
+        """
+        # sm has the same type as x (see make_node) and the kernel writes
+        # it as npy_<dtype_x>, so both nll and sm are allocated with x's
+        # typecode; am is allocated with y_idx's typecode.
+        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
+        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
+        # Strides are reported in bytes; the kernel indexes in elements,
+        # hence the division by the item sizes below.
+        itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
+        itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
+        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
+        itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize
+        itemsize_sm = numpy.dtype(node.outputs[1].dtype).itemsize
+        itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
+        x, b, y_idx = inp
+        nll, sm, am = out
+        dtype_x = node.inputs[0].dtype
+        dtype_b = node.inputs[1].dtype
+        dtype_y_idx = node.inputs[2].dtype
+        dtype_nll = node.outputs[0].dtype
+        dtype_sm = node.outputs[1].dtype
+        dtype_am = node.outputs[2].dtype
+        classname = self.__class__.__name__
+        fail = sub['fail']
+        sio = StringIO()
+        print >> sio, """
+        if (PyGpuArray_NDIM(%(y_idx)s) != 1)
+        {
+            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
+            %(fail)s;
+        }
+        if (PyGpuArray_NDIM(%(x)s) != 2)
+        {
+            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
+            %(fail)s;
+        }
+        if (PyGpuArray_NDIM(%(b)s) != 1)
+        {
+            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
+            %(fail)s;
+        }
+        if (PyGpuArray_DIMS(%(x)s)[0] !=
+            PyGpuArray_DIMS(%(y_idx)s)[0])
+        {
+            PyErr_SetString(PyExc_ValueError,
+                            "dimension mismatch in x,y_idx arguments");
+            %(fail)s;
+        }
+        if (PyGpuArray_DIMS(%(x)s)[1] != PyGpuArray_DIMS(%(b)s)[0])
+        {
+            PyErr_SetString(PyExc_ValueError,
+                            "dimension mismatch in x,b arguments");
+            %(fail)s;
+        }
+        if ((NULL == %(nll)s) //initial condition
+            || (PyGpuArray_DIMS(%(nll)s)[0] !=
+                PyGpuArray_DIMS(%(y_idx)s)[0]))
+        {
+            Py_XDECREF(%(nll)s);
+            %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
+                                %(typecode_x)s,
+                                GA_C_ORDER,
+                                pygpu_default_context(), Py_None);
+            if (!%(nll)s) {
+                %(fail)s
+            }
+        }
+        if ((NULL == %(sm)s)
+            || (PyGpuArray_DIMS(%(sm)s)[0] !=
+                PyGpuArray_DIMS(%(x)s)[0])
+            || (PyGpuArray_DIMS(%(sm)s)[1] !=
+                PyGpuArray_DIMS(%(x)s)[1]))
+        {
+            Py_XDECREF(%(sm)s);
+            %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
+                                %(typecode_x)s,
+                                GA_C_ORDER,
+                                pygpu_default_context(), Py_None);
+            if(!%(sm)s)
+            {
+                PyErr_SetString(PyExc_MemoryError,
+                                "failed to alloc sm output");
+                // no need to decref cnda_nll, the cleanup code should do it up
+                %(fail)s;
+            }
+        }
+        if ((NULL == %(am)s)
+            || (PyGpuArray_DIMS(%(am)s)[0] !=
+                PyGpuArray_DIMS(%(y_idx)s)[0]))
+        {
+            Py_XDECREF(%(am)s);
+            %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
+                                %(typecode_y_idx)s,
+                                GA_C_ORDER,
+                                pygpu_default_context(), Py_None);
+            if(!%(am)s)
+            {
+                PyErr_SetString(PyExc_MemoryError,
+                                "failed to alloc am output");
+                // no need to decref nll and sm,
+                // the cleanup code should do it up
+                %(fail)s;
+            }
+        }
+        {
+            int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
+     //TODO: launch more threads per row and do parallel sum and max reductions
+            int n_threads = 1;
+            int n_shared_bytes = 0; //n_threads * sizeof(dtype);
+            k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
+                PyGpuArray_DIMS(%(x)s)[0],
+                PyGpuArray_DIMS(%(x)s)[1],
+                (npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
+                                   %(x)s->ga.offset),
+                PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
+                PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
+                (npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
+                                   %(b)s->ga.offset),
+                PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
+                (npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
+                                   %(y_idx)s->ga.offset),
+                PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
+                (npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
+                                   %(nll)s->ga.offset),
+                PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
+                (npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
+                                   %(sm)s->ga.offset),
+                PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
+                PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
+                (npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
+                                   %(am)s->ga.offset),
+                PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
+            cudaError_t err = cudaGetLastError();
+            if (cudaSuccess != err)
+            {
+                PyErr_Format(PyExc_RuntimeError,
+                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
+                             "The kernel was launched with %%d threads,"
+                             " %%d blocks and %%d shared memory\\n",
+                             cudaGetErrorString(err),
+                             n_threads, n_blocks, n_shared_bytes);
+                // no need to decref output vars the cleanup code will do it
+                %(fail)s;
+            }
+        }
+        """ % locals()
+        return sio.getvalue()
+    def c_code_cache_version(self):
+        # Bumped from 5: the generated C code changed (sm is now allocated
+        # with x's typecode), so previously cached modules must not be reused.
+        #return ()
+        return (6,)
+    def c_compiler(self):
+        """Return the compiler class used for the generated code (nvcc)."""
+        return NVCC_compiler
+# Module-level instance of GpuCrossentropySoftmaxArgmax1HotWithBias.
+gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
+class GpuCrossentropySoftmax1HotWithBiasDx(Op):
+    """
+    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
+
+    Gradient wrt x of the CrossentropySoftmax1Hot Op.
+    Inputs are `dnll` (1d), `sm` (2d) and `y_idx` (1d); the single output
+    `dx` has the type of `sm`. The kernel computes
+        dx[i, j] = dnll[i] * (sm[i, j] - 1)   when j == y_idx[i]
+        dx[i, j] = dnll[i] * sm[i, j]         otherwise.
+    """
+    nin = 3
+    nout = 1
+    def __init__(self, **kwargs):
+        Op.__init__(self, **kwargs)
+    # Equality and hashing depend on the type only: two instances of this
+    # op are interchangeable.
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __hash__(self):
+        return hash(type(self))
+    def __str__(self):
+        return self.__class__.__name__
+    def make_node(self, dnll, sm, y_idx):
+        """
+        Build the Apply node for inputs (dnll, sm, y_idx).
+
+        The inputs are passed through as_gpuarray_variable; the output
+        variable has the same type as `sm`.
+        """
+        dnll = as_gpuarray_variable(dnll)
+        sm = as_gpuarray_variable(sm)
+        y_idx = as_gpuarray_variable(y_idx)
+        return Apply(self, [dnll, sm, y_idx], [sm.type()])
+    def c_code_cache_version(self):
+        # Bumped from 6: the generated C code changed (format of the shape
+        # mismatch message), so previously cached modules must not be reused.
+        #return ()
+        return (7,)
+    def c_headers(self):
+        """Return the headers the generated code includes."""
+        return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']
+    def c_compiler(self):
+        """Return the compiler class used for the generated code (nvcc)."""
+        return NVCC_compiler
+    def c_code(self, node, nodename, inp, out, sub):
+        """
+        Return the C code that runs the op for one Apply node.
+
+        The code checks ranks and leading dimensions of dnll, sm and y_idx
+        (ValueError on mismatch), (re)allocates dx when it is missing or
+        has the wrong shape, then launches the kernel with at most 256
+        blocks and at most 256 threads per block and reports launch errors
+        as RuntimeError.
+        """
+        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
+        # Strides are reported in bytes; the kernel indexes in elements,
+        # hence the division by the item sizes below.
+        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
+        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
+        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
+        itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
+        dtype_dnll = node.inputs[0].dtype
+        dtype_sm = node.inputs[1].dtype
+        dtype_y_idx = node.inputs[2].dtype
+        dtype_dx = node.outputs[0].dtype
+        dnll, sm, y_idx = inp
+        dx, = out
+        fail = sub['fail']
+        # In the shape mismatch message the dimensions are cast to long and
+        # printed with %ld, so the format always matches the argument type
+        # whatever integer type pygpu uses for dimensions.
+        return """
+        if ((PyGpuArray_NDIM(%(dnll)s) != 1)
+            || (PyGpuArray_NDIM(%(sm)s) != 2)
+            || (PyGpuArray_NDIM(%(y_idx)s) != 1))
+        {
+            PyErr_SetString(PyExc_ValueError, "rank error");
+            %(fail)s;
+        }
+        if (PyGpuArray_DIMS(%(dnll)s)[0] !=
+            PyGpuArray_DIMS(%(sm)s)[0])
+        {
+            PyErr_Format(PyExc_ValueError,
+                         "dnll.shape[0] == %%ld, but sm.shape[0] == %%ld",
+                         (long)PyGpuArray_DIMS(%(dnll)s)[0],
+                         (long)PyGpuArray_DIMS(%(sm)s)[0]);
+            %(fail)s;
+        }
+        if (PyGpuArray_DIMS(%(dnll)s)[0] !=
+            PyGpuArray_DIMS(%(y_idx)s)[0])
+        {
+            PyErr_SetString(PyExc_ValueError,
+                            "dnll.shape[0] != y_idx.shape[0]");
+            %(fail)s;
+        }
+        if ((NULL == %(dx)s)
+            || (PyGpuArray_DIMS(%(dx)s)[0] !=
+                PyGpuArray_DIMS(%(sm)s)[0])
+            || (PyGpuArray_DIMS(%(dx)s)[1] !=
+                PyGpuArray_DIMS(%(sm)s)[1]))
+        {
+            Py_XDECREF(%(dx)s);
+            %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
+                                 %(typecode_dx)s,
+                                 GA_C_ORDER,
+                                 pygpu_default_context(), Py_None);
+            if (!%(dx)s) {
+                %(fail)s
+            }
+        }
+        {
+            int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256;
+            int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256;
+            kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
+                <<<n_blocks, n_threads>>>(
+                        PyGpuArray_DIMS(%(dx)s)[0],
+                        PyGpuArray_DIMS(%(dx)s)[1],
+                        (npy_%(dtype_dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) +
+                                           %(dnll)s->ga.offset),
+                        PyGpuArray_STRIDES(%(dnll)s)[0] / %(itemsize_dnll)s,
+                        (npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
+                                           %(sm)s->ga.offset),
+                        PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
+                        PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
+                        (npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
+                                           %(y_idx)s->ga.offset),
+                        PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
+                        (npy_%(dtype_dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) +
+                                           %(dx)s->ga.offset),
+                        PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
+                        PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
+                );
+            cudaError_t err = cudaGetLastError();
+            if( cudaSuccess != err)
+            {
+                PyErr_Format(PyExc_RuntimeError,
+                             "Cuda error: %%s: %%s.\\n"
+                             "The kernel was launched with %%d threads and"
+                             " %%d blocks\\n",
+                             "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
+                             cudaGetErrorString(err), n_threads, n_blocks);
+                %(fail)s;
+            }
+        }
+        assert(%(dx)s);
+        """ % locals()
+    def c_support_code_apply(self, node, nodename):
+        """
+        Return the CUDA kernel source for this node.
+
+        The kernel is specialised on the dtypes of the inputs and of the
+        output and named after `nodename`. Blocks step over rows by
+        gridDim.x and threads step over columns by blockDim.x. The
+        returned code also declares the `cuda_get_ptr` function pointer
+        that c_init_code fills in.
+        """
+        dtype_dnll = node.inputs[0].dtype
+        dtype_sm = node.inputs[1].dtype
+        dtype_y_idx = node.inputs[2].dtype
+        dtype_dx = node.outputs[0].dtype
+        return """
+        __global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
+           int N, int K,
+           const npy_%(dtype_dnll)s* dnll, const int dnll_s0,
+           const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1,
+           const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0,
+           npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1)
+        {
+            for (int i = blockIdx.x; i < N; i += gridDim.x)
+            {
+                npy_%(dtype_dnll)s dnll_i = dnll[i * dnll_s0];
+                npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
+                for (int j = threadIdx.x; j < K; j += blockDim.x)
+                {
+                    if (y_i == j)
+                    {
+                        dx[i * dx_s0 + j * dx_s1] =
+                            dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
+                    }
+                    else
+                    {
+                        dx[i * dx_s0 + j * dx_s1] =
+                            dnll_i * sm[i * sm_s0 + j * sm_s1];
+                    }
+                    //dx[i * dx_s0 + j * dx_s1] =
+                    //    dnll_i * sm[i * sm_s0 + j * sm_s1];
+                    //dx[i*dx_s0+j*dx_s1] = 0;
+                }
+            }
+        }
+        CUdeviceptr (*cuda_get_ptr)(gpudata *g);
+        """ % locals()
+    def c_init_code(self):
+        """
+        Return module-init C code that looks up the "cuda_get_ptr"
+        extension and stores it in the `cuda_get_ptr` function pointer.
+        """
+        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
+# Module-level instance of GpuCrossentropySoftmax1HotWithBiasDx.
+gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -18,6 +18,8 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
                                               GpuReshape,
                                               GpuEye)
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
+from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
+                                          GpuCrossentropySoftmax1HotWithBiasDx)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduce)
 from theano.sandbox.gpuarray.subtensor import GpuSubtensor
@@ -58,7 +60,6 @@ def op_lifter(OP):
        def local_opt(node):
            if type(node.op) in OP:
                # This does not support nodes that have more than one output.
-                assert len(node.outputs) == 1
                # either one of our inputs is on the gpu or
                # all of our client are on the gpu
                if (any([i.owner and i.owner.op == host_from_gpu
@@ -69,7 +70,9 @@ def op_lifter(OP):
                    # This is needed as sometimes new_op inherit from OP.
                    if new_op and new_op != node.op:
                        if isinstance(new_op, theano.Op):
-                            return [host_from_gpu(new_op(*node.inputs))]
+                            return [host_from_gpu(o) for o in new_op(*node.inputs, return_list=True)]
+                        elif isinstance(new_op, (tuple, list)):
+                            return [host_from_gpu(o) for o in new_op]
                        else:  # suppose it is a variable on the GPU
                            return [host_from_gpu(new_op)]
            return False
@@ -267,3 +270,15 @@ def local_gpua_dot22(node):
 @op_lifter([tensor.basic.Eye])
 def local_gpua_eye(node):
    return GpuEye(dtype=node.op.dtype)
+@register_opt()
+@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
+def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
+    """
+    Return the gpuarray op that op_lifter uses in place of a
+    CrossentropySoftmaxArgmax1HotWithBias node.
+    """
+    return GpuCrossentropySoftmaxArgmax1HotWithBias()
+@register_opt()
+@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
+def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
+    """
+    Return the gpuarray op that op_lifter uses in place of a
+    CrossentropySoftmax1HotWithBiasDx node.
+    """
+    return GpuCrossentropySoftmax1HotWithBiasDx()
--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
+from nose.plugins.skip import SkipTest
+import numpy
+import theano
+from theano.gof.python25 import any
+import theano.tensor as T
+import theano.tests.unittest_tools as utt
+from theano.sandbox import gpuarray
+if theano.sandbox.gpuarray.pygpu is None:
+    raise SkipTest("pygpu not installed")
+# We let that import do the init of the back-end if needed.
+from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
+                                                          mode_without_gpu)
+if not gpuarray.pygpu_activated:
+    raise SkipTest("pygpu disabled")
+from theano.sandbox.gpuarray.nnet import (
+    GpuCrossentropySoftmaxArgmax1HotWithBias,
+    GpuCrossentropySoftmax1HotWithBiasDx)
+def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
+    """
+    This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.
+    We check that the kernel loops when there are too many rows for
+    the number of blocks launched.
+    """
+    # Smaller feature sizes are kept when mode_with_gpu is a DebugMode;
+    # otherwise larger ones are used.
+    n_in = 1000
+    batch_size = 4097
+    n_out = 1250
+    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
+        n_in = 4098
+        n_out = 4099
+    # NOTE(review): x does not appear to be used below - confirm before removing.
+    x = T.fmatrix('x')
+    y = T.lvector('y')
+    b = T.fvector('b')
+    #W = T.fmatrix('W')
+    # We precompute the dot product with the big shape beforehand, so that
+    # the test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with
+    # the error "the launch timed out and was terminated" on GPU cards
+    # that are not powerful enough. We need the big shape to check for
+    # corner cases.
+    dot_result = T.fmatrix('dot_result')
+    # Seed numpy.random with config.unittests.rseed
+    utt.seed_rng()
+    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
+                       dtype=numpy.float32)
+    #?????yy = numpy.ones((batch_size,),dtype='float32')
+    yy = numpy.ones((batch_size,), dtype='int32')
+    b_values = numpy.zeros((n_out,), dtype='float32')
+    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
+    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
+    # W_values is only needed to build dot_value.
+    del W_values
+    # Symbolic graph: softmax of (dot_result + b), its argmax, the mean
+    # negative log-likelihood of the labels y, and the gradient of that
+    # loss with respect to dot_result.
+    p_y_given_x = T.nnet.softmax(dot_result + b)
+    y_pred = T.argmax(p_y_given_x, axis=-1)
+    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
+    dW = T.grad(loss, dot_result)
+    # Compile the same graph once without and once with the gpu mode.
+    classify = theano.function(inputs=[y, b, dot_result],
+                               outputs=[loss, y_pred, dW],
+                               mode=mode_without_gpu)
+    classify_gpu = theano.function(inputs=[y, b, dot_result],
+                                   outputs=[loss, y_pred, dW],
+                                   mode=mode_with_gpu)
+    #theano.printing.debugprint(classify)
+    #theano.printing.debugprint(classify_gpu)
+    # The CPU graph must contain the CPU op and the GPU graph the GPU op.
+    assert any([isinstance(node.op,
+                           T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
+                for node in classify.maker.fgraph.toposort()])
+    assert any([isinstance(node.op,
+                           GpuCrossentropySoftmaxArgmax1HotWithBias)
+                for node in classify_gpu.maker.fgraph.toposort()])
+    out = classify(yy, b_values, dot_value)
+    gout = classify_gpu(yy, b_values, dot_value)
+    # Compare loss (0), gradient (2) and predictions (1) of both versions;
+    # the assertion messages report the largest gradient difference and
+    # the positions where the predictions differ.
+    assert len(out) == len(gout) == 3
+    assert numpy.allclose(out[0], gout[0])
+    assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
+        gout[2] - out[2]).max()
+    assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val)
+                                             for id, val in enumerate(out[1] -
+                                                                      gout[1])
+                                             if val != 0]
+def test_GpuCrossentropySoftmax1HotWithBiasDx():
+    """
+    This is a basic test for GpuCrossentropySoftmax1HotWithBiasDx.
+    We check that the kernel loops when there are more rows/columns
+    than blocks/threads launched.
+    """
+    # Smaller feature sizes are kept when mode_with_gpu is a DebugMode;
+    # otherwise larger ones are used.
+    n_in = 1000
+    batch_size = 4097
+    n_out = 1250
+    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
+        n_in = 4098
+        n_out = 4099
+    # Seed numpy.random with config.unittests.rseed
+    utt.seed_rng()
+    softmax_output_value = numpy.random.rand(batch_size,
+                                             n_out).astype('float32')
+    dnll_value = numpy.asarray(numpy.random.rand(batch_size), dtype='float32')
+    y_idx_value = numpy.random.randint(low=0, high=5, size=batch_size)
+    softmax_output = T.fmatrix()
+    # NOTE(review): the name softmax_output is rebound to the normalised
+    # expression and that rebound variable is what is passed as the
+    # function input below, so the division presumably never runs. The
+    # reshape also uses shape[1] although sum(axis=1) has one entry per
+    # row (shape[0]). Left unchanged - confirm the intent before fixing.
+    softmax_output /= softmax_output.sum(axis=1).reshape(
+        softmax_output.shape[1], 1)
+    op = theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(
+        dnll_value,
+        softmax_output,
+        y_idx_value)
+    # Compile the same graph once without and once with the gpu mode.
+    cpu_f = theano.function([softmax_output], op, mode=mode_without_gpu)
+    gpu_f = theano.function([softmax_output], op, mode=mode_with_gpu)
+    #theano.printing.debugprint(cpu_f)
+    #theano.printing.debugprint(gpu_f)
+    # The CPU graph must contain the CPU op and the GPU graph the GPU op.
+    assert any([isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx)
+                for node in cpu_f.maker.fgraph.toposort()])
+    assert any([isinstance(node.op,
+                           GpuCrossentropySoftmax1HotWithBiasDx)
+                for node in gpu_f.maker.fgraph.toposort()])
+    cpu_out = cpu_f(softmax_output_value)
+    gpu_out = gpu_f(softmax_output_value)
+    rtol = 1e-5
+    atol = 1e-6
+    if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
+        # Print details about the entry with the largest scaled error
+        # before failing. max_i is an index into the flattened
+        # (batch_size, n_out) output, so its row is max_i / n_out and its
+        # column is max_i % n_out.
+        abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
+        scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
+        max_i = scaled_err.argmax()
+        print 'max err index:', max_i, max_i / batch_size,
+        print max_i % batch_size, max_i / n_out, max_i % n_out
+        print 'At that index:'
+        print 'err:', scaled_err.flatten()[max_i]
+        print 'absolute error:', abs_err.flatten()[max_i]
+        print 'relative error:', rel_err.flatten()[max_i]
+        print 'cpu_out:', cpu_out.flatten()[max_i]
+        print 'gpu_out:', gpu_out.flatten()[max_i]
+        print 'softmax_output_value:', softmax_output_value.flatten()[max_i]
+        print 'dnll_value:', dnll_value[max_i / n_out]
+        print 'y_idx_value:', y_idx_value[max_i / n_out]
+        assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
+            rtol, atol)
--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -138,7 +138,9 @@ class GpuArrayType(Type):
            return numpy.dtype(self.dtype).itemsize
    def c_declare(self, name, sub):
-        return "PyGpuArrayObject *%s;" % (name,)
+        # C declaration for the variable: a PyGpuArrayObject pointer
+        # called `name`.
+        return """
+        PyGpuArrayObject *%(name)s;
+        """ % locals()
    def c_init(self, name, sub):
+        # C initialisation for the variable: `name` starts out as NULL.
        return "%s = NULL;" % (name,)