提交 acb1a0e1 authored 作者: Frederic's avatar Frederic

fix GpuSoftmaxWithBias when the input has size 0.

上级 f16aee3d
...@@ -60,6 +60,8 @@ Crash Fix ...@@ -60,6 +60,8 @@ Crash Fix
element-wise fusion optimization when upcasting some inputs to element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the GPU). float32 (to compute them on the GPU).
(Frederic B., reported by Sander Dieleman) (Frederic B., reported by Sander Dieleman)
* GpuSoftmaxWithBias with shape (0, N) where N > 1.
(Frédéric B., reported by Razvan P.)
============= =============
Release Notes Release Notes
......
...@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (4,) + inline_softmax.code_version return (5,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, b = inp x, b = inp
...@@ -461,10 +461,12 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -461,10 +461,12 @@ class GpuSoftmaxWithBias (GpuOp):
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024); int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float); int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{
kSoftmaxWithBias_%(nodename)s kSoftmaxWithBias_%(nodename)s
<<< <<<
// todo: cap these at the card limits, implement loops in kernel // todo: cap these at the card limits,
// implement loops in kernel
n_blocks, n_blocks,
n_threads, n_threads,
n_shared_bytes n_shared_bytes
...@@ -485,10 +487,14 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -485,10 +487,14 @@ class GpuSoftmaxWithBias (GpuOp):
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s; %(fail)s;
} }
} }
}
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
......
...@@ -142,7 +142,10 @@ def test_softmax_with_bias(): ...@@ -142,7 +142,10 @@ def test_softmax_with_bias():
TODO: check that we loop when there are too many threads. (THIS IS NOT IMPLEMENTED) TODO: check that we loop when there are too many threads. (THIS IS NOT IMPLEMENTED)
""" """
x = T.fmatrix('x') x = T.fmatrix('x')
z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:])) # We can't use zeros_like(x[0,::]) as this don't allow to test with
# 0 shape.
z = T.nnet.softmax_with_bias(x, T.alloc(numpy.asarray(0, dtype='float32'),
x.shape[1]))
f = theano.function([x],z, mode=mode_without_gpu) f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu) f_gpu = theano.function([x],z, mode=mode_with_gpu)
...@@ -165,6 +168,7 @@ def test_softmax_with_bias(): ...@@ -165,6 +168,7 @@ def test_softmax_with_bias():
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2<<15, 5) cmp(2<<15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(0, 10)
cmp(4, 1000, True) cmp(4, 1000, True)
cmp(4, 1024, True) cmp(4, 1024, True)
cmp(4, 2000, True) cmp(4, 2000, True)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论