提交 acb1a0e1 authored 作者: Frederic's avatar Frederic

fix GpuSoftmaxWithBias when the input has size 0.

上级 f16aee3d
......@@ -60,6 +60,8 @@ Crash Fix
element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the GPU).
(Frederic B., reported by Sander Dieleman)
* GpuSoftmaxWithBias with shape (0, N) with N > 1.
(Frédéric B., reported by Razvan P.)
=============
Release Notes
......
......@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp):
return [shape[0]]
def c_code_cache_version(self):
#return ()
return (4,) + inline_softmax.code_version
return (5,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub):
x, b = inp
......@@ -461,14 +461,16 @@ class GpuSoftmaxWithBias (GpuOp):
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
n_blocks,
n_threads,
n_shared_bytes
>>>(
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits,
// implement loops in kernel
n_blocks,
n_threads,
n_shared_bytes
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
......@@ -480,13 +482,17 @@ class GpuSoftmaxWithBias (GpuOp):
CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
%(fail)s;
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
}
}
assert(%(z)s);
......
......@@ -142,7 +142,10 @@ def test_softmax_with_bias():
TODO: check that we loop when there are too many threads. (THIS IS NOT IMPLEMENTED)
"""
x = T.fmatrix('x')
z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:]))
    # We can't use zeros_like(x[0, :]) as it doesn't allow testing with a
    # 0 shape.
z = T.nnet.softmax_with_bias(x, T.alloc(numpy.asarray(0, dtype='float32'),
x.shape[1]))
f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu)
......@@ -165,6 +168,7 @@ def test_softmax_with_bias():
#we need to test n>32*1024 to check that we make the block loop.
cmp(2<<15, 5)
cmp(4074, 400)
cmp(0, 10)
cmp(4, 1000, True)
cmp(4, 1024, True)
cmp(4, 2000, True)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论