Merge pull request #548 from nouiz/crash_size_0

Crash size 0

Merge pull request #548 from nouiz/crash_size_0
41a4a100 · lamblin · f16aee3d · d0c47a3d · 41a4a100 · 41a4a100
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -60,6 +60,8 @@ Crash Fix
   element-wise fusion optimization when upcasting some inputs to
   float32 (to compute them on the GPU).
   (Frederic B., reported by Sander Dieleman)
+ * GpuSoftmaxWithBias with shape (0, N) with N > 1.
+   (Frédéric B., reported by Razvan P.)
 =============
 Release Notes

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp):
        return  [shape[0]]
    def c_code_cache_version(self):
        #return ()
-        return (4,) + inline_softmax.code_version
+        return (5,) + inline_softmax.code_version
    def c_code(self, node, nodename, inp, out, sub):
        x, b = inp
@@ -461,14 +461,16 @@ class GpuSoftmaxWithBias (GpuOp):
 //TODO, detect the maximum number of thread per block.
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
+            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
-            kSoftmaxWithBias_%(nodename)s
+            {
-                <<<
+                kSoftmaxWithBias_%(nodename)s
-                // todo: cap these at the card limits, implement loops in kernel
+                    <<<
-                    n_blocks,
+                    // todo: cap these at the card limits,
-                    n_threads,
+                    //       implement loops in kernel
-                    n_shared_bytes
+                        n_blocks,
-                >>>(
+                        n_threads,
+                        n_shared_bytes
+                    >>>(
                        CudaNdarray_HOST_DIMS(%(x)s)[0],
                        CudaNdarray_HOST_DIMS(%(x)s)[1],
@@ -480,13 +482,17 @@ class GpuSoftmaxWithBias (GpuOp):
                        CudaNdarray_HOST_STRIDES(%(b)s)[0],
                        CudaNdarray_DEV_DATA(%(z)s)  //guarantee c contig
-                );
+                    );
-            CNDA_THREAD_SYNC;
+                CNDA_THREAD_SYNC;
-            cudaError_t err = cudaGetLastError();
+                cudaError_t err = cudaGetLastError();
-            if( cudaSuccess != err)
+                if( cudaSuccess != err)
-            {
+                {
-                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
+                    PyErr_Format(PyExc_RuntimeError,
-                %(fail)s;
+                                 "Cuda error: %%s: %%s.\\n",
+                                 "kSoftmaxWithBias_%(nodename)s",
+                                 cudaGetErrorString(err));
+                    %(fail)s;
+                }
            }
        }
        assert(%(z)s);

--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
-import theano, numpy
+from nose.plugins.skip import SkipTest
+import numpy
+import theano
 import theano.tensor as T
 import theano.tests.unittest_tools as utt
 # Skip test if cuda_ndarray is not available.
-from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda
 if cuda.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')
-if theano.config.mode=='FAST_COMPILE':
+if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_mode(
+        'FAST_RUN').excluding('gpu')
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
 def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
@@ -28,52 +32,67 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    batch_size = 4097
    n_out = 1250
-    if theano.config.mode!="DEBUG_MODE":
+    if theano.config.mode != "DEBUG_MODE":
        n_in = 4098
        n_out = 4099
    x = T.fmatrix('x')
    y = T.lvector('y')
    b = T.fvector('b')
    #W = T.fmatrix('W')
-    #we precompute the dot with big shape before to allow the test of GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error (the launch timed out and was terminated) on GPU card not powerfull enought. We need the big shape to check for corner case.
+    # We precompute the dot product with the big shape beforehand so that
+    # the GpuCrossentropySoftmax1HotWithBiasDx test does not fail with the
+    # error (the launch timed out and was terminated) on GPU cards that
+    # are not powerful enough. We need the big shape to check corner
+    # cases.
    dot_result = T.fmatrix('dot_result')
    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()
-    xx = numpy.asarray(numpy.random.rand(batch_size,n_in),dtype=numpy.float32)
+    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
+                       dtype=numpy.float32)
    #?????yy = numpy.ones((batch_size,),dtype='float32')
-    yy = numpy.ones((batch_size,),dtype='int32')
+    yy = numpy.ones((batch_size,), dtype='int32')
-    b_values = numpy.zeros((n_out,),dtype='float32')
+    b_values = numpy.zeros((n_out,), dtype='float32')
-    W_values = numpy.asarray(numpy.random.rand(n_in,n_out),dtype='float32')
+    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
-    dot_value = numpy.asarray(numpy.dot(xx, W_values),dtype='float32')
+    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
    del W_values
-    p_y_given_x = T.nnet.softmax(dot_result+b)
+    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
-    classify = theano.function( inputs = [y,b,dot_result], outputs = [loss,y_pred,dW],
+    classify = theano.function(inputs=[y, b, dot_result],
-                                mode = mode_without_gpu)
+                               outputs=[loss, y_pred, dW],
-    classify_gpu = theano.function( inputs = [y,b,dot_result], outputs = [loss,y_pred,dW],
+                               mode=mode_without_gpu)
-                                    mode = mode_with_gpu)
+    classify_gpu = theano.function(inputs=[y, b, dot_result],
+                                   outputs=[loss, y_pred, dW],
+                                    mode=mode_with_gpu)
    #theano.printing.debugprint(classify)
    #theano.printing.debugprint(classify_gpu)
-    assert any([isinstance(node.op,T.nnet.CrossentropySoftmaxArgmax1HotWithBias) for node in classify.maker.env.toposort()])
+    assert any([isinstance(node.op,
-    assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias) for node in classify_gpu.maker.env.toposort()])
+                           T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
+                for node in classify.maker.env.toposort()])
+    assert any([isinstance(node.op,
+                           cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias)
+                for node in classify_gpu.maker.env.toposort()])
+    out = classify(yy, b_values, dot_value)
+    gout = classify_gpu(yy, b_values, dot_value)
-    out=classify(yy,b_values,dot_value)
+    assert len(out) == len(gout) == 3
-    gout=classify_gpu(yy,b_values,dot_value)
+    assert numpy.allclose(out[0], gout[0])
+    assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
+        gout - out).max()
+    assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val)
+                                             for id, val in enumerate(out[1] -
+                                                                      gout[1])
+                                             if val != 0]
-    assert len(out)==len(gout)==3
-    assert numpy.allclose(out[0],gout[0])
-    assert numpy.allclose(out[2],gout[2],atol=3e-6),numpy.absolute(gout-out).max()
-    assert numpy.allclose(out[1],gout[1]),[(id,out[1][id],gout[1][id],val) for id,val in enumerate(out[1]-gout[1]) if val!=0]
 def test_GpuCrossentropySoftmax1HotWithBiasDx():
    """
@@ -90,24 +109,29 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()
-    softmax_output_value = numpy.random.rand(batch_size, n_out).astype('float32')
+    softmax_output_value = numpy.random.rand(batch_size,
-    dnll_value = numpy.asarray(numpy.random.rand(batch_size),dtype='float32')
+                                             n_out).astype('float32')
+    dnll_value = numpy.asarray(numpy.random.rand(batch_size), dtype='float32')
    y_idx_value = numpy.random.randint(low=0, high=5, size=batch_size)
    softmax_output = T.fmatrix()
-    softmax_output /= softmax_output.sum(axis=1).reshape(softmax_output.shape[1],1)
+    softmax_output /= softmax_output.sum(axis=1).reshape(
+        softmax_output.shape[1], 1)
    op = theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(
        dnll_value,
        softmax_output,
        y_idx_value)
-    cpu_f = theano.function([softmax_output],op,mode = mode_without_gpu)
+    cpu_f = theano.function([softmax_output], op, mode=mode_without_gpu)
-    gpu_f = theano.function([softmax_output],op,mode = mode_with_gpu)
+    gpu_f = theano.function([softmax_output], op, mode=mode_with_gpu)
    #theano.printing.debugprint(cpu_f)
    #theano.printing.debugprint(gpu_f)
-    assert any([isinstance(node.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for node in cpu_f.maker.env.toposort()])
+    assert any([isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx)
-    assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx) for node in gpu_f.maker.env.toposort()])
+                for node in cpu_f.maker.env.toposort()])
+    assert any([isinstance(node.op,
+                           cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx)
+                for node in gpu_f.maker.env.toposort()])
    cpu_out = cpu_f(softmax_output_value)
    gpu_out = gpu_f(softmax_output_value)
@@ -116,10 +140,11 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
    atol = 1e-6
    if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
        abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
-        scaled_err = numpy.minimum(abs_err/atol, rel_err/rtol)
+        scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
        max_i = scaled_err.argmax()
-        print 'max err index:', max_i, max_i / batch_size, max_i % batch_size, max_i / n_out, max_i & n_out
+        print 'max err index:', max_i, max_i / batch_size,
+        print  max_i % batch_size, max_i / n_out, max_i & n_out
        print 'At that index:'
        print 'err:', scaled_err.flatten()[max_i]
        print 'absolute error:', abs_err.flatten()[max_i]
@@ -139,69 +164,84 @@ def test_softmax_with_bias():
    This is basic test for GpuSoftmaxWithBias
    We check that we loop when their is too much block
-    TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
+    TODO: check that we loop when there are too many threads. (THIS IS
+    NOT IMPLEMENTED)
    """
    x = T.fmatrix('x')
-    z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:]))
+    # We can't use zeros_like(x[0, :]) as that doesn't allow testing with
+    # an input whose first dimension is 0.
-    f = theano.function([x],z, mode=mode_without_gpu)
+    z = T.nnet.softmax_with_bias(x, T.alloc(numpy.asarray(0, dtype='float32'),
-    f_gpu = theano.function([x],z, mode=mode_with_gpu)
+                                            x.shape[1]))
-    assert f.maker.env.toposort()[-1].op==T.nnet.softmax_with_bias
-    assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmaxWithBias)
+    f = theano.function([x], z, mode=mode_without_gpu)
+    f_gpu = theano.function([x], z, mode=mode_with_gpu)
-    def cmp(n,m, catch=False):
+    assert f.maker.env.toposort()[-1].op == T.nnet.softmax_with_bias
-        """Some old card won't accet the configuration arguments of this implementation."""
+    assert isinstance(f_gpu.maker.env.toposort()[-2].op,
+                      cuda.nnet.GpuSoftmaxWithBias)
+    def cmp(n, m, catch=False):
+        """Some old cards won't accept the configuration arguments of
+        this implementation."""
        try:
            #print "test_softmax",n,m
-            data = numpy.arange(n*m, dtype='float32').reshape(n,m)
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-            out=f(data)
+            out = f(data)
-            gout=f_gpu(data)
+            gout = f_gpu(data)
-            assert numpy.allclose(out,gout),numpy.absolute(out-gout)
+            assert numpy.allclose(out, gout), numpy.absolute(out - gout)
        except RuntimeError, e:
            if not catch:
                raise
-            assert e.args[0]=='Cuda error: kSoftmax_node_0: invalid configuration argument.\n'
+            assert (e.args[0] ==
+              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
    cmp(2, 5)
    #we need to test n>32*1024 to check that we make the block loop.
-    cmp(2<<15, 5)
+    cmp(2 << 15, 5)
    cmp(4074, 400)
+    cmp(0, 10)
    cmp(4, 1000, True)
    cmp(4, 1024, True)
    cmp(4, 2000, True)
    cmp(4, 2024, True)
    cmp(4, 4074, True)
 def test_softmax():
    """
    This is basic test for GpuSoftmax
    We check that we loop when their is too much block
-    TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
+    TODO: check that we loop when there are too many threads. (THIS IS
+    NOT IMPLEMENTED)
    """
    x = T.fmatrix('x')
    z = T.nnet.softmax(x)
-    f = theano.function([x],z, mode=mode_without_gpu)
+    f = theano.function([x], z, mode=mode_without_gpu)
-    f_gpu = theano.function([x],z, mode=mode_with_gpu)
+    f_gpu = theano.function([x], z, mode=mode_with_gpu)
-    assert f.maker.env.toposort()[-1].op==T.nnet.softmax
+    assert f.maker.env.toposort()[-1].op == T.nnet.softmax
-    assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmax)
+    assert isinstance(f_gpu.maker.env.toposort()[-2].op,
+                      cuda.nnet.GpuSoftmax)
-    def cmp(n,m, catch=False):
-        """Some old card won't accet the configuration arguments of this implementation."""
+    def cmp(n, m, catch=False):
+        """Some old cards won't accept the configuration arguments of
+        this implementation."""
        try:
            #print "test_softmax",n,m
-            data = numpy.arange(n*m, dtype='float32').reshape(n,m)
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-            out=f(data)
+            out = f(data)
-            gout=f_gpu(data)
+            gout = f_gpu(data)
-            assert numpy.allclose(out,gout),numpy.absolute(out-gout)
+            assert numpy.allclose(out, gout), numpy.absolute(out - gout)
        except RuntimeError, e:
            if not catch:
                raise
-            assert e.args[0]=='Cuda error: kSoftmax_node_0: invalid configuration argument.\n'
+            assert (e.args[0] ==
+              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
-    cmp(2<<15, 5)
+    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(4, 1000, True)
    cmp(4, 1024, True)