提交 56b301a3 authored 作者: Frederic's avatar Frederic

Make GpuSoftmaxWithBias not crash on GTX285 with some shapes.

上级 d0040637
...@@ -470,7 +470,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -470,7 +470,7 @@ class GpuSoftmaxWithBias (GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (6,) + inline_softmax.code_version return (7,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, b = inp x, b = inp
...@@ -510,7 +510,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -510,7 +510,7 @@ class GpuSoftmaxWithBias (GpuOp):
{ {
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024); int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024); int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float); int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0) if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{ {
......
...@@ -183,7 +183,9 @@ def test_softmax_with_bias(): ...@@ -183,7 +183,9 @@ def test_softmax_with_bias():
def cmp(n, m, catch=False): def cmp(n, m, catch=False):
        """Some old card won't accept the configuration arguments of         """Some old card won't accept the configuration arguments of
this implementation.""" this implementation. For those cases set catch=True to skip
those errors.
"""
try: try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
...@@ -193,18 +195,22 @@ def test_softmax_with_bias(): ...@@ -193,18 +195,22 @@ def test_softmax_with_bias():
except RuntimeError, e: except RuntimeError, e:
if not catch: if not catch:
raise raise
assert (e.args[0] == # Different CUDA driver have different error message
'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n' assert (e.args[0].startswith(
), e.args[0] 'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmaxWithBias_node_0: invalid argument.\n'))
cmp(2, 5) cmp(2, 5)
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2 << 15, 5) cmp(2 << 15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(0, 10) cmp(0, 10)
cmp(4, 1000, True) cmp(784, 784)
cmp(4, 1024, True) cmp(4, 1000)
cmp(4, 2000, True) cmp(4, 1024)
cmp(4, 2024, True) cmp(4, 2000)
cmp(4, 2024)
    #GTX285 doesn't have enough shared mem for this case.
cmp(4, 4074, True) cmp(4, 4074, True)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论