Merge pull request #1303 from nouiz/gpusoftmax

Fix GpuSoftmax[WithBias] Ops to work for large rows

Merge pull request #1303 from nouiz/gpusoftmax
ec3a90d4 · James Bergstra · 97fff534 · 2335f829 · ec3a90d4 · ec3a90d4
--- a/doc/dev_start_guide.txt
+++ b/doc/dev_start_guide.txt
@@ -505,5 +505,6 @@ Other tools that can help you
 * `line_profiler <http://pypi.python.org/pypi/line_profiler/>`_: Line-by-line profiler.
 * `memory_profiler <http://fseoane.net/blog/2012/line-by-line-report-of-memory-usage/>`_: memory profiler
 * `runsnake <http://www.vrplumber.com/programming/runsnakerun/>`_: Gui for cProfile(time profiler) and Meliae(memory profiler)
+ * `Guppy <https://pypi.python.org/pypi/guppy/>`_: Supports object and heap memory sizing, profiling and debugging.
 * `hub <https://github.com/defunkt/hub>`_: A tool that adds github commands to the git command line.
 * `git pull-requests <http://www.splitbrain.org/blog/2011-06/19-automate_github_pull_requests>`_: Another tool for git/github command line.
--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -198,6 +198,8 @@ if __name__ == "__main__":
        cuda version      5.0    4.2    4.1    4.0    3.2    3.0   # note
        gpu
+        K20m/ECC          0.07s
+        K20/NOECC         0.07s
        M2070             0.25s         0.27s         0.32s
        M2050(Amazon)     0.25s
        C2075                    0.25s
@@ -215,7 +217,7 @@ if __name__ == "__main__":
        GTX 285                  0.452s        0.452s        0.40s # cuda 3.0 seems faster? driver version?
        GTX 550 Ti                             0.57s
        GT 520                   2.68s                3.06s
-        520M                                          3.19s        # with bumblebee on Ubuntu 12.04
+        520M              2.44s                       3.19s        # with bumblebee on Ubuntu 12.04
        GT 220                                        3.80s
        GT 210                                 6.35s
        8500 GT                                              10.68s

--- a/theano/sandbox/cuda/kernel_codegen.py
+++ b/theano/sandbox/cuda/kernel_codegen.py
--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -172,8 +172,8 @@ def test_softmax_with_bias():
    x = T.fmatrix('x')
    # We can't use zeros_like(x[0,::]) as this don't allow to test with
    # 0 shape.
-    z = T.nnet.softmax_with_bias(x, T.alloc(numpy.asarray(0, dtype='float32'),
+    z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
-                                            x.shape[1]))
+                                             dtype='float32')[::2])
    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)
@@ -181,24 +181,12 @@ def test_softmax_with_bias():
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
                      cuda.nnet.GpuSoftmaxWithBias)
-    def cmp(n, m, catch=False):
+    def cmp(n, m):
-        """Some old card won't accet the configuration arguments of
-        this implementation. For those cases set catch=True to skip
-        those errors.
-        """
-        try:
        #print "test_softmax",n,m
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-        except RuntimeError, e:
-            if not catch:
-                raise
-            # Different CUDA driver have different error message
-            assert (e.args[0].startswith(
-              'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n') or
-            e.args[0].startswith('Cuda error: kSoftmaxWithBias_node_0: invalid argument.\n'))
    cmp(2, 5)
    #we need to test n>32*1024 to check that we make the block loop.
@@ -211,7 +199,11 @@ def test_softmax_with_bias():
    cmp(4, 2000)
    cmp(4, 2024)
    #GTX285 don't have enough shared mem for this case.
-    cmp(4, 4074, True)
+    cmp(4, 4074)
+    # The GTX580, 680 and Kepler don't have enough shared memory.
+    cmp(2, 10000)
+    cmp(128, 16 * 1024)
+    cmp(128, 64 * 1024)
 def test_softmax():
@@ -219,9 +211,7 @@ def test_softmax():
    This is basic test for GpuSoftmax
    We check that we loop when their is too much block
+    We use slower code when there isn't enough shared memory
-    TODO: check that we loop when their is too much thread.(THIS IS
-    NOT IMPLEMENTED)
    """
    x = T.fmatrix('x')
@@ -232,25 +222,12 @@ def test_softmax():
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
                      cuda.nnet.GpuSoftmax)
-    def cmp(n, m, catch=False):
+    def cmp(n, m):
-        """Some old card won't accept the configuration arguments of
-        this implementation. For those cases set catch=True to skip
-        those errors.
-        """
-        try:
        #print "test_softmax",n,m
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-        except RuntimeError, e:
-            if not catch:
-                raise
-            # Different CUDA driver have different error message
-            assert (e.args[0].startswith(
-              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
-            e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
@@ -262,5 +239,9 @@ def test_softmax():
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
-    #GTX285 don't have enough shared mem for this case.
+    # The GTX285 doesn't have enough shared memory.
-    cmp(4, 4074, True)
+    cmp(4, 4074)
+    # The GTX580, 680 and Kepler don't have enough shared memory.
+    cmp(2, 10000)
+    cmp(128, 16 * 1024)
+    cmp(128, 64 * 1024)