Merge pull request #1170 from nouiz/denormal

Denormal

Merge pull request #1170 from nouiz/denormal
b7e7be45 · lamblin · eab4cada · a18d8455 · b7e7be45 · b7e7be45
--- a/doc/tutorial/using_gpu.txt
+++ b/doc/tutorial/using_gpu.txt
@@ -284,6 +284,14 @@ Tips for Improving Performance on GPU
  Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*.
  This can tell you if not enough of your graph is on the GPU or if there
  is too much memory transfer.
+* Use nvcc options. nvcc supports the following options to speed up some
+  computations: `-ftz=true` to `flush denormal values to
+  zero <https://developer.nvidia.com/content/cuda-pro-tip-flush-denormals-confidence>`_,
+  and `--prec-div=false` and `--prec-sqrt=false` to speed up
+  division and square root operations by being less precise. You can
+  enable all of them with the `nvcc.flags=--use_fast_math` Theano
+  flag, or you can enable them individually, as in this example:
+  `nvcc.flags=-ftz=true --prec-div=false`.
 .. _gpu_async:

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -255,10 +255,15 @@ class NVCC_compiler(object):
        # compute capability? '--gpu-architecture=compute_13',
        # '--gpu-code=compute_13',
        #nvcc argument
-        preargs1 = [pa for pa in preargs
+        preargs1 = []
-                    if pa.startswith('-O') or
+        for pa in preargs:
-                    pa.startswith('--maxrregcount=') or
+            for pattern in ['-O', '-arch=',
-                    pa.startswith('-arch=')]
+                            '--fmad', '--ftz', '--maxrregcount',
+                            '--prec-div', '--prec-sqrt',  '--use_fast_math',
+                            '-fmad', '-ftz', '-maxrregcount',
+                            '-prec-div', '-prec-sqrt', '-use_fast_math']:
+                if pa.startswith(pattern):
+                    preargs1.append(pa)
        preargs2 = [pa for pa in preargs
                    if pa not in preargs1]  # other arguments

--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -183,7 +183,9 @@ def test_softmax_with_bias():
    def cmp(n, m, catch=False):
        """Some old card won't accet the configuration arguments of
-        this implementation."""
+        this implementation. For those cases set catch=True to skip
+        those errors.
+        """
        try:
            #print "test_softmax",n,m
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
@@ -193,18 +195,22 @@ def test_softmax_with_bias():
        except RuntimeError, e:
            if not catch:
                raise
-            assert (e.args[0] ==
+            # Different CUDA driver have different error message
-              'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n'
+            assert (e.args[0].startswith(
-            ), e.args[0]
+              'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n') or
+            e.args[0].startswith('Cuda error: kSoftmaxWithBias_node_0: invalid argument.\n'))
    cmp(2, 5)
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(0, 10)
-    cmp(4, 1000, True)
+    cmp(784, 784)
-    cmp(4, 1024, True)
+    cmp(4, 1000)
-    cmp(4, 2000, True)
+    cmp(4, 1024)
-    cmp(4, 2024, True)
+    cmp(4, 2000)
+    cmp(4, 2024)
+    #GTX285 don't have enough shared mem for this case.
    cmp(4, 4074, True)
@@ -227,8 +233,11 @@ def test_softmax():
                      cuda.nnet.GpuSoftmax)
    def cmp(n, m, catch=False):
-        """Some old card won't accet the configuration arguments of
+        """Some old card won't accept the configuration arguments of
-        this implementation."""
+        this implementation. For those cases set catch=True to skip
+        those errors.
+        """
        try:
            #print "test_softmax",n,m
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
@@ -238,15 +247,20 @@ def test_softmax():
        except RuntimeError, e:
            if not catch:
                raise
-            assert (e.args[0] ==
+            # Different CUDA driver have different error message
-              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
+            assert (e.args[0].startswith(
+              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
+            e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
    cmp(2 << 15, 5)
    cmp(4074, 400)
-    cmp(4, 1000, True)
+    cmp(0, 10)
-    cmp(4, 1024, True)
+    cmp(784, 784)
-    cmp(4, 2000, True)
+    cmp(4, 1000)
-    cmp(4, 2024, True)
+    cmp(4, 1024)
+    cmp(4, 2000)
+    cmp(4, 2024)
+    #GTX285 don't have enough shared mem for this case.
    cmp(4, 4074, True)