Merge pull request #1170 from nouiz/denormal

Denormal

Merge pull request #1170 from nouiz/denormal
b7e7be45 · lamblin · eab4cada · a18d8455 · b7e7be45 · b7e7be45
--- a/doc/tutorial/using_gpu.txt
+++ b/doc/tutorial/using_gpu.txt
@@ -284,6 +284,14 @@ Tips for Improving Performance on GPU
  Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*.
  This can tell you if not enough of your graph is on the GPU or if there
  is too much memory transfer.
+* Use nvcc options. nvcc support those options to speed up some
+  computations: `-ftz=true` to `flush denormals values to
+  zeros. <https://developer.nvidia.com/content/cuda-pro-tip-flush-denormals-confidence>`_,
+  `--prec-div=false` and `--prec-sqrt=false` option to speed up
+  division and square root operation by being less precise. You can
+  enable all of them with with the `nvcc.flags=--use_fast_math` Theano
+  flags or you can enable them individually as in this example
+  `nvcc.flags=-ftz=true --prec-div=false`.
 .. _gpu_async:

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -5,23 +5,30 @@ import StringIO
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import GpuOp
-from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax
+from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, inline_reduce_max,
+                                                inline_reduce_sum,
+                                                inline_softmax)
 class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """
-    nin=3
+    nin = 3
-    nout=3
+    nout = 3
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x, b, y_idx):
-        nll = y_idx.type() #N.B. won't work when we don't cast y_idx to float anymore
+        #N.B. won't work when we don't cast y_idx to float anymore
+        nll = y_idx.type()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])
@@ -85,7 +92,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
    def c_code(self, node, nodename, inp, out, sub):
        x, b, y_idx = inp
        nll, sm, am = out
-        classname=self.__class__.__name__
+        classname = self.__class__.__name__
        fail = sub['fail']
        sio = StringIO.StringIO()
        print >> sio, """
@@ -106,12 +113,14 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
-            PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments");
+            PyErr_SetString(PyExc_ValueError,
+                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
        {
-            PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments");
+            PyErr_SetString(PyExc_ValueError,
+                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if ((NULL == %(nll)s) //initial condition
@@ -132,7 +141,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
            %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(%(x)s));
            if(!%(sm)s)
            {
-                PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output");
+                PyErr_SetString(PyExc_MemoryError,
+                                "failed to alloc sm output");
                // no need to decref cnda_nll, the cleanup code should pick it up.
                %(fail)s;
            }
@@ -144,7 +154,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
            %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(am)s)
            {
-                PyErr_SetString(PyExc_MemoryError, "failed to alloc am output");
+                PyErr_SetString(PyExc_MemoryError,
+                                "failed to alloc am output");
                // no need to decref nll amd sm, the cleanup code should pick it up.
                %(fail)s;
            }
@@ -167,7 +178,9 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
-                PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err));
+                PyErr_Format(PyExc_RuntimeError,
+                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n",
+                             cudaGetErrorString(err));
                // no need to decref output vars the cleanup code should pick them up.
                %(fail)s;
            }
@@ -181,26 +194,33 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
 class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
    """
    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
    """
-    nin=3
+    nin = 3
-    nout=1
+    nout = 1
    """Gradient wrt x of the CrossentropySoftmax1Hot Op"""
    def __init__(self, **kwargs):
        Op.__init__(self,**kwargs)
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, dy, sm, y_idx):
-        return Apply(self, [dy, sm, y_idx],[sm.type()])
+        return Apply(self, [dy, sm, y_idx], [sm.type()])
    def c_code_cache_version(self):
        #return ()
        return (5,)
    def c_code(self, node, nodename, inp, out, sub):
        dnll, sm, y_idx = inp
        dx, = out
@@ -221,7 +241,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
        }
        if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
-            PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]");
+            PyErr_SetString(PyExc_ValueError,
+                            "dnll.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if ((NULL == %(dx)s)
@@ -265,7 +286,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
-                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s", cudaGetErrorString(err));
+                PyErr_Format(PyExc_RuntimeError,
+                             "Cuda error: %%s: %%s.\\n",
+                             "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
+                             cudaGetErrorString(err));
                %(fail)s;
            }
        }
@@ -305,23 +329,30 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
 gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
 class GpuSoftmax (GpuOp):
    """
    Implement Softmax on the gpu.
    """
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x):
-        return Apply(self, [x],[x.type()])
+        return Apply(self, [x], [x.type()])
    def infer_shape(self, node, shape):
        return shape
    def c_code_cache_version(self):
        #return ()
-        return (5,) + inline_softmax.code_version
+        return (7,) + inline_softmax.code_version
    def c_code(self, node, nodename, inp, out, sub):
        x, = inp
        z, = out
@@ -332,14 +363,15 @@ class GpuSoftmax (GpuOp):
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
-        if ((NULL == %(z)s)
+        if ((NULL == %(z)s) ||
-            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
+            (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
-            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
+            (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(z)s)
-                || CudaNdarray_alloc_contiguous(%(z)s, 2, CudaNdarray_HOST_DIMS(%(x)s)))
+                || CudaNdarray_alloc_contiguous(%(z)s, 2,
+                                                CudaNdarray_HOST_DIMS(%(x)s)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
@@ -347,42 +379,48 @@ class GpuSoftmax (GpuOp):
            }
        }
        {
-            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
+            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], 32 * 1024);
 //TODO, detect the maximum number of thread per block.
-            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
+            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
-            kSoftmax_%(nodename)s
+            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
-                <<<
+            {
-                // todo: cap these at the card limits, implement loops in kernel
+                kSoftmax_%(nodename)s
-                    n_blocks,
+                    <<<
-                    n_threads,
+                        n_blocks,
-                    n_shared_bytes
+                        n_threads,
-                >>>(
+                        n_shared_bytes
-                        CudaNdarray_HOST_DIMS(%(x)s)[0],
+                    >>>(
-                        CudaNdarray_HOST_DIMS(%(x)s)[1],
+                            CudaNdarray_HOST_DIMS(%(x)s)[0],
+                            CudaNdarray_HOST_DIMS(%(x)s)[1],
-                        CudaNdarray_DEV_DATA(%(x)s),
+                            CudaNdarray_DEV_DATA(%(x)s),
-                        CudaNdarray_HOST_STRIDES(%(x)s)[0],
+                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
-                        CudaNdarray_HOST_STRIDES(%(x)s)[1],
+                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
-                        CudaNdarray_DEV_DATA(%(z)s),
+                            CudaNdarray_DEV_DATA(%(z)s),
-                        CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
-                        CudaNdarray_HOST_STRIDES(%(z)s)[1]
+                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
-                );
+                    );
-            CNDA_THREAD_SYNC;
+                CNDA_THREAD_SYNC;
-            cudaError_t err = cudaGetLastError();
+                cudaError_t err = cudaGetLastError();
-            if( cudaSuccess != err)
+                if( cudaSuccess != err)
-            {
+                {
-                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
+                    PyErr_Format(PyExc_RuntimeError,
-                %(fail)s;
+                                 "Cuda error: %%s: %%s.\\n Used %%d blocks,"
+                                 " %%d threads %%d bytes of shared memory",
+                                 "kSoftmax_%(nodename)s", cudaGetErrorString(err),
+                                 n_blocks, n_threads, n_shared_bytes);
+                    %(fail)s;
+                }
            }
        }
        assert(%(z)s);
        """ % locals()
    def c_support_code_apply(self, node, nodename):
-        return nvcc_kernel("kSoftmax_%s"%nodename,
+        return nvcc_kernel("kSoftmax_%s" % nodename,
                params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
@@ -395,9 +433,11 @@ class GpuSoftmax (GpuOp):
                        "buf2[tx] = buf[tx]",
                      "}",
                      "__syncthreads()",
-                      inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
+                      inline_softmax('N', 'buf', 'buf2',
+                                     'threadIdx.x', 'blockDim.x'),
                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",# This set all value correctly
+                        # This set all value correctly
+                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                      "}",
                      "__syncthreads()",
                    "}",
@@ -405,25 +445,32 @@ class GpuSoftmax (GpuOp):
 gpu_softmax = GpuSoftmax()
 class GpuSoftmaxWithBias (GpuOp):
    """
    Implement SoftmaxWithBias on the gpu.
    """
    nin = 2
    nout = 1
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x, b):
-        return Apply(self, [x, b],[x.type()])
+        return Apply(self, [x, b], [x.type()])
    def infer_shape(self, node, shape):
        return  [shape[0]]
    def c_code_cache_version(self):
        #return ()
-        return (6,) + inline_softmax.code_version
+        return (7,) + inline_softmax.code_version
    def c_code(self, node, nodename, inp, out, sub):
        x, b = inp
@@ -463,7 +510,7 @@ class GpuSoftmaxWithBias (GpuOp):
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
 //TODO, detect the maximum number of thread per block.
-            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
+            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
            {

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -255,10 +255,15 @@ class NVCC_compiler(object):
        # compute capability? '--gpu-architecture=compute_13',
        # '--gpu-code=compute_13',
        #nvcc argument
-        preargs1 = [pa for pa in preargs
+        preargs1 = []
-                    if pa.startswith('-O') or
+        for pa in preargs:
-                    pa.startswith('--maxrregcount=') or
+            for pattern in ['-O', '-arch=',
-                    pa.startswith('-arch=')]
+                            '--fmad', '--ftz', '--maxrregcount',
+                            '--prec-div', '--prec-sqrt',  '--use_fast_math',
+                            '-fmad', '-ftz', '-maxrregcount',
+                            '-prec-div', '-prec-sqrt', '-use_fast_math']:
+                if pa.startswith(pattern):
+                    preargs1.append(pa)
        preargs2 = [pa for pa in preargs
                    if pa not in preargs1]  # other arguments

--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -183,7 +183,9 @@ def test_softmax_with_bias():
    def cmp(n, m, catch=False):
        """Some old card won't accet the configuration arguments of
-        this implementation."""
+        this implementation. For those cases set catch=True to skip
+        those errors.
+        """
        try:
            #print "test_softmax",n,m
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
@@ -193,18 +195,22 @@ def test_softmax_with_bias():
        except RuntimeError, e:
            if not catch:
                raise
-            assert (e.args[0] ==
+            # Different CUDA driver have different error message
-              'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n'
+            assert (e.args[0].startswith(
-            ), e.args[0]
+              'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n') or
+            e.args[0].startswith('Cuda error: kSoftmaxWithBias_node_0: invalid argument.\n'))
    cmp(2, 5)
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(0, 10)
-    cmp(4, 1000, True)
+    cmp(784, 784)
-    cmp(4, 1024, True)
+    cmp(4, 1000)
-    cmp(4, 2000, True)
+    cmp(4, 1024)
-    cmp(4, 2024, True)
+    cmp(4, 2000)
+    cmp(4, 2024)
+    #GTX285 don't have enough shared mem for this case.
    cmp(4, 4074, True)
@@ -227,8 +233,11 @@ def test_softmax():
                      cuda.nnet.GpuSoftmax)
    def cmp(n, m, catch=False):
-        """Some old card won't accet the configuration arguments of
+        """Some old card won't accept the configuration arguments of
-        this implementation."""
+        this implementation. For those cases set catch=True to skip
+        those errors.
+        """
        try:
            #print "test_softmax",n,m
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
@@ -238,15 +247,20 @@ def test_softmax():
        except RuntimeError, e:
            if not catch:
                raise
-            assert (e.args[0] ==
+            # Different CUDA driver have different error message
-              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
+            assert (e.args[0].startswith(
+              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
+            e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
    cmp(2 << 15, 5)
    cmp(4074, 400)
-    cmp(4, 1000, True)
+    cmp(0, 10)
-    cmp(4, 1024, True)
+    cmp(784, 784)
-    cmp(4, 2000, True)
+    cmp(4, 1000)
-    cmp(4, 2024, True)
+    cmp(4, 1024)
+    cmp(4, 2000)
+    cmp(4, 2024)
+    #GTX285 don't have enough shared mem for this case.
    cmp(4, 4074, True)