Merge pull request #2116 from daemonmaker/cudnn

Implemented cuDNN softmax but it is not currently passing tests.

Merge pull request #2116 from daemonmaker/cudnn
52ea2d0c · Frédéric Bastien · a7b9a7e8 · a729ae42 · 52ea2d0c · 52ea2d0c
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -4,16 +4,18 @@ import os
 import theano
 from theano import Apply, tensor
 from theano.gof.type import CDataType
+from theano.compat import PY3
 from theano.compat.six import StringIO
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           gpu_contiguous)
 from theano.sandbox.cuda.blas import GpuConv
-from theano.compat import PY3
+from theano.sandbox.cuda.nnet import GpuSoftmax
 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
 class DnnBase(GpuOp):
    """
    Creates a handle for cudnn and pulls in the cudnn libraries and headers.
@@ -360,6 +362,7 @@ class GpuDnnConvGradI(GpuDnnConvBase):
 from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
                                     gpu_optimizer)
 def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv'):
    img = gpu_contiguous(img)
@@ -368,6 +371,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
                          conv_mode=conv_mode)(img.shape, kerns.shape)
    return GpuDnnConv()(img, kerns, desc)
 @local_optimizer([GpuConv])
 def local_conv_dnn(node):
    if isinstance(node.op, GpuConv):
@@ -380,3 +384,166 @@ def local_conv_dnn(node):
                         border_mode=border_mode, subsample=subsample)]
 gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
+class GpuDnnSoftmax(DnnBase):
+    """
+    Op for the cuDNN Softmax.
+    Parameters''
+    -tensor_format: Whether the data format is 'bc01' or 'b01c'
+    -algo: 'fast' or 'accurate' indicating whether computations should be
+    optimized for speed or accuracy respectively.
+    -mode: 'instance' or 'channel' indicating whether the softmax should be
+    computed per image across 'c01' or per spationali location '01' per image
+    across 'c'.
+    """
+    __props__ = ('tensor_format', 'mode', 'algo')
+    def __init__(self, tensor_format, algo, mode):
+        assert(tensor_format in ('bc01', 'b01c'))
+        self.tensor_format = tensor_format
+        assert(algo in ('fast', 'accurate'))
+        self.algo = algo
+        assert(mode in ('instance', 'channel'))
+        self.mode = mode
+    def make_node(self, x):
+        x = as_cuda_ndarray_variable(x)
+        assert x.ndim == 4
+        return Apply(self, [x], [x.type()])
+    def c_support_code_struct(self, node, struct_id):
+        return """
+cudnnTensor4dDescriptor_t softmax_input_%(id)d;
+cudnnTensor4dDescriptor_t softmax_output_%(id)d;
+""" % dict(id=struct_id)
+    def c_init_code_struct(self, node, struct_id, sub):
+        return """
+softmax_input_%(id)d = NULL;
+softmax_output_%(id)d = NULL;
+cudnnStatus_t err%(id)d;
+if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
+               "(inp): %%s", cudnnGetErrorString(err%(id)d));
+  %(fail)s
+}
+if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
+               "(out): %%s", cudnnGetErrorString(err%(id)d));
+  %(fail)s
+}
+""" % dict(id=struct_id, fail=sub['fail'])
+    def c_cleanup_code_struct(self, node, struct_id):
+        return """
+if(softmax_input_%(id)d != NULL)
+  cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
+if(softmax_output_%(id)d != NULL)
+  cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
+""" % dict(id=struct_id)
+    def c_code(self, node, name, inputs, outputs, sub):
+        ins, = inputs
+        outs, = outputs
+        if self.tensor_format == 'b01c':
+            tensor_format = 1
+        else:
+            tensor_format = 0
+        if self.mode == 'instance':
+            mode = 1
+        else:
+            mode = 0
+        if self.algo == 'fast':
+            algo = 1
+        else:
+            algo = 0
+        return """
+cudnnStatus_t err%(name)s;
+cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
+if (%(tensor_format)d == 1)
+  format%(id)d = CUDNN_TENSOR_NHWC;
+cudnnSoftmaxAlgorithm_t algo%(id)d = CUDNN_SOFTMAX_ACCURATE;
+if (%(algo)d == 1)
+  algo%(id)d = CUDNN_SOFTMAX_FAST;
+cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;
+if (%(mode)d == 1)
+  mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
+if (!CudaNdarray_is_c_contiguous(%(ins)s)) {
+  PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
+  %(fail)s
+}
+err%(name)s = cudnnSetTensor4dDescriptor(
+  softmax_input_%(id)d,
+  format%(id)d,
+  CUDNN_DATA_FLOAT,
+  CudaNdarray_HOST_DIMS(%(ins)s)[0],
+  CudaNdarray_HOST_DIMS(%(ins)s)[1],
+  CudaNdarray_HOST_DIMS(%(ins)s)[2],
+  CudaNdarray_HOST_DIMS(%(ins)s)[3]
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
+               cudnnGetErrorString(err%(name)s));
+  %(fail)s
+}
+if (CudaNdarray_prep_output(&%(outs)s, 4, CudaNdarray_HOST_DIMS(%(ins)s)) != 0)
+{
+  %(fail)s
+}
+err%(name)s = cudnnSetTensor4dDescriptor(
+  softmax_output_%(id)d,
+  format%(id)d,
+  CUDNN_DATA_FLOAT,
+  CudaNdarray_HOST_DIMS(%(outs)s)[0],
+  CudaNdarray_HOST_DIMS(%(outs)s)[1],
+  CudaNdarray_HOST_DIMS(%(outs)s)[2],
+  CudaNdarray_HOST_DIMS(%(outs)s)[3]
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
+               cudnnGetErrorString(err%(name)s));
+  %(fail)s
+}
+err%(name)s = cudnnSoftmaxForward(
+  _handle,
+  algo%(id)d,
+  mode%(id)d,
+  softmax_input_%(id)d,
+  CudaNdarray_DEV_DATA(%(ins)s),
+  softmax_output_%(id)d,
+  CudaNdarray_DEV_DATA(%(outs)s)
+);
+""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
+           algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)
+    def c_code_cache_version(self):
+        return (0, 3)
+@local_optimizer([GpuSoftmax])
+def local_softmax_dnn(node):
+    if isinstance(node.op, GpuSoftmax):
+        ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
+        out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(gpu_contiguous(ins))
+        out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
+        return [out]
+gpu_optimizer.register("softmax_cudnn", local_softmax_dnn, 'cudnn')
--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
 from nose.plugins.skip import SkipTest
 import numpy
+import unittest
 import theano
 from theano.gof.python25 import any
@@ -208,42 +209,140 @@ def test_softmax_with_bias():
    cmp(128, 64 * 1024)
-def test_softmax():
+class test_SoftMax(unittest.TestCase):
+    def _test_softmax(
+        self,
+        x,
+        x_gpu,
+        f_z,
+        f_gpu_z,
+        cmp,
+        gpu_mode,
+        check_types
+    ):
        """
-    This is basic test for GpuSoftmax
+        This is basic test for GpuSoftmax and GpuDnnSoftmax
        We check that we loop when their is too much block
        We use slower code when there isn't enough shared memory
        """
-    x = T.fmatrix('x')
+        f_z_out = f_z(x)
+        f_gpu_z_out = f_gpu_z(x_gpu)
-    z = T.nnet.softmax(x)
+        f = theano.function([x], f_z_out, mode=mode_without_gpu)
-    f = theano.function([x], z, mode=mode_without_gpu)
+        f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=gpu_mode)
-    f_gpu = theano.function([x], z, mode=mode_with_gpu)
+        check_types(f, f_gpu)
-    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax
-    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
-                      cuda.nnet.GpuSoftmax)
-    def cmp(n, m):
+        #we need to test n>32*1024 to check that we make the block loop.
+        cmp(1, 5, f, f_gpu)
+        cmp(2, 5, f, f_gpu)
+        cmp(10, 5, f, f_gpu)
+        cmp(100, 5, f, f_gpu)
+        cmp(1000, 5, f, f_gpu)
+        cmp(10000, 5, f, f_gpu)
+        cmp(4074, 400, f, f_gpu)
+        cmp(784, 784, f, f_gpu)
+        cmp(4, 1000, f, f_gpu)
+        cmp(4, 1024, f, f_gpu)
+        cmp(4, 2000, f, f_gpu)
+        cmp(4, 2024, f, f_gpu)
+        # The GTX285 don't have enough shared memory.
+        cmp(4, 4074, f, f_gpu)
+        # The GTX580, 680 and kepler don't have enough shared memory.
+        cmp(2, 10000, f, f_gpu)
+        cmp(128, 16 * 1024, f, f_gpu)
+        cmp(128, 64 * 1024, f, f_gpu)
+        # cudnn permits no more than 2^15 - 1 rows
+        cmp((2 << 15) - 1, 5, f, f_gpu)
+        cmp(5, 2 << 15, f, f_gpu)
+        return f, f_gpu
+    def _cmp(self, n, m, f, f_gpu):
        #print "test_softmax",n,m
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-    #we need to test n>32*1024 to check that we make the block loop.
+    def _check_types(self, graph, graph_gpu, topo_idx, f_type, f_gpu_type):
-    cmp(2, 5)
+        assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
-    cmp(2 << 15, 5)
+        assert isinstance(
-    cmp(4074, 400)
+            graph_gpu.maker.fgraph.toposort()[topo_idx].op,
-    cmp(0, 10)
+            f_gpu_type
-    cmp(784, 784)
+        )
-    cmp(4, 1000)
-    cmp(4, 1024)
+    def test_softmax(self):
-    cmp(4, 2000)
+        x = T.fmatrix('x')
-    cmp(4, 2024)
+        z = T.nnet.softmax
-    # The GTX285 don't have enough shared memory.
-    cmp(4, 4074)
+        def check_types(graph, graph_gpu):
-    # The GTX580, 680 and kepler don't have enough shared memory.
+            self._check_types(
-    cmp(2, 10000)
+                graph,
-    cmp(128, 16 * 1024)
+                graph_gpu,
-    cmp(128, 64 * 1024)
+                -2,
+                type(z),
+                cuda.nnet.GpuSoftmax
+            )
+        f, f_gpu = self._test_softmax(
+            x,
+            x,
+            z,
+            z,
+            self._cmp,
+            mode_with_gpu,
+            check_types
+        )
+        # cuDNN R1 cannot handle these test cases but the Theano softmax can so
+        # we test them only for the Theano softmax.
+        self._cmp(2 << 15, 5, f, f_gpu)
+        self._cmp(0, 10, f, f_gpu)
+    def test_cudnn_softmax(self):
+        def cmp(n, m, f, f_gpu):
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+            gdata = numpy.asarray(data)[:, :, None, None]
+            out = f(data)
+            gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
+            assert numpy.allclose(out, gout), numpy.absolute(out - gout)
+        x = T.matrix('x', 'float32')
+        x_gpu = T.tensor4('x_gpu', 'float32')
+        f_z = T.nnet.softmax
+        f_gpu = theano.sandbox.cuda.dnn.GpuDnnSoftmax(
+            'bc01',
+            'accurate',
+            'channel'
+        )
+        def check_types(graph, graph_gpu):
+            self._check_types(
+                graph,
+                graph_gpu,
+                -1,
+                type(f_z),
+                theano.sandbox.cuda.dnn.GpuDnnSoftmax
+            )
+        def check_types_opt(graph, graph_gpu):
+            assert isinstance(graph.maker.fgraph.toposort()[-1].op, type(f_z))
+            assert len([n for n in graph_gpu.maker.fgraph.toposort()
+                        if isinstance(
+                            n.op,
+                            theano.sandbox.cuda.dnn.GpuDnnSoftmax
+                        )]) == 1
+        self._test_softmax(
+            x,
+            x_gpu,
+            f_z,
+            f_gpu,
+            cmp,
+            mode_with_gpu,
+            check_types
+        )
+        mode = mode_with_gpu.including("cudnn")
+        self._test_softmax(x, x, f_z, f_z, self._cmp, mode, check_types_opt)