提交 f3b5fe3b authored 作者: Dustin Webb's avatar Dustin Webb

Implemented cuDNN softmax but it is not currently passing tests.

上级 239b6d80
...@@ -4,16 +4,18 @@ import os ...@@ -4,16 +4,18 @@ import os
import theano import theano
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof.type import CDataType from theano.gof.type import CDataType
from theano.compat import PY3
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3 from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
class DnnBase(GpuOp): class DnnBase(GpuOp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
...@@ -360,6 +362,7 @@ class GpuDnnConvGradI(GpuDnnConvBase): ...@@ -360,6 +362,7 @@ class GpuDnnConvGradI(GpuDnnConvBase):
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous, from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer) gpu_optimizer)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv'): conv_mode='conv'):
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -368,6 +371,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -368,6 +371,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(img.shape, kerns.shape) conv_mode=conv_mode)(img.shape, kerns.shape)
return GpuDnnConv()(img, kerns, desc) return GpuDnnConv()(img, kerns, desc)
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn(node): def local_conv_dnn(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
...@@ -380,3 +384,182 @@ def local_conv_dnn(node): ...@@ -380,3 +384,182 @@ def local_conv_dnn(node):
border_mode=border_mode, subsample=subsample)] border_mode=border_mode, subsample=subsample)]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn') gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
class GpuDnnSoftmax(GpuOp):
    """
    Op for the cuDNN Softmax.

    Parameters
    ----------
    tensor_format
        Layout of the 4d input: 'bc01' (NCHW) or 'b01c' (NHWC).
    algo
        'fast' or 'accurate', indicating whether computations should be
        optimized for speed or accuracy respectively.
    mode
        'instance' to compute the softmax per image across C, H and W, or
        'channel' to compute it per spatial location across the channels
        (maps to CUDNN_SOFTMAX_MODE_INSTANCE / CUDNN_SOFTMAX_MODE_CHANNEL).
    """
    def __init__(self, tensor_format, algo, mode):
        assert tensor_format in ('bc01', 'b01c')
        self.tensor_format = tensor_format
        assert algo in ('fast', 'accurate')
        self.algo = algo
        assert mode in ('instance', 'channel')
        self.mode = mode

    def __eq__(self, other):
        # The parameters must participate in equality: ops configured
        # differently generate different C code and must not be merged
        # by the graph optimizer.
        return (type(self) == type(other) and
                self.tensor_format == other.tensor_format and
                self.algo == other.algo and
                self.mode == other.mode)

    def __hash__(self):
        # Keep consistent with __eq__.
        return hash((type(self), self.tensor_format, self.algo, self.mode))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        # The op only supports 4d CUDA ndarray inputs; the output has the
        # same type (and therefore the same broadcastable pattern).
        x = as_cuda_ndarray_variable(x)
        assert x.ndim == 4
        return Apply(self, [x], [x.type()])

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def c_libraries(self):
        return ['cudnn']

    def c_support_code_struct(self, node, struct_id):
        # One cuDNN handle and a pair of tensor descriptors per apply node.
        return """
cudnnHandle_t softmax_handle_%(id)d;
cudnnTensor4dDescriptor_t softmax_input_%(id)d;
cudnnTensor4dDescriptor_t softmax_output_%(id)d;
""" % dict(id=struct_id)

    def c_init_code_struct(self, node, struct_id, sub):
        return """
softmax_handle_%(id)d = NULL;
softmax_input_%(id)d = NULL;
softmax_output_%(id)d = NULL;

cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreate(&softmax_handle_%(id)d)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
               cudnnGetErrorString(err%(id)d));
  %(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
               "(inp): %%s", cudnnGetErrorString(err%(id)d));
  %(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
               "(out): %%s", cudnnGetErrorString(err%(id)d));
  %(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])

    def c_cleanup_code_struct(self, node, struct_id):
        return """
if(softmax_input_%(id)d != NULL)
  cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
if(softmax_output_%(id)d != NULL)
  cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
if(softmax_handle_%(id)d != NULL)
  cudnnDestroy(softmax_handle_%(id)d);
""" % dict(id=struct_id)

    def c_code(self, node, name, inputs, outputs, sub):
        ins, = inputs
        outs, = outputs

        # Encode the op parameters as integers substituted into the C code;
        # the C side maps them back onto the cuDNN enum values.
        if self.tensor_format == 'b01c':
            tensor_format = 1
        else:
            tensor_format = 0

        if self.mode == 'channel':
            mode = 1
        else:
            mode = 0

        if self.algo == 'fast':
            algo = 1
        else:
            algo = 0

        return """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
  format%(id)d = CUDNN_TENSOR_NHWC;

cudnnSoftmaxAlgorithm_t algo%(id)d = CUDNN_SOFTMAX_ACCURATE;
if (%(algo)d == 1)
  algo%(id)d = CUDNN_SOFTMAX_FAST;

cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
if (%(mode)d == 1)
  mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;

err%(name)s = cudnnSetTensor4dDescriptor(
  softmax_input_%(id)d,
  format%(id)d,
  CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(ins)s)[0],
  CudaNdarray_HOST_DIMS(%(ins)s)[1],
  CudaNdarray_HOST_DIMS(%(ins)s)[2],
  CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
}

// The output has the same shape and layout as the input.
err%(name)s = cudnnSetTensor4dDescriptor(
  softmax_output_%(id)d,
  format%(id)d,
  CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(ins)s)[0],
  CudaNdarray_HOST_DIMS(%(ins)s)[1],
  CudaNdarray_HOST_DIMS(%(ins)s)[2],
  CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
}

// CudaNdarray_prep_output sets a Python exception on failure.
if (CudaNdarray_prep_output(&(%(outs)s), 4,
                            CudaNdarray_HOST_DIMS(%(ins)s)) != 0) {
  %(fail)s
}

err%(name)s = cudnnSoftmaxForward(
  softmax_handle_%(id)d,
  algo%(id)d,
  mode%(id)d,
  softmax_input_%(id)d,
  CudaNdarray_DEV_DATA(%(ins)s),
  softmax_output_%(id)d,
  CudaNdarray_DEV_DATA(%(outs)s)
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "error during softmax operation: %%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
}
""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
           algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)

    def c_code_cache_version(self):
        # Bumped because the generated C code changed (error checking added).
        return (1,)
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
    """Replace a GpuSoftmax node by the cuDNN softmax op.

    The 2d softmax input is viewed as a (1, 1, rows, cols) tensor so it
    can be fed to the 4d cuDNN op, and the result is reshaped back to 2d.
    """
    if not isinstance(node.op, GpuSoftmax):
        return
    four_d = node.inputs[0].dimshuffle('x', 'x', 0, 1)
    softmaxed = GpuDnnSoftmax('bc01', 'accurate', 'instance')(four_d)
    two_d = as_cuda_ndarray_variable(softmaxed.dimshuffle(2, 3))
    return [two_d]

gpu_optimizer.register("softmax_cudnn", local_softmax_dnn, 'cudnn')
...@@ -208,42 +208,65 @@ def test_softmax_with_bias(): ...@@ -208,42 +208,65 @@ def test_softmax_with_bias():
cmp(128, 64 * 1024) cmp(128, 64 * 1024)
def test_softmax(): def _test_softmax(x, x_gpu, f_z, f_gpu_z, cpu_type, gpu_type, cmp, topo_idx):
""" """
This is basic test for GpuSoftmax This is basic test for GpuSoftmax and GpuDnnSoftmax
We check that we loop when their is too much block We check that we loop when their is too much block
We use slower code when there isn't enough shared memory We use slower code when there isn't enough shared memory
""" """
x = T.fmatrix('x') f_z_out = f_z(x)
f_gpu_z_out = f_gpu_z(x_gpu)
f = theano.function([x], f_z_out, mode=mode_without_gpu)
f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=mode_with_gpu)
assert isinstance(f.maker.fgraph.toposort()[-1].op, cpu_type)
assert isinstance(f_gpu.maker.fgraph.toposort()[topo_idx].op, gpu_type)
z = T.nnet.softmax(x) #we need to test n>32*1024 to check that we make the block loop.
f = theano.function([x], z, mode=mode_without_gpu) cmp(2, 5, f, f_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu) cmp(2 << 15, 5, f, f_gpu)
assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax cmp(4074, 400, f, f_gpu)
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, cmp(0, 10, f, f_gpu)
cuda.nnet.GpuSoftmax) cmp(784, 784, f, f_gpu)
cmp(4, 1000, f, f_gpu)
cmp(4, 1024, f, f_gpu)
cmp(4, 2000, f, f_gpu)
cmp(4, 2024, f, f_gpu)
# The GTX285 don't have enough shared memory.
cmp(4, 4074, f, f_gpu)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000, f, f_gpu)
cmp(128, 16 * 1024, f, f_gpu)
cmp(128, 64 * 1024, f, f_gpu)
def cmp(n, m):
def test_softmax():
def cmp(n, m, f, f_gpu):
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
#we need to test n>32*1024 to check that we make the block loop. x = T.fmatrix('x')
cmp(2, 5) z = T.nnet.softmax
cmp(2 << 15, 5) _test_softmax(x, x, z, z, type(z), cuda.nnet.GpuSoftmax, cmp, -2)
cmp(4074, 400)
cmp(0, 10)
cmp(784, 784) def test_cudnn_softmax():
cmp(4, 1000) def cmp(n, m, f, f_gpu):
cmp(4, 1024) #print "test_softmax",n,m
cmp(4, 2000) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
cmp(4, 2024) out = f(data)
# The GTX285 don't have enough shared memory. gout = f_gpu(data.reshape(1, 1, n, m)).reshape((n, m))
cmp(4, 4074) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000) x = T.matrix('x')
cmp(128, 16 * 1024) x_gpu = T.tensor4('x_gpu')
cmp(128, 64 * 1024) f_z = T.nnet.softmax
f_gpu = theano.sandbox.cuda.dnn.GpuDnnSoftmax(
'bc01',
'accurate',
'instance'
)
_test_softmax(x, x_gpu, f_z, f_gpu, type(f_z), type(f_gpu), cmp, -1)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论