Commit ea9e3e54 authored by Frédéric Bastien

Merge pull request #2250 from daemonmaker/cudnn

Implemented grad for cudnn softmax.
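With this change, T.grad can differentiate through the cuDNN softmax: GpuDnnSoftmax.grad() returns a new GpuDnnSoftmaxGrad op, and a new local optimizer rewrites CPU SoftmaxGrad nodes to GpuDnnSoftmaxGrad when cuDNN is enabled. A minimal sketch of the resulting behaviour, mirroring the new test below (illustrative only; mode_with_gpu is assumed to be the GPU test mode used in the test module):

import theano
import theano.tensor as T

y = T.fvector('y')
# Gradient of a softmax-based cost. With cuDNN enabled, the SoftmaxGrad node
# produced here gets replaced by GpuDnnSoftmaxGrad during graph optimization.
f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                    mode=mode_with_gpu)  # mode_with_gpu: assumed test helper
ops = [node.op for node in f.maker.fgraph.toposort()]
# Expect one GpuDnnSoftmaxGrad and no theano.tensor.nnet.SoftmaxGrad among ops.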
@@ -5,11 +5,12 @@ from theano import Apply, gof, tensor
from theano.gof import Optimizer
from theano.gof.type import CDataType
from theano.compat import PY3
from theano.tensor.nnet import SoftmaxGrad
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import (GpuOp, cuda_available, active_device_number,
device_properties)
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
gpu_contiguous, HostFromGpu)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
@@ -849,7 +850,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
return GpuDnnPool()(img, desc)
class GpuDnnSoftmax(DnnBase):
class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
@@ -873,46 +874,54 @@ class GpuDnnSoftmax(DnnBase):
assert(mode in ('instance', 'channel'))
self.mode = mode
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
self.tensor_4d_descs = [softmax_input
for softmax_input in self.softmax_inputs]
self.tensor_4d_descs.append('softmax_output')
def c_support_code_struct(self, node, struct_id):
def _define_tensor4d_desc(self, name, id):
return """
cudnnTensor4dDescriptor_t softmax_input_%(id)d;
cudnnTensor4dDescriptor_t softmax_output_%(id)d;
""" % dict(id=struct_id)
cudnnTensor4dDescriptor_t %(name)s_%(id)d;
""" % dict(name=name, id=id)
def c_init_code_struct(self, node, struct_id, sub):
def _init_tensor4d_desc(self, name, id, fail):
return """
softmax_input_%(id)d = NULL;
softmax_output_%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
%(name)s_%(id)d = NULL;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&%(name)s_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(id)d));
": %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])
""" % dict(name=name, id=id, fail=fail)
def c_cleanup_code_struct(self, node, struct_id):
def _clean_tensor4d_desc(self, name, id):
return """
if(softmax_input_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
if(%(name)s_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(%(name)s_%(id)d);
""" % dict(name=name, id=id)
def c_support_code_struct(self, node, struct_id):
result = ''
for name in self.tensor_4d_descs:
result += self._define_tensor4d_desc(name, struct_id)
return result
if(softmax_output_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
def c_init_code_struct(self, node, struct_id, sub):
result = """
cudnnStatus_t err%(id)d;
""" % dict(id=struct_id)
for name in self.tensor_4d_descs:
result += self._init_tensor4d_desc(name, struct_id, sub['fail'])
return result
def c_cleanup_code_struct(self, node, struct_id):
result = ''
for name in self.tensor_4d_descs:
result += self._clean_tensor4d_desc(name, struct_id)
return result
def c_code(self, node, name, inputs, outputs, sub):
ins, = inputs
ins = inputs
outs, = outputs
if self.tensor_format == 'b01c':
@@ -930,7 +939,8 @@ if(softmax_output_%(id)d != NULL)
else:
algo = 0
return """
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
@@ -943,14 +953,19 @@ if (%(algo)d == 1)
cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(id=sub['struct_id'], name=name,
tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs):
result += """
if (!CudaNdarray_is_c_contiguous(%(ins)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptor(
softmax_input_%(id)d,
%(input_name)s_%(id)d,
format%(id)d,
CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(ins)s)[0],
@@ -959,11 +974,15 @@ err%(name)s = cudnnSetTensor4dDescriptor(
CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%%%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(id=sub['struct_id'], name=name, input_name=input_name,
ins=ins[input_idx], fail=sub['fail'])
# Build and prepare the output variable.
result += """
if (CudaNdarray_prep_output(&%(outs)s, 4, CudaNdarray_HOST_DIMS(%(ins)s)) != 0)
{
%(fail)s
@@ -979,11 +998,41 @@ err%(name)s = cudnnSetTensor4dDescriptor(
CudaNdarray_HOST_DIMS(%(outs)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%%%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
"""
# Add on a call to the method that does the actual work.
result += self.method()
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
id=sub['struct_id'], name=name)
for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input
subs['ins%d' % idx] = inputs[idx]
return result % subs
def c_code_cache_version(self):
return (0, 6)
def method(self):
raise NotImplementedError('GpuDnnSoftmaxBase::method')
class GpuDnnSoftmax(GpuDnnSoftmaxBase):
softmax_inputs = ['softmax_input']
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def method(self):
return """
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(id)d,
@@ -993,11 +1042,43 @@ err%(name)s = cudnnSoftmaxForward(
softmax_output_%(id)d,
CudaNdarray_DEV_DATA(%(outs)s)
);
""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)
"""
def c_code_cache_version(self):
return (0, 3)
def grad(self, inp, grads):
x, = inp
g_sm, = grads
sm = self.make_node(x).outputs[0]
return [GpuDnnSoftmaxGrad(
self.tensor_format,
self.algo,
self.mode
)(g_sm, sm)]
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
softmax_inputs = ['softmax_gout', 'softmax_input']
def make_node(self, dy, sm):
dy = as_cuda_ndarray_variable(dy)
sm = as_cuda_ndarray_variable(sm)
assert dy.ndim == 4
assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type.make_variable()])
def method(self):
return """
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(id)d,
mode%(id)d,
%(name1)s_%(id)d,
CudaNdarray_DEV_DATA(%(ins1)s),
%(name0)s_%(id)d,
CudaNdarray_DEV_DATA(%(ins0)s),
softmax_output_%(id)d,
CudaNdarray_DEV_DATA(%(outs)s)
);
"""
# We need this since other stuff from opt is not importable.
@@ -1072,3 +1153,27 @@ if cuda_available:
" to use it. We got this error: \n" +
dnn_available.msg)
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn')
@local_optimizer([SoftmaxGrad])
def local_softmax_dnn_grad(node):
if (
isinstance(node.op, SoftmaxGrad)
and (isinstance(node.inputs[0].owner.op, HostFromGpu)
or isinstance(node.inputs[1].owner.op, HostFromGpu))
):
ins = []
for n in node.inputs:
if isinstance(n.owner.op, HostFromGpu):
n = n.owner.inputs[0]
ins.append(n.dimshuffle(0, 1, 'x', 'x'))
out = GpuDnnSoftmaxGrad(
'bc01',
'accurate',
'channel'
)(
gpu_contiguous(ins[0]),
gpu_contiguous(ins[1])
)
return [out.dimshuffle(0, 1)]
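The rewrite above bridges a shape mismatch: SoftmaxGrad works on 2d (batch, features) tensors, while the cuDNN op expects 4d bc01 input. The inputs are therefore lifted with dimshuffle(0, 1, 'x', 'x'), the gradient is computed in 'channel' mode (softmax over axis 1), and the result is brought back to 2d with dimshuffle(0, 1). A NumPy sketch of the same round trip (illustrative only):

import numpy as np

dy = np.random.rand(8, 10).astype('float32')  # gradient w.r.t. the softmax output
sm = np.random.rand(8, 10).astype('float32')  # softmax output itself

# dimshuffle(0, 1, 'x', 'x'): add two broadcastable dims -> shape (8, 10, 1, 1)
dy4, sm4 = dy[:, :, None, None], sm[:, :, None, None]
# 'channel'-mode backward pass over axis 1 (same formula as the sketch above)
dx4 = sm4 * (dy4 - (dy4 * sm4).sum(axis=1, keepdims=True))
# dimshuffle(0, 1): drop the singleton dims to recover the (batch, features) shape
dx = dx4[:, :, 0, 0]
assert dx.shape == (8, 10)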
@@ -276,7 +276,7 @@ class test_SoftMax(unittest.TestCase):
x = T.fmatrix('x')
z = T.nnet.softmax
def check_types(graph, graph_gpu):
def check_types_without_cudnn(graph, graph_gpu):
self._check_types(
graph,
graph_gpu,
@@ -285,14 +285,15 @@ class test_SoftMax(unittest.TestCase):
cuda.nnet.GpuSoftmax
)
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp,
mode_with_gpu,
check_types
mode_wo_cudnn,
check_types_without_cudnn
)
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
@@ -300,6 +301,25 @@ class test_SoftMax(unittest.TestCase):
self._cmp(2 << 15, 5, f, f_gpu)
self._cmp(0, 10, f, f_gpu)
def check_types_with_cudnn(graph, graph_gpu):
self._check_types(
graph,
graph_gpu,
-3,
type(z),
theano.sandbox.cuda.dnn.GpuDnnSoftmax
)
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp,
mode_with_gpu,
check_types_with_cudnn
)
def test_cudnn_softmax(self):
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
@@ -320,6 +340,14 @@ class test_SoftMax(unittest.TestCase):
'channel'
)
# Verify the grad operation
dims = (2, 3, 4, 5)
gdata = numpy.arange(
numpy.product(dims),
dtype='float32'
).reshape(dims)
T.verify_grad(f_gpu, [gdata], rng=numpy.random)
def check_types(graph, graph_gpu):
self._check_types(
graph,
@@ -337,6 +365,8 @@ class test_SoftMax(unittest.TestCase):
theano.sandbox.cuda.dnn.GpuDnnSoftmax
)]) == 1
# Verify that the CPU and GPU implementations return the same results
# up to a tolerance.
self._test_softmax(
x,
x_gpu,
@@ -347,5 +377,53 @@ class test_SoftMax(unittest.TestCase):
check_types
)
mode = mode_with_gpu.including("cudnn")
self._test_softmax(x, x, f_z, f_z, self._cmp, mode, check_types_opt)
mode_w_cudnn = mode_with_gpu.including("cudnn")
self._test_softmax(
x, x, f_z, f_z, self._cmp,
mode_w_cudnn, check_types_opt
)
# Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is
# applied when cudnn is required
y = T.fvector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_with_gpu
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad
)]) == 1)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
# Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is not
# applied when cudnn is excluded or not available
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
y = T.vector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_wo_cudnn
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad
)]) == 0)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 1)