Implemented grad for cudnn softmax.

ddb712a5 · Dustin Webb · c416c5eb · ddb712a5 · ddb712a5
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -849,7 +849,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
    return GpuDnnPool()(img, desc)


-class GpuDnnSoftmax(DnnBase):
+class GpuDnnSoftmaxBase(DnnBase):
    """
    Op for the cuDNN Softmax.

@@ -873,46 +873,57 @@ class GpuDnnSoftmax(DnnBase):
        assert(mode in ('instance', 'channel'))
        self.mode = mode

-    def make_node(self, x):
-        x = as_cuda_ndarray_variable(x)
-        assert x.ndim == 4
-        return Apply(self, [x], [x.type()])
+        self.tensor_4d_descs = [softmax_input
+                                for softmax_input in self.softmax_inputs]
+        self.tensor_4d_descs.append('softmax_output')

-    def c_support_code_struct(self, node, struct_id):
+    def _define_tensor4d_desc(self, name, id):
        return """
-cudnnTensor4dDescriptor_t softmax_input_%(id)d;
-cudnnTensor4dDescriptor_t softmax_output_%(id)d;
-""" % dict(id=struct_id)
+cudnnTensor4dDescriptor_t %(name)s_%(id)d;
+""" % dict(name=name, id=id)

-    def c_init_code_struct(self, node, struct_id, sub):
+    def _init_tensor4d_desc(self, name, id, fail):
        return """
-softmax_input_%(id)d = NULL;
-softmax_output_%(id)d = NULL;
-
-cudnnStatus_t err%(id)d;
-if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
+%(name)s_%(id)d = NULL;
+if ((err%(id)d = cudnnCreateTensor4dDescriptor(&%(name)s_%(id)d)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
               "(inp): %%s", cudnnGetErrorString(err%(id)d));
  %(fail)s
 }
-if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
-               "(out): %%s", cudnnGetErrorString(err%(id)d));
-  %(fail)s
-}
-""" % dict(id=struct_id, fail=sub['fail'])
+""" % dict(name=name, id=id, fail=fail)

-    def c_cleanup_code_struct(self, node, struct_id):
+    def _clean_tensor4d_desc(self, name, id):
        return """
-if(softmax_input_%(id)d != NULL)
-  cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
+if(%(name)s_%(id)d!= NULL)
+  cudnnDestroyTensor4dDescriptor(%(name)s_%(id)d);
+""" % dict(name=name, id=id)
+
+    def c_support_code_struct(self, node, struct_id):
+        result = ''
+        for name in self.tensor_4d_descs:
+            result += self._define_tensor4d_desc(name, struct_id)
+        return result

-if(softmax_output_%(id)d != NULL)
-  cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
+    def c_init_code_struct(self, node, struct_id, sub):
+        result = """
+cudnnStatus_t err%(id)d;
 """ % dict(id=struct_id)

+        for name in self.tensor_4d_descs:
+            result += self._init_tensor4d_desc(name, struct_id, sub['fail'])
+        return result
+
+    def c_cleanup_code_struct(self, node, struct_id):
+        result = ''
+        for name in self.tensor_4d_descs:
+            result += self._clean_tensor4d_desc(name, struct_id)
+        return result
+
    def c_code(self, node, name, inputs, outputs, sub):
-        ins, = inputs
+        ins = inputs
+        if type(inputs) is not list:
+            ins = [ins]
+
        outs, = outputs

        if self.tensor_format == 'b01c':
@@ -930,7 +941,8 @@ if(softmax_output_%(id)d != NULL)
        else:
            algo = 0

-        return """
+        # Setup configuration variables.
+        result = """
 cudnnStatus_t err%(name)s;
 cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
 if (%(tensor_format)d == 1)
@@ -943,14 +955,19 @@ if (%(algo)d == 1)
 cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;
 if (%(mode)d == 1)
  mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
+""" % dict(id=sub['struct_id'], name=name,
+           tensor_format=tensor_format, mode=mode, algo=algo)

+        # Validate the input and build the input variables.
+        for input_idx, input_name in enumerate(self.softmax_inputs):
+            result += """
 if (!CudaNdarray_is_c_contiguous(%(ins)s)) {
  PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
  %(fail)s
 }

 err%(name)s = cudnnSetTensor4dDescriptor(
-  softmax_input_%(id)d,
+  %(input_name)s_%(id)d,
  format%(id)d,
  CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(ins)s)[0],
@@ -959,11 +976,15 @@ err%(name)s = cudnnSetTensor4dDescriptor(
  CudaNdarray_HOST_DIMS(%(ins)s)[3]
 );
 if (err%(name)s != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
+  PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%%%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
 }
+""" % dict(id=sub['struct_id'], name=name, input_name=input_name,
+           ins=ins[input_idx], fail=sub['fail'])

+        # Build and prepare the output variable.
+        result += """
 if (CudaNdarray_prep_output(&%(outs)s, 4, CudaNdarray_HOST_DIMS(%(ins)s)) != 0)
 {
  %(fail)s
@@ -979,11 +1000,41 @@ err%(name)s = cudnnSetTensor4dDescriptor(
  CudaNdarray_HOST_DIMS(%(outs)s)[3]
 );
 if (err%(name)s != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
+  PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%%%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
 }
+"""

+        # Add on a call to the method that does the actual work.
+        result += self.method()
+
+        subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
+                    id=sub['struct_id'], name=name)
+
+        for idx, softmax_input in enumerate(self.softmax_inputs):
+            subs['name%d' % idx] = softmax_input
+            subs['ins%d' % idx] = inputs[idx]
+
+        return result % subs
+
+    def c_code_cache_version(self):
+        return (0, 6)
+
+    def method(self):
+        raise NotImplementedError('GpuDnnSoftmaxBase::method')
+
+
+class GpuDnnSoftmax(GpuDnnSoftmaxBase):
+    softmax_inputs = ['softmax_input']
+
+    def make_node(self, x):
+        x = as_cuda_ndarray_variable(x)
+        assert x.ndim == 4
+        return Apply(self, [x], [x.type()])
+
+    def method(self):
+        return """
 err%(name)s = cudnnSoftmaxForward(
  _handle,
  algo%(id)d,
@@ -993,11 +1044,43 @@ err%(name)s = cudnnSoftmaxForward(
  softmax_output_%(id)d,
  CudaNdarray_DEV_DATA(%(outs)s)
 );
-""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
-           algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)
+"""

-    def c_code_cache_version(self):
-        return (0, 3)
+    def grad(self, inp, grads):
+        x, = inp
+        g_sm, = grads
+        sm = self.make_node(x).outputs[0]
+        return [GpuDnnSoftmaxGrad(
+            self.tensor_format,
+            self.algo,
+            self.mode
+        )(g_sm, sm)]
+
+
+class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
+    softmax_inputs = ['softmax_var', 'softmax_input']
+
+    def make_node(self, dy, sm):
+        dy = as_cuda_ndarray_variable(dy)
+        sm = as_cuda_ndarray_variable(sm)
+        assert dy.ndim == 4
+        assert sm.ndim == 4
+        return Apply(self, [dy, sm], [sm.type.make_variable()])
+
+    def method(self):
+        return """
+err%(name)s = cudnnSoftmaxBackward(
+  _handle,
+  algo%(id)d,
+  mode%(id)d,
+  %(name1)s_%(id)d,
+  CudaNdarray_DEV_DATA(%(ins1)s),
+  %(name0)s_%(id)d,
+  CudaNdarray_DEV_DATA(%(ins0)s),
+  softmax_output_%(id)d,
+  CudaNdarray_DEV_DATA(%(outs)s)
+);
+"""


 # We need this since other stuff from opt is not importable.

--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -320,6 +320,13 @@ class test_SoftMax(unittest.TestCase):
            'channel'
        )

+        # Verify the grad operation
+        n = 3
+        m = 5
+        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+        gdata = numpy.asarray(data)[:, :, None, None]
+        T.verify_grad(f_gpu, [gdata], rng=numpy.random)
+
        def check_types(graph, graph_gpu):
            self._check_types(
                graph,
@@ -337,6 +344,8 @@ class test_SoftMax(unittest.TestCase):
                            theano.sandbox.cuda.dnn.GpuDnnSoftmax
                        )]) == 1

+        # Verify that the CPU and GPU implementations return the same results
+        # up to a tolerance.
        self._test_softmax(
            x,
            x_gpu,