Commit b9d546a9 authored by Dustin Webb

Added an optimization that converts SoftmaxGrad to GpuDnnSoftmaxGrad, and an associated test to make sure it is applied correctly.
Parent 1625e634
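For context (not part of the commit itself): a minimal sketch of how the new rewrite shows up from user code. It assumes a CUDA-enabled Theano build with cuDNN available, and borrows the 'gpu'/'cudnn' optimizer tags used by the test suite's mode_with_gpu; the names here are illustrative only.

    import theano
    import theano.tensor as T
    from theano.sandbox.cuda import dnn

    # Gradient of a softmax: the unoptimized graph contains SoftmaxGrad.
    x = T.fmatrix('x')
    g = theano.grad(T.nnet.softmax(x).mean(), x)

    # Compiling with the GPU/cuDNN optimizer tags should let the rewrite
    # replace SoftmaxGrad with GpuDnnSoftmaxGrad.
    mode = theano.compile.get_default_mode().including('gpu', 'cudnn')
    f = theano.function([x], g, mode=mode)
    ops = [node.op for node in f.maker.fgraph.toposort()]
    print(any(isinstance(op, dnn.GpuDnnSoftmaxGrad) for op in ops))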
@@ -5,11 +5,12 @@ from theano import Apply, gof, tensor
 from theano.gof import Optimizer
 from theano.gof.type import CDataType
 from theano.compat import PY3
+from theano.tensor.nnet import SoftmaxGrad
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import (GpuOp, cuda_available, active_device_number,
                                  device_properties)
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
-                                           gpu_contiguous)
+                                           gpu_contiguous, HostFromGpu)
 from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
                                       GpuDownsampleFactorMaxGrad)
 from theano.sandbox.cuda.nnet import GpuSoftmax
@@ -887,7 +888,7 @@ cudnnTensor4dDescriptor_t %(name)s_%(id)d;
 %(name)s_%(id)d = NULL;
 if ((err%(id)d = cudnnCreateTensor4dDescriptor(&%(name)s_%(id)d)) != CUDNN_STATUS_SUCCESS) {
   PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
-               "%%s", cudnnGetErrorString(err%(id)d));
+               ": %%s", cudnnGetErrorString(err%(id)d));
   %(fail)s
 }
 """ % dict(name=name, id=id, fail=fail)
@@ -1152,3 +1153,27 @@ if cuda_available:
                        " to use it. We got this error: \n" +
                        dnn_available.msg)
     gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
+
+
+@register_opt('cudnn')
+@local_optimizer([SoftmaxGrad])
+def local_softmax_dnn_grad(node):
+    if (
+        isinstance(node.op, SoftmaxGrad)
+        and (isinstance(node.inputs[0].owner.op, HostFromGpu)
+             or isinstance(node.inputs[1].owner.op, HostFromGpu))
+    ):
+        ins = []
+        for n in node.inputs:
+            if isinstance(n.owner.op, HostFromGpu):
+                n = n.owner.inputs[0]
+            ins.append(n.dimshuffle(0, 1, 'x', 'x'))
+        out = GpuDnnSoftmaxGrad(
+            'bc01',
+            'accurate',
+            'channel'
+        )(
+            ins[0],
+            gpu_contiguous(ins[1])
+        )
+        return [out.dimshuffle(0, 1)]
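A note on the dimshuffle round-trip in the rewrite above: cuDNN's softmax kernels operate on 4D tensors in 'bc01' layout, so the 2D (batch, class) inputs are padded with two broadcastable axes before GpuDnnSoftmaxGrad is applied, and the result is collapsed back to 2D at the end. A standalone sketch of that reshaping (illustrative only, not from the commit; runs on CPU):

    import numpy
    import theano
    import theano.tensor as T

    m = T.fmatrix('m')                 # 2D input, shape (batch, class)
    m4 = m.dimshuffle(0, 1, 'x', 'x')  # pad to 'bc01': (batch, class, 1, 1)
    m2 = m4.dimshuffle(0, 1)           # drop the broadcastable axes again

    f = theano.function([m], [m4, m2])
    a = numpy.arange(6, dtype='float32').reshape(2, 3)
    out4, out2 = f(a)
    assert out4.shape == (2, 3, 1, 1)
    assert (out2 == a).all()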
@@ -276,7 +276,7 @@ class test_SoftMax(unittest.TestCase):
         x = T.fmatrix('x')
         z = T.nnet.softmax
 
-        def check_types(graph, graph_gpu):
+        def check_types_without_cudnn(graph, graph_gpu):
             self._check_types(
                 graph,
                 graph_gpu,
@@ -285,14 +285,15 @@ class test_SoftMax(unittest.TestCase):
                 cuda.nnet.GpuSoftmax
             )
 
+        mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
         f, f_gpu = self._test_softmax(
             x,
             x,
             z,
             z,
             self._cmp,
-            mode_with_gpu,
-            check_types
+            mode_wo_cudnn,
+            check_types_without_cudnn
         )
 
         # cuDNN R1 cannot handle these test cases but the Theano softmax can so
@@ -300,6 +301,25 @@ class test_SoftMax(unittest.TestCase):
         self._cmp(2 << 15, 5, f, f_gpu)
         self._cmp(0, 10, f, f_gpu)
 
+        def check_types_with_cudnn(graph, graph_gpu):
+            self._check_types(
+                graph,
+                graph_gpu,
+                -3,
+                type(z),
+                theano.sandbox.cuda.dnn.GpuDnnSoftmax
+            )
+
+        f, f_gpu = self._test_softmax(
+            x,
+            x,
+            z,
+            z,
+            self._cmp,
+            mode_with_gpu,
+            check_types_with_cudnn
+        )
+
     def test_cudnn_softmax(self):
         if not cuda.dnn.dnn_available():
             raise SkipTest(cuda.dnn.dnn_available.msg)
@@ -357,5 +377,29 @@ class test_SoftMax(unittest.TestCase):
             check_types
         )
 
-        mode = mode_with_gpu.including("cudnn")
-        self._test_softmax(x, x, f_z, f_z, self._cmp, mode, check_types_opt)
+        mode_w_cudnn = mode_with_gpu.including("cudnn")
+        self._test_softmax(
+            x, x, f_z, f_z, self._cmp,
+            mode_w_cudnn, check_types_opt
+        )
+
+        # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is applied.
+        y = T.vector('y')
+        f = theano.function(
+            [y],
+            T.grad(T.nnet.softmax(y).mean(), y),
+            mode=mode_with_gpu
+        )
+        sorted_f = f.maker.fgraph.toposort()
+        assert(len([i
+                    for i in sorted_f
+                    if isinstance(
+                        i.op,
+                        theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad
+                    )]) == 1)
+        assert(len([i
+                    for i in sorted_f
+                    if isinstance(
+                        i.op,
+                        theano.tensor.nnet.SoftmaxGrad
+                    )]) == 0)