提交 ccf4116b，作者：Pascal Lamblin

Merge pull request #3899 from carriepl/log_softmax

Add optimizations to transfer new LogSoftmax op to the GPU as CuDNN softmax ops
...@@ -10,7 +10,7 @@ from theano.gof import Optimizer, local_optimizer, COp ...@@ -10,7 +10,7 @@ from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
...@@ -2400,11 +2400,12 @@ if True: ...@@ -2400,11 +2400,12 @@ if True:
return [out] return [out]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise, LogSoftmax])
def local_log_softmax_dnn(node): def local_log_softmax_dnn(node):
# The log-softmax implementation is only available starting at CuDNN V3 # The log-softmax implementation is only available starting at CuDNN V3
if not dnn_available() or version() < (3000, 3000): if not dnn_available() or version() < (3000, 3000):
return return
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
isinstance(node.op.scalar_op, Log) and isinstance(node.op.scalar_op, Log) and
node.inputs[0].owner and node.inputs[0].owner and
...@@ -2419,6 +2420,21 @@ if True: ...@@ -2419,6 +2420,21 @@ if True:
new_log_softmax = new_softmax_node(softmax_node.inputs[0]) new_log_softmax = new_softmax_node(softmax_node.inputs[0])
return [new_log_softmax] return [new_log_softmax]
elif (isinstance(node.op, LogSoftmax) and node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, HostFromGpu)):
if not dnn_available():
return
# Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0].owner.inputs[0]
if inp.ndim != 2:
return
inp = inp.dimshuffle(0, 1, 'x', 'x')
# Apply GpuDnnSoftmax and return the result
out = GpuDnnSoftmax('bc01', 'log', 'channel')(gpu_contiguous(inp))
return [out.dimshuffle(0, 1)]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used""" """ Raise a RuntimeError if cudnn can't be used"""
......
...@@ -612,6 +612,41 @@ class test_DnnSoftMax(test_nnet.test_SoftMax): ...@@ -612,6 +612,41 @@ class test_DnnSoftMax(test_nnet.test_SoftMax):
utt.assert_allclose(out, expected_out) utt.assert_allclose(out, expected_out)
def test_log_softmax2(self):
    """Check that a graph applying the LogSoftmax op (and, separately, a
    log(Softmax(x)) graph) is replaced by a single GpuDnnSoftmax node
    using the 'log' algorithm, and that both give the same results as a
    CPU reference function."""
    # Compile a reference function, on the CPU, to be used to validate the
    # results of the other function.
    x = T.fmatrix()
    f_ref = theano.function([x], T.nnet.LogSoftmax()(x))

    # Build the first graph (direct application of the LogSoftmax op) and
    # ensure that the optimization is applied: exactly one GpuDnnSoftmax
    # node, running in 'log' mode.
    log_softmax_out = T.nnet.LogSoftmax()(x)
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, cuda.dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))

    # Build the second graph (log applied to the output of the Softmax op)
    # and ensure that the optimization is applied.
    log_softmax_out = T.log(T.nnet.Softmax()(x))
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, cuda.dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))
def test_dnn_tag(): def test_dnn_tag():
""" """
......
...@@ -12,7 +12,7 @@ from theano.gof.cmodule import GCC_compiler ...@@ -12,7 +12,7 @@ from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (AbstractConv2d, from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs, AbstractConv2d_gradInputs,
...@@ -1170,8 +1170,9 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1170,8 +1170,9 @@ class GpuDnnSoftmaxBase(DnnBase):
Parameters Parameters
---------- ----------
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast', 'accurate' or 'log' indicating whether, respectively,
optimized for speed or accuracy respectively. computations should be optimized for speed, for accuracy, or if CuDNN
should rather compute the log-softmax instead.
mode mode
'instance' or 'channel' indicating whether the softmax should be 'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spatial location '01' per computed per image across 'c01' or per spatial location '01' per
...@@ -1219,8 +1220,9 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1219,8 +1220,9 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
Op for the cuDNN Softmax. Op for the cuDNN Softmax.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast', 'accurate' or 'log' indicating whether, respectively,
optimized for speed or accuracy respectively. computations should be optimized for speed, for accuracy, or if CuDNN
should rather compute the log-softmax instead.
mode mode
'instance' or 'channel' indicating whether the softmax should be 'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spatial location '01' per computed per image across 'c01' or per spatial location '01' per
...@@ -1253,8 +1255,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1253,8 +1255,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
Parameters Parameters
---------- ----------
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast', 'accurate' or 'log' indicating whether, respectively,
optimized for speed or accuracy respectively. computations should be optimized for speed, for accuracy, or if CuDNN
should rather compute the gradient of the log-softmax instead.
mode mode
'instance' or 'channel' indicating whether the softmax should 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per be computed per image across 'c01' or per spatial location '01' per
...@@ -1472,6 +1475,25 @@ def local_log_softmax_dnn(node): ...@@ -1472,6 +1475,25 @@ def local_log_softmax_dnn(node):
return [new_softmax(softmax_node.inputs[0])] return [new_softmax(softmax_node.inputs[0])]
@register_opt('cudnn')
@op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name):
    """Replace a LogSoftmax node by a GpuDnnSoftmax in 'log' mode.

    Only fires when cuDNN is usable in the target context and its version
    is at least v3, and only for 2d inputs (the only shape the rewrite
    currently handles).
    """
    if not dnn_available(ctx_name) or version() < 3000:
        # cuDNN only implements log-softmax starting at v3.
        return

    x = node.inputs[0]
    if x.ndim != 2:
        return

    # GpuDnnSoftmax expects a 4d 'bc01' tensor: pad the matrix with two
    # broadcastable trailing dimensions and pin it to the target context.
    x = x.dimshuffle(0, 1, 'x', 'x')
    x.tag.context_name = ctx_name

    result = GpuDnnSoftmax('log', 'channel')(gpu_contiguous(x))
    # Drop the two padding dimensions to recover the original 2d shape.
    return [result.dimshuffle(0, 1)]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
""" """
......
...@@ -847,3 +847,43 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -847,3 +847,43 @@ class test_SoftMax(test_nnet.test_SoftMax):
numpy.exp(input_val).sum(1)[:, None, :, :]) numpy.exp(input_val).sum(1)[:, None, :, :])
utt.assert_allclose(out, expected_out) utt.assert_allclose(out, expected_out)
def test_log_softmax2(self):
    """Check that a graph applying the LogSoftmax op (and, separately, a
    log(Softmax(x)) graph) is replaced by a single GpuDnnSoftmax node
    using the 'log' algorithm, and that both give the same results as a
    CPU reference function."""
    # This is a test for an optimization that depends on CuDNN v3 or
    # more recent. Don't test if the CuDNN version is too old.
    if dnn.version() < 3000:
        raise SkipTest("Log-softmax is only in cudnn v3+")

    # Compile a reference function, on the CPU, to be used to validate the
    # results of the other function.
    x = T.fmatrix()
    f_ref = theano.function([x], T.nnet.LogSoftmax()(x))

    # Build the first graph (direct application of the LogSoftmax op) and
    # ensure that the optimization is applied: exactly one GpuDnnSoftmax
    # node, running in 'log' mode.
    log_softmax_out = T.nnet.LogSoftmax()(x)
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))

    # Build the second graph (log applied to the output of the Softmax op)
    # and ensure that the optimization is applied.
    log_softmax_out = T.log(T.nnet.Softmax()(x))
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论