Commit bc6e4d4e, authored Sep 24, 2014 by Dustin Webb
Added test for the GpuSoftmax -> GpuDnnSoftmax optimization.
parent 113afd29
Showing 2 changed files with 133 additions and 79 deletions:

theano/sandbox/cuda/dnn.py                  +2    -2
theano/sandbox/cuda/tests/test_nnet.py    +131   -77
theano/sandbox/cuda/dnn.py

@@ -546,9 +546,9 @@ err%(name)s = cudnnSoftmaxForward(
 @local_optimizer([GpuSoftmax])
 def local_softmax_dnn(node):
     if isinstance(node.op, GpuSoftmax):
-        ins = node.inputs[0].dimshuffle('x', 'x', 0, 1)
+        ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
         out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(gpu_contiguous(ins))
-        out = as_cuda_ndarray_variable(out.dimshuffle(2, 3))
+        out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
         return [out]
 
 gpu_optimizer.register("softmax_cudnn", local_softmax_dnn, 'cudnn')
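The two-line change above is a layout fix: the optimizer previously lifted the 2D softmax input into cuDNN's 'bc01' tensor as (1, 1, rows, cols), which leaves the channel axis singleton, and now maps it to (rows, cols, 1, 1) so the values to be normalized sit on the channel axis that the 'channel' softmax mode reduces over. The following minimal numpy sketch of that reasoning is ours, not part of the commit; softmax_bc01_channel merely emulates what cuDNN's channel mode computes.

import numpy

def softmax_rows(x):
    # Plain row-wise softmax of an (n, m) matrix.
    e = numpy.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def softmax_bc01_channel(x4):
    # Emulation of cuDNN softmax in 'channel' mode: normalize over
    # axis 1 of a (b, c, 0, 1) tensor, independently per (b, h, w).
    e = numpy.exp(x4 - x4.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

data = numpy.arange(12, dtype='float32').reshape(3, 4)

# New layout, dimshuffle(0, 1, 'x', 'x'): rows -> batch, cols -> channel,
# so channel-mode softmax reproduces the row-wise softmax.
new = softmax_bc01_channel(data[:, :, None, None])[:, :, 0, 0]
assert numpy.allclose(new, softmax_rows(data))

# Old layout, dimshuffle('x', 'x', 0, 1): the channel axis has size 1,
# so channel-mode softmax degenerates to all ones.
old = softmax_bc01_channel(data[None, None, :, :])[0, 0, :, :]
assert numpy.allclose(old, numpy.ones_like(data))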
theano/sandbox/cuda/tests/test_nnet.py

 from nose.plugins.skip import SkipTest
 import numpy
+import unittest
 import theano
 from theano.gof.python25 import any
...
@@ -208,87 +209,140 @@ def test_softmax_with_bias():
     cmp(128, 64 * 1024)
 
 
-def _test_softmax(x, x_gpu, f_z, f_gpu_z, cpu_type, gpu_type, cmp, topo_idx):
+class test_SoftMax(unittest.TestCase):
     """
     This is basic test for GpuSoftmax and GpuDnnSoftmax
 
     We check that we loop when their is too much block
     We use slower code when there isn't enough shared memory
     """
-    f_z_out = f_z(x)
-    f_gpu_z_out = f_gpu_z(x_gpu)
-
-    f = theano.function([x], f_z_out, mode=mode_without_gpu)
-    f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=mode_with_gpu)
-    assert isinstance(f.maker.fgraph.toposort()[-1].op, cpu_type)
-    assert isinstance(f_gpu.maker.fgraph.toposort()[topo_idx].op, gpu_type)
-
-    #we need to test n>32*1024 to check that we make the block loop.
-    cmp(1, 5, f, f_gpu)
-    cmp(2, 5, f, f_gpu)
-    cmp(10, 5, f, f_gpu)
-    cmp(100, 5, f, f_gpu)
-    cmp(1000, 5, f, f_gpu)
-    cmp(10000, 5, f, f_gpu)
-    cmp(4074, 400, f, f_gpu)
-    cmp(784, 784, f, f_gpu)
-    cmp(4, 1000, f, f_gpu)
-    cmp(4, 1024, f, f_gpu)
-    cmp(4, 2000, f, f_gpu)
-    cmp(4, 2024, f, f_gpu)
-    # The GTX285 don't have enough shared memory.
-    cmp(4, 4074, f, f_gpu)
-    # The GTX580, 680 and kepler don't have enough shared memory.
-    cmp(2, 10000, f, f_gpu)
-    cmp(128, 16 * 1024, f, f_gpu)
-    cmp(128, 64 * 1024, f, f_gpu)
-    cmp((2 << 15) - 1, 5, f, f_gpu)  # cudnn permits no more than 2^15 - 1 rows
-    cmp(5, 2 << 15, f, f_gpu)
-
-    return f, f_gpu
-
-
-def test_softmax():
-    def cmp(n, m, f, f_gpu):
-        #print "test_softmax",n,m
-        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-        out = f(data)
-        gout = f_gpu(data)
-        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-
-    x = T.fmatrix('x')
-    z = T.nnet.softmax
-
-    f, f_gpu = _test_softmax(x, x, z, z, type(z), cuda.nnet.GpuSoftmax,
-                             cmp, -2)
-
-    # cuDNN cannot handle these test cases but the Theano softmax can so we
-    # test them only for the Theano softmax.
-    cmp(2 << 15, 5, f, f_gpu)
-    cmp(0, 10, f, f_gpu)
-
-
-def test_cudnn_softmax():
-    def cmp(n, m, f, f_gpu):
-        #print "test_softmax",n,m
-        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-        gdata = numpy.asarray(data).transpose()[None, :, :, None]
-
-        out = f(data)
-        gout = numpy.asarray(f_gpu(gdata))[0, :, :, 0].transpose()
-        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-
-    x = T.matrix('x', 'float32')
-    x_gpu = T.tensor4('x_gpu', 'float32')
-    f_z = T.nnet.softmax
-    f_gpu = theano.sandbox.cuda.dnn.GpuDnnSoftmax(
-        'bc01',
-        'accurate',
-        'channel'
-    )
-
-    _test_softmax(x, x_gpu, f_z, f_gpu, type(f_z), type(f_gpu), cmp, -1)
+    def _test_softmax(self, x, x_gpu, f_z, f_gpu_z, cmp, gpu_mode,
+                      check_types):
+        f_z_out = f_z(x)
+        f_gpu_z_out = f_gpu_z(x_gpu)
+
+        f = theano.function([x], f_z_out, mode=mode_without_gpu)
+        f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=gpu_mode)
+        check_types(f, f_gpu)
+
+        #we need to test n>32*1024 to check that we make the block loop.
+        cmp(1, 5, f, f_gpu)
+        cmp(2, 5, f, f_gpu)
+        cmp(10, 5, f, f_gpu)
+        cmp(100, 5, f, f_gpu)
+        cmp(1000, 5, f, f_gpu)
+        cmp(10000, 5, f, f_gpu)
+        cmp(4074, 400, f, f_gpu)
+        cmp(784, 784, f, f_gpu)
+        cmp(4, 1000, f, f_gpu)
+        cmp(4, 1024, f, f_gpu)
+        cmp(4, 2000, f, f_gpu)
+        cmp(4, 2024, f, f_gpu)
+        # The GTX285 don't have enough shared memory.
+        cmp(4, 4074, f, f_gpu)
+        # The GTX580, 680 and kepler don't have enough shared memory.
+        cmp(2, 10000, f, f_gpu)
+        cmp(128, 16 * 1024, f, f_gpu)
+        cmp(128, 64 * 1024, f, f_gpu)
+        # cudnn permits no more than 2^15 - 1 rows
+        cmp((2 << 15) - 1, 5, f, f_gpu)
+        cmp(5, 2 << 15, f, f_gpu)
+
+        return f, f_gpu
+
+    def _cmp(self, n, m, f, f_gpu):
+        #print "test_softmax",n,m
+        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+        out = f(data)
+        gout = f_gpu(data)
+        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
+
+    def _check_types(self, graph, graph_gpu, topo_idx, f_type, f_gpu_type):
+        assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
+        assert isinstance(graph_gpu.maker.fgraph.toposort()[topo_idx].op,
+                          f_gpu_type)
+
+    def test_softmax(self):
+        x = T.fmatrix('x')
+        z = T.nnet.softmax
+
+        def check_types(graph, graph_gpu):
+            self._check_types(graph, graph_gpu, -2, type(z),
+                              cuda.nnet.GpuSoftmax)
+
+        f, f_gpu = self._test_softmax(x, x, z, z, self._cmp, mode_with_gpu,
+                                      check_types)
+
+        # cuDNN cannot handle these test cases but the Theano softmax can so we
+        # test them only for the Theano softmax.
+        self._cmp(2 << 15, 5, f, f_gpu)
+        self._cmp(0, 10, f, f_gpu)
+
+    def test_cudnn_softmax(self):
+        def cmp(n, m, f, f_gpu):
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+            gdata = numpy.asarray(data)[:, :, None, None]
+
+            out = f(data)
+            gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
+            assert numpy.allclose(out, gout), numpy.absolute(out - gout)
+
+        x = T.matrix('x', 'float32')
+        x_gpu = T.tensor4('x_gpu', 'float32')
+        f_z = T.nnet.softmax
+        f_gpu = theano.sandbox.cuda.dnn.GpuDnnSoftmax(
+            'bc01',
+            'accurate',
+            'channel'
+        )
+
+        def check_types(graph, graph_gpu):
+            self._check_types(graph, graph_gpu, -1, type(f_z),
+                              theano.sandbox.cuda.dnn.GpuDnnSoftmax)
+
+        def check_types_opt(graph, graph_gpu):
+            assert isinstance(graph.maker.fgraph.toposort()[-1].op,
+                              type(f_z))
+            assert len([n for n in graph_gpu.maker.fgraph.toposort()
+                        if isinstance(n.op,
+                                      theano.sandbox.cuda.dnn.GpuDnnSoftmax)
+                        ]) == 1
+
+        self._test_softmax(x, x_gpu, f_z, f_gpu, cmp, mode_with_gpu,
+                           check_types)
+
+        mode = mode_with_gpu.including("cudnn")
+        self._test_softmax(x, x, f_z, f_z, self._cmp, mode, check_types_opt)
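With the tests now collected in a unittest.TestCase subclass, they can be picked up by nose (which these Theano tests used, per the SkipTest import) or driven directly by the standard library runner. A hypothetical invocation sketch, assuming a CUDA-enabled build of this revision is importable:

import unittest

# Load just the new test_SoftMax case by dotted name and run it verbosely.
suite = unittest.defaultTestLoader.loadTestsFromName(
    'theano.sandbox.cuda.tests.test_nnet.test_SoftMax')
unittest.TextTestRunner(verbosity=2).run(suite)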