Commit ff0abb5f authored by abergeron

Merge pull request #1726 from carriepl/master

Conversion of GpuSoftmax and GpuSoftmaxWithBias to the new backend
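
In the new gpuarray backend, the conversion works by registering local optimizers that lift the CPU ops (tensor.nnet.Softmax and tensor.nnet.SoftmaxWithBias) to their GPU counterparts (GpuSoftmax and GpuSoftmaxWithBias), as the first hunk below shows. The following is only a minimal sketch of that lifting pattern, under the assumption that register_opt and op_lifter are importable from theano.sandbox.gpuarray.opt (the module the hunk modifies) and that pygpu/libgpuarray is installed:

    # Sketch of the op-lifting pattern this PR relies on (mirrors the diff below).
    # Assumption: register_opt and op_lifter can be imported from
    # theano.sandbox.gpuarray.opt, the module the first hunk modifies.
    from theano import tensor
    from theano.sandbox.gpuarray.nnet import GpuSoftmax
    from theano.sandbox.gpuarray.opt import op_lifter, register_opt


    @register_opt()                      # add this rewrite to the gpuarray optimizer database
    @op_lifter([tensor.nnet.Softmax])    # fire on graph nodes whose op is the CPU Softmax
    def local_gpua_softmax(node):
        # Returning the GPU op asks the lifter to rebuild the node so it runs
        # on GpuArray variables instead of host ndarrays.
        return GpuSoftmax()
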
@@ -20,7 +20,9 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
 from theano.sandbox.gpuarray.conv import GpuConv
 from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
-                                          GpuCrossentropySoftmax1HotWithBiasDx)
+                                          GpuCrossentropySoftmax1HotWithBiasDx,
+                                          GpuSoftmaxWithBias,
+                                          GpuSoftmax)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                               GpuDimShuffle, GpuCAReduceCuda)
 from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
@@ -340,7 +342,16 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
 @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
 def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
     return GpuCrossentropySoftmax1HotWithBiasDx()
 
+@register_opt()
+@op_lifter([tensor.nnet.Softmax])
+def local_gpua_softmax(node):
+    return GpuSoftmax()
+
+@register_opt()
+@op_lifter([tensor.nnet.SoftmaxWithBias])
+def local_gpua_softmaxwithbias(node):
+    return GpuSoftmaxWithBias()
 
 @register_opt()
 @op_lifter([gpu_from_host, ConvOp])
@@ -157,3 +157,132 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
         assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
             rtol, atol)
 
+
+
+def test_softmax_with_bias_float32():
+    softmax_with_bias_unittest_template(dtypeInput='float32',
+                                        dtypeBias='float32')
+
+
+def test_softmax_with_bias_float64():
+    softmax_with_bias_unittest_template(dtypeInput='float32',
+                                        dtypeBias='float64')
+    softmax_with_bias_unittest_template(dtypeInput='float64',
+                                        dtypeBias='float32')
+    softmax_with_bias_unittest_template(dtypeInput='float64',
+                                        dtypeBias='float64')
+
+
+def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
+    """
+    This is a basic test for GpuSoftmaxWithBias with float32 and float64
+    variables.
+
+    We check that we loop when there are too many blocks.
+
+    TODO: check that we loop when there are too many threads. (THIS IS
+    NOT IMPLEMENTED)
+    """
+    assert dtypeInput in ['float32', 'float64']
+    assert dtypeBias in ['float32', 'float64']
+
+    if dtypeInput == 'float32':
+        x = T.fmatrix('x')
+    elif dtypeInput == 'float64':
+        x = T.dmatrix('x')
+
+    # We can't use zeros_like(x[0, ::]) as this doesn't allow testing with a
+    # 0 shape.
+    if dtypeBias == 'float32':
+        z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
+                                                 dtype='float32')[::2])
+    elif dtypeBias == 'float64':
+        z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
+                                                 dtype='float64')[::2])
+
+    f = theano.function([x], z, mode=mode_without_gpu)
+    f_gpu = theano.function([x], z, mode=mode_with_gpu)
+    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax_with_bias
+    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
+                      theano.sandbox.gpuarray.nnet.GpuSoftmaxWithBias)
+
+    def cmp(n, m):
+        if dtypeInput == 'float32':
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+        elif dtypeInput == 'float64':
+            data = numpy.arange(n * m, dtype='float64').reshape(n, m)
+        out = f(data)
+        gout = f_gpu(data)
+        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
+
+    cmp(2, 5)
+    # We need to test n > 32 * 1024 to check that we make the block loop.
+    cmp(2 << 15, 5)
+    cmp(4074, 400)
+    cmp(0, 10)
+    cmp(784, 784)
+    cmp(4, 1000)
+    cmp(4, 1024)
+    cmp(4, 2000)
+    cmp(4, 2024)
+    # The GTX285 doesn't have enough shared memory for this case.
+    cmp(4, 4074)
+    # The GTX580, 680 and Kepler don't have enough shared memory.
+    cmp(2, 10000)
+    cmp(128, 16 * 1024)
+    cmp(128, 64 * 1024)
+
+
+def test_softmax_float32():
+    softmax_unittest_template('float32')
+
+
+def test_softmax_float64():
+    softmax_unittest_template('float64')
+
+
+def softmax_unittest_template(dtypeInput):
+    """
+    This is a basic test for GpuSoftmax with float32 and float64 variables.
+
+    We check that we loop when there are too many blocks.
+    We use the slower code path when there isn't enough shared memory.
+    """
+    assert dtypeInput in ['float32', 'float64']
+
+    if dtypeInput == 'float32':
+        x = T.fmatrix('x')
+    elif dtypeInput == 'float64':
+        x = T.dmatrix('x')
+
+    z = T.nnet.softmax(x)
+    f = theano.function([x], z, mode=mode_without_gpu)
+    f_gpu = theano.function([x], z, mode=mode_with_gpu)
+    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax
+    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
+                      theano.sandbox.gpuarray.nnet.GpuSoftmax)
+
+    def cmp(n, m):
+        if dtypeInput == 'float32':
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+        elif dtypeInput == 'float64':
+            data = numpy.arange(n * m, dtype='float64').reshape(n, m)
+        out = f(data)
+        gout = f_gpu(data)
+        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
+
+    # We need to test n > 32 * 1024 to check that we make the block loop.
+    cmp(2, 5)
+    cmp(2 << 15, 5)
+    cmp(4074, 400)
+    cmp(0, 10)
+    cmp(784, 784)
+    cmp(4, 1000)
+    cmp(4, 1024)
+    cmp(4, 2000)
+    cmp(4, 2024)
+    # The GTX285 doesn't have enough shared memory.
+    cmp(4, 4074)
+    # The GTX580, 680 and Kepler don't have enough shared memory.
+    cmp(2, 10000)
+    cmp(128, 16 * 1024)
+    cmp(128, 64 * 1024)
\ No newline at end of file
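
Beyond the unit tests above, a quick manual sanity check is to compile a small softmax graph and look for the lifted op in the optimized graph. This is only a sketch under the assumption that pygpu/libgpuarray is installed and Theano is configured to use a gpuarray device; on a CPU-only setup the graph simply keeps the CPU Softmax:

    # Sanity-check sketch: does tensor.nnet.Softmax get lifted to GpuSoftmax?
    # Assumption: a gpuarray device is configured; otherwise the printed op
    # list will only contain the CPU Softmax op.
    import numpy
    import theano
    import theano.tensor as T

    x = T.fmatrix('x')
    f = theano.function([x], T.nnet.softmax(x))

    # Inspect the optimized graph for the GPU op introduced by this PR.
    print([type(node.op).__name__ for node in f.maker.fgraph.toposort()])

    data = numpy.arange(20, dtype='float32').reshape(4, 5)
    print(f(data).sum(axis=1))  # every row of a softmax output sums to 1
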