Add v3 features for softmax.

62c81c9c · Arnaud Bergeron · 2198fc07 · 62c81c9c · 62c81c9c · 62c81c9c
--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -1333,15 +1333,17 @@ class GpuDnnSoftmaxBase(DnnBase):
        DnnBase.__init__(self)
        self.tensor_format = tensor_format

-        assert(algo in ('fast', 'accurate'))
+        assert(algo in ('fast', 'accurate', 'log'))
+        if algo == 'log' and version() < 3000:
+            raise RuntimeError("Need CuDNN v3 for log-softmax")
        self.algo = algo

        assert(mode in ('instance', 'channel'))
        self.mode = mode

-        self.tensor_4d_descs = [softmax_input
-                                for softmax_input in self.softmax_inputs]
-        self.tensor_4d_descs.append('softmax_output')
+        self.tensor_descs = [softmax_input
+                             for softmax_input in self.softmax_inputs]
+        self.tensor_descs.append('softmax_output')

    def infer_shape(self, node, shape):
        if self.direction == 'forward':
@@ -1349,22 +1351,22 @@ class GpuDnnSoftmaxBase(DnnBase):
        else:
            return [shape[1]]

-    def _define_tensor4d_desc(self, name, id):
+    def _define_tensor_desc(self, name, id):
        return """
 cudnnTensorDescriptor_t %(id)s_%(name)s;
 """ % dict(name=name, id=id)

-    def _init_tensor4d_desc(self, name, id, fail):
+    def _init_tensor_desc(self, name, id, fail):
        return """
 %(id)s_%(name)s = NULL;
 if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
-               ": %%s", cudnnGetErrorString(err%(name)s));
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor : %%s",
+               cudnnGetErrorString(err%(name)s));
  %(fail)s
 }
 """ % dict(name=name, id=id, fail=fail)

-    def _clean_tensor4d_desc(self, name, id):
+    def _clean_tensor_desc(self, name, id):
        return """
 if(%(id)s_%(name)s!= NULL)
  cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
@@ -1372,8 +1374,8 @@ if(%(id)s_%(name)s!= NULL)

    def c_support_code_struct(self, node, name):
        result = ''
-        for id in self.tensor_4d_descs:
-            result += self._define_tensor4d_desc(name, id)
+        for id in self.tensor_descs:
+            result += self._define_tensor_desc(name, id)
        return result

    def c_init_code_struct(self, node, name, sub):
@@ -1381,14 +1383,14 @@ if(%(id)s_%(name)s!= NULL)
 cudnnStatus_t err%(name)s;
 """ % dict(name=name)

-        for id in self.tensor_4d_descs:
-            result += self._init_tensor4d_desc(name, id, sub['fail'])
+        for id in self.tensor_descs:
+            result += self._init_tensor_desc(name, id, sub['fail'])
        return result

    def c_cleanup_code_struct(self, node, name):
        result = ''
-        for id in self.tensor_4d_descs:
-            result += self._clean_tensor4d_desc(name, id)
+        for id in self.tensor_descs:
+            result += self._clean_tensor_desc(name, id)
        return result

    def c_code(self, node, name, inputs, outputs, sub):
@@ -1396,43 +1398,31 @@ cudnnStatus_t err%(name)s;
        outs, = outputs

        if self.tensor_format == 'b01c':
-            tensor_format = 1
+            tensor_format = "CUDNN_TENSOR_NHWC"
        else:
-            tensor_format = 0
+            tensor_format = "CUDNN_TENSOR_NCHW"

        if self.mode == 'instance':
-            mode = 1
+            mode = "CUDNN_SOFTMAX_MODE_INSTANCE"
        else:
-            mode = 0
+            mode = "CUDNN_SOFTMAX_MODE_CHANNEL"

        if self.algo == 'fast':
-            algo = 1
+            algo = "CUDNN_SOFTMAX_FAST"
+        elif self.algo == 'log':
+            algo = "CUDNN_SOFTMAX_LOG"
        else:
-            algo = 0
-
-        # Setup configuration variables.
-        result = """
-cudnnStatus_t err%(name)s;
-cudnnTensorFormat_t format%(name)s = CUDNN_TENSOR_NCHW;
-if (%(tensor_format)d == 1)
-  format%(name)s = CUDNN_TENSOR_NHWC;
-
-cudnnSoftmaxAlgorithm_t algo%(name)s = CUDNN_SOFTMAX_ACCURATE;
-if (%(algo)d == 1)
-  algo%(name)s = CUDNN_SOFTMAX_FAST;
-
-cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
-if (%(mode)d == 1)
-  mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE;
-""" % dict(name=name, tensor_format=tensor_format, mode=mode, algo=algo)
+            algo = "CUDNN_SOFTMAX_ACCURATE"

        # Validate the input and build the input variables.
        for input_idx, input_name in enumerate(self.softmax_inputs):
-            result += c_set_tensor4d(ins[input_idx], input_name + "_" + name,
-                                     "err" + name, sub['fail'])
+            result += """
+if (c_set_tensorNd(%(t)s, %(desc)s) != 0)
+  %(fail)s
+""" % dict(t=ins[input_idx], desc=input_name + "_" + name, fail=sub['fail'])

        subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
-                    name=name)
+                    name=name, algo=algo, mode=mode)

        for idx, softmax_input in enumerate(self.softmax_inputs):
            subs['name%d' % idx] = softmax_input
@@ -1446,10 +1436,9 @@ if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
 {
  %(fail)s
 }
+if (c_set_tensorNd(%(outs)s, softmax_output_%(name)s) != 0)
+  %(fail)s
 """ % subs
-        result += c_set_tensor4d(outs,
-                                 "softmax_output_" + name,
-                                 "err" + name, sub['fail'])

        # Add on a call to the method that does the actual work.
        result += self.method() % subs
@@ -1457,7 +1446,7 @@ if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
        return result

    def c_code_cache_version(self):
-        return (0, 7, version())
+        return (0.1, version())

    def method(self):
        raise NotImplementedError('GpuDnnSoftmaxBase::method')
@@ -1489,24 +1478,13 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):

    def method(self):
        return """
-#ifndef CUDNN_VERSION
-err%(name)s = cudnnSoftmaxForward(
-  _handle,
-  algo%(name)s,
-  mode%(name)s,
-  softmax_input_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins)s),
-  softmax_output_%(name)s,
-  PyGpuArray_DEV_DATA(%(outs)s)
-);
-#else
 {
 const float alpha = 1.;
 const float beta = 0.;
 err%(name)s = cudnnSoftmaxForward(
  _handle,
-  algo%(name)s,
-  mode%(name)s,
+  %(algo)s,
+  %(mode)s,
  (void*) &alpha,
  softmax_input_%(name)s,
  PyGpuArray_DEV_DATA(%(ins)s),
@@ -1515,7 +1493,6 @@ err%(name)s = cudnnSoftmaxForward(
  PyGpuArray_DEV_DATA(%(outs)s)
 );
 }
-#endif
 """

    def grad(self, inp, grads):
@@ -1558,26 +1535,13 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):

    def method(self):
        return """
-#ifndef CUDNN_VERSION
-err%(name)s = cudnnSoftmaxBackward(
-  _handle,
-  algo%(name)s,
-  mode%(name)s,
-  %(name1)s_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins1)s),
-  %(name0)s_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins0)s),
-  softmax_output_%(name)s,
-  PyGpuArray_DEV_DATA(%(outs)s)
-);
-#else
 {
 const float alpha = 1.;
 const float beta = 0.;
 err%(name)s = cudnnSoftmaxBackward(
  _handle,
-  algo%(name)s,
-  mode%(name)s,
+  %(algo)s,
+  %(mode)s,
  (void*) &alpha,
  %(name1)s_%(name)s,
  PyGpuArray_DEV_DATA(%(ins1)s),
@@ -1588,8 +1552,7 @@ err%(name)s = cudnnSoftmaxBackward(
  PyGpuArray_DEV_DATA(%(outs)s)
 );
 }
-#endif
-        """
+"""


 # @register_opt('cudnn')  # this optimizer is registered in opt.py instead.

--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -175,8 +175,6 @@ def test_pooling():
            func = T.max
        else:
            func = T.mean
-        if pad != (0, 0) and dnn.version() == -1:
-            continue

        if pad != (0, 0) and func is T.mean:
            continue
@@ -611,15 +609,9 @@ def test_dnn_conv_alpha_output_merge():

    lr = numpy.asarray(0.05, dtype='float32')

-    if dnn.version() == -1:
-        # Can't merge alpha with cudnn v1
-        fr = conv + out
-        wr = kern + gw
-        ir = img + gi
-    else:
-        fr = lr * (conv + out)
-        wr = kern + lr * gw
-        ir = img + lr * gi
+    fr = lr * (conv + out)
+    wr = kern + lr * gw
+    ir = img + lr * gi

    f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
    assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
@@ -656,9 +648,6 @@ def test_dnn_conv_alpha_output_merge():


 def test_dnn_conv_grad():
-    if not dnn.dnn_available() or dnn.version() == -1:
-        raise SkipTest('alpha != 1.0 not supported in cudnn v1')
-
    b = 1
    c = 4
    f = 3
@@ -696,7 +685,7 @@ def test_dnn_conv_grad():
 def test_version():
    if not dnn.dnn_available():
        raise SkipTest(dnn.dnn_available.msg)
-    assert isinstance(dnn.version(), (int, tuple))
+    assert isinstance(dnn.version(), int)


 class test_SoftMax(test_nnet.test_SoftMax):
@@ -705,7 +694,7 @@ class test_SoftMax(test_nnet.test_SoftMax):
    mode = mode_with_gpu

    def test_softmax_shape_0(self):
-        raise SkipTest("Cudnn do not suport 0 shapes")
+        raise SkipTest("Cudnn doesn't suport 0 shapes")

    def test_softmax_grad(self):
        def cmp(n, m, f, f_gpu):
@@ -758,18 +747,20 @@ class test_SoftMax(test_nnet.test_SoftMax):
            mode=mode_with_gpu
        )
        sorted_f = f.maker.fgraph.toposort()
-        assert(len([i
-                    for i in sorted_f
-                    if isinstance(
-                        i.op,
-                        self.gpu_grad_op
-                    )]) == 1)
-        assert(len([i
-                    for i in sorted_f
-                    if isinstance(
-                        i.op,
-                        theano.tensor.nnet.SoftmaxGrad
-                    )]) == 0)
+        # The optimization is disabled for cuDNN v3 rc1; only check it on v2.
+        if dnn.version() == 2000:
+            assert(len([i
+                        for i in sorted_f
+                        if isinstance(
+                            i.op,
+                            self.gpu_grad_op
+                            )]) == 1)
+            assert(len([i
+                        for i in sorted_f
+                        if isinstance(
+                            i.op,
+                            theano.tensor.nnet.SoftmaxGrad
+                            )]) == 0)

        # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
        # optimization is not applied when cudnn is excluded or not
@@ -801,15 +792,17 @@ class test_SoftMax(test_nnet.test_SoftMax):
        o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2)
        f = theano.function([y], o, mode=mode_with_gpu)
        sorted_f = f.maker.fgraph.toposort()
-        assert(len([i
-                    for i in sorted_f
-                    if isinstance(
-                        i.op,
-                        self.gpu_grad_op
-                    )]) == 1)
-        assert(len([i
-                    for i in sorted_f
-                    if isinstance(
-                        i.op,
-                        theano.tensor.nnet.SoftmaxGrad
-                    )]) == 0)
+        if dnn.version() == 2000:
+            # Optimization is disabled for cuDNN v3 rc1; only check it on v2.
+            assert(len([i
+                        for i in sorted_f
+                        if isinstance(
+                            i.op,
+                            self.gpu_grad_op
+                            )]) == 1)
+            assert(len([i
+                        for i in sorted_f
+                        if isinstance(
+                            i.op,
+                            theano.tensor.nnet.SoftmaxGrad
+                            )]) == 0)
--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
@@ -346,7 +346,6 @@ class test_SoftMax(unittest.TestCase):
        return f, f_gpu

    def _cmp(self, n, m, f, f_gpu):
-        # print "test_softmax",n,m
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
@@ -369,8 +368,6 @@ class test_SoftMax(unittest.TestCase):
            self._cmp
        )

-        # cuDNN R1 cannot handle these test cases but the Theano softmax can so
-        # we test them only for the Theano softmax.
        self._cmp(2 << 15, 5, f, f_gpu)

    def test_softmax_shape_0(self):