new gpu backend accepting tensor variables

ef4c9e9b · Harm de Vries · Frederic Bastien · fe0b1477 · ef4c9e9b · ef4c9e9b
--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -141,6 +141,7 @@ dnn_available.msg = None
 class DnnBase(COp):
    """
    Creates a handle for cudnn and pulls in the cudnn libraries and headers.
@@ -254,6 +255,7 @@ version.v = None
 class GpuDnnConvDesc(COp):
    """
    This Op builds a convolution descriptor for use in the other convolution
    operations.
@@ -387,6 +389,7 @@ def ensure_dt(val, default, name, dtype):
 class GpuDnnConv(DnnBase):
    """
    The forward convolution.
@@ -554,6 +557,7 @@ class GpuDnnConv(DnnBase):
 class GpuDnnConvGradW(DnnBase):
    """
    The convolution gradient with respect to the weights.
@@ -674,6 +678,7 @@ class GpuDnnConvGradW(DnnBase):
 class GpuDnnConvGradI(DnnBase):
    """
    The convolution gradient with respect to the inputs.
@@ -942,6 +947,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
 class GpuDnnPoolDesc(Op):
    """
    This Op builds a pooling descriptor for use in the other
    pooling operations.
@@ -1060,69 +1066,89 @@ class GpuDnnPoolDesc(Op):
 class GpuDnnPool(DnnBase):
-    """
-    Pooling.
+    """
    Parameters
    ----------
    img
-        The image 4d tensor.
+        The image 4d or 5d tensor.
-    desc
+    Parameters
-        The pooling descriptor.
+    ----------
+    ws : tensor variable
+        Window size.
+    stride : tensor variable
+        (dx, dy) or (dx, dy, dz).
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
+    pad : tensor
+        (padX, padY) or (padX, padY, padZ)
    """
-    __props__ = ()
+    __props__ = ('mode',)
-    def __init__(self):
+    def __init__(self, mode='max'):
        DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
+        if mode == 'average':
+            mode = 'average_inc_pad'
+        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
+        if version() == -1:
+            raise Exception("cudnn v1 do not support average_exc_pad")
+        self.mode = mode
-    def make_node(self, img, desc):
+    def get_op_params(self):
-        img = as_gpuarray_variable(img, infer_context_name(img))
+        if self.mode == 'max':
+            mode_flag = 'CUDNN_POOLING_MAX'
+        elif self.mode == "average_inc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
+        elif self.mode == "average_exc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-        if desc.owner is not None:
+        return [('MODE_FLAG', mode_flag)]
-            e_ndim = desc.owner.op.get_ndim() + 2
-            if img.type.ndim != e_ndim:
+    def make_node(self, img, ws, stride, pad):
-                raise TypeError('img must be %dD tensor' % (e_ndim,))
+        ctx_name = infer_context_name(img, ws, stride, pad)
+        img = as_gpuarray_variable(img, ctx_name)
-        if (not isinstance(desc.type, CDataType) or
+        ws = tensor.as_tensor_variable(ws)
-                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
+        stride = tensor.as_tensor_variable(stride)
-            raise TypeError('desc must be cudnnPoolingDescriptor_t')
+        pad = tensor.as_tensor_variable(pad)
+        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
+        assert ws.type.ndim == 1
-        return Apply(self, [img, desc], [img.type()])
+        return Apply(self, [img, ws, stride, pad], [img.type()])
    def infer_shape(self, node, shape):
-        desc = node.inputs[1].owner.op
+        w = node.inputs[1]
-        w = desc.ws
+        s = node.inputs[2]
-        s = desc.stride
+        p = node.inputs[3]
-        p = desc.pad
        res = [shape[0][0], shape[0][1],
               (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
               (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
               ]
-        if len(w) > 2:
+        if node.inputs[0].ndim == 5:
            res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
        return [res]
    def grad(self, inp, grads):
-        img, desc = inp
+        img, ws, stride, pad = inp
        grad, = grads
        grad = gpu_contiguous(grad)
-        out = self(img, desc)
+        out = self(img, ws, stride, pad)
-        g_out = GpuDnnPoolGrad()(img, out, grad, desc)
+        g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
-        return g_out, theano.gradient.DisconnectedType()()
+        return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
    def connection_pattern(self, node):
-        # not connected to desc
+        # not connected to parameters
-        return [[1], [0]]
+        return [[1], [0], [0], [0]]
 class GpuDnnPoolGrad(DnnBase):
    """
    The pooling gradient.
@@ -1134,40 +1160,56 @@ class GpuDnnPoolGrad(DnnBase):
        The output of the pooling in the forward.
    out_grad
        Same size as out, but is the corresponding gradient information.
-    desc
+    ws : tensor variable
-        The pooling descriptor.
+        Window size.
+    stride : tensor variable
+        (dx, dy) or (dx, dy, dz).
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
+    pad : tensor
+        (padX, padY) or (padX, padY, padZ)
    """
-    __props__ = ()
+    __props__ = ('mode',)
-    def __init__(self):
+    def __init__(self, mode='max'):
        DnnBase.__init__(self, ["dnn_pool_grad.c"],
                         "APPLY_SPECIFIC(dnn_pool_grad)")
+        if mode == 'average':
+            mode = 'average_inc_pad'
+        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
+        self.mode = mode
+    def get_op_params(self):
+        if self.mode == 'max':
+            mode_flag = 'CUDNN_POOLING_MAX'
+        elif self.mode == "average_inc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
+        elif self.mode == "average_exc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-    def make_node(self, inp, out, out_grad, desc):
+        return [('MODE_FLAG', mode_flag)]
+    def make_node(self, inp, out, out_grad, ws, stride, pad):
        ctx_name = infer_context_name(inp, out, out_grad)
        inp = as_gpuarray_variable(inp, ctx_name)
+        assert (inp.ndim in [4, 5])
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
+        assert (out_grad.ndim in [4, 5])
        out = as_gpuarray_variable(out, ctx_name)
+        assert(out.ndim in [4, 5])
-        if desc.owner is not None:
+        assert (out_grad.ndim == inp.ndim)
-            nd = desc.owner.op.get_ndim() + 2
+        assert (inp.ndim == out.ndim)
-            if inp.type.ndim != nd:
-                raise TypeError('inp must be %dD tensor' % (nd,))
-            if out_grad.type.ndim != nd:
+        ws = tensor.as_tensor_variable(ws)
-                raise TypeError('out_grad must be %dD tensor' % (nd,))
+        stride = tensor.as_tensor_variable(stride)
+        pad = tensor.as_tensor_variable(pad)
+        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
+        assert ws.type.ndim == 1
-            if out.type.ndim != nd:
+        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])
-                raise TypeError('out must be %dD tensor' % (nd,))
-        if (not isinstance(desc.type, CDataType) or
-                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
-            raise TypeError('desc must be cudnnPoolingDescriptor_t')
-        return Apply(self, [inp, out, out_grad, desc], [inp.type()])
    def infer_shape(self, node, shape):
        return [shape[0]]
@@ -1205,11 +1247,11 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
    """
    img = gpu_contiguous(img)
-    desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
+    return GpuDnnPool(mode=mode)(img, ws, stride, pad)
-    return GpuDnnPool()(img, desc)
 class GpuDnnSoftmaxBase(DnnBase):
    """
    Op for the cuDNN Softmax.
@@ -1262,6 +1304,7 @@ class GpuDnnSoftmaxBase(DnnBase):
 class GpuDnnSoftmax(GpuDnnSoftmaxBase):
    """
    Op for the cuDNN Softmax.
@@ -1295,6 +1338,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
 class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
    """
    Op for the cuDNN SoftmaxGrad.
@@ -1466,11 +1510,12 @@ def local_pool_dnn_grad_stride(node, ctx_name):
    pad = node.op.padding
    mode = node.op.mode
-    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
+    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
-    return GpuDnnPoolGrad()(gpu_contiguous(inp),
+                                     gpu_contiguous(out),
-                            gpu_contiguous(out),
+                                     gpu_contiguous(out_grad),
-                            gpu_contiguous(out_grad),
+                                     ds,
-                            desc)
+                                     st,
+                                     pad)
 @register_opt('cudnn')
@@ -1490,11 +1535,10 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):
    cg = gpu_contiguous(out_grad)
-    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
    # We reuse cg because CuDNN does not use the value of the `out`
    # argument but still checks its shape for average pooling. This
    # has been observed in v2 and v3 as far as I know.
-    return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)
+    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ds, st, pad)
 @register_opt('cudnn')
@@ -1547,6 +1591,7 @@ def local_logsoftmax_to_dnn(node, ctx_name):
 class NoCuDNNRaise(Optimizer):
    def apply(self, fgraph):
        """
        Raise a error if cudnn can't be used.

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
@@ -2,12 +2,15 @@
 cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
 #section init_code_struct
 cudnnStatus_t APPLY_SPECIFIC(err);
 APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(output) = NULL;
+APPLY_SPECIFIC(pool) = NULL;
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
@@ -19,16 +22,25 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }
+if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
+                "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));  
+  FAIL;
+}
 #section cleanup_code_struct
 if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
 if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
+if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
 #section support_code_struct
 int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
-                             cudnnPoolingDescriptor_t desc,
+                             PyArrayObject *ws, 
+                             PyArrayObject *stride,
+                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
                             PyGpuContextObject *c) {
  cudnnStatus_t err;
@@ -46,14 +58,21 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
  int w[3];
  int p[3];
  int s[3];
-  int ndims;
+  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
+  for(int i = 0; i < ndims; i++) {
+     w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
+  }
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
-  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError,
+    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
-                 "error doing cudnnGetPoolingDescriptor operation: %s",
-                 cudnnGetErrorString(err));
-    return 1;
  }
  dims[0] = PyGpuArray_DIM(img, 0);
@@ -98,7 +117,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnPoolingForward(
-      APPLY_SPECIFIC(_handle), desc,
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,

--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
@@ -4,6 +4,7 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
+cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
 #section init_code_struct
@@ -11,6 +12,7 @@ APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(input_grad) = NULL;
 APPLY_SPECIFIC(output) = NULL;
 APPLY_SPECIFIC(output_grad) = NULL;
+APPLY_SPECIFIC(pool) = NULL;
 {
  cudnnStatus_t err;
@@ -38,6 +40,11 @@ APPLY_SPECIFIC(output_grad) = NULL;
                 cudnnGetErrorString(err));
    FAIL;
  }
+  if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
+                "(pool): %s", cudnnGetErrorString(err));  
+    FAIL;
+  }
 }
 #section cleanup_code_struct
@@ -46,13 +53,16 @@ if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC
 if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
 if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
 if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
+if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
 #section support_code_struct
 int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
-                                  cudnnPoolingDescriptor_t desc,
+                                  PyArrayObject *ws, 
+                                  PyArrayObject *stride,
+                                  PyArrayObject *pad,
                                  PyGpuArrayObject **inp_grad,
                                  PyGpuContextObject *c) {
  cudnnStatus_t err;
@@ -85,6 +95,26 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
    return 1;
  }
+  int w[3];
+  int p[3];
+  int s[3];
+  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
+  for(int i = 0; i < ndims; i++) {
+     w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
+  }
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
+  }
  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;
@@ -118,7 +148,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
    cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnPoolingBackward(
-      APPLY_SPECIFIC(_handle), desc,
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),

--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -339,6 +339,7 @@ def test_dnn_tag():
 class TestDnnInferShapes(utt.InferShapeTester):
    def setUp(self):
        super(TestDnnInferShapes, self).setUp()
        self.mode = mode_with_gpu