提交 70b1100c authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Move the gradients of convolution over to v3 and fix the test for gradI

上级 59dcaf9c
...@@ -533,19 +533,27 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -533,19 +533,27 @@ class GpuDnnConvGradW(DnnBase, COp):
""" """
__props__ = ('inplace',) __props__ = ('algo', 'inplace')
def __init__(self, inplace=False): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self, 'inplace'): if not hasattr(self, 'inplace'):
self.inplace = False self.inplace = False
if not hasattr(self, 'algo'):
self.algo = config.dnn.conv.algo_bwd
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp img, top, output, desc, alpha, beta = inp
...@@ -566,24 +574,55 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -566,24 +574,55 @@ class GpuDnnConvGradW(DnnBase, COp):
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
return [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
if version() < 3000:
alg = '0'
else: else:
return [] alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img)
topgrad = as_gpuarray_variable(topgrad) topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if img.type.ndim != 4: if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D or 5D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim not in (4, 5):
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D tensor') raise TypeError('output must be 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (img.type.ndim != topgrad.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': img.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"img, topgrad and output must match")
if img.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
raise ValueError("convolution algo %s can't be used for "
                             "3d convolutions" % (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
...@@ -609,14 +648,27 @@ class GpuDnnConvGradI(DnnBase): ...@@ -609,14 +648,27 @@ class GpuDnnConvGradI(DnnBase):
""" """
__props__ = ('inplace',) __props__ = ('algo', 'inplace',)
def __init__(self, inplace=False): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'algo'):
self.algo = config.dnn.conv.algo_bwd
if not hasattr(self, 'inplace'):
self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp kerns, top, output, desc, alpha, beta = inp
...@@ -637,24 +689,55 @@ class GpuDnnConvGradI(DnnBase): ...@@ -637,24 +689,55 @@ class GpuDnnConvGradI(DnnBase):
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
return [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
if version() < 3000:
alg = '0'
else: else:
return [] alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_gpuarray_variable(kern) kern = as_gpuarray_variable(kern)
topgrad = as_gpuarray_variable(topgrad) topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if kern.type.ndim != 4: if kern.type.ndim not in (4, 5):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D or 5D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim not in (4, 5):
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D tensor') raise TypeError('output must be 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (kern.type.ndim != topgrad.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': kern.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
raise ValueError("convolution algo %s can't be used for "
                             "3d convolutions" % (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype) alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype)
...@@ -1638,7 +1721,7 @@ def local_dnn_convgw_inplace(node): ...@@ -1638,7 +1721,7 @@ def local_dnn_convgw_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(inplace=True)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True) @local_optimizer([GpuDnnConvGradI], inplace=True)
...@@ -1651,7 +1734,7 @@ def local_dnn_convgi_inplace(node): ...@@ -1651,7 +1734,7 @@ def local_dnn_convgi_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(inplace=True)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
...@@ -1674,7 +1757,7 @@ def local_dnn_conv_alpha_merge(node, *inputs): ...@@ -1674,7 +1757,7 @@ def local_dnn_conv_alpha_merge(node, *inputs):
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
...@@ -1682,28 +1765,28 @@ def local_dnn_convw_alpha_merge(node, *inputs): ...@@ -1682,28 +1765,28 @@ def local_dnn_convw_alpha_merge(node, *inputs):
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
......
...@@ -12,7 +12,7 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -12,7 +12,7 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#ifdef CUDNN_VERSION > 3000 #if CUDNN_VERSION > 3000
case GA_HALF: case GA_HALF:
dt = CUDNN_DATA_HALF; dt = CUDNN_DATA_HALF;
break; break;
...@@ -64,7 +64,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -64,7 +64,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#ifdef CUDNN_VERSION > 3000 #if CUDNN_VERSION > 3000
case GA_HALF: case GA_HALF:
dt = CUDNN_DATA_HALF; dt = CUDNN_DATA_HALF;
break; break;
......
...@@ -11,12 +11,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -11,12 +11,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void *beta_p; void *beta_p;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"GpuDnnConv images and kernel must have the same stack size"); "stack size");
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -48,19 +49,142 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -48,19 +49,142 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardData( cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_kern_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 0) > 1024 || PyGpuArray_DIM(*input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionBackwardData_v3(
_handle, _handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (worksize != 0)
c->ops->buffer_release(workspace);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
#section support_code_struct #section support_code_struct
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
...@@ -16,9 +16,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -16,9 +16,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
switch (input->ga.typecode) { switch (input->ga.typecode) {
...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -51,16 +52,140 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -51,16 +52,140 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardFilter( cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(*kerns, 0) == 1 && PyGpuArray_DIM(*kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionBackwardFilter_v3(
_handle, _handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (worksize != 0)
c->ops->buffer_release(workspace);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
...@@ -467,42 +467,41 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -467,42 +467,41 @@ class TestDnnInferShapes(utt.InferShapeTester):
img = T.ftensor4('img') img = T.ftensor4('img')
kerns = T.ftensor4('kerns') kerns = T.ftensor4('kerns')
out = T.ftensor4('out') out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(3, 4, 5, 6),
dtype='float32'
)
kern_vals = numpy.asarray( kern_vals = numpy.asarray(
numpy.random.rand(13, 14, 15, 16), numpy.random.rand(13, 14, 15, 16),
dtype='float32' dtype='float32'
) )
out_vals = numpy.asarray(
numpy.random.rand(3, 13, 5, 6),
dtype='float32'
)
for params in product( for params in product(
['valid'], # Should this work for 'full'? ['valid'], # Should this work for 'full'?
[(1, 1)], [(1, 1)],
['conv', 'cross'] ['conv', 'cross']
): ):
temp_kerns = kerns.dimshuffle(1, 0, 2, 3)
shape = ( shape = (
img_val.shape[0], kern_vals.shape[1], out_vals.shape[0], kern_vals.shape[1],
img_val.shape[2] + kern_vals.shape[2] - 1, out_vals.shape[2] + kern_vals.shape[2] - 1,
img_val.shape[3] + kern_vals.shape[3] - 1 out_vals.shape[3] + kern_vals.shape[3] - 1
) )
out_vals = numpy.zeros(shape, dtype='float32') img_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc( desc = dnn.GpuDnnConvDesc(
border_mode=params[0], border_mode=params[0],
subsample=params[1], subsample=params[1],
conv_mode=params[2] conv_mode=params[2]
)(out.shape, temp_kerns.shape) )(out.shape, kerns.shape)
conv_grad_i = dnn.GpuDnnConvGradI()( conv_grad_i = dnn.GpuDnnConvGradI()(
temp_kerns, kerns,
img,
out, out,
img,
desc, desc,
) )
self._compile_and_check( self._compile_and_check(
[temp_kerns, img, out], [kerns, img, out],
[conv_grad_i], [conv_grad_i],
[kern_vals, img_val, out_vals], [kern_vals, img_vals, out_vals],
dnn.GpuDnnConvGradI dnn.GpuDnnConvGradI
) )
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论