Cancel some changes into CEnumType.

Cancel changes into dnn_fwd.c. Heavy simplification of check_dnn. Make check_dnn runnable as a Python script.

Cancel some changes into CEnumType.
1b6e6389 · notoraptor · 6cc0c5ca · 1b6e6389 · 1b6e6389 · 1b6e6389
--- a/theano/gof/type.py
+++ b/theano/gof/type.py
@@ -909,11 +909,7 @@ class EnumType(Type, dict):

    .. note::

-        :class:`EnumType` is not complete and should never be used for regular graph operations.
-
-        :class:`EnumList` is not complete and should never be used for regular graph operations.
-
-        **:class:`CEnumType` is complete.**
+        This Type (and subclasses) is not complete and should never be used for regular graph operations.

    """

@@ -1053,9 +1049,6 @@ class EnumType(Type, dict):
        #ifndef PyInt_AsLong
            #define PyInt_AsLong PyLong_AsLong
        #endif
-        #ifndef PyInt_FromLong
-            #define PyInt_FromLong PyLong_FromLong
-        #endif
    #endif
    """

@@ -1248,22 +1241,5 @@ class CEnumType(EnumList):
                   """ % dict(i=i, name=name, constant_cname=swapped_dict[i]) for i in sorted(swapped_dict.keys())),
                   fail=sub['fail'])

-    def c_sync(self, name, sub):
-        return """
-        int py_value = -1;
-        Py_XDECREF(py_%(name)s);
-        /* We assume that ctype is an integer type usable in a switch. */
-        switch (%(name)s) {
-            %(cases)s
-            default:
-                PyErr_SetString(PyExc_ValueError, "CEnumType: cannot map C value to Python constant.");
-                {%(fail)s}
-                break;
-        }
-        py_%(name)s = PyInt_FromLong(py_value);
-        """ % dict(name=name, fail=sub['fail'], cases=''.join("""
-            case %(constant_cname)s: py_value = %(constant_pyvalue)d; break;
-        """ % dict(constant_cname=k, constant_pyvalue=v) for k, v in sorted(self.items(), key=lambda t: t[1])))
-
    def c_code_cache_version(self):
        return (1, super(CEnumType, self).c_code_cache_version())
--- a/theano/gpuarray/cudnn_defs.py
+++ b/theano/gpuarray/cudnn_defs.py
@@ -19,8 +19,27 @@ from __future__ import absolute_import, print_function, division

 from theano.gof import CEnumType

-
 HALF, FLOAT, DOUBLE = ('float16', 'float32', 'float64')
+TRUE_HALF_CONFIG = (HALF, HALF)
+PSEUDO_HALF_CONFIG = (HALF, FLOAT)
+FLOAT_CONFIG = (FLOAT, FLOAT)
+DOUBLE_CONFIG = (DOUBLE, DOUBLE)
+
+
+def is_true_half_config(dtype, precision):
+    return dtype == precision == HALF
+
+
+def is_pseudo_half_config(dtype, precision):
+    return dtype == HALF and precision == FLOAT
+
+
+def is_float_config(dtype, precision):
+    return dtype == precision == FLOAT
+
+
+def is_double_config(dtype, precision):
+    return dtype == precision == DOUBLE


 # NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
@@ -103,22 +122,97 @@ class CuDNNV51(object):
    # empty list of enum to don't crash with cudnn 5
    cudnnReduceTensorOp_t = CEnumType()

-    def supported_precisions(self, dtype):
+    def get_supported_dtype_configs(self):
        """
-        Return the tuple of precisions supported by cuDNN for given input data type.
+        Return the tuple of data type configurations supported by this version of cuDNN.
        This is currently convenient for both cuDNN V5.1 and V6, as Theano does not
        yet support new data types (like INT8, INT8x4, etc.).
        """
-        assert dtype in (HALF, FLOAT, DOUBLE)
-        if dtype == HALF:
-            # TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG
-            return (HALF, FLOAT)
-        if dtype == FLOAT:
-            # FLOAT_CONFIG
-            return (FLOAT,)
-        if dtype == DOUBLE:
-            # DOUBLE_CONFIG
-            return (DOUBLE,)
+        return (TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
+
+    def get_fwd_dtype_configs(self, check_runtime=None):
+        # NB: "TRUE_HALF_CONFIG is only supported on architectures with true fp16 support
+        # (compute capability 5.3 and 6.0)". Can be checked at runtime only.
+        if check_runtime is None or check_runtime(*TRUE_HALF_CONFIG):
+            return self.get_supported_dtype_configs()
+        return (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
+
+    def get_bwd_filter_dtype_configs(self, check_runtime=None):
+        return self.get_supported_dtype_configs()
+
+    def get_bwd_data_dtype_configs(self, check_runtime=None):
+        return self.get_supported_dtype_configs()
+
+    def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
+        algorithms = self.cudnnConvolutionFwdAlgo_t
+        algo = algorithms.fromalias(algo)
+        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
+            return not is_true_half_config(dtype, precision)
+        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
+            return ndim == 2 or not is_true_half_config(dtype, precision)
+        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
+            return ndim == 2 and not is_true_half_config(dtype, precision)
+        # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
+        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
+            return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
+        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
+            if ndim == 2:
+                return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
+                # NB: For cuDNN V6:
+                # " Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
+                # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
+                # ie, one of the filter dimension, width or height is 1)"
+                # Could be checked only when being in C code.
+            if ndim == 3:
+                return not is_true_half_config(dtype, precision)
+        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
+            return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
+        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
+            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
+            # We could not check it before being in C code.
+            return ndim == 2 and not is_double_config(dtype, precision)
+        return False
+
+    def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
+        algorithms = self.cudnnConvolutionBwdFilterAlgo_t
+        algo = algorithms.fromalias(algo)
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
+            return not is_true_half_config(dtype, precision)
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
+            return ndim == 2
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
+            return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
+            return not is_true_half_config(dtype, precision)
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
+            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
+            # We could not check it before being in C code.
+            return ndim == 2 and not is_double_config(dtype, precision)
+        return False
+
+    def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
+        algorithms = self.cudnnConvolutionBwdDataAlgo_t
+        algo = algorithms.fromalias(algo)
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
+            return not is_true_half_config(dtype, precision)
+        # NOTE(review): CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 is documented as supporting all configs but falls through to False.
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
+            return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
+            if ndim == 2:
+                return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
+                # NB: For cuDNN V6: "(DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
+                # ie, one of the filter dimension, width or height is 1)"
+                # Could be checked only when being in C code.
+            if ndim == 3:
+                return not is_true_half_config(dtype, precision)
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
+            return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
+        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
+            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
+            # We could not check it before being in C code.
+            return ndim == 2 and not is_double_config(dtype, precision)
+        return False


 class CuDNNV6(CuDNNV51):
@@ -162,6 +256,17 @@ class CuDNNV6(CuDNNV51):
                                      ('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
                                      ctype='cudnnReduceTensorOp_t')

+    def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
+        is_supported = super(CuDNNV6, self).bwd_filter_algo_supports_dtype_config(algo, dtype, precision, ndim)
+        if not is_supported:
+            algorithms = self.cudnnConvolutionBwdFilterAlgo_t
+            algo = algorithms.fromalias(algo)
+            if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
+                return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
+                                      is_float_config(dtype, precision) or
+                                      is_double_config(dtype, precision))
+        return is_supported
+

 class CuDNNV7(CuDNNV6):
    version = 7

--- a/theano/gpuarray/tests/check_dnn.py
+++ b/theano/gpuarray/tests/check_dnn.py
--- a/theano/gpuarray/tests/check_dnn_doc.py
+++ b/theano/gpuarray/tests/check_dnn_doc.py
--- a/theano/gpuarray/tests/dnn_choose_fwd.c
+++ b/theano/gpuarray/tests/dnn_choose_fwd.c
-#section init_code_struct
-
-reuse_algo = 0;
-prev_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-if (!PARAMS->choose_once) {
-  memset(prev_img_dims, 0, sizeof(prev_img_dims));
-  memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
-}
-
-#section support_code_struct
-
-int reuse_algo;
-cudnnConvolutionFwdAlgo_t prev_algo;
-size_t prev_img_dims[5];
-size_t prev_kern_dims[5];
-
-int
-APPLY_SPECIFIC(choose_fwd_algo)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
-                                PyGpuArrayObject *output,
-                                cudnnConvolutionDescriptor_t desc,
-                                cudnnConvolutionFwdAlgo_t *output_algo,
-                                PARAMS_TYPE* params) {
-  PyGpuContextObject *c = input->context;
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-
-  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
-    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
-    return 1;
-  }
-
-  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
-    return 1;
-  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
-    return 1;
-  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
-    return 1;
-
-
-  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-
-  cuda_enter(c->ctx);
-
-  int expected_output_dims[5] = {0};
-  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
-                                              PyGpuArray_NDIM(input), expected_output_dims);
-  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(c->ctx);
-    return 1;
-  }
-  if (PyGpuArray_NDIM(input) == 4) {
-    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
-        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
-        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
-      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
-                                     " but received gradient with shape %ldx%ldx%ldx%ld",
-                   expected_output_dims[0], expected_output_dims[1],
-                   expected_output_dims[2], expected_output_dims[3],
-                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
-                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
-      cuda_exit(c->ctx);
-      return 1;
-    }
-  } else if (PyGpuArray_NDIM(input) == 5) {
-    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
-        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
-        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
-        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
-      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
-                                     " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
-                   expected_output_dims[0], expected_output_dims[1],
-                   expected_output_dims[2], expected_output_dims[3],
-                   expected_output_dims[4],
-                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
-                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
-                   PyGpuArray_DIMS(output)[4]);
-      cuda_exit(c->ctx);
-      return 1;
-    }
-  }
-
-  if (!params->choose_once) {
-    reuse_algo = 1;
-    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
-      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
-      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
-    }
-  }
-
-  if (!reuse_algo) {
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
-    // Guess 4Mb if the info is not available
-    if (free == 0) free = 4 * 1024 * 1024;
-
-    if (params->choose_time) {
-      int count;
-      cudnnConvolutionFwdAlgoPerf_t choice;
-      gpudata *tmpmem;
-
-      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
-      if (tmpmem == NULL) {
-        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
-        return -1;
-      }
-
-      err = cudnnFindConvolutionForwardAlgorithmEx(
-        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
-        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
-        desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
-        1, &count, &choice, *(void **)tmpmem,
-        free);
-      gpudata_release(tmpmem);
-
-      if (err != CUDNN_STATUS_SUCCESS) {
-        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
-        cuda_exit(c->ctx);
-        return 1;
-      }
-
-      algo = choice.algo;
-    } else {
-      err = cudnnGetConvolutionForwardAlgorithm(
-        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
-        desc, APPLY_SPECIFIC(output),
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
-      if (err != CUDNN_STATUS_SUCCESS) {
-        PyErr_Format(PyExc_RuntimeError,
-                     "error selecting convolution algo: %s",
-                     cudnnGetErrorString(err));
-        cuda_exit(c->ctx);
-        return 1;
-      }
-    }
-    prev_algo = algo;
-  } else {
-    algo = prev_algo;
-  }
-
-  if (params->choose_once) {
-    reuse_algo = 1;
-  } else {
-    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
-      prev_img_dims[i] = PyGpuArray_DIM(input, i);
-      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
-    }
-  }
-
-  /* These two algos are not supported for 3d conv */
-  if (PyGpuArray_NDIM(input) == 5 &&
-      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
-       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
-    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-
-  // The FFT implementation does not support strides, 1x1 filters or inputs
-  // with a spatial dimension larger than 1024. The tiled-FFT implementation
-  // does not support strides.
-  // If the chosen implementation is FFT or tiled-FFT, validate that it can
-  // be used on the current data and default to a safe implementation if it
-  // can't.
-  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
-  // defined only for 2d filters
-  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
-       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
-
-    // Extract the properties of the convolution descriptor
-    int nd;
-    int pad[2];
-    int stride[2];
-    int dilation[2];
-    cudnnConvolutionMode_t mode;
-    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                             dilation, &mode, &data_type);
-    if (err != CUDNN_STATUS_SUCCESS) {
-      PyErr_Format(PyExc_RuntimeError,
-                   "error getting convolution properties: %s",
-                   cudnnGetErrorString(err));
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
-    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
-      if (stride[0] != 1 || stride[1] != 1 ||
-          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
-          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
-      {
-        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-      }
-    } else {
-      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
-      if (stride[0] != 1 || stride[1] != 1) {
-        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-      }
-    }
-  }
-
-  *output_algo = algo;
-  cuda_exit(c->ctx);
-  return 0;
-}
--- a/theano/gpuarray/tests/dnn_choose_gi.c
+++ b/theano/gpuarray/tests/dnn_choose_gi.c
-#section init_code_struct
-
-reuse_algo = 0;
-prev_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-if (!PARAMS->choose_once) {
-  memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
-  memset(prev_top_dims, 0, sizeof(prev_top_dims));
-}
-
-#section support_code_struct
-
-int reuse_algo;
-cudnnConvolutionBwdDataAlgo_t prev_algo;
-size_t prev_kern_dims[5];
-size_t prev_top_dims[5];
-
-int
-APPLY_SPECIFIC(choose_bwd_data_algo)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
-                                       PyGpuArrayObject *input,
-                                       cudnnConvolutionDescriptor_t desc,
-                                       cudnnConvolutionBwdDataAlgo_t *output_algo,
-                                       PARAMS_TYPE* params) {
-  PyGpuContextObject *c = kerns->context;
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-
-  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
-    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
-    return 1;
-  }
-
-  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
-    return 1;
-  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
-    return 1;
-  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
-    return 1;
-
-
-  cudnnConvolutionBwdDataAlgo_t algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-
-  cuda_enter(c->ctx);
-
-  int expected_output_dims[5] = {0};
-  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
-                                              PyGpuArray_NDIM(input), expected_output_dims);
-  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(c->ctx);
-    return 1;
-  }
-  if (PyGpuArray_NDIM(input) == 4) {
-    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
-        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
-        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
-      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
-                                     " but received gradient with shape %ldx%ldx%ldx%ld",
-                   expected_output_dims[0], expected_output_dims[1],
-                   expected_output_dims[2], expected_output_dims[3],
-                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
-                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
-      cuda_exit(c->ctx);
-      return 1;
-    }
-  } else if (PyGpuArray_NDIM(input) == 5) {
-    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
-        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
-        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
-        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
-      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
-                                     " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
-                   expected_output_dims[0], expected_output_dims[1],
-                   expected_output_dims[2], expected_output_dims[3],
-                   expected_output_dims[4],
-                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
-                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
-                   PyGpuArray_DIMS(output)[4]);
-      cuda_exit(c->ctx);
-      return 1;
-    }
-  }
-
-  if (!params->choose_once) {
-    reuse_algo = 1;
-    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
-      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
-      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
-    }
-  }
-
-  if (!reuse_algo) {
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
-    // Guess 4Mb if the info is not available
-    if (free == 0) free = 4 * 1024 * 1024;
-
-    if (params->choose_time) {
-      int count;
-      cudnnConvolutionBwdDataAlgoPerf_t choice;
-      gpudata *tmpmem;
-
-      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
-      if (tmpmem == NULL) {
-        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
-        return -1;
-      }
-
-      err = cudnnFindConvolutionBackwardDataAlgorithmEx(
-        params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
-        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
-        APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
-        1, &count, &choice, *(void **)tmpmem, free);
-      gpudata_release(tmpmem);
-
-      if (err != CUDNN_STATUS_SUCCESS) {
-        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
-        cuda_exit(c->ctx);
-        return 1;
-      }
-
-      algo = choice.algo;
-    } else {
-      err = cudnnGetConvolutionBackwardDataAlgorithm(
-        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
-        desc, APPLY_SPECIFIC(input),
-        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
-      if (err != CUDNN_STATUS_SUCCESS) {
-        PyErr_Format(PyExc_RuntimeError,
-                     "error selecting convolution algo: %s",
-                     cudnnGetErrorString(err));
-        cuda_exit(c->ctx);
-        return 1;
-      }
-    }
-    prev_algo = algo;
-  } else {
-    algo = prev_algo;
-  }
-
-  if (params->choose_once) {
-    reuse_algo = 1;
-  } else {
-    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
-      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
-      prev_top_dims[i] = PyGpuArray_DIM(output, i);
-    }
-  }
-
-  // The FFT implementation does not support strides, 1x1 filters or inputs
-  // with a spatial dimension larger than 1024. The tiled-FFT implementation
-  // does not support strides.
-  // If the chosen implementation is FFT or tiled-FFT, validate that it can
-  // be used on the current data and default to a safe implementation if it
-  // can't.
-  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
-  // defined only for 2d filters
-  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
-       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
-
-    // Extract the properties of the convolution descriptor
-    int nd;
-    int pad[2];
-    int stride[2];
-    int upscale[2];
-    cudnnConvolutionMode_t mode;
-    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
-    if (err != CUDNN_STATUS_SUCCESS) {
-      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
-    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
-      if (stride[0] != 1 || stride[1] != 1 ||
-          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
-          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
-      {
-        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-      }
-    } else {
-      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
-      if (stride[0] != 1 || stride[1] != 1) {
-        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-      }
-    }
-  }
-
-  *output_algo = algo;
-  cuda_exit(c->ctx);
-  return 0;
-}
--- a/theano/gpuarray/tests/dnn_choose_gw.c
+++ b/theano/gpuarray/tests/dnn_choose_gw.c
-#section init_code_struct
-
-reuse_algo = 0;
-prev_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-if (!PARAMS->choose_once) {
-  memset(prev_img_dims, 0, sizeof(prev_img_dims));
-  memset(prev_top_dims, 0, sizeof(prev_top_dims));
-}
-
-#section support_code_struct
-
-int reuse_algo;
-cudnnConvolutionBwdFilterAlgo_t prev_algo;
-size_t prev_img_dims[5];
-size_t prev_top_dims[5];
-
-int
-APPLY_SPECIFIC(choose_bwd_filter_algo)(PyGpuArrayObject *input, PyGpuArrayObject *output,
-                                       PyGpuArrayObject *kerns,
-                                       cudnnConvolutionDescriptor_t desc,
-                                       cudnnConvolutionBwdFilterAlgo_t *output_algo,
-                                       PARAMS_TYPE* params) {
-  PyGpuContextObject *c = input->context;
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-
-  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
-    PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size");
-    return 1;
-  }
-
-  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
-    return 1;
-  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
-    return 1;
-  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
-    return 1;
-
-  cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-
-  cuda_enter(c->ctx);
-
-  int expected_output_dims[5] = {0};
-  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
-                                              PyGpuArray_NDIM(input), expected_output_dims);
-  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(c->ctx);
-    return 1;
-  }
-  if (PyGpuArray_NDIM(input) == 4) {
-    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
-        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
-        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
-      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
-                                     " but received gradient with shape %ldx%ldx%ldx%ld",
-                   expected_output_dims[0], expected_output_dims[1],
-                   expected_output_dims[2], expected_output_dims[3],
-                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
-                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
-      cuda_exit(c->ctx);
-      return 1;
-    }
-  } else if (PyGpuArray_NDIM(input) == 5) {
-    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
-        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
-        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
-        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
-      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
-                                     " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
-                   expected_output_dims[0], expected_output_dims[1],
-                   expected_output_dims[2], expected_output_dims[3],
-                   expected_output_dims[4],
-                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
-                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
-                   PyGpuArray_DIMS(output)[4]);
-      cuda_exit(c->ctx);
-      return 1;
-    }
-  }
-
-  if (!params->choose_once) {
-    reuse_algo = 1;
-    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
-      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
-      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
-    }
-  }
-
-  if (!reuse_algo) {
-    size_t free;
-
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
-    // Guess 4Mb if the info is not available
-    if (free == 0) free = 4 * 1024 * 1024;
-
-    if (params->choose_time) {
-      int count;
-      cudnnConvolutionBwdFilterAlgoPerf_t choice;
-      gpudata *tmpmem;
-
-      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
-      if (tmpmem == NULL) {
-        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
-        return -1;
-      }
-
-      err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
-        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
-        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
-        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
-        1, &count, &choice, *(void **)tmpmem, free);
-      gpudata_release(tmpmem);
-
-      if (err != CUDNN_STATUS_SUCCESS) {
-        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
-        cuda_exit(c->ctx);
-        return 1;
-      }
-
-      algo = choice.algo;
-    } else {
-      err = cudnnGetConvolutionBackwardFilterAlgorithm(
-        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
-        desc, APPLY_SPECIFIC(kerns),
-        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
-      if (err != CUDNN_STATUS_SUCCESS) {
-        PyErr_Format(PyExc_RuntimeError,
-                     "error selecting convolution algo: %s",
-                     cudnnGetErrorString(err));
-        cuda_exit(c->ctx);
-        return 1;
-      }
-    }
-    prev_algo = algo;
-  } else {
-    algo = prev_algo;
-  }
-
-  if (params->choose_once) {
-    reuse_algo = 1;
-  } else {
-    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
-      prev_img_dims[i] = PyGpuArray_DIM(input, i);
-      prev_top_dims[i] = PyGpuArray_DIM(output, i);
-    }
-  }
-
-  // The FFT implementation does not support strides, 1x1 filters or inputs
-  // with a spatial dimension larger than 1024.
-  // If the chosen implementation is FFT, validate that it can
-  // be used on the current data and default to a safe implementation if it
-  // can't.
-  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
-  // defined only for 2d filters
-  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) {
-    // Extract the properties of the convolution descriptor
-    int nd;
-    int pad[2];
-    int stride[2];
-    int upscale[2];
-    cudnnConvolutionMode_t mode;
-    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
-    if (err != CUDNN_STATUS_SUCCESS) {
-      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
-    if (stride[0] != 1 || stride[1] != 1 ||
-        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
-        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
-      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-    }
-  }
-
-  *output_algo = algo;
-  cuda_exit(c->ctx);
-  return 0;
-}