Commit 1b6e6389 authored by notoraptor

Cancel some changes into CEnumType.

Cancel changes into dnn_fwd.c Heavy simplification of check_dnn. Make check_dnn runnable as a python script.
Parent 6cc0c5ca
......@@ -909,11 +909,7 @@ class EnumType(Type, dict):
.. note::
:class:`EnumType` is not complete and should never be used for regular graph operations.
:class:`EnumList` is not complete and should never be used for regular graph operations.
**:class:`CEnumType` is complete.**
This Type (and subclasses) is not complete and should never be used for regular graph operations.
"""
......@@ -1053,9 +1049,6 @@ class EnumType(Type, dict):
#ifndef PyInt_AsLong
#define PyInt_AsLong PyLong_AsLong
#endif
#ifndef PyInt_FromLong
#define PyInt_FromLong PyLong_FromLong
#endif
#endif
"""
......@@ -1248,22 +1241,5 @@ class CEnumType(EnumList):
""" % dict(i=i, name=name, constant_cname=swapped_dict[i]) for i in sorted(swapped_dict.keys())),
fail=sub['fail'])
def c_sync(self, name, sub):
return """
int py_value = -1;
Py_XDECREF(py_%(name)s);
/* We assume that ctype is an integer type usable in a switch. */
switch (%(name)s) {
%(cases)s
default:
PyErr_SetString(PyExc_ValueError, "CEnumType: cannot map C value to Python constant.");
{%(fail)s}
break;
}
py_%(name)s = PyInt_FromLong(py_value);
""" % dict(name=name, fail=sub['fail'], cases=''.join("""
case %(constant_cname)s: py_value = %(constant_pyvalue)d; break;
""" % dict(constant_cname=k, constant_pyvalue=v) for k, v in sorted(self.items(), key=lambda t: t[1])))
def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version())
......@@ -19,8 +19,27 @@ from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# Symbolic names for the floating-point data types handled by cuDNN.
HALF, FLOAT, DOUBLE = ('float16', 'float32', 'float64')

# (data type, computation precision) pairs, named as in the cuDNN documentation.
TRUE_HALF_CONFIG = (HALF, HALF)
PSEUDO_HALF_CONFIG = (HALF, FLOAT)
FLOAT_CONFIG = (FLOAT, FLOAT)
DOUBLE_CONFIG = (DOUBLE, DOUBLE)


def is_true_half_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's TRUE_HALF_CONFIG."""
    return (dtype, precision) == TRUE_HALF_CONFIG


def is_pseudo_half_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's PSEUDO_HALF_CONFIG."""
    return (dtype, precision) == PSEUDO_HALF_CONFIG


def is_float_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's FLOAT_CONFIG."""
    return (dtype, precision) == FLOAT_CONFIG


def is_double_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's DOUBLE_CONFIG."""
    return (dtype, precision) == DOUBLE_CONFIG
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
......@@ -103,22 +122,97 @@ class CuDNNV51(object):
# empty list of enum to don't crash with cudnn 5
cudnnReduceTensorOp_t = CEnumType()
def supported_precisions(self, dtype):
def get_supported_dtype_configs(self):
"""
Return the tuple of precisions supported by cuDNN for given input data type.
Return the tuple of data type configurations supported by this version of cuDNN.
This is currently convenient for both cuDNN V5.1 and V6, as Theano does not
yet support new data types (like INT8, INT8x4, etc.).
"""
assert dtype in (HALF, FLOAT, DOUBLE)
if dtype == HALF:
# TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG
return (HALF, FLOAT)
if dtype == FLOAT:
# FLOAT_CONFIG
return (FLOAT,)
if dtype == DOUBLE:
# DOUBLE_CONFIG
return (DOUBLE,)
return (TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
def get_fwd_dtype_configs(self, check_runtime=None):
    """Return the dtype configurations usable for FWD convolution.

    TRUE_HALF_CONFIG "is only supported on architectures with true fp16
    support (compute capability 5.3 and 6.0)", which only a runtime check
    can detect.

    :param check_runtime: optional callable taking (dtype, precision) and
        returning True if that configuration actually runs on this GPU.
        When None, TRUE_HALF_CONFIG is optimistically kept.
    """
    if check_runtime is not None and not check_runtime(*TRUE_HALF_CONFIG):
        # fp16 hardware check failed: drop TRUE_HALF_CONFIG.
        return (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
    return self.get_supported_dtype_configs()
def get_bwd_filter_dtype_configs(self, check_runtime=None):
    """Return dtype configurations supported for BWD-filter (gradient of weights) convolution.

    :param check_runtime: unused; accepted for API symmetry with
        get_fwd_dtype_configs (no runtime-only restriction is applied here).
    """
    return self.get_supported_dtype_configs()
def get_bwd_data_dtype_configs(self, check_runtime=None):
    """Return dtype configurations supported for BWD-data (gradient of inputs) convolution.

    :param check_runtime: unused; accepted for API symmetry with
        get_fwd_dtype_configs (no runtime-only restriction is applied here).
    """
    return self.get_supported_dtype_configs()
def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Tell whether a cuDNN FWD convolution algorithm supports a data type configuration.

    Encodes the "Data Type Config Support" entries of the cuDNN documentation
    for cudnnConvolutionForward, for this cuDNN version.

    :param algo: algorithm alias or enum constant (resolved through fromalias).
    :param dtype: data type of inputs/outputs ('float16', 'float32' or 'float64').
    :param precision: computation precision.
    :param ndim: number of convolved dimensions (2 or 3).
    :return: True if the (dtype, precision) configuration is supported.
    """
    algorithms = self.cudnnConvolutionFwdAlgo_t
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
        # Everything except TRUE_HALF_CONFIG.
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
        # 2D: all configs; 3D: everything except TRUE_HALF_CONFIG.
        return ndim == 2 or not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
        # 2D only, TRUE_HALF_CONFIG excluded.
        return ndim == 2 and not is_true_half_config(dtype, precision)
    # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
        if ndim == 2:
            return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
        # NB: For cuDNN V6:
        # " Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
        # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    # Unknown or unimplemented algorithm: treated as unsupported.
    return False
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Tell whether a cuDNN BWD-filter (gradient of weights) algorithm supports a data type configuration.

    Encodes the "Data Type Config Support" entries of the cuDNN documentation
    for cudnnConvolutionBackwardFilter, for this cuDNN version.

    :param algo: algorithm alias or enum constant (resolved through fromalias).
    :param dtype: data type of inputs/outputs ('float16', 'float32' or 'float64').
    :param precision: computation precision.
    :param ndim: number of convolved dimensions (2 or 3).
    :return: True if the (dtype, precision) configuration is supported.
    """
    algorithms = self.cudnnConvolutionBwdFilterAlgo_t
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
        # Everything except TRUE_HALF_CONFIG.
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
        # 2D only; all data type configurations accepted.
        return ndim == 2
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    # Unknown or unimplemented algorithm: treated as unsupported.
    return False
def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Tell whether a cuDNN BWD-data (gradient of inputs) algorithm supports a data type configuration.

    Encodes the "Data Type Config Support" entries of the cuDNN documentation
    for cudnnConvolutionBackwardData, for this cuDNN version.

    :param algo: algorithm alias or enum constant (resolved through fromalias).
    :param dtype: data type of inputs/outputs ('float16', 'float32' or 'float64').
    :param precision: computation precision.
    :param ndim: number of convolved dimensions (2 or 3).
    :return: True if the (dtype, precision) configuration is supported.
    """
    algorithms = self.cudnnConvolutionBwdDataAlgo_t
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
        # Everything except TRUE_HALF_CONFIG.
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
        # All data type configs supported (per cuDNN doc). Previously this
        # case only had a comment and fell through to `return False`,
        # contradicting the comment.
        return True
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
        if ndim == 2:
            return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
        # NB: For cuDNN V6: "(DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
        # Bug fix: parenthesize the `or`. `and` binds tighter than `or`, so
        # the previous unparenthesized form accepted FLOAT_CONFIG for any
        # ndim (e.g. 3D), contradicting the cuDNN doc (WINOGRAD is 2D-only)
        # and the parallel FWD_ALGO_WINOGRAD case.
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    # Unknown or unimplemented algorithm: treated as unsupported.
    return False
class CuDNNV6(CuDNNV51):
......@@ -162,6 +256,17 @@ class CuDNNV6(CuDNNV51):
('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
ctype='cudnnReduceTensorOp_t')
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Extend the v5.1 BWD-filter support table with cuDNN v6 additions.

    cuDNN v6 adds FFT_TILING support for the backward-filter pass
    (2D, PSEUDO_HALF / FLOAT / DOUBLE configurations).
    """
    parent_ok = super(CuDNNV6, self).bwd_filter_algo_supports_dtype_config(
        algo, dtype, precision, ndim)
    if parent_ok:
        return parent_ok
    enum = self.cudnnConvolutionBwdFilterAlgo_t
    if enum.fromalias(algo) == enum.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision) or
                              is_double_config(dtype, precision))
    return parent_ok
class CuDNNV7(CuDNNV6):
version = 7
......
#!/usr/bin/env python
# You can pass nosetests args when running this script. Examples:
# python theano/gpuarray/tests/check_dnn.py # Normal mode.
# python theano/gpuarray/tests/check_dnn.py -xvs # Verbose mode, capture output, exit at first error.
from __future__ import absolute_import, print_function, division
from itertools import ifilter, product
from itertools import ifilter, product, chain
import nose
import numpy as np
from nose.plugins.skip import SkipTest
import theano
import theano.tests.unittest_tools as utt
from theano.compile.ops import shape_i_op
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from theano.gof import COp, Apply, ParamsType
from theano.gof.type import CDataType
from theano.gpuarray import cudnn_defs
from theano.gpuarray.basic_ops import infer_context_name, as_gpuarray_variable, gpu_contiguous, GpuAllocEmpty
from theano.gpuarray.dnn import (GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision,
DnnBase, handle_type, DNN_CONV_ALGO_CHOOSE_ONCE, DNN_CONV_ALGO_CHOOSE_TIME)
from theano.gpuarray.tests.check_dnn_doc import check_fwd_algorithm
from theano.gpuarray.dnn import GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision
from theano.gpuarray.tests.config import mode_with_gpu, ref_cast
from theano.scalar import bool as bool_t
from theano.tensor.nnet.abstract_conv import get_conv_output_shape, assert_conv_shape
from theano.tensor.opt import Assert
cudnn = cudnn_defs.get_definitions(version(raises=False))
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
cudnnConvolutionBwdFilterAlgo_t = cudnn.cudnnConvolutionBwdFilterAlgo_t
cudnnConvolutionBwdDataAlgo_t = cudnn.cudnnConvolutionBwdDataAlgo_t
AVAILABLE_PRECISIONS = cudnn.supported_precisions(theano.config.floatX)
class DnnCaseGenerator:
"""
......@@ -81,11 +76,11 @@ class DnnCaseGenerator:
@staticmethod
def get_if_valid_conv_output_shape(case_tuple):
out_shp = get_conv_output_shape(case_tuple[0][0], # input shape
case_tuple[0][1], # filter shape
case_tuple[1], # border mode
case_tuple[0][2], # subsample
case_tuple[0][3] # dilation
out_shp = get_conv_output_shape(case_tuple[0], # input shape
case_tuple[1], # filter shape
case_tuple[4], # border mode
case_tuple[2], # subsample
case_tuple[3] # dilation
)
try:
return assert_conv_shape(out_shp)
......@@ -94,7 +89,7 @@ class DnnCaseGenerator:
def get_cases(self):
# Generate an iterator of tuples with format:
# ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta )
# (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
all_batch_sizes = (self.batch_size,)
all_input_channels = (self.input_channels,)
all_input_sizes = self._shapes(self.input_size)
......@@ -114,7 +109,7 @@ class DnnCaseGenerator:
all_filter_shapes = ((oc, ic) + fis
for oc in all_output_channels for ic in all_input_channels for fis in all_filter_sizes)
return ifilter(DnnCaseGenerator.get_if_valid_conv_output_shape,
product(product(all_input_shapes, all_filter_shapes, all_subsamples, all_dilations),
product(all_input_shapes, all_filter_shapes, all_subsamples, all_dilations,
all_border_modes, all_conv_modes, all_alphas, all_betas))
......@@ -142,14 +137,14 @@ def dnn_conv(img, kerns, alpha=1, beta=0, out=None, border_mode='valid', subsamp
desc_op.subsample,
filter_dilation=dilation)
out_shp = assert_conv_shape(out_shp)
if beta != 0:
if beta == 0:
real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
else:
assert out is not None
out = as_gpuarray_variable(out, ctx_name)
out = gpu_contiguous(out)
check = Assert('GpuDnnConv: qiven output (for beta not null) does not have expected shape')
real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, out_shp)))
else:
real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, real_out, desc, alpha, beta)
......@@ -207,78 +202,28 @@ def dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mod
return GpuDnnConvGradI(algo=algo)(kerns, topgrad, real_out, desc, alpha, beta)
class BaseGpuDnnConvChooseAlgo(DnnBase):
"""
This class and its subclasses allow to retrieve a cuDNN algorithm
at runtime without any computation, given the user choose option
(time_once, time_on_shape_change, guess_once or guess_on_shape_change).
To help reduce whole test time, I suggest we use these classes when
algo is one of choose options, as any chosen algorithm would have
been tested by the other exhaustive tests.
"""
_f16_ok = True
check_input = False
__props__ = ('choice',)
params_type = ParamsType(choose_once=bool_t, choose_time=bool_t, handle=handle_type)
# Abstract attributes.
func_file = None
func_name = None
def __init__(self, choice):
COp.__init__(self, ["../dnn_base.c", "../dnn_conv_base.c", self.func_file], self.func_name)
assert choice in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choice = choice
self.choose_once = self.choice in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.choice in DNN_CONV_ALGO_CHOOSE_TIME
def dnn_context(self, node):
return node.inputs[0].type.context_name
def _prepare_inputs(self, i1, name_i1, i2, name_i2, output, desc):
ctx_name = infer_context_name(i1, i2, output)
i1 = as_gpuarray_variable(i1, ctx_name)
i2 = as_gpuarray_variable(i2, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if i1.type.ndim not in (4, 5):
raise TypeError('%s must be 4D or 5D tensor' % name_i1)
if i2.type.ndim not in (4, 5):
raise TypeError('%s must be 4D or 5D tensor' % name_i2)
if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D or 5D tensor')
if i1.type.ndim != i2.type.ndim or i1.type.ndim != output.type.ndim:
raise TypeError("The number of dimensions of %s, %s and output must match" % (name_i1, name_i2))
if not isinstance(desc.type, CDataType) or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
return (i1, i2, output, desc)
class GpuDnnConvChooseFwdAlgo(BaseGpuDnnConvChooseAlgo):
func_file = 'dnn_choose_fwd.c'
func_name = 'APPLY_SPECIFIC(choose_fwd_algo)'
def check_fwd_dtype_config_support(dtype, precision):
    """Return True if a FWD convolution runs with this (dtype, precision) on the current GPU.

    Compiles and executes a tiny 2D cuDNN convolution. A RuntimeError
    mentioning CUDNN_STATUS_ARCH_MISMATCH means the configuration
    (typically TRUE_HALF_CONFIG) is not supported by the hardware.
    """
    inputs_shape = (1, 1, 3, 3)
    filters_shape = (1, 1, 2, 2)
    inputs = np.zeros(inputs_shape, dtype=dtype)
    filters = np.zeros(filters_shape, dtype=dtype)
    inputs = theano.shared(inputs)
    filters = theano.shared(filters)
    conv = dnn_conv(inputs, filters, precision=precision)
    f = theano.function([], conv, mode=mode_with_gpu)
    try:
        f()
    except RuntimeError as e:
        # Bug fix: `e.message` exists only on Python 2; str(e) works on
        # both Python 2 and 3 (this file uses __future__ imports for
        # py2/3 compatibility).
        assert 'CUDNN_STATUS_ARCH_MISMATCH' in str(e)
        return False
    return True
def make_node(self, img, kern, output, desc):
img, kern, output, desc = self._prepare_inputs(img, 'img', kern, 'kern', output, desc)
return Apply(self, [img, kern, output, desc], [cudnn.cudnnConvolutionFwdAlgo_t()])
class GpuDnnConvChooseBwdFilterAlgo(BaseGpuDnnConvChooseAlgo):
func_file = 'dnn_choose_gw.c'
func_name = 'APPLY_SPECIFIC(choose_bwd_filter_algo)'
def make_node(self, img, topgrad, output, desc):
img, topgrad, output, desc = self._prepare_inputs(img, 'img', topgrad, 'topgrad', output, desc)
return Apply(self, [img, topgrad, output, desc], [cudnn.cudnnConvolutionBwdFilterAlgo_t()])
class GpuDnnConvChooseBwdDataAlgo(BaseGpuDnnConvChooseAlgo):
func_file = 'dnn_choose_gi.c'
func_name = 'APPLY_SPECIFIC(choose_bwd_data_algo)'
def make_node(self, kern, topgrad, output, desc):
kern, topgrad, output, desc = self._prepare_inputs(kern, 'kern', topgrad, 'topgrad', output, desc)
return Apply(self, [kern, topgrad, output, desc], [cudnn.cudnnConvolutionBwdDataAlgo_t()])
def test_fwd_true_half_config_support():
    """Skip-guard test: fail fast (as a skip) when the GPU lacks true fp16 FWD support."""
    # For cuDNN V5.1 and V6.0:
    # "TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0)"
    if not check_fwd_dtype_config_support('float16', 'float16'):
        raise SkipTest('FWD: TRUE_HALF_CONFIG not supported on this GPU.')
class BaseTestDnnConv(object):
......@@ -287,10 +232,6 @@ class BaseTestDnnConv(object):
to run actual tests.
"""
_functions_checked_for_fwd = False
_functions_checked_for_gradinput = False
_functions_checked_for_gradweight = False
# Abstract attributes.
ndim = 2
......@@ -303,25 +244,25 @@ class BaseTestDnnConv(object):
cpu_gradinput_class = None
cpu_gradweight_class = None
# Utility methods.
def get_cases(self):
# Return an iterable of test cases. Each test case is a tuple (or list) with following syntax:
# ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta )
# (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
generator = DnnCaseGenerator(ndim=self.ndim)
return generator.get_cases()
# Run and utility methods.
def array_like_conv_output(self, inputs_shape, filters_shape, border_mode, subsample, dilation):
def array_like_conv_output(self, inputs_shape, filters_shape, border_mode, subsample, dilation, dtype):
# Return an random array with inferred convolution output shape.
out_shp = get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation)
out_shp = assert_conv_shape(out_shp)
return np.random.random(out_shp).astype(theano.config.floatX)
return np.random.random(out_shp).astype(dtype)
def run_conv_fwd(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
def run_conv_fwd(self, algo, dtype, precision, parameters):
inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
inputs_val = np.random.random(inputs_shape).astype(dtype)
filters_val = np.random.random(filters_shape).astype(dtype)
# Scale down the input values to prevent very large absolute errors
# due to float rounding
......@@ -331,8 +272,11 @@ class BaseTestDnnConv(object):
inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val)
out = None if beta == 0 else self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample,
dilation)
if beta == 0:
out = None
else:
out = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
out /= 10
# Compile a theano function for the cuDNN implementation
conv = dnn_conv(img=inputs, kerns=filters, alpha=alpha, beta=beta, out=out, border_mode=border_mode,
subsample=subsample, dilation=dilation, conv_mode=conv_mode, algo=algo, precision=precision)
......@@ -354,15 +298,6 @@ class BaseTestDnnConv(object):
filter_dilation=dilation)(ref_cast(inputs), flipped_filters)
f_ref = theano.function([], conv_ref, mode="FAST_RUN")
if not self._functions_checked_for_fwd:
self._functions_checked_for_fwd = True
assert any(isinstance(node.op, GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConvGradI, GpuDnnConvGradW))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
......@@ -371,19 +306,26 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2)
# Raise tolerance for float16
rtol = 6e-2 if theano.config.floatX == 'float16' else None
rtol = 6e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
# print('(conv: beta not null) ', end='')
utt.assert_allclose(alpha * res_ref + beta * out, res, rtol=rtol)
def run_conv_gradinput(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
def run_conv_gradinput(self, algo, dtype, precision, parameters):
inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
if beta == 0:
inputs_val = None
else:
inputs_val = np.random.random(inputs_shape).astype(dtype)
inputs_val /= 10
filters_val = np.random.random(filters_shape).astype(dtype)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
# Scale down the input values to prevent absolute errors in utt.assert_allclose.
filters_val /= 10
topgrad_val /= 10
filters = theano.shared(filters_val)
topgrad = theano.shared(topgrad_val)
......@@ -412,15 +354,6 @@ class BaseTestDnnConv(object):
)(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:])
f_ref = theano.function([], grad_i_ref, mode="FAST_RUN")
if not self._functions_checked_for_gradinput:
self._functions_checked_for_gradinput = True
assert any(isinstance(node.op, GpuDnnConvGradI) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
......@@ -429,15 +362,26 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2)
# Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None
utt.assert_allclose(alpha * res_ref + beta * inputs_val, res, rtol=rtol)
rtol = 5e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
utt.assert_allclose(alpha * res_ref + beta * inputs_val, res, rtol=rtol)
def run_conv_gradweight(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
def run_conv_gradweight(self, algo, dtype, precision, parameters):
inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
inputs_val = np.random.random(inputs_shape).astype(dtype)
if beta == 0:
filters_val = None
else:
filters_val = np.random.random(filters_shape).astype(dtype)
filters_val /= 10
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
# Scale down the input values to prevent absolute errors in utt.assert_allclose.
inputs_val /= 10
topgrad_val /= 10
inputs = theano.shared(inputs_val)
topgrad = theano.shared(topgrad_val)
......@@ -458,15 +402,6 @@ class BaseTestDnnConv(object):
grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1]
f_ref = theano.function([], grad_w_ref, mode="FAST_RUN")
if not self._functions_checked_for_gradweight:
self._functions_checked_for_gradweight = True
assert any(isinstance(node.op, GpuDnnConvGradW) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradI))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
......@@ -475,119 +410,58 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2)
# Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None
utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol)
def run_choose_runtime_algos(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
out_shp = assert_conv_shape(
get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation))
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val)
topgrad = theano.shared(topgrad_val)
ctx_name = infer_context_name(inputs, topgrad)
desc_filter = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(filters_shape)
array_like_filters = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*filters_shape)
array_like_inputs = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*inputs_shape)
array_like_conv_output = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*out_shp)
algo_filter = GpuDnnConvChooseBwdFilterAlgo(algo)(inputs, topgrad, array_like_filters, desc_filter)
algo_input = GpuDnnConvChooseBwdDataAlgo(algo)(filters, topgrad, array_like_inputs, desc_filter)
algo_conv = GpuDnnConvChooseFwdAlgo(algo)(inputs, filters, array_like_conv_output, desc_filter)
f = theano.function([], [algo_filter, algo_input, algo_conv], mode=mode_with_gpu)
# Just test that it runs.
algo_filter_val, algo_input_val, algo_conv_val = f()
# How to test if it "works" ?
rtol = 5e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol)
def get_expected_tcount(self):
"""
Utility function to get expected test count
without actually run nosetests.
"""
len_cases = 0
for c in self.get_cases():
len_cases += 1
print(len_cases, 'conv cases for %dD' % self.ndim)
return len(AVAILABLE_PRECISIONS) * len_cases * len(self.fwd_algorithms +
self.bwd_data_algorithms +
self.bwd_filter_algorithms +
SUPPORTED_DNN_CONV_ALGO_RUNTIME)
len_cases = sum(1 for case in self.get_cases())
count_contexts = 0
for dtype, precision in cudnn.get_fwd_dtype_configs(check_runtime=check_fwd_dtype_config_support):
algos = (algo for algo in self.fwd_algorithms
if cudnn.fwd_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
for dtype, precision in cudnn.get_bwd_data_dtype_configs():
algos = (algo for algo in self.bwd_data_algorithms
if cudnn.bwd_data_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
for dtype, precision in cudnn.get_bwd_filter_dtype_configs():
algos = (algo for algo in self.bwd_filter_algorithms
if cudnn.bwd_filter_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
return len_cases * count_contexts
# Iterable test methods.
def test_fwd(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms, self.get_cases()):
yield (self.run_conv_fwd, algo, precision, parameters)
for dtype, precision in cudnn.get_fwd_dtype_configs(check_runtime=check_fwd_dtype_config_support):
algos = (algo for algo in self.fwd_algorithms
if cudnn.fwd_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_fwd, algo, dtype, precision, parameters)
def test_gradinput(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_data_algorithms, self.get_cases()):
yield (self.run_conv_gradinput, algo, precision, parameters)
for dtype, precision in cudnn.get_bwd_data_dtype_configs():
algos = (algo for algo in self.bwd_data_algorithms
if cudnn.bwd_data_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_gradinput, algo, dtype, precision, parameters)
def test_gradweight(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_filter_algorithms, self.get_cases()):
yield (self.run_conv_gradweight, algo, precision, parameters)
def test_choose_runtime_algos(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, SUPPORTED_DNN_CONV_ALGO_RUNTIME,
self.get_cases()):
yield (self.run_choose_runtime_algos, algo, precision, parameters)
def check_fwd_predictions(self):
"""
Call this method to check if tests fail when they
don't follow cuDNN V5.1 doc conditions for FWD algorithms.
Script will exit as soon as there is a test that does not fail when expected.
"""
print()
print('TESTING FWD FAILURES PREDICTED FOR %dD' % self.ndim)
count = 0
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms,
self.get_cases()):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
# Scale down the input values to prevent very large absolute errors
# due to float rounding
inputs_val /= 10
filters_val /= 10
out = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
desc_op = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)
should_compute = check_fwd_algorithm(inputs_val, filters_val, out, desc_op,
algo, precision, subsample, dilation)
if not should_compute.ok:
infos = ['ndim : %s' % (len(inputs_shape) - 2),
'precision : %s' % precision]
infos += should_compute.messages
try:
self.run_conv_fwd(algo, precision, parameters)
except Exception as e:
print('(FAILS as expected)', algo, precision, parameters)
print(e.message.split('\n')[0])
for info in infos:
print(info)
# exit(0)
else:
print('**SHOULD FAIL**|', algo, precision, parameters)
for info in infos:
print(info)
exit(-1)
count += 1
if count % 200 == 0:
print(count, 'passed')
print(count, 'finished')
for dtype, precision in cudnn.get_bwd_filter_dtype_configs():
algos = (algo for algo in self.bwd_filter_algorithms
if cudnn.bwd_filter_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_gradweight, algo, dtype, precision, parameters)
class TestDnnConv2D(BaseTestDnnConv):
......@@ -615,23 +489,39 @@ class TestDnnConv3D(BaseTestDnnConv):
if __name__ == '__main__':
def dtype_config_to_str(dtype_config):
    """Return the cuDNN documentation name of a (dtype, precision) configuration.

    Raises ValueError for any pair outside the four documented configurations.
    """
    dtype, precision = dtype_config
    known_configs = {
        ('float16', 'float16'): 'TRUE_HALF_CONFIG',
        ('float16', 'float32'): 'PSEUDO_HALF_CONFIG',
        ('float32', 'float32'): 'FLOAT_CONFIG',
        ('float64', 'float64'): 'DOUBLE_CONFIG',
    }
    if (dtype, precision) in known_configs:
        return known_configs[(dtype, precision)]
    raise ValueError
test_2d = TestDnnConv2D()
test_3d = TestDnnConv3D()
print()
print('Available data type configurations :',
', '.join(dtype_config_to_str(d) for d in cudnn.get_supported_dtype_configs()))
print()
print('2D algorithms:')
print('FWD :', test_2d.fwd_algorithms)
print('BWD FILTER:', test_2d.bwd_filter_algorithms)
print('BWD DATA :', test_2d.bwd_data_algorithms)
print('FWD :', ', '.join(test_2d.fwd_algorithms))
print('BWD FILTER :', ', '.join(test_2d.bwd_filter_algorithms))
print('BWD DATA :', ', '.join(test_2d.bwd_data_algorithms))
print()
print('3D algorithms:')
print('FWD :', test_3d.fwd_algorithms)
print('BWD FILTER:', test_3d.bwd_filter_algorithms)
print('BWD DATA :', test_3d.bwd_data_algorithms)
print('FWD :', ', '.join(test_3d.fwd_algorithms))
print('BWD FILTER :', ', '.join(test_3d.bwd_filter_algorithms))
print('BWD DATA :', ', '.join(test_3d.bwd_data_algorithms))
print()
count_tests_2d = test_2d.get_expected_tcount()
count_tests_3d = test_3d.get_expected_tcount()
print(count_tests_2d, 'total cases for 2D.')
print(count_tests_3d, 'total cases for 3D.')
print(count_tests_2d + count_tests_3d, 'total cases.')
import sys
if len(sys.argv) == 2 and sys.argv[1] == 'run':
test_2d.check_fwd_predictions()
test_3d.check_fwd_predictions()
print(count_tests_2d, 'conv2D test cases.')
print(count_tests_3d, 'conv3D test cases.')
print(count_tests_2d + count_tests_3d, 'total conv test cases.')
print()
nose.main(defaultTest='theano.gpuarray.tests.check_dnn')
"""
This module is just a collection of definitions to be used by `check_dnn.py`.
The following classes, functions and definitions are used to check whether
tests fail as expected when the conditions listed in the cuDNN documentation are not met.
Checking is currently implemented only for the 2D/3D FWD algorithms of cuDNN V5.1,
and in practice, many tests pass even when they do not satisfy the documented cuDNN conditions.
Therefore, it may be better to simply run all the tests and check by hand
which tests pass, which tests fail, and why they fail.
Reminder:
N: batch number
C: number of feature maps
D: depth
H: height
W: width
NB: We assume that we **always** use NC(D)HW tensors in Theano.
"""
from __future__ import absolute_import, print_function, division
import theano
from ..cudnn_defs import HALF, FLOAT, DOUBLE, get_definitions
from ..dnn import version
# Data type configuration codes. UNKNOWN marks a (floatX, precision) pair
# that does not match any configuration listed in the cuDNN documentation.
UNKNOWN, TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG = -1, 0, 1, 2, 3
# cuDNN definitions matching the locally available cuDNN version.
# NOTE(review): presumably version(raises=False) returns a sentinel when
# cuDNN is unavailable -- confirm get_definitions handles that case.
cudnn = get_definitions(version(raises=False))
# Enum type of the cuDNN forward convolution algorithms.
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
class Success:
    """Result of a check: a success flag plus explanatory messages."""

    ok = True
    # Class-level default; every instance gets its own list in __init__.
    messages = []

    def __init__(self, messages=()):
        # BUG FIX (smell): the default used to be a mutable list literal.
        # An immutable tuple default avoids the shared-mutable-default trap;
        # list() copies whatever iterable the caller passed.
        self.messages = list(messages)

    def add_message(self, *parts):
        # Concatenate all parts (stringified) into a single message.
        self.messages.append(''.join(str(part) for part in parts))
class Failure(Success):
    """Check result meaning the check did not pass; messages explain why."""
    ok = False
def _and(*tests):
    """Require every (predicate, description) pair to pass.

    Each element of `tests` is a tuple (callable returning a truthy value,
    description string). Returns a Failure listing the descriptions of all
    predicates that returned false, or a Success when every predicate passed.
    """
    failed_descriptions = [description for predicate, description in tests
                           if not predicate()]
    if failed_descriptions:
        return Failure(failed_descriptions)
    return Success()
def _or(*tests):
    """Require at least one (predicate, description) pair to pass.

    Returns a Success as soon as any predicate returns true; otherwise
    returns a Failure listing the descriptions of every predicate tried
    before giving up.
    """
    failed_descriptions = []
    for predicate, description in tests:
        if predicate():
            return Success()
        failed_descriptions.append(description)
    return Failure(failed_descriptions)
def type_conf(precision):
    """Map the current floatX plus `precision` to a dtype configuration code.

    All Op input tensors are floatX tensors, so the configuration is the
    pair (theano.config.floatX, precision). Returns UNKNOWN for pairs not
    listed in the cuDNN documentation.
    """
    dtype = theano.config.floatX
    known_configs = {
        (HALF, HALF): TRUE_HALF_CONFIG,
        (HALF, FLOAT): PSEUDO_HALF_CONFIG,
        (FLOAT, FLOAT): FLOAT_CONFIG,
        (DOUBLE, DOUBLE): DOUBLE_CONFIG,
    }
    return known_configs.get((dtype, precision), UNKNOWN)
def type_conf_to_string(conf):
    """Return the symbolic name of a dtype configuration code.

    Returns None for a code outside the known -1..3 range (same implicit
    behavior as the original if/return chain).
    """
    names = {
        -1: 'UNKNOWN',
        0: 'TRUE_HALF_CONFIG',
        1: 'PSEUDO_HALF_CONFIG',
        2: 'FLOAT_CONFIG',
        3: 'DOUBLE_CONFIG',
    }
    return names.get(conf)
def strideof(tensor, i):
    """Return the stride of dimension ``i`` counted in elements, not bytes."""
    stride_in_bytes = tensor.strides[i]
    return stride_in_bytes // tensor.itemsize
def tensor_is_partially_packed(tensor, packed_dim_names):
    """Check that the dimensions named in `packed_dim_names` are packed.

    `packed_dim_names` is a string of dimension letters (e.g. 'HW', 'DHW')
    taken from the NC(D)HW layout assumed module-wide. A named (non-last)
    dimension is "packed" when its stride exactly equals size * stride of
    the next dimension (no gap); unnamed dimensions only need strides large
    enough not to overlap the inner dimensions.
    """
    if tensor.ndim == 4:
        dim_names = 'NCHW'
    else:
        # NOTE(review): any ndim other than 4 is assumed to be 5 (NCDHW) --
        # confirm callers only ever pass 4D or 5D tensors.
        dim_names = 'NCDHW'
    packed_dims = []
    unpacked_dims = []
    # Classify every dimension except the last one.
    for i in range(tensor.ndim - 1):
        if dim_names[i] in packed_dim_names:
            packed_dims.append(i)
        else:
            unpacked_dims.append(i)
    if dim_names[tensor.ndim - 1] in packed_dim_names and strideof(tensor, -1) != 1:
        # We won't put last dimension in the list of packed dims.
        # We just need to check if stride of that dimension is 1.
        return False
    # Unpacked dims may have padding (>=); packed dims must be exact (==).
    return (all(strideof(tensor, i) >= tensor.shape[i + 1] * strideof(tensor, i + 1) for i in unpacked_dims) and
            all(strideof(tensor, i) == tensor.shape[i + 1] * strideof(tensor, i + 1) for i in packed_dims))
def tensor_is_fully_packed(tensor):
    """Return True if the tensor is fully packed (C-contiguous, no gaps).

    The innermost stride must be 1 element, and every other dimension's
    stride must exactly equal size * stride of the next inner dimension.
    """
    if strideof(tensor, -1) != 1:
        return False
    return all(strideof(tensor, k) == tensor.shape[k + 1] * strideof(tensor, k + 1)
               for k in range(tensor.ndim - 1))
def check_fwd_algorithm(img, kern, out, desc_op, algo, precision, subsample, dilation):
    """Check documented cuDNN v5.1 pre-conditions for a FWD convolution algorithm.

    Parameters
    ----------
    img, kern, out
        Input, kernel and output tensors (NC(D)HW layout assumed module-wide).
    desc_op
        Convolution descriptor Op; provides pad0/pad1/pad2 and border mode.
    algo
        Forward algorithm (alias accepted by cudnnConvolutionFwdAlgo_t.fromalias).
    precision
        Computation precision dtype.
    subsample, dilation
        Filter strides and dilations per spatial dimension.

    Returns a Success when all documented conditions for `algo` hold, else a
    Failure whose messages list debugging context followed by the unmet
    conditions.
    """
    # Based on cuDNN v5.1 user guide.
    ndim = img.ndim - 2
    if ndim == 2:
        # rD won't be used.
        rD, rH, rW = -1, 0, 1
    else:
        rD, rH, rW = 0, 1, 2
    algo = cudnnConvolutionFwdAlgo_t.fromalias(algo)
    kern_shape = kern.shape[2:]
    # Effective kernel spatial shape once dilation is applied.
    kern_shape = tuple((kern_shape[i] - 1) * dilation[i] + 1 for i in range(len(dilation)))
    pad = (desc_op.pad0, desc_op.pad1, desc_op.pad2)[:len(kern_shape)]
    if desc_op.bmode == 'full':
        pad = tuple(kern_shape[i] - 1 for i in range(len(pad)))
    elif desc_op.bmode == 'half':
        pad = tuple(kern_shape[i] // 2 for i in range(len(pad)))
    img_shape = img.shape[2:]
    img_with_borders = tuple(img_shape[i] + 2 * pad[i] for i in range(len(pad)))

    def check_algo():
        # Return a Success/Failure describing whether `algo` may run on the
        # current configuration, per cuDNN v5.1 documentation.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
            return _and((lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                         "Data Type Config Support: All except TRUE_HALF_CONFIG"))
        # CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM: 2D: everything supported.
        if ndim == 3 and algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: tensor_is_fully_packed(img),
                 "xDesc Format Support: NCDHW-fully-packed"),
                (lambda: tensor_is_fully_packed(out),
                 "yDesc Format Support: NCDHW-fully-packed"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d")
            )
        # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
            return _and(
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: tensor_is_partially_packed(img, 'HW'),
                 "xDesc Format Support: NCHW HW-packed"),
                (lambda: tensor_is_partially_packed(out, 'HW'),
                 "yDesc Format Support: NCHW HW-packed"),
                (lambda: img_with_borders[rH] <= 256,
                 "xDesc 's feature map height + 2 * convDesc 's zero-padding height must equal 256 or less"),
                (lambda: img_with_borders[rW] <= 256,
                 "xDesc 's feature map width + 2 * convDesc 's zero-padding width must equal 256 or less"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] > pad[rH],
                 "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                (lambda: kern_shape[rW] > pad[rW],
                 "wDesc 's filter width must be greater than convDesc 's zero-padding width")
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
            if ndim == 2:
                return _and(
                    (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                     "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'HW'),
                     "xDesc Format Support: NCHW HW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'HW'),
                     "yDesc Format Support: NCHW HW-packed"),
                    (lambda: kern_shape[rH] <= 32,
                     "wDesc 's filter height must equal 32 or less"),
                    (lambda: kern_shape[rW] <= 32,
                     "wDesc 's filter width must equal 32 or less"),
                    (lambda: subsample[rH] == subsample[rW] == 1,
                     "convDesc 's vertical and horizontal filter stride must equal 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                )
            if ndim == 3:
                return _and(
                    (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                     "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'DHW'),
                     "xDesc Format Support: NCDHW DHW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'DHW'),
                     "yDesc Format Support: NCDHW DHW-packed"),
                    (lambda: kern_shape[rH] <= 16,
                     "wDesc 's filter height must equal 16 or less"),
                    (lambda: kern_shape[rW] <= 16,
                     "wDesc 's filter width must equal 16 or less"),
                    (lambda: kern_shape[rD] <= 16,
                     "wDesc 's filter depth must equal 16 or less"),
                    (lambda: all(s == 1 for s in subsample),
                     "convDesc 's must have all filter strides equal to 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                    # BUG FIX: originally compared pad[rW] against kern_shape[rD]
                    # (width padding against depth filter size) and the message
                    # said "width"; the cuDNN doc condition is on the depth.
                    (lambda: pad[rD] < kern_shape[rD],
                     "wDesc 's filter depth must be greater than convDesc 's zero-padding depth"),
                )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
            return _and(
                # BUG FIX: DOUBLE_CONFIG was listed in this tuple although the
                # failure message and the cuDNN v5.1 doc list only
                # PSEUDO_HALF_CONFIG and FLOAT_CONFIG for WINOGRAD.
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == 3,
                 "wDesc 's filter height must be 3"),
                (lambda: kern_shape[rW] == 3,
                 "wDesc 's filter width must be 3"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
            data_type_conf = type_conf(precision)
            return _and(
                (lambda: data_type_conf != DOUBLE_CONFIG,
                 "Data Type Config Support: All except DOUBLE_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == kern_shape[rW] and kern_shape[rH] in (3, 5),
                 "wDesc 's filter (height, width) must be (3,3) or (5,5)"),
                (lambda: kern_shape[rH] == 3 or data_type_conf != TRUE_HALF_CONFIG,
                 "If wDesc 's filter (height, width) is (5,5), "
                 "data type config TRUE_HALF_CONFIG is not supported")
            )

    checking = check_algo()
    if not checking.ok:
        # On failure, prepend debugging context before the failure reasons.
        messages = checking.messages
        checking.messages = []
        checking.add_message('config : ', type_conf_to_string(type_conf(precision)))
        checking.add_message('computed borders : ', pad)
        checking.add_message('img with borders : ', img_with_borders)
        checking.add_message('computed kern shape: ', kern_shape)
        checking.add_message('== why should fail ==')
        checking.messages += messages
    return checking
#section init_code_struct
/* Initialization of the forward-algorithm cache: nothing selected yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
/* When an algorithm may be re-chosen per call, clear the cached shapes so
   the first call can never accidentally reuse a stale algorithm. */
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
#section support_code_struct
/* Cached state: whether the previous algorithm may be reused, the last
   algorithm chosen, and the input/kernel shapes it was chosen for. */
int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo;
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
/* Select a cuDNN forward-convolution algorithm for (input, kerns) -> output.
 *
 * Fills the cuDNN tensor/filter descriptors, validates the output shape
 * against what cuDNN computes from the descriptor, then either reuses the
 * previously selected algorithm (same shapes, or params->choose_once) or
 * queries cuDNN for a new one (benchmarking when params->choose_time is set).
 * Finally falls back to IMPLICIT_GEMM when the chosen algorithm does not
 * support the current shapes/strides.
 * Returns 0 on success, non-zero on error with a Python exception set. */
int
APPLY_SPECIFIC(choose_fwd_algo)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                                PyGpuArrayObject *output,
                                cudnnConvolutionDescriptor_t desc,
                                cudnnConvolutionFwdAlgo_t *output_algo,
                                PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

  cuda_enter(c->ctx);

  /* Validate the output shape against what cuDNN expects. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm only when input and kernel shapes match the
     ones it was selected for. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionFwdAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUG FIX: cuda_enter() is active here; the original returned
           without the matching cuda_exit(), unlike every other error path. */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionForwardAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
        1, &count, &choice, *(void **)tmpmem,
        free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionForwardAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
        desc, APPLY_SPECIFIC(output),
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selected algorithm applies to. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
    }
  }

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct
/* Initialization of the backward-data-algorithm cache: nothing selected yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
/* When an algorithm may be re-chosen per call, clear the cached shapes so
   the first call can never accidentally reuse a stale algorithm. */
if (!PARAMS->choose_once) {
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct
/* Cached state: whether the previous algorithm may be reused, the last
   algorithm chosen, and the kernel/top shapes it was chosen for. */
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5];
size_t prev_top_dims[5];
/* Select a cuDNN backward-data convolution algorithm for
 * (kerns, output-gradient) -> input-gradient.
 *
 * Mirrors choose_fwd_algo: sets descriptors, validates the output-gradient
 * shape, reuses or re-selects the algorithm (benchmarking when
 * params->choose_time is set), and falls back to ALGO_0 when an FFT-based
 * choice does not support the current shapes/strides.
 * Returns 0 on success, non-zero on error with a Python exception set. */
int
APPLY_SPECIFIC(choose_bwd_data_algo)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                                     PyGpuArrayObject *input,
                                     cudnnConvolutionDescriptor_t desc,
                                     cudnnConvolutionBwdDataAlgo_t *output_algo,
                                     PARAMS_TYPE* params) {
  PyGpuContextObject *c = kerns->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdDataAlgo_t algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;

  cuda_enter(c->ctx);

  /* Validate the top (output-gradient) shape against what cuDNN expects. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm only when kernel and top shapes match the
     ones it was selected for. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdDataAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUG FIX: cuda_enter() is active here; the original returned
           without the matching cuda_exit(), unlike every other error path. */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionBackwardDataAlgorithmEx(
        params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionBackwardDataAlgorithm(
        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(input),
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selected algorithm applies to. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct
/* Initialization of the backward-filter-algorithm cache: nothing selected yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
/* When an algorithm may be re-chosen per call, clear the cached shapes so
   the first call can never accidentally reuse a stale algorithm. */
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct
/* Cached state: whether the previous algorithm may be reused, the last
   algorithm chosen, and the image/top shapes it was chosen for. */
int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo;
size_t prev_img_dims[5];
size_t prev_top_dims[5];
/* Select a cuDNN backward-filter convolution algorithm for
 * (input, output-gradient) -> kernel-gradient.
 *
 * Mirrors choose_fwd_algo: sets descriptors, validates the output-gradient
 * shape, reuses or re-selects the algorithm (benchmarking when
 * params->choose_time is set), and falls back to ALGO_0 when the FFT choice
 * does not support the current shapes/strides.
 * Returns 0 on success, non-zero on error with a Python exception set. */
int
APPLY_SPECIFIC(choose_bwd_filter_algo)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                                       PyGpuArrayObject *kerns,
                                       cudnnConvolutionDescriptor_t desc,
                                       cudnnConvolutionBwdFilterAlgo_t *output_algo,
                                       PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;

  cuda_enter(c->ctx);

  /* Validate the top (output-gradient) shape against what cuDNN expects. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm only when image and top shapes match the
     ones it was selected for. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdFilterAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUG FIX: cuda_enter() is active here; the original returned
           without the matching cuda_exit(), unlike every other error path. */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionBackwardFilterAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(kerns),
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selected algorithm applies to. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论