提交 4ad36ddc authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3788 from nouiz/carriepl-v4

Rebased cudnn v4
......@@ -219,30 +219,91 @@ AddConfigVar('gpuarray.sync',
BoolParam(False),
in_c_key=True)
def safe_no_dnn_workmem(workmem):
    """Filter for the removed config option ``dnn.conv.workmem``.

    Accepts only an empty/falsy value; any attempt to actually set the
    option raises, pointing the user at the replacement flag.
    """
    if not workmem:
        # Empty value: option unused, nothing to complain about.
        return True
    raise RuntimeError(
        'The option `dnn.conv.workmem` has been removed and should '
        'not be used anymore. Please use the option '
        '`dnn.conv.algo_fwd` instead.')
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
ConfigParam('', allow_override=False, filter=safe_no_dnn_workmem),
in_c_key=False)
def safe_no_dnn_workmem_bwd(workmem):
    """Filter for the removed config option ``dnn.conv.workmem_bwd``.

    Returns True for an unset (falsy) value and raises otherwise,
    directing the user to the two flags that replaced it.
    """
    # Message kept as a single constant so the raise site stays short.
    removed_msg = (
        'The option `dnn.conv.workmem_bwd` has been removed and '
        'should not be used anymore. Please use the options '
        '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
    if workmem:
        raise RuntimeError(removed_msg)
    return True
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
ConfigParam('', allow_override=False,
filter=safe_no_dnn_workmem_bwd),
in_c_key=False)
def safe_no_dnn_algo_bwd(algo):
    """Filter for the removed config option ``dnn.conv.algo_bwd``.

    Only a falsy (unset) value is accepted; setting the option raises
    with a pointer to the per-direction replacement flags.
    """
    if not algo:
        return True
    raise RuntimeError(
        'The option `dnn.conv.algo_bwd` has been removed and '
        'should not be used anymore. Please use the options '
        '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
"dnn.conv.algo_bwd_filter.",
ConfigParam('', allow_override=False,
filter=safe_no_dnn_algo_bwd),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
EnumStr('small', 'none', 'large', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd_data',
"Default implementation to use for CuDNN backward convolution to "
"get the gradients of the convolution with regard to the inputs.",
EnumStr('none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
AddConfigVar('dnn.conv.algo_bwd_filter',
"Default implementation to use for CuDNN backward convolution to "
"get the gradients of the convolution with regard to the "
"filters.",
EnumStr('none', 'deterministic', 'fft', 'small', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in CuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions).",
EnumStr('as_input', 'float16', 'float32', 'float64'),
in_c_key=False)
def default_dnn_path(suffix):
def f(suffix=suffix):
......
......@@ -3,6 +3,15 @@
#include <cudnn.h>
// If needed, define element of the V4 interface in terms of elements of
// previous versions
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#endif
#ifndef CUDNN_VERSION
#include <assert.h>
......
......@@ -92,27 +92,13 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
" from one version, but we link with"
" a different version %s" % str(v))
raise RuntimeError(dnn_available.msg)
if v == -1:
dnn_available.avail = False
dnn_available.msg = (
"CuDNN v1 detected. This version is no longer "
"supported by Theano. Update your CuDNN installation "
"to a more recent version")
raise RuntimeError(dnn_available.msg)
if v == (20, 20):
dnn_available.avail = False
dnn_available.msg = (
"You have installed a release candidate of CuDNN v2."
" This isn't supported anymore."
" Update to CuDNN v2 final version.")
raise RuntimeError(dnn_available.msg)
if 3000 <= v[0] < 3007:
if v == -1 or v[0] < 3007:
# 3007 is the final release of cudnn v3
dnn_available.avail = False
dnn_available.msg = (
"You have installed a release candidate of CuDNN v3."
" This isn't supported anymore."
" Update to CuDNN v3 final version.")
"You have an old release of CuDNN (or a release "
"candidate) that isn't supported. Please update to "
"at least v3 final version.")
raise RuntimeError(dnn_available.msg)
return dnn_available.avail
......@@ -248,7 +234,7 @@ class GpuDnnConvDesc(GpuOp):
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
__props__ = ('border_mode', 'subsample', 'conv_mode', 'precision')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -265,7 +251,8 @@ class GpuDnnConvDesc(GpuOp):
def do_constant_folding(self, node):
return False
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv',
precision="float32"):
if isinstance(border_mode, int):
border_mode = (border_mode,) * len(subsample)
if isinstance(border_mode, tuple):
......@@ -283,6 +270,9 @@ class GpuDnnConvDesc(GpuOp):
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
assert precision in ['float16', 'float32', 'float64']
self.precision = precision
def make_node(self, img_shape, kern_shape):
if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
raise TypeError('img must be 1D shape tensor')
......@@ -321,6 +311,14 @@ class GpuDnnConvDesc(GpuOp):
subsample_str = ", ".join([str(s) for s in self.subsample])
upscale_str = ", ".join(["1"] * nb_dim)
if self.precision == 'float16':
precision = 'CUDNN_DATA_HALF'
elif self.precision == 'float32':
precision = 'CUDNN_DATA_FLOAT'
else:
assert self.precision == 'float64'
precision = 'CUDNN_DATA_DOUBLE'
return """
{
cudnnStatus_t err;
......@@ -346,11 +344,11 @@ class GpuDnnConvDesc(GpuOp):
}
}
err = cudnnSetConvolutionNdDescriptor(
err = cudnnSetConvolutionNdDescriptor_v3(
%(desc)s,
%(nb_dim)d,
pad, subsample, upscale,
%(conv_flag)s
%(conv_flag)s, %(precision)s
);
#else
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: CUDNN_VERSION must be >= 30");
......@@ -364,10 +362,10 @@ class GpuDnnConvDesc(GpuOp):
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
pad_str=pad_str, subsample_str=subsample_str,
upscale_str=upscale_str, nb_dim=nb_dim)
upscale_str=upscale_str, nb_dim=nb_dim, precision=precision)
def c_code_cache_version(self):
return (2, version())
return (3, version())
# scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float32'))
......@@ -401,9 +399,8 @@ class GpuDnnConv(DnnBase, COp):
workmem
*deprecated*, use parameter algo instead.
algo
['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
['none', 'small', 'large', 'fft', 'fft_tiling', 'guess_once',
'guess_on_shape_change', 'time_once', 'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_fwd`.
......@@ -445,9 +442,15 @@ class GpuDnnConv(DnnBase, COp):
raise RuntimeError("CuDNN convolution timing requires CuDNN "
"v3")
assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The fft_tiling implementation is only available from CuDNN V4 onward
if version() < (4000, 4000):
if self.algo == 'fft_tiling':
raise RuntimeError("CuDNN tiled-FFT convolution requires "
"CuDNN v4 or more recent")
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -477,8 +480,15 @@ class GpuDnnConv(DnnBase, COp):
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
elif self.algo == 'direct':
# need v2
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft':
# need v3
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling':
# need v4
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be choosen according
# to a heuristic
......@@ -652,8 +662,8 @@ class GpuDnnConvGradW(DnnBase, COp):
The convolution descriptor.
workmem
*deprecated*, use parameter algo instead.
algo : {'none', 'deterministic', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
algo : {'none', 'deterministic', 'fft', 'small', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`.
"""
......@@ -671,15 +681,16 @@ class GpuDnnConvGradW(DnnBase, COp):
self.algo = workmem
else:
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_filter
self.algo = algo
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
assert self.algo in ['none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -687,7 +698,7 @@ class GpuDnnConvGradW(DnnBase, COp):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_bwd
self.algo = config.dnn.conv.algo_bwd_filter
if not hasattr(self, 'inplace'):
self.inplace = False
......@@ -724,11 +735,15 @@ class GpuDnnConvGradW(DnnBase, COp):
alg = "0"
else:
if self.algo == 'none':
# non-deterministic
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
elif self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
elif self.algo == 'small':
# need v3, non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be chosen according
# to a heuristic
......@@ -788,7 +803,7 @@ class GpuDnnConv3dGradW(GpuDnnConvGradW):
:param workmem:
*deprecated*, use parameter algo instead.
:param algo: ['none', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`.
"""
__props__ = ('algo', 'inplace',)
......@@ -856,11 +871,11 @@ class GpuDnnConvGradI(DnnBase, COp):
workmem
*deprecated*, use parameter algo instead.
algo
['none', 'deterministic', 'fft', 'guess_once',
['none', 'deterministic', 'fft', 'fft_tiling', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
Default is the value of :attr:`config.dnn.conv.algo_bwd_data`.
"""
......@@ -879,15 +894,23 @@ class GpuDnnConvGradI(DnnBase, COp):
self.algo = workmem
else:
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_data
self.algo = algo
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The small-workspace implementation is only available from CuDNN V4
# onward.
if version() < (4000, 4000):
if self.algo == 'fft_tiling':
raise RuntimeError("CuDNN's tiled-FFT convolution requires "
"CuDNN v4 or more recent")
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -895,7 +918,7 @@ class GpuDnnConvGradI(DnnBase, COp):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_bwd
self.algo = config.dnn.conv.algo_bwd_data
if not hasattr(self, 'inplace'):
self.inplace = False
......@@ -936,7 +959,11 @@ class GpuDnnConvGradI(DnnBase, COp):
elif self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft':
# need v3, big workspace
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo == 'fft_tiling':
# need v4, big workspace, but less then fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be chosen according
# to a heuristic
......@@ -998,7 +1025,7 @@ class GpuDnnConv3dGradI(GpuDnnConvGradI):
:param algo: ['none', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
Default is the value of :attr:`config.dnn.conv.algo_bwd_data`.
"""
__props__ = ('algo', 'inplace',)
......@@ -1055,7 +1082,8 @@ class GpuDnnConv3dGradI(GpuDnnConvGradI):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None, algo=None):
conv_mode='conv', direction_hint=None, workmem=None, algo=None,
precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -1090,13 +1118,24 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
removed at any time without a deprecation period. You have been warned.
workmem
*deprecated*, use parameter algo instead.
algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Convolution implementation to use. Some of its values may require certain
versions of CuDNN to be installed. Default is the value of
:attr:`config.dnn.conv.algo_fwd`.
precision : {'as_input', 'float16', 'float32', 'float64'}
Description of the dtype in which the computation of the convolution
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
"""
# Establish dtype in which to perform the computation of the convolution
if precision is None:
precision = theano.config.dnn.conv.precision
if precision == 'as_input':
precision = theano.scalar.upcast(img.dtype, kerns.dtype)
# Check if deprecated param 'workmem' is used
if workmem is not None:
warnings.warn(("dnn_conv: parameter 'workmem' is deprecated. Use "
......@@ -1123,7 +1162,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape)
conv_mode='cross', precision=precision)(img.shape,
out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
......@@ -1139,7 +1179,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape)
conv_mode=conv_mode, precision=precision)(out.shape,
kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
......@@ -1148,7 +1189,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
conv_mode=conv_mode, precision=precision)(img.shape,
kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
......@@ -1159,7 +1201,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv', direction_hint=None, workmem=None,
algo='none'):
algo='none', precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -1186,6 +1228,10 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
:param algo: convolution implementation to use. Only 'none' is implemented
for the conv3d. Default is the value of
:attr:`config.dnn.conv.algo_fwd`.
:param precision : dtype in which the computation of the convolution
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -1194,6 +1240,12 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
"""
# Establish dtype in which to perform the computation of the convolution
if precision is None:
precision = theano.config.dnn.conv.precision
if precision == 'as_input':
precision = theano.scalar.upcast(img.dtype, kerns.dtype)
# Check if deprecated param 'workmem' is used
if workmem is not None:
warnings.warn(("dnn_conv3d: parameter 'workmem' is deprecated. Use "
......@@ -1221,7 +1273,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3, shape4)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross')(img.shape, out.shape)
conv_mode='cross', precision=precision)(img.shape,
out.shape)
conv = GpuDnnConv3dGradW()(img, kerns, out, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3, 4))
......@@ -1231,7 +1284,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
conv_mode=conv_mode, precision=precision)(img.shape,
kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
......
......@@ -15,11 +15,8 @@ int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
#endif
#section init_code_struct
......@@ -55,10 +52,8 @@ APPLY_SPECIFIC(previous_algo_set) = false;
// Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#endif
#section cleanup_code_struct
......
......@@ -81,7 +81,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
......@@ -102,7 +101,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
}
chosen_algo = choosen_algo_perf.algo;
#endif
}
else
{
......@@ -161,24 +159,28 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo = CONV_ALGO;
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// The tiled-FFT implementation (only in V4 onward) does not support
// strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't.
// Following code is 2d-specific, but it is fine as ftt is defined only for
// 2d-filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
// Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// defined only for 2d-filters
if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -197,36 +199,23 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
// chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
#endif
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 3000
// In versions before V3, CuDNN did not support kernels larger than the
// inputs in any spatial dimension, even if padding was used such that the
// padded inputs were larger than the kernels. If the kernels are larger
// then the inputs, raise an error message.
bool shape_mismatch = false;
for (int i=2; i < nb_dim; i++){
shape_mismatch = shape_mismatch || (CudaNdarray_HOST_DIMS(kerns)[i] >
CudaNdarray_HOST_DIMS(input)[i]);
}
if (shape_mismatch){
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: the current version of CuDNN does not support "
"kernels larger than the inputs in any spatial dimension, "
"even if the inputs are padded such that the padded inputs "
"are larger than the kernels. Update your installation of "
"CuDNN to V3 or more recent to solve the issue.");
return 1;
}
#endif
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
......
......@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{
size_t worksize;
void *workspace;
......@@ -159,21 +158,28 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
chosen_algo = CONV_ALGO;
}
// The FFT implementation (only in v3 and onward) does not support strides,
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// The tiled-FFT implementation (only in V4 onward) does not support
// strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4)
// Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// defined only for 2d-filters
if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -192,10 +198,21 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
}
......@@ -231,16 +248,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
}
#else
err = cudnnConvolutionBackwardData(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
......
......@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{
size_t worksize;
void *workspace;
......@@ -168,12 +167,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -192,7 +193,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
......@@ -232,16 +233,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
}
#else
err = cudnnConvolutionBackwardFilter(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
(void *)&beta,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
......
......@@ -29,7 +29,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale,
CONV_MODE);
err = cudnnSetConvolutionNdDescriptor_v3(*desc, NB_DIMS, pad, strides,
upscale, CONV_MODE, PRECISION);
return 0;
}
......@@ -13,99 +13,12 @@ static inline int cudnnGetVersion() {
#include <assert.h>
#if CUDNN_VERSION < 3000
// Here we define the R3 API in terms of functions in the R2 interface
// This is only for what we use
// If needed, define element of the V4 interface in terms of elements of
// previous versions
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
typedef int cudnnConvolutionBwdDataAlgo_t;
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2
static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnnHandle_t handle,
const cudnnFilterDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t gradDesc,
cudnnConvolutionBwdDataAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static cudnnStatus_t cudnnConvolutionBackwardData_v3(
cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdDataAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
return cudnnConvolutionBackwardData(
handle,
alpha,
filterDesc,
filterData,
diffDesc,
diffData,
convDesc,
beta,
gradDesc,
gradData);
}
typedef int cudnnConvolutionBwdFilterAlgo_t;
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnFilterDescriptor_t gradDesc,
cudnnConvolutionBwdDataAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdFilterAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
return cudnnConvolutionBackwardFilter(
handle,
alpha,
srcDesc,
srcData,
diffDesc,
diffData,
convDesc,
beta,
gradDesc,
gradData);
}
#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#endif
......
......@@ -75,15 +75,11 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
def _dnn_check_version():
v = version()
if v < 2000:
if v < 3007:
return False, (
"You have an old release of CuDNN (or a release candidate) "
"that isn't supported. Please update to at least v2 final "
"that isn't supported. Please update to at least v3 final "
"version.")
if 3000 <= v < 3007:
return False, (
"You have installed a release candidate of CuDNN v3. This "
"isn't supported. Please update to v3 final version.")
return True, None
......@@ -241,7 +237,7 @@ class GpuDnnConvDesc(COp):
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
__props__ = ('border_mode', 'subsample', 'conv_mode', 'precision')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -258,7 +254,8 @@ class GpuDnnConvDesc(COp):
def do_constant_folding(self, node):
return False
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv',
precision="float32"):
COp.__init__(self, ["conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
if isinstance(border_mode, int):
......@@ -278,6 +275,9 @@ class GpuDnnConvDesc(COp):
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
assert precision in ['float16', 'float32', 'float64']
self.precision = precision
def make_node(self, kern_shape):
if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
raise TypeError('kern must be 1D shape tensor')
......@@ -315,11 +315,20 @@ class GpuDnnConvDesc(COp):
else:
sub2 = '0'
if self.precision == 'float16':
precision = 'CUDNN_DATA_HALF'
elif self.precision == 'float32':
precision = 'CUDNN_DATA_FLOAT'
else:
assert self.precision == 'float64'
precision = 'CUDNN_DATA_DOUBLE'
return [('NB_DIMS', str(len(self.subsample))),
('BORDER_MODE', bmode),
('PAD_0', pad0), ('PAD_1', pad1), ('PAD_2', pad2),
('CONV_MODE', conv_flag),
('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2)]
('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2),
('PRECISION', precision)]
def c_code_cache_version(self):
    # Combine the parent class's cache version with the runtime cuDNN
    # version so the compiled C code is regenerated whenever the
    # installed cuDNN library changes.
    return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
......@@ -353,7 +362,8 @@ class GpuDnnConv(DnnBase):
kernel
descr
The convolution descriptor.
algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'guess_once',
        'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Default is the value of :attr:`config.dnn.conv.algo_fwd`.
"""
......@@ -382,9 +392,15 @@ class GpuDnnConv(DnnBase):
elif self.algo in ['time_once', 'time_on_shape_change']:
raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The fft_tiling implementation is only available from CuDNN V4 onward
if version() < 4000:
if self.algo == 'fft_tiling':
raise RuntimeError("CuDNN tiled-FFT convolution requires "
"CuDNN v4 or more recent")
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -408,8 +424,13 @@ class GpuDnnConv(DnnBase):
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
elif self.algo == 'direct':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling':
# need v4
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
defs.append(('CONV_ALGO', alg))
if self.algo in ['guess_once', 'guess_on_shape_change',
......@@ -439,9 +460,10 @@ class GpuDnnConv(DnnBase):
raise TypeError("The number of dimensions of "
"img, kern and output must match")
if img.type.ndim == 5 and self.algo == 'fft':
raise ValueError("convolution algo fft can't be used for "
"3d convolutions")
if (img.type.ndim == 5 and
self.algo in ['small', 'large', 'fft', 'fft_tiling']):
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
......@@ -479,6 +501,14 @@ class GpuDnnConv(DnnBase):
or scalar.
"""
# if ishape and/or kshape are not tuples or list, but rather symbolic
# vectors, turn them into lists of symbolic scalars.
if not isinstance(ishape, (list, tuple)):
ishape = [ishape[i] for i in range(len(subsample) + 2)]
if not isinstance(kshape, (list, tuple)):
kshape = [kshape[i] for i in range(len(subsample) + 2)]
return get_conv_output_shape(
ishape,
kshape,
......@@ -511,18 +541,19 @@ class GpuDnnConvGradW(DnnBase):
if self.inplace:
self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_filter
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
assert self.algo in ['none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
    """Restore pickled state, filling in defaults for attributes that
    did not exist when older pickles were created.

    Parameters
    ----------
    d : dict
        The unpickled instance ``__dict__``.
    """
    self.__dict__.update(d)
    if not hasattr(self, 'inplace'):
        self.inplace = False
    if not hasattr(self, 'algo'):
        # The scrape contained a leftover assignment to the removed
        # `config.dnn.conv.algo_bwd` option; only the new-side option
        # (`algo_bwd_filter`) is kept.
        self.algo = config.dnn.conv.algo_bwd_filter
def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp
......@@ -557,7 +588,9 @@ class GpuDnnConvGradW(DnnBase):
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo == 'small':
# non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
......@@ -587,7 +620,8 @@ class GpuDnnConvGradW(DnnBase):
raise TypeError("The number of dimensions of "
"img, topgrad and output must match")
if img.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
if (img.type.ndim == 5 and
self.algo in ['fft', 'deterministic', 'small']):
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
......@@ -627,16 +661,23 @@ class GpuDnnConvGradI(DnnBase):
if self.inplace:
self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_data
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The small-workspace implementation is only available from CuDNN V4
# onward.
if version() < 4000 and self.algo == 'fft_tiling':
raise RuntimeError("CuDNN's tiled-FFT convolution requires CuDNN "
"v4 or more recent")
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
    """Restore pickled state, filling in defaults for attributes that
    did not exist when older pickles were created.

    Parameters
    ----------
    d : dict
        The unpickled instance ``__dict__``.
    """
    self.__dict__.update(d)
    if not hasattr(self, 'algo'):
        # The scrape contained a leftover assignment to the removed
        # `config.dnn.conv.algo_bwd` option; only the new-side option
        # (`algo_bwd_data`) is kept.
        self.algo = config.dnn.conv.algo_bwd_data
    if not hasattr(self, 'inplace'):
        self.inplace = False
......@@ -673,6 +714,9 @@ class GpuDnnConvGradI(DnnBase):
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
if self.algo == 'fft_tiling':
# big workspace but less than fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
......@@ -703,7 +747,8 @@ class GpuDnnConvGradI(DnnBase):
raise TypeError("The number of dimensions of "
"kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
if (kern.type.ndim == 5 and
self.algo in ['fft', 'deterministic', 'fft_tiling']):
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
......@@ -723,7 +768,7 @@ class GpuDnnConvGradI(DnnBase):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None,
algo=None):
algo=None, precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -757,12 +802,24 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
Convolution implementation to use. Some of its values may
require certain versions of CuDNN to be installed. Default is
the value of :attr:`config.dnn.conv.algo_fwd`.
precision : {'as_input', 'float16', 'float32', 'float64'}
Description of the dtype in which the computation of the convolution
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
.. warning:: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higer. This means that older GPUs will not
work with this Op.
"""
# Establish dtype in which to perform the computation of the convolution
if precision is None:
precision = theano.config.dnn.conv.precision
if precision == 'as_input':
precision = theano.scalar.upcast(img.dtype, kerns.dtype)
if workmem is not None:
if algo is not None:
raise ValueError("You can't use both algo and workmem")
......@@ -786,7 +843,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(out.shape)
conv_mode='cross', precision=precision)(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
......@@ -804,7 +861,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(kerns.shape)
conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
......@@ -813,7 +870,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape)
conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
......
......@@ -136,15 +136,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
......@@ -153,30 +164,24 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
#endif
#if CUDNN_VERSION < 3000
/* cuDNN before v3 does not support kernels larger than input even
* if appropriate padding is selected. */
for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
PyErr_SetString(PyExc_RuntimeError, "the current version "
"of CuDNN does not support kernels larger than the "
"inputs in any spatial dimension, even if the inputs "
"are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue.");
cuda_exit(c->ctx);
return 1;
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
#endif
{
size_t worksize;
......
......@@ -128,15 +128,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
......@@ -145,13 +156,24 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
}
#endif
size_t worksize;
gpudata *workspace;
......
......@@ -130,15 +130,24 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
......@@ -153,7 +162,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论