Commit 5d3433c7 authored by xiaoqie

Merge branch 'master' of https://github.com/Theano/Theano into gpureduce-fix

# Jenkins slave image for Theano GPU testing.
# Base: CUDA 8.0 + cuDNN 6 devel image on Ubuntu 16.04.
FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
ENV DEBIAN_FRONTEND=noninteractive

# Install all OS packages in a single layer:
#  - build tools and helpers for Theano builds/docs,
#  - OpenBLAS headers for magma,
#  - latex for the documentation tests,
#  - SSH server and a Java runtime for the Jenkins agent.
# `apt-get update` is combined with `install` (avoids stale-cache bug) and the
# apt lists are removed in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y \
        build-essential apt-utils wget git dvipng time \
        libopenblas-dev \
        texlive-latex-base texlive-latex-extra \
        openssh-server default-jre-headless \
    && rm -rf /var/lib/apt/lists/*

# Build and install the magma GPU linear-algebra library against OpenBLAS.
# OPENBLASDIR/CUDADIR are read by magma's make.inc.openblas template.
ENV OPENBLASDIR=/usr \
    CUDADIR=/usr/local/cuda
RUN wget http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz \
    && tar xvf magma-2.2.0.tar.gz \
    && cp magma-2.2.0/make.inc-examples/make.inc.openblas magma-2.2.0/make.inc \
    && (cd magma-2.2.0 && make && make install prefix=/usr/local) \
    && ldconfig \
    && rm -rf magma-2.2.0 magma-2.2.0.tar.gz

# Setup conda python for Theano; the installer is removed in the same layer.
RUN wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh \
    && bash Miniconda2-latest-Linux-x86_64.sh -p /miniconda -b \
    && rm Miniconda2-latest-Linux-x86_64.sh
ENV PATH=/miniconda/bin:${PATH}
RUN conda install numpy scipy sympy mkl nose pydot-ng graphviz cython cmake \
    && conda update -y conda
RUN pip install nose-timer parameterized "flake8<3" "sphinx==1.5.2" sphinx_rtd_theme \
    && pip install --upgrade-strategy only-if-needed git+https://github.com/lebedov/scikit-cuda.git

# Add the jenkins user and expose CUDA + conda on its interactive PATH.
RUN useradd -m -s /bin/bash jenkins \
    && echo jenkins:jenkins | chpasswd \
    && echo "export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/miniconda/bin:\$PATH" >> /home/jenkins/.bashrc \
    && echo "export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:\$LD_LIBRARY_PATH" >> /home/jenkins/.bashrc

# Fetch the Jenkins slave.jar into the container.
# BUG FIX: the original used a relative destination (`-P home/jenkins`) that
# only worked because the build working directory happens to be `/`.
RUN wget -P /home/jenkins http://earlgrey.iro.umontreal.ca:8080/jnlpJars/slave.jar \
    && chown -R jenkins:jenkins /home/jenkins

# Launch the Jenkins slave agent. Exec form so java runs as PID 1 and
# receives SIGTERM directly on `docker stop`.
CMD ["java", "-jar", "/home/jenkins/slave.jar"]
\ No newline at end of file
......@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -79,6 +80,9 @@ FLAGS=on_shape_error=raise,$FLAGS
# while we want all other runs to run with 'floatX=float64'.
FLAGS=${FLAGS},device=cpu,floatX=float64
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
# Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800
......
......@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -79,6 +80,9 @@ FLAGS=${FLAGS},device=cpu,floatX=float64
# Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
#we change the seed and record it everyday to test different combination. We record it to be able to reproduce bug caused by different seed. We don't want multiple test in DEBUG_MODE each day as this take too long.
seed=$RANDOM
echo "Executing tests with mode=DEBUG_MODE with seed of the day $seed"
......
......@@ -8,6 +8,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -19,4 +20,4 @@ echo
FILE=${BUILDBOT_DIR}/theano_python3_tests.xml
set -x
PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE}
PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,magma.enabled=true python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE}
......@@ -5,9 +5,6 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# Test flake8
echo "===== Testing flake8"
bin/theano-nose theano/tests/test_flake8.py --with-xunit --xunit-file=theano_pre_tests.xml || exit 1
......
......@@ -5,9 +5,6 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
echo "===== Testing theano core"
# Test theano core
......
......@@ -5,11 +5,9 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -54,4 +52,5 @@ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray \
theano/scan_module/tests/test_scan_checkpoints.py:TestScanCheckpoint.test_memory"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800"
FLAGS=${FLAGS},magma.enabled=true # Enable magma GPU library
THEANO_FLAGS=${FLAGS} time nosetests --with-xunit --xunit-file=theanogpuarray_tests.xml ${THEANO_GPUARRAY_TESTS}
......@@ -271,6 +271,7 @@ class OpFromGraph(gof.Op):
is_inline = self.is_inline
return '%(name)s{inline=%(is_inline)s}' % locals()
@theano.configparser.change_flags(compute_test_value='off')
def _recompute_grad_op(self):
'''
converts self._grad_op from user supplied form to type(self) instance
......
......@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
from functools import partial
import numpy as np
import theano
from theano import config, shared
from theano.gradient import DisconnectedType
......@@ -313,3 +314,14 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
[np.ones([3, 4], dtype=config.floatX),
np.ones([3, 4], dtype=config.floatX)],
OpFromGraph)
@theano.configparser.change_flags(compute_test_value='raise')
def test_compute_test_value(self):
    # Regression test: with compute_test_value='raise', building a graph
    # through an OpFromGraph and taking its gradient must propagate test
    # values instead of raising.
    x = T.scalar('x')
    x.tag.test_value = np.array(1., dtype=config.floatX)
    # Wrap x ** 3 as an OpFromGraph so gradient computation goes through
    # the op's internal grad machinery.
    op = OpFromGraph([x], [x ** 3])
    y = T.scalar('y')
    y.tag.test_value = np.array(1., dtype=config.floatX)
    f = op(y)
    grad_f = T.grad(f, y)
    # The gradient variable should carry a computed test value.
    assert grad_f.tag.test_value is not None
......@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
return True
# Those are the options provided by Theano to choose algorithms at runtime.
SUPPORTED_DNN_CONV_ALGO_RUNTIME = ('guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change')
# Those are the supported algorithm by Theano,
# The tests will reference those lists.
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_PRECISION = ('as_input_f32', 'as_input', 'float16', 'float32', 'float64')
AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
......@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).",
EnumStr('as_input_f32', 'as_input', 'float16', 'float32',
'float64'),
EnumStr(*SUPPORTED_DNN_CONV_PRECISION),
in_c_key=False)
......
......@@ -1413,7 +1413,10 @@ class COp(Op):
return []
def c_code_cache_version(self):
return hash(tuple(self.func_codes))
version = (hash(tuple(self.func_codes)), )
if hasattr(self, 'params_type'):
version += (self.params_type.c_code_cache_version(), )
return version
def c_init_code(self):
"""
......
......@@ -963,6 +963,12 @@ class EnumType(Type, dict):
"""
return alias in self.aliases
def get_aliases(self):
    """
    Return the aliases registered in this enumeration.

    NOTE(review): this returns ``self.aliases.keys()`` directly — a list
    under Python 2 but a view object under Python 3; callers that need a
    concrete list should wrap the result in ``list()``.
    """
    return self.aliases.keys()
def __repr__(self):
names_to_aliases = {constant_name: '' for constant_name in self}
for alias in self.aliases:
......@@ -1184,4 +1190,6 @@ class CEnumType(EnumList):
fail=sub['fail'])
def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version())
# C code depends on (C constant name, Python value) associations (given by `self.items()`),
# so we should better take them into account in C code version.
return (1, tuple(self.items()), super(CEnumType, self).c_code_cache_version())
#section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) {
cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) {
cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2};
int strides[3] = {SUB_0, SUB_1, SUB_2};
int dilation[3] = {DIL_0, DIL_1, DIL_2};
int pad[3] = {params->pad0, params->pad1, params->pad2};
int strides[3] = {params->sub0, params->sub1, params->sub2};
int dilation[3] = {params->dil0, params->dil1, params->dil2};
#if BORDER_MODE == 0
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0;
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1;
#if NB_DIMS > 2
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2;
#endif
#elif BORDER_MODE == 2
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0 + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1 + 1) / 2;
#if NB_DIMS > 2
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2 + 1) / 2;
#endif
#endif
if (params->bmode == BORDER_MODE_FULL) {
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
if (params->nb_dims > 2) {
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
}
} else if(params->bmode == BORDER_MODE_HALF) {
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
if (params->nb_dims > 2) {
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
}
}
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS,
"expected %d, got %lld.", params->nb_dims,
(long long)PyArray_DIM(filt_shp, 0));
return -1;
}
......@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides,
dilation, CONV_MODE, PRECISION);
err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
dilation, params->conv_mode, params->precision);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not set convolution "
"descriptor: %s", cudnnGetErrorString(err));
......
"""
Declarations of cuDNN types and constants used in Theano gpuarray DNN module.
For every cuDNN API supported by Theano, this module defines a class that
provides the set of cuDNN definitions to be used in Theano Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1
- v6.0
"""
from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51(object):
version = 5
cudnnConvolutionMode_t = CEnumType(('CUDNN_CONVOLUTION', 'conv'),
('CUDNN_CROSS_CORRELATION', 'cross'),
ctype='cudnnConvolutionMode_t')
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'),
# CUDNN_DATA_INT8 # new in v6
# CUDNN_DATA_INT32 # new in v6
# CUDNN_DATA_INT8x4 # new in v6
ctype='cudnnDataType_t')
cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM', 'small'),
('CUDNN_CONVOLUTION_FWD_ALGO_GEMM', 'large'),
# not implemented:
('CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD', 'winograd'),
# TODO: Not yet tested/documented:
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionFwdAlgo_t')
conv3d_fwd_algorithms = ('none', 'small', 'fft_tiling')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
# TODO: not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
conv3d_bwd_filter_algorithms = ('none', 'small')
cudnnConvolutionBwdDataAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_DATA_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD', 'winograd'),
# TODO: not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdDataAlgo_t')
conv3d_bwd_data_algorithms = ('none', 'deterministic', 'fft_tiling')
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
ctype='cudnnPoolingMode_t')
cudnnSoftmaxAlgorithm_t = CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'),
('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
('CUDNN_SOFTMAX_LOG', 'log'),
ctype='cudnnSoftmaxAlgorithm_t')
cudnnSoftmaxMode_t = CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
ctype='cudnnSoftmaxMode_t')
cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
ctype='cudnnBatchNormMode_t')
class CuDNNV6(CuDNNV51):
version = 6
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
# new in v6:
('CUDNN_POOLING_MAX_DETERMINISTIC', 'max_deterministic'),
ctype='cudnnPoolingMode_t')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
# not implemented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
# TODO: not yet tested/documented:
# new in v6:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
def get_definitions(cudnn_version=None):
    """
    Return cuDNN definitions to be used by Theano for the given cuDNN version.

    ``cudnn_version`` must be None or an integer
    (typically the version returned by :func:`theano.gpuarray.dnn.version`).
    If None, return definitions for the most recent supported cuDNN version.
    """
    # cuDNN encodes major*1000 + minor*100 + patch, so // 1000 extracts
    # the major version.
    if cudnn_version is not None and cudnn_version // 1000 == 5:
        return CuDNNV51()
    # By default, we use definitions for the last supported cuDNN version.
    return CuDNNV6()
Diff collapsed.
......@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationBackward(
_handle,
MODE,
params->handle,
params->mode,
alphaData,
betaData,
alphaParam,
......
......@@ -3,7 +3,7 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuArrayObject **outp, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
} else {
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
}
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
......@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
_handle,
MODE,
params->handle,
params->mode,
alpha,
beta,
bn_input,
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
......@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
} else {
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
......@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (!reuse_algo) {
size_t free;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
......@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
} else {
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
if (stride[0] != 1 || stride[1] != 1) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
......@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
// #ifdef CHOOSE_ALGO
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
// #ifndef CHOOSE_ONCE
if (!PARAMS->choose_once) {
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
// #endif
}
// #endif
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo = 0;
cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0};
#endif
#endif
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = kerns->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
// #ifdef CONV_INPLACE
if (params->inplace) {
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
// #else
} else {
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
}
// #endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0);
......@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
// #ifdef CHOOSE_ALGO
if (params->choose_algo) {
// #ifndef CHOOSE_ONCE
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// #endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
// #ifdef CHOOSE_TIME
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
algo = choice.algo;
// #else
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
// #endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
// #ifdef CHOOSE_ONCE
if (params->choose_once) {
reuse_algo = 1;
// #else
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
// #endif
}
#endif
#endif
// #endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
......@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_top_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
} else {
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0);
......@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
if (!reuse_algo) {
size_t free;
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
......@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = img->context;
size_t dims[5];
cudnnStatus_t err;
......@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err;
......@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
......@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self):
return (3,)
return (5,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
......@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op):
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = """
KERNEL void %(kname)s(float* input, float* output,
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x,
ga_ssize inputStrides_y,
ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
......@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc',
'intc', 'intc',
......@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op):
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, ]
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """
// helper functions
WITHIN_KERNEL
......@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumOp(float* input, float* output,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum) {
KERNEL void k_blockCumOp(float* input, ga_size input_offset,
float* output, ga_size output_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
......@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op):
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = """
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
......@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] %(op)s= currentBlockSum;
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params,
......@@ -381,7 +398,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
......@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) deviceBlockSum->ga.data
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
};
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){
......@@ -410,7 +430,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
......@@ -431,7 +453,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1;
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
......
......@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
mode_with_gpu.check_py_code = False
# This variable will store the list of pooling modes available with the current runtime cuDNN version.
# Don't use this variable directly, always call `get_dnn_pool_modes()` instead.
dnn_pool_modes = None
def get_dnn_pool_modes():
    """Lazily initialize and return the pooling modes supported by the
    runtime cuDNN version.

    Only pooling tests call this; the result is cached in the module-level
    ``dnn_pool_modes`` so the cudnn definitions are looked up at most once.
    """
    global dnn_pool_modes
    if dnn_pool_modes is None:
        # Deferred import: cudnn_defs is only needed the first time a
        # pooling test asks for the mode list.
        from .. import cudnn_defs
        defs = cudnn_defs.get_definitions(dnn.version(raises=False))
        dnn_pool_modes = defs.cudnnPoolingMode_t.get_aliases()
    return dnn_pool_modes
# If using float16, set CUDNN precision to float32
def set_precision(floatX):
if floatX == "float16":
......@@ -155,11 +169,7 @@ def test_pooling():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor4()
for mode, pad in product(modes,
......@@ -242,7 +252,9 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()])
def test_pooling_with_tensor_vars():
# This test will be run with different values of 'mode'
# (see next test below).
def run_pooling_with_tensor_vars(mode):
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
......@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
ws = theano.shared(np.array([2, 2], dtype='int32'))
stride = theano.shared(np.array([1, 1], dtype='int32'))
pad = theano.shared(np.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x):
dnn_op = dnn.dnn_pool(
......@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
i += 1
def test_pooling_with_tensor_vars():
    # Exercise pooling with tensor variables for mode 'max', and also for
    # 'max_deterministic' when the runtime cuDNN version provides it.
    wanted = ('max', 'max_deterministic')
    for mode in get_dnn_pool_modes():
        if mode in wanted:
            yield (run_pooling_with_tensor_vars, mode)
def test_pooling3d():
# 3d pooling requires version 3 or newer.
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000:
......@@ -307,11 +324,7 @@ def test_pooling3d():
mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpuarray')
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor5()
for mode, pad in product(modes,
......@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
for n_non_pool_dims in (0, 1, 2, 3):
for ws in ((2, 2), (3, 3, 3)):
......@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
fc = theano.function([], out, mode=mode_without_gpu)
assert any([isinstance(node.op, Pool)
for node in fc.maker.fgraph.toposort()])
if mode == 'max':
if mode in ('max', 'max_deterministic'):
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
......@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1), (2, 2), (3, 3)],
......@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
......@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......
......@@ -5710,41 +5710,50 @@ def local_opt_alloc(node):
if node_inps.owner and isinstance(node_inps.owner.op, T.Alloc):
input = node_inps.owner.inputs[0]
shapes = node_inps.owner.inputs[1:]
if (node.op.axis is None or
node.op.axis == tuple(range(input.ndim))):
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
# check which type of op
casted = T.mul(*shapes).astype(str(input.dtype))
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
val = val.reshape(1)[0]
# check which type of op
size = T.mul(*shapes)
if input.dtype in ["float16", "float32"]:
# shapes are ints and normally int64.
# We don't want to have a float64 upcast
# We don't want to downcast to float16
# as we fear it could loose too much precision
# that will be amplified by the mul/pow below.
size = size.astype('float32')
if (node.op.axis is None or
node.op.axis == tuple(range(input.ndim))):
if isinstance(node.op, T.Sum):
val = val.reshape(1)[0] * casted
val = val * size
else:
val = val.reshape(1)[0] ** casted
val = val ** size
# Sum can change the input dtype (upcast or bool
# -> float32) by default or by user request.
# We can ignore the acc_dtype, as there is only 1
# elemwise we will do and not a sequence, so there is no
# accumulation of errors.
# So mostly, we just need to cast the output to the old
# dtype.
val = val.astype(node.outputs[0].dtype)
return [val]
except NotScalarConstantError:
pass
else:
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
val = val.reshape(1)[0]
to_prod = [shapes[i] for i in xrange(len(shapes))
if i in node.op.axis]
if to_prod:
casted = T.mul(*to_prod).astype(str(input.dtype))
if isinstance(node.op, T.Sum):
val *= casted
else:
val = val ** casted
return [T.alloc(val,
*[shapes[i] for i in xrange(len(shapes))
if i not in node.op.axis])]
except NotScalarConstantError:
pass
to_prod = [shapes[i] for i in xrange(len(shapes))
if i in node.op.axis]
if to_prod:
size = T.mul(*to_prod)
if isinstance(node.op, T.Sum):
val *= size
else:
val = val ** size
# See comments above.
val = val.astype(node.outputs[0].dtype)
return [T.alloc(val,
*[shapes[i] for i in xrange(len(shapes))
if i not in node.op.axis])]
except NotScalarConstantError:
pass
@register_specialize
......
......@@ -59,7 +59,8 @@ def pool_2d(input, ws=None, ignore_border=None, stride=None, pad=(0, 0),
stride : tuple of two ints or theano vector of ints of size 2.
Stride size, which is the number of shifts over rows/cols to get the
next pool region. If stride is None, it is considered equal to ws
(no overlap on pooling regions).
(no overlap on pooling regions), eg: stride=(1,1) will shifts over
one row and one col for every iteration.
pad : tuple of two ints or theano vector of ints of size 2.
(pad_h, pad_w), pad zeros to extend beyond four borders of the
images, pad_h is the size of the top and bottom margins, and
......@@ -433,6 +434,9 @@ class Pool(OpenMPOp):
super(Pool, self).__init__(openmp=openmp)
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......@@ -1040,6 +1044,9 @@ class PoolGrad(OpenMPOp):
def __init__(self, ignore_border, mode='max', ndim=2, openmp=None):
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool grad algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......
......@@ -5558,9 +5558,11 @@ class T_local_sum_prod(unittest.TestCase):
class T_local_opt_alloc(unittest.TestCase):
dtype = 'float32'
def test_sum_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s)
a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64
theano.config.warn_float64 = "raise"
try:
......@@ -5571,7 +5573,7 @@ class T_local_opt_alloc(unittest.TestCase):
def test_prod_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s)
a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64
theano.config.warn_float64 = "raise"
try:
......@@ -5580,6 +5582,24 @@ class T_local_opt_alloc(unittest.TestCase):
finally:
theano.config.warn_float64 = orig
@theano.configparser.change_flags(on_opt_error='raise')
def test_sum_bool_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(True, dtype='bool'), s, s)
f = theano.function([s], a.sum())
f(5)
# test with user specified dtype
f = theano.function([s], a.sum(dtype=self.dtype))
f(5)
# test only 1 axis summed
f = theano.function([s], a.sum(axis=0, dtype=self.dtype))
f(5)
print(self.dtype)
class T_local_opt_alloc_f16(T_local_opt_alloc):
    # Re-run every T_local_opt_alloc test case with float16 instead of the
    # base class's float32, to cover the half-precision upcast paths.
    dtype = 'float16'
class T_local_reduce(unittest.TestCase):
def setUp(self):
......
......@@ -804,8 +804,8 @@ theano.compile.register_specify_shape_c_code(
PyErr_Format(PyExc_AssertionError,
"SpecifyShape: vector of shape has %%d elements,"
" but the input has %%d dimensions.",
PyArray_NDIM(%(iname)s),
PyArray_DIMS(%(shape)s)[0]);
PyArray_DIMS(%(shape)s)[0],
PyArray_NDIM(%(iname)s));
%(fail)s;
}
for(int i = 0; i < PyArray_NDIM(%(iname)s); i++){
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论