Commit 9dcf3f4c authored by Arnaud Bergeron

Delete sandbox.cuda.

Parent bea31470
from __future__ import absolute_import, print_function, division
import numpy
import theano
import theano.tensor as T
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, HostFromGpu)
from theano.misc import strutil
from theano.tensor.nnet.Conv3D import Conv3D
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(GpuOp):
    """
    GPU implementation of Conv3D.

    Data layout is (batch, row, column, time, channel) for V and the
    output H, and (out_channel, row, column, time, in_channel) for W.
    """

    def __eq__(self, other):
        # The op has no parameters, so all instances of the same type
        # are interchangeable (lets Theano merge duplicate nodes).
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return '%s' % (self.__class__.__name__)

    def make_node(self, V, W, b, d):
        """
        Build the Apply node; V, W and b are moved to the GPU.

        Parameters
        ----------
        V
            Visible unit, input.
        W
            Weights, filter.
        b
            Bias.
        d
            Strides when moving the filter over the input.
        """
        V_ = as_cuda_ndarray_variable(V)
        W_ = as_cuda_ndarray_variable(W)
        b_ = as_cuda_ndarray_variable(b)
        d_ = T.as_tensor_variable(d)
        # 5D output; only batch / out-channel broadcastability is inherited.
        broad = (V_.broadcastable[0], W_.broadcastable[0], False, False, False)
        return theano.Apply(self, inputs=[V_, W_, b_, d_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])

    def c_code_cache_version(self):
        # Empty tuple: the compiled code is never cached across sessions.
        return ()

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Generates the host-side C code: input validation, output
        # allocation, then a single launch of the conv_rows_stack kernel
        # (declared in c_support_code_apply below).
        V, W, b, d = inputs
        fail = sub['fail']
        H = outputs[0]
        codeSource = """
        ///////////// < code generated by GpuConv3D >
        //printf("\t\t\t\tConv3DGPU c code\\n");
        //Check dimensionality of inputs
        if (CudaNdarray_NDIM(%(W)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(V)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(d)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
            %(fail)s
        }
        if (PyArray_DIMS(%(d)s)[0] != 3)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", PyArray_DIMS(%(d)s)[0]);
            %(fail)s
        }
        { //extra scope so fail doesn't jump over declarations
        //Read and check sizes of inputs
        const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
        const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
        const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
        if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
            %(fail)s
        }
        { //extra scope so error handler jumps don't cause errors
        const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
        const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
        const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
        const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
        const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
        const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
        if (vidHeight < filterHeight)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
            %(fail)s
        }
        { // extra scope so fail works
        if (vidWidth < filterWidth)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
            %(fail)s
        }
        { // extra scope so fail works
        if (vidDur < filterDur)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
            %(fail)s
        }
        { // extra scope so fail works
        //Read and check stride arguments
        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
        if (dr <= 0 || dc <= 0 || dt <= 0)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
            %(fail)s
        }
        { // extra scope so fail works
        //Make correctly sized output
        const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
        const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
        const int outputDur = int( (vidDur - filterDur) / dt ) +1;
        npy_intp dims[5];
        dims[0] = batchSize;
        dims[4] = outputChannels;
        dims[1] = outputHeight;
        dims[2] = outputWidth;
        dims[3] = outputDur;
        if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
        CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
        CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
        CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
        CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
            Py_XDECREF(%(H)s);
            %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
            if (!(%(H)s)) {
                PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
                %(fail)s
            }
        }
        { // extra scope so fail will not cross declarations
        //#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
        const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
        const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
        const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
        const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
        const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
        const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
        const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
        const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
        const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
        const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
        // Compute H
        //H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
        bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
        int version = -1;
        int verbose = 0;
        bool subsample =(dr>1)||(dc>1)||(dt>1);
        bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
        bool work_complete = false;
        if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 && !work_complete){
            //conv_rows_stack
            dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
            dim3 threads(outputDur);
            int shared_size=0;
            conv_rows_stack<<<grid, threads, shared_size>>>(
            CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
            vidHeight, vidWidth, vidDur,
            filterHeight, filterWidth, filterDur,
            outputChannels, inputChannels,
            dr,dc,dt,
            vs3,vs2,vs1,vs4,vs0,
            ws3,ws2,ws1,ws4,ws0);
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess == sts)
            {
                work_complete = true;
                if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
            }
            else
            {
                if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
                PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
                cudaGetErrorString(sts));
                %(fail)s
            }
        }
        if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
        }
        }}}}}}} //extra scope so error handler jumps don't cross declarations
        ///////////// < /code generated by GpuConv3D >
        """
        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        codeSource = """
        __global__ void
        //thread block size = out_dur
        //grid block size =(out_len*out_wid, nb kern *nb batch)
        //
        conv_rows_stack( float* img, float* kern, float* bias, float* out,
        int img_len, int img_wid, int img_dur,
        int kern_height, int kern_wid, int kern_dur,
        int nkern, int input_channels,
        int dr, int dc, int dt,
        int img_stride_frame, int img_stride_col, int img_stride_row,
        int img_stride_ochannel, int img_stride_batch,
        int kern_stride_frame, int kern_stride_col, int kern_stride_row,
        int kern_stride_stack, int kern_stride_okern)
        {
            int __shared__ out_len, out_wid, out_dur, batch_id, kern_id;
            float __shared__ *d_img, *d_kern;
            out_len = int( (img_len - kern_height) / dr )+1;
            out_wid = int( (img_wid - kern_wid) / dc )+1;
            out_dur = int( (img_dur - kern_dur) / dt )+1;
            batch_id= blockIdx.y/nkern;
            kern_id = blockIdx.y - batch_id*nkern;
            const int out_row = blockIdx.x%out_len;
            const int out_col = blockIdx.x/out_len;
            const int out_frame=threadIdx.x;
            img += batch_id*img_stride_batch + out_row*dr*img_stride_row + out_col*dc*img_stride_col+out_frame*dt*img_stride_frame;
            kern += kern_id*kern_stride_okern;
            float sum = 0.0f;
            for (int z = 0; z < input_channels; z++) {//1 for first layer
                for (int k =0; k < kern_height; k++) {
                    for (int l = 0; l < kern_wid; l++) {
                        for (int m = 0; m < kern_dur; m++) {
                            sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
                            kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
                        }
                    }
                }
            }
            out[batch_id*nkern*out_len*out_wid*out_dur+//the good batch
            out_frame*nkern+//the output frame
            out_row*out_wid*out_dur*nkern+//the output row
            out_col*out_dur*nkern + //the output_col
            kern_id //the output image (channel)
            ] = sum + bias[kern_id];
        }
        """
        return codeSource
gpu_convd = GpuConv3D()


@local_optimizer([Conv3D])
def local_gpu_conv3d(node):
    """Replace a float32 Conv3D that consumes GPU data by GpuConv3D."""
    if not isinstance(node.op, Conv3D):
        return
    on_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                 for inp in node.inputs)
    f32_out = all(out.type.dtype == 'float32' for out in node.outputs)
    if on_gpu and f32_out:
        V, W, b, d = node.inputs
        gpu_out = gpu_convd(as_cuda_ndarray_variable(V),
                            as_cuda_ndarray_variable(W),
                            as_cuda_ndarray_variable(b),
                            d)
        return [host_from_gpu(gpu_out)]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv3d", local_gpu_conv3d)
from __future__ import absolute_import, print_function, division
import numpy
import theano
import theano.tensor as T
from six.moves import xrange
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.misc import strutil
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvGrad3D(GpuOp):
    """
    GPU version of gradient of ConvGrad3D with respect to W.

    Inputs follow the (batch, row, column, time, channel) layout used by
    GpuConv3D; the output dCdW has shape WShape.
    """

    def make_node(self, V, d, WShape, dCdH):
        """
        Build the Apply node; V and dCdH are moved to the GPU.

        Parameters
        ----------
        V
            Visible.
        d
            Strides.
        WShape
            Shapes of the weights -> shape of this op output.
        dCdH
            Other input with what V will be convolved.
        """
        V_ = as_cuda_ndarray_variable(V)
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = as_cuda_ndarray_variable(dCdH)
        broad = (False,) * 5
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])

    def perform_(self, node, inputs, output_storage):
        # Pure-python reference implementation.  The trailing underscore
        # keeps Theano from calling it; the C code below is what runs.
        V, d, WShape, dCdH = inputs
        print("GpuConvGrad3D python code (warning not updated to new format)")
        # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
        outputHeight = dCdH.shape[2]
        outputWidth = dCdH.shape[3]
        outputDur = dCdH.shape[4]
        assert V.shape[0] == batchSize
        dr, dc, dt = d
        dCdW = numpy.zeros(WShape, dtype=V.dtype)
        # block
        for j in xrange(0, WShape[0]):
            for z in xrange(0, WShape[1]):
                for k in xrange(0, WShape[2]):
                    for l in xrange(0, WShape[3]):
                        # threads
                        for m in xrange(0, WShape[4]):
                            # thread
                            for i in xrange(0, batchSize):
                                for p in xrange(0, outputHeight):
                                    for q in xrange(0, outputWidth):
                                        for r in xrange(0, outputDur):
                                            dCdW[j, z, k, l, m] += dCdH[
                                                i, j, p, q, r] * \
                                                V[i, z, dr * p + k,
                                                  dc * q + l,
                                                  dt * r + m]
        output_storage[0][0] = dCdW

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Host-side C code: validate inputs, allocate dCdW, launch the
        # convgrad_rows_stack kernel (declared in c_support_code_apply).
        V, d, WShape, dCdH = inputs
        fail = sub['fail']
        dCdW = outputs[0]
        codeSource = """
        ///////////// < code generated by GpuConvGrad3D >
        //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
        //Check dimensionality of inputs
        if (CudaNdarray_NDIM(%(dCdH)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: dCdH must be a 5-d CudaNdArray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(V)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: V must be a 5-d CudaNdArray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(WShape)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be a 1-d CudaNdArray");
            %(fail)s
        }
        if (PyArray_NDIM(%(d)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: d must be a 1-d CudaNdArray");
            %(fail)s
        }
        if (PyArray_DIMS(%(d)s)[0] != 3)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: 3 stride lengths arguments expected(for row, col, and time) but %%li were given", PyArray_DIMS(%(d)s)[0]);
            %(fail)s
        }
        { // for fail
        //Read and check sizes of inputs
        const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
        if (PyArray_DIMS(%(WShape)s)[0] != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must specify a 5-d shape");
            %(fail)s
        }
        if (!PyArray_ISCONTIGUOUS(%(WShape)s))
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be contiguous");
            %(fail)s
        }
        { //for fail
        dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) PyArray_DATA(%(WShape)s);
        const int outputChannels = WShape[0];
        const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
        if (WShape[4] != inputChannels)
        {
            PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels);
            %(fail)s
        }
        { //extra scope so fail works
        const int filterHeight = WShape[1];
        const int filterWidth = WShape[2];
        const int filterDur = WShape[3];
        const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
        const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
        const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
        if (vidHeight < filterHeight)
        {
            PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
            %(fail)s
        }
        if (vidWidth < filterWidth)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a width of %%i but V is only %%i pixels wide", filterWidth, vidWidth);
            %(fail)s
        }
        if (vidDur < filterDur)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth);
            %(fail)s
        }
        { // extra scope so fail works
        //Read and check stride arguments
        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
        if (dr <= 0 || dc <= 0 || dt <= 0)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
            %(fail)s
        }
        //Compute correctl sized of output
        const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
        const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
        const int outputDur = int( (vidDur - filterDur) / dt ) +1;
        if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[3] != outputDur)
        {
            PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%i,%%i,%%i,%%i,%%i)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, CudaNdarray_HOST_DIMS(%(dCdH)s)[0], CudaNdarray_HOST_DIMS(%(dCdH)s)[1], CudaNdarray_HOST_DIMS(%(dCdH)s)[2] ,CudaNdarray_HOST_DIMS(%(dCdH)s)[3], CudaNdarray_HOST_DIMS(%(dCdH)s)[4] );
            %(fail)s
        }
        { // extra scope for fail
        npy_intp dims[5];
        dims[0] = outputChannels;
        dims[4] = inputChannels;
        dims[1] = filterHeight;
        dims[2] = filterWidth;
        dims[3] = filterDur;
        if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
            Py_XDECREF(%(dCdW)s);
            %(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
            if (!(%(dCdW)s)) {
                PyErr_Format(PyExc_MemoryError, "GpuConvGrad3D: Could not allocated dCdW");
                %(fail)s
            }
        }
        { //for fail
        const int dcdhs4 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[4];
        const int dcdhs3 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[3];
        const int dcdhs1 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[1];
        const int dcdhs2 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[2];
        const int dcdhs0 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[0];
        const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
        const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
        const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
        const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
        const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
        bool out_contiguous = CudaNdarray_is_c_contiguous(%(dCdW)s);
        int version = -1;
        int verbose = 0;
        bool subsample =(dr>1)||(dc>1)||(dt>1);
        bool work_complete = false;
        // The thread block size below is WShape[3] (one thread per filter
        // frame), so that is the dimension that must respect the 512
        // threads-per-block limit.  (The original guard tested WShape[4].)
        if(out_contiguous && (version==0||version==-1) && WShape[3]<=512 && !work_complete){
            //conv_rows_stack
            dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
            dim3 threads(WShape[3]);
            int shared_size=0;
            convgrad_rows_stack<<<grid, threads, shared_size>>>(
            CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(dCdH)s), CudaNdarray_DEV_DATA(%(dCdW)s),
            vidHeight, vidWidth, vidDur,
            filterHeight, filterWidth, filterDur,
            WShape[0], WShape[1], WShape[2], WShape[3], WShape[4],
            outputHeight,outputWidth,outputDur,
            batchSize, outputChannels, inputChannels,
            dr,dc,dt,
            vs3,vs2,vs1,vs4,vs0,
            dcdhs3,dcdhs2,dcdhs1,dcdhs4,dcdhs0);
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess == sts)
            {
                work_complete = true;
                if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
            }
            else
            {
                if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
                PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
                cudaGetErrorString(sts));
                %(fail)s
            }
        }
        if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
        }
        }}}}} // extra scope for fail
        ///////////// < /code generated by GpuConvGrad3D >
        """
        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        # Bug fix: the kernel body previously referenced the undefined
        # identifier `ws1`; the parameter is named `wsh1`.
        codeSource = """
        __global__ void
        //thread block size = WShape[3]
        //grid block size = (WShape[0]*WShape[4],WShape[1]*WShape[2])
        //
        convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
        int img_len, int img_wid, int img_dur,
        int dCdW_len, int dCdW_wid, int dCdW_dur,
        int wsh0, int wsh1, int wsh2, int wsh3, int wsh4,
        int out_len, int out_wid, int out_dur,
        int batchSize, int nkern, int nstack,
        int dr, int dc, int dt,
        int img_stride_frame, int img_stride_col, int img_stride_row,
        int img_stride_stack, int img_stride_batch,
        int dCdW_stride_frame, int dCdW_stride_col, int dCdW_stride_row,
        int dCdW_stride_stack, int dCdW_stride_nkern)
        {
            int __shared__ kern_id, stack_id;
            float __shared__ *d_img, *d_kern;
            kern_id= blockIdx.x%nkern;
            stack_id = blockIdx.x/nkern;
            const int dCdW_row = blockIdx.y%wsh1;
            const int dCdW_col = blockIdx.y/wsh1;
            const int dCdW_frame=threadIdx.x;
            img +=stack_id*img_stride_stack;
            dCdH +=kern_id*dCdW_stride_stack;
            float sum = 0.0f;
            for(int i=0;i<batchSize;i++){
                for(int p=0;p<out_len;p++){
                    for(int q=0;q<out_wid;q++){
                        for(int r=0;r<out_dur;r++){
                            sum += dCdH[i*dCdW_stride_nkern+p*dCdW_stride_row+q*dCdW_stride_col+r*dCdW_stride_frame] *
                            img[i*img_stride_batch+(dr*p+dCdW_row)*img_stride_row+(dc*q+dCdW_col)*img_stride_col+(dt*r+dCdW_frame)*img_stride_frame];
                        }
                    }
                }
            }
            dCdW[kern_id*wsh1*wsh2*wsh3*wsh4+//the good batch
            stack_id+//the output image
            dCdW_row*wsh2*wsh3*wsh4+//the output row
            dCdW_col*wsh3*wsh4 + //the output_col
            dCdW_frame*wsh4] = sum;
        }
        /*
        #block
        for j in xrange(0,WShape[0]):
            for z in xrange(0,WShape[1]):
                for k in xrange(0,WShape[2]):
                    for l in xrange(0,WShape[3]):
                        #threads
                        for m in xrange(0,WShape[4]):
                            #thread
                            for i in xrange(0,batchSize):
                                for p in xrange(0,outputHeight):
                                    for q in xrange(0,outputWidth):
                                        for r in xrange(0,outputDur):
                                            dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
        */
        """
        return codeSource
gpu_conv_grad3d = GpuConvGrad3D()


@local_optimizer([ConvGrad3D])
def local_gpu_conv_grad3d(node):
    """Replace a float32 ConvGrad3D fed by GPU data with GpuConvGrad3D."""
    if not isinstance(node.op, ConvGrad3D):
        return
    on_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                 for inp in node.inputs)
    f32_out = all(out.type.dtype == 'float32' for out in node.outputs)
    if on_gpu and f32_out:
        V, d, WShape, dCdH = node.inputs
        gpu_out = gpu_conv_grad3d(as_cuda_ndarray_variable(V),
                                  d,
                                  WShape,
                                  as_cuda_ndarray_variable(dCdH))
        return [host_from_gpu(gpu_out)]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv_grad3d", local_gpu_conv_grad3d)
from __future__ import absolute_import, print_function, division
import numpy
import theano.tensor as T
from theano.misc import strutil
import theano
from six.moves import xrange
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvTransp3D(GpuOp):
    """
    The gpu version of ConvTransp3D.
    """

    def __eq__(self, other):
        # Parameterless op: instances are interchangeable by type.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, W, b, d, H, RShape=None):
        """
        Build the Apply node.

        Parameters
        ----------
        W
            Weights (filters).
        b
            Bias, one value per reconstructed channel.
        d
            Strides (row, col, time).
        H
            Hidden representation to transpose-convolve.
        RShape
            Optional requested (height, width, duration) of the
            reconstruction; [-1, -1, -1] means "use the minimal size".
        """
        W_ = as_cuda_ndarray_variable(W)
        b_ = as_cuda_ndarray_variable(b)
        d_ = T.as_tensor_variable(d)
        H_ = as_cuda_ndarray_variable(H)
        if RShape:
            RShape_ = T.as_tensor_variable(RShape)
        else:
            # Placeholder meaning "no explicit shape requested".
            RShape_ = T.as_tensor_variable([-1, -1, -1])
        return theano.Apply(
            self, inputs=[W_, b_, d_, H_, RShape_],
            outputs=[CudaNdarrayType(
                dtype=H_.dtype, broadcastable=(False,) * 5)()])

    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
        W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
        # The c_code allocates the output as
        # (batchSize, videoHeight, videoWidth, videoDur, inputChannels)
        # with inputChannels == W.shape[4]; report the same layout here.
        # (The previous version returned W_shape[1] on axis 1, which did
        # not match the allocated output.)
        # NOTE(review): only exact when an explicit RShape was given; with
        # the default [-1, -1, -1] placeholder the spatial entries are -1.
        return [(H_shape[0], RShape[0], RShape[1], RShape[2], W_shape[4])]

    def perform_(self, node, inputs, output_storage):
        # Pure-python reference implementation; trailing underscore keeps
        # Theano from using it in place of the C code.
        W, b, d, H, RShape = inputs
        print("\t\t\t\tGpuConvTransp3D python code still uses old format")
        output_storage[0][0] = computeR(W, b, d, H, RShape)

    def c_code_cache_version(self):
        # Empty tuple: never cache the compiled code.
        return ()

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Host-side C code: validate inputs, allocate and zero R, launch
        # the conv_transp_rows_stack kernel (c_support_code_apply below).
        # Fixes vs. the previous revision: removed a stray `HERE` token
        # that broke compilation, repaired the split identifier
        # `RShape 1` -> `RShape1`, and made the b-stride printf
        # conditional on `verbose` instead of unconditional.
        W, b, d, H, RShape = inputs
        fail = sub['fail']
        R = outputs[0]
        codeSource = """
        ///////////// < code generated by GpuConvTransp3D >
        //printf("\t\t\t\tGpuConvTransp c code\\n");
        //Check dimensionality of inputs
        if (CudaNdarray_NDIM(%(H)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D", CudaNdarray_NDIM(%(H)s));
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(W)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
            %(fail)s
        }
        if (PyArray_NDIM(%(d)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
            %(fail)s
        }
        //Read and check stride arguments
        if (PyArray_DIMS(%(d)s)[0] != 3)
        {
            PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", PyArray_DIMS(%(d)s)[0]);
            %(fail)s
        }
        { // for fail
        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
        if (dr <= 0 || dc <= 0 || dt <= 0)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
            %(fail)s
        }
        //Read and check sizes of inputs
        { // for fail
        const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
        const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
        if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
        {
            PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
            %(fail)s
        }
        { // for fail
        const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];
        if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
        {
            PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels );
            %(fail)s
        }
        { // for fail
        const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
        const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
        const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
        const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
        const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
        const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];
        int videoHeight = (outputHeight-1) * dr + filterHeight;
        int videoWidth = (outputWidth-1) * dc + filterWidth;
        int videoDur = (outputDur-1) * dt + filterDur;
        if (%(RShape)s)
        {
            if (PyArray_NDIM(%(RShape)s) != 1)
            {
                PyErr_Format(PyExc_ValueError, "RShape must be a vector");
                %(fail)s
            }
            if (PyArray_DIMS(%(RShape)s)[0] != 3)
            {
                PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
                %(fail)s
            }
            { // for fail
            dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
            dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
            dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
            if (RShape0 != -1)
            {
                if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
                {
                    PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape1, RShape2 );
                    %(fail)s
                }
                videoHeight = RShape0;
                videoWidth = RShape1;
                videoDur = RShape2;
            }
        }
        //Allocate the reconstruction
        npy_intp dims[5];
        dims[0] = batchSize;
        dims[4] = inputChannels;
        dims[1] = videoHeight;
        dims[2] = videoWidth;
        dims[3] = videoDur;
        if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
        CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
        CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
        CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
        CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
            Py_XDECREF(%(R)s);
            %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
            if (!(%(R)s)) {
                PyErr_Format(PyExc_MemoryError,"Could not allocate R");
                %(fail)s
            }
        }
        cudaMemset(CudaNdarray_DEV_DATA(%(R)s), 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);
        { // for fail
        bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s);
        int version = -1;
        int verbose = 0;
        bool subsample =(dr>1)||(dc>1)||(dt>1);
        bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
        if (verbose) printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]);
        bool work_complete = false;
        const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
        const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
        const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
        const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
        const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
        const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4];
        const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3];
        const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2];
        const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1];
        const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0];
        if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){
            //conv_transp_rows_stack
            dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
            dim3 threads(videoDur);
            int shared_size=0;
            conv_transp_rows_stack<<<grid, threads, shared_size>>>(
            CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s),
            videoHeight, videoWidth, videoDur,
            filterHeight, filterWidth, filterDur,
            outputHeight, outputWidth, outputDur,
            outputChannels, inputChannels,
            dr,dc,dt,
            hs3,hs2,hs1,hs4,hs0,
            ws3,ws2,ws1,ws4,ws0,
            CudaNdarray_HOST_STRIDES(%(b)s)[0]);
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess == sts)
            {
                work_complete = true;
                if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
            }
            else
            {
                if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts));
                PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
                cudaGetErrorString(sts));
                %(fail)s
            }
        }
        if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d",
            out_contiguous,b_strided,outputDur);
            %(fail)s
        }
        }}}}}} // for fail
        ///////////// < /code generated by GpuConvTransp3D >
        """
        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        codeSource = """
        __global__ void
        //thread block size = videoDur
        //grid block size =(batchSize * inputChannels, videoHeight * videoWidth)
        //
        conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
        int img_len, int img_wid, int img_dur,
        int kern_len, int kern_wid, int kern_dur,
        int H_len, int H_wid, int H_dur,
        int nkern, int nstack,
        int dr, int dc, int dt,
        int H_stride_frame, int H_stride_col, int H_stride_row,
        int H_stride_stack, int H_stride_batch,
        int kern_stride_frame, int kern_stride_col, int kern_stride_row,
        int kern_stride_stack, int kern_stride_nkern,
        int bias_stride)
        {
            int __shared__ batch_id, stack_id;
            float __shared__ *d_img, *d_kern;
            batch_id= blockIdx.x/nstack;
            stack_id = blockIdx.x - batch_id*nstack;
            const int R_row = blockIdx.y/img_wid;
            const int R_col = blockIdx.y%img_wid;
            const int R_frame=threadIdx.x;
            const int r = R_row;
            const int c = R_col;
            const int t = R_frame;
            const int ftc = max(0, int(ceil(float(t-kern_dur +1 )/float(dt))));
            const int fcc = max(0, int(ceil(float(c-kern_wid +1)/float(dc))));
            int rc = max(0, int(ceil(float(r-kern_len+1)/float(dr))));
            float sum = 0;
            while(rc < H_len){
                int rk = r - rc * dr;
                if(rk < 0)
                    break;
                int cc = fcc;
                while( cc < H_wid){
                    int ck = c - cc * dc;
                    if(ck < 0)
                        break;
                    int tc = ftc;
                    while(tc < H_dur){
                        int tk = t - tc * dt;
                        if(tk < 0)
                            break;
                        //R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
                        for(int q=0;q<nkern;q++){
                            sum += kern[q*kern_stride_nkern+stack_id*kern_stride_stack+rk*kern_stride_row+ck*kern_stride_col+tk*kern_stride_frame]*
                            H[batch_id*H_stride_batch+q*H_stride_stack+rc*H_stride_row+cc*H_stride_col+tc*H_stride_frame];
                        }
                        tc += 1;
                    }
                    cc += 1;
                }
                rc += 1;
            }
            R[batch_id*nstack*img_len*img_wid*img_dur+//the good batch
            stack_id+//the output image
            R_row*img_wid*img_dur*nstack+//the output row
            R_col*img_dur*nstack + //the output_col
            R_frame*nstack] = sum + bias[stack_id*bias_stride];
        }
        """
        return codeSource
gpu_conv_transpd = GpuConvTransp3D()


@local_optimizer([ConvTransp3D])
def local_gpu_conv_transp3d(node):
    """Replace a float32 ConvTransp3D fed by GPU data with GpuConvTransp3D."""
    if not isinstance(node.op, ConvTransp3D):
        return
    on_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                 for inp in node.inputs)
    f32_out = all(out.type.dtype == 'float32' for out in node.outputs)
    if on_gpu and f32_out:
        W, b, d, H, RShape = node.inputs
        return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv_transp3d", local_gpu_conv_transp3d)
# If the input size wasn't a multiple of the strides we may need some
# automatic padding to get the right size of reconstruction.
def computeR(W, b, d, H, Rshape=None):
    """Numpy reference implementation of the 3D transposed convolution.

    Computes R[i, j, r, c, t] = b[j] + sum over all (rc, cc, tc, k) with
    d * (rc, cc, tc) + (rk, ck, tk) == (r, c, t) of
    W[k, j, rk, ck, tk] * H[i, k, rc, cc, tc].

    Parameters
    ----------
    W : 5d ndarray
        Filters, shaped (outputChannels, inputChannels, filterHeight,
        filterWidth, filterDur).
    b : 1d ndarray
        Bias, length inputChannels.
    d : sequence of 3 positive ints
        Strides (dr, dc, dt) used by the forward convolution.
    H : 5d ndarray
        Forward-pass outputs, shaped (batchSize, outputChannels,
        outputHeight, outputWidth, outputDur).
    Rshape : sequence of 3 ints, optional
        Requested (height, width, duration) of the reconstruction; each
        must be >= the minimal size. Ignored when None or when
        Rshape[0] == -1.

    Returns
    -------
    R : 5d ndarray
        Shaped (batchSize, inputChannels, videoHeight, videoWidth,
        videoDur), dtype of H.
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
    batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Minimal video size the forward pass could have produced H from.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    if Rshape is not None and Rshape[0] != -1:
        # A caller-provided shape may only pad, never crop.
        assert Rshape[0] >= videoHeight, (Rshape[0], videoHeight)
        assert Rshape[1] >= videoWidth
        assert Rshape[2] >= videoDur
        videoHeight, videoWidth, videoDur = Rshape

    R = numpy.zeros((batchSize, inputChannels, videoHeight,
                     videoWidth, videoDur),
                    dtype=H.dtype)

    # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...}
    #                      sum_{tc,tk | ...} sum_k W[k,j,rk,ck,tk] * H[i,k,rc,cc,tc]
    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                for c in range(videoWidth):
                    # First output column whose filter footprint reaches c
                    # (depends only on c, so hoisted out of the t loop).
                    fcc = max(0, int(numpy.ceil(
                        float(c - filterWidth + 1) / float(dc))))
                    for t in range(videoDur):
                        R[i, j, r, c, t] = b[j]
                        # First output frame/row whose footprint reaches t/r.
                        ftc = max(0, int(numpy.ceil(
                            float(t - filterDur + 1) / float(dt))))
                        rc = max(0, int(numpy.ceil(
                            float(r - filterHeight + 1) / float(dr))))
                        while rc < outputHeight:
                            rk = r - rc * dr
                            if rk < 0:
                                break
                            cc = fcc
                            while cc < outputWidth:
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                tc = ftc
                                while tc < outputDur:
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    R[i, j, r, c, t] += numpy.dot(
                                        W[:, j, rk, ck, tk],
                                        H[i, :, rc, cc, tc])
                                    tc += 1
                                cc += 1
                            rc += 1
    return R
from __future__ import absolute_import, print_function, division
import atexit
import errno
import logging
import os
import shutil
import stat
import sys
import textwrap
import warnings
import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB, TopoOptimizer
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano import config
from . import nvcc_compiler
from theano.tensor.basic import register_transfer
# Databases that collect the GPU graph optimizations.
# ignore_newtrees is to speed the optimization as this is the pattern
# we use for optimization. Otherwise, we can iterate 100s of time on
# the graph and apply only a few optimizations each time.
gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
# Ordered database used to sequence GPU optimization passes.
gpu_seqopt = SequenceDB()
def register_opt(*tags, **kwargs):
    """Return a decorator registering a local optimizer in gpu_optimizer.

    Parameters
    ----------
    *tags : str
        Extra tags under which to register the optimizer (in addition to
        'fast_run', 'fast_compile' and 'gpu').
    **kwargs
        Forwarded to ``gpu_optimizer.register``. A ``name`` keyword, when
        given, overrides the optimizer function's ``__name__``.

    Raises
    ------
    RuntimeError
        If any tag is not a string.
    """
    if any(not isinstance(t, str) for t in tags):
        raise RuntimeError("Bad call to register_opt."
                           " All tags must be strings.", tags)

    def f(local_opt):
        # BUG FIX: was `(kwargs and kwargs.pop('name'))`, which raised
        # KeyError whenever kwargs were given without a 'name' key.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'fast_compile',
                               'gpu', *tags, **kwargs)
        return local_opt
    return f
def register_inplace(*tags, **kwargs):
    """Return a decorator registering an inplace local optimizer in optdb.

    The optimizer is wrapped in a TopoOptimizer (with the standard
    inplace-failure warning callback) and registered at position 60 with
    the 'fast_run', 'inplace' and 'gpu' tags plus ``*tags``.
    A ``name`` keyword overrides the function's ``__name__``.
    """
    def f(local_opt):
        # BUG FIX: was `(kwargs and kwargs.pop('name'))`, which raised
        # KeyError whenever kwargs were given without a 'name' key.
        name = kwargs.pop('name', None) or local_opt.__name__
        optdb.register(
            name, TopoOptimizer(
                local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60, 'fast_run', 'inplace', 'gpu', *tags)
        return local_opt
    return f
# Module logger for the old cuda back-end.
_logger_name = 'theano.sandbox.cuda'
_logger = logging.getLogger(_logger_name)

# is_nvcc_available called here to initialize global vars in
# nvcc_compiler module
nvcc_compiler.is_nvcc_available()

# Compile cuda_ndarray.cu
# This need that nvcc (part of cuda) is installed. If it is not, a warning is
# printed and this module will not be working properly (we set `cuda_available`
# to False).

# This variable is True by default, and set to False if nvcc is not
# available or their is no cuda card or something goes wrong when
# trying to initialize cuda.
cuda_available = True

# Global variable to avoid displaying the same warning multiple times.
cuda_warning_is_displayed = False

# This variable is set to True when we enable cuda.(i.e. when use() is called)
cuda_enabled = False
# Code factorized within a function so that it may be called from multiple
# places (which is not currently the case, but may be useful in the future).
def set_cuda_disabled():
    """Mark the CUDA back-end as unusable.

    Flips the module-level `cuda_available` flag to False so the rest of
    this module knows cuda-based code is not going to work.

    There is no point calling this from outside of `cuda.__init__`: it
    has no effect once the module has finished loading.
    """
    global cuda_available, cuda_warning_is_displayed
    cuda_available = False
# cuda_ndarray compile and import
# Directory holding the CUDA sources shipped with this package.
cuda_path = os.path.abspath(os.path.split(__file__)[0])
# Where the compiled extension module lives, inside the Theano compiledir.
cuda_ndarray_loc = os.path.join(config.compiledir, 'cuda_ndarray')
cuda_ndarray_so = os.path.join(
    cuda_ndarray_loc, 'cuda_ndarray.' + get_lib_extension())
# Name of the symlink/copy other compiled modules link against.
libcuda_ndarray_so = os.path.join(
    cuda_ndarray_loc, 'libcuda_ndarray.' + get_lib_extension())
def try_import():
    """
    Load the cuda_ndarray module if present and up to date.
    Return True if loaded correctly, otherwise return False.
    """
    # The compiled .so is considered stale if any of these CUDA sources
    # is at least as new as it.
    cuda_files = (
        'cuda_ndarray.cu',
        'cuda_ndarray.cuh',
        'conv_full_kernel.cu',
        'cnmem.h',
        'cnmem.cpp',
        'conv_kernel.cu')
    stat_times = [os.stat(os.path.join(cuda_path, cuda_file))[stat.ST_MTIME]
                  for cuda_file in cuda_files]
    date = max(stat_times)
    if os.path.exists(cuda_ndarray_so):
        if date >= os.stat(cuda_ndarray_so)[stat.ST_MTIME]:
            return False
    try:
        # If we load a previously-compiled version, config.compiledir should
        # be in sys.path.
        sys.path[0:0] = [config.compiledir]
        import cuda_ndarray.cuda_ndarray
        del sys.path[0]
    except ImportError:
        # NOTE(review): on ImportError the compiledir entry is left in
        # sys.path. Later module-level `from cuda_ndarray.cuda_ndarray
        # import *` statements appear to rely on it still being there
        # after a fresh compilation — confirm before "fixing" this.
        return False
    return True
# Decide whether the cuda_ndarray extension must be (re)compiled.
if not nvcc_compiler.is_nvcc_available() or not theano.config.cxx:
    # It can happen that the file cuda_ndarray.so is already compiled
    # but nvcc is not available. In that case we need to disable the CUDA
    # back-end as we won't be able to compile any new op and we can't only
    # use already compiled GPU op and not the others.
    # Also, if cxx is not available, we need to disable all GPU code.
    set_cuda_disabled()
    compile_cuda_ndarray = False
elif not config.device.startswith('gpu') and config.force_device:
    # We where asked to NEVER use the GPU
    set_cuda_disabled()
    compile_cuda_ndarray = False
else:
    # Add the theano cache directory's cuda_ndarray subdirectory to the
    # list of places that are hard-coded into compiled modules' runtime
    # library search list. This works in conjunction with
    # nvcc_compiler.NVCC_compiler.compile_str which adds this folder during
    # compilation with -L and also adds -lcuda_ndarray when compiling
    # modules.
    nvcc_compiler.add_standard_rpath(cuda_ndarray_loc)
    compile_cuda_ndarray = not try_import()

if compile_cuda_ndarray and cuda_available:
    # Serialize compilation across processes sharing the compiledir.
    get_lock()
    try:
        # Retry to load again in case someone else compiled it
        # while we waited for the lock
        if not try_import():
            try:
                if not nvcc_compiler.is_nvcc_available():
                    set_cuda_disabled()
                if cuda_available:
                    code = open(os.path.join(cuda_path,
                                             "cuda_ndarray.cu")).read()
                    if not os.path.exists(cuda_ndarray_loc):
                        os.makedirs(cuda_ndarray_loc)
                    # If $TMPDIR is defined, nvopencc wants it to exist
                    if 'TMPDIR' in os.environ:
                        tmpdir = os.environ['TMPDIR']
                        if not os.path.exists(tmpdir):
                            os.makedirs(tmpdir)
                    compiler = nvcc_compiler.NVCC_compiler()
                    preargs = ['-O3'] + compiler.compile_args()
                    compiler.compile_str(
                        'cuda_ndarray',
                        code,
                        location=cuda_ndarray_loc,
                        include_dirs=[cuda_path],
                        libs=[config.cublas.lib],
                        preargs=preargs,
                    )
                    from cuda_ndarray.cuda_ndarray import *
            except Exception as e:
                # Any compilation failure disables the back-end instead of
                # breaking the import of theano.
                _logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))
                set_cuda_disabled()
    finally:
        release_lock()

del compile_cuda_ndarray
if cuda_available:
    # NOTE(review): `global` at module scope is a no-op; kept from the
    # original source.
    global cuda_initialization_error_message
    # The module should be compiled.
    from cuda_ndarray.cuda_ndarray import *

    # If necessary,
    # create a symlink called libcuda_ndarray.so
    # which nvcc_compiler.NVCC_compiler uses when linking
    # any module except "cuda_ndarray" itself.
    def ok():
        """
        Check if an existing library exists and can be read.
        """
        try:
            open(libcuda_ndarray_so).close()
            return True
        except IOError:
            return False

    if not ok():
        if sys.platform == "win32":
            # The Python `os` module does not support symlinks on win32.
            shutil.copyfile(cuda_ndarray_so, libcuda_ndarray_so)
        else:
            try:
                os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
            except OSError as e:
                # This may happen for instance when running multiple
                # concurrent jobs, if two of them try to create the
                # symlink simultaneously.
                # If that happens, we verify that the existing symlink is
                # indeed working.
                if getattr(e, 'errno', None) != errno.EEXIST or not ok():
                    raise

    try:
        # This only test if the cuda driver is available and if there
        # is at least one GPU that support cuda. This do not select a
        # device.
        gpu_init()
        cuda_available = True
        cuda_initialization_error_message = ""
        # actively closing our gpu session presents segfault-on-exit on some systems
        atexit.register(gpu_shutdown)
    except EnvironmentError as e:
        cuda_available = False
        cuda_initialization_error_message = " ".join(e.args)
else:
    cuda_initialization_error_message = 'cuda unavailable'
class GpuOp(theano.gof.Op):
    """
    Parent class for all GPU Ops.

    Its only job is to make sure the GPU is initialised and verified the
    first time a GPU Op is prepared for execution.  It is defined in
    __init__.py so that it exists even when `cuda_available` is False
    (this is necessary to avoid breaking the test suite).
    """

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Skip when a successful use() call already selected a device.
        if use.device_number is not None:
            return
        use("gpu",
            force=True,
            default_to_move_computation_to_gpu=False,
            move_shared_float32_to_gpu=False,
            enable_cuda=False)
# We must do those import to be able to create the full doc when
# nvcc is not available
from theano.sandbox.cuda.var import (CudaNdarrayVariable,
CudaNdarrayConstant,
CudaNdarraySharedVariable,
float32_shared_constructor)
from theano.sandbox.cuda.type import CudaNdarrayType
def dnn_available():
    """Check (and cache) whether cuDNN can be used.

    The result and the human-readable reason are cached on the function
    object itself (``dnn_available.avail`` / ``dnn_available.msg``).
    Honors the Theano flag ``dnn.enabled``.

    Returns
    -------
    bool
        True if cuDNN compiles and its header/runtime versions match.

    Raises
    ------
    RuntimeError
        If ``dnn.enabled`` requested cuDNN but it is unusable, if the
        unsupported ``no_check`` mode is requested, or if the detected
        version is mixed or too old.
    """
    if config.dnn.enabled == "False":
        dnn_available.avail = False
        dnn_available.msg = "Disabled by dnn.enabled flag"
    if dnn_available.avail is None and not cuda_available:
        dnn_available.msg = "CUDA not available"
        dnn_available.avail = False
    elif config.dnn.enabled == "no_check":
        # BUG FIX: was `raise RuntimeException(...)`; RuntimeException does
        # not exist in Python, so this line raised NameError instead.
        raise RuntimeError("The old gpu back-end do not support the flag dnn.enabled=no_check")
    elif dnn_available.avail is None:
        dev = active_device_number()
        if device_properties(dev)['major'] < 3:
            dnn_available.msg = "Device not supported"
            dnn_available.avail = False
        else:
            # Try to *compile* (not run) a minimal cuDNN program.
            preambule = textwrap.dedent(
                """
                #include <stdio.h>
                #include <cuda.h>
                #include <cudnn.h>
                #include <cudnn_helper.h>
                """)
            body = textwrap.dedent(
                """
                cudnnHandle_t _handle = NULL;
                cudnnStatus_t err;
                if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
                  fprintf(stderr, "could not create cuDNN handle: %s",
                          cudnnGetErrorString(err));
                  return 1;
                }
                """)
            # to support path that includes spaces, we need to wrap it with double quotes on Windows
            path_wrapper = "\"" if os.name == 'nt' else ""
            params = ["-l", "cudnn"]
            params.extend(['-I%s%s%s' % (path_wrapper, os.path.dirname(__file__), path_wrapper)])
            if config.dnn.include_path:
                params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
            if config.dnn.library_path:
                params.extend(['-L%s%s%s' % (path_wrapper, config.dnn.library_path, path_wrapper)])
            if config.nvcc.compiler_bindir:
                params.extend(['--compiler-bindir',
                               '%s%s%s' % (path_wrapper, config.nvcc.compiler_bindir, path_wrapper)])
            params.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = nvcc_compiler.NVCC_compiler.try_flags(
                flag_list=params, preambule=preambule, body=body,
                try_run=False, output=True)
            dnn_available.avail = comp
            if not dnn_available.avail:
                dnn_available.msg = (
                    "Can not compile with cuDNN. We got this error:\n" +
                    str(err))
            else:
                # If we can compile, check that we can import and run.
                v = dnn_version()
                if isinstance(v, tuple) and v[0] != v[1]:
                    dnn_available.avail = False
                    dnn_available.msg = ("Mixed dnn version. The header is"
                                         " from one version, but we link with"
                                         " a different version %s" % str(v))
                    raise RuntimeError(dnn_available.msg)
                if v == -1 or v[0] < 4007:
                    # 4007 is the final release of cudnn v4
                    dnn_available.avail = False
                    # BUG FIX: `v` may be the scalar -1 here, in which case
                    # the original `v[0]` raised TypeError.
                    dnn_available.msg = ("Version is too old. Update to v5, was %d." %
                                         (v if v == -1 else v[0]))
                    raise RuntimeError(dnn_available.msg)
                else:
                    dnn_available.avail = comp
    if config.dnn.enabled == "True":
        if not dnn_available.avail:
            raise RuntimeError(
                "You enabled cuDNN, but we aren't able to use it: %s" %
                dnn_available.msg)
    return dnn_available.avail


# Lazily-filled cache: availability flag and explanatory message.
dnn_available.avail = None
dnn_available.msg = None
class DnnVersion(GpuOp):
    """Op whose C code reports the cuDNN version Theano compiled and
    linked against (or -1 when no version information is available)."""

    def c_compiler(self):
        return nvcc_compiler.NVCC_compiler

    def c_headers(self):
        return ['cudnn.h']

    def c_header_dirs(self):
        return [config.dnn.include_path]

    def c_libraries(self):
        return ['cudnn']

    def c_lib_dirs(self):
        return [config.dnn.library_path]

    def c_compile_args(self):
        # Bake the cuDNN library directory into the runtime search path.
        return ['-Wl,-rpath,' + config.dnn.library_path]

    def c_support_code(self):
        return textwrap.dedent(
            """
            #if PY_MAJOR_VERSION >= 3
            #define PyInt_FromLong PyLong_FromLong
            #endif
            """)

    def make_node(self):
        return theano.gof.Apply(self, [], [theano.gof.Generic()()])

    def c_code(self, node, name, inputs, outputs, sub):
        # Substitute the output variable name into the C template.
        return textwrap.dedent(
            """
            #if defined(CUDNN_VERSION)
            %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
            #else
            %(o)s = PyInt_FromLong(-1);
            #endif
            """) % {'o': outputs[0]}

    def do_constant_folding(self, node):
        # Needed as we do not want to cache this information.
        return False

    def c_code_cache_version(self):
        # Not needed, but make it clear that we do not want to cache this.
        return None
def dnn_version():
    """Return the current cuDNN version we compile with.

    This returns a tuple with the header version and the library
    version we link with. For older cudnn version without version
    information, we return -1.
    """
    if not dnn_available():
        raise Exception(
            "We can't determine the cudnn version as it is not available",
            dnn_available.msg)
    if dnn_version.v is None:
        # Build and run a tiny, unoptimized Theano function evaluating
        # the DnnVersion op; cache its result on the function object.
        compute_version = theano.function([], DnnVersion()(),
                                          theano.Mode(optimizer=None),
                                          profile=False)
        dnn_version.v = compute_version()
    return dnn_version.v


# Cache for the computed version; filled on first successful call.
dnn_version.v = None
if cuda_available:
    # check if their is an old cuda_ndarray that was loading instead of the one
    # we compiled!
    import cuda_ndarray.cuda_ndarray
    if cuda_ndarray_so != cuda_ndarray.cuda_ndarray.__file__:
        _logger.warning("cuda_ndarray was loaded from %s, but Theano expected "
                        "to load it from %s. This is not expected as theano "
                        "should compile it automatically for you. Do you have "
                        "a directory called cuda_ndarray in your "
                        "LD_LIBRARY_PATH environment variable? If so, please "
                        "remove it as it is outdated.",
                        cuda_ndarray.cuda_ndarray.__file__,
                        cuda_ndarray_so)

# Constructor used for float32 shared variables on this back-end.
shared_constructor = float32_shared_constructor
from . import basic_ops
from .basic_ops import (
GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
gpu_flatten, GpuFlatten, GpuShape, GpuAlloc, GpuAllocEmpty, GpuSplit,
GpuJoin, fscalar, fvector, fmatrix, frow, fcol,
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
tensor3, tensor4)
from .basic_ops import (host_from_gpu, gpu_from_host, as_cuda_array,
as_cuda_ndarray_variable)
import cuda_ndarray
from . import opt, dnn
from .rng_curand import CURAND_RandomStreams
def transfer(x, target):
    """Transfer handler for the old cuda back-end.

    Converts `x` to a CudaNdarray variable when `target` is 'gpu'.
    For any other target it falls through and returns None
    (presumably telling register_transfer this handler does not
    apply — confirm against register_transfer's contract).
    """
    if target != 'gpu':
        return None
    return as_cuda_ndarray_variable(x)


register_transfer(transfer)
def use(device,
        force=False,
        default_to_move_computation_to_gpu=True,
        move_shared_float32_to_gpu=True,
        enable_cuda=True,
        test_driver=True):
    """
    Error and warning about CUDA should be displayed only when this
    function is called. We need to be able to load this module only
    to check if it is available!

    Parameters
    ----------
    device : string
        "cpu", "gpu", "gpuN" (N is the device number to use).
    force
        Will always raise an exception if we can't use the gpu.
    default_to_move_computation_to_gpu
        If gpu init succeeded, enable by default optimizations to move
        computations to the gpu.
    move_shared_float32_to_gpu
        If gpu init succeeded, put new shared variables in float32 on the gpu.
    enable_cuda
        If the gpu is correctly enabled, set the variable cuda_enabled to True.
    test_driver
        If True, run a small driver sanity test right after initialization.
    """
    global cuda_enabled, cuda_initialization_error_message
    _logger.warn("The cuda backend is deprecated and will be removed in "
                 "the next release (v0.10). Please switch to the gpuarray backend. "
                 "You can get more information about how to switch at this "
                 "URL:\n https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29\n")
    # The user insisted on a GPU but CUDA is unusable: raise with the most
    # precise cause we know (missing nvcc vs. failed initialization).
    if force and not cuda_available and device.startswith('gpu'):
        if not nvcc_compiler.is_nvcc_available():
            raise EnvironmentError("You forced the use of gpu device '%s', but"
                                   " nvcc was not found. Set it in your PATH "
                                   "environment variable or set the Theano "
                                   "flags 'cuda.root' to its directory"
                                   "" % device)
        else:
            raise EnvironmentError("You forced the use of gpu device %s, "
                                   "but CUDA initialization failed "
                                   "with error:\n%s" % (
                                       device,
                                       cuda_initialization_error_message))
    elif not nvcc_compiler.is_nvcc_available():
        _logger.error("nvcc compiler not found on $PATH. "
                      "Check your nvcc installation and try again.")
        return
    elif not cuda_available:
        error_addendum = ""
        try:
            if cuda_initialization_error_message:
                error_addendum = (" (error: %s)" %
                                  cuda_initialization_error_message)
        except NameError:
            # cuda_initialization_error_message is not available b/c compilation failed
            pass
        _logger.warning("CUDA is installed, but device %s is not available %s",
                        device, error_addendum)
        return

    # Normalize the device spec: 'gpuN' -> int N, 'cpu' -> -1, 'gpu' kept
    # as the string to mean "let the driver pick".
    if device == 'gpu':
        pass
    elif device.startswith('gpu'):
        device = int(device[3:])
    elif device == 'cpu':
        device = -1
    else:
        raise ValueError("Invalid device identifier", device)

    if use.device_number is None:
        # No successful call to use() has been made yet
        if device != 'gpu' and device < 0:
            return
        msg = ("Theano flag device=gpu* (old gpu back-end) only support"
               " floatX=float32. You have floatX=%s. Use the new gpu"
               " back-end with device=cuda* for that value of floatX." %
               config.floatX)
        if config.floatX == 'float16':
            raise RuntimeError(msg)
        elif config.floatX == 'float64':
            warnings.warn(msg)
        # Has PyCUDA already initialized the GPU context
        pycuda_init_dev = False
        if config.pycuda.init:
            import theano.misc.pycuda_init
            pycuda_init_dev = theano.misc.pycuda_init.pycuda_available
        try:
            if pycuda_init_dev:
                use.device_number = active_device_number()
                # This is needed to initialize the cublas handle.
                gpu_init(use.device_number, config.lib.cnmem)
            elif(device != 'gpu'):
                assert isinstance(device, int)
                gpu_init(device, config.lib.cnmem)
                use.device_number = device
                active_device = active_device_number()
                assert active_device == device, (active_device, device)
            else:
                # This mean the driver should select the GPU. As we
                # need to get the device number now, we force the
                # selection of the GPU by the driver now and then we
                # query the active GPU. If we check the active GPU before
                # the device is initialized we will always receive 0
                # event if another device is selected later.
                if not hasattr(cuda_ndarray.cuda_ndarray, 'select_a_gpu'):
                    raise Exception(
                        "Delete your Theano cache. The automatic"
                        " recompilation did not work.")
                cuda_ndarray.cuda_ndarray.select_a_gpu()
                use.device_number = active_device_number()
                # This is needed to initialize the cublas handle.
                gpu_init(use.device_number, config.lib.cnmem)
            if test_driver:
                import theano.sandbox.cuda.tests.test_driver
                theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
            if device_properties(use.device_number)["warpSize"] != 32:
                raise ValueError("Your GPU has a warpSize != 32. Currently"
                                 " we have code that depends on this. Email"
                                 " the Theano mailing list to tell us about"
                                 " this new GPU as we don't know any with"
                                 " this property")

            if config.print_active_device:
                # Build the human-readable CNMeM / cuDNN status strings.
                if config.lib.cnmem:
                    if config.lib.cnmem > 1:
                        cnmem_enabled = "enabled with initial size: %d MB" % config.lib.cnmem
                    else:
                        cnmem = min(config.lib.cnmem, 0.95) * 100
                        cnmem_enabled = "enabled with initial size: %.1f%% of memory" % cnmem
                else:
                    cnmem_enabled = "disabled"
                cudnn_version = "not available"
                warn = None
                try:
                    if dnn_available():
                        (hdr_v, runtime_v) = dnn_version()
                        cudnn_version = runtime_v
                        # 5200 should not print warning with cudnn 5 final.
                        if cudnn_version >= 5200:
                            warn = ("Your cuDNN version is more recent than the one"
                                    " Theano officially supports."
                                    " If you see any problems, try updating Theano or"
                                    " downgrading cuDNN to version 5.1.")
                except Exception:
                    cudnn_version = dnn_available.msg
                print("Using gpu device %d: %s (CNMeM is %s, cuDNN %s)" % (
                    active_device_number(),
                    active_device_name(),
                    cnmem_enabled,
                    cudnn_version,),
                    file=sys.stderr)
                if warn:
                    warnings.warn(warn)

            if device_properties(use.device_number)['regsPerBlock'] < 16384:
                # We will try to use too much register per bloc at many places
                # when there is only 8k register per multi-processor.
                _logger.warning(
                    "You are probably using an old GPU, that Theano"
                    " does not support."
                    " This means GPU code will most likely be slow AND may"
                    " crash when we try to use features"
                    " that your GPU does not support.")
        except (EnvironmentError, ValueError, RuntimeError) as e:
            _logger.error(("ERROR: Not using GPU."
                           " Initialisation of device %s failed:\n%s"),
                          str(device), e)
            cuda_enabled = False
            if force:
                e.args += (("You asked to force this device and it failed."
                            " No fallback to the cpu or other gpu device."),)
                raise

    elif use.device_number != device and device != 'gpu':
        _logger.warning(("Ignoring call to use(%s), GPU number %i "
                         "is already in use."),
                        str(device), use.device_number)

    if move_shared_float32_to_gpu:
        handle_shared_float32(True)

    if enable_cuda:
        cuda_enabled = True

    if default_to_move_computation_to_gpu:
        # Do not add inplace tag here. We do not want to
        # enable/disable gpu opt based on the inplace tag.
        optdb.add_tags('gpu_opt',
                       'fast_compile',
                       'fast_run')
        optdb.add_tags('gpu_after_fusion',
                       'fast_run')
        optdb.add_tags('gpu_scanOp_make_inplace',
                       'fast_run')
    if force:
        try:
            # in case the device if just gpu,
            # we check that the driver init it correctly.
            cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((5, 5))
        except (Exception, NameError) as e:
            # NameError when no gpu present as cuda_ndarray is not loaded.
            e.args += ("ERROR: GPU forced but failed. ",)
            raise


# Device picked by the last successful use() call; None until then.
use.device_number = None
def unuse():
    """
    Undo what was done by a call to
    use('gpu[0-9]', default_to_move_computation_to_gpu=True,
        move_shared_float32_to_gpu=True,
        enable_cuda=True).

    This is used in Pylearn2 tests to enable/disable the GPU when needed.
    After this call, the rest of Theano think the GPU shouldn't be used by
    default.
    """
    global cuda_enabled
    cuda_enabled = False
    handle_shared_float32(False)
    optdb.remove_tags('gpu_opt',
                      'fast_compile',
                      'fast_run')
    optdb.remove_tags('gpu_after_fusion',
                      'fast_run')
    # BUG FIX: use() also tags 'gpu_scanOp_make_inplace'; remove it as well
    # so that unuse() fully reverses use().
    optdb.remove_tags('gpu_scanOp_make_inplace',
                      'fast_run')
def handle_shared_float32(tf):
    """
    Set the default shared type for float32 tensor to CudaNdarrayType.
    This function is intended to be called from use(gpu_index), not directly.
    """
    if not tf:
        # Unregister the GPU constructor and verify it is really gone.
        theano.compile.shared_constructor(float32_shared_constructor, True)
        assert (float32_shared_constructor not in
                theano.compile.shared.constructors)
    else:
        theano.compile.shared_constructor(float32_shared_constructor)
# We can't test the driver during import here as this cause circular
# import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'):
    use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device.startswith('gpu'):
    # init_gpu_device only initializes a device; it must not redirect
    # computation or shared variables there (hence the False flags below).
    assert config.device == "cpu", (
        "We can use the Theano flag init_gpu_device"
        " only when the Theano flag device=='cpu'")
    _logger.warning(("GPU device %s will be initialized, and used if a GPU is "
                     "needed. However, no computation, nor shared variables, "
                     "will be implicitly moved to that device. If you want "
                     "that behavior, use the 'device' flag instead."),
                    config.init_gpu_device)
    use(device=config.init_gpu_device,
        force=config.force_device,
        default_to_move_computation_to_gpu=False,
        move_shared_float32_to_gpu=False,
        enable_cuda=False, test_driver=False)
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
from __future__ import absolute_import, print_function, division
import logging
import numpy
from theano import Apply, tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp
# Module logger for the blocksparse GPU ops.
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')

# basic_ops can only be imported when the CUDA back-end initialized.
if cuda_available:
    from theano.sandbox.cuda import basic_ops
class GpuSparseBlockGemv(GpuOp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """
    # `inplace` is the only prop: it takes part in Op equality/hashing.
    __props__ = ('inplace',)
def __init__(self, inplace=False):
    """Create the op; with inplace=True output 0 overwrites input 0."""
    self.inplace = inplace
    if self.inplace:
        # Declare to Theano that output 0 destroys input 0.
        self.destroy_map = {0: [0]}
def make_node(self, o, W, h, inputIdx, outputIdx):
    """Build the Apply node; the single output has the type of `o`."""
    # Move the float inputs to the GPU representation.
    o, W, h = [basic_ops.as_cuda_ndarray_variable(v) for v in (o, W, h)]
    # Rank checks (same order as the original asserts).
    for var, rank in ((o, 3), (W, 4), (h, 3), (inputIdx, 2), (outputIdx, 2)):
        assert var.ndim == rank
    # Both index arrays must hold discrete (integer-like) dtypes.
    assert inputIdx.type.dtype in discrete_dtypes
    assert outputIdx.type.dtype in discrete_dtypes
    return Apply(self, [o, W, h, inputIdx, outputIdx],
                 [o.type()])
def infer_shape(self, node, input_shapes):
    """The output has exactly the shape of the first input (`o`)."""
    o_shape = input_shapes[0]
    return [o_shape]
def c_support_code(self):
    """Return C/CUDA helper code shared by this op's generated code:
    a kernel filling the per-(i, j, batch) pointer lists, two batched
    sgemv kernels (non-transposed and transposed; alpha == beta == 1
    only, per the checks in SgemvBatched), the SgemvBatched host
    dispatcher, and a host helper copying an index array to the GPU
    with cudaMemcpyAsync.
    """
    return """
    __global__ void
    SparseBlockGemv_fill_lists(
      int maxi, int maxj,
      const float **inp_list,
      float **out_list,
      const float **W_list,
      const float *W, int W_str_0, int W_str_1,
      const float *h, int h_str_0, int h_str_1,
      float *out, int o_str_0, int o_str_1,
      const npy_intp *iIdx, int iI_str_0,
      const npy_intp *oIdx, int oI_str_0
    ) {
      int i = threadIdx.x + blockDim.x * blockIdx.x;
      int j = threadIdx.y + blockDim.y * blockIdx.y;
      int b = blockIdx.z;
      if (i >= maxi || j >= maxj) return;
      int p = i + j * maxi + b * maxi * maxj;
      inp_list[p] = &h[b * h_str_0 + i * h_str_1];
      out_list[p] = &out[b * o_str_0 + j * o_str_1];
      W_list[p] = &W[iIdx[b*iI_str_0+i] * W_str_0 +
                     oIdx[b*oI_str_0+j] * W_str_1];
    }

    __global__ void _sgemvBH_N_a1_b1_small(const float *A[], int lda,
                                           const float *x[], int incx,
                                           float *y[], int incy,
                                           int b, int m, int n) {
      for (int p = blockIdx.y * blockDim.y + threadIdx.y; p < b;
           p += gridDim.y * blockDim.y) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m;
             i += gridDim.x * blockDim.x) {
          float yi = 0.0f;
          const float *Ap = A[p] + i;
          const float *xp = x[p];
          #pragma unroll 32
          for (int j = 0; j < n; j++) {
            yi += Ap[0] * xp[0];
            Ap += lda;
            xp += incx;
          }
          atomicAdd(&y[p][i*incy], yi);
        }
      }
    }

    __global__ void _sgemvBH_T_a1_b1_small(const float *A[], int lda,
                                           const float *x[], int incx,
                                           float *y[], int incy,
                                           int b, int m, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      int p = blockIdx.y * blockDim.y + threadIdx.y;
      if (i >= m || p >= b) return;
      float yi = 0.0f;
      const float *Ap = A[p] + i * lda;
      const float *xp = x[p];
      # pragma unroll 32
      for (int j = 0; j < n; j++) {
        yi += Ap[j] * xp[0];
        xp += incx;
      }
      atomicAdd(&y[p][i*incy], yi);
    }

    static cublasStatus_t SgemvBatched(cublasHandle_t handle,
                                       cublasOperation_t trans,
                                       int m, int n,
                                       const float *alpha,
                                       const float *A[], int lda,
                                       const float *x[], int incx,
                                       const float *beta,
                                       float *y[], int incy, int batchCount) {
      dim3 block(m, batchCount, 1);
      dim3 grid(1, 1, 1);
      cublasPointerMode_t mode;
      cudaError_t err;

      if (m < 512) {
        block.x = 32;
        if (batchCount > 16)
          block.y = 16;
        else
          block.y = batchCount;
      } else {
        block.x = 512;
        block.y = 1;
      }
      grid.x = (m + block.x - 1) / block.x;
      grid.y = (batchCount + block.y - 1) / block.y;
      if (grid.x * grid.y > 65535) {
        grid.y = (65535 / grid.x);
      }
      cublasGetPointerMode(handle, &mode);
      if (mode != CUBLAS_POINTER_MODE_HOST)
        return CUBLAS_STATUS_INVALID_VALUE;
      if (*alpha != 1.0 || *beta != 1.0)
        return CUBLAS_STATUS_INVALID_VALUE;
      if (trans == CUBLAS_OP_N)
        _sgemvBH_N_a1_b1_small<<<grid, block>>>(A, lda, x, incx,
                                                y, incy,
                                                batchCount, m, n);
      else if (trans == CUBLAS_OP_T)
        _sgemvBH_T_a1_b1_small<<<grid, block>>>(A, lda, x, incx,
                                                y, incy,
                                                batchCount, m, n);
      else
        return CUBLAS_STATUS_INVALID_VALUE;
      err = cudaGetLastError();
      if (err != cudaSuccess)
        return CUBLAS_STATUS_EXECUTION_FAILED;
      return CUBLAS_STATUS_SUCCESS;
    }

    static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
      cudaError_t err;
      PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
      if (aa == NULL) { return -1; }
      err = cudaMemcpyAsync(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
                            cudaMemcpyHostToDevice);
      Py_DECREF(aa);
      if (err != cudaSuccess) {
        PyErr_Format(PyExc_RuntimeError, "Cannot copy index data to GPU (%s)",
                     cudaGetErrorString(err));
        return -1;
      }
      return 0;
    }
    """
def c_support_code_apply(self, node, nodename):
    """Return per-apply C code: static buffers (allocated with
    device_malloc) for the pointer lists and the two index arrays,
    plus a prep helper that grows them when the required size exceeds
    the cached length. Names are made unique via the %(n)s template.
    """
    return """
    /* Statics are initialized with 0 */
    static const float **%(n)s_inp_list;
    static float **%(n)s_out_list;
    static const float **%(n)s_W_list;
    static size_t %(n)s_list_len;
    static npy_intp *%(n)s_iIdx;
    static size_t %(n)s_iIdx_len;
    static npy_intp *%(n)s_oIdx;
    static size_t %(n)s_oIdx_len;

    static int %(n)s_prep(int b, int i, int j, int outsize) {
      int s = b*i*j;
      if (%(n)s_list_len < s) {
        device_free(%(n)s_inp_list);
        device_free(%(n)s_out_list);
        device_free(%(n)s_W_list);
        %(n)s_inp_list = (const float **) device_malloc(s*sizeof(float *));
        if (%(n)s_inp_list == NULL) return -1;
        %(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
        if (%(n)s_out_list == NULL) return -1;
        %(n)s_W_list = (const float **) device_malloc(s*sizeof(float *));
        if (%(n)s_W_list == NULL) return -1;
        %(n)s_list_len = s;
      }
      if (%(n)s_iIdx_len < b*i) {
        device_free(%(n)s_iIdx);
        %(n)s_iIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
        if (%(n)s_iIdx == NULL) return -1;
        %(n)s_iIdx_len = b*i;
      }
      if (%(n)s_oIdx_len < b*j) {
        device_free(%(n)s_oIdx);
        %(n)s_oIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
        if (%(n)s_oIdx == NULL) return -1;
        %(n)s_oIdx_len = b*j;
      }
      return 0;
    }
    """ % dict(n=nodename)
def c_code(self, node, nodename, inputs, outputs, sub):
    """Generate the C code for one apply of GpuSparseBlockGemv.

    Pipeline: (1) alias (inplace) or copy `o` into the output, (2) grow
    the per-apply scratch buffers, (3) upload the index arrays to the
    device, (4) launch a small kernel that builds the per-(batch, iBlk,
    oBlk) pointer lists, and (5) run one batched sgemv over all of them,
    accumulating (beta = 1) into the output.
    """
    o, W, h, inputIdx, outputIdx = inputs
    out = outputs[0]
    if self.inplace:
        # Inplace: output 0 is input 0 itself (declared via destroy_map).
        res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
    else:
        # Non-inplace: allocate a fresh 3d output and copy `o` into it.
        res = """
if (CudaNdarray_prep_output(&%(out)s, 3, CudaNdarray_HOST_DIMS(%(o)s)))
{
// Error already set
%(fail)s
}
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
// Error already set
%(fail)s
}
""" % dict(out=out, o=o, fail=sub['fail'])
    return res + """
if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(o)s)[0],
CudaNdarray_HOST_DIMS(%(h)s)[1],
CudaNdarray_HOST_DIMS(%(o)s)[1],
CudaNdarray_HOST_DIMS(%(o)s)[2]) == -1) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory.");
%(fail)s
}
if (SparseBlockGemv_copy(%(inputIdx)s, %(name)s_iIdx) == -1)
{ %(fail)s }
if (SparseBlockGemv_copy(%(outputIdx)s, %(name)s_oIdx) == -1)
{ %(fail)s }
{ /* Prepare lists for the batch */
dim3 block;
dim3 grid;
block.x = CudaNdarray_HOST_DIMS(%(h)s)[1];
block.y = CudaNdarray_HOST_DIMS(%(o)s)[1];
grid.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size
if (block.x > 32) {
grid.x = (block.x + 31) / 32;
block.x = 32;
}
if (block.x * block.y > 512) {
grid.y = (block.y + 15) / 16;
block.y = 16;
}
SparseBlockGemv_fill_lists<<<grid, block>>>(
CudaNdarray_HOST_DIMS(%(h)s)[1], CudaNdarray_HOST_DIMS(%(o)s)[1],
%(name)s_inp_list,
%(name)s_out_list,
%(name)s_W_list,
CudaNdarray_DEV_DATA(%(W)s),
CudaNdarray_HOST_STRIDES(%(W)s)[0], CudaNdarray_HOST_STRIDES(%(W)s)[1],
CudaNdarray_DEV_DATA(%(h)s),
CudaNdarray_HOST_STRIDES(%(h)s)[0], CudaNdarray_HOST_STRIDES(%(h)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_iIdx, PyArray_DIM(%(inputIdx)s, 1),
%(name)s_oIdx, PyArray_DIM(%(outputIdx)s, 1));
}
{ /* Run SgemvBatched */
float alpha = 1.0f;
float beta = 1.0f;
cublasStatus_t err;
cublasOperation_t transA = CUBLAS_OP_N;
int lda = CudaNdarray_HOST_STRIDES(%(W)s)[2];
if (lda == 1) {
transA = CUBLAS_OP_T;
lda = CudaNdarray_HOST_STRIDES(%(W)s)[3];
}
if (lda == 0) lda = 1;
err = SgemvBatched(handle, transA,
CudaNdarray_HOST_DIMS(%(o)s)[2],
CudaNdarray_HOST_DIMS(%(h)s)[2], &alpha,
%(name)s_W_list, lda, %(name)s_inp_list,
CudaNdarray_HOST_STRIDES(%(h)s)[2],
&beta, %(name)s_out_list,
CudaNdarray_HOST_STRIDES(%(o)s)[2],
CudaNdarray_HOST_DIMS(%(o)s)[1] *
CudaNdarray_HOST_DIMS(%(h)s)[1] *
CudaNdarray_HOST_DIMS(%(o)s)[0]);
if (err != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "SgemvBatched failed(%%s)",
cublasGetErrorString(err));
%(fail)s
}
}
// And we're done!
""" % dict(out=out, h=h, o=o, inputIdx=inputIdx, outputIdx=outputIdx,
           W=W, fail=sub['fail'], name=nodename)
def c_code_cache_version(self):
    """Version tag for Theano's compiled-C-code cache."""
    version = (12,)
    return version
def grad(self, inputs, grads):
    """Gradients w.r.t. (o, W, h); the index inputs get undefined grads."""
    o, W, h, inputIdx, outputIdx = inputs
    grad_out = grads[0]
    # dL/dW: accumulate block outer products of h and the output gradient.
    w_grad = gpu_sparse_block_outer(W.zeros_like(), h, grad_out,
                                    inputIdx, outputIdx)
    # dL/dh: block gemv with W transposed and the index roles swapped.
    h_grad = gpu_sparse_block_gemv(h.zeros_like(),
                                   W.dimshuffle((1, 0, 3, 2)),
                                   grad_out, outputIdx, inputIdx)
    undef_inp = grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense")
    undef_out = grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")
    return [grad_out, w_grad, h_grad, undef_inp, undef_out]
# Preconstructed op instances: non-inplace and inplace variants.
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(GpuOp):
    """
    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 reuses (destroys) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """Build the Apply node; `alpha` defaults to a float32 constant 1.0."""
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = basic_ops.as_cuda_ndarray_variable(o)
        x = basic_ops.as_cuda_ndarray_variable(x)
        y = basic_ops.as_cuda_ndarray_variable(y)
        if alpha is None:
            alpha = one
        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [o.type()])

    def infer_shape(self, node, input_shapes):
        # Output has the shape of the accumulator input `o`.
        return [input_shapes[0]]

    def c_support_code(self):
        """CUDA kernels and host helpers shared by every apply of this op."""
        return """
__global__ void
SparseBlockOuter_fill_lists(
int maxi, int maxj,
const float **x_list,
const float **y_list,
float **out_list,
const float *x, int x_str_0, int x_str_1,
const float *y, int y_str_0, int y_str_1,
float *out, int o_str_0, int o_str_1,
const npy_intp *xIdx, int xI_str_0,
const npy_intp *yIdx, int yI_str_0
) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int b = blockIdx.z;
if (i >= maxi || j >= maxj) return;
int p = i + j * maxi + b * maxi * maxj;
x_list[p] = &x[b * x_str_0 + i * x_str_1];
y_list[p] = &y[b * y_str_0 + j * y_str_1];
out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
yIdx[b * yI_str_0 + j] * o_str_1];
}
/* This is tuned for smaller sizes (< 512) since it's what we get normally */
__global__ void _sgerBH_gen_small(const float *x[], int incx,
const float *y[], int incy,
float alpha,
float *A[], int lda,
int b, int m, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= m || j >= n) return;
for (int p = blockIdx.z; p < b; p += gridDim.z) {
atomicAdd(&A[p][j * lda + i],
alpha * x[p][i * incx] * y[p][j * incy]);
}
}
static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n,
const float *alpha,
const float *x[], int incx,
const float *y[], int incy,
float *A[], int lda,
int batchCount) {
dim3 block(m, n, 1);
dim3 grid(1, 1, batchCount);
cublasPointerMode_t mode;
cudaError_t err;
if (incx == 1) {
if (block.x > 32) {
grid.x = (block.x + 31)/32;
block.x = 32;
}
if (block.x * block.y > 512) {
grid.y = (block.y + 15) / 16;
block.y = 16;
}
} else {
if (block.y > 32) {
grid.y = (block.y + 31)/32;
block.y = 32;
}
if (block.x * block.y > 512) {
grid.x = (block.x + 15) / 16;
block.x = 16;
}
}
if (grid.x * grid.y * grid.z > 65535) {
if (grid.x * grid.y > 65535)
return CUBLAS_STATUS_INVALID_VALUE;
grid.z = (65535 / (grid.x * grid.y));
}
cublasGetPointerMode(handle, &mode);
if (mode == CUBLAS_POINTER_MODE_HOST) {
_sgerBH_gen_small<<<grid, block>>>(x, incx, y, incy, *alpha, A, lda,
batchCount, m, n);
} else {
return CUBLAS_STATUS_INVALID_VALUE;
}
err = cudaGetLastError();
if (err != cudaSuccess)
return CUBLAS_STATUS_EXECUTION_FAILED;
return CUBLAS_STATUS_SUCCESS;
}
static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
cudaError_t err;
PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
if (aa == NULL) { return -1; }
err = cudaMemcpyAsync(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
cudaMemcpyHostToDevice);
Py_DECREF(aa);
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, "Cannot copy index data to GPU(%s)",
cudaGetErrorString(err));
return -1;
}
return 0;
}
"""

    def c_support_code_apply(self, node, name):
        """Per-apply static scratch buffers (grown on demand, never freed)."""
        return """
/* statics are initialized with 0 */
static float **%(n)s_out_list;
static const float **%(n)s_x_list;
static const float **%(n)s_y_list;
static size_t %(n)s_list_len;
static npy_intp *%(n)s_xIdx;
static size_t %(n)s_xIdx_len;
static npy_intp *%(n)s_yIdx;
static size_t %(n)s_yIdx_len;
static int %(n)s_prep(int b, int i, int j) {
int s = b*i*j;
if (%(n)s_list_len < s) {
device_free(%(n)s_x_list);
device_free(%(n)s_y_list);
device_free(%(n)s_out_list);
%(n)s_x_list = (const float **) device_malloc(s*sizeof(float *));
if (%(n)s_x_list == NULL) return -1;
%(n)s_y_list = (const float **) device_malloc(s*sizeof(float *));
if (%(n)s_y_list == NULL) return -1;
%(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
if (%(n)s_out_list == NULL) return -1;
%(n)s_list_len = s;
}
if (%(n)s_xIdx_len < b*i) {
device_free(%(n)s_xIdx);
%(n)s_xIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
if (%(n)s_xIdx == NULL) return -1;
%(n)s_xIdx_len = b*i;
}
if (%(n)s_yIdx_len < b*j) {
device_free(%(n)s_yIdx);
%(n)s_yIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
if (%(n)s_yIdx == NULL) return -1;
%(n)s_yIdx_len = b*j;
}
return 0;
}
""" % dict(n=name)

    def c_code(self, node, name, inputs, outputs, sub):
        """Copy/alias `o` into the output, build pointer lists, then run a
        batched rank-1 (sger-style) update accumulating into the output."""
        o, x, y, xIdx, yIdx, alpha = inputs
        out = outputs[0]
        if self.inplace:
            # In-place: the output aliases `o` directly.
            res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
        else:
            # Non-inplace: allocate a fresh 4d output and copy `o` into it.
            res = """
if (CudaNdarray_prep_output(&%(out)s, 4, CudaNdarray_HOST_DIMS(%(o)s)))
{
// Python error already set
%(fail)s
}
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
//Error message already set
%(fail)s
}
""" % dict(out=out, o=o, fail=sub['fail'])
        return res + """
if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(y)s)[1]) == -1) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory.");
%(fail)s
}
if (SparseBlockOuter_copy(%(xIdx)s, %(name)s_xIdx) == -1)
{ %(fail)s }
if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
{ %(fail)s }
{
dim3 block;
dim3 grid;
block.x = CudaNdarray_HOST_DIMS(%(x)s)[1];
block.y = CudaNdarray_HOST_DIMS(%(y)s)[1];
grid.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
if (block.x > 32) {
grid.x = (block.x + 31) / 32;
block.x = 32;
}
if (block.x * block.y > 512) {
grid.y = (block.y + 15) / 16;
block.y = 16;
}
SparseBlockOuter_fill_lists<<<grid, block>>>(
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list,
%(name)s_y_list,
%(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
%(name)s_yIdx, PyArray_DIM(%(yIdx)s, 1));
}
{
cublasStatus_t err;
int str_y = CudaNdarray_HOST_STRIDES(%(y)s)[2];
if (str_y == 0) str_y = 1;
int str_x = CudaNdarray_HOST_STRIDES(%(x)s)[2];
if (str_x == 0) str_x = 1;
int str_out = CudaNdarray_HOST_STRIDES(%(out)s)[2];
if (str_out == 0) str_out = 1;
err = SgerBatched(handle,
CudaNdarray_HOST_DIMS(%(y)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2],
(float *)PyArray_GETPTR1(%(alpha)s, 0), %(name)s_y_list, str_y,
%(name)s_x_list, str_x,
%(name)s_out_list, str_out,
CudaNdarray_HOST_DIMS(%(x)s)[0] *
CudaNdarray_HOST_DIMS(%(x)s)[1] *
CudaNdarray_HOST_DIMS(%(y)s)[1]);
if (err != CUBLAS_STATUS_SUCCESS) {
if (err == CUBLAS_STATUS_INVALID_VALUE) {
/* The current code would be much too slow for sizes any larger
than this. */
PyErr_SetString(PyExc_ValueError,
"SgerBatched failed, probably because you have your "
"block size too big. The current limit is 65535 for "
"iSize * oSize.");
} else {
PyErr_Format(PyExc_RuntimeError, "SgerBatched failed(%%s)",
cublasGetErrorString(err));
}
%(fail)s
}
}""" % dict(x=x, y=y, out=out, xIdx=xIdx, yIdx=yIdx, name=name,
            alpha=alpha, fail=sub['fail'])

    def c_code_cache_version(self):
        # Bump when the generated C code changes.
        return (11,)
# Preconstructed op instances: non-inplace and inplace (destroys input 0).
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cnmem.h"
#include <cstddef>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
// Allocation granularity in bytes; requested sizes are rounded up to a multiple of this.
#define CNMEM_GRANULARITY 512
///////////////////////////////////////////////////////////////////////////////////////////////////
// Map a cnmem status code to its symbolic name (C linkage for the public API).
extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
    if( status == CNMEM_STATUS_SUCCESS )
        return "CNMEM_STATUS_SUCCESS";
    if( status == CNMEM_STATUS_CUDA_ERROR )
        return "CNMEM_STATUS_CUDA_ERROR";
    if( status == CNMEM_STATUS_INVALID_ARGUMENT )
        return "CNMEM_STATUS_INVALID_ARGUMENT";
    if( status == CNMEM_STATUS_NOT_INITIALIZED )
        return "CNMEM_STATUS_NOT_INITIALIZED";
    if( status == CNMEM_STATUS_OUT_OF_MEMORY )
        return "CNMEM_STATUS_OUT_OF_MEMORY";
    // Any unrecognized (e.g. future) code falls through here.
    return "CNMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// --- Debug/error reporting macros (compiled out by default via `#if 0`). ---
#if 0
#ifdef WIN32
#define CNMEM_DEBUG_ERROR(...) do { \
    fprintf(stderr, "Error at line: %d\n", __LINE__); \
    fprintf(stderr, __VA_ARGS__); \
} while(0)
#else
#include <execinfo.h>
// Print the current call stack to stderr (POSIX only; uses backtrace(3)).
static inline void printBacktrace() {
    void *stackBuffer[64];
    int numAddresses = backtrace((void**) &stackBuffer, 64);
    char **addresses = backtrace_symbols(stackBuffer, numAddresses);
    for( int i = 0 ; i < numAddresses ; ++i ) {
        fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
    }
    free(addresses);
}
#define CNMEM_DEBUG_ERROR(...) do { \
    fprintf(stderr, "Error at line: %d\n", __LINE__); \
    fprintf(stderr, __VA_ARGS__); \
    fprintf(stderr, "Backtrace:\n"); \
    printBacktrace(); \
} while(0)
#endif
#else
#define CNMEM_DEBUG_ERROR(...)
#endif
// --- Verbose tracing (disabled by default). ---
#if 0
#define CNMEM_DEBUG_INFO printf
#else
#define CNMEM_DEBUG_INFO(...)
#endif
#if 0 // Enable/disable assertions
#include <cassert>
#define CNMEM_ASSERT assert
#else
#define CNMEM_ASSERT(...)
#endif
// --- Early-return error-checking helpers. Each evaluates its argument once
// --- and returns the mapped cnmem status from the enclosing function on failure.
#define CNMEM_CHECK_TRUE(cond, error) do { \
    if( !(cond) ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \
        return error; \
    } \
} while(0)
#define CNMEM_CHECK(call) do { \
    cnmemStatus_t status = (call); \
    if( status != CNMEM_STATUS_SUCCESS ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \
                          cnmemGetErrorString(status)); \
        return status; \
    } \
} while(0)
// Variant that releases `mutex` before returning on failure.
#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \
    cnmemStatus_t status = (call); \
    if( status != CNMEM_STATUS_SUCCESS ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \
                          cnmemGetErrorString(status)); \
        (mutex).unlock(); \
        return status; \
    } \
} while(0)
// CUDA-call variants: cudaErrorMemoryAllocation maps to OUT_OF_MEMORY,
// every other CUDA failure maps to CUDA_ERROR.
#define CNMEM_CHECK_CUDA(call) do { \
    cudaError_t cudaError = (call); \
    if( cudaError == cudaErrorMemoryAllocation ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        return CNMEM_STATUS_OUT_OF_MEMORY; \
    } \
    else if( cudaError != cudaSuccess ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        return CNMEM_STATUS_CUDA_ERROR; \
    } \
} while(0)
#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \
    cudaError_t cudaError = (call); \
    if( cudaError == cudaErrorMemoryAllocation ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        (mutex).unlock(); \
        return CNMEM_STATUS_OUT_OF_MEMORY; \
    } \
    else if( cudaError != cudaSuccess ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        (mutex).unlock(); \
        return CNMEM_STATUS_CUDA_ERROR; \
    } \
} while(0)
// --- OS-level call checking (Win32 uses GetLastError, POSIX the return code). ---
#ifdef WIN32
#define CNMEM_CHECK_WIN32(call, error_code) do { \
    SetLastError(0); /* Clean the flag. */ \
    call; \
    DWORD status = GetLastError(); \
    if( status ) \
        return error_code; \
} while(0)
#else
#define CNMEM_CHECK_PTHREAD(call, error_code) do { \
    int status = call; \
    if( status ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \
        return error_code; \
    } \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cnmem {
// Round `m` up to the nearest multiple of `n` (n must be > 0).
static inline std::size_t ceilInt(std::size_t m, std::size_t n) {
    CNMEM_ASSERT(n > 0);
    const std::size_t quotient = (m + n - 1) / n;
    return quotient * n;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Thin portable mutex wrapper: CRITICAL_SECTION on Win32, pthread mutex elsewhere.
class Mutex {
#ifdef WIN32
    // `mutable` so the const lock()/unlock() members can enter the section.
    mutable CRITICAL_SECTION mCriticalSection;
#else
    pthread_mutex_t mMutex;
#endif
public:
    /// Initialize the mutex.
    cnmemStatus_t initialize();
    /// Finalize the mutex.
    cnmemStatus_t finalize();
    /// Lock the mutex.
    cnmemStatus_t lock() const;
    /// Unlock the mutex.
    cnmemStatus_t unlock() const;
};

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::initialize() {
#ifdef WIN32
    CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
#if 0
    // Recursive-mutex variant, kept for reference but compiled out.
    pthread_mutexattr_t attr;
    CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR);
    CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR);
    CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
#endif
    return CNMEM_STATUS_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::finalize() {
#ifdef WIN32
    CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::lock() const {
#ifdef WIN32
    CNMEM_CHECK_WIN32(EnterCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    // Cast away const: the pthread mutex is logically mutable state.
    CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::unlock() const {
#ifdef WIN32
    CNMEM_CHECK_WIN32(LeaveCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// A node in a singly linked list of memory regions managed by a Manager.
class Block {
    char *mRegion;          // Pointer to the memory region on the device.
    std::size_t mBytes;     // Size of the region in bytes.
    Block *mSucc;           // Next block in the (address-sorted) list.
    bool mHead;             // True if obtained from parent->allocate or cudaMalloc.

public:
    /// Create a block.
    Block(char *data, std::size_t size, Block *next, bool isHead) :
        mRegion(data), mBytes(size), mSucc(next), mHead(isHead) {}

    /// The data.
    inline const char* getData() const { return mRegion; }
    /// The data (mutable).
    inline char* getData() { return mRegion; }
    /// The size of the block.
    inline std::size_t getSize() const { return mBytes; }
    /// The next block in the linked list.
    inline const Block* getNext() const { return mSucc; }
    /// The next block in the linked list (mutable).
    inline Block* getNext() { return mSucc; }
    /// Is it a head block.
    inline bool isHead() const { return mHead; }

    /// Change the next block.
    inline void setNext(Block *next) { mSucc = next; }
    /// Change the size of the block.
    inline void setSize(std::size_t size) { mBytes = size; }
    /// Set the head flag.
    inline void setHeadFlag(bool isHead) { mHead = isHead; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// A (possibly hierarchical) pool allocator for one device/stream pair.
// Roots own cudaMalloc'ed memory; children allocate from their parent.
class Manager {
    /// The parent manager.
    Manager *mParent;
    /// The children managers.
    std::vector<Manager*> mChildren;
    /// The GPU device where the memory is allocated.
    int mDevice;
    /// The stream this manager is associated with. It could be NULL.
    cudaStream_t mStream;
    /// Is the stream blocking?
    bool mIsStreamBlocking;
    /// The list of used blocks.
    Block *mUsedBlocks;
    /// The list of free blocks.
    Block *mFreeBlocks;
    /// The managed memory size.
    std::size_t mSize;
    /// The flags.
    unsigned mFlags;
    /// To support multi-threading. Each manager has its own mutex.
    Mutex mMutex;

public:
    /// Create an uninitialized manager.
    Manager();
    /// Dtor.
    ~Manager();

    /// Allocate a block of memory.
    cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
    /// Release a block of memory.
    cnmemStatus_t release(void *ptr);
    /// Release memory. It returns true if we have no memory leak.
    cnmemStatus_t releaseAllUnsafe();
    /// Reserve memory for a manager.
    cnmemStatus_t reserve(std::size_t size);
    /// Steal memory from another manager.
    cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
    /// Print the full memory state.
    cnmemStatus_t printMemoryState(FILE *file) const;

    /// The amount of used memory.
    inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
        return getMemoryUnsafe(usedMemory, mUsedBlocks);
    }
    /// The amount of free memory.
    inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
        return getMemoryUnsafe(freeMemory, mFreeBlocks);
    }

    /// Get a specific child based on the stream id.
    cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
    /// Get a specific child based on its index.
    cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
    /// Add a new child.
    cnmemStatus_t addChild(Manager *manager);
    /// The number of children.
    cnmemStatus_t getNumChildren(std::size_t &numChildren) const;

    /// The associated device.
    inline int getDevice() const { return mDevice; }
    /// The flags.
    inline unsigned getFlags() const { return mFlags; }
    /// Get the mutex.
    inline const Mutex* getMutex() const { return &mMutex; }
    /// The size allocated to that manager.
    inline std::size_t getSize() const { return mSize; }
    /// The CUDA stream.
    inline cudaStream_t getStream() const { return mStream; }
    /// Define the parent.
    inline void setParent(Manager *parent) { mParent = parent; }
    /// Define the device.
    inline void setDevice(int device) { mDevice = device; }
    /// Define the stream (also records whether it is a blocking stream).
    inline cnmemStatus_t setStream(cudaStream_t stream) {
        mStream = stream;
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
        mIsStreamBlocking = false;
#elif CUDART_VERSION < 5050
        mIsStreamBlocking = true;
#else
        unsigned flags = 0;
        CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
        mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
#endif
        return CNMEM_STATUS_SUCCESS;
    }
    /// Define the flags.
    inline void setFlags(unsigned flags) { mFlags = flags; }

private:
    /// The member functions below which are marked "Unsafe" are not thread-safe when called on a
    /// same Manager object. Make sure they are called by a single thread in that case.

    /// Allocate a new block and add it to the free list.
    cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
    /// Release a block from the active list.
    cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
    /// Find the best free node based on the size.
    cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
    /// Extract a node from the list of free blocks.
    cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen);
    /// Give a free block from that manager.
    cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
    /// Steal a block from another manager.
    cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
    /// The memory consumption of a list.
    cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const;
    /// Print an internal linked list.
    cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Construct an empty manager: no parent, no memory, device -1 (unset).
Manager::Manager()
    : mParent(NULL)
    , mChildren()
    , mDevice(-1)
    , mStream(NULL)
    , mIsStreamBlocking(false)
    , mUsedBlocks(NULL)
    , mFreeBlocks(NULL)
    , mSize(0)
    , mFlags(CNMEM_FLAGS_DEFAULT)
    , mMutex() {
    // The return status of initialize() is deliberately ignored here (ctor).
    mMutex.initialize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Tear down: release everything, but only if the device is still selectable.
Manager::~Manager() {
    if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it.
        return;
    }
    releaseAllUnsafe();
    mMutex.finalize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Register `manager` as a child of this manager (thread-safe).
cnmemStatus_t Manager::addChild(Manager *manager) {
    CNMEM_CHECK(mMutex.lock());
    mChildren.push_back(manager);
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Best-fit allocation from this manager's pool. On success `ptr` points into
// the pool; on exhaustion (after optionally growing) returns OUT_OF_MEMORY
// with ptr == NULL.
cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) {
    CNMEM_CHECK(mMutex.lock());
    // If the client is not blocking, we have to explicitly synchronize before giving one buffer.
    if( !isBlocking ) {
        CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex);
    }
    // Find the best fit.
    Block *best = NULL, *prev = NULL;
    CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex);
    // If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
    if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) {
        CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex);
    }
    // Make sure we do have a block or quit.
    if( !best ) {
        ptr = NULL;
        CNMEM_CHECK(mMutex.unlock());
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // Split the free block if needed.
    CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex);
    // Push the node to the list of used nodes.
    best->setNext(mUsedBlocks);
    mUsedBlocks = best;
    // Return the new pointer into memory.
    ptr = mUsedBlocks->getData();
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Acquire a fresh chunk (from the parent, or cudaMalloc at the root) and
// insert it into the free list, which is kept sorted by data address.
// Outputs: `curr` is the new block, `prev` its predecessor in the free list.
cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) {
    // Reset the outputs.
    curr = prev = NULL;
    // Try to allocate data from the parent or the device.
    void *data = NULL;
    if( mParent ) {
        CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking));
    }
    else {
        CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size);
        CNMEM_CHECK_CUDA(cudaMalloc(&data, size));
        CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data);
    }
    // If it failed, there's an unexpected issue.
    CNMEM_ASSERT(data);
    // We have data, we now need to add it to the list of free nodes. We keep the list sorted.
    Block *next = mFreeBlocks;
    for( ; next && next->getData() < data ; next = next->getNext() ) {
        prev = next;
    }
    curr = new Block((char*) data, size, next, true);
    if( !curr ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    if( prev ) {
        prev->setNext(curr);
    }
    else {
        mFreeBlocks = curr;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Detach `curr` from the free list, splitting it first when it is larger than
// `size`; the remainder stays in the free list with its head flag set to
// `stolen`. `prev` must be curr's predecessor (or NULL if curr is the head).
cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) {
    // We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
    Block *next;
    if( curr->getSize() == size ) {
        next = curr->getNext();
    }
    else {
        std::size_t remaining = curr->getSize()-size;
        Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen);
        if( !newBlock ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        next = newBlock;
        curr->setSize(size);
    }
    // Redo the "branching" in the nodes.
    if( prev ) {
        prev->setNext(next);
    }
    else {
        mFreeBlocks = next;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Best-fit scan of the free list: pick the smallest block whose size is
// >= `size`. Outputs NULL in `best` when no block is large enough.
cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) {
    best = NULL, prev = NULL;
    for( Block *temp = mFreeBlocks, *tempPrev = NULL ; temp ; temp = temp->getNext() ) {
        if( temp->getSize() >= size && (!best || temp->getSize() < best->getSize()) ) {
            best = temp;
            prev = tempPrev;
        }
        tempPrev = temp;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Find the child manager bound to `stream`; INVALID_ARGUMENT if none matches.
cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const {
    CNMEM_CHECK(mMutex.lock());
    std::size_t i = 0, numChildren = mChildren.size();
    for( ; i < numChildren ; ++i ) {
        if( mChildren[i]->mStream == stream ) {
            manager = mChildren[i];
            break;
        }
    }
    CNMEM_CHECK(mMutex.unlock());
    return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Fetch the i-th child; INVALID_ARGUMENT if the index is out of range.
cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const {
    CNMEM_CHECK(mMutex.lock());
    if( i >= mChildren.size() ) {
        CNMEM_CHECK(mMutex.unlock());
        return CNMEM_STATUS_INVALID_ARGUMENT;
    }
    manager = mChildren[i];
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Sum the sizes of every block in the list starting at `head`.
cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const {
    std::size_t total = 0;
    const Block *node = head;
    while( node ) {
        total += node->getSize();
        node = node->getNext();
    }
    size = total;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
// NOTE(review): dead code (compiled out). As written it would not build:
// CNMEM_CHECK_OR_UNLOCK takes a mutex argument and `status` is undeclared.
cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const {
    CNMEM_CHECK(mMutex.lock());
    CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head));
    CNMEM_CHECK(mMutex.unlock());
    return status;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
// Thread-safe count of registered child managers.
cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const {
    CNMEM_CHECK(mMutex.lock());
    numChildren = mChildren.size();
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Donate a free block of at least `size` bytes to the caller. Synchronizes
// the stream first so the memory is idle; the Block node itself is deleted
// and ownership of the region transfers with (blockData, blockSize).
cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) {
    // Make sure the block is not in use any more. It could be too coarse grain and we may change
    // it in the future.
    CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream));
    // Init the returned values to 0.
    blockData = NULL;
    blockSize = 0;
    // Find the best node to steal and reserve it.
    Block *best = NULL, *prev = NULL;
    CNMEM_CHECK(findBestBlockUnsafe(best, prev, size));
    if( !best ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true));
    blockData = best->getData();
    blockSize = best->getSize();
    // Release the memory used by that block.
    delete best;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dump one block list (name, total size, then one line per node) to `file`.
cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const {
    // Reuse getMemoryUnsafe instead of duplicating its summation loop.
    std::size_t size = 0;
    CNMEM_CHECK(getMemoryUnsafe(size, head));
    fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
    for( const Block *curr = head ; curr ; curr = curr->getNext() ) {
        fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
            (std::size_t) curr,
            (std::size_t) curr->getData(),
            (std::size_t) curr->getSize(),
            (std::size_t) curr->getNext(),
            (std::size_t) curr->isHead ());
    }
    fprintf(file, "|\n");
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Print the full memory state (summary line plus used and free lists) of this manager
// to "file", then recurse into the parent so the whole chain up to the root is shown.
// The snapshot of this manager is taken under its own mutex; the parent takes its own.
cnmemStatus_t Manager::printMemoryState(FILE *file) const {
    CNMEM_CHECK(mMutex.lock());
    // The stream pointer doubles as a printable identifier for this manager.
    std::size_t streamCode = (std::size_t) mStream;
    std::size_t usedMemory, freeMemory;
    CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex);
    CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex);
    fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
        mParent ? "child" : "root",
        mDevice,
        streamCode,
        usedMemory,
        freeMemory);
    CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex);
    CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex);
    fprintf(file, "\n");
    CNMEM_CHECK(mMutex.unlock());
    // Walk up the tree after releasing our own lock.
    if( mParent ) {
        CNMEM_CHECK(mParent->printMemoryState(file));
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return a previously allocated pointer to this manager's pool. Releasing NULL is a
// no-op (mirrors free(NULL)); a pointer this manager does not own is reported as
// CNMEM_STATUS_INVALID_ARGUMENT.
cnmemStatus_t Manager::release(void *ptr) {
    // NULL is always accepted.
    if( !ptr ) {
        return CNMEM_STATUS_SUCCESS;
    }
    // Serialize against concurrent allocate/release calls.
    CNMEM_CHECK(mMutex.lock());
    // Scan the used list for the block whose payload is "ptr", keeping its predecessor.
    Block *pred = NULL;
    Block *node = mUsedBlocks;
    while( node != NULL && node->getData() != ptr ) {
        pred = node;
        node = node->getNext();
    }
    // Unknown pointer: unlock and report the bad argument.
    if( node == NULL ) {
        CNMEM_CHECK(mMutex.unlock());
        return CNMEM_STATUS_INVALID_ARGUMENT;
    }
    // Hand the block back to the free list (possibly coalescing with neighbours).
    cnmemStatus_t result = releaseBlockUnsafe(node, pred);
    CNMEM_CHECK(mMutex.unlock());
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Release everything this manager (and its children, recursively) holds. Used blocks
// are force-released only on roots; free head blocks are either given back to the
// parent (children) or cudaFree'd (roots). Caller must hold whatever synchronization
// is needed -- this routine takes no locks ("Unsafe").
cnmemStatus_t Manager::releaseAllUnsafe() {
    // Destroy the children if any.
    for( std::size_t i = 0; i < mChildren.size(); ++i ) {
        Manager *child = mChildren[i];
        CNMEM_CHECK(child->releaseAllUnsafe());
        delete child;
    }
    mChildren.clear();
    // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
    if( !mParent ) {
        // releaseBlockUnsafe moves the list head into the free list each iteration.
        while( mUsedBlocks ) {
            CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL));
        }
    }
    // We should be having only free blocks that are head blocks. Release those blocks.
    while( mFreeBlocks ) {
        if( mParent ) {
            // A child's memory belongs to its parent: hand it back rather than freeing.
            CNMEM_CHECK(mParent->release(mFreeBlocks->getData()));
        }
        else if( mFreeBlocks->isHead() ) {
            // Roots own the device memory: only "head" blocks correspond to actual
            // cudaMalloc allocations, so only those are cudaFree'd.
            void *data = mFreeBlocks->getData();
            CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data);
            CNMEM_CHECK_CUDA(cudaFree(data));
            CNMEM_DEBUG_INFO(">> success\n");
        }
        Block *block = mFreeBlocks;
        mFreeBlocks = mFreeBlocks->getNext();
        delete block;
    }
    // We shouldn't have any used block left. Or, it means the user is causing memory leaks!
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Move "curr" (whose predecessor in the used list is "prev", or NULL if it is the
// list head) from the used list into the address-sorted free list, coalescing it
// with adjacent free blocks when they are contiguous in memory. Blocks are never
// merged across "head" boundaries, i.e. across distinct cudaMalloc/steal regions.
// No locking is done ("Unsafe"); the caller holds the manager's mutex.
cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) {
    // The current node cannot be NULL!
    CNMEM_ASSERT(curr != NULL);
    // Change the connection of the node.
    if( prev ) {
        prev->setNext(curr->getNext());
    }
    else {
        mUsedBlocks = curr->getNext();
    }
    // Find the location where this block should be added to the free list.
    // The free list is kept sorted by data address, which makes merging possible.
    prev = NULL;
    Block *iter = mFreeBlocks;
    for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) {
        prev = iter;
    }
    // Keep track of the successor of pred. We may lose track of it in the following "else".
    Block *next = prev ? prev->getNext() : mFreeBlocks;
    // We first check if we can merge the block with its predecessor in the list and curr can be merged.
    if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) {
        prev->setSize(prev->getSize() + curr->getSize());
        delete curr;
        curr = prev;
    }
    else if( prev ) {
        prev->setNext(curr);
    }
    else {
        mFreeBlocks = curr;
    }
    // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
    if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) {
        curr->setSize(curr->getSize() + next->getSize());
        curr->setNext(next->getNext());
        delete next;
    }
    else {
        curr->setNext(next);
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Pre-allocate "size" bytes for this manager's pool and remember the reserved size.
// Thread-safe wrapper around allocateBlockUnsafe; mSize is only updated on success.
cnmemStatus_t Manager::reserve(std::size_t size) {
    CNMEM_CHECK(mMutex.lock());
    // Out-params of allocateBlockUnsafe; their values are not used here.
    Block *allocated = NULL, *pred = NULL;
    CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(allocated, pred, size), mMutex);
    mSize = size;
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Try to satisfy an allocation of "size" bytes by stealing memory from a related
// manager: from our children when we have any, otherwise through our parent. On
// success the stolen region becomes a new head block of our "used" list and its
// pointer is returned through "stolen". No locking ("Unsafe"); caller holds locks.
cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) {
    // If we cannot steal, don't even try.
    if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) {
        stolen = NULL;
        return CNMEM_STATUS_INVALID_ARGUMENT;
    }
    // The stolen block.
    void *data = NULL; std::size_t dataSize = 0;
    if( !mChildren.empty() ) {
        CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size));
    }
    else if( mParent ) {
        CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size));
    }
    // Make sure we do have a block of memory or quit.
    if( !data ) {
        stolen = NULL;
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // Push the block in the used list.
    // NOTE(review): the NULL check below only fires with a non-throwing operator new;
    // with the default throwing new it is dead code -- confirm the build configuration.
    mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true);
    if( !mUsedBlocks ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // Return the new pointer into memory.
    stolen = data;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Steal a block of at least "size" bytes from one of our children, then carve the
// stolen region out of our own "used" list so the bookkeeping stays exact. On
// success, "data"/"dataSize" describe the region obtained from the child.
//
// Fix: the previous version maintained two locals that were written but never read
// ("prev", tracking the predecessor of the matching used-list node, and "result",
// tracking the node describing the stolen region); that dead bookkeeping is removed.
// NOTE(review): when the stolen region starts mid-block (sizeBefore > 0), the
// out-params data/dataSize are advanced to describe the tail piece before returning;
// callers appear to rely on the original values -- confirm this is intended.
cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, ::size_t size) {
    // No block found and no room to grow: ask each child in turn to donate a block.
    data = NULL;
    for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
        Manager *child = mChildren[i];
        if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
            break;
        }
    }
    // If no memory space found, simply return. We have failed to allocate.
    if( !data ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // We got a region from a child. Find the node of our "used" list that contains it.
    Block *curr = mUsedBlocks;
    for( ; curr ; curr = curr->getNext() ) {
        if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
            break;
        }
    }
    // Curr points to the node which contains that memory region.
    CNMEM_ASSERT(curr);
    // If it is exactly the same memory region, we are done!!!
    if( curr->getData() == data && curr->getSize() == dataSize ) {
        return CNMEM_STATUS_SUCCESS;
    }
    // Otherwise curr must be split into up to three pieces: [before][stolen][after].
    // Keep track of the node following curr before we rewire the list.
    Block *next = curr->getNext();
    std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
    std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
    // If we have no space between curr->getData and the stolen data, just shrink curr.
    if( sizeBefore == 0 ) {
        curr->setSize(dataSize);
    }
    else {
        // Shrink curr to the "before" piece and insert a new node for the stolen region.
        curr->setSize(sizeBefore);
        Block *block = new Block((char*) data, dataSize, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
        data = (char*) data + dataSize;
        dataSize = sizeAfter;
    }
    // We have space at the end so we may need to add a new node for the "after" piece.
    if( sizeAfter > 0 ) {
        Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Global singleton holding one Manager per device ordinal. Its lifetime is governed
// by an explicit reference count (retain()/release()); validity is tracked by a magic
// number in sCtxCheck so that check() works even before create() has run (static
// storage is zero-initialized).
class Context {
    /// Use a magic number to specify that the context is valid.
    enum { CTX_VALID = 0x1f5632a3 };
    /// The reference counting mechanism.
    int mRefCount;
    /// The mutex to increase/decrease the reference counter. TODO: Use atomics.
    Mutex mMutex;
    /// The memory managers, indexed by device ordinal.
    std::vector<Manager> mManagers;
    /// The global context.
    static Context *sCtx;
    /// Use a magic number to specify that the context was created.
    static int sCtxCheck;
public:
    /// Ctor. A new context starts with a reference count of 1.
    Context() : mRefCount(1) { mMutex.initialize(); }
    /// Dtor. Releases all managers' memory.
    ~Context();
    /// Get the managers.
    inline std::vector<Manager>& getManagers() { return mManagers; }
    /// Get a single manager associated with a device.
    inline Manager& getManager(int i) { return mManagers[i]; }
    /// Create the global context.
    static cnmemStatus_t create();
    /// Check that the context was created.
    static inline bool check() { return sCtxCheck == CTX_VALID && sCtx; }
    /// Get the global context.
    static Context* get();
    /// Retain.
    static cnmemStatus_t retain();
    /// Release.
    static cnmemStatus_t release();
};
// Definitions of the global context singleton and its validity marker. Both are
// zero-initialized at static-initialization time and set by Context::create().
Context *Context::sCtx;
int Context::sCtxCheck;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Destructor: release every valid manager's memory (switching the current device to
// each manager's device first), then restore the caller's current device. The CUDA
// return codes are unchecked here -- a destructor has no way to report errors.
Context::~Context() {
    int oldDevice;
    cudaGetDevice(&oldDevice);
    for( std::size_t i = 0 ; i < mManagers.size() ; ++i ) {
        if( mManagers[i].getDevice() != -1 ) { // Skip invalid managers.
            cudaSetDevice(mManagers[i].getDevice());
            mManagers[i].releaseAllUnsafe();
        }
    }
    mManagers.clear();
    mMutex.finalize();
    cudaSetDevice(oldDevice);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Create the global context singleton and mark it as valid.
// NOTE(review): if create() is called while a context already exists, the previous
// instance is overwritten and leaked. The only caller in view is cnmemInit; confirm
// that double initialization is not a supported use case.
cnmemStatus_t Context::create() {
    sCtx = new Context;
    sCtxCheck = CTX_VALID;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return the global context. Callers must ensure create() has already run and the
// context has not been destroyed; this is only enforced by an assertion.
Context* Context::get() {
    CNMEM_ASSERT(Context::check());
    return Context::sCtx;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Increment the reference counter of the global context under its mutex.
cnmemStatus_t Context::retain() {
    Context *ctx = sCtx;
    CNMEM_CHECK(ctx->mMutex.lock());
    ++ctx->mRefCount;
    CNMEM_CHECK(ctx->mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Decrement the reference counter of the global context; destroy the context when it
// drops to zero. The counter is updated under the context mutex, but the delete
// happens outside the lock -- the mutex is owned by the object being destroyed.
cnmemStatus_t Context::release() {
    CNMEM_CHECK(sCtx->mMutex.lock());
    int refCount = --sCtx->mRefCount;
    CNMEM_CHECK(sCtx->mMutex.unlock());
    if( refCount == 0 ) { // Kill the context.
        delete sCtx;
        Context::sCtx = NULL;
        Context::sCtxCheck = 0;
    }
    return CNMEM_STATUS_SUCCESS;
}
} // namespace cnmem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize the library: create the global context, one root manager per listed
// device, and one child manager per named stream. Must be called once, by a single
// thread, before any other cnmem function.
// Fixes: "devices" was dereferenced without a NULL check, a negative device ordinal
// would have indexed the manager vector out of bounds, and the status of
// Context::create() was silently discarded.
cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
    // Make sure we have at least one device declared and a valid descriptor array.
    CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
    CNMEM_CHECK_TRUE(devices != NULL, CNMEM_STATUS_INVALID_ARGUMENT);
    // Find the largest ID of the device, rejecting invalid ordinals on the way.
    int maxDevice = 0;
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_TRUE(devices[i].device >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
        if( devices[i].device > maxDevice ) {
            maxDevice = devices[i].device;
        }
    }
    // Create the global context.
    CNMEM_CHECK(cnmem::Context::create());
    cnmem::Context *ctx = cnmem::Context::get();
    // Allocate one manager slot per device ordinal up to the largest requested one.
    std::vector<cnmem::Manager> &managers = ctx->getManagers();
    managers.resize(maxDevice+1);
    // Create a root manager for each device and create the children.
    int oldDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice));
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device));
        std::size_t size = devices[i].size;
        // size == 0 means "let the library choose": take half of the device memory.
        if( size == 0 ) {
            cudaDeviceProp props;
            CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT);
        cnmem::Manager &manager = ctx->getManager(devices[i].device);
        manager.setDevice(devices[i].device);
        manager.setFlags(flags);
        // Round the reservation up to the pool granularity.
        size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
        CNMEM_CHECK(manager.reserve(size));
        // One child manager per named stream. Children are always allowed to grow:
        // their requests fall back to the root when their own reservation runs out.
        for( int j = 0 ; j < devices[i].numStreams ; ++j ) {
            cnmem::Manager *child = new cnmem::Manager;
            child->setParent(&manager);
            child->setDevice(devices[i].device);
            child->setStream(devices[i].streams[j]);
            child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW);
            if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) {
                CNMEM_CHECK(child->reserve(devices[i].streamSizes[j]));
            }
            CNMEM_CHECK(manager.addChild(child));
        }
    }
    // Restore the caller's current device.
    CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Tear down the library: equivalent to a final release of the global context.
// Per the header documentation, not thread-safe; call after all workers have joined.
cnmemStatus_t cnmemFinalize() {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    return cnmem::Context::release();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Increment the reference count of the global context (see cnmemRelease).
cnmemStatus_t cnmemRetain() {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    return cnmem::Context::retain();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Decrement the reference count of the global context; the context (and all managed
// memory) is destroyed when the count reaches zero.
cnmemStatus_t cnmemRelease() {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    return cnmem::Context::release();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Register a new named stream: attach a child manager for it under the current
// device's root manager. The NULL stream is rejected (it is handled by the root).
// Fix: the status returned by Manager::addChild was silently ignored; it is now
// propagated with CNMEM_CHECK, consistent with the cnmemInit code path.
cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *child = new cnmem::Manager;
    child->setParent(&root);
    child->setDevice(device);
    child->setStream(stream);
    // Children may always grow: they fall back to the root when out of memory.
    child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW);
    CNMEM_CHECK(root.addChild(child));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Allocate "size" bytes from the manager serving "stream" on the current device.
// ptr == NULL with size == 0 is a no-op; size == 0 stores NULL into *ptr. When the
// chosen manager is out of memory, the call locks every child manager plus the root
// and attempts to steal a buffer from another manager (see the header docs).
cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    if( !ptr && !size ) {
        return CNMEM_STATUS_SUCCESS;
    }
    else if( !size ) {
        ptr[0] = NULL;
        return CNMEM_STATUS_SUCCESS;
    }
    CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Route the request to the root manager, or the named stream's child manager.
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *manager = &root;
    if( stream ) {
        CNMEM_CHECK(root.getChildFromStream(manager, stream));
    }
    CNMEM_ASSERT(manager);
    // Pool sizes are always multiples of the granularity.
    size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
    cnmemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
        // Try to acquire locks on all the children.
        std::size_t numChildren;
        CNMEM_CHECK(root.getNumChildren(numChildren));
        std::vector<const cnmem::Mutex*> mutexes(numChildren);
        std::size_t numLocked = 0;
        for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
            cnmem::Manager *child;
            CNMEM_CHECK(root.getChild(child, i));
            mutexes[numLocked] = child->getMutex();
            if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
                break;
            }
        }
        // One lock failed, quit. Reduce the damage as much as possible, though.
        if( numLocked != numChildren ) {
            for( std::size_t i = 0 ; i < numLocked ; ++i ) {
                // Best effort: the unlock status is ignored on this bail-out path.
                cnmemStatus_t lockStatus = mutexes[i]->unlock();
            }
            return CNMEM_STATUS_UNKNOWN_ERROR;
        }
        // Grab the lock on the root, first.
        const cnmem::Mutex *rootMutex = root.getMutex();
        CNMEM_CHECK(rootMutex->lock());
        // We acquired all the lock so we try to steal a node from another child.
        if( numLocked == mutexes.size() ) {
            result = manager->stealUnsafe(ptr[0], size);
        }
        for( std::size_t i = 0 ; i < numLocked ; ++i ) {
            cnmemStatus_t lockStatus = mutexes[i]->unlock();
            if( lockStatus != CNMEM_STATUS_SUCCESS ) {
                // Starting from now we are panicking!!! One lock failed to be released, we try
                // we others. We could also give up because we are already screwed. I don't know
                // what's best! Comment are welcome.
                result = lockStatus;
            }
        }
        CNMEM_CHECK(rootMutex->unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return "ptr" to the manager that serves "stream" on the current device.
// Freeing NULL is a no-op, mirroring free().
cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    if( !ptr ) {
        return CNMEM_STATUS_SUCCESS;
    }
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Route the release to the root manager, or the named stream's child manager.
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *target = &root;
    if( stream != NULL ) {
        CNMEM_CHECK(root.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Report the free and total (free + used) bytes managed by the manager serving
// "stream" on the current device. Linear in the number of blocks (see header docs).
cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Pick the root manager, or the child registered for the named stream.
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *manager = &root;
    if( stream ) {
        CNMEM_CHECK(root.getChildFromStream(manager, stream));
    }
    CNMEM_ASSERT(manager);
    // Take both measurements under a single lock so they are mutually consistent.
    const cnmem::Mutex *mutex = manager->getMutex();
    CNMEM_CHECK(mutex->lock());
    CNMEM_CHECK_OR_UNLOCK(manager->getFreeMemoryUnsafe(*freeMem), *mutex);
    size_t usedMem;
    CNMEM_CHECK_OR_UNLOCK(manager->getUsedMemoryUnsafe(usedMem), *mutex);
    CNMEM_CHECK(mutex->unlock());
    totalMem[0] = usedMem + freeMem[0];
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dump the memory state of the manager serving "stream" on the current device
// (and, via the manager, its ancestors) to "file".
cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Select the manager: the device root, or the child registered for "stream".
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *target = &root;
    if( stream != NULL ) {
        CNMEM_CHECK(root.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->printMemoryState(file);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
/** Status codes returned by every cnmem entry point. */
typedef enum
{
    CNMEM_STATUS_SUCCESS = 0,       /**< The call completed successfully. */
    CNMEM_STATUS_CUDA_ERROR,        /**< A CUDA runtime call failed. */
    CNMEM_STATUS_INVALID_ARGUMENT,  /**< An argument was invalid (e.g. a NULL pointer). */
    CNMEM_STATUS_NOT_INITIALIZED,   /**< cnmemInit has not been called. */
    CNMEM_STATUS_OUT_OF_MEMORY,     /**< No suitable memory could be found or allocated. */
    CNMEM_STATUS_UNKNOWN_ERROR      /**< An unexpected internal error (e.g. a lock failure). */
} cnmemStatus_t;
/* ********************************************************************************************* */
/** Flags controlling a memory manager's behaviour, passed to ::cnmemInit. */
typedef enum
{
    CNMEM_FLAGS_DEFAULT = 0,       ///< Default flags.
    CNMEM_FLAGS_CANNOT_GROW = 1,   ///< Prevent the manager from growing its memory consumption.
    CNMEM_FLAGS_CANNOT_STEAL = 2,  ///< Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
/** Description of one device to manage, passed to ::cnmemInit. */
typedef struct cnmemDevice_t_
{
    /** The device number. */
    int device;
    /** The size to allocate for that device. If 0, the implementation chooses the size. */
    size_t size;
    /** The number of named streams associated with the device. The NULL stream is not counted. */
    int numStreams;
    /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
    cudaStream_t *streams;
    /** The size reserved for each streams. It can be 0. It can also be NULL. */
    size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (it is the size defined in device[i].size). For each, named stream an additional
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
 * make sure no kernel uses a given buffer before stealing it) and the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
 * The pointers totalMem and freeMem must be valid. At the moment, this function has a
 * complexity linear in the number of allocated blocks, so do not call it in performance
 * critical sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example,
 * file == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
// Convolution border modes (mirroring Theano's conv border modes): FULL produces
// every partial overlap, VALID keeps only fully-overlapping positions.
enum { ConvMode_FULL, ConvMode_VALID };
// Dispatch a 2D convolution of "img" by "kern" into "out" on the GPU. "mode" is one
// of the ConvMode_* values; subsample_rows/cols are the strides; "version" selects a
// specific kernel implementation (-1 = autodetect, per the comment below); "verbose"
// enables debug prints.
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
*/
int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0,
int max_threads_dim0 = 512
)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (verbose>1)
{
fprintf(stderr,
"INFO: Running conv_valid version=%d,"
" MACRO kern_width=%d with inputs:\n",
version, THEANO_KERN_WID);
fprintf(stderr,
"INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
fprintf(stderr,
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
fprintf(stderr,
"INFO: out dim: %i %i %i %i out stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
fprintf(stderr,
"INFO: subsample_rows=%d, subsample_cols=%d\n",
subsample_rows, subsample_cols);
}
//Check the output size is valid
if (!(CudaNdarray_HOST_DIMS(out)[2] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2]- CudaNdarray_HOST_DIMS(kern)[2] + 1, subsample_rows) ||
CudaNdarray_HOST_DIMS(out)[3] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3]- CudaNdarray_HOST_DIMS(kern)[3] + 1, subsample_cols) ||
CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0] ||
CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0] ||
CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1])) {
PyErr_SetString(PyExc_ValueError, "GpuConv: sizes don't match");
return -1;
}
// we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
const int img_len=CudaNdarray_HOST_DIMS(img)[2];
const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2];
const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
const int img_stride_stack= CudaNdarray_HOST_STRIDES(img)[1];
const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1;
}
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = CudaNdarray_is_c_contiguous(img);
bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
bool out_contiguous = CudaNdarray_is_c_contiguous(out);
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//If the lower 2 dims are c_contiguous but flipped, unflipping the
// strides and not flipping the kernel in shared memory
//allows using a version that needs fewer registers (so is faster).
//The "unflipped" variables keep the original value when
//we don't need to unflip, and the new value when we do unflip.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata;
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
//If we remove the restriction
//img_size_byte+kern_size_byte>8*1024, we can enter a condition where
//we would lower the occupancy due to shared memory and/or registers.
if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_patch_stack
{
//version 1 is without split and preload the full kernel
//version 3 is with split and preload the full kernel
//version 11 is without split and load only 1 kernel row at a time.
//version 12 is with split and load only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//there is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
//TODO: if not c_contiguous, lower max_thread as we use 22
//registers per thread and we won't execute 2 blocks on one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if (0)
fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z);
if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>;
} else {
//fprintf(stderr, "using true version\n");
f = conv_rows_stack<THEANO_KERN_WID, true>;
}
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //there is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
{
// version 9:we preload the full kernel
// version 10: load only a few row at a time.
int nb_row=1;
int version_back = version;
//TODO: if not c_contiguous, lower max_thread as we use 22 registers per thread and we won't execute 2 blocks on one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9;
else if(version==-1)version = 10;
int k_size = kern_size;
if(version==10)
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n",
(version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
//version 8 is the same but we force the split.
// The split is needed in case we have too many threads.
// This happens frequently if the kernel length is big.
// Big kernels are frequent in the gradient.
//version 8 needs a minimum kernel length as we force the split.
//version 8 is needed to test this kernel template parameter more easily.
//version 13 loads only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
//check if we can fit the full kernel in the shared memory
if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
full_kern = false;
}
//thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that
// a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
// b) thread_z * out_len * out_wid fits in the thread count
// c) the kernel doesn't need too much shared memory
// constraint (a)
// device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>max_threads_dim0)
nb_split++;
// tentative estimates (prior to constraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
}
if (nb_split <= kern_len)
{
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, "
"grid.x=%i, grid.y=%i, shared_size=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
" kern_flipped=%i ccontig=%i nb_split=%d,"
" preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
} // else no good nb_splits was found
}
if (1 && (version==6||version==-1) &&
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int block_nstack=nstack;
//Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1);
int n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
* */
int initial_reduce_boundary = (1 << (int)(log2((double)(n_reduce_buf/sizeof(float)))));
if (initial_reduce_boundary == (n_reduce_buf / sizeof(float)))
initial_reduce_boundary >>= 1;
if (n_reduce_buf == sizeof(float))
assert (initial_reduce_boundary == 0);
else
{
assert (initial_reduce_boundary * 2 >= n_reduce_buf/sizeof(float));
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
void (*f)(int, int, int, int,
int, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
f=conv_valid_row_reduce<false>;
else
f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],
img_stride_row, img_stride_col,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols, initial_reduce_boundary);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (1 && !work_complete) //conv_reference_valid
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (1)
{
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose>1)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
}
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
}
else
{
if (verbose)
fprintf(stderr, "INFO: 'conv_reference_valid' failed\n");
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_valid! (%s)",
cudaGetErrorString(sts));
return -1;
}
}
if (!work_complete)
{
PyErr_Format(PyExc_RuntimeError,
"ERROR: no implementation(s) worked for"
" CudaNdarray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)",
version);
return -1;
}
return 0;
}
// GPU "full"-mode batched 2-D convolution:
//   out[b, k] = sum over s of img[b, s] convolved (full) with kern[k, s],
// with optional subsampling of the output rows/cols.
//
// Tries several CUDA kernel implementations in decreasing order of
// specialization; each attempt that fails at launch falls through to the
// next, ending with a slow reference kernel.
//
// Parameters
//   img, kern, out    : 4-D CudaNdarrays (shapes asserted below).
//   subsample_rows/cols : output subsampling factors.
//   version           : force a specific implementation (-1 = heuristic).
//   verbose           : 0 silent, 1 log the kernel used, >1 dump shapes
//                       and launch parameters.
//   max_threads_dim0  : device limit on threads per block along x.
//
// Returns 0 on success, -1 with a Python exception set on failure.
int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
                      CudaNdarray * out, int subsample_rows,
                      int subsample_cols, int version = -1, int verbose=0,
                      int max_threads_dim0=512)
{
    //144 is the biggest static shared size used with compiling this file.
    const int shared_avail = SHARED_SIZE - 150;
    int work_complete = 0;
    if (img->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required img of 4D");
        return -1;
    }
    if (kern->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required kern of 4D");
        return -1;
    }
    if (out->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required out of 4D");
        return -1;
    }
    // check the size of the output matrix
    // (full-mode output is img + kern - 1 per spatial dim, then subsampled)
    assert (CudaNdarray_HOST_DIMS(out)[2] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1, subsample_rows));
    assert (CudaNdarray_HOST_DIMS(out)[3] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1, subsample_cols));
    assert (CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0]);
    assert (CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0]);
    assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
    // Cache dimensions and strides in local names.
    // Layout: img (nbatch, nstack, img_len, img_wid),
    //         kern (nkern, nstack, kern_len, kern_wid),
    //         out (nbatch, nkern, out_len, out_wid).
    const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
    const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
    const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
    const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
    const int img_len=CudaNdarray_HOST_DIMS(img)[2];
    const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
    const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
    const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
    const int out_len=CudaNdarray_HOST_DIMS(out)[2];
    const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
    const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
    const int img_stride_stack=CudaNdarray_HOST_STRIDES(img)[1];
    const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
    const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
    const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
    const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
    const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
    // Per-plane element counts and byte sizes, used for the shared-memory
    // budget checks below.
    const int img_size=img_len*img_wid;
    const int kern_size=kern_len*kern_wid;
    const int out_size=out_len*out_wid;
    const int img_size_byte = img_size*sizeof(float);
    const int kern_size_byte = kern_size*sizeof(float);
    //padded image sizes
    // (the image is zero-padded by kern-1 on each side for full conv)
    const int img_wid_padded=img_wid+2*kern_wid-2;
    const int img_len_padded=img_len+2*kern_len-2;
    const int img_size_padded=img_len_padded * img_wid_padded;
    const int img_size_padded_byte = img_size_padded*sizeof(float);
    //const int out_size_byte = out_size*sizeof(float); // unused
    // THEANO_KERN_WID is a compile-time specialization of the kernel
    // width (0 means "generic"); reject a mismatched runtime kernel.
    if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
          (THEANO_KERN_WID == 0))){
        PyErr_Format(PyExc_ValueError,
                     "ERROR: This GpuConv code was compiled for"
                     " %d kernel columns, but the kernel we received"
                     " had %d columns!",
                     THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
        return -1;
    }
    // Contiguity flags drive which (templated) kernel variants are legal.
    bool subsample = subsample_rows!=1 || subsample_cols!=1;
    bool img_contiguous = CudaNdarray_is_c_contiguous(img);
    bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
    bool out_contiguous = CudaNdarray_is_c_contiguous(out);
    bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
    bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
    bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
    bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
    //if the lower 2 dims are c_contiguous but flipped, unflipping the
    //stride and not flipping the kernel in shared memroy
    //allow to use a version that use less registers(so is faster)
    //the unflipped version of variable have the original value when
    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    float * kern_data_unflipped = kern->devdata;
    int kern_stride_col_unflipped=kern_stride_col;
    int kern_stride_row_unflipped=kern_stride_row;
    if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
        //the last two dimensions are c_contiguous but flipped!
        // Point at the last element and walk forward instead.
        kern_stride_col_unflipped=1;
        kern_stride_row_unflipped=kern_wid;
        kern_flipped=false;
        kern_contiguous_2d_unflipped = true;
        kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
    }
    if (verbose>1)
    {
        printf("INFO: Running conv_full version=%d,"
               " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
        printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
               CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
               CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
               CudaNdarray_HOST_STRIDES(img)[0],
               CudaNdarray_HOST_STRIDES(img)[1],
               CudaNdarray_HOST_STRIDES(img)[2],
               CudaNdarray_HOST_STRIDES(img)[3]);
        printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
               CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
               CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
               CudaNdarray_HOST_STRIDES(kern)[0],
               CudaNdarray_HOST_STRIDES(kern)[1],
               CudaNdarray_HOST_STRIDES(kern)[2],
               CudaNdarray_HOST_STRIDES(kern)[3]);
        printf("INFO: out dim: %i %i %i %i out stride: %i %i %i %i\n",
               CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1],
               CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
               CudaNdarray_HOST_STRIDES(out)[0],
               CudaNdarray_HOST_STRIDES(out)[1],
               CudaNdarray_HOST_STRIDES(out)[2],
               CudaNdarray_HOST_STRIDES(out)[3]);
    }
    // ---- Attempt 1: conv_full_patch_stack_padded (versions 3/4/5) ----
    // Keeps a padded image plane plus one kernel plane in shared memory.
    if (!subsample &&
        out_contiguous &&
        (version==3||version==4||version==5||version==-1) &&
        out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
        (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
        (kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
        !work_complete) //conv_full_patch_stack_padded
    {
        //version 3 without split
        //version 4 with split (more registers)
        //version 5 with split (more registers) low mem version(some restriction and still more register)
        int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
        if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
        if(kern_len==1 && version==5){
            //version 5 don't support kern_len==1 as 1%0 return -1.
            version=-1;
            if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
        }
        if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
        //we pass by ceil_intdiv in case the out_len is not a multiple
        //of nb_split, we want nb_split the number of iteration.
        //Max of 16k of shared memory
        if(version==5)
            while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
        //327 as we use 25 register
        //version 5 will have only 1 block running at a time, so we
        //can use 32 registers per threads, but there is some other stuff that
        //for the limit to bu lower then 512.
        int max_thread = (version!=5?327:450);
        while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
        if(version==-1 && out_size>max_threads_dim0)version=4;
        if(version==-1)version=3;
        // NOTE(review): version can no longer be -1 here, so the next two
        // arms are dead code; only the final "version==3 && nb_split!=1"
        // arm can fire.  Kept as-is to preserve behavior.
        if(version==-1 && nb_split>1) version=4;
        else if(version==-1) version=3;
        //force version 4 when more than 1 split are needed to always execute.
        else if(version==3 && nb_split!=1) version=4;
        assert(version!=3 || nb_split==1);
        assert(version!=5 || kern_len>1);
        assert(version!=-1);
        dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
        dim3 grid(nbatch,nkern);
        int shared_size=img_size_padded_byte + kern_size_byte;
        if(version==5)
            shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
        // Function-pointer dispatch over the template instantiations;
        // the macro below picks the variant matching contiguity,
        // version, and kernel-flip state.
        void (*f)(float*, float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
                  int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
        if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
        else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
        else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
        else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
        else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
        else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
        else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
        else assert(false);
        CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (img->devdata, kern_data_unflipped, out->devdata,
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row, img_stride_stack,
             img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
             kern_stride_stack, kern_stride_nkern);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose>1)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, threads.z=%i,"
                        " grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
                        " out_len=%i, nb_split=%i, version=%i\n",
                        threads.x, threads.y, threads.z,
                        grid.x, grid.y, shared_size,
                        threads.x * threads.y * threads.z,
                        out_len, nb_split, version);
            if (verbose)
                fprintf(stderr,
                        "INFO: used 'conv_full_patch_stack_padded'"
                        " nb_split=%d low_mem=%s\n",
                        nb_split, (version==5?"true":"false"));
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, threads.z=%i,"
                        " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
                        " out_len=%i, nb_split=%i, version=%i\n",
                        threads.x, threads.y, threads.z,
                        grid.x, grid.y, shared_size,
                        threads.x * threads.y * threads.z,
                        out_len, nb_split, version);
            if (verbose)
                fprintf(stderr,
                        "INFO: impl 'conv_full_patch_stack_padded' %s %s"
                        " failed (%s), trying next implementation\n",
                        version==3?"no split": "split",
                        (version==5?"low_mem":"not_low_mem"),
                        cudaGetErrorString(sts));
        }
    }
    // ---- Attempt 2: conv_full_patch (version 0) ----
    // Simple variant: whole image + kernel in shared memory; nstack==1 only.
    if (!subsample && c_contiguous &&
        (version==0||version==-1) &&
        out_size<=max_threads_dim0 &&//Maximum of X threads by block
        nstack == 1 &&// don't implement the stack in the kernel.
        img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
        !work_complete) //conv_full_patch
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
        //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
        conv_full_patch<<< grid, threads, shared_size>>>
            (img->devdata,
             kern->devdata,
             out->devdata,
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        threads.x, threads.y, grid.x, grid.y, shared_size,
                        threads.x * threads.y);
            if (verbose)
                fprintf(stderr,
                        "INFO: impl 'conv_full_patch' failed (%s),"
                        " trying next implementation\n",
                        cudaGetErrorString(sts));
        }
    }
    // ---- Attempt 3: conv_full_load_everything (version 1) ----
    // Permanently disabled with `false &&` because its tests fail.
    if (false && !subsample && //disabled as test fail for this kernel
        (version==1||version==-1) &&
        out_size<=max_threads_dim0 &&//Maximum of X threads by block
        (nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
        nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
        !work_complete) //conv_full_load_everything
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch);
        int shared_size=(img_size + kern_size)*nstack*sizeof(float);
        //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
        //typeof(conv_full_load_everything<0>) f = ;
        void (*f)(float*, float*, float*,
                  int, int, int, int, int, int,
                  int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
        f = conv_full_load_everything<THEANO_KERN_WID>;
        f<<< grid, threads, shared_size>>>
            (img->devdata,
             kern->devdata,
             out->devdata,
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack,
             CudaNdarray_HOST_STRIDES(img)[3],
             CudaNdarray_HOST_STRIDES(img)[2],
             CudaNdarray_HOST_STRIDES(img)[1],
             CudaNdarray_HOST_STRIDES(img)[0],
             CudaNdarray_HOST_STRIDES(kern)[3],
             CudaNdarray_HOST_STRIDES(kern)[2],
             CudaNdarray_HOST_STRIDES(kern)[1],
             CudaNdarray_HOST_STRIDES(kern)[0]
             );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        threads.x, threads.y, grid.x, grid.y, shared_size,
                        threads.x * threads.y);
            if (verbose)
                fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
                        " failed (%s), trying next implementation\n",
                        cudaGetErrorString(sts));
        }
    }
    // ---- Attempt 4: conv_full_patch_stack (version 2) ----
    // Like conv_full_patch but loops over the stack dimension.
    if (!subsample &&
        img_batch_stack_contiguous &&
        out_contiguous &&
        (version==2||version==-1) &&
        out_size<=max_threads_dim0 &&//Maximum of X threads by block
        img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
        !work_complete) //conv_full_patch_stack
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
        void (*f)(float*, float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int);
        // Pick the template instantiation matching the 2-D contiguity of
        // img and kern.
        if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
        else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
        else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
        else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
        f<<< grid, threads, shared_size>>>(
            img->devdata,
            kern->devdata,
            out->devdata,
            img_len, img_wid,
            kern_len, kern_wid,
            nkern, nstack,img_stride_col, img_stride_row,
            kern_stride_col, kern_stride_row,
            kern_stride_stack, kern_stride_nkern);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose)
                fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        threads.x, threads.y, grid.x, grid.y,
                        shared_size, threads.x * threads.y);
            if (verbose)
                fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
                        cudaGetErrorString(sts));
        }
    }
    // ---- Last resort: conv_reference_full ----
    // Slow but general reference kernel; supports subsampling and any
    // strides.  Failure here is fatal.
    if (1 && !work_complete) //conv_reference_full
    {
        if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
        int outsize = CudaNdarray_SIZE(out);
        int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
                                 NUM_VECTOR_OP_THREADS_PER_BLOCK);
        // Dead debug-dump block, kept disabled with `if (0)`.
        if (0)
        {
            if (verbose)
                fprintf(stderr, "INFO: launching conv_reference_valid\n");
            if (verbose)
                fprintf(stderr, "  img : %i %i %i %i %p %i %i %i %i\n",
                        CudaNdarray_HOST_DIMS(img)[0],
                        CudaNdarray_HOST_DIMS(img)[1],
                        CudaNdarray_HOST_DIMS(img)[2],
                        CudaNdarray_HOST_DIMS(img)[3],
                        img->devdata,
                        CudaNdarray_HOST_STRIDES(img)[0],
                        CudaNdarray_HOST_STRIDES(img)[1],
                        CudaNdarray_HOST_STRIDES(img)[2],
                        CudaNdarray_HOST_STRIDES(img)[3]);
            if (verbose)
                fprintf(stderr, "  kern: %i %i %i %i %p %i %i %i %i\n",
                        CudaNdarray_HOST_DIMS(kern)[0],
                        CudaNdarray_HOST_DIMS(kern)[1],
                        CudaNdarray_HOST_DIMS(kern)[2],
                        CudaNdarray_HOST_DIMS(kern)[3],
                        kern->devdata,
                        CudaNdarray_HOST_STRIDES(kern)[0],
                        CudaNdarray_HOST_STRIDES(kern)[1],
                        CudaNdarray_HOST_STRIDES(kern)[2],
                        CudaNdarray_HOST_STRIDES(kern)[3]
                    );
            if (verbose)
                fprintf(stderr, "  out : %i %i %i %i %p %i %i %i %i\n",
                        CudaNdarray_HOST_DIMS(out)[0],
                        CudaNdarray_HOST_DIMS(out)[1],
                        CudaNdarray_HOST_DIMS(out)[2],
                        CudaNdarray_HOST_DIMS(out)[3],
                        out->devdata,
                        CudaNdarray_HOST_STRIDES(out)[0],
                        CudaNdarray_HOST_STRIDES(out)[1],
                        CudaNdarray_HOST_STRIDES(out)[2],
                        CudaNdarray_HOST_STRIDES(out)[3]);
            if (verbose)
                fprintf(stderr, "  launch params: %i %i %i\n",
                        outsize, n_blocks, n_threads);
            if (verbose)
                fprintf(stderr, "  subsample params: %i %i\n",
                        subsample_rows, subsample_cols);
        }
        conv_reference_full<<<n_blocks, n_threads>>>(
            CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
            CudaNdarray_HOST_DIMS(img)[1],
            CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
            CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
            CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
            img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
            CudaNdarray_HOST_STRIDES(img)[1],
            CudaNdarray_HOST_STRIDES(img)[2],
            CudaNdarray_HOST_STRIDES(img)[3],
            kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
            CudaNdarray_HOST_STRIDES(kern)[1],
            CudaNdarray_HOST_STRIDES(kern)[2],
            CudaNdarray_HOST_STRIDES(kern)[3],
            out->devdata, CudaNdarray_HOST_STRIDES(out)[0],
            CudaNdarray_HOST_STRIDES(out)[1],
            CudaNdarray_HOST_STRIDES(out)[2],
            CudaNdarray_HOST_STRIDES(out)[3],
            subsample_rows, subsample_cols);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose)
                fprintf(stderr, "INFO: used 'conv_reference_full' version"
                        " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
                        " nkern=%d nstack=%d subsample=%d\n",
                        img_len,img_wid, kern_len, kern_wid,
                        out_len, out_wid, nbatch, nkern, nstack, subsample);
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        n_threads, 1, n_blocks, 1, 0, n_threads);
            if (verbose)
                fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
                        " trying next implementation\n",
                        cudaGetErrorString(sts));
            PyErr_Format(PyExc_RuntimeError,
                         "ERROR: all implementations failed for"
                         " CudaNdarray_conv_full! (%s)",
                         cudaGetErrorString(sts));
            return -1;
        }
    }
    return 0;
}
// Top-level entry point for the GPU 2-D convolution.
//
// Computes the expected output shape for the requested mode, re-uses the
// caller-supplied 'out' array when its layout already matches (its refcount
// is then incremented), otherwise allocates a fresh one, and dispatches to
// CudaNdarray_conv_valid / CudaNdarray_conv_full.
//
// Returns a new reference to the output CudaNdarray, or NULL with a Python
// exception set on failure.  If 'out' is not re-used, its refcount is left
// untouched.
PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
                 CudaNdarray * out, const int mode,
                 const int subsample_rows, const int subsample_cols,
                 const int version, const int verbose,
                 const int max_threads_dim0 = 512
                 )
{
    // Both operands must be 4-D: (batch, stack, rows, cols) for img and
    // (nkern, stack, rows, cols) for kern.
    if (img->nd != 4 || kern->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
        return NULL;
    }
    // Spatial extent of the (un-subsampled) convolution result.
    int conv_rows, conv_cols;
    if (mode == ConvMode_VALID)
    {
        conv_rows = CudaNdarray_HOST_DIMS(img)[2] - CudaNdarray_HOST_DIMS(kern)[2] + 1;
        conv_cols = CudaNdarray_HOST_DIMS(img)[3] - CudaNdarray_HOST_DIMS(kern)[3] + 1;
    }
    else
    {
        conv_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
        conv_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
    }
    // Expected output shape: (nbatch, nkern, out_rows, out_cols).
    int want_dim[4];
    want_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
    want_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
    want_dim[2] = ceil_intdiv(conv_rows, subsample_rows);
    want_dim[3] = ceil_intdiv(conv_cols, subsample_cols);
    // Re-use 'out' only when it is 4-D, C-contiguous, and of exactly the
    // expected shape; otherwise allocate a new array (which may fail and
    // leave 'result' NULL).
    const bool reuse_out = (out != NULL)
        && (out->nd == 4)
        && CudaNdarray_is_c_contiguous(out)
        && (CudaNdarray_HOST_DIMS(out)[0] == want_dim[0])
        && (CudaNdarray_HOST_DIMS(out)[1] == want_dim[1])
        && (CudaNdarray_HOST_DIMS(out)[2] == want_dim[2])
        && (CudaNdarray_HOST_DIMS(out)[3] == want_dim[3]);
    CudaNdarray * result = NULL;
    if (reuse_out)
    {
        result = out;
        Py_INCREF(result);
        if (verbose)
            fprintf(stderr,
                    "INFO: Conv is reusing the 'out' argument"
                    " structure.\n");
    }
    else
    {
        if (verbose)
        {
            if (out)
                fprintf(stderr,
                        "INFO: Conv is ignoring 'out' argument with wrong"
                        " structure.\n");
            else
                fprintf(stderr,
                        "INFO: Conv don't have an 'out' argument"
                        " structure.\n");
        }
        result = (CudaNdarray*)CudaNdarray_NewDims(4, want_dim);
        // result might be NULL here; handled below.
    }
    // Dispatch.  An unrecognized mode performs no convolution and simply
    // returns the (possibly freshly-allocated) output, matching the
    // original short-circuit behavior.
    int err = 0;
    if (result == NULL)
        err = 1;
    else if (mode == ConvMode_VALID)
        err = CudaNdarray_conv_valid(img, kern, result,
                                     subsample_rows, subsample_cols,
                                     version, verbose,
                                     max_threads_dim0);
    else if (mode == ConvMode_FULL)
        err = CudaNdarray_conv_full(img, kern, result,
                                    subsample_rows, subsample_cols,
                                    version, verbose,
                                    max_threads_dim0);
    if (err)
    {
        // Drop the reference we own (a no-op when result is NULL).
        Py_XDECREF(result);
        return NULL;
    }
    return (PyObject*)result;
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
// NOTE(review): this kernel has no nstack/nkern parameters, so it
// presumably handles a single stack/kernel plane — confirm against callers.
__global__ void
conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
    // Full-mode output extent; stored in shared ints (written redundantly
    // by every thread with the same values).
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = out_row*out_wid + out_col;
    // Partition dynamic shared memory: image plane first, then kernel.
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    img+=img_len*img_wid*batch_id;//the good batch
    load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
    load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid);
    __syncthreads();
    // Each thread computes nb_split output rows, stepping by
    // out_len/nb_split (this loop variable shadows the outer out_row).
    for(int out_row=ty;out_row<out_len;out_row+=out_len/nb_split){
        float sum = 0.0f;
        int img_row = out_row;
        for (int row=0; row < kern_len; row++) {//loop over row
            // Flipped-kernel access: walk the image backwards.
            int inverse_row = (img_row-row);
            if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
            const float* idx_in=&d_img[inverse_row*img_wid];
            const float* idx_kern=&d_kern[row*kern_wid];
            int img_col = out_col;
            int col=0,last=0;
            for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
                if(last<0 ||last>=(img_wid))continue;//col outside the image
                sum+=idx_in[last]*idx_kern[col];
            }
        }
        out[batch_id*out_len*out_wid+//the output image
            out_row*out_wid+out_col] = sum;
    }
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( float* img, float* kern, float* out,
                 int img_len, int img_wid,
                 int kern_len, int kern_wid, int nkern, int nstack)
{
    // Full-mode output extent; shared ints written redundantly by all
    // threads with identical values.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = out_row*out_wid + out_col;
    // Dynamic shared memory layout: image plane, then kernel plane.
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    kern+=kern_len*kern_wid*nstack*blockIdx.y;//the good nkern
    img+=img_len*img_wid*batch_id;//the good batch
    load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
    // Last arg 'true': the kernel is flipped while loading.
    load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid, true);
    __syncthreads();
    float sum = 0.0f;
    for (int row=0; row < kern_len; row++) {//loop over row
        // Skip kernel rows that fall outside the image (zero padding).
        if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
        const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
        const float* idx_kern=&d_kern[row*kern_wid];
        // Clamp the column range to the part of the kernel row that
        // overlaps the image.
        int col=0;
        int max_col=kern_wid;
        int img_col=out_col-kern_wid+1;
        max_col=min(max_col,img_wid-img_col);
        if(img_col<0){col=-img_col;img_col+=col;}
        for (; col < max_col; col++, img_col++) {//loop over col
            sum+=idx_in[col]*idx_kern[col];
        }
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
        out_wid*out_len*blockIdx.y+//the output image
        out_row*out_wid+out_col] = sum;
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
conv_full_patch_stack( float* img, float* kern, float* out,
                       int img_len, int img_wid,
                       int kern_len, int kern_wid, int nkern, int nstack,
                       int img_stride_col, int img_stride_row,
                       int kern_stride_col, int kern_stride_row,
                       int kern_stride_stack, int kern_stride_nkern)
{
    // Full-mode output extent; shared ints written redundantly by all
    // threads with identical values.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
    // Per-block base pointers into the current kernel / batch image.
    float __shared__ *kern_, *img_;
    extern __shared__ float s_data[];
    const int batch_id = blockIdx.x;
    const int nkern_id = blockIdx.y;
    const int out_col = threadIdx.x;
    const int out_row = threadIdx.y;
    const int thread_id = threadIdx.y*blockDim.x+ threadIdx.x;
    // Dynamic shared memory layout: one image plane, then one kernel plane.
    float* d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float* d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    kern_=kern+kern_stride_nkern*nkern_id;//the good nkern
    img_=img+img_len*img_stride_row*(nstack*batch_id);//the good batch
    float sum = 0.0f;
    // Accumulate over the stack dimension, reloading shared memory for
    // each stack plane.
    for (int stack = 0;stack<nstack;stack++){
        load_to_shared(d_img, img_+stack*img_len*img_stride_row, thread_id,nb_thread_id,img_wid,img_len,img_stride_col, img_stride_row,false,img_c_contiguous_2d);
        // 'true' flips the kernel while loading into shared memory.
        load_to_shared(d_kern, kern_+stack*kern_stride_stack, thread_id,nb_thread_id,kern_wid,kern_len,kern_stride_col,kern_stride_row,true,kern_c_contiguous_2d);
        __syncthreads();
        for (int row=0; row < kern_len; row++) {//loop over row
            // Skip kernel rows falling outside the image (zero padding).
            if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
            const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
            const float* idx_kern=&d_kern[row*kern_wid];
            // Clamp column range to the overlap of kernel row and image.
            int col=0;
            int max_col=kern_wid;
            int img_col=out_col-kern_wid+1;
            max_col=min(max_col,img_wid-img_col);
            if(img_col<0){col=-img_col;img_col+=col;}
            for (; col < max_col; col++, img_col++) {//loop over col
                sum+=idx_in[col]*idx_kern[col];
            }
        }
        //Needed as not all thread finish at the same time the loop
        //And we don't want to overwrite the shared memory.
        __syncthreads();
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
        out_wid*out_len*blockIdx.y+//the output image
        out_row*out_wid+out_col] = sum;
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
* dynamic shared memory: low mem:((kern_len+nb_row-1)+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template c_contiguous: if true, the image and kernel have are c_contiguous.(use less registers)
* template split: if true, each thread compute more than 1 output pixel.
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version!
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
nb_rows = blockDim.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
int out_col = tx;//output col
const int thread_id = ty*blockDim.x + tx;
float * d_kern=&s_data[0];//size of [KERNEL_LEN * KERNEL_WID];
float * d_img=&s_data[kern_len*kern_wid];//size of [see fct doc];
kern+=kern_stride_nkern*kern_id;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
img_wid_valid=img_wid+2*kern_wid-2;
if(!split && !low_mem){
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
const int out_row = ty;//output row
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
__syncthreads();
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}else if(split && !low_mem){
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
//TODO: load only the part of the image needed or put the partial result in shared memory
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
img+img_stride_stack*stack,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
//The if is needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len)
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}else{//low_mem version
//don't need to fill the last rows padding as this is done later.
fill(d_img,img_wid_valid*((kern_len+nb_rows-1)+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
if(thread_id==0)
out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
__syncthreads();
for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
out_row+=nb_rows, out_row_iter++){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some row as when we change of out_row we lost the last load du to the stack.
const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload
load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
img+img_stride_stack*stack//the good stack image
+(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
thread_id,nb_thread_id,img_wid,
len_to_load+previous_row,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
//TODO: fill the last row padding only when needed.
//We always fill the last rows padding event when not needed.
int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
row_to_fill = min(row_to_fill,kern_len-1);
fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}
/**
 * Strided dot product of `i` elements, unrolled at compile time by
 * recursive halving (recursion depth O(log i) instead of O(i)).
 *
 * x, y : pointers to the first element of each operand.
 * sx, sy : distance (in floats) between consecutive elements of x and y.
 *
 * Bug fix: the second half must advance x by sx*(i/2), not sy*(i/2).
 * The strides differ at the call site (image stride img_len*img_wid vs
 * kernel stride kern_len*kern_wid), so using sy for x read the wrong
 * image elements.
 */
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
    // First half covers elements [0, i/2); second half starts at element
    // i/2, i.e. offset sx*(i/2) into x and sy*(i/2) into y.
    return everything_dot<i/2>(x, sx, y, sy)
        + everything_dot<(i+1)/2>(x + sx*(i/2), sx, y + sy*(i/2), sy);
}
// Base case: an empty dot product contributes nothing.
template <> __device__ float everything_dot<0>(const float * x, const int sx, const float * y, const int sy)
{
    return 0.0f;
}
// Base case: a single product term (the strides are irrelevant here).
template <> __device__ float everything_dot<1>(const float * x, const int sx, const float * y, const int sy)
{
    return y[0] * x[0];
}
/**
 * Full convolution where one thread block keeps the whole image stack AND
 * one kernel (all its stack slices) in shared memory at once.
 *
 * grid = nb_batch ; thread block = (out_wid, out_len) — one thread per
 * output pixel.  Loops over the nkern kernels inside the block.
 * dynamic shared memory: nstack*img_len*img_wid + nstack*kern_len*kern_wid
 * floats.
 *
 * template NSTACK: when > 0, the stack loop of the inner dot product is
 * unrolled at compile time via everything_dot<NSTACK>; when 0, a runtime
 * loop over nstack is used.
 *
 * Bug fix: the boundary guards used `irow > img_len` / `icol > img_wid`,
 * which let irow == img_len (resp. icol == img_wid) through and read one
 * row/column past the image in shared memory (into the next stack slice,
 * or into d_kern for the last slice).  Valid indices are [0, img_len) and
 * [0, img_wid), so the guards must use `>=`.
 */
template<int NSTACK>
__global__ void
conv_full_load_everything( float* img, float* kern, float* out,
                  int img_len, int img_wid,
                  int kern_len, int kern_wid, int nkern, int nstack,
                  int img_stride_col, int img_stride_row,
                  int img_stride_stack, int img_stride_batch,
                  int kern_stride_col, int kern_stride_row,
                  int kern_stride_stack, int kern_stride_nkern)
{
    // Every thread writes identical values into these shared scalars, so
    // no synchronization is needed before using them.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    const int out_col = threadIdx.x;//output col
    const int out_row = threadIdx.y;//output row
    const int thread_id = out_row*out_wid + out_col;
    float * d_img=&s_data[0]; //size [nstack * IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[nstack * img_len * img_wid];//size [nstack * KERNEL_LEN * KERNEL_WID];
    img += blockIdx.x * img_stride_batch;//the good batch
    // load the image to shared memory (all stack slices at once)
    for (int i = thread_id; i < nstack * img_len * img_wid; i += nb_thread_id)
    {
        int stack = i / (img_wid*img_len);
        int row = (i % (img_wid*img_len)) / img_wid;
        int col = (i % (img_wid*img_len)) % img_wid;
        d_img[i] = img[stack*img_stride_stack +row*img_stride_row +col*img_stride_col];
    }
    // The image load above is covered by the __syncthreads() below, which
    // runs before the first read of d_img.
    for (int kern_idx = 0; kern_idx < nkern; ++kern_idx, kern += kern_stride_nkern)
    {
        // load the kernel into shared memory and flip it
        for (int i = thread_id; i < nstack * kern_len * kern_wid; i += nb_thread_id)
        {
            int stack = i / (kern_wid*kern_len);
            int row = (i % (kern_wid*kern_len)) / kern_wid;
            int col = (i % (kern_wid*kern_len)) % kern_wid;
            d_kern[stack*kern_len*kern_wid + (kern_len-1-row)*kern_wid + (kern_wid-1-col)]
                = kern[stack*kern_stride_stack +row*kern_stride_row +col*kern_stride_col];
        }
        __syncthreads();
        float sum = 0.0f;
        for (int row=0; row < kern_len; ++row)
        {
            // Image row touched by kernel row `row` for this output pixel;
            // out-of-image rows contribute zero (skip them).
            int irow = out_row - kern_len+1+row;
            if (irow < 0 || irow >= img_len) continue;
            for (int col = 0; col < kern_wid; ++col)
            {
                int icol = out_col - kern_wid+1+col;
                if (icol < 0 || icol >= img_wid) continue;
                if (NSTACK > 0)
                {
                    // Compile-time unrolled dot product across the stack
                    // dimension (stride = one full slice for each operand).
                    sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
                                                  d_kern + row*kern_wid+col, kern_len*kern_wid);
                }
                else
                {
                    for (int stack = 0; stack < nstack; ++stack)
                    {
                        sum += d_img[stack*img_len*img_wid + irow*img_wid + icol] * d_kern[stack*kern_len*kern_wid+row*kern_wid+col];
                    }
                }
            }
        }
        out[batch_id*out_wid*out_len*nkern+//the good batch
            out_wid*out_len*kern_idx+//the output image
            out_row*out_wid+out_col] = sum;
        __syncthreads(); //don't start loading another kernel until we're done here
    }
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
//implement the valid convolution only
/*
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
int new_m = (pos_m+dim_ker[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int inverse_row = (new_m-j);
const %(type)s* idx_in=&in[inverse_row*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_kern[k]*idx_in[last];
}
}//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
*/
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdint.h>
/*
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
*/
#define MIN(a, b) ((a) < (b) ? (a) : (b) )
#define MAX(a, b) ((a) < (b) ? (b) : (a) )
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
/*
 * Cooperative copy of N consecutive floats from global memory (src) into
 * shared memory (dst), with the work divided round-robin among nb_thread
 * threads (the caller is responsible for any __syncthreads()).
 * flipped: when true, write elements in reversed order
 * (dst[j] = src[N-1-j]) — used to flip a kernel while loading it.
 */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
    if (nb_thread < 64)
    {
        // Few threads: plain strided copy, one element per thread per step.
        if(flipped)
            //TODO very slow on device before 1.3.
            // make access to kern sequential and access to d_kern flipped.
            for(int i=thread_id;i<N;i+=nb_thread)
                dst[i]=src[N - 1 - i];
            //dst[N-1-i]=src[i];
        else
        {
            for(int i = thread_id; i < N; i += nb_thread)
            {
                dst[i] = src[i];
            }
        }
    }
    else
    {
        nb_thread = nb_thread & 0xFFFFFFE0; //make nb_thread a multiple of 32
        // Threads whose id is >= the rounded-down count sit this copy out.
        // Global memory:
        // <-------------------------------------->
        //      A   A   A   A   A // points of 256-byte alignment
        //          dddddddddddddddddddddd // layout of src in global memory
        if (thread_id < nb_thread)
        {
            // Start from the 256-byte-aligned address at or below src so
            // every warp issues aligned (coalesced) loads; positions that
            // fall before src are read-skipped by the guard below.
            const float * my_src_ptr = (const float *)(
                    ((uintptr_t)src) & COALESCED_ALIGN);
            my_src_ptr += thread_id;
            while (my_src_ptr < src + N)
            {
                if (my_src_ptr >= src)
                {
                    int i = my_src_ptr - src;
                    if (flipped)
                    {
                        dst[N - 1 - i] = *my_src_ptr;
                    }
                    else
                    {
                        dst[i] = *my_src_ptr;
                    }
                }
                my_src_ptr += nb_thread;
            }
        }
    }
}
/*
* We load from global memory to shared memory. The outer if is optimized away at compilation.
*/
/*
 * 2-d strided variant: cooperatively copy an nb_row x nb_col patch from
 * global memory (src, with the given element strides) to shared memory
 * (dst, dense), optionally flipped.  At every call site c_contiguous is a
 * compile-time constant, so the outer branch is optimized away.
 */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id,
                               int nb_thread, const int nb_col, const int nb_row,
                               const int stride_col, const int stride_row,
                               const bool flipped=false, const bool c_contiguous=true){
    if (c_contiguous)
    {
        // Dense source: defer to the 1-d copy (coalesced fast path).
        load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
        return;
    }
    const int total = nb_row * nb_col;
    if (flipped)
    {
        // XXX slow: per-element division/modulo to recover (row, col).
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            // Write in fully reversed order to flip the patch.
            dst[total - 1 - idx] = src[r * stride_row + c * stride_col];
        }
    }
    else
    {
        // XXX slow: per-element division/modulo to recover (row, col).
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            dst[idx] = src[r * stride_row + c * stride_col];
        }
    }
}
// Set N consecutive floats starting at dst to `value`; the work is split
// round-robin among nb_thread cooperating threads (this thread handles
// indices thread_id, thread_id + nb_thread, ...).
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N)
    {
        dst[idx] = value;
        idx += nb_thread;
    }
}
/*
* We load from global memory to shared memory. The outer if is optimized away at compilation.
 * We put the image at the center of another one. Useful to pad an image with 0.
*/
/*
 * Cooperatively copy an nb_row x nb_col patch from global memory into
 * shared memory, centering each row inside a destination row that is
 * wid_pad elements wider on both sides.  Only the payload columns are
 * written here — zeroing the padding is the caller's job (see fill()).
 * c_contiguous is a compile-time constant at the call sites, so the outer
 * branch costs nothing.
 */
__device__ void load_padded_col_to_shared(float * dst, const float * src,
                                          const int thread_id, const int nb_thread,
                                          const int nb_col, const int nb_row,
                                          const int stride_col, const int stride_row,
                                          const int wid_pad, const bool c_contiguous=true){
    const int dst_wid = nb_col + 2 * wid_pad;
    const int total = nb_row * nb_col;
    if (c_contiguous)
    {
        // Dense source: linear reads, padded writes.
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            dst[r * dst_wid + wid_pad + c] = src[idx];
        }
    }
    else
    {
        // Strided source: compute the (row, col) address explicitly.
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            dst[r * dst_wid + wid_pad + c] = src[r * stride_row + c * stride_col];
        }
    }
}
// Compile-time-unrolled dot product of `i` contiguous floats from `data`
// and `kern` (unit stride).  Recursive halving keeps the template
// recursion depth at O(log i); both halves advance by i/2 elements.
template<int i> __device__ float convolutionRowNoFlip(const float *data,
                                                      const float *kern){
    return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
    //return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
}
// Base case: a single product term.
template<> __device__ float convolutionRowNoFlip<1>(const float *data,
                                                    const float *kern){
    return kern[0] * data[0];
}
// Base case: an empty sum.
template<> __device__ float convolutionRowNoFlip<0>(const float *data,
                                                    const float *kern){
    return 0.0f;
}
/*
 * Accumulate one convolution row into `sum`.
 * template KERN_WIDTH: when > 0, the row width is known at compile time
 * and the fully unrolled template version above is used (kern_wid is then
 * ignored); when 0, a runtime loop handles any width, partially unrolled
 * by the compiler.
 */
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum,
                                     const float *data,
                                     const float *kern, const int kern_wid){
    if(KERN_WIDTH>0)
        sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
    else
#pragma unroll 8
        for (int col=0; col < kern_wid; col++) {//loop over col
            sum+=data[col]*kern[col];
        }
}
// Compile-time switch between `dst += value` (accumulate into an existing
// output) and `dst = value` (overwrite it).
template<bool accumulate>
__device__ void store_or_accumulate(float& dst, const float value){
    if (accumulate)
    {
        dst += value;
    }
    else
    {
        dst = value;
    }
}
/**
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* Don't implement the stack.
* each thread compute only one value for the output if split is false
* thread block size=out_wid, out_len(or less then out_len if split is true)
* grid block size=batch_id, nkern
* dynamic shared memory: img_len*img_wid+kern_len*kern_wid
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template split: if true, each thread computes more than 1 output pixel
 *                 When true, allows output images bigger than 512 pixels.
* Use more registers.
*/
/*
 * Valid convolution with the whole image and the whole kernel cached in
 * shared memory.  grid = (batch, kern); one thread per output pixel, or
 * several rows per thread when split==true (allows out_len > blockDim.y).
 */
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
conv_patch( float* img, float* kern, float* out,
            int img_len, int img_wid, int kern_len, int kern_wid,
            int nkern, int nstack)
{
    // All threads write identical values to these shared scalars, so no
    // synchronization is needed before using them.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    __shared__ int batch_id, kern_id;
    batch_id = blockIdx.x;
    kern_id = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    const int thread_id = ty*blockDim.x + tx;
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    // Offsets computed from logical sizes: this kernel assumes fully
    // C-contiguous img/kern.  Only one stack slice (img_len*img_wid
    // floats) is loaded below — the stack is not implemented here;
    // NOTE(review): presumably callers only use it with nstack==1 — confirm.
    kern+=kern_len*kern_wid*nstack*kern_id;
    img+=img_len*img_wid*(nstack*batch_id);
    load_to_shared(d_img, img, thread_id,nb_thread_id,img_len*img_wid);
    // flipped_kern: reverse the kernel while loading (real convolution).
    load_to_shared(d_kern, kern, thread_id,nb_thread_id,kern_len*kern_wid,flipped_kern);
    __syncthreads();
    if(!split){
        int out_row = ty;//output row
        float sum = 0.0f;
        for (int row=0; row < kern_len; row++) {//loop over row
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        out[batch_id*out_wid*out_len*nkern+//the good batch
            blockIdx.y*out_wid*out_len+//the output image
            out_row*out_wid+out_col] = sum;
    }else{
        // Each thread sweeps several output rows, blockDim.y apart.
        for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
            float sum = 0.0f;
            for (int row=0; row < kern_len; row++) {//loop over row
                const float* idx_kern=&d_kern[row*kern_wid];
                const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
            out[batch_id*out_wid*out_len*nkern+//the good batch
                kern_id*out_wid*out_len+//the output image
                out_row*out_wid+out_col] = sum;
        }
    }
}
/**
* As conv_patch, but implement the stack in the kernel.
 * I keep it separated from conv_patch as it uses more registers and this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split==false else it compute more than 1 values
* thread block size=out_wid, out_len/X (X is any number, optimized value is ceil(out_len/N)
* grid block size=batch_id, nkern
* dynamic shared memory: img_len*img_wid+(preload_full_kern?KERNEL_LEN:1)*kern_wid
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* dx: patch stride rows(1 for normal convolution)
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
* template img_c_contiguous_2d: if true, the img have are collon and row contiguous
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more than 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/
/*
 * Valid convolution with the image cached in shared memory and a loop over
 * the stack (channel) dimension.  grid = (batch, kern); each thread
 * produces one output pixel (several rows per thread when split==true).
 * The kernel is either fully preloaded (preload_full_kern) or streamed one
 * row at a time.  dx/dy are the row/col subsample strides (used only when
 * the `subsample` template flag is true).
 */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void
conv_patch_stack( float* img, float* kern, float* out,
                  int img_len, int img_wid, int kern_len, int kern_wid,
                  int out_len, int out_wid,
                  int nkern, int nstack, int img_stride_col,int img_stride_row,
                  int img_stride_stack, int img_stride_batch,
                  int kern_stride_col, int kern_stride_row,
                  int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{
    // All threads write the same value; no sync needed before use.
    int __shared__ nb_thread_id;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    int kern_id = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = out_row*out_wid + out_col;
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
    if(!split){
        kern+=kern_stride_nkern*kern_id;//the good nkern
        img+=img_stride_batch*batch_id;//the good batch
        float sum = 0.0f;
        for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
                                                img+=img_stride_stack){
            load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
                           img_stride_col, img_stride_row, false, img_c_contiguous_2d);
            if(preload_full_kern)
                load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
                               kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
            __syncthreads();
            for (int row=0; row < kern_len; row++) {//loop over row
                if(!preload_full_kern){
                    // Stream one kernel row at a time into the single-row
                    // buffer; the surrounding syncs keep loads and reads
                    // from overlapping across threads.
                    __syncthreads();
                    int idx2;
                    if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
                    else idx2=(row)*kern_stride_row;
                    load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
                                   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                    __syncthreads();
                }
                const float* idx_kern;
                if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
                else idx_kern=d_kern;
                const float* idx_in;
                if(subsample)
                    idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
                else
                    idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
            __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
        }
        store_or_accumulate<accumulate>(
                out[batch_id*out_wid*out_len*nkern+//the good batch
                    out_wid*out_len*kern_id+//the output image
                    out_row*out_wid+out_col],sum);
    }else{
        float __shared__ *kern_, *img_;
        int __shared__ out_len_max;
        kern_=kern+kern_stride_nkern*kern_id;//the good nkern
        img_=img+img_stride_batch*batch_id;//the good batch
        //out_len_max must be higher than out_len as we need all threads when we load the image, as blockDim.y is not always a multiple of out_len.
        out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
        //TODO: inverse the out_row and stack loop to not load the data as frequently!
        //TODO: does this happen elsewhere?
        for(;out_row<out_len_max;out_row+=blockDim.y){
            float sum = 0.0f;
            for (int stack = 0;stack<nstack;stack++){
                //TODO: load only the part of the image needed or put the partial result in shared memory
                int idx1=img_stride_stack*stack;
                load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
                               img_stride_col, img_stride_row, false, img_c_contiguous_2d);
                if(preload_full_kern){
                    int idx2=kern_stride_stack*stack;
                    load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
                                   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                }
                __syncthreads();
                for (int row=0; row < kern_len; row++) {//loop over row
                    if(!preload_full_kern){
                        __syncthreads();
                        int idx2=kern_stride_stack*stack;
                        if(flipped_kern)
                            idx2+=(kern_len-row-1)*kern_stride_row;
                        else
                            idx2+=(row)*kern_stride_row;
                        load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
                                       kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                        __syncthreads();
                    }
                    const float* idx_kern;
                    if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
                    else idx_kern=d_kern;
                    const float* idx_in;
                    if(subsample)
                        idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
                    else
                        idx_in=&d_img[(row+out_row)*img_wid+out_col];
                    //if needed as on Fermi reading an out-of-bound index from shared memory generates an error.
                    //Not needed on earlier generations as they worked anyway. Removing the if generates the good code
                    //as we store the result of only the good thread.
                    //This was with nvcc 3.0 on a GTX470 card.
                    if(out_row<out_len)
                        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
                }
                __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
            }
            if(out_row<out_len)
                store_or_accumulate<accumulate>(
                        out[batch_id*out_wid*out_len*nkern+//the good batch
                            out_wid*out_len*kern_id+//the output image
                            out_row*out_wid+out_col],sum);
        }
    }
}
/**
* As conv_patch_stack, but kern_len thread for each output pixel
 * I keep it separated as it uses more registers.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* thread block size=out_wid, out_len, ceil_intdiv(kern_len/nb_split)
* grid block size=batch_id, nkern
* dynamic shared memory: img_len*img_wid+kern_wid*(preload_full_kern?kern_len:thread_z)+out_size*thread_z
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template img_contiguous: if true, the img have are collon and row contiguous
* template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
*/
/*
 * Valid convolution where the kernel-row loop is also parallelized over
 * threadIdx.z; each z-slice accumulates a partial sum which is reduced in
 * shared memory at the end.  grid = (batch, kern);
 * blockDim = (out_wid, out_len, ceil(kern_len/nb_split)).
 */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out,
                         int img_len, int img_wid, int kern_len, int kern_wid,
                         int nkern, int nstack, int img_stride_col,int img_stride_row,
                         int img_stride_stack, int img_stride_batch,
                         int kern_stride_col, int kern_stride_row,
                         int kern_stride_stack, int kern_stride_nkern)
{
    //int __shared__ out_len, out_wid, nb_thread_id;
    //out_len = img_len - kern_len + 1;
    //out_wid = img_wid - kern_wid + 1;
    // Output size is implied by the launch configuration.
    const int out_wid = blockDim.x;
    const int out_len = blockDim.y;
    const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int tz = threadIdx.z;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
    //d_img size [IMAGE_LEN * IMAGE_WID];
    float * d_img=&s_data[0];
    //d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
    float * d_kern=&s_data[img_len * img_wid];
    //d_reduce size [n_threads]
    //N.B. this overlaps with d_img and d_kern!  Safe only because the
    //reduction happens after the last __syncthreads() of the stack loop.
    float * d_reduce=&s_data[0];
    float sum = 0.0f;
    kern+=kern_stride_nkern*blockIdx.y;//the good nkern
    img+=img_stride_batch*batch_id;//the good batch
    for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
                                            img+=img_stride_stack){
        __syncthreads();
        load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
                       img_stride_col, img_stride_row, false, c_contiguous);
        if(split && ! preload_full_kern){
            // Stream the kernel blockDim.z rows at a time.
            for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
                //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
                // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
                // details, but that seemed really weird. tricky bug to find too.
                int idx3 = flipped_kern
                    ? max((kern_len - (int)blockDim.z - first_row),0)
                    : first_row;
                int len3 = min(blockDim.z, kern_len - first_row);
                __syncthreads();
                load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
                               kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
                __syncthreads();
                const float* idx_kern=&d_kern[tz*kern_wid];
                const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
                float sum2 = 0;
                // Guard: only len3 kernel rows are valid in this chunk.
                if(tz<len3)
                    convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
                sum+=sum2;
            }
        }else if(split){
            // Full kernel preloaded; each z-slice handles rows tz, tz+blockDim.z, ...
            load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
                           kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
            __syncthreads();
            for(int row=tz;row<kern_len;row+=blockDim.z){
                const float* idx_kern=&d_kern[row*kern_wid];
                const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
        }else{
            // blockDim.z == kern_len: exactly one kernel row per z-slice.
            int row = tz;//The row of the kernel.
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
            load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
                           kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
            __syncthreads();
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
    }
    //reduce no sync because previous loop ends with sync
    d_reduce[thread_id]=sum;
    __syncthreads();
    if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
        //sum=0;
        // Fold in the partial sums of the other z-slices.
        for(int i=1;i<blockDim.z;i++){
            sum+=d_reduce[thread_id+i*out_wid*out_len];
        }
        out[batch_id*out_wid*out_len*nkern+//the good batch
            out_wid*out_len*blockIdx.y+//the output image
            out_row*out_wid+out_col] = sum;
    }
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory
* each thread compute only one value for the output
* Don't implement the stack and nkern in the kernel.
* thread block size=out_wid
* grid block size=out_len,batch_id
* dynamic shared memory: kern_len*img_wid+kern_len*kern_wid
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
/*
 * Valid convolution for images that do not fit in shared memory: each
 * thread block caches only the kern_len image rows it needs plus the
 * (pre-flipped) kernel.  Stack and nkern are not looped over here.
 * thread block = out_wid ; grid = (out_len, nb_batch*nb_kern)
 * dynamic shared memory: kern_len*img_wid + kern_len*kern_wid floats
 */
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows( float* img, float* kern, float* out,
           int img_len, int img_wid, int kern_len, int kern_wid,
           int nkern, int nstack,
           int img_stride_col, int img_stride_row,
           int img_stride_stack, int img_stride_batch,
           int kern_stride_col, int kern_stride_row,
           int kern_stride_stack, int kern_stride_nkern)
{
    int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
    float __shared__ *d_img, *d_kern;
    extern __shared__ float s_data[];

    // Every thread writes identical values, so no sync is required here.
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    // blockIdx.y packs both the batch index and the kernel index.
    batch_id = blockIdx.y / nkern;
    kern_id  = blockIdx.y % nkern;

    const int out_col   = threadIdx.x;  // output column
    const int out_row   = blockIdx.x;   // output row
    const int thread_id = threadIdx.x;

    d_img  = &s_data[0];                  // [kern_len * img_wid]
    d_kern = &s_data[kern_len * img_wid]; // [kern_len * kern_wid]

    // Point at the kern_len image rows this block consumes, and at the
    // requested kernel.
    const float *img_base  = img + img_stride_batch*batch_id
                                 + out_row*img_stride_row;
    const float *kern_base = kern + kern_stride_nkern*kern_id;

    load_to_shared(d_img, img_base, thread_id, nb_thread_id, img_wid, kern_len,
                   img_stride_col, img_stride_row, false, c_contiguous);
    // flipped=true: the kernel is reversed while it is loaded.
    load_to_shared(d_kern, kern_base, thread_id, nb_thread_id, kern_wid, kern_len,
                   kern_stride_col, kern_stride_row, true, c_contiguous);
    __syncthreads();

    float acc = 0.0f;
    for (int r = 0; r < kern_len; r++) { // loop over kernel rows
        convolutionRowNoFlip<KERN_WIDTH>(acc,
                                         &d_img[r*img_wid + out_col],
                                         &d_kern[r*kern_wid],
                                         kern_wid);
    }
    out[(batch_id*nkern + kern_id)*out_wid*out_len  // the good output image
        + out_row*out_wid + out_col] = acc;
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register.
* we store kern_len row of the image and the full kernel in the shared memory
* each thread compute only one value for the output
* thread block size=out_wid, block_len
* grid block size=intceil(out_len/block_len),nb_batch*nb_kern
* dynamic shared memory: (kern_len+block_len-1)*img_wid+kern_len*kern_wid
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
/*
 * Valid convolution, with the stack loop, for images that do not fit in
 * shared memory: only kern_len + blockDim.y - 1 image rows are cached per
 * channel iteration.  thread block = (out_wid, block_len);
 * grid = (ceil(out_len/block_len), nb_batch*nb_kern).
 */
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows_stack( float* img, float* kern, float* out,
                 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                 const int nkern, const int nstack,
                 const int img_stride_col, const int img_stride_row,
                 const int img_stride_stack, const int img_stride_batch,
                 const int kern_stride_col, const int kern_stride_row,
                 const int kern_stride_stack, const int kern_stride_nkern)
{
    // Every thread writes identical values, so no sync is required here.
    int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
    float __shared__ *d_img, *d_kern;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    batch_id= blockIdx.y/nkern;
    kern_id = blockIdx.y%nkern;
    nb_rows = blockDim.y;
    // Number of image rows this block needs, clipped at the image bottom.
    int rows_to_read = MIN(
            kern_len + nb_rows - 1,
            img_len - blockIdx.x * nb_rows);
    /**
     * Every thread ultimately computes one value in the output, at coordinates
     * out[ batch_id, kern_id, out_row, out_col]
     *
     * The batch_id and kern_id are packed into blockIdx.y. out_row and out_col
     * are the threadIdx.x and threadIdx.y.
     *
     * Every thread block deals only with one image, and one filter kernel.
     */
    extern __shared__ float s_data[];
    const int out_col = threadIdx.x;//output col
    const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
    const int shared_row = threadIdx.y;
    const int thread_id  = threadIdx.y*blockDim.x+threadIdx.x;
    /*
     * The kernel works by looping over channels (aka colours, aka the stack).
     * On each iteration, a thread block loads one channel of all the image rows that
     * it needs to use, and one channel slice of one kernel.
     */
    d_img=&s_data[0];//size of [(KERN_LEN+block_len-1) * IMAGE_WID];
    d_kern=&s_data[(kern_len+nb_rows-1) * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    float sum = 0.0f;
    for (int stack = 0; stack < nstack; stack++){
        int offset =
            img_stride_batch * batch_id
            + img_stride_stack * stack
            //blockIdx.x is which chunk of nb_rows this thread block deals with
            + img_stride_row * (blockIdx.x * nb_rows);
        load_to_shared(
                d_img,        // dst
                img+offset,   // src
                thread_id,    // linear position in block
                nb_thread_id, // number of threads
                img_wid,      // cols in image to read
                rows_to_read, // number of rows to read
                img_stride_col, // img[i, j, k, l] to img[i, j, k, l + 1]
                img_stride_row, // img[i, j, k, l] to img[i, j, k + 1, l]
                false,          // flip while reading
                c_contiguous);
        offset = kern_stride_nkern * kern_id + kern_stride_stack * stack;
        // flipped=true: reverse the kernel while loading it.
        load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len,
                       kern_stride_col, kern_stride_row, true, c_contiguous);
        __syncthreads();
        for (int row=0; row < kern_len; row++) {//loop over row
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        __syncthreads();//to be sure all threads have finished before we modify the shared memory.
    }
    // Threads past the last valid output row did redundant work; only the
    // in-range ones store their result.
    if (out_row < out_len)
        out[batch_id*out_wid*out_len*nkern+//the good batch
            kern_id*out_wid*out_len+//the output image
            out_row*out_wid+out_col] = sum;
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
* we store block_len row of the image(at a time) and one or all kernel row in the shared memory
* each thread compute only one value for the output
* thread block size=out_wid, block_len
* grid block size=intceil(out_len/block_len),nb_batch*nb_kern
* dynamic shared memory: block_len * img_wid+(preload_full_kern?kern_len:1)*kern_wid
* Diff with conv_patch: don't store the full image and kernel in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
conv_rows_stack2( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
// NOTE(review): these block-uniform scalars live in __shared__ memory and are
// written by every thread without synchronization. All threads store the same
// value, so the race is benign in practice, but it is an unconventional pattern.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
out_len = img_len - kern_len + 1;//'valid' mode: number of output rows
out_wid = img_wid - kern_wid + 1;//'valid' mode: number of output cols
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
batch_id= blockIdx.y/nkern;//blockIdx.y encodes both batch and kernel index
kern_id = blockIdx.y%nkern;
nb_rows = blockDim.y;//== block_len: number of image rows resident in shared memory
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
const int shared_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
// Partition the dynamic shared buffer: image rows first, then kernel row(s).
d_img=&s_data[0];//size of [nb_rows * IMAGE_WID];
d_kern=&s_data[nb_rows*img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
int _idx2=img_stride_batch*batch_id+img_stride_stack*stack;//select the right image from the batch and stack
_idx2+=(blockIdx.x*nb_rows)*img_stride_row;//select the right top row for the block of threads
__syncthreads();
// Prime the circular buffer with the first nb_rows-1 image rows; the one
// extra row each iteration needs is streamed in inside the loop below.
load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
img_stride_col, img_stride_row, false, c_contiguous);
if(preload_full_kern)
load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
__syncthreads();
if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//select the right image from the batch and stack
_idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the right top row for the block of threads
_idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
img+_idx1, thread_id, nb_thread_id, img_wid, 1,
img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
}
if(!preload_full_kern){
// Only one kernel row resides in shared memory: load it row-by-row,
// in flipped order (kern_len-row-1) as required by the convolution.
int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//select the right kernel from the batch and stack
_idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
load_to_shared(d_kern, kern+_idx3,
thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, true, c_contiguous);
}
__syncthreads();
//Guard needed on Fermi, where reading an out-of-bounds index from shared
//memory generates an error; earlier GPU generations worked without it.
//Removing the if still generates correct results, as only the valid
//thread's result is stored. (Observed with nvcc 3.0 on a GTX470 card.)
if(out_row<out_len){
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precision.
}
}
}
__syncthreads();
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the right batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
* template stack_loop: if true, we accept that blockDim.x < nstack and add a loop for it (uses 3 more registers, so lower occupancy when true, but accepts nstack*kern_len>512)
* TODO: explain parameters, preconditions
*/
template<bool stack_loop>
__global__ void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
// One block per output element (grid-stride over outputs). Threads of a
// block split the (stack, kernel-row) partial sums between them; the
// partials are then combined with a shared-memory tree reduction.
const int outsize = nB * nK * out_len * out_wid;
extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
int ss = threadIdx.x; // stack (channel) index handled by this thread
int rr = threadIdx.y; // kernel row handled by this thread
int img_rr = iR_logical + kern_len - 1 - rr; // image row (kernel flipped)
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
float sum = 0.0f;
if(stack_loop){
// blockDim.x may be smaller than stacklen: loop over the extra stacks.
for (; ss < stacklen; ss+=blockDim.x){
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C; // image walked backwards: the kernel is flipped
}
}
}else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
if (blockDim.x * blockDim.y == 1)
{
// Single-thread block: the partial sum is already the full sum.
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
else
{
reducebuf[reduceIdx] = sum;
__syncthreads();
int reduce_boundary = initial_reduce_boundary;
// add in the terms above the reduce boundary
// (presumably initial_reduce_boundary is the largest power of two below
// the thread count, so this first fold handles the non-power-of-two
// excess -- TODO confirm against the launch-side code)
if (reduceIdx + reduce_boundary < (blockDim.x * blockDim.y))
reducebuf[reduceIdx] += reducebuf[reduce_boundary +reduceIdx];
reduce_boundary >>= 1;
// there are an equal number of terms above and below the reduce_boundary
while (reduce_boundary)
{
__syncthreads();
if (reduceIdx < reduce_boundary)
{
reducebuf[reduceIdx] += reducebuf[reduce_boundary + reduceIdx];
}
reduce_boundary >>= 1;
}
if (reduceIdx == 0)
{
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = reducebuf[0];
}
}
}
}
/**
* Reference implementation of 'valid' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
/**
 * Reference implementation of 'valid' mode convolution (with stack).
 *
 * Works for any size of image and kernel; uses no shared memory for the data.
 * Each thread grid-strides over the flattened physical output and computes
 * one element as a triple loop over (stack, kernel row, kernel col); the
 * kernel is read flipped in both spatial dimensions (true convolution).
 *
 * Fix vs. previous version: numThreads and outsize were kept in __shared__
 * memory and written by every thread with no synchronization. The values are
 * block-uniform, so the race was benign in practice, but per-thread constants
 * are both well-defined and cheaper than shared memory.
 */
__global__ void
conv_reference_valid(int nB, int nK, int stacklen,
                     int img_len, int img_wid,
                     int kern_len, int kern_wid,
                     int out_len, int out_wid, //physical
                     float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
                     float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
                     float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
                     int subsample_rows, int subsample_cols)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    const int outsize = nB * nK * out_len * out_wid;
    for (int i = idx; i < outsize; i += numThreads) //physical
    {
        // Decompose the linear output index into (batch, kernel, row, col).
        int ii = i;
        const int iB = ii % nB; // output batch index
        ii = ii / nB;
        const int iK = ii % nK; // output kernel index
        ii = ii / nK;
        const int iR_physical = ii % out_len; // output row (post-subsample)
        const int iC_physical = ii / out_len; // output column (post-subsample)
        const int iR_logical = iR_physical * subsample_rows;
        const int iC_logical = iC_physical * subsample_cols;
        float sum = 0.0f;
        for (int ss = 0; ss < stacklen; ++ss)
        {
            for (int rr = 0; rr < kern_len; ++rr)
            {
                const int img_rr = iR_logical + kern_len - 1 - rr; // flipped row
                for (int cc = 0; cc < kern_wid; ++cc)
                {
                    const int img_cc = iC_logical + kern_wid - 1 - cc; // flipped col
                    const float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
                    const float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
                    sum += k_0 * i_0;
                }
            }
        }
        out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
    }
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
/**
 * Reference implementation of 'full' mode convolution (with stack).
 *
 * Works for any size of image and kernel; uses no shared memory for the data.
 * Each thread grid-strides over the flattened physical output and computes
 * one element; image coordinates falling outside the image are skipped
 * (implicit zero padding of 'full' mode).
 *
 * Fix vs. previous version: numThreads and physical_outsize were kept in
 * __shared__ memory and written by every thread with no synchronization. The
 * values are block-uniform, so the race was benign in practice, but
 * per-thread constants are both well-defined and cheaper than shared memory.
 */
__global__ void
conv_reference_full(int nB, int nK, int stacklen,
                    int img_len, int img_wid,
                    int kern_len, int kern_wid,
                    int out_len, int out_wid, //physical dimensions
                    float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
                    float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
                    float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
                    int subsample_rows, int subsample_cols)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    const int physical_outsize = nB * nK * out_len * out_wid;
    for (int i = idx; i < physical_outsize; i += numThreads)
    {
        // Decompose the linear output index into (batch, kernel, row, col).
        int ii = i;
        const int iB = ii % nB; // output batch index
        ii = ii / nB;
        const int iK = ii % nK; // output kernel index
        ii = ii / nK;
        const int iR_physical = ii % out_len; // output row (post-subsample)
        const int iC_physical = ii / out_len; // output column (post-subsample)
        const int iR_logical = iR_physical * subsample_rows;
        const int iC_logical = iC_physical * subsample_cols;
        float sum = 0.0f;
        for (int ss = 0; ss < stacklen; ++ss)
        {
            for (int rr = 0; rr < kern_len; ++rr)
            {
                const int img_rr = iR_logical - rr;
                if ((img_rr >= 0) && (img_rr < img_len))
                {
                    for (int cc = 0; cc < kern_wid; ++cc)
                    {
                        const int img_cc = iC_logical - cc;
                        if ((img_cc >= 0) && (img_cc < img_wid))
                        {
                            const float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
                            const float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
                            sum += k_0 * i_0;
                        }
                    }
                }
            }
        }
        out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
    }
}
#endif // #ifndef CONV_KERNEL_CU
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef _GLIBCXX_ATOMIC_BUILTINS
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/caffe_common.hpp)
// CUDA: grid stride looping
// Grid-stride loop: each thread starts at its global linear index and strides
// by the total number of launched threads, so any n is covered by any
// grid/block configuration.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
// or fall back to attempt compatibility (best of luck to you).
#if __CUDA_ARCH__ >= 200
const int CUDA_NUM_THREADS = 1024;
#else
const int CUDA_NUM_THREADS = 512;
#endif
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N) {
    // Ceiling division: enough CUDA_NUM_THREADS-sized blocks to cover N items.
    const int rounded_up = N + CUDA_NUM_THREADS - 1;
    return rounded_up / CUDA_NUM_THREADS;
}
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
__global__ void dilated_im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col) {
// Each index identifies one (channel, output position) pair; the thread
// copies the whole kernel_h x kernel_w x kernel_d dilated patch for that
// position into the matching entries of data_col. Patch positions that fall
// in the implicit zero padding are written as 0.
CUDA_KERNEL_LOOP(index, n) {
// Decompose the linear index into (c_im, h_col, w_col, d_col).
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
// First output channel produced from input channel c_im.
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
// Top-left-front corner of the patch in (possibly padded) image coords.
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i * dilation_h;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j * dilation_w;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k * dilation_d;
// Zero-fill when the dilated tap is outside the image.
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * dilation_h * (width * depth) +
j * dilation_w * depth +
k * dilation_d] : 0;
// Advance one output channel (one tap of the unrolled patch).
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
// Same unfolding as dilated_im3d2col_kernel, specialized for dilation == 1
// (saves the dilation multiplications in the inner loops).
__global__ void im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
// Decompose the linear index into (c_im, h_col, w_col, d_col).
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
// First output channel produced from input channel c_im.
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
// Top-left-front corner of the patch in (possibly padded) image coords.
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k;
// Zero-fill when the tap is outside the image (padding region).
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * (width * depth) + j * depth + k] : 0;
// Advance one output channel (one tap of the unrolled patch).
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
/**
 * Unfold a (channels, height, width, depth) image into the 2D "columns"
 * buffer data_col so that a 3D convolution becomes a single GEMM.
 * One CUDA thread is launched per (channel, output position) pair; each
 * copies one kernel-sized (possibly dilated) patch.
 */
void im3d2col(const float* data_im, const int channels,
              const int height, const int width, const int depth,
              const int kernel_h, const int kernel_w, const int kernel_d,
              const int dilation_h, const int dilation_w, const int dilation_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_col)
{
    // Effective (dilated) kernel extents.
    const int span_h = (kernel_h - 1) * dilation_h + 1;
    const int span_w = (kernel_w - 1) * dilation_w + 1;
    const int span_d = (kernel_d - 1) * dilation_d + 1;
    // Output grid dimensions after padding and striding.
    const int height_col = (height + 2 * pad_h - span_h) / stride_h + 1;
    const int width_col = (width + 2 * pad_w - span_w) / stride_w + 1;
    const int depth_col = (depth + 2 * pad_d - span_d) / stride_d + 1;
    const int num_kernels = channels * height_col * width_col * depth_col;
    const bool has_dilation =
        (dilation_h != 1) || (dilation_w != 1) || (dilation_d != 1);
    if (has_dilation) {
        dilated_im3d2col_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_im,
            height, width, depth,
            kernel_h, kernel_w, kernel_d,
            dilation_h, dilation_w, dilation_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_col);
    } else {
        // No dilation: use the cheaper specialized kernel.
        im3d2col_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_im,
            height, width, depth,
            kernel_h, kernel_w, kernel_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_col);
    }
}
// CUDA kernel for the case of dilation
__global__ void dilated_col2im3d_kernel(
const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
// Inverse of dilated_im3d2col_kernel: one thread per image element, each
// accumulating every data_col entry that was unfolded from that element.
// No atomics are needed because each thread owns exactly one output.
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
// Decompose the linear index into (c_im, h_im, w_im, d_im), in padded coords.
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
int kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
// compute the start and end of the output
// (range of output positions whose dilated patch can cover this element)
const int d_col_start = (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// Offset of this element inside the candidate patch.
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
int d_k = (d_im - d_col * stride_d);
// Only taps landing exactly on the dilation grid contribute.
if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
d_k /= dilation_d;
int data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
w_k * kernel_d * height_col * width_col * depth_col +
d_k * height_col * width_col * depth_col +
h_col * width_col * depth_col +
w_col * depth_col +
d_col;
val += data_col[data_col_index];
}
}
}
}
data_im[index] = val;
}
}
// Inverse of im3d2col_kernel (dilation == 1): one thread per image element,
// each accumulating every data_col entry that was unfolded from that element.
__global__ void col2im3d_kernel(const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
// Decompose the linear index into (c_im, h_im, w_im, d_im), in padded coords.
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
// compute the start and end of the output
// (range of output positions whose patch can cover this element)
const int d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// Closed-form data_col indexing: `offset` is the index at output position
// (0,0,0) with the kernel-tap offsets implied by (h_im, w_im, d_im); each
// coeff_* then advances one output step while the in-patch tap moves back
// by the corresponding stride, so the inner loop is a pure add.
int offset =
(c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
w_im * kernel_d + d_im) * height_col * width_col * depth_col;
int coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
}
data_im[index] = val;
}
}
/**
 * Fold the 2D "columns" buffer data_col back into a
 * (channels, height, width, depth) image, summing overlapping patches.
 * One CUDA thread per image element; each sums all of its contributions, so
 * no atomic operations are required.
 */
void col2im3d(const float* data_col, const int channels,
              const int height, const int width, const int depth,
              const int patch_h, const int patch_w, const int patch_d,
              const int dilation_h, const int dilation_w, const int dilation_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_im)
{
    // Effective (dilated) patch extents.
    const int span_h = (patch_h - 1) * dilation_h + 1;
    const int span_w = (patch_w - 1) * dilation_w + 1;
    const int span_d = (patch_d - 1) * dilation_d + 1;
    // Output grid dimensions after padding and striding.
    const int height_col = (height + 2 * pad_h - span_h) / stride_h + 1;
    const int width_col = (width + 2 * pad_w - span_w) / stride_w + 1;
    const int depth_col = (depth + 2 * pad_d - span_d) / stride_d + 1;
    // One thread per *image* element (bottom dimension), summing over the
    // top dimensions inside the kernel — avoids atomics entirely.
    const int num_kernels = channels * height * width * depth;
    const bool has_dilation =
        (dilation_h != 1) || (dilation_w != 1) || (dilation_d != 1);
    if (has_dilation) {
        dilated_col2im3d_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_col,
            height, width, depth, channels,
            patch_h, patch_w, patch_d,
            dilation_h, dilation_w, dilation_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_im);
    } else {
        // No dilation: use the cheaper specialized kernel.
        col2im3d_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_col,
            height, width, depth, channels,
            patch_h, patch_w, patch_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_im);
    }
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter, Nicolas Ballas
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
/**
 * 3D correlation implemented Caffe-style: im2col/col2im plus cuBLAS GEMM.
 *
 * direction == 0: forward pass — writes `top` from `bottom` and `weight`
 * direction == 1: gradient wrt. weights — writes `weight` from `bottom`/`top`
 * direction == 2: gradient wrt. inputs — writes `bottom` from `weight`/`top`
 *
 * Shapes (all operands must be 5D and C-contiguous):
 *   bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
 *   weight: (nFilters, nChannels, kH, kW, kD)
 *   top:    (batchSize, nFilters, topHeight, topWidth, topDepth)
 *
 * dH/dW/dD are the subsampling strides, dilH/dilW/dilD the filter dilation,
 * padH/padW/padD the implicit zero padding.
 *
 * Returns the array that was written to (an alias of bottom, weight or top;
 * its refcount is NOT changed here — see the note at the end), or NULL with
 * a Python exception set on failure.
 *
 * Fixes vs. previous version:
 *  - `#undef _CONV_FLOORDIV` undef'd the wrong name (the macro defined is
 *    `_CONV_FLOORDIV_X`), so the helper macro leaked past this function.
 *  - An unrecognized `direction` fell through and returned an uninitialized
 *    pointer; it now raises ValueError.
 *  - Error paths consistently return NULL (they mixed NULL and 0).
 */
CudaNdarray* corr3dMM(CudaNdarray *const bottom,
                      CudaNdarray *const weight,
                      CudaNdarray *const top,
                      const int direction,
                      const int dH = 1,
                      const int dW = 1,
                      const int dD = 1,
                      const int dilH = 1,
                      const int dilW = 1,
                      const int dilD = 1,
                      const int padH = 0,
                      const int padW = 0,
                      const int padD = 0)
{
    // ---- Validate layout: every operand must be 5D and C-contiguous ----
    if (bottom->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(bottom))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires bottom to be C-contiguous, "
                "but strides are: %d %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(bottom)[0],
                CudaNdarray_HOST_STRIDES(bottom)[1],
                CudaNdarray_HOST_STRIDES(bottom)[2],
                CudaNdarray_HOST_STRIDES(bottom)[3],
                CudaNdarray_HOST_STRIDES(bottom)[4]);
        return NULL;
    }
    if (weight->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(weight))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires weight to be C-contiguous, "
                "but strides are: %d %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(weight)[0],
                CudaNdarray_HOST_STRIDES(weight)[1],
                CudaNdarray_HOST_STRIDES(weight)[2],
                CudaNdarray_HOST_STRIDES(weight)[3],
                CudaNdarray_HOST_STRIDES(weight)[4]);
        return NULL;
    }
    if (top->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(top))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires top to be C-contiguous, "
                "but strides are: %d %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(top)[0],
                CudaNdarray_HOST_STRIDES(top)[1],
                CudaNdarray_HOST_STRIDES(top)[2],
                CudaNdarray_HOST_STRIDES(top)[3],
                CudaNdarray_HOST_STRIDES(top)[4]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
    const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
    const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
    const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
    const int bottomDepth = CudaNdarray_HOST_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, depth)
    const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
    const int kH = CudaNdarray_HOST_DIMS(weight)[2];
    const int kW = CudaNdarray_HOST_DIMS(weight)[3];
    const int kD = CudaNdarray_HOST_DIMS(weight)[4];
    if (nChannels != CudaNdarray_HOST_DIMS(weight)[1])
    {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorr3dMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    const int dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
    const int topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
    const int topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
    // Fix: previously "#undef _CONV_FLOORDIV", which is not the name defined
    // above, so the helper macro leaked past this function.
#undef _CONV_FLOORDIV_X
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
        nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
        topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
        topWidth != CudaNdarray_HOST_DIMS(top)[3] ||
        topDepth != CudaNdarray_HOST_DIMS(top)[4])
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM shape inconsistency:\n"
                " bottom shape: %d %d %d %d %d\n"
                " weight shape: %d %d %d %d %d\n"
                " top shape: %d %d %d %d %d (expected %d %d %d %d %d)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels, kH, kW, kD,
                CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
                CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
                CudaNdarray_HOST_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        return NULL;
    }
    // ---- Temporary "columns" buffer shared by all three directions ----
    int col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;        // one unrolled patch per row
    col_dim[1] = topHeight * topWidth * topDepth; // one output position per column
    CudaNdarray* col = (CudaNdarray*) CudaNdarray_NewDims(2, col_dim);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorr3dMM failed to allocate working memory of %d x %d\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables
    const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0]; // one batch item
    const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];       // one batch item
    const int K_ = col_dim[0];
    const int N_ = col_dim[1];
    const int M_ = nFilters;
    const float one = 1.0f;
    const float zero = 0.0f;
    CudaNdarray *output;
    if (direction == 0)
    { // forward pass
        output = top;
        // Degenerate sizes: nothing to compute, just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                                         CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM could not fill the output with zeros: %s",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im3d2col
            im3d2col(bottom->devdata + n * bottom_stride,
                     nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     dilH, dilW, dilD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_N,
                    N_, M_, K_,
                    &one,
                    col->devdata, N_,
                    weight->devdata, K_,
                    &zero,
                    top->devdata + n * top_stride, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1)
    {
        // backprop wrt. weights
        output = weight;
        // Degenerate sizes: nothing to compute, just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                                         CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %s",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im2col
            im3d2col(bottom->devdata + n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     dilH, dilW, dilD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_T, CUBLAS_OP_N,
                    K_, M_, N_,
                    &one,
                    col->devdata, N_,
                    top->devdata + n * top_stride, N_,
                    (n == 0) ? &zero : &one,
                    weight->devdata, K_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2)
    {
        // backprop wrt. inputs
        output = bottom;
        // Degenerate sizes: nothing to compute, just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                                         CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %s",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // gemm into columns
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_T,
                    N_, K_, M_,
                    &one,
                    top->devdata + n * top_stride, N_,
                    weight->devdata, K_,
                    &zero,
                    col->devdata, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
            // col2im3d back to the data
            col2im3d(col->devdata, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     dilH, dilW, dilD,
                     padH, padW, padD,
                     dH, dW, dD, bottom->devdata + n * bottom_stride);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUDA error in col2im: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else
    {
        // Robustness fix: an unknown direction previously fell through and
        // returned an uninitialized pointer.
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM: direction must be 0, 1 or 2 (got %d)",
                direction);
        Py_DECREF(col);
        return NULL;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef _GLIBCXX_ATOMIC_BUILTINS
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/caffe_common.hpp)
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
// or fall back to attempt compatibility (best of luck to you).
#if __CUDA_ARCH__ >= 200
const int CUDA_NUM_THREADS = 1024;
#else
const int CUDA_NUM_THREADS = 512;
#endif
// CUDA: number of blocks for threads.
// CUDA: number of blocks needed to cover N elements at CUDA_NUM_THREADS
// threads per block (ceiling division).
inline int GET_BLOCKS(const int N) {
  const int threads_per_block = CUDA_NUM_THREADS;
  return (N + threads_per_block - 1) / threads_per_block;
}
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
// Unfold (im2col) with dilation: one thread per patch position
// (channel, output row, output column).  Each thread copies its
// kernel_h x kernel_w dilated patch from the image into the corresponding
// rows of the column buffer, substituting 0 for taps that fall into the
// implicit zero-padding region.
//
// n is channels * height_col * width_col (total patch positions); data_col
// is laid out row-major as
// (channels * kernel_h * kernel_w) x (height_col * width_col).
__global__ void dilated_im2col_kernel(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
    // Decompose the flat index into (channel c_im, output row h_col,
    // output column w_col).
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    // First row of the column buffer written by this thread.
    const int c_col = c_im * kernel_h * kernel_w;
    // Top-left corner of the receptive field in image coordinates
    // (may be negative because of the padding offset).
    const int h_offset = h_col * stride_h - pad_h;
    const int w_offset = w_col * stride_w - pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float* data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation_h;
        int w_im = w_offset + j * dilation_w;
        // Copy the tap if it lies inside the image, otherwise zero-pad.
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
                data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        // Advance one full row of the column buffer (next kernel tap).
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// Unfold (im2col) without dilation: identical to dilated_im2col_kernel with
// dilation_h == dilation_w == 1, kept as a separate kernel so the common
// case avoids the dilation multiplications.  One thread per patch position.
__global__ void im2col_kernel(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
    // Decompose the flat index into (channel c_im, output row h_col,
    // output column w_col).
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    // First row of the column buffer written by this thread.
    const int c_col = c_im * kernel_h * kernel_w;
    // Top-left corner of the receptive field (may be negative due to padding).
    const int h_offset = h_col * stride_h - pad_h;
    const int w_offset = w_col * stride_w - pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float* data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i;
        int w_im = w_offset + j;
        // Copy the tap if inside the image, otherwise zero-pad.
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
                data_im_ptr[i * width + j] : 0;
        // Advance one full row of the column buffer (next kernel tap).
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// Host-side launcher for the im2col kernels: computes the output geometry
// and dispatches to the plain or dilated kernel as appropriate.
void im2col(const float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    float* data_col) {
  // Effective extent of the dilated kernel along each axis.
  const int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  const int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  // Spatial dimensions of the unfolded (column) output.
  const int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  // Launch one thread per (channel, output row, output column) triple;
  // each thread copies one single-channel patch.
  const int num_kernels = channels * height_col * width_col;
  const bool unit_dilation = (dilation_h == 1) && (dilation_w == 1);
  if (unit_dilation) {
    im2col_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(
        num_kernels, data_im, height, width, kernel_h, kernel_w,
        pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
  } else {
    dilated_im2col_kernel<<<GET_BLOCKS(num_kernels),
                            CUDA_NUM_THREADS>>>(
        num_kernels, data_im, height, width, kernel_h, kernel_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
  }
}
// CUDA kernel for the case of dilation
// Fold (col2im) with dilation: the transpose of dilated_im2col_kernel.
// One thread per *image* element; each thread sums every column-buffer
// entry that the unfold copied from its pixel, which avoids atomics.
// n is channels * height * width.
__global__ void dilated_col2im_kernel(const int n, const float* data_col,
    const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    float val = 0;
    // Pixel coordinates shifted into padded space, plus the channel.
    const int w_im = index % width + pad_w;
    const int h_im = (index / width) % height + pad_h;
    const int c_im = index / (width * height);
    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the start and end of the output: the range of patch
    // positions whose (dilated) receptive field covers this pixel.
    const int w_col_start =
        (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const int w_col_end = min(w_im / stride_w + 1, width_col);
    const int h_col_start =
        (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const int h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        // Offset of this pixel inside the candidate patch; only taps on
        // the dilation grid actually touched the pixel.
        int h_k = (h_im - h_col * stride_h);
        int w_k = (w_im - w_col * stride_w);
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
                                    height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[index] = val;
  }
}
// Fold (col2im) without dilation: the transpose of im2col_kernel.
// One thread per image element, summing the column-buffer entries that
// were copied from it (no atomics needed).  n is channels * height * width.
__global__ void col2im_kernel(const int n, const float* data_col,
    const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    float val = 0;
    // Pixel coordinates shifted into padded space, plus the channel.
    const int w_im = index % width + pad_w;
    const int h_im = (index / width) % height + pad_h;
    const int c_im = index / (width * height);
    // compute the start and end of the output: patch positions whose
    // receptive field covers this pixel.
    const int w_col_start =
        (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const int w_col_end = min(w_im / stride_w + 1, width_col);
    const int h_col_start =
        (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const int h_col_end = min(h_im / stride_h + 1, height_col);
    // equivalent implementation, no dilation: fold the per-iteration index
    // computation of the dilated kernel into a fixed base offset plus
    // per-row/per-column coefficients (Caffe's closed-form rewrite).
    int offset =
        (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
    int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    int coeff_w_col = (1 - stride_w * height_col * width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[index] = val;
  }
}
// Host-side launcher for the col2im kernels: computes the column-buffer
// geometry and dispatches to the plain or dilated kernel.
void col2im(const float* data_col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, float* data_im) {
  // Effective extent of the dilated patch along each axis.
  const int dil_patch_h = (patch_h - 1) * dilation_h + 1;
  const int dil_patch_w = (patch_w - 1) * dilation_w + 1;
  const int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  // One thread per bottom (image) element: each thread gathers all the
  // column entries that map to its pixel, which avoids atomic adds.
  const int num_kernels = channels * height * width;
  const bool unit_dilation = (dilation_h == 1) && (dilation_w == 1);
  if (unit_dilation) {
    col2im_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(
        num_kernels, data_col, height, width, channels, patch_h, patch_w,
        pad_h, pad_w, stride_h, stride_w,
        height_col, width_col, data_im);
  } else {
    dilated_col2im_kernel<<<GET_BLOCKS(num_kernels),
                            CUDA_NUM_THREADS>>>(
        num_kernels, data_col, height, width, channels, patch_h, patch_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
        height_col, width_col, data_im);
  }
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// GEMM-based 2D correlation and its gradients, computed on the GPU.
//
// direction == 0: forward pass (valid correlation); result written to `top`.
// direction == 1: gradient wrt. weights; result written to `weight`.
// direction == 2: gradient wrt. inputs (full convolution); written to `bottom`.
//
// dH, dW are the strides, dilH, dilW the filter dilation, padH, padW the
// implicit zero-padding.  All three arrays must be 4D, C-contiguous and of
// mutually consistent shapes; the one acting as output must already be
// allocated by the caller.  Returns an alias of the output array on success
// (refcount NOT incremented here -- see the note at the end), or NULL with a
// Python exception set on failure.
CudaNdarray* corrMM(CudaNdarray *const bottom,
                    CudaNdarray *const weight,
                    CudaNdarray *const top,
                    const int direction,
                    const int dH = 1,
                    const int dW = 1,
                    const int dilH = 1,
                    const int dilW = 1,
                    const int padH = 0,
                    const int padW = 0)
{
    // --- Validate dimensionality and contiguity of all three arrays ---
    if (bottom->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(bottom))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires bottom to be C-contiguous, "
                "but strides are: %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(bottom)[0],
                CudaNdarray_HOST_STRIDES(bottom)[1],
                CudaNdarray_HOST_STRIDES(bottom)[2],
                CudaNdarray_HOST_STRIDES(bottom)[3]);
        return NULL;
    }
    if (weight->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(weight))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires weight to be C-contiguous, "
                "but strides are: %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(weight)[0],
                CudaNdarray_HOST_STRIDES(weight)[1],
                CudaNdarray_HOST_STRIDES(weight)[2],
                CudaNdarray_HOST_STRIDES(weight)[3]);
        return NULL;
    }
    if (top->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(top))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires top to be C-contiguous, "
                "but strides are: %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(top)[0],
                CudaNdarray_HOST_STRIDES(top)[1],
                CudaNdarray_HOST_STRIDES(top)[2],
                CudaNdarray_HOST_STRIDES(top)[3]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
    const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
    const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
    const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
    const int kH = CudaNdarray_HOST_DIMS(weight)[2];
    const int kW = CudaNdarray_HOST_DIMS(weight)[3];
    if (nChannels != CudaNdarray_HOST_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
    #define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
    // Fix: the previous `#undef _CONV_FLOORDIV` named a macro that was never
    // defined, so _CONV_FLOORDIV_X leaked past this function.
    #undef _CONV_FLOORDIV_X
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
            nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
            topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
            topWidth != CudaNdarray_HOST_DIMS(top)[3]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM shape inconsistency:\n"
                "  bottom shape: %d %d %d %d\n"
                "  weight shape: %d %d %d %d\n"
                "  top shape: %d %d %d %d (expected %d %d %d %d)\n",
                batchSize, nChannels, bottomHeight, bottomWidth,
                nFilters, nChannels, kH, kW,
                CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
                CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
                batchSize, nFilters, topHeight, topWidth);
        return NULL;
    }
    // Create temporary columns: the im2col scratch buffer shared by all
    // three directions, of shape (nChannels*kH*kW, topHeight*topWidth).
    int col_dim[2];
    col_dim[0] = nChannels * kW * kH;
    col_dim[1] = topHeight * topWidth;
    CudaNdarray* col = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorrMM failed to allocate working memory of %d x %d\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables
    const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
    const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
    // GEMM dimensions: weight is (M_, K_), col is (K_, N_), top is (M_, N_).
    const int K_ = col_dim[0];
    const int N_ = col_dim[1];
    const int M_ = nFilters;
    const float one = 1.0f;
    const float zero = 0.0f;
    CudaNdarray *output;
    if (direction == 0) { // forward pass
        output = top;
        // Degenerate (empty) problem: just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                    CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM could not fill the output with zeros: %s",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
                   bottomWidth, kH, kW, dilH, dilW,
                   padH, padW, dH, dW, col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm: top[n] = weight . col  (cuBLAS is column-major,
            // so the row-major product is expressed with swapped operands).
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_N,
                    N_, M_, K_,
                    &one,
                    col->devdata, N_,
                    weight->devdata, K_,
                    &zero,
                    top->devdata + n * top_stride, N_);
            if (status != CUBLAS_STATUS_SUCCESS) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
        // (For the original grouped-convolution reference, see
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
    }
    else if (direction == 1) { // backprop wrt. weights
        output = weight;
        // Degenerate (empty) problem: just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                    CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM grad wrt. weights could not fill the output with zeros: %s",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
                   bottomWidth, kH, kW, dilH, dilW,
                   padH, padW, dH, dW, col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_T, CUBLAS_OP_N,
                    K_, M_, N_,
                    &one,
                    col->devdata, N_,
                    top->devdata + n * top_stride, N_,
                    (n == 0) ? &zero : &one,
                    weight->devdata, K_);
            if (status != CUBLAS_STATUS_SUCCESS) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
        // (For the original grouped-convolution reference, see
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
    }
    else if (direction == 2) { // backprop wrt. inputs
        output = bottom;
        // Degenerate (empty) problem: just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                    CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM grad wrt. inputs could not fill the output with zeros: %s",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // gemm into columns
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_T,
                    N_, K_, M_,
                    &one,
                    top->devdata + n * top_stride, N_,
                    weight->devdata, K_,
                    &zero,
                    col->devdata, N_);
            if (status != CUBLAS_STATUS_SUCCESS) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
            // col2im back to the data
            col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
                   kH, kW, dilH, dilW, padH, padW,
                   dH, dW, bottom->devdata + n * bottom_stride);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUDA error in col2im: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
        }
        // (For the original reference, see
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
    }
    else {
        // Fix: previously an unrecognized direction fell through with
        // `output` uninitialized (undefined behavior); report it instead.
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM: direction must be 0, 1 or 2, got %d",
                direction);
        Py_DECREF(col);
        return NULL;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
This source diff could not be displayed because it is too large. You can view the blob instead.
#ifndef _CUDA_NDARRAY_H
#define _CUDA_NDARRAY_H
#include <algorithm>
// Defines for Python 2/3 compatibility.
#if PY_MAJOR_VERSION >= 3
// Py3k treats all ints as longs. This one is not caught by npy_3kcompat.h.
#define PyNumber_Int PyNumber_Long
#include "numpy/npy_3kcompat.h"
// Py3k strings are unicode, these mimic old functionality.
//
// NOTE: npy_3kcompat.h replaces PyString_X with PyBytes_X, which breaks
// compatibility with some functions returning text.
#define PyString_Check PyUnicode_Check
#define PyString_FromString PyUnicode_FromString
#define PyString_AsString PyUnicode_AsUTF8
#define PyString_FromStringAndSize PyUnicode_FromStringAndSize
#define PyString_Size PyUnicode_GET_SIZE
#define PyInt_FromSize_t PyLong_FromSize_t
// Python 3 expects a PyObject* as the first argument to PySlice_GetIndicesEx().
#define SLICE_CAST(x) (x)
#else
// Python 2 expects a PySliceObject* as the first argument to PySlice_GetIndicesEx().
#define SLICE_CAST(x) ((PySliceObject*)(x))
#endif // end #if PY_MAJOR_VERSION >= 3
#ifndef Py_TYPE
# define Py_TYPE(o) ((o)->ob_type)
#endif
#ifndef Py_REFCNT
# define Py_REFCNT(o) ((o)->ob_refcnt)
#endif
#include <numpy/arrayobject.h>
#include <stdio.h>
#include <stdint.h>
#ifndef SIZE_MAX
#define SIZE_MAX ((size_t)-1)
#endif
// Cuda GPUs only accept a single representation for NaN whereas CPU may have
// more than one. So it's better to use the CUDA one to be sure
#ifdef NAN
#undef NAN
#endif
#include <math_constants.h>
#define NAN CUDART_NAN_F
#include <cublas_v2.h>
#ifdef _WIN32
# ifdef _CUDA_NDARRAY_C
# define DllExport __declspec( dllexport )
# else
# define DllExport __declspec( dllimport )
# endif
# define ALWAYS_INLINE
#else //else _WIN32
# define DllExport __attribute__((visibility ("default")))
# define ALWAYS_INLINE __attribute__((always_inline))
#endif
typedef float real;
#define REAL_TYPENUM 11
#ifdef __DEVICE_EMULATION__
#define NUM_VECTOR_OP_BLOCKS 4096
#define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up
#else
#define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10)
#define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10)
#endif
#if 1
// Do not wait after every kernel & transfer.
#define CNDA_THREAD_SYNC
#else
// This is useful for using normal profiling tools
#define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is negligible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
#ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024)
#endif
#define VERBOSE_DEVICE_MALLOC 1
#define NO_VERBOSE_DEVICE_MALLOC 0
/* Use this handle to make cublas calls */
extern DllExport cublasHandle_t handle;
/**
* Allocation and freeing of device memory should go through these functions so
* that the lib can track memory usage.
*
* device_malloc will set the Python error message before returning None.
* device_free will return nonzero on failure (after setting the python error message)
*
* Set the Python error
*/
DllExport void * device_malloc(size_t size);
DllExport void * device_malloc(size_t size, int verbose);
DllExport int device_free(void * ptr);
DllExport void *get_work_mem(size_t sz);
// Pointer to one int on the device.
// Used in CudaNdarray_TakeFrom and in an op
// to signal that an out-of-bounds error occurred.
// When it is allocated, it should always be 0.
// So if there is an error, we must reset it to 0 BEFORE we raise the error;
// this spares us from having to reset it to 0 before each use.
extern DllExport int* err_var;
static inline int init_err_var(){
if (err_var == NULL) {
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
return -1;
}
}
return 0;
}
// Copy the device-side error flag (err_var) back to the host and raise a
// Python IndexError if a kernel reported an out-of-bounds index.
// Returns 0 when no error is pending, -1 otherwise (Python exception set).
static inline int check_err_var(){
    // Sentinel: -10 could be any value different from 0 -- it lets us
    // distinguish a failed copy from a genuine zero flag.
    int cpu_err_var=-10;
    cudaError_t err;
    CNDA_BEGIN_ALLOW_THREADS
    // As we execute cudaMemcpy on the default stream, it waits
    // for all kernels (on all streams) to be finished before
    // starting to copy
    err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
                     cudaMemcpyDeviceToHost);
    CNDA_END_ALLOW_THREADS
    if (cudaSuccess != err) {
        PyErr_Format(
            PyExc_RuntimeError,
            "Cuda error: %s when trying to get the error"
            " value.\\n",
            cudaGetErrorString(err));
        return -1;
    }
    if (cpu_err_var != 0) {
        PyErr_Format(
            PyExc_IndexError,
            "One of the index value is out of bound. Error code: %i.\\n",
            cpu_err_var);
        // Reset the flag to 0 now, so callers need not clear it before
        // each use (see the err_var comment above).
        err = cudaMemset((void*)err_var, 0, sizeof(int));
        if (cudaSuccess != err) {
            PyErr_Format(PyExc_MemoryError,
                         "Error setting device error code to 0 after having"
                         " an index error. %s", cudaGetErrorString(err));
            return -1;
        }
        return -1;
    }
    return 0;
}
// Integer division of a by b, bumping the truncated quotient by one whenever
// there is a remainder (matches the ceiling for non-negative operands).
template <typename T>
static T ceil_intdiv(T a, T b)
{
    T quotient = a / b;
    if (a % b != 0)
        quotient += 1;
    return quotient;
}
/**
 * struct CudaNdarray
 *
 * This is a Python type: a float32 ("real") n-dimensional array whose data
 * buffer lives in CUDA device memory, with shape/stride metadata kept on the
 * host and lazily mirrored to the device.
 */
struct CudaNdarray
{
    PyObject_HEAD
    /**
     * base:
     * either NULL or a pointer to a fellow CudaNdarray into which this one is viewing.
     * This pointer is never followed, except during Py_DECREF when we do not need it any longer.
     */
    PyObject * base;
    /* Type-specific fields go here. */
    //GpuTensorType::VoidTensor * vt;
    int nd; //the number of dimensions of the tensor
    // Clients should access host_structure via the CudaNdarray_HOST_DIMS /
    // CudaNdarray_HOST_STRIDES accessor functions.
    // Layout: dim0..dim(nd-1), stride0..stride(nd-1), then the log2-dims
    // shadow written by CudaNdarray_set_dim.
    int * host_structure; //dim0, dim1, ... stride0, stride1, ...
    int data_allocated; //the number of bytes allocated for devdata
    //device pointers (allocated by cudaMalloc)
    // Nonzero while dev_structure matches host_structure; cleared by
    // cnda_mark_dev_structure_dirty.
    mutable int dev_structure_fresh;
    //dev_structure should be accessed via the functions like
    //CudaNdarray_DEV_DIMS, otherwise may not be
    //synchronized with host_structure. The accessor functions will allocate it when needed.
    mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
};
// Elementwise operation selectors used by kernels elsewhere in this module.
enum operator_t
{
    IADD=0, // in-place addition
    IDIV,   // in-place division
    CPY,    // plain copy
    N_ELEMWISE_OPS // This is to know the number of operation
};
/*
* Return a CudaNdarray whose 'nd' dimensions are all 0.
* if nd==-1, it is not initialized.
*
* Set the Python error
*/
DllExport PyObject *
CudaNdarray_New(int nd=-1);
/**
* Return 1 for a CudaNdarray otw 0
*/
DllExport int
CudaNdarray_Check(const PyObject * ob);
/**
* Return 1 for a CudaNdarray otw 0
*/
DllExport int
CudaNdarray_CheckExact(const PyObject * ob);
/**
* Return true for a C-contiguous CudaNdarray, else false
*/
DllExport bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**
* Return true for a F-contiguous CudaNdarray, else false
*/
DllExport bool
CudaNdarray_is_f_contiguous(const CudaNdarray * self);
/****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/
DllExport int cnda_structure_size(int nd);
/*
* This describes the shape of the ndarray. The array
* of dimensions is itself stored on the host.
* If you need to access the dimensions array from inside
* a kernel, use CudaNdarray_DEVICE_DIMS.
*/
DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self);
DllExport const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
DllExport const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
// Mark the device-side copy of the dims/strides metadata as stale so the
// CudaNdarray_DEV_* accessors refresh it before their next use.
DllExport inline void ALWAYS_INLINE
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
    self->dev_structure_fresh = 0;
}
DllExport int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase);
// Default: do not ignore sync of dev and host structures in comparing, and do not ignore difference in base pointers
DllExport int
CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
/****
* Set the dimension[idx] to value d.
*
* Updates the log2dim shadow array.
*
* Does not sync structure to device.
*/
// Set dimension `idx` of `self` to `d`, keeping the log2-dims shadow array
// in sync and flagging the device-side structure for re-upload.
// Does not sync the structure to the device.
DllExport inline void ALWAYS_INLINE
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
    // Warn (but still proceed) on suspicious arguments.
    if (idx < 0 || idx >= self->nd || d < 0)
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: self->ndim=%i, idx=%i stride=%i\n",
                self->nd, idx, d);
    if (d == self->host_structure[idx])
        return;  // unchanged: nothing to update
    self->host_structure[idx] = d;
    // log2-dims shadow: log2(d) when d is an exact power of two, else -1.
    const int log2d = (int)log2((double)d);
    self->host_structure[idx + 2 * self->nd] = (d == (1 << log2d)) ? log2d : -1;
    cnda_mark_dev_structure_dirty(self);
}
/**
 * Set stride[idx] to s, dirtying the device-side structure on change.
 */
DllExport inline void ALWAYS_INLINE
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
    // Warn (but do not abort) on a suspicious index.
    if (idx < 0 || idx >= self->nd)
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
    }

    // No-op when the stride is unchanged, so the device copy stays fresh.
    if (s == CudaNdarray_HOST_STRIDES(self)[idx])
        return;

    self->host_structure[idx + self->nd] = s;
    cnda_mark_dev_structure_dirty(self);
}
/***
* Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
*
* This means: recalculate the log2dims and transfer structure to the card
*/
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
/* CudaNdarray_DEV_DIMS gives the same information as CudaNdarray_HOST_DIMS,
* but stored on the GPU. Use this pointer when it needs to be accessed
* from inside a CUDA kernel.
*/
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
// The following 4 macro are here to help make c code generator that work on
// both PyArray and CudaNdarray. This is at least used for Subtensor and
// GpuSubtensor
#define CudaNdarray_DIMS CudaNdarray_HOST_DIMS
#define CudaNdarray_NDIM(self) self->nd
#define CudaNdarray_STRIDES CudaNdarray_HOST_STRIDES
#define CudaNdarray_BYTES CudaNdarray_DEV_DATA
/**
* Return the number of elements in the ndarray (product of the dimensions)
*/
DllExport size_t CudaNdarray_SIZE(const CudaNdarray *self);
static PyObject *CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure);
/**
* Allocate a new CudaNdarray with room for given number of dimensions
*
* No Storage space is allocated (and all dimensions are 0)
*
* Set the Python error
*/
DllExport PyObject * CudaNdarray_new_nd(const int nd);
/**
* [Re]allocate a CudaNdarray with access to 'nd' dimensions.
*
* Note: This does not allocate storage for data, or free
* pre-existing storage.
*
* Set the Python error
*/
DllExport inline int ALWAYS_INLINE
CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
    // Re-allocation is only needed when the rank actually changes.
    if (nd != self->nd)
    {
        // Drop the device-side structure first; it is re-created lazily by
        // cnda_copy_structure_to_device when needed.
        if (self->dev_structure)
        {
            if (device_free(self->dev_structure))
            {
                // NOTE(review): presumably device_free set the Python
                // error here -- confirm.
                return -1;
            }
            self->dev_structure = NULL;
        }

        if (self->host_structure)
        {
            free(self->host_structure);
            self->host_structure = NULL;
            self->nd = -1;  // mark as having no structure at all
        }
        // nd == -1 is the "just free everything" request.
        if (nd == -1) return 0;

        self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
        if (NULL == self->host_structure)
        {
            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
            return -1;
        }
        //initialize all dimensions and strides to 0
        for (int i = 0; i < cnda_structure_size(nd); ++i)
        {
            self->host_structure[i] = 0;
        }

        //The device structure will be created in cnda_copy_structure_to_device
        //if needed.
        self->nd = nd;
        self->dev_structure_fresh = 0;
    }
    return 0;
}
/**
* CudaNdarray_alloc_contiguous
*
* Allocate storage space for a tensor of rank 'nd' and given dimensions.
* (No-op if self already has a contiguous tensor of the right dimensions)
*
 * If fortran is non-zero, a Fortran order is made, otherwise it is a C order.
*
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/
template<typename inttype>
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
                                        const inttype * dim, int fortran=0)
{
    // allocate an empty ndarray with c_contiguous access
    // return 0 on success
    size_t size = 1; //set up the strides for contiguous tensor
    assert (nd >= 0);
    // Here we modify the host structure to have the desired shape and
    // strides. This does not cause the storage to be freed or reallocated.
    if (CudaNdarray_set_nd(self, nd))
    {
        return -1;
    }
    // Strides accumulate `size` in the direction of the requested layout;
    // broadcastable (length-1) dimensions get stride 0.
    if (fortran)
    {
        for (int i = 0; i < nd; i++)
        {
            CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
            CudaNdarray_set_dim(self, i, dim[i]);
            //Detect overflow on unsigned integer
            if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
                PyErr_Format(PyExc_AssertionError,
                             "Can't store in size_t for the bytes requested %llu * %llu",
                             (unsigned long long)size, (unsigned long long)dim[i]);
                return -1;
            }
            size = size * dim[i];
        }
    }
    else
    {
        for (int i = nd-1; i >= 0; --i)
        {
            CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
            CudaNdarray_set_dim(self, i, dim[i]);
            //Detect overflow on unsigned integer
            // Bug fix: this message used to read "%llu * 4" while passing
            // only one argument; report both factors like the Fortran
            // branch above.
            if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
                PyErr_Format(PyExc_AssertionError,
                             "Can't store in size_t for the bytes requested %llu * %llu",
                             (unsigned long long)size, (unsigned long long)dim[i]);
                return -1;
            }
            size = size * dim[i];
        }
    }

    // Detect overflow on unsigned integer
    if (size > (SIZE_MAX / sizeof(real))) {
        PyErr_Format(PyExc_RuntimeError,
                     "Can't store in size_t for the bytes requested %llu",
                     (unsigned long long)size);
        return -1;
    }

    // If the allocated buffer is already of the right size, we don't need to
    // do anything else.
    // Note: self->data_allocated is 0 for a view, so views will fail this
    // check and be turned into independent arrays below.
    if (self->data_allocated == size)
    {
        return 0;
    }

    // The structure of self will be reused with newly allocated memory.
    // If self was a view, we should remove the reference to its base.
    // (If base was already NULL, the following has no effect.)
    Py_XDECREF(self->base);
    self->base = NULL;

    // If self is a view, do not try to free its memory
    if (self->data_allocated && device_free(self->devdata))
    {
        self->devdata = NULL;
        self->data_allocated = 0;
        return -1;
    }

    self->devdata = (float*)device_malloc(size*sizeof(real));
    if (size && !self->devdata)
    {
        // Allocation failed: leave self structurally empty and consistent.
        CudaNdarray_set_nd(self, -1);
        self->data_allocated = 0;
        self->devdata = 0;
        return -1;
    }
    if (0)
        fprintf(stderr,
                "Allocated devdata %p (self=%p)\n",
                self->devdata,
                self);
    self->data_allocated = size;

    return 0;
}
/*
* Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
* Set the python error.
*/
/*
 * Return a freshly allocated CudaNdarray with the given rank and dims.
 * Sets the Python error and returns NULL on failure.
 */
template<typename inttype>
static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
{
    CudaNdarray *out = (CudaNdarray*)CudaNdarray_New();
    if (!out)
    {
        PyErr_SetString(PyExc_MemoryError,
                        "Failed to allocate the CudaNdarray structure.");
        return NULL;
    }
    // Give the new array contiguous storage of the requested shape.
    if (CudaNdarray_alloc_contiguous(out, nd, dims))
    {
        Py_DECREF(out);
        return NULL;
    }
    return (PyObject*)out;
}
/**
* CudaNdarray_set_device_data
*
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base);
/**
* Return an independent copy of self
*/
DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/**
* Return an independent copy of self
*/
DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);
/**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
*/
DllExport PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask);
/**
 * Reshape self to the new shape given by the tuple shape.
*/
DllExport PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape);
/**
* Transfer the contents of numpy array `obj` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
*/
DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
/**
* Transfer the contents of CudaNdarray `other` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
* TODO: WRITEME: what does "if necessary" mean?
* TODO: we use this to implement set/inc subtensor, where self is a view of
* the original tensor so that we write only to the subtensor. How
* do we ensure that self is not reallocated in this case?
*
* unbroadcast: if true, this means that other is broadcastable in some
* dimensions, and the result, self, is not.
* ie, if unbroadcast=false, we must do the broadcasting
* operation as part of the copy.
* e.g. suppose self and other are 2D matrices and other
* has only one row. Then we need to copy this row several
* times when copying to self.
*
* Set the Python error
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
*/
DllExport PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args = NULL);
DllExport PyObject *
CudaNdarray_ZEROS(int n, int * dims);
/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/
/**
 * True iff strides describe a packed row-major (C order) layout.
 * Length-1 dimensions are ignored, since their stride is irrelevant.
 */
DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{
    // Walk from the innermost dimension out, tracking the stride a packed
    // layout would have, and bail out on the first mismatch.
    int expected_stride = 1;
    for (int i = self->nd - 1; i >= 0; --i)
    {
        const int d = CudaNdarray_HOST_DIMS(self)[i];
        if (d == 1)
            continue;
        if (CudaNdarray_HOST_STRIDES(self)[i] != expected_stride)
            return false;
        expected_stride *= d;
    }
    return true;
}
/**
* True iff the strides look like [1, dim[0], dim[0]*dim[1], ...]
*/
/**
 * True iff strides describe a packed column-major (Fortran order) layout.
 * Length-1 dimensions are ignored, since their stride is irrelevant.
 */
DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_f_contiguous(const CudaNdarray * self)
{
    // In Fortran order the leftmost dimension is innermost; track the
    // stride a packed layout would have and bail out on a mismatch.
    int expected_stride = 1;
    for (int i = 0; i < self->nd; i++)
    {
        const int d = CudaNdarray_HOST_DIMS(self)[i];
        if (d == 1)
            continue;
        if (CudaNdarray_HOST_STRIDES(self)[i] != expected_stride)
            return false;
        expected_stride *= d;
    }
    return true;
}
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
// Set the Python error
int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
DllExport PyObject * CudaNdarray_View(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);
// Ensures that *arr is a pointer to a contiguous ndarray of the specified
// dimensions.
// *arr may initially be NULL, a pointer to an ndarray of the wrong size,
// or a pointer to an ndarray of the right size. In the last case it will
// not change.
// If fortran is non-zero, a fortran order is expected/created
//
// Set the Python error
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims, int fortran = 0);
/**
 * Map a cuBLAS status code to a short human-readable description.
 * Never returns NULL; unrecognized codes get a generic message.
 */
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus_t err){
    switch(err) {
    case CUBLAS_STATUS_SUCCESS:
        return "success";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "the library was not initialized";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "the resource allocation failed";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "the parameters n<0 or incx,incy=0";
// Some cuBLAS versions do not define these codes, so guard them.
#ifdef CUBLAS_STATUS_ARCH_MISMATCH
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "required device feature not present";
#endif
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "an access to GPU memory space failed";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "the function failed to launch on the GPU";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "an internal operation failed";
#ifdef CUBLAS_STATUS_NOT_SUPPORTED
    case CUBLAS_STATUS_NOT_SUPPORTED:
        return "unsupported function";
#endif
    default:
        return "unknown code";  // bug fix: was misspelled "unknow code"
    }
}
#endif
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
// If needed, define element of the V4 interface in terms of elements of
// previous versions
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#endif
#ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This is only for what we use
/* R1 compatibility: map a cuDNN status code to a human-readable message.
   (cuDNN itself only provides this function from R2 on.)
   Bug fix: corrected typos in the returned messages
   ("initialized(Is ... enought?)" and "Ressource"). */
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
  switch (err) {
  case CUDNN_STATUS_SUCCESS:
    return "The operation completed successfully.";
  case CUDNN_STATUS_NOT_INITIALIZED:
    return "The handle was not initialized (is your driver recent enough?).";
  case CUDNN_STATUS_ALLOC_FAILED:
    return "Resource allocation failed inside the library.";
  case CUDNN_STATUS_BAD_PARAM:
    return "An incorrect value was passed in.";
  case CUDNN_STATUS_ARCH_MISMATCH:
    return "The current GPU does not support the required features (only cc 3.0+ are supported).";
  case CUDNN_STATUS_MAPPING_ERROR:
    return "An access to GPU memory space failed (probably due to a failure to bind texture).";
  case CUDNN_STATUS_EXECUTION_FAILED:
    return "A kernel failed to execute.";
  case CUDNN_STATUS_INTERNAL_ERROR:
    return "An internal cuDNN operation failed.";
  case CUDNN_STATUS_NOT_SUPPORTED:
    return "The combination of parameters is not currently supported.";
  default:
    return "Unknown error code.";
  }
}
// some macros to help support cudnn R1 while using R2 code.
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
/* R2-style Nd tensor setter implemented on top of the R1 4d API.
   Only rank-4 tensors are expressible in R1; any other rank is rejected. */
static inline cudnnStatus_t
cudnnSetTensorNdDescriptor(
  cudnnTensorDescriptor_t tensorDesc,
  cudnnDataType_t dataType,
  int nbDims,
  const int dimA[],
  const int strideA[]) {
  if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetTensor4dDescriptorEx(
    tensorDesc, dataType,
    dimA[0], dimA[1], dimA[2], dimA[3],
    strideA[0], strideA[1], strideA[2], strideA[3]);
}
/* R2-style output-shape query implemented with the R1 API.
   NOTE(review): inputTensorDesc and filterDesc are accepted only for
   signature compatibility and are ignored -- the R1 call derives the
   output shape from convDesc alone. */
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
  const cudnnConvolutionDescriptor_t convDesc,
  const cudnnTensorDescriptor_t inputTensorDesc,
  const cudnnFilterDescriptor_t filterDesc,
  int *n,
  int *c,
  int *h,
  int *w) {
  return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
                                   n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
/* R1 has no forward-algorithm selection: always report algorithm 0.
   All other parameters are ignored. */
static inline cudnnStatus_t
cudnnGetConvolutionForwardAlgorithm(
  cudnnHandle_t handle,
  const cudnnTensorDescriptor_t srcDesc,
  const cudnnFilterDescriptor_t filterDesc,
  const cudnnConvolutionDescriptor_t convDesc,
  const cudnnTensorDescriptor_t destDesc,
  cudnnConvolutionFwdPreference_t preference,
  size_t memoryLimitInbytes,
  cudnnConvolutionFwdAlgo_t *algo) {
  *algo = 0;
  return CUDNN_STATUS_SUCCESS;
}
/* R1 exposes no explicit workspace, so always report 0 bytes.
   All descriptor and algo parameters are ignored. */
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
  cudnnHandle_t handle,
  const cudnnTensorDescriptor_t srcDesc,
  const cudnnFilterDescriptor_t filterDesc,
  const cudnnConvolutionDescriptor_t convDesc,
  const cudnnTensor4dDescriptor_t destDesc,
  cudnnConvolutionFwdAlgo_t algo,
  size_t *sizeInBytes) {
  *sizeInBytes = 0;
  return CUDNN_STATUS_SUCCESS;
}
/* R2-signature forward convolution backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
  cudnnHandle_t handle,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const cudnnFilterDescriptor_t filterDesc,
  const void *filterData,
  const cudnnConvolutionDescriptor_t convDesc,
  cudnnConvolutionFwdAlgo_t algo,
  void *workSpace,
  size_t workSpaceSizeInBytes,
  const void *beta,
  const cudnnTensorDescriptor_t destDesc,
  void *destData) {
  /* R1 has no alpha/beta scaling: alpha must be 1, and beta only selects
     between overwriting (0.0) and accumulating into (1.0) the output.
     algo, workSpace and workSpaceSizeInBytes have no R1 equivalent. */
  assert(*(float *)alpha == 1.0);
  const float beta_value = *(float *)beta;
  cudnnAccumulateResult_t accumulate;
  if (beta_value == 0.0) {
    accumulate = CUDNN_RESULT_NO_ACCUMULATE;
  } else if (beta_value == 1.0) {
    accumulate = CUDNN_RESULT_ACCUMULATE;
  } else {
    assert(0 && "beta must be 0.0 or 1.0");
  }
  return cudnnConvolutionForward(handle, srcDesc, srcData,
                                 filterDesc, filterData,
                                 convDesc, destDesc, destData,
                                 accumulate);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
/* R2-signature filter-gradient convolution backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnConvolutionBackwardFilter_v2(
  cudnnHandle_t handle,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const cudnnTensorDescriptor_t diffDesc,
  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
  const void *beta,
  const cudnnFilterDescriptor_t gradDesc,
  void *gradData) {
  /* R1 has no alpha/beta scaling: alpha must be 1, and beta only selects
     between overwriting (0.0) and accumulating into (1.0) the gradient. */
  assert(*(float *)alpha == 1.0);
  const float beta_value = *(float *)beta;
  cudnnAccumulateResult_t accumulate;
  if (beta_value == 0.0) {
    accumulate = CUDNN_RESULT_NO_ACCUMULATE;
  } else if (beta_value == 1.0) {
    accumulate = CUDNN_RESULT_ACCUMULATE;
  } else {
    assert(0 && "beta must be 0.0 or 1.0");
  }
  return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
                                        diffDesc, diffData,
                                        convDesc, gradDesc, gradData,
                                        accumulate);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
/* R2-signature data-gradient convolution backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
  cudnnHandle_t handle,
  const void *alpha,
  const cudnnFilterDescriptor_t filterDesc,
  const void *filterData,
  const cudnnTensorDescriptor_t diffDesc,
  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
  const void *beta,
  const cudnnTensorDescriptor_t gradDesc,
  void *gradData) {
  /* R1 has no alpha/beta scaling: alpha must be 1, and beta only selects
     between overwriting (0.0) and accumulating into (1.0) the gradient. */
  assert(*(float *)alpha == 1.0);
  const float beta_value = *(float *)beta;
  cudnnAccumulateResult_t accumulate;
  if (beta_value == 0.0) {
    accumulate = CUDNN_RESULT_NO_ACCUMULATE;
  } else if (beta_value == 1.0) {
    accumulate = CUDNN_RESULT_ACCUMULATE;
  } else {
    assert(0 && "beta must be 0.0 or 1.0");
  }
  /* This function needs the casting because its params are not
     declared as const */
  return cudnnConvolutionBackwardData(handle,
                                      (cudnnFilterDescriptor_t)filterDesc,
                                      filterData,
                                      (cudnnTensorDescriptor_t)diffDesc,
                                      diffData,
                                      (cudnnConvolutionDescriptor_t)convDesc,
                                      (cudnnTensorDescriptor_t)gradDesc,
                                      gradData,
                                      accumulate);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
/* R2-style Nd pooling setter implemented with the R1 2d API. */
static inline cudnnStatus_t
cudnnSetPoolingNdDescriptor(
  cudnnPoolingDescriptor_t poolingDesc,
  const cudnnPoolingMode_t mode,
  int nbDims,
  const int windowDimA[],
  const int paddingA[],
  const int strideA[]) {
  /* Only 2d, unpadded pooling is expressible in R1. */
  if (nbDims != 2)
    return CUDNN_STATUS_NOT_SUPPORTED;
  if (paddingA[0] != 0 || paddingA[1] != 0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetPoolingDescriptor(poolingDesc, mode,
                                   windowDimA[0], windowDimA[1],
                                   strideA[0], strideA[1]);
}
/* R2-style Nd pooling query implemented with the R1 2d getter.
   Always reports nbDims == 2 and zero padding (R1 pooling is unpadded). */
static inline cudnnStatus_t
cudnnGetPoolingNdDescriptor(
  const cudnnPoolingDescriptor_t poolingDesc,
  const int nbDimsRequested,
  cudnnPoolingMode_t *mode,
  int *nbDims,
  int windowA[],
  int paddingA[],
  int strideA[]) {
  int win0, win1, str0, str1;
  cudnnStatus_t err;
  // The caller must have room for at least the two spatial dimensions.
  if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
  err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
                                  &str0, &str1);
  if (err != CUDNN_STATUS_SUCCESS) return err;
  *nbDims = 2;
  paddingA[0] = 0;
  paddingA[1] = 0;
  windowA[0] = win0;
  windowA[1] = win1;
  strideA[0] = str0;
  strideA[1] = str1;
  return CUDNN_STATUS_SUCCESS;
}
/* R2-signature pooling forward backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnPoolingForward_v2(
  cudnnHandle_t handle,
  const cudnnPoolingDescriptor_t poolingDesc,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const void *beta,
  const cudnnTensorDescriptor_t destDesc,
  void *destData) {
  /* R1 pooling cannot scale: only alpha == 1 and beta == 0 are expressible. */
  if (*(float*)alpha != 1.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  if (*(float *)beta != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingForward(handle, poolingDesc, srcDesc, srcData,
                             destDesc, destData);
}
#define cudnnPoolingForward cudnnPoolingForward_v2
/* R2-signature pooling backward backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnPoolingBackward_v2(
  cudnnHandle_t handle,
  const cudnnPoolingDescriptor_t poolingDesc,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const cudnnTensorDescriptor_t srcDiffDesc,
  const void *srcDiffData,
  const cudnnTensorDescriptor_t destDesc,
  const void *destData,
  const void *beta,
  const cudnnTensorDescriptor_t destDiffDesc,
  void *destDiffData) {
  /* R1 pooling cannot scale: only alpha == 1 and beta == 0 are expressible. */
  if (*(float*)alpha != 1.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  if (*(float *)beta != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingBackward(handle, poolingDesc,
                              srcDesc, srcData,
                              srcDiffDesc, srcDiffData,
                              destDesc, destData,
                              destDiffDesc, destDiffData);
}
#define cudnnPoolingBackward cudnnPoolingBackward_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't check if this is the right combination, but as we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif
#endif
from __future__ import absolute_import, print_function, division
import pkg_resources
import theano
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
try:
    from theano.sandbox.cuda import cuda_ndarray
    # Raw CudaNdarray transpose helper; used below to convert between C
    # and Fortran memory order.
    dimshuffle = cuda_ndarray.cuda_ndarray.dimshuffle
except ImportError:
    pass

# True when scikits.cuda's CULA bindings imported successfully on this host.
cula_available = False

try:
    from scikits.cuda import cula
    cula_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

# CULA is initialized lazily, on the first GpuSolve.make_thunk call; this
# flag guards against double initialization.
cula_initialized = False
class GpuSolve(GpuOp):
    """
    CULA GPU solver OP.

    Solves ``A x = b`` (or the transposed system, depending on ``trans``)
    on the GPU through CULA's ``culaDeviceSgels``.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.

    """

    __props__ = ('trans',)

    def __init__(self, trans='N'):
        # BLAS-style flag: 'N' for the plain system, 'T'/'C' for the
        # (conjugate) transposed one.
        self.trans = trans
        super(GpuSolve, self).__init__()

    def output_type(self, inp):
        # The result is a GPU array of the same rank, non-broadcastable.
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

    def make_node(self, inp1, inp2):
        # Both operands must be 2-d arrays living on the GPU.
        inp1 = as_cuda_ndarray_variable(inp1)
        inp2 = as_cuda_ndarray_variable(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])

    def make_thunk(self, node, storage_map, _, no_recycling, impl=None):
        # Initialize CULA the first time it is needed
        global cula_initialized

        if not cula_available:
            raise RuntimeError('Cula is not available and '
                               'GpuSolve Op can not be constructed.')

        if not cula_initialized:
            cula.culaInitialize()
            cula_initialized = True

        # Storage cells for this apply's inputs and outputs.
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            # size of the matrices to invert
            z = outputs[0]

            # Matrix
            A = inputs[0][0]

            # Solution vectors
            b = inputs[1][0]

            # A is not explicitly converted between C and F order, instead we
            # switch the "transpose" flag
            if self.trans in ('T', 'C'):
                trans = 'N'
            else:
                trans = 'T'

            # Convert b to F-order from c-order.
            b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))

            # This copy forces allocation of a new C-contiguous buffer
            # and returns it.
            A_cpy = A.copy()
            b_cpy = b_cpy.copy()

            def cula_gpu_solve(A_, b_, trans='T'):
                # In-place least-squares solve via CULA; the solution is
                # written into b_'s device buffer.
                A_shape = A_.shape
                b_shape = b_.shape

                assert(len(A_shape) == 2)
                assert(len(b_shape) == 2)

                # Check that the operand shapes are conformable for the
                # requested (possibly transposed) system.
                if trans in ['T', 'C']:
                    l, n = A_shape
                    k, m = b_shape
                    if n != k:
                        raise ValueError('A and b must be aligned.')
                elif trans in ['N']:
                    n, l = A_shape
                    k, m = b_shape
                    if l != m:
                        raise ValueError('A and b must be aligned.')
                else:
                    raise ValueError('Invalid value for trans')

                # Leading dimensions as required by the LAPACK-style API.
                lda = max(1, n)
                ldb = max(1, n, l)

                # construct pointer arrays needed for culaDeviceSgels
                # Cula requires you to pass a pointer for A and b.
                A_ptr = A_.gpudata
                b_ptr = b_.gpudata

                cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
                return A_, b_

            # NOTE(review): the returned values are unused -- the solve
            # mutates b_cpy's device memory in place.
            A_pycuda, b_pycuda = cula_gpu_solve(A_cpy, b_cpy, trans)

            # Convert b to F-order from c-order and assign it to output:
            b_cpy = b_cpy.reshape(b.shape[::-1])
            b_cpy = dimshuffle(b_cpy, (1, 0))
            z[0] = b_cpy

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
# Module-level singleton instance of the op (no transpose).
gpu_solve = GpuSolve()
This source diff could not be displayed because it is too large. You can view the blob instead.
#section support_code
static cudnnHandle_t _handle = NULL;
/* Fill a cuDNN Nd tensor descriptor from a CudaNdarray's dims/strides.
   Returns 0 on success, -1 (with the Python error set) on failure. */
static int
c_set_tensorNd(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
  int dim = CudaNdarray_NDIM(var);
  int *strides = (int *)malloc(dim * sizeof(int));
  int default_str = 1;
  int return_value = 0;

  if (strides != NULL) {
    // cuDNN needs explicit strides everywhere, but CudaNdarray stores 0 as
    // the stride of broadcastable dimensions; substitute the packed
    // (C-contiguous) stride there, accumulated from the innermost dim out.
    for (int i = dim-1; i >= 0; i--)
    {
      if (CudaNdarray_HOST_STRIDES(var)[i])
        strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
      else
        strides[i] = default_str;
      default_str *= CudaNdarray_HOST_DIMS(var)[i];
    }

    cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                   CudaNdarray_HOST_DIMS(var),
                                                   strides);
    if (err != CUDNN_STATUS_SUCCESS) {
      // Bug fix: the adjacent string literals used to concatenate into
      // "...: %sdim=%d" with no separator between the cuDNN error text
      // and the dimension count.
      PyErr_Format(PyExc_RuntimeError,
                   "Could not set tensorNd descriptor: %s."
                   " dim=%d",
                   cudnnGetErrorString(err), dim);
      return_value = -1;
    }
  } else {
    PyErr_Format(PyExc_MemoryError,
                 "Could not allocate memory for strides array of size %d.",
                 dim);
    return_value = -1;
  }

  free(strides);  // free(NULL) is a no-op, so this is safe on both paths
  return return_value;
}
/* Fill a cuDNN Nd filter descriptor from a CudaNdarray's dims.
   Returns 0 on success, -1 (with the Python error set) on failure. */
static int
c_set_filterNd(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
  /* cuDNN filter descriptors carry no strides, so only packed
     (C-contiguous) kernels can be described. */
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  const int nd = CudaNdarray_NDIM(var);
  cudnnStatus_t status =
      cudnnSetFilterNdDescriptor_v4(desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
                                    nd, CudaNdarray_HOST_DIMS(var));
  if (status == CUDNN_STATUS_SUCCESS)
    return 0;
  PyErr_Format(PyExc_RuntimeError,
               "Could not set filter descriptor: %s."
               " dims= %d",
               cudnnGetErrorString(status), nd);
  return -1;
}
#section init_code

{
  // Create the process-wide cuDNN handle (_handle, declared in
  // support_code) used by every call in this module.
  cudnnStatus_t err;
  if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
    // Module-init functions return NULL on Python 3 and void on Python 2.
#if PY_MAJOR_VERSION >= 3
    return NULL;
#else
    return;
#endif
  }
}
#section support_code_struct

// Per-apply cuDNN descriptors: created in init_code_struct, destroyed in
// cleanup_code_struct.
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);

/* Keep track, from one execution to another, of the dimension of the data
   and the algorithms, if any, that were selected according to these
   dimensions and according to the amount of memory available at that time.
   Note: Implementation selection for backward convolution only exists
   starting at V3. */
int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
#section init_code_struct

cudnnStatus_t APPLY_SPECIFIC(err);

// Start from NULL so cleanup_code_struct can tell which descriptors were
// actually created if one of the allocations below fails.
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
               cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

// Zero the cached shapes: no algorithm has been selected yet.
for (int i = 0; i < 5; i++)
{
  APPLY_SPECIFIC(previous_input_shape)[i] = 0;
  APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
  APPLY_SPECIFIC(previous_output_shape)[i] = 0;
}

APPLY_SPECIFIC(previous_algo_set) = false;

// Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#section cleanup_code_struct

// Destroy only the descriptors that were actually created;
// init_code_struct leaves the others NULL on failure.
if (APPLY_SPECIFIC(input) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
  cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code_struct
int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size\n");
return 1;
}
int nb_dim = CudaNdarray_NDIM(input);
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1;
#endif
if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
cudaError_t err2 = cudaMemset((*output)->devdata, 0,
CudaNdarray_SIZE(*output) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
{
size_t worksize;
void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo;
if (CHOOSE_ALGO)
{
// A new convolution implementation should be selected, based either on
// timing or heuristics if in one of the two following cases :
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation of none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{
// Reuse the previous implementation if the inputs and the kernels
// have the same shapes as they had when the previous implementation
// was selected
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
APPLY_SPECIFIC(previous_input_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
APPLY_SPECIFIC(previous_kerns_shape)[i]);
}
reuse_previous_algo = same_shapes;
}
// If the previously choosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// Obtain a convolution algorithm appropriate for the input and kernel
// shapes. Either by choosing one according to heuristics or by making
// cuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
cudnnConvolutionFwdAlgoPerf_t choosen_algo_perf;
err = cudnnFindConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
requestedCount,
&count,
&choosen_algo_perf);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
chosen_algo = choosen_algo_perf.algo;
}
else
{
// The implementation should be chosen using heuristics based on the
// input shapes and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
}
// Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo;
APPLY_SPECIFIC(previous_algo_set) = true;
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i];
APPLY_SPECIFIC(previous_kerns_shape)[i] =
CudaNdarray_HOST_DIMS(kerns)[i];
}
}
else
{
// Reuse the previously chosen convolution implementation
chosen_algo = APPLY_SPECIFIC(previous_algo);
}
}
else
{
chosen_algo = CONV_ALGO;
}
if (0){
char * a;
switch(chosen_algo){
case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
a = "implicit gemm (0)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
a = "precomp gemm (1)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
a = "gemm (2)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
a = "direct (3)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
a = "fft (4)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
a = "fft tiling (5)";
break;
#if CUDNN_VERSION > 5000
case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
a = "winograd (6)";
break;
#endif
}
printf("GpuDNNConv: algo %s\n", a);
}
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// The tiled-FFT implementation (only in V4 onward) does not support
// strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't.
// Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// defined only for 2d-filters
if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
// Extract the spatial size of the filters
int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
// Extract the spatial size of the input
int input_h = CudaNdarray_HOST_DIMS(input)[2];
int input_w = CudaNdarray_HOST_DIMS(input)[3];
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
// chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
chosen_algo,
&worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
// TODO: Print a warning
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
chosen_algo,
&worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
err = cudnnConvolutionForward(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
desc,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct

/*
 * Gradient of a cuDNN convolution with respect to the input image
 * (wraps cudnnConvolutionBackwardData).
 *
 * Parameters:
 *   kerns  - filters used by the forward convolution.
 *   output - gradient of the cost w.r.t. the convolution output.
 *   im     - image-shaped array; provides the shape of the result and,
 *            when beta != 0, its initial contents.
 *   desc   - cuDNN convolution descriptor (padding, strides, mode).
 *   alpha, beta - accumulation scalars: *input = alpha * grad + beta * *input.
 *   input  - output argument; receives the gradient w.r.t. the image.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
                        CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
                        float alpha, float beta, CudaNdarray **input) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }
  int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
  /* In-place variant: the result aliases the im buffer. */
  Py_XDECREF(*input);
  *input = im;
  Py_INCREF(*input);
#else
  if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
    return 1;
  /* beta != 0 means the previous contents of *input contribute to the
     result, so seed the output with im before the accumulating call. */
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
    return 1;
#endif
  /* Degenerate case: empty batch or empty filters give an all-zero
     gradient without calling cuDNN. */
  if (CudaNdarray_DIMS(im)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
    cudaError_t err2 = cudaMemset((*input)->devdata, 0,
                                  CudaNdarray_SIZE(*input) * sizeof(real));
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. inputs could not fill the output with zeros: %s",
                   cudaGetErrorString(err2));
      return 1;
    }
    return 0;
  }
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  /* Sanity check: the received output gradient must have the shape the
     forward convolution would have produced for this input/kerns/desc. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              nb_dim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  if (nb_dim == 4) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
      return 1;
    }
  } else if (nb_dim == 5) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
        (CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)expected_output_dims[4],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[4]);
      return 1;
    }
  }
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdDataAlgo_t chosen_algo;
    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, if in one of the two following cases :
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the kernels and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }
      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the kernel and output
        // shapes. Either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choose the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdDataAlgoPerf_t choosen_algo_perf;
          err = cudnnFindConvolutionBackwardDataAlgorithm(_handle,
                                                          APPLY_SPECIFIC(kerns),
                                                          APPLY_SPECIFIC(output),
                                                          desc,
                                                          APPLY_SPECIFIC(input),
                                                          requestedCount,
                                                          &count,
                                                          &choosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: "
                         "%s", cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = choosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on the
          // shapes of the inputs and the amount of memory available.
          // Get the amount of available memory
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess){
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }
          // Use heuristics to choose the implementation
          err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
                                                         APPLY_SPECIFIC(kerns),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(input),
                                                         CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                                                         free,
                                                         &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }
        // Store the shapes of the kernels and output as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_kerns_shape)[i] =
            CudaNdarray_HOST_DIMS(kerns)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] =
            CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation
        chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }
    /* Debugging aid: flip the 0 to 1 to log which algorithm was picked. */
    if (0){
      char * a;
      switch(chosen_algo){
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
        a = "implicit gemm (0)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
        a = "precomp gemm (1)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
        a = "fft (2)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
        a = "fft tiling (3)";
        break;
#if CUDNN_VERSION > 5000
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
        a = "winograd (4)";
        break;
#endif
      }
      printf("GpuDNNConvGI: algo %s\n", a);
    }
    // The FFT implementation (only in V3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // The tiled-FFT implementation (only in V4 onward) does not support
    // strides.
    // If the chosen implementation is FFT or tiled-FFT, validate that it can
    // be used on the current data and default on a safe implementation if it
    // can't.
    // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
    // defined only for 2d-filters
    if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
         chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradI: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }
      // Extract the spatial size of the filters
      int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
      // Extract the spatial size of the input
      int input_h = CudaNdarray_HOST_DIMS(*input)[2];
      int input_w = CudaNdarray_HOST_DIMS(*input)[3];
      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
      {
        if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
            input_w > 1024 || (filter_h == 1 && filter_w == 1))
        {
          chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        }
      }
      else
      {
        // chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
        if (stride[0] != 1 || stride[1] != 1)
        {
          chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        }
      }
    }
    // Infer required workspace size from the chosen implementation
    err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
                                                       APPLY_SPECIFIC(kerns),
                                                       APPLY_SPECIFIC(output),
                                                       desc,
                                                       APPLY_SPECIFIC(input),
                                                       chosen_algo,
                                                       &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradI: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    // Allocate workspace for the convolution
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;
    // Perform the convolution
    err = cudnnConvolutionBackwardData(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct

/*
 * Gradient of a cuDNN convolution with respect to the filters
 * (wraps cudnnConvolutionBackwardFilter).
 *
 * Parameters:
 *   input  - images used by the forward convolution.
 *   output - gradient of the cost w.r.t. the convolution output.
 *   km     - filter-shaped array; provides the shape of the result and,
 *            when beta != 0, its initial contents.
 *   desc   - cuDNN convolution descriptor (padding, strides, mode).
 *   alpha, beta - accumulation scalars: *kerns = alpha * grad + beta * *kerns.
 *   kerns  - output argument; receives the gradient w.r.t. the filters.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
                        CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
                        float alpha, float beta, CudaNdarray **kerns) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }
  int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
  /* In-place variant: the result aliases the km buffer. */
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
    return 1;
  /* beta != 0 means the previous contents of *kerns contribute to the
     result, so seed the output with km before the accumulating call. */
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
    return 1;
#endif
  /* Degenerate case: empty batch or empty filters give an all-zero
     gradient without calling cuDNN. */
  if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(km)[0] == 0 || CudaNdarray_DIMS(km)[1] == 0) {
    cudaError_t err2 = cudaMemset((*kerns)->devdata, 0,
                                  CudaNdarray_SIZE(*kerns) * sizeof(real));
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. weights could not fill the output with zeros: %s",
                   cudaGetErrorString(err2));
      return 1;
    }
    return 0;
  }
  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  /* Sanity check: the received output gradient must have the shape the
     forward convolution would have produced for this input/kerns/desc. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              nb_dim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  if (nb_dim == 4) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
      /* All varargs are cast to long int, so every conversion must be %ld.
         (This previously used %d for the third field, which is undefined
         behaviour with a long argument and desynchronizes the following
         conversions; conv_gi's message has the correct specifiers.) */
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
      return 1;
    }
  } else if (nb_dim == 5) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
        (CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)expected_output_dims[4],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[4]);
      return 1;
    }
  }
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdFilterAlgo_t chosen_algo;
    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, if in one of the two following cases :
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the inputs and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                          APPLY_SPECIFIC(previous_input_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }
      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the input and output
        // shapes. Either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choose the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdFilterAlgoPerf_t choosen_algo_perf;
          err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle,
                                                            APPLY_SPECIFIC(input),
                                                            APPLY_SPECIFIC(output),
                                                            desc,
                                                            APPLY_SPECIFIC(kerns),
                                                            requestedCount,
                                                            &count,
                                                            &choosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: "
                         "%s", cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = choosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on the
          // shapes of the inputs and the amount of memory available.
          // Get the amount of available memory
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess){
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }
          // Use heuristics to choose the implementation
          err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
                                                           APPLY_SPECIFIC(input),
                                                           APPLY_SPECIFIC(output),
                                                           desc,
                                                           APPLY_SPECIFIC(kerns),
                                                           CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                                                           free,
                                                           &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }
        // Store the shapes of the inputs and kernels as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_input_shape)[i] =
            CudaNdarray_HOST_DIMS(input)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] =
            CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation
        chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }
    /* Debugging aid: flip the 0 to 1 to log which algorithm was picked. */
    if (0){
      char * a;
      switch(chosen_algo){
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
        a = "algo 0 (0)";
        break;
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
        a = "algo 1 (1)";
        break;
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
        a = "fft (2)";
        break;
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
        a = "algo 3 (3)";
        break;
      }
      printf("GpuDNNConvGW: algo %s\n", a);
    }
    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // If the chosen implementation is FFT, validate that it can be used
    // on the current data and default on a safe implementation if it
    // can't.
    if (chosen_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradW: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }
      // Extract the spatial size of the filters
      int filter_h = CudaNdarray_HOST_DIMS(*kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(*kerns)[3];
      // Extract the spatial size of the input
      int input_h = CudaNdarray_HOST_DIMS(input)[2];
      int input_w = CudaNdarray_HOST_DIMS(input)[3];
      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
          input_w > 1024 || (filter_h == 1 && filter_w == 1))
      {
        chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      }
    }
    // Infer required workspace size from the chosen implementation
    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle,
                                                         APPLY_SPECIFIC(input),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(kerns),
                                                         chosen_algo,
                                                         &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradW: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    // Allocate workspace for the convolution
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;
    // Perform the convolution
    err = cudnnConvolutionBackwardFilter(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
"""
This file implements 3 different versions of the elemwise op on the
gpu. Only NaiveAlgo is used, and it is not very naive now.
The elemwise functions are also used with scalar operations, so it can
happen that ndim is 0, as with all scalar types.
"""
from __future__ import absolute_import, print_function, division
import logging
import numpy
from theano.scalar.basic import upgrade_to_float_no_complex, complex_types
from theano.scalar.basic_scipy import Erfinv
from six import StringIO
from six.moves import xrange
from theano import Apply
from theano import gof, scalar
_logger_name = 'theano.sandbox.cuda.elemwise'
_logger = logging.getLogger(_logger_name)
def _logical_scalar(x):
return numpy.all(x.type.broadcastable)
def get_str_list_logical_scalar(node, value_str='ii_i%i_value',
                                data_str='ii_i%i_data[0]'):
    """Return one C expression string per input of *node*: the
    ``value_str`` template for logically-scalar inputs (all dimensions
    broadcastable) and the ``data_str`` template otherwise, each
    formatted with the input's position."""
    return [(value_str if _logical_scalar(inp) else data_str) % pos
            for pos, inp in enumerate(node.inputs)]
class SupportCodeError(Exception):
    """
    Raised when a GPU elemwise implementation cannot be auto-generated
    because the scalar Op defines c_support_code_apply().
    Plain Op.c_support_code is supported.
    """
class NaiveAlgo(object):
"""
Parameters
----------
scalar_op
The scalar operation to execute on each element.
sync
If True, will wait after the kernel launch and check for error call.
"""
verbose = 0 # 1, 2 or 3 for more verbose output.
@property
def cache_version(self):
ver = self.scalar_op.c_code_cache_version()
if ver:
return (20, self.verbose, self.sync, ver)
else:
return ver
def __init__(self, scalar_op, sync=True, inplace_pattern=None):
if inplace_pattern is None:
inplace_pattern = {}
try:
code = scalar_op.c_support_code_apply(None, "nodename")
if code:
raise SupportCodeError(scalar_op)
except gof.utils.MethodNotDefined:
pass
self.scalar_op = scalar_op
self.sync = sync
self.inplace_pattern = inplace_pattern
def c_src_kernel(self, node, nodename, nd):
    """Generate the CUDA source of an nd-dimensional elemwise kernel.

    The kernel receives the element count, the shape, and a
    (data pointer, per-dimension strides) pair for every input and
    output; each thread walks elements with a flat index stride of
    the total thread count, decomposes the flat index into
    per-dimension positions, offsets every pointer accordingly, then
    applies the scalar op's C code.  Returns the source as a string.
    """
    sio = StringIO()
    # print 'C_SRC_KERNEL', sio.getvalue()
    print("// %s" % str(node.op), file=sio)
    print("// node.op.destroy_map=%s" % str(
        getattr(node.op, 'destroy_map', None)), file=sio)
    for ipos, i in enumerate(node.inputs):
        print("// Input ", ipos, str(i.type), file=sio)
    for ipos, i in enumerate(node.outputs):
        print("// Output ", ipos, str(i.type), file=sio)
    print("static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
        self.scalar_op.__class__.__name__, nodename, nd), file=sio)
    if (nd):
        print("\t,", ", ".join("const int dim%i" % i
                               for i in xrange(nd)), file=sio)
    # declare inputs
    for ipos, i in enumerate(node.inputs):
        s = ", ".join(["const float * i%i_data" % ipos] +
                      ["int i%i_str_%i" % (ipos, d) for d in xrange(nd)])
        print("\t,", s, file=sio)
    # declare outputs
    for ipos, i in enumerate(node.outputs):
        s = ", ".join(["float * o%i_data" % ipos] +
                      ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)])
        print("\t,", s, file=sio)
    # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
    # print >> sio, "\t,", "float * o%i_data" % ipos
    print("\t)\n{", file=sio)
    print(" const int idx = blockIdx.x * blockDim.x + threadIdx.x;", file=sio)
    print(" const int numThreads = blockDim.x * gridDim.x;", file=sio)
    # For each input that is a scalar which has been broadcasted to a tensor,
    # load it into a local variable
    for ipos, i in enumerate(node.inputs):
        if _logical_scalar(i):
            print(" const float ii_i%i_value = i%i_data[0];" % (ipos, ipos), file=sio)
    # loop over the elements to be treated by this kernel call
    print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
    # calculate the data pointers for all arguments
    print(" int ii = i;", file=sio)
    for ipos, i in enumerate(node.inputs):
        if not _logical_scalar(i):
            print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio)
    for ipos, i in enumerate(node.outputs):
        print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio)
    # Decompose the flat index from the last (fastest) dimension to the
    # first, advancing each non-scalar pointer by pos * stride.
    for d in xrange(nd - 1, -1, -1):
        if d > 0:
            print(" int pos%i = ii %% dim%i;" % (d, d), file=sio)
            print(" ii = ii / dim%i;" % d, file=sio)
        else:
            print(" int pos%i = ii;" % d, file=sio)
        for ipos, i in enumerate(node.inputs):
            if not _logical_scalar(i):
                print(" ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d), file=sio)
        for ipos, i in enumerate(node.outputs):
            print(" ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d), file=sio)
    # perform the scalar operation on the input and output references
    # TODO: What if the scalar_op needs support_code??
    for ipos, i in enumerate(node.outputs):
        print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
    # The scalar op's C code is generated against a dummy scalar Apply
    # so the per-element expressions plug into the pointers set up above.
    task_code = self.scalar_op.c_code(
        Apply(self.scalar_op,
              [scalar.Scalar(dtype=input.type.dtype).make_variable()
               for input in node.inputs],
              [scalar.Scalar(dtype=output.type.dtype).make_variable()
               for output in node.outputs]),
        nodename + '_scalar_',
        get_str_list_logical_scalar(node),
        ['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
        sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
    print(" ", task_code, file=sio)
    for ipos, _ in enumerate(node.outputs):
        print("ii_o%i_data[0] = o%i_i;" % (ipos, ipos), file=sio)
    print(" }", file=sio)
    # indent = " "*(4*d+7)
    # for ipos, i in enumerate(node.inputs):
    #     print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
    print("}", file=sio)
    # print sio.getvalue()
    return sio.getvalue()
def c_src_kernel_tiling(self, node, nodename):
    """Generate the tiled CUDA source of an elemwise kernel.

    Unlike c_src_kernel, grid blocks cover the two leading dimensions
    and the thread block covers the trailing ones, so no flat-index
    decomposition is needed.  Only nd == 4 is generated; for other
    ranks the returned source is empty.  Returns the source as a
    string.
    """
    # The kernel is intended to be structured roughly like this:
    # (fixed: the outermost loop iterates blockIdx.x / gridDim.x;
    # the previous version of this comment showed blockIdx.y twice)
    """
    static __global__ void kernel()
    {
        for (int v = blockIdx.x; v < dim0; v += gridDim.x)
        {
            for (int w = blockIdx.y; w < dim1; w += gridDim.y)
            {
                for (int x = threadIdx.x; x < dim2; x += blockDim.x)
                {
                    for (int y = threadIdx.y; y < dim3; y += blockDim.y)
                    {
                        for (int z = threadIdx.z; z < dim4; z += blockDim.z)
                        {
                            out[v * out_stride[0] + ...] = f(in1[...], in2[...])
                        }
                    }
                }
            }
        }
    }
    """
    nd = node.outputs[0].type.ndim
    sio = StringIO()
    # print 'C_SRC_KERNEL', sio.getvalue()
    if nd in (4,):
        # print some leading comments to make the code easier to read
        print("// %s" % str(node.op), file=sio)
        print("// node.op.destroy_map=%s" % str(
            getattr(node.op, 'destroy_map', None)), file=sio)
        for ipos, i in enumerate(node.inputs):
            print("// Input ", ipos, str(i.type), file=sio)
        for ipos, i in enumerate(node.outputs):
            print("// Output ", ipos, str(i.type), file=sio)
        print(
            "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
            (self.scalar_op.__class__.__name__,
             nodename,
             'tiling%i' % nd), file=sio)
        if (nd):
            print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
        # declare inputs
        for ipos, i in enumerate(node.inputs):
            s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print("\t,", s, file=sio)
        # declare outputs
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print("\t,", s, file=sio)
        # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
        # print >> sio, "\t,", "float * o%i_data" % ipos
        print("\t)\n{", file=sio)
        # For each input that is a scalar which has been broadcasted to a tensor,
        # load it into a local variable
        print(" __shared__ float value0[%i];" % len(node.inputs), file=sio)
        print(" __shared__ int shared_dims[%(nd)s];" % locals(), file=sio)
        # print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
        print(" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
        for ipos, i in enumerate(node.inputs):
            if _logical_scalar(i):
                print(" value0[%i] = i%i_data[0];" % (ipos, ipos), file=sio)
        for ipos in xrange(nd):
            print(" shared_dims[%i] = dim%i;" % (ipos, ipos), file=sio)
        print(" }", file=sio)
        print(" __syncthreads();", file=sio)
        if (nd == 4):
            print("""
            for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
            {
                for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
                {
                    //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
                    for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
                    {
                        //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
                        for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
                        {
            """, file=sio)
        else:
            raise NotImplementedError()
        for ipos, i in enumerate(node.inputs):
            if not _logical_scalar(i):
                print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio)
        for ipos, i in enumerate(node.outputs):
            print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio)
        for d in xrange(nd):
            for ipos, i in enumerate(node.inputs):
                if not _logical_scalar(i):
                    print(" ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d), file=sio)
            for ipos, i in enumerate(node.outputs):
                print(" ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d), file=sio)
        # perform the scalar operation on the input and output references
        # TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
            Apply(
                self.scalar_op,
                [scalar.Scalar(
                    dtype=input.type.dtype).make_variable()
                 for input in node.inputs],
                [scalar.Scalar(
                    dtype=output.type.dtype).make_variable()
                 for output in node.outputs]),
            nodename + '_scalar_',
            get_str_list_logical_scalar(node, value_str='value0[%i]'),
            ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print(" ", task_code, file=sio)
        print(" }" * nd, file=sio)
        # TODO: insert runtime stride checks that select the best loop order either here, or in
        # the host code that launched the kernel (host code probably better spot)
        # indent = " "*(4*d+7)
        # for ipos, i in enumerate(node.inputs):
        #     print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print("}", file=sio)
    # Debug aid only: dumping the whole generated kernel to stdout on
    # every call pollutes program output, so it is disabled here, like
    # the matching commented-out print in c_src_kernel above.
    # print(sio.getvalue())
    return sio.getvalue()
def c_src_kernel_tiling_less_registers(self, node, nodename):
    """
    Generate CUDA source for a tiled elemwise kernel variant that walks
    pointers (one per argument per dimension) instead of recomputing
    offsets, to lower register pressure.

    The kernel applies to problems with <= 5 dimensions.
    """
    nd = node.outputs[0].type.ndim
    n_in = len(node.inputs)
    n_out = len(node.outputs)
    sio = StringIO()
    # NOTE(review): only nd == 2 gets past this guard, yet the loop-nest
    # emitter below handles only nd == 4, so the generated kernel body is
    # empty for nd == 2.  This experimental path looks unused; confirm.
    if nd not in (2,):
        return sio.getvalue()
    # print some leading comments to make the code easier to read
    print("// %s" % str(node.op), file=sio)
    print("// node.op.destroy_map=%s" % str(
        getattr(node.op, 'destroy_map', None)), file=sio)
    for ipos, i in enumerate(node.inputs):
        print("// Input ", ipos, str(i.type), file=sio)
    for ipos, i in enumerate(node.outputs):
        print("// Output ", ipos, str(i.type), file=sio)
    # Kernel signature: numEls, then per-dimension sizes, then for each
    # argument a base pointer followed by its per-dimension strides.
    print(
        "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
        (self.scalar_op.__class__.__name__,
         nodename,
         'tiling%i_less_registers' % nd), file=sio)
    if (nd):
        print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
    # declare inputs
    for ipos, i in enumerate(node.inputs):
        s = ", ".join(["const float * i%i_data_0" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
        print("\t,", s, file=sio)
    # declare outputs
    for ipos, i in enumerate(node.outputs):
        s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
        print("\t,", s, file=sio)
    # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
    # print >> sio, "\t,", "float * o%i_data" % ipos
    print("\t)\n{", file=sio)
    # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet.
    use_shared_stride = False
    use_shared_limits = False

    def decl_limits(nd):
        # Declare shared storage for the per-dimension end pointers
        # (only in shared-limits mode).
        if use_shared_limits:
            print("__shared__ float * limits[%(nd)s];" % locals(), file=sio)

    def stride(io, p, d):
        # C expression for the stride of argument `p` ('i' or 'o')
        # along dimension `d`.
        if use_shared_stride:
            return "s%s_str[%i][%i]" % (io, p, d)
        else:
            return "%s%i_str_%i" % (io, p, d)

    def limits(d):
        # C lvalue naming the end pointer for dimension `d`.
        if use_shared_limits:
            return "limits[%i]" % d
        else:
            return "limits%i" % d

    def decl_shared_stride(nin, nout, nd):
        # Copy all strides into shared memory (only in shared-stride mode).
        # NOTE(review): the loops below mix the `nin` parameter with the
        # closed-over `n_out` instead of the `nout` parameter; harmless for
        # current callers, but confirm before reusing.
        if not use_shared_stride:
            return
        print("""
__shared__ int si_str[%(nin)s][%(nd)s];
__shared__ int so_str[%(nout)s][%(nd)s];
if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
""" % locals(), file=sio)
        for i in xrange(nin):
            for d in xrange(nd):
                print("si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals(), file=sio)
        for i in xrange(n_out):
            for d in xrange(nd):
                print("so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals(), file=sio)
        print("} __syncthreads();", file=sio)

    def calc_limit(d):
        # Emit the end-pointer for dimension `d`, computed from output 0's
        # stride and size along that dimension.
        s = stride('o', 0, d)
        lname = limits(d)
        if use_shared_limits:
            print("if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
            if d == 0:
                print("%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio)
            else:
                dm1 = d - 1
                print("%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio)
            print("} __syncthreads();", file=sio)
        else:
            if d == 0:
                print("const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio)
            else:
                dm1 = d - 1
                print("const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio)

    def decl_ptrs(d, offset):
        # Declare walking pointers for dimension `d`, derived from the
        # pointers of dimension d-1 plus a thread/block-dependent offset.
        dm1 = d - 1
        assert dm1 >= 0
        for i in xrange(n_in):
            s = stride('i', i, d)
            print("const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio)
        for i in xrange(n_out):
            s = stride('o', i, d)
            print("float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio)

    def inc_ptrs(d, amt):
        # Advance all walking pointers of dimension `d` by `amt` strides.
        for i in xrange(n_in):
            s = stride('i', i, d)
            print("i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio)
        for i in xrange(n_out):
            s = stride('o', i, d)
            print("o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio)

    def while_limit(d):
        # Open the loop over dimension `d`: iterate until output 0's
        # pointer reaches the precomputed end pointer.
        lname = limits(d)
        print("while (o0_data_%(d)s < %(lname)s) { " % locals(), file=sio)

    def end_while(d):
        print("}", file=sio)

    def task_code(d):
        # Inline the scalar op's C code, reading/writing through the
        # innermost walking pointers.
        print(self.scalar_op.c_code(
            Apply(
                self.scalar_op,
                [scalar.Scalar(dtype=input.type.dtype).make_variable()
                 for input in node.inputs],
                [scalar.Scalar(dtype=output.type.dtype).make_variable()
                 for output in node.outputs]),
            nodename + '_scalar_',
            ['i%i_data_%i[0]' % (ipos, d) for ipos,
             i in enumerate(node.inputs)],
            ['o%i_data_%i[0]' % (ipos, d) for ipos,
             i in enumerate(node.outputs)],
            sub=dict(fail='return;')), file=sio)
        # TODO: set a failure code somehow!!!

    # Emit the 4-deep loop nest (blocks over dims 0/1, threads over 2/3).
    if nd == 4:
        decl_shared_stride(n_in, n_out, nd)
        decl_limits(nd)
        calc_limit(0)
        inc_ptrs(0, 'blockIdx.x')
        while_limit(0)
        if 1:
            calc_limit(1)
            decl_ptrs(1, 'blockIdx.y')
            while_limit(1)
            if 1:
                calc_limit(2)
                decl_ptrs(2, 'threadIdx.y')
                while_limit(2)
                if 1:
                    calc_limit(3)
                    decl_ptrs(3, 'threadIdx.x')
                    while_limit(3)
                    if 1:
                        task_code(3)
                        inc_ptrs(3, 'blockDim.x')
                    end_while(3)
                    inc_ptrs(2, 'blockDim.y')
                end_while(2)
                inc_ptrs(1, 'gridDim.y')
            end_while(1)
            inc_ptrs(0, 'gridDim.x')
        end_while(0)
    print("}", file=sio)
    # NOTE(review): debug print of the generated source to stdout --
    # presumably a leftover; confirm before removing.
    print(sio.getvalue())
    return sio.getvalue()
def c_src_kernel_Ccontiguous(self, node, nodename):
    """
    Generate CUDA source for the kernel used when every argument is
    C-contiguous: a single flat loop in which each thread starts at its
    global index and advances by the total number of threads.
    """
    sio = StringIO()
    # print 'C_SRC_KERNEL', sio.getvalue()
    print("// %s" % str(node.op), file=sio)
    print("// node.op.destroy_map=%s" % str(
        getattr(node.op, 'destroy_map', None)), file=sio)
    for ipos, i in enumerate(node.inputs):
        print("// Input ", ipos, str(i.type), file=sio)
    for ipos, i in enumerate(node.outputs):
        print("// Output ", ipos, str(i.type), file=sio)
    # Signature: numEls plus one bare data pointer per argument
    # (no strides needed in the contiguous case).
    print("static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" % (self.scalar_op.__class__.__name__, nodename), file=sio)
    # declare inputs
    for ipos, i in enumerate(node.inputs):
        print("\t,", "const float * i%i_data" % ipos, file=sio)
    # declare outputs
    for ipos, i in enumerate(node.outputs):
        print("\t,", "float * o%i_data" % ipos, file=sio)
    print("\t)\n{", file=sio)
    print(" const int idx = blockIdx.x * blockDim.x + threadIdx.x;", file=sio)
    print(" const int numThreads = blockDim.x * gridDim.x;", file=sio)
    # For each input that is a scalar which has been broadcasted to a tensor,
    # load it into a local variable
    for ipos, i in enumerate(node.inputs):
        if _logical_scalar(i):
            print(" const float ii_i%i_value = i%i_data[0];" % (ipos, ipos), file=sio)
    # loop over the elements to be treated by this kernel call
    print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
    # perform the scalar operation on the input and output references
    # TODO: What if the scalar_op needs support_code??
    # Compute into a local temporary per output, then store once below.
    for ipos, i in enumerate(node.outputs):
        print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
    task_code = self.scalar_op.c_code(
        Apply(
            self.scalar_op,
            [scalar.Scalar(dtype=input.type.dtype).make_variable()
             for input in node.inputs],
            [scalar.Scalar(dtype=output.type.dtype).make_variable()
             for output in node.outputs]),
        nodename + '_scalar_',
        # , ['i%i_data[i]'%ipos for ipos,
        # i in enumerate(node.inputs)]
        get_str_list_logical_scalar(node, data_str='i%i_data[i]'),
        ['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
        sub=dict(fail='return;'))
    # TODO: set a failure code somehow!!!
    print(" ", task_code, file=sio)
    # write the temporaries back to global memory
    for ipos, _ in enumerate(node.outputs):
        print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
    print(" }", file=sio)
    print("}", file=sio)
    # print sio.getvalue()
    return sio.getvalue()
def c_src_callkernel(self, node, nodename):
    """
    Generate the host-side C function that selects and launches a kernel.

    The emitted ``callkernel_<nodename>`` function serves three goals:

    1. Stride unpacking: it accepts input and output arguments as
       ``float *, int *`` pairs and constructs a kernel call where the
       arguments are passed as ``float *, int, int, int ...``.
    2. It recognizes when dimensions can be collapsed as being contiguous
       (or broadcast in every argument), merging such dimensions so the
       kernel does less index arithmetic.
    3. It special-cases scalar elements, which may always be collapsed;
       both the contiguous and strided kernels keep them in registers to
       lower the number of memory accesses.

    TODO: make a special case for broadcasting, to store the data in
    shared memory.

    Bug fixes relative to the historical version (verbose-only paths):
    - ``'% (ipos)s'`` (stray space) made the debug ``local_str`` /
      ``local_ostr`` dumps raise ``ValueError: unsupported format
      character`` -- now ``'%(ipos)s'`` as in the parallel dumps above.
    - ``"nd_collapse_ %s["`` produced an invalid C++ identifier
      (``nd_collapse_ 0[``) -- the variable is declared ``nd_collapse_0``.
    """
    nd = node.outputs[0].type.ndim
    nb_inputs = len(node.inputs)
    nb_outputs = len(node.outputs)
    d = dict()
    # input_params and output_params go into the function declaration/definition
    input_params = ", ".join(
        "const float * i%i_data, const int * i%i_str" % (ipos, ipos)
        for ipos in xrange(len(node.inputs)))
    output_params = ", ".join(
        "float * o%i_data, const int * o%i_str" % (ipos, ipos)
        for ipos in xrange(len(node.outputs)))
    # input_args and output_args go into the recursive call.
    input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
                           for ipos in xrange(len(node.inputs)))
    output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
                            for ipos in xrange(len(node.outputs)))
    prod_dims = '*'.join(
        ["dims[%i]" % di for di in xrange(nd)] + ['1'])
    scalar_op = self.scalar_op.__class__.__name__
    sio = StringIO()
    # Helper: mark dimensions whose stride*size equals the previous
    # dimension's stride (i.e. the pair is jointly contiguous).
    print("""
static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[])
{
//can we collapse dims[i] and dims[i-1]
for(int i=nd-1;i>0;i--){
if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd
collapse[i]=1;
}else collapse[i]=0;
}
}
""" % locals(), file=sio)
    print("""
static int callkernel_%(nodename)s(unsigned int numEls, const int d,
const int * dims,
%(input_params)s,
%(output_params)s)
{
numEls = %(prod_dims)s;
""" % locals(), file=sio)
    if self.verbose:
        print("""
std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n";
""" % locals(), file=sio)
        print(
            'std::cerr << ' + " << ' ' << ".join(
                ['" "'] +
                list("dims[%i]" % di for di in xrange(nd)) +
                ["'\\n';"]),
            file=sio)
        if self.verbose > 1:
            # Dump each argument's base pointer and strides.
            for ipos in xrange(len(node.inputs)):
                istrings = [
                    "i%s_str[%i]" % (ipos, di) for di in xrange(nd)]
                ipositions = " << ' ' << ".join(
                    ["i%s_data" % ipos] + istrings)
                print("""
std::cerr << " %(ipos)s data strides" << %(ipositions)s << "\\n";
""" % dict(ipos=ipos, ipositions=ipositions), file=sio)
            for ipos in xrange(len(node.outputs)):
                print("""
std::cerr << " %(ipos)s data strides" <<
""" % locals() + " << ' ' << ".join(
                    ["o%s_data" % ipos] +
                    list(
                        "o%s_str[%i]" % (ipos, di) for di in xrange(nd)
                    )) +
                    ''' << "\\n"; ''', file=sio)
    # collapse dimension that are broadcast in all inputs.
    # need to be done before contiguous collapse as it will break it.
    # do the dimensions and the strides
    if nd > 0:
        print("int local_dims[%(nd)s];" % locals(), file=sio)
    else:
        print("int *local_dims=NULL;", file=sio)
    if nb_inputs > 0 and nd > 0:
        print("""
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_outputs)s][%(nd)s];
""" % locals(), file=sio)
    else:
        # Dummy 1x1 arrays so the code below still compiles.
        print("""
int local_str[1][1];
int local_ostr[1][1];
""", file=sio)
    print("""
int nd_collapse = %(nd)s;
for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i];
}
""" % locals(), file=sio)
    for ipos in xrange(len(node.inputs)):
        print("""
for(int i=0;i<%(nd)s;i++){//init new strides
local_str[%(ipos)s][i]=i%(ipos)s_str[i];
}
""" % locals(), file=sio)
    for ipos in xrange(len(node.outputs)):
        print("""
for(int i=0;i<%(nd)s;i++){//init new strides
local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
}
""" % locals(), file=sio)
    if self.verbose > 2:
        print('std::cerr <<"before broadcast collapse\\n";', file=sio)
        print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
        print('std::cerr << "local_dims";', file=sio)
        for d in xrange(nd):
            print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio)
        print('std::cerr << "\\n";', file=sio)
        if nd > 0:
            for ipos in xrange(len(node.inputs)):
                print(
                    'std::cerr << " local_str inputs %(ipos)s: " <<' % locals() +
                    ' << " " << '.join(["local_str[%s][%s]" % (ipos, x)
                                        for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
            for ipos in xrange(len(node.outputs)):
                print(
                    'std::cerr << " local_ostr inputs %(ipos)s: " <<' %
                    locals() +
                    ' << " " << '.join(
                        ["local_ostr[%s][%s]" %
                         (ipos, x) for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
    # Remove dimensions of size 1 that are broadcast in every argument.
    print("""
for(int id=0;id<nd_collapse;id++){

bool all_broadcast=true;
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
if(all_broadcast){
for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_str[input_id][j-1]=local_str[input_id][j];
}
}
for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_ostr[output_id][j-1]=local_ostr[output_id][j];
}
}
nd_collapse--; id--;
}
}
""" % locals(), file=sio)
    if self.verbose > 2:
        print('std::cerr <<"after broadcast collapse\\n";', file=sio)
        print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
        print('std::cerr << "local_dims";', file=sio)
        for d in xrange(nd):
            print('std::cerr << " " << local_dims[%(d)s]; ' %
                  locals(), file=sio)
        print('std::cerr << "\\n";', file=sio)
        if nd > 0:
            for ipos in xrange(len(node.inputs)):
                print('std::cerr << " local_str %(ipos)s: " <<' %
                      locals() + ' << " " << '.join(
                          ["local_str[%s][%s]" %
                           (ipos, x) for x in xrange(nd)]) +
                      '<<"\\n";', file=sio)
            for ipos in xrange(len(node.outputs)):
                print(
                    'std::cerr << " local_ostr %(ipos)s: " <<' %
                    locals() + ' << " " << '.join(
                        ["local_ostr[%s][%s]" %
                         (ipos, x) for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
    # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
    # this is a good idea because we make less index calculation in the gpu.
    if nd > 0:
        print("int nd_collapse_[%(nd)s] = {" %
              locals() + ','.join(
                  ['1' for x in xrange(nd)]) + "};", file=sio)
    else:
        print("int *nd_collapse_ = NULL;", file=sio)
    for ipos in xrange(len(node.inputs)):
        if not _logical_scalar(node.inputs[ipos]):
            if nd > 0:
                print("""
int nd_collapse_%(ipos)s[%(nd)s] = {""" %
                      locals() +
                      ','.join(['1' for x in xrange(nd)]) +
                      "};", file=sio)
            else:
                print("""
int * nd_collapse_%(ipos)s = NULL;""" %
                      locals(), file=sio)
            # A dimension stays collapsible only if it is collapsible in
            # every non-scalar input.
            print("""
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){
if(nd_collapse_%(ipos)s[i]==0)
nd_collapse_[i]=0;
}
""" % locals(), file=sio)
            if self.verbose > 1:
                print("""
std::cerr<< "nd_collapse_%(ipos)s "<<
""" % locals(), file=sio)
                # BUG FIX: was "nd_collapse_ %s[" (stray space), which
                # emitted the invalid C++ identifier "nd_collapse_ 0[".
                print(' << " " << '.join(["nd_collapse_%s[" %
                                          ipos + str(i) + "]" for i in xrange(nd)]),
                      file=sio)
                print('<< "\\n";', file=sio)
    # update the local stride.
    for ipos in xrange(len(node.inputs)):
        print("""
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
}
}
""" % locals(), file=sio)
    for ipos in xrange(len(node.outputs)):
        print("""
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
}
}
""" % locals(), file=sio)
    # update the local dims.
    print("""
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_dims[i-1]*=local_dims[i];//set new dims
for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
}
}
""" % locals(), file=sio)
    # update the new number of dim
    print("""
for(int i=1, end=nd_collapse;i<end;i++){
if(nd_collapse_[i]==1)nd_collapse--;
}
if(nd_collapse == 1 """ % locals(), file=sio)
    # When everything collapsed to one contiguous dimension, use the
    # C-contiguous kernel (nd_collapse==0 selects it in the switch below).
    l = ["local_str[%s][nd_collapse-1]==1 " %
         ipos for ipos in xrange(len(node.inputs)) if not
         _logical_scalar(node.inputs[ipos])]
    l += ["local_ostr[%s][nd_collapse-1]==1 " %
          ipos for ipos in xrange(len(node.outputs)) if not
          _logical_scalar(node.outputs[ipos])]
    if len(l) > 0:
        print(" && ", " && ".join(l), file=sio)
    print("""){nd_collapse=0;} """, file=sio)
    if self.verbose:
        print('std::cerr <<"after can_collapse\\n";', file=sio)
        print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio)
    if self.verbose > 1:
        for d in xrange(nd):
            print('std::cerr << " " << local_dims[%(d)s]; ' %
                  locals(),
                  file=sio)
        print('std::cerr << "\\n";', file=sio)
        if nd > 0:
            for ipos in xrange(len(node.inputs)):
                # BUG FIX: was '% (ipos)s' (stray space) which raised
                # ValueError: unsupported format character at codegen time.
                print(
                    'std::cerr << " local_str %(ipos)s: " <<' %
                    locals() + ' << " " << '.join(
                        ["local_str[%s][%s]" %
                         (ipos, x) for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
            for ipos in xrange(len(node.outputs)):
                # BUG FIX: same stray-space format bug as above.
                print('std::cerr << " local_ostr %(ipos)s: " <<' %
                      locals() + ' << " " << '.join(
                          ["local_ostr[%s][%s]" %
                           (ipos, x) for x in xrange(nd)]) +
                      '<<"\\n";', file=sio)

    def launch_Ccontiguous(nodename, scalar_op, sync=True):
        # Emit the launch + error check for the flat C-contiguous kernel.
        kernel_call_args = ["numEls"]
        for ipos in xrange(len(node.inputs)):
            kernel_call_args.append("i%i_data" % ipos)
        for ipos in xrange(len(node.outputs)):
            kernel_call_args.append("o%i_data" % ipos)
        kernel_call_args = ", ".join(kernel_call_args)
        verb = ""
        if self.verbose:
            verb = 'std::cerr << " Running ccontiguous version\\n";'
        print("""
//first use at least a full warp
int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
//next start adding multiprocessors
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
// next start adding more warps per multiprocessor
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//std::cerr << "calling callkernel returned\\n";
""" % locals(), file=sio)
        if sync:
            print("""
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
"GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
n_blocks, threads_per_block,
"kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
return -1;
}
%(verb)s
return 0;
""" % locals(), file=sio)
        else:
            print(" return 0; " % locals(), file=sio)

    def launch_General(nodename, scalar_op, force_nd, sync=True):
        # Emit the launch + error check for the strided kernel of rank
        # `force_nd`, passing the collapsed dims and strides.
        # kernel_call_args are used to invoke the cuda kernel
        local = "local_"
        kernel_call_args = ["numEls"]
        kernel_call_args.extend(
            local + "dims[%i]" %
            di for di in xrange(force_nd))
        for ipos in xrange(len(node.inputs)):
            kernel_call_args += ["i%i_data" % ipos] + list(
                local + "str[%i][%i]" %
                (ipos, di) for di in xrange(force_nd))
            # strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
            # kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
        for ipos in xrange(len(node.outputs)):
            kernel_call_args += ["o%i_data" % ipos] + list(
                local + "ostr[%i][%i]" %
                (ipos, di) for di in xrange(force_nd))
            # strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
            # kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
        if self.verbose:
            print("""
std::cerr << " Running general version with %(force_nd)s dims\\n";
""" % locals(), file=sio)
            print("std::cerr << " + ' << " " << '.join(
                kernel_call_args) + ' << "\\n";', file=sio)
            # std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;
        kernel_call_args = ", ".join(kernel_call_args)
        print("""
//first use at least a full warp
int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
//next start adding multiprocessors
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
// next start adding more warps per multiprocessor
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
""" % locals(), file=sio)
        if sync:
            print("""
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
"GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
n_blocks, threads_per_block,
"kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
return -1;
}
return 0;
""" % locals(), file=sio)
        else:
            print(" return 0; " % locals(), file=sio)
    # Dispatch on the collapsed rank: 0 selects the contiguous kernel.
    print("if(numEls==0) return 0;", file=sio)
    print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" %
          locals(), file=sio)
    print("case 0: {", file=sio)
    launch_Ccontiguous(nodename, scalar_op, self.sync)
    print(" } break;", file=sio)
    for i in xrange(1, nd + 1):
        print("case " + str(i) + ": {", file=sio)
        launch_General(nodename, scalar_op, i, self.sync)
        print(" } break;", file=sio)
    print("}", file=sio)  # end case
    print("return -2;", file=sio)  # should not get to this point
    print("}", file=sio)  # end fct
    # N.B. cudaGetLastError is called by c_code
    return sio.getvalue()
def c_support_code_apply(self, node, nodename):
    """
    Return the per-apply C support code: two helper #defines followed by
    every kernel variant (one per rank from 1 to the output's rank, the
    C-contiguous special case, and the host-side dispatcher).
    """
    ndim = node.outputs[0].type.ndim
    defines = """
#define INTDIV_POW2(a, b) (a >> b)
#define INTMOD_POW2(a, b) (a & ((1<<b)-1))
"""
    pieces = [self.c_src_kernel(node, nodename, rank)
              for rank in xrange(1, ndim + 1)]
    pieces.append(self.c_src_kernel_Ccontiguous(node, nodename))
    pieces.append(self.c_src_callkernel(node, nodename))
    return defines + "".join(pieces)
def c_support_code(self):
    """The elemwise op needs exactly the scalar op's support code."""
    scalar_support = self.scalar_op.c_support_code()
    return scalar_support
def c_code(self, node, nodename, inputs, outputs, sub):
    """
    Emit host-side C code that validates input shapes, allocates or
    reuses the outputs (honoring ``self.inplace_pattern``), and invokes
    ``callkernel_<nodename>`` (generated by ``c_src_callkernel``).
    """
    d = dict(sub)
    nd = node.outputs[0].type.ndim
    d.update(locals())
    sio = StringIO()
    nin = len(inputs)
    nout = len(outputs)
    fail = sub['fail']
    opname = str(self.scalar_op)
    initial_dims = ','.join('1' for i in xrange(nd))
    # NOTE(review): the `if 1 or ...` guard is always true -- presumably a
    # leftover debugging switch.
    if 1 or self.scalar_op == scalar.pow:
        print("""
//std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks
""" % locals(), file=sio)
    # dims[] accumulates the broadcasted output shape, starting at all 1s.
    if nd > 0:
        print("""
int dims[%(nd)s] = {%(initial_dims)s};
""" % locals(), file=sio)
    else:
        print("""
int *dims = NULL;
""", file=sio)
    # check that all inputs have valid dimensions
    emitted_inames = {}
    for id, iname in enumerate(inputs):
        if iname in emitted_inames:
            assert emitted_inames[iname] is node.inputs[id]
            continue
        # with python 2.4 (at least), if a broadcastable pattern is made of
        # numpy.bool_ instead of bool, calling int() once is not enough.
        broadcasts = map(int, map(int, node.inputs[id].broadcastable))
        broadcasts = ', '.join(map(str, broadcasts))
        nd = node.inputs[id].ndim
        if nd > 0:
            print("""
int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
""" % locals(), file=sio)
        else:
            print("""
int *broadcasts_%(iname)s = NULL;
""" % locals(), file=sio)
        emitted_inames[iname] = node.inputs[id]
    # check that all inputs have valid dimensions
    emitted_inames = {}
    for id, iname in enumerate(inputs):
        if iname in emitted_inames:
            continue
        # Merge each input's shape into dims[], rejecting mismatches that
        # are not explained by broadcasting.
        print("""
//std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != %(iname)s->nd)
{
PyErr_Format(PyExc_TypeError,
"need %(nd)s dims, not %%i", %(iname)s->nd);
%(fail)s;
}
for (int i = 0; i< %(nd)s; ++i)
{
dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
if ((!(broadcasts_%(iname)s[i] &&
CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) &&
(dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
{
//std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Input dimension mis-match. Input"
" %(id)d (indices start at 0) has shape[%%i] == %%i"
", but the output's size on that axis is %%i.",
i,
CudaNdarray_HOST_DIMS(%(iname)s)[i],
dims[i]
);
%(fail)s;
}
}
""" % locals(), file=sio)
        emitted_inames[iname] = True
    # check that all outputs have valid dimensions
    for idx, oname in enumerate(outputs):
        if idx not in self.inplace_pattern.keys():
            # Not inplace: reuse a pre-existing output only if its shape
            # matches and it is C-contiguous; otherwise allocate afresh.
            print("""
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{
Py_DECREF(%(oname)s);
%(oname)s = NULL;
}
}
if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s))
{
Py_XDECREF(%(oname)s);
%(oname)s = NULL;
}
if (NULL == %(oname)s)
{
%(oname)s = (CudaNdarray*)CudaNdarray_New();
if (!%(oname)s)
{
//error string already set
%(fail)s;
}
if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
{
//error string already set
Py_DECREF(%(oname)s);
%(oname)s = NULL;
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals(), file=sio)
        else:
            # Inplace: the output aliases the designated input; its shape
            # must already match the broadcasted dims.
            input_idx = self.inplace_pattern[idx]
            iname = inputs[input_idx]
            print("""
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_INCREF(%(oname)s);
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Output dimension mis-match. Output"
" %(idx)d (indices start at 0), working inplace"
" on input %(input_idx)s, has shape[%%i] == %%i"
", but the output's size on that axis is %%i.",
i,
CudaNdarray_HOST_DIMS(%(oname)s)[i],
dims[i]
);
Py_DECREF(%(oname)s);
%(oname)s = NULL;
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals(), file=sio)
    # Finally, call the dispatcher with every argument's device pointer
    # and host-side strides; on failure, drop the outputs and fail.
    print("""
{
//new block so that failure gotos don't skip over variable initialization
//std::cerr << "calling callkernel\\n";
if (callkernel_%(nodename)s(1, 0, dims
""" % locals(), file=sio)
    for iname in inputs:
        print("""
, CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
""" % locals(), file=sio)
    for oname in outputs:
        print("""
, CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
""" % locals(), file=sio)
    print("""
))
{
// error
""", file=sio)
    for oname in outputs:
        print("""
Py_DECREF(%(oname)s);
%(oname)s = NULL;
""" % locals(), file=sio)
    print("""
%(fail)s;
}
else // no error
{
}
}
//std::cerr << "C_CODE %(opname)s END\\n";
""" % locals(), file=sio)
    # print sio.getvalue()
    return sio.getvalue()
class ErfinvGPU(Erfinv):
    """
    Provides a c-code implementation of the inverse error function for GPU.

    Notes
    -----
    We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
    currently rely on Nvidia's cublas library to provide the erfinv
    c-implementation (which requires different c_headers). As it stands,
    theano.scalar.basic_scipy.Erfinv does not have c_code as scipy does not
    export the required C function.
    """
    def c_headers(self):
        # erfinv() comes from the CUDA math/cublas headers.
        return ['math_functions.h', 'cublas_v2.h']

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
        if node.inputs[0].type in complex_types:
            # BUG FIX: the error used to pass the `type` builtin instead of
            # the offending input type.
            raise NotImplementedError('type not supported',
                                      node.inputs[0].type)
        return "%(z)s = erfinv(%(x)s);" % locals()
erfinv_gpu = ErfinvGPU(upgrade_to_float_no_complex, name='erfinv_gpu')
# NOTE(review): this class derives from Erfinv, not Erfcx, so any behavior
# inherited from the base (e.g. the Python-side implementation) is erfinv's
# while the GPU c_code computes erfcx.  This looks like a copy-paste slip --
# it should presumably derive from theano.scalar.basic_scipy.Erfcx, but that
# name is not visibly in scope here, so only flagging it.
class ErfcxGPU(Erfinv):
    """
    Provides a c-code implementation of the scaled complementary error function
    for GPU.

    Notes
    -----
    We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we
    currently rely on Nvidia's cublas library to provide the erfcx
    c-implementation (which requires different c_headers). As it stands,
    theano.scalar.basic_scipy.Erfcx does not have c_code as scipy does not
    export the required C function.
    """
    def c_headers(self):
        # erfcx() comes from the CUDA math/cublas headers.
        return ['math_functions.h', 'cublas_v2.h']

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
        if node.inputs[0].type in complex_types:
            # BUG FIX: the error used to pass the `type` builtin instead of
            # the offending input type.
            raise NotImplementedError('type not supported',
                                      node.inputs[0].type)
        return "%(z)s = erfcx(%(x)s);" % locals()
erfcx_gpu = ErfcxGPU(upgrade_to_float_no_complex, name='erfcx_gpu')
from __future__ import absolute_import, print_function, division
import theano
import copy
from theano import Op
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.sandbox.cuda.basic_ops import gpu_flatten
from theano.tensor.extra_ops import CumOp
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host, HostFromGpu
from theano.sandbox.cuda import register_opt as register_gpu_opt
class GpuCumsum(CumOp, GpuOp):
"""
Parameters
----------
axis
Can not be None. If you want the array flatten, do it before.
"""
# Highest input rank this op supports (1D, 2D and 3D arrays).
SUPPORTED_NDIMS = 3
__props__ = ('axis', 'max_threads_dim0', 'max_grid_size1', 'max_grid_size2')

def __init__(self, axis):
    """axis: int -- the axis to accumulate over (never None)."""
    self.axis = axis
    # Device launch limits; left unset here and filled in lazily by
    # make_thunk from the active CUDA device's properties.
    self.max_threads_dim0 = None
    self.max_grid_size1 = None
    self.max_grid_size2 = None
# We must reuse the same method, not reimplement and call it.
# Otherwise DebugMode will print many warnings.
perform = Op.perform
def make_node(self, x):
    """
    Build the Apply node for a GPU cumulative sum of `x` over ``self.axis``.

    Raises
    ------
    TypeError
        If `x` is not a CudaNdarrayType variable.
    NotImplementedError
        If `x` has more than SUPPORTED_NDIMS dimensions.
    ValueError
        If ``self.axis`` is out of bounds for `x`.
    """
    assert x.dtype == 'float32'
    if not isinstance(x.type, CudaNdarrayType):
        raise TypeError('x must be a CudaNdarrayType', x)
    if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
        raise NotImplementedError('Only cumsum on 1D, 2D and 3D array are supported right now!')
    if self.axis >= x.ndim or self.axis < -x.ndim:
        # BUG FIX: the message used field '{1}' with a single positional
        # argument, so this raise itself crashed with an IndexError
        # instead of reporting the bad axis.
        raise ValueError('axis(={0}) out of bounds'.format(self.axis))
    return theano.Apply(self, [x], [x.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
    """
    Compile the node, first filling in the device launch limits
    (max threads / max grid sizes) from the active CUDA device when they
    are not known yet.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op  # copy.copy is shallow: the op is shared
    if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None:
        cuda = theano.sandbox.cuda
        device_id = cuda.use.device_number
        if device_id is None:
            # No device selected yet: initialize CUDA just enough to be
            # able to query device properties, without moving any
            # computation or shared variables to the GPU.
            cuda.use("gpu",
                     force=False,
                     default_to_move_computation_to_gpu=False,
                     move_shared_float32_to_gpu=False,
                     enable_cuda=False,
                     test_driver=True)
            device_id = cuda.use.device_number
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        # Cache the limits on the op so later compilations skip the query.
        node_.op.max_threads_dim0 = prop['maxThreadsDim0']
        node_.op.max_grid_size1 = prop['maxGridSize1']
        node_.op.max_grid_size2 = prop['maxGridSize2']
    return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
                                                 compute_map, no_recycling, impl)
def __str__(self):
    """Render as e.g. ``GpuCumsum{0}`` (class name plus axis)."""
    return "{0}{{{1}}}".format(self.__class__.__name__, self.axis)
def c_code_cache_version(self):
    """Bump this tuple whenever the generated C code changes."""
    version = (9,)
    return version
    def c_support_code_apply(self, node, nodename):
        """
        Return the CUDA support code (device kernels plus a host-side
        driver) implementing the cumulative sum.

        The generated code contains a work-efficient parallel scan
        (reduction phase then reverse phase over a shared-memory buffer),
        per-block partial sums that are themselves scanned recursively by
        the host function ``cumSum_<nodename>``, a fix-up kernel
        propagating block sums, and a small kernel handling the last
        element when the scanned length is odd.
        """
        return """
        __device__
        void k_reductionPhase_%(nodename)s(float* partialCumSum) {
            // Traverse down from leaves to root building partial sums at internal nodes in the tree.
            for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                __syncthreads();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index < blockDim.x*2) {
                    partialCumSum[index] += partialCumSum[index - stride];
                }
            }
        }

        __device__
        void k_reversePhase_%(nodename)s(float* partialCumSum) {
            // Traverse back up the tree building the scan from the partial sums
            for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                __syncthreads();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index + stride < blockDim.x*2) {
                    partialCumSum[index + stride] += partialCumSum[index];
                }
            }
        }

        __device__
        void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ) {
            // blockIdx.y and blockIdx.z represents the current independent cumsum
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides.y + idZ * dataStrides.z;
            int idx_even = (globalThreadID*2    ) * dataStrides.x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides.x + offset;
            partialCumSum[threadIdx.x*2]     = input[idx_even];
            partialCumSum[threadIdx.x*2 + 1] = input[idx_odd];
        }

        __device__
        void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ) {
            __syncthreads();
            // blockIdx.y and blockIdx.z represents the current independent cumsum
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides.y + idZ * dataStrides.z;
            int idx_even = (globalThreadID*2    ) * dataStrides.x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides.x + offset;
            output[idx_even] = partialCumSum[threadIdx.x*2];
            output[idx_odd]  = partialCumSum[threadIdx.x*2 + 1];
        }

        __global__
        void k_cumadd_%(nodename)s(float* input, float* output, dim3 inputStrides, dim3 outputStrides, int offsetY, int offsetZ, int beforeLastElementIdx, int lastElementIdx) {
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int dataOffsetY_input = idY * inputStrides.y + idZ * inputStrides.z;
            int dataOffsetY_output = idY * outputStrides.y + idZ * outputStrides.z;
            int idx_last_input = lastElementIdx*inputStrides.x + dataOffsetY_input;
            int idx_last_output = lastElementIdx*outputStrides.x + dataOffsetY_output;
            int idx_beforelast = beforeLastElementIdx*outputStrides.x + dataOffsetY_output;
            output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
        }

        __global__
        void k_finalCumSum_%(nodename)s(float* output, float* blockSum, int nbElementsPerCumsum, dim3 dataStrides, int offsetY, int offsetZ) {
            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

            // Check if current has data to process.
            if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
                return;
            }

            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];

            int offset = idY * dataStrides.y + idZ * dataStrides.z;
            int idx_even = (globalThreadID*2    ) * dataStrides.x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides.x + offset;
            output[idx_even] += currentBlockSum;
            output[idx_odd] += currentBlockSum;
        }

        __global__
        void k_blockCumSum_%(nodename)s(float* input, float* output, int nbElementsPerCumsum, dim3 inputStrides, dim3 outputStrides, int offsetY, int offsetZ, float* blockSum) {
            // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
            // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.

            int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;

            // Check if current thread has data to process.
            if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
                return;
            }

            extern __shared__ float partialCumSum[];

            // Load data in shared memory
            k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides, offsetY, offsetZ);

            // Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
            // The tree is sweeped from the leaves to the root and from the root to the leaves.
            // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
            k_reductionPhase_%(nodename)s(partialCumSum);
            k_reversePhase_%(nodename)s(partialCumSum);

            // Write the final output to global memory
            k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides, offsetY, offsetZ);

            if (blockSum != NULL){
                if (threadIdx.x == blockDim.x - 1) {
                    blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumSum[threadIdx.x*2 + 1];
                }
            }
        }

        int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int axis, int maxThreads, int maxGridY, int maxGridZ) {
            int shape[3] = { 1, 1, 1 };
            dim3 inputStrides(0, 0, 0);
            dim3 outputStrides(0, 0, 0);

            switch (CudaNdarray_NDIM(input))
            {
                case 1:
                    shape[0] = CudaNdarray_HOST_DIMS(input)[0];
                    inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
                    outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
                    break;
                case 2:
                    shape[0] = CudaNdarray_HOST_DIMS(input)[0];
                    shape[1] = CudaNdarray_HOST_DIMS(input)[1];
                    inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
                    inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
                    outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
                    outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1];
                    break;
                case 3:
                    shape[0] = CudaNdarray_HOST_DIMS(input)[0];
                    shape[1] = CudaNdarray_HOST_DIMS(input)[1];
                    shape[2] = CudaNdarray_HOST_DIMS(input)[2];
                    inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
                    inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
                    inputStrides.z = CudaNdarray_HOST_STRIDES(input)[2];
                    outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
                    outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1];
                    outputStrides.z = CudaNdarray_HOST_STRIDES(output)[2];
                    break;
                default:
                    return -1;
            }

            if (shape[axis] <= 1) {
                CudaNdarray_CopyFromCudaNdarray(output, input);
                return 0;
            }

            // Perform cumsum on array of even size.
            int nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);

            // Determine how many elements can be processed in one block.
            int dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);

            // Determine how many blocks are needed in total.
            int dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
            int dimGridY;  // Nb. of independent cumsums (width).
            int dimGridZ;  // Nb. of independent cumsums (height).

            int tmp;
            switch (axis)
            {
                case 0:
                    dimGridY = shape[1];
                    dimGridZ = shape[2];
                    break;
                case 1:
                    dimGridY = shape[0];
                    dimGridZ = shape[2];

                    tmp = inputStrides.x;
                    inputStrides.x = inputStrides.y;
                    inputStrides.y = tmp;

                    tmp = outputStrides.x;
                    outputStrides.x = outputStrides.y;
                    outputStrides.y = tmp;
                    break;
                case 2:
                    dimGridY = shape[1];
                    dimGridZ = shape[0];

                    tmp = inputStrides.x;
                    inputStrides.x = inputStrides.z;
                    inputStrides.z = tmp;

                    tmp = outputStrides.x;
                    outputStrides.x = outputStrides.z;
                    outputStrides.z = tmp;
                    break;
                default:
                    return -1;
            }

            const int shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
            CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);

            // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
            for (int offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
                int localDimGridY = min(dimGridY - offsetY, maxGridY);
                for (int offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
                    int localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
                    dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ);
                    dim3 dimBlock(dimBlockX, 1, 1);  // One cumsum per block.
                    int sharedBytes = (2*dimBlockX) * sizeof(float);

                    k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
                    (
                        CudaNdarray_DEV_DATA(input),
                        CudaNdarray_DEV_DATA(output),
                        nbElementsPerCumsum,
                        inputStrides,
                        outputStrides,
                        offsetY,
                        offsetZ,
                        CudaNdarray_DEV_DATA(deviceBlockSum)
                    );

                    if (dimGridX > 1) {
                        // Do a cumsum over the blockSum (recursive).
                        if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
                            Py_DECREF(deviceBlockSum);
                            return -1;
                        }

                        // Since there are more than one block (i.e. `dimGridX > 1`)
                        //  report partial cumsums of previous blocks to subsequents ones.
                        dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ);
                        dim3 dimBlock(dimBlockX, 1, 1);
                        k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
                        (
                            CudaNdarray_DEV_DATA(output),
                            CudaNdarray_DEV_DATA(deviceBlockSum),
                            nbElementsPerCumsum,
                            outputStrides,
                            offsetY,
                            offsetZ
                        );
                    }

                    // If shape[axis] is odd, the last element is compute manually
                    if (shape[axis] != nbElementsPerCumsum){
                        dim3 dimGrid(1, localDimGridY, localDimGridZ);
                        dim3 dimBlock(1, 1, 1);
                        k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
                        (
                            CudaNdarray_DEV_DATA(input),
                            CudaNdarray_DEV_DATA(output),
                            inputStrides,
                            outputStrides,
                            offsetY,
                            offsetZ,
                            shape[axis]-2,
                            shape[axis]-1
                        );
                    }
                }
            }
            Py_DECREF(deviceBlockSum);
            CNDA_THREAD_SYNC;
            return 0;
        }
        """ % locals()
    def c_code(self, node, nodename, inames, onames, sub):
        """
        Return the C code applying the op: (re)allocate the output if
        needed, then call the host driver ``cumSum_<nodename>`` emitted by
        ``c_support_code_apply``.

        Raises
        ------
        NotImplementedError
            If the GPU limits were not filled in by ``make_thunk``.
        """
        x, = inames
        z, = onames
        # We assume array has been already flattened if needed.
        axis = self.axis if self.axis is not None else 0
        fail = sub['fail']

        max_threads_dim0 = self.max_threads_dim0
        max_grid_size1 = self.max_grid_size1
        max_grid_size2 = self.max_grid_size2
        if max_threads_dim0 is None or max_grid_size1 is None or max_grid_size2 is None:
            raise NotImplementedError("GpuCumsum.c_code should not be called "
                                      "directly. It should be called by "
                                      "make_thunk() that add some information "
                                      "related to the selected GPU.")

        code = """
        const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
        bool needAllocation = !%(z)s || CudaNdarray_NDIM(%(x)s) != CudaNdarray_NDIM(%(z)s);

        int axis = %(axis)s;
        if (axis < 0) {
            // Convert negative axis to positive axis.
            axis += CudaNdarray_NDIM(%(x)s);
        }

        // If output is already allocated, check if its shape matches the input's one.
        if (!needAllocation) {
            for (int i= 0; i < CudaNdarray_NDIM(%(x)s); ++i) {
                if (CudaNdarray_HOST_DIMS(%(x)s)[i] != CudaNdarray_HOST_DIMS(%(z)s)[i]) {
                    needAllocation = true;
                }
            }
        }

        if (needAllocation){
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*) CudaNdarray_NewDims(CudaNdarray_NDIM(%(x)s), shape);
        }

        if (!%(z)s) {
            %(fail)s;
        }

        { // Namespace for kernel calls //
            if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, %(max_threads_dim0)s, %(max_grid_size1)s, %(max_grid_size2)s) == -1){
                %(fail)s;
            }

            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %%s: %%s.\\n",
                             "cumSum_%(nodename)s",
                             cudaGetErrorString(sts));
                %(fail)s;
            }
        }
        """ % locals()

        return code
def values_eq_approx_high_tol(a, b):
    """
    Compare `a` and `b` approximately, loosening the tolerance for large
    arrays.

    Needed so DebugMode does not raise spurious errors from rounding
    differences: with big inputs the GPU cumsum changes the order of the
    additions.
    """
    # For float32 the default rtol is 1e-5; relax it for big inputs.
    rtol = 5e-5 if a.size > 100000 else None
    return CudaNdarrayType.values_eq_approx(a, b, rtol=rtol)
@register_gpu_opt()
@local_optimizer([CumOp])
def use_gpu_cumsum(node):
    """
    Local optimizer: replace a float32 CumOp (mode 'add') whose input is
    a transfer from the GPU with a GpuCumsum, keeping the result on host.
    """
    if type(node.op) is not CumOp:
        return None
    if node.inputs[0].dtype != 'float32':
        return None
    owner = node.inputs[0].owner
    if not (owner and isinstance(owner.op, HostFromGpu)):
        return None
    if node.op.mode != 'add':
        return None

    axis = node.op.axis
    x = node.inputs[0]
    if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
        return None

    x = gpu_from_host(x)

    if axis is None and x.ndim > 1:
        # GpuCumsum assumes the array has already been flattened when
        # cumsumming over all elements.
        x = gpu_flatten(x)
    if axis is None:
        axis = 0

    ret = host_from_gpu(GpuCumsum(axis)(x))
    # Loosen DebugMode's comparison: GPU summation order differs.
    ret.tag.values_eq_approx = values_eq_approx_high_tol
    return [ret]
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
import theano.tensor as T
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.ifelse import ifelse
if cuda_available:
from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
CudaNdarray)
if pycuda_available:
import pycuda.gpuarray
try:
import scikits.cuda
from scikits.cuda import fft, cublas
scikits.cuda.misc.init()
scikits_cuda_available = True
except (ImportError, Exception):
scikits_cuda_available = False
# TODO: investigate the effect of enabling fastmath on FFT performance
# (how can it be enabled?).
# base class for shared code between scikits.cuda-based ops
class ScikitsCudaOp(GpuOp):
    """
    Base class with the boilerplate shared by the scikits.cuda-based
    ops below (equality by type, node construction, availability check).
    """

    def __eq__(self, other):
        # Ops of the same class are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return type(self).__name__

    def output_type(self, inp):
        # Subclasses describe their output variable type here.
        raise NotImplementedError

    def make_node(self, inp):
        inp = basic_ops.gpu_contiguous(
            basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        return theano.Apply(self, [inp], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # Fail loudly at thunk-creation time when scikits.cuda is absent.
        if not scikits_cuda_available:
            raise RuntimeError(
                "scikits.cuda is needed for all GPU fft implementation,"
                " including fftconv.")
class CuFFTOp(ScikitsCudaOp):
    """
    Real-to-complex FFT on the GPU via scikits.cuda, batched over the
    first dimension. The complex output is stored as float32 with an
    extra trailing dimension of length 2 (real/imag).
    """

    def output_type(self, inp):
        # add one extra dim for real/imag
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim + 1))

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # The base-class call only performs the scikits.cuda
        # availability check (it raises if unavailable).
        super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # One-element lists used as mutable cells so the thunk closure
        # can cache the FFT plan across calls.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = list(input_shape)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
class CuIFFTOp(ScikitsCudaOp):
    """
    Complex-to-real inverse FFT on the GPU via scikits.cuda, batched
    over the first dimension. The complex input is float32 with a
    trailing length-2 real/imag dimension; the output drops it.

    NOTE: the result is NOT rescaled by 1/N here — callers must rescale
    manually (see the comment before the return).
    """

    def output_type(self, inp):
        # remove extra real/imag dim
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim - 1))

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # The base-class call only performs the scikits.cuda
        # availability check (it raises if unavailable).
        super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # Mutable cells caching the FFT plan across thunk calls.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
            output_shape[-1] = (output_shape[-1] - 1) * 2
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:], np.complex64, np.float32,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
            # strangely enough, enabling rescaling here makes it run
            # very, very slowly. so do this rescaling manually
            # afterwards!

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
def to_complex_gpuarray(x, copyif=False):
    """
    Adapted version of theano.misc.pycuda_utils.to_gpuarray that takes
    an array with an extra trailing dimension of length 2 for
    real/imaginary parts, and turns it into a complex64 PyCUDA
    GPUArray.

    Parameters
    ----------
    x : CudaNdarray
        Float32 array whose last dimension has length 2.
    copyif : bool
        When True, non-contiguous input is copied; when False it raises.
    """
    if not isinstance(x, CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray "
                         "to pycuda.gpuarray.GPUArray")
    # The trailing dimension holds the real/imag pair.
    assert x.shape[-1] == 2
    assert x.dtype == 'float32'

    # Walk the strides from the innermost axis outwards to decide
    # C-contiguity; axes of length 1 may carry arbitrary strides.
    expected = 1
    contiguous = True
    for axis in reversed(range(x.ndim)):
        if x.shape[axis] == 1:
            continue
        if x._strides[axis] != expected:
            contiguous = False
            break
        expected *= x.shape[axis]

    if not contiguous:
        if copyif:
            x = x.copy()
        else:
            raise ValueError("We were asked to not copy memory, "
                             "but the memory is not c contiguous.")

    # x is now C-contiguous: view each trailing float32 pair as one
    # complex64 element.
    return pycuda.gpuarray.GPUArray(x.shape[:-1], np.complex64, base=x,
                                    gpudata=x.gpudata)
def bptrs(a):
    """
    Pointer array when input represents a batch of matrices.
    Taken from scikits.cuda tests/test_cublas.py.
    """
    start = a.ptr
    step = a.strides[0]
    stop = start + a.shape[0] * step
    return pycuda.gpuarray.arange(start, stop, step,
                                  dtype=cublas.ctypes.c_void_p)
def sc_complex_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N',
                           handle=None):
    """
    Uses cublasCgemmBatched to compute a bunch of complex dot products
    in parallel.

    Parameters
    ----------
    bx_gpu, by_gpu : pycuda GPUArray, complex64, shape (batch, ., .)
        The two operand batches.
    bc_gpu : pycuda GPUArray, complex64, shape (batch, ., .)
        Destination batch, overwritten with the results.
    transa, transb : str
        'N', 'T' or 'C' transpose flags for bx and by respectively.
    handle
        Optional cuBLAS handle; defaults to scikits.cuda's global one.
    """
    if handle is None:
        handle = scikits.cuda.misc._global_cublas_handle

    assert len(bx_gpu.shape) == 3
    assert len(by_gpu.shape) == 3
    assert len(bc_gpu.shape) == 3
    assert bx_gpu.dtype == np.complex64
    assert by_gpu.dtype == np.complex64
    assert bc_gpu.dtype == np.complex64

    # Get the shapes of the arguments
    bx_shape = bx_gpu.shape
    by_shape = by_gpu.shape

    # Perform matrix multiplication for 2D arrays:
    alpha = np.complex64(1.0)
    beta = np.complex64(0.0)

    transa = transa.lower()
    transb = transb.lower()

    if transb in ['t', 'c']:
        N, m, k = by_shape
    elif transb in ['n']:
        N, k, m = by_shape
    else:
        raise ValueError('invalid value for transb')

    if transa in ['t', 'c']:
        N2, l, n = bx_shape
    elif transa in ['n']:
        N2, n, l = bx_shape
    else:
        raise ValueError('invalid value for transa')

    if l != k:
        raise ValueError('objects are not aligned')

    if N != N2:
        raise ValueError('batch sizes are not the same')

    if transb == 'n':
        lda = max(1, m)
    else:
        lda = max(1, k)

    if transa == 'n':
        ldb = max(1, k)
    else:
        ldb = max(1, n)

    ldc = max(1, m)

    # construct pointer arrays needed for cublasCgemmBatched
    bx_arr = bptrs(bx_gpu)
    by_arr = bptrs(by_gpu)
    bc_arr = bptrs(bc_gpu)

    # NOTE(review): the operands and transpose flags are passed in
    # swapped order (by before bx) — presumably to bridge the arrays'
    # C (row-major) layout with cuBLAS's column-major convention, as the
    # leading dimensions lda/ldb computed above suggest. Confirm against
    # the cuBLAS cublasCgemmBatched documentation before modifying.
    cublas.cublasCgemmBatched(handle, transb, transa, m, n, k, alpha,
                              by_arr.gpudata, lda, bx_arr.gpudata, ldb,
                              beta, bc_arr.gpudata, ldc, N)
class BatchedComplexDotOp(ScikitsCudaOp):
    """
    This version uses cublasCgemmBatched under the hood, instead of
    doing multiple cublasCgemm calls.
    """

    def make_node(self, inp1, inp2):
        inp1 = basic_ops.gpu_contiguous(
            basic_ops.as_cuda_ndarray_variable(inp1))
        inp2 = basic_ops.gpu_contiguous(
            basic_ops.as_cuda_ndarray_variable(inp2))

        assert inp1.dtype == "float32"
        assert inp2.dtype == "float32"
        assert inp1.ndim == 4  # (batch, a, b, real/imag)
        assert inp2.ndim == 4

        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])

    def output_type(self, inp):
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # Base-class call only checks that scikits.cuda is available.
        super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            bx = inputs[0]
            by = inputs[1]

            input_shape_x = bx[0].shape  # (batch, a, b, 2)
            input_shape_y = by[0].shape  # (batch, b, c, 2)

            output_shape = (input_shape_x[0], input_shape_x[1],
                            input_shape_y[2], 2)  # (batch, a, c, 2)

            bz = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if bz[0] is None or bz[0].shape != output_shape:
                bz[0] = CudaNdarray.zeros(output_shape)

            # View the float32 arrays (trailing real/imag pair) as
            # complex64 GPUArrays for cuBLAS.
            input_bx_pycuda = to_complex_gpuarray(bx[0])
            input_by_pycuda = to_complex_gpuarray(by[0])
            output_b_pycuda = to_complex_gpuarray(bz[0])

            # fancy native batched version
            sc_complex_dot_batched(input_bx_pycuda, input_by_pycuda,
                                   output_b_pycuda)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
# Module-level singleton instances of the ops defined above.
cufft = CuFFTOp()
cuifft = CuIFFTOp()
batched_complex_dot = BatchedComplexDotOp()
def mult_and_reduce(input_fft_v, filters_fft_v, input_shape=None,
                    filter_shape=None):
    """
    Elementwise complex multiplication in the Fourier domain with a
    reduction over input channels, implemented as a batched complex dot.

    Parameters
    ----------
    input_fft_v
        It's (b, ic, i0, i1//2 + 1, 2).
    filters_fft_v
        It's (oc, ic, i0, i1//2 + 1, 2).
    input_shape, filter_shape
        Optional static shapes; when None the symbolic shapes are used.

    Returns
    -------
    Symbolic variable of shape (b, oc, i0, i1//2 + 1, 2).
    """

    if input_shape is None:
        input_shape = input_fft_v.shape  # symbolic

    if filter_shape is None:
        filter_shape = filters_fft_v.shape  # symbolic

    b, ic, i0, i1_f, _ = input_shape
    oc = filter_shape[0]

    # reshape to flatten the dimensions that are multiplied elemwise
    input_r = input_fft_v.reshape((b, ic, i0 * i1_f, 2))
    filters_r = filters_fft_v.reshape((oc, ic, i0 * i1_f, 2))

    # shuffle for batched dot product
    input_s = input_r.dimshuffle(2, 0, 1, 3)    # (i0 * i1_f, b, ic, 2)
    filters_s = filters_r.dimshuffle(2, 1, 0, 3)  # (i0 * i1_f, ic, oc, 2)

    # Contract over ic for every frequency bin at once.
    output_s = batched_complex_dot(input_s, filters_s)

    # shuffle again
    output_r = output_s.dimshuffle(1, 2, 0, 3)

    # reshape to unflatten
    output = output_r.reshape((b, oc, i0, i1_f, 2))

    return output
def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
               border_mode='valid', pad_last_dim=False):
    """
    Perform a convolution through fft.

    Only support input which will be even on the last dimension
    (width). All other dimensions can be anything and the filters can
    have an even or odd width.

    If you must use input which has an odd width, you can either pad
    it or use the `pad_last_dim` argument which will do it for you and
    take care to strip the padding before returning. Don't use this
    argument if you are not sure the input is odd since the padding is
    unconditional and will make even input odd, thus leading to
    problems.

    On valid mode the filters must be smaller than the input.

    Parameters
    ----------
    input
        (b, ic, i0, i1).
    filters
        (oc, ic, f0, f1).
    image_shape, filter_shape
        Optional static shapes; symbolic shapes are used when None.
    border_mode : {'valid', 'full'}
    pad_last_dim
        Unconditionally pad the last dimension of the input
        to turn it from odd to even. Will strip the
        padding before returning the result.
    """

    # use symbolic shapes to compute shape info at runtime if not specified
    if image_shape is None:
        image_shape = input.shape

    if filter_shape is None:
        filter_shape = filters.shape

    # batch size, input channels, input dim 0, input dim 1
    b, ic, i0, i1 = image_shape
    # output channels, input channels, filter dim 0, filter dim 1
    oc, ic_, f0, f1 = filter_shape

    # pad filters/image to output shape
    if border_mode == 'valid':
        o0 = i0
        if pad_last_dim:
            o1 = i1 + 1
            input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
            input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1],
                                           input)
        else:
            o1 = i1
            input_padded = input
        filters_padded = T.zeros((oc, ic, o0, o1), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1],
                                         filters)
    elif border_mode == 'full':
        # In this particular case, the values of (o0, o1) represent
        # the dimensions of the work buffer more than the actual dimensions
        # of the desired output.
        o0 = i0 + 2 * (f0 - 1)
        o1 = i1 + 2 * (f1 - 1)
        if pad_last_dim:
            o1 = o1 + 1
        # We line up the filters and the images in a way
        # such that the filters are tightly placed against the
        # top-left of the array, and the images intersect with
        # them on one pixel. The top-left pixel of the images
        # is the bottom-right pixel of the filters when we
        # do the layout here.
        filters_padded = T.zeros((oc, ic, o0, o1), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1],
                                         filters)
        input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
        input_padded = T.set_subtensor(input_padded[:, :, (f0 - 1):(f0 - 1 + i0), (f1 - 1):(f1 - 1 + i1)],
                                       input)
    else:
        raise ValueError('invalid mode')

    # Runtime guard: the real-to-complex FFT layout requires an even width.
    input_padded = T.opt.Assert("in conv2d_fft: width is not even")(
        input_padded, T.eq(o1 % 2, 0))

    # reshape for FFT
    input_flat = input_padded.reshape((b * ic, o0, o1))
    filters_flat = filters_padded.reshape((oc * ic, o0, o1))

    # perform FFT
    input_fft_flat = cufft(input_flat)  # (b * ic, o0, o1//2 + 1, 2)
    filters_fft_flat = cufft(filters_flat)  # (oc * ic, o0, o1//2 + 1, 2)

    # unfold ic dimension
    input_fft_v_shape = (b, ic, o0, o1 // 2 + 1, 2)
    filters_fft_v_shape = (oc, ic, o0, o1 // 2 + 1, 2)
    input_fft_v = input_fft_flat.reshape(input_fft_v_shape)
    filters_fft_v = filters_fft_flat.reshape(filters_fft_v_shape)

    # (b, oc, o0, o1//2 + 1, 2)
    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
                                   input_shape=input_fft_v_shape,
                                   filter_shape=filters_fft_v_shape)

    # reshape for IFFT
    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1 // 2 + 1, 2))

    # perform IFFT
    output_flat = cuifft(output_fft_flat)  # (b * oc, o0, o1)

    # reshape
    output_circ = output_flat.reshape((b, oc, o0, o1))  # circular!

    # Now we extract the region of interest.
    # We just cut it out from the output_circ
    # array that was used for the computation.
    # We do not need to handle pad_last_dim in a
    # special way because we specify explicitly here
    # how much values are expected.
    if border_mode == 'valid':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
                             (f1 - 1):(f1 - 1 + i1 - f1 + 1)]
    elif border_mode == 'full':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
                             (f1 - 1):(f1 - 1 + i1 + f1 - 1)]
    else:
        raise ValueError('invalid mode')

    # Rescale manually. This is just a factor that comes in during the
    # trip through FFT and inverse FFT.
    output = (1.0 / T.cast(o0 * o1, 'float32')) * output

    # output should now be the result of a batched valid convolution
    # of the input with the filters.
    return basic_ops.as_cuda_ndarray_variable(output)
def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
               border_mode='valid', pad_last_dim=False):
    """
    Perform a convolution through fft.

    Only supports input whose shape is even on the last dimension.
    All other dimensions can be anything and the filters can
    have an even or odd last dimension.

    The semantics associated with the last three dimensions
    are not important as long as they are in the same order between
    the inputs and the filters. For example, when the convolution
    is done on a sequence of images, they could be either
    (duration, height, width) or (height, width, duration).

    If you must use input which has an odd width, you can either pad
    it or use the `pad_last_dim` argument which will do it for you and
    take care to strip the padding before returning. pad_last_dim checks
    that the last dimension is odd before the actual padding

    On valid mode the filters must be smaller than the input.

    Parameters
    ----------
    input
        (b, ic, i0, i1, i2).
    filters
        (oc, ic, f0, f1, i2).
    image_shape, filter_shape
        Optional static shapes; symbolic shapes are used when None.
    border_mode : {'valid', 'full'}.
    pad_last_dim
        Unconditionally pad the last dimension of the input
        to turn it from odd to even. Will strip the
        padding before returning the result.
    """

    # use symbolic shapes to compute shape info at runtime if not specified
    if image_shape is None:
        image_shape = input.shape

    if filter_shape is None:
        filter_shape = filters.shape

    # batch size, input channels, input dim 0, input dim 1
    b, ic, i0, i1, i2 = image_shape
    # output channels, input channels, filter dim 0, filter dim 1
    oc, ic_, f0, f1, f2 = filter_shape

    # Check that the last dimension is odd
    is_odd = T.eq(T.mod(input.shape[4], 2), 1)

    # pad filters/image to output shape
    if border_mode == 'valid':
        o0 = i0
        o1 = i1
        o2 = i2
        input_padded = input
        if pad_last_dim:
            # Pad only when the last dimension is actually odd.
            o2 = ifelse(is_odd, o2 + 1, o2)
            input_padded = T.zeros((b, ic, o0, o1, o2), dtype='float32')
            input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1, :i2],
                                           input)
        filters_padded = T.zeros((oc, ic, o0, o1, o2), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1, :f2],
                                         filters)

    elif border_mode == 'full':

        # In this particular case, the values of (o0, o1) represent
        # the dimensions of the work buffer more than the actual dimensions
        # of the desired output.
        o0 = i0 + 2 * (f0 - 1)
        o1 = i1 + 2 * (f1 - 1)
        o2 = i2 + 2 * (f2 - 1)

        if pad_last_dim:
            o2 = ifelse(is_odd, o2 + 1, o2)

        # We line up the filters and the images in a way
        # such that the filters are tightly placed against the
        # top-left of the array, and the images intersect with
        # them on one pixel. The top-left pixel of the images
        # is the bottom-right pixel of the filters when we
        # do the layout here.

        filters_padded = T.zeros((oc, ic, o0, o1, o2), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1, :f2],
                                         filters)

        input_padded = T.zeros((b, ic, o0, o1, o2), dtype='float32')
        input_padded = T.set_subtensor(input_padded[:, :, (f0 - 1):(f0 - 1 + i0), (f1 - 1):(f1 - 1 + i1), (f2 - 1):(f2 - 1 + i2)],
                                       input)
    else:
        raise ValueError('invalid mode')

    # NOTE(review): unlike conv2d_fft, there is no runtime Assert here
    # checking that o2 is even — confirm whether that guard was omitted
    # on purpose.

    # reshape for FFT
    input_flat = input_padded.reshape((b * ic, o0, o1, o2))
    filters_flat = filters_padded.reshape((oc * ic, o0, o1, o2))

    # perform FFT
    input_fft_flat = cufft(input_flat)  # (b * ic, o0, o1, o2//2 + 1, 2)
    filters_fft_flat = cufft(filters_flat)  # (oc * ic, o0, o1, o2//2 + 1, 2)

    # Unfold ic dimension.
    # We have to collapse two dimensions together
    # in order to reuse the same `mult_and_reduce`.
    # This explains the o0 * 01 instead of just keeping
    # the two dimensions intact.
    input_fft_v_shape = (b, ic, o0 * o1, o2 // 2 + 1, 2)
    filters_fft_v_shape = (oc, ic, o0 * o1, o2 // 2 + 1, 2)

    input_fft_v = input_fft_flat.reshape(input_fft_v_shape)
    filters_fft_v = filters_fft_flat.reshape(filters_fft_v_shape)

    # (b, oc, o0 * o1, o2//2 + 1, 2)
    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
                                   input_shape=input_fft_v_shape,
                                   filter_shape=filters_fft_v_shape)
    # output_fft_s = input_fft_v

    # reshape for IFFT
    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))

    # perform IFFT
    output_flat = cuifft(output_fft_flat)  # (b * oc, o0, o1, o2)

    # reshape
    output_circ = output_flat.reshape((b, oc, o0, o1, o2))  # circular!

    # Now we extract the region of interest.
    # We just cut it out from the output_circ
    # array that was used for the computation.
    # We do not need to handle pad_last_dim in a
    # special way because we specify explicitly here
    # how much values are expected.
    if border_mode == 'valid':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
                             (f1 - 1):(f1 - 1 + i1 - f1 + 1),
                             (f2 - 1):(f2 - 1 + i2 - f2 + 1)]
    elif border_mode == 'full':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
                             (f1 - 1):(f1 - 1 + i1 + f1 - 1),
                             (f2 - 1):(f2 - 1 + i2 + f2 - 1)]
    else:
        raise ValueError('invalid mode')
    # output = output_circ[:, :, :, :, :]

    # Rescale manually. This is just a factor that comes in during the
    # trip through FFT and inverse FFT.
    output = (1.0 / T.cast(o0 * o1 * o2, 'float32')) * output

    # output should now be the result of a batched valid convolution
    # of the input with the filters.
    return basic_ops.as_cuda_ndarray_variable(output)
"""
Helper routines for generating gpu kernels for nvcc.
"""
from __future__ import absolute_import, print_function, division
def nvcc_kernel(name, params, body):
    """
    Return the c code of a kernel function.

    Parameters
    ----------
    name
        Name of the generated ``__global__`` function.
    params
        The parameters to the function as one or more strings.
    body
        The [nested] list of statements for the body of the
        function. These will be separated by ';' characters.
    """
    paramstr = ', '.join(params)

    # Flatten one level of nesting in the statement list.
    statements = []
    for stmt in body:
        if isinstance(stmt, (list, tuple)):
            statements.extend(stmt)
        else:
            statements.append(stmt)
    bodystr = ';\n'.join(statements)

    return """__global__ void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }
    """ % locals()
def code_version(version):
    """
    Decorator to support version-based cache mechanism.

    Tags the decorated function with a ``code_version`` attribute so
    code generators can combine versions when computing cache keys.

    Parameters
    ----------
    version : tuple
        The version tag.

    Raises
    ------
    TypeError
        If `version` is not a tuple.
    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)

    def attach(fn):
        fn.code_version = version
        return fn
    return attach


# Sentinel version tag for unversioned code.
UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer.
    pos
        Index of executing thread.
    count
        Number of executing threads. (Unused by the generated code;
        kept for interface symmetry with the other helpers.)
    manner_fn
        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    Notes
    -----
    buf should be in gpu shared memory, we access it many times.
    """
    # Reduction expressions for the strided accumulation loop and for the
    # final fixed-offset steps (16, 8, 4, 2, 1).
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    return """
    {
        // This function trashes buf[1..warpSize],
        // leaving the reduction result in buf[0].

        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the sum of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``max``."""
    def take_max(lhs, rhs):
        return "max(%s, %s)" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, take_max)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``+``."""
    def add(lhs, rhs):
        return "%s + %s" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, add)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``min``."""
    def take_min(lhs, rhs):
        return "min(%s, %s)" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, take_min)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``*``."""
    def multiply(lhs, rhs):
        return "%s * %s" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, multiply)
@code_version((2,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """
    Return a list of C statement strings computing a softmax row.

    Parameters
    ----------
    N
        Length of the buffer.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax
    Notes
    -----
    buf and buf2 should be in gpu shared memory, we access it many times.
    We use __i as an int variable in a loop.
    """
    # The statement order is significant: max-reduce, exponentiate,
    # sum-reduce, then normalize, with __syncthreads() between phases.
    return [ # get max of buf (trashing all but buf[0])
        inline_reduce_max(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
        threadCount + '){',
        buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
        buf2 + '[__i] = ' + buf + '[__i]', '}',
        '__syncthreads()',
        inline_reduce_sum(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        # divide each exp() result by the sum to complete the job.
        'for(int __i=' + threadPos + '; __i<' + N +
        '; __i+=' + threadCount + '){',
        buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
        '__syncthreads()',
    ]
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b=''):
    """
    Return C++ code for a function that reduces a contiguous buffer.
    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer of size warpSize * sizeof(float).
    pos
        Index of executing thread.
    count
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    manner_fn
        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
    manner_init
        A function that accepts strings of arguments a
        and return c code for its initialization.
    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.
    Notes
    -----
    buf should be in gpu shared memory, we access it many times.
    """
    # When a bias pointer is given, fold "x[...] + b[...]" into both the
    # per-thread initialization and the strided accumulation expression.
    if b:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
                           " %(b)s[%(pos)s * %(stride_b)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s] + "
                                          "%(b)s[i * %(stride_b)s]" %
                                          locals()))
    else:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
                                                 locals()))
    # Second-phase expression combining buf entries, then the pairwise
    # tree-reduction steps (offsets 16, 8, 4, 2, 1).
    loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    # NOTE(review): the template below writes to a literal `buf` name in
    # "buf[%(pos)s] = red;" rather than %(buf)s, so callers must pass
    # buf == "buf" for that statement to refer to the intended array.
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        float red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
            red = %(loop_line)s;
        }
        buf[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
                                   b='', stride_b=''):
    """Max-reduction specialization of ``inline_reduce_fixed_shared``."""
    def take_max(lhs, rhs):
        return "max(%s, %s)" % (lhs, rhs)

    def identity(expr):
        return expr

    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                                      take_max, identity, b, stride_b)
@code_version((1,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                sm, sm_stride,
                                threadPos, threadCount,
                                b='', stride_b=''):
    """
    Return a list of C statement strings computing a softmax row,
    using only a warpSize-sized shared buffer.

    Parameters
    ----------
    N
        Length of the row, at least warpSize (32).
    buf
        A shared memory buffer of size warpSize * sizeof(float).
    x
        A ptr to the gpu memory where the row is stored.
    stride_x
        The stride between each element in x.
    sm
        A ptr to the gpu memory to store the result.
    sm_stride
        The stride between each sm element.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    :Precondition: buf is empty
    :Postcondition: buf[0] contains the row's reduction results; the
        normalized softmax is written to ``sm``.
    Notes
    -----
    buf should be in gpu shared memory, we access it many times.
    We use tx as an int variable in a loop.
    """
    # NOTE(review): the final write-back loop below hard-codes
    # `threadIdx.x`, `N` and `blockDim.x` instead of using the
    # threadPos/N/threadCount parameters; callers must pass matching
    # names for the generated code to be correct.
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x,
                                       threadPos, threadCount, b, stride_b),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
    ]
    # This set all value correctly
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " exp(%(x)s[tx * %(stride_x)s] +"
            " %(b)s[tx * %(stride_b)s] - row_max)"
            " / row_sum" % locals()]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
from __future__ import absolute_import, print_function, division
# This is work in progress
from theano import Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.tensor.nnet.neighbours import Images2Neibs
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import (
as_cuda_ndarray_variable, host_from_gpu, gpu_from_host)
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
class GpuImages2Neibs(Images2Neibs, GpuOp):
    """
    GPU (CudaNdarray) implementation of the Images2Neibs op: extract
    the neighbourhood patches of a 4d tensor into the rows of a 2d
    output.
    """
    def __init__(self, mode='valid'):
        # Only a subset of the CPU op's modes has a GPU kernel.
        if mode not in ['valid', 'ignore_borders', 'wrap_centered']:
            raise NotImplementedError("Only the mode valid, ignore_borders"
                                      " and wrap_centered"
                                      " have been implemented for the op"
                                      " GpuImages2Neibs")
        self.mode = mode
    def make_node(self, ten4, neib_shape, neib_step):
        """
        Build the Apply node.

        Parameters
        ----------
        ten4
            4d float32 input, converted to a CudaNdarray variable.
        neib_shape
            Integer vector (height, width) of the patches.
        neib_step
            Integer vector of strides between consecutive patches.
        """
        ten4 = as_cuda_ndarray_variable(ten4)
        neib_shape = tensor.as_tensor_variable(neib_shape)
        neib_step = tensor.as_tensor_variable(neib_step)
        assert ten4.ndim == 4
        assert ten4.dtype == 'float32'
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        assert neib_shape.dtype in tensor.integer_dtypes
        assert neib_step.dtype in tensor.integer_dtypes
        # Output is always a 2d matrix: one patch per row.
        return Apply(self, [ten4, neib_shape, neib_step],
                     [CudaNdarrayType(broadcastable=(False, False),
                                      dtype=ten4.type.dtype)()])
    def c_code_cache_version(self):
        return (8,)
    def c_support_code_apply(self, node, nodename):
        """
        Return two CUDA kernels: ``k_multi_warp_less_*`` (one thread per
        patch element, used when the block covers a whole patch) and
        ``k_multi_warp_*`` (general looping version).
        """
        mode = self.mode
        return """
        //a version that use less register but don't work in all case.
        static __global__ void k_multi_warp_less_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const int stride0, const int stride1,
            const int stride2, const int stride3,
            float * global_ten4,
            const int out_s0, const int out_s1,
            float * global_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;
                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                int z_row = b + grid_d*(a + grid_c*
                                        (s + nb_stack*n));
                int i = threadIdx.y;     // loop over c
                {
                    int ten4_2 = i + a * step_x;
                    if("%(mode)s"=="wrap_centered"){
                        ten4_2 -= wrap_centered_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    }
                    int j = threadIdx.x;  // loop over d
                    {
                        int ten4_3 = j + b * step_y;
                        if("%(mode)s"=="wrap_centered"){
                            ten4_3 -= wrap_centered_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        }

                        int ten4_idx = stride3*ten4_3 +
                                       stride2*ten4_2 +
                                       stride1*s + stride0*n;

                        int z_col = j + d * i;
                        int z_idx = z_col * out_s1 +
                                    z_row * out_s0;
                        global_out[z_idx] = global_ten4[ten4_idx];
                    }
                }
            }
        }

        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const int stride0, const int stride1,
            const int stride2, const int stride3,
            float * global_ten4,
            const int out_s0, const int out_s1,
            float * global_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;
                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                int z_row = b + grid_d*(a + grid_c*
                                        (s + nb_stack*n));
                // loop over c
                for (int i = threadIdx.y; i < c; i+=blockDim.y)
                {
                    int ten4_2 = i + a * step_x;
                    if("%(mode)s"=="wrap_centered"){
                        ten4_2 -= wrap_centered_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    }
                    // loop over d
                    for (int j = threadIdx.x; j < d; j+=blockDim.x)
                    {
                        int ten4_3 = j + b * step_y;
                        if("%(mode)s"=="wrap_centered"){
                            ten4_3 -= wrap_centered_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        }

                        int ten4_idx = stride3*ten4_3 +
                                       stride2*ten4_2 +
                                       stride1*s + stride0*n;

                        int z_col = j + d * i;
                        int z_idx = z_col * out_s1 +
                                    z_row * out_s0;
                        global_out[z_idx] = global_ten4[ten4_idx];
                    }
                }
            }
        }

        """ % locals()
    def c_code(self, node, name, inp, out, sub):
        """
        Validate shapes/modes, allocate the 2d output, then launch one of
        the two kernels from ``c_support_code_apply``.
        """
        # NOTE(review): the "pvals wrong rank" / "unis wrong rank" error
        # messages below look copy-pasted from the multinomial op; the
        # inputs here are ten4 and neib_shape.
        ten4, neib_shape, neib_step = inp
        z, = out
        fail = sub['fail']
        mode = self.mode
        return """
        #ifndef CEIL_INTDIV
        #define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
        #endif

        int grid_c = -1;
        int grid_d = -1;

        {
            if (CudaNdarray_NDIM(%(ten4)s) != 4)
            {
                PyErr_Format(PyExc_TypeError, "pvals wrong rank");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_shape)s) != 1)
            {
                PyErr_Format(PyExc_TypeError, "unis wrong rank");
                %(fail)s;
            }

            if (PyArray_DIMS(%(neib_shape)s)[0] != 2)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_shape has to contain two elements");
                %(fail)s;
            }

            const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            if ( "%(mode)s" == "wrap_centered") {
                if (c%%2!=1 || d%%2!=1){
                    PyErr_Format(PyExc_TypeError,
        "Images2Neibs: in mode wrap_centered need patch with odd shapes");
                    %(fail)s;
                }
                if ( CudaNdarray_HOST_DIMS(%(ten4)s)[2] < c ||
                     CudaNdarray_HOST_DIMS(%(ten4)s)[3] < d)
                {
                    PyErr_Format(PyExc_TypeError,
                                 "Images2Neibs: in wrap_centered mode, don't"
                                 " support image shapes smaller then the patch"
                                 " shapes: neib_shape=(%%d,%%d),"
                                 " ten4[2:]=[%%d,%%d]",
                                 c, d, CudaNdarray_HOST_DIMS(%(ten4)s)[2],
                                 CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                grid_c = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]),
                                     step_x);
                grid_d = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]),
                                     step_y);


            }else if ( "%(mode)s" == "valid") {
                if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[2] < c) ||
                     ((((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 CudaNdarray_HOST_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[3] < d) ||
                     ((((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d)/step_y);
            }else if ( "%(mode)s" == "ignore_borders") {
                //number of patch in height
                grid_c = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d)/step_y);
            }else{
                PyErr_Format(PyExc_TypeError,
                             "Images2Neibs: unknow mode '%(mode)s'");
                %(fail)s;
            }

            // new dimensions for z
            const int z_dim1 = c * d;
            const int z_dim0 =  grid_c
                                * grid_d
                                * CudaNdarray_HOST_DIMS(%(ten4)s)[1]
                                * CudaNdarray_HOST_DIMS(%(ten4)s)[0];

            if ((NULL == %(z)s)
                || (CudaNdarray_HOST_DIMS(%(z)s)[0] != z_dim0)
                || (CudaNdarray_HOST_DIMS(%(z)s)[1] != z_dim1))
            {
                Py_XDECREF(%(z)s);
                npy_intp dims[2];
                dims[0] = z_dim0;
                dims[1] = z_dim1;

                %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError,
                                    "failed to alloc z output");
                    %(fail)s;
                }
            }

        }

        { // NESTED SCOPE

            const int nb_batch = CudaNdarray_HOST_DIMS(%(ten4)s)[0];
            const int nb_stack = CudaNdarray_HOST_DIMS(%(ten4)s)[1];
            const int height = CudaNdarray_HOST_DIMS(%(ten4)s)[2];
            const int width = CudaNdarray_HOST_DIMS(%(ten4)s)[3];

            const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            dim3 n_threads(d,c,1);
            //Their is a max of 512 threads per blocks
            while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
            while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;

            //Make bigger block to have better memory access pattern and
            //a higher core utilisation. for smaller patch size

            while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 &&
                  n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
                n_threads.z++;
            }
            int nb_block;
            if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
                nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
            else
                nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
            dim3 n_blocks(std::min(32*1024,nb_block));
            int n_shared = 0;

            void (*f)(int, int, int ,int,
                      int, int, int ,int,
                      int, int,
                      int, int, int, int,
                      float*,
                      int, int,
                      float*);
            if(n_threads.x==d && n_threads.y==c){
                f = k_multi_warp_less_%(name)s;
            }else{
                f = k_multi_warp_%(name)s;
            }

            f<<<n_blocks, n_threads, n_shared>>>(
                nb_batch,
                nb_stack,
                height, width,
                c, d, step_x, step_y,
                grid_c, grid_d,
                CudaNdarray_HOST_STRIDES(%(ten4)s)[0],
                CudaNdarray_HOST_STRIDES(%(ten4)s)[1],
                CudaNdarray_HOST_STRIDES(%(ten4)s)[2],
                CudaNdarray_HOST_STRIDES(%(ten4)s)[3],
                CudaNdarray_DEV_DATA(%(ten4)s),
                CudaNdarray_HOST_STRIDES(%(z)s)[0],
                CudaNdarray_HOST_STRIDES(%(z)s)[1],
                CudaNdarray_DEV_DATA(%(z)s)
            );
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %%s: %%s. (grid: %%i x %%i;"
                             " block: %%i x %%i x %%i; shared: %%i)\\n",
                    "k_multi_warp_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z,
                    n_shared);
                %(fail)s;
            }

        } // END NESTED SCOPE
        """ % locals()
def gpu_images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
    """
    Build a GpuImages2Neibs node on the given inputs.

    NOTE(review): the default ``neib_step=None`` is passed straight to
    ``make_node``, which asserts it is an integer vector — callers are
    presumably expected to always provide a value; confirm against usage.
    """
    return GpuImages2Neibs(mode)(ten4, neib_shape, neib_step)
@local_optimizer([Images2Neibs])
def use_gpu_images2neibs(node):
    """
    Local optimizer: replace a float32 CPU Images2Neibs node (in a
    GPU-supported mode) by the GPU op, moving the data to and from the
    device. Returns None implicitly when the node is not eligible.
    """
    if (type(node.op) is Images2Neibs and
        node.inputs[0].dtype == 'float32' and
        node.op.mode in ['valid', 'ignore_borders',
                         'wrap_centered']):
        return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                               node.inputs[1], node.inputs[2],
                                               mode=node.op.mode))]
if cuda_available:
    # Only register the optimizer when the CUDA backend loaded, since the
    # GPU op and registration helper are only imported in that case.
    register_gpu_opt()(use_gpu_images2neibs)
from __future__ import absolute_import, print_function, division
from theano import Op, Apply
from six import StringIO
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """
    # Number of inputs (x, b, y_idx) and outputs (nll, sm, am).
    nin = 3
    nout = 3
    # The op has no fields, so all instances compare (and hash) equal.
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x, b, y_idx):
        """
        Build the Apply node: outputs are the per-row negative
        log-likelihood (nll), the softmax matrix (sm) and the argmax
        vector (am).
        """
        # N.B. won't work when we don't cast y_idx to float anymore
        x = as_cuda_ndarray_variable(x)
        b = as_cuda_ndarray_variable(b)
        y_idx = as_cuda_ndarray_variable(y_idx)
        nll = y_idx.type()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])
    def c_headers(self):
        # FLT_MAX is used by the kernel for the row-max initialization.
        return ['<float.h>']
    def c_support_code(self):
        """
        Return the CUDA kernel: one block per row, shared-memory
        reductions for the row max/argmax and the exp-sum.
        """
        return """
        __global__ void k_xent_sm_1hot_bias(const int M, const int N,
            const float * x_data, const int xs0, const int xs1,
            const float * b, const int bs0,
            const float * y_idx_data, const int y_idxs0,
            float * nll_data, const int nlls0,
            float * sm_data, const int sms0, const int sms1,
            float * am_data, const int ams0)
        {
          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const float * x = x_data + xs0 * row;
            float * sm = sm_data + sms0 * row;

            extern __shared__ float per_thread_values[];
            __shared__ float row_max, sum, sum_inv;
            __shared__ int row_max_threadIdx;

            float per_thread_row_max, per_thread_sum;
            int per_thread_row_max_j;

            // COMPUTE ROW MAX AND ARGMAX

            // compute separate per-thread maximums and argmax's
            per_thread_row_max = -FLT_MAX;
            per_thread_row_max_j = 0;
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              float row_ij = x[j*xs1] + b[j*bs0];
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmaxf(row_ij, per_thread_row_max);
            }
            per_thread_values[threadIdx.x] = per_thread_row_max;

            // wait for access to shared per_thread_values to do final
            // reduction in thread 0
            __syncthreads();

            // Finish the reduction in one go in a single thread. Could be
            // smarter about this with more hierarchical reductions but think
            // this will do for now.
            if (threadIdx.x == 0) {
              // compute overall maximum and the id of the thread which has it
              row_max = -FLT_MAX;
              row_max_threadIdx = 0;
              for (int j = 0; j < blockDim.x; ++j)
              {
                float per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmaxf(per_thread_max, row_max);
              }
            }

            // all threads wait for access to shared row_max and row_maxThreadIdx
            __syncthreads();

            // thread whose max was the overall max writes out the overall argmax:
            if (threadIdx.x == row_max_threadIdx) am_data[row*ams0] = per_thread_row_max_j;

            // COMPUTE SOFTMAX

            // compute the exp and the per-thread sums of exps
            per_thread_sum = 0.0;
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              float row_ij = x[j*xs1] + b[j*bs0];
              float sm_ij = __expf(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = sm_ij;
            }
            per_thread_values[threadIdx.x] = per_thread_sum;

            // wait for access to shared per_thread_values to do final
            // reduction in thread 0
            __syncthreads();

            if (threadIdx.x == 0) {
              // compute overall sum
              sum = 0.0;
              for (int j = 0; j < blockDim.x; ++j)
              {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }

            // all threads wait for access to shared sum, sum_inv
            __syncthreads();

            // all threads normalize their softmax result using sum_inv
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              sm[j * sms1] *= sum_inv;
            }

            // COMPUTE NEGATIVE LOG-LIKELIHOOD FOR TARGET INDEX
            if (threadIdx.x == 0) {
              const int y_idx = (int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N) || (y_idx < 0))
              {
                  //TODO: set raise an error bit in a global var?
                  nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
              }
              else
              {
                  nll_data[row*nlls0] = - x[y_idx*xs1]
                             - b[y_idx*bs0]
                             + row_max
                             + logf(sum);
              }
            }
          }
        }
        """
    def c_code(self, node, nodename, inp, out, sub):
        """
        Validate ranks/shapes, (re)allocate the three outputs, then
        launch ``k_xent_sm_1hot_bias`` (one block per row).
        """
        x, b, y_idx = inp
        nll, sm, am = out
        classname = self.__class__.__name__
        fail = sub['fail']
        sio = StringIO()
        print("""
        if (CudaNdarray_NDIM(%(y_idx)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[0] !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if ((NULL == %(nll)s) //initial condition
            || (CudaNdarray_HOST_DIMS(%(nll)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(nll)s);
            %(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(nll)s)
            {
                %(fail)s;
            }
        }
        if ((NULL == %(sm)s)
            || (CudaNdarray_HOST_DIMS(%(sm)s)[0] !=
                CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(sm)s)[1] !=
                CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2,
                CudaNdarray_HOST_DIMS(%(x)s));
            if(!%(sm)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc sm output");
                // no need to decref cnda_nll, the cleanup code should do it up
                %(fail)s;
            }
        }
        if ((NULL == %(am)s)
            || (CudaNdarray_HOST_DIMS(%(am)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(am)s);
            %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc am output");
                // no need to decref nll and sm,
                // the cleanup code should do it up
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                                    NUM_VECTOR_OP_BLOCKS);
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
                                     NUM_VECTOR_OP_THREADS_PER_BLOCK);
            int n_shared_bytes = n_threads * sizeof(float);
            k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
                CudaNdarray_HOST_DIMS(%(x)s)[0],
                CudaNdarray_HOST_DIMS(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(x)s),
                CudaNdarray_HOST_STRIDES(%(x)s)[0],
                CudaNdarray_HOST_STRIDES(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(b)s),
                CudaNdarray_HOST_STRIDES(%(b)s)[0],
                CudaNdarray_DEV_DATA(%(y_idx)s),
                CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
                CudaNdarray_DEV_DATA(%(nll)s),
                CudaNdarray_HOST_STRIDES(%(nll)s)[0],
                CudaNdarray_DEV_DATA(%(sm)s),
                CudaNdarray_HOST_STRIDES(%(sm)s)[0],
                CudaNdarray_HOST_STRIDES(%(sm)s)[1],
                CudaNdarray_DEV_DATA(%(am)s),
                CudaNdarray_HOST_STRIDES(%(am)s)[0]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
                             "The kernel was launched with %%d threads,"
                             " %%d blocks and %%d shared memory\\n",
                             cudaGetErrorString(err),
                             n_threads, n_blocks, n_shared_bytes);
                // no need to decref output vars the cleanup code will do it
                %(fail)s;
            }
        }
        """ % locals(), file=sio)
        return sio.getvalue()
    def c_code_cache_version(self):
        # return ()
        return (5,)
# Singleton instance: the op is stateless, so one shared instance suffices.
gpu_crossentropy_softmax_argmax_1hot_with_bias = \
    GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
    """
    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
    """
    # Number of inputs (dy, sm, y_idx) and outputs (dx).
    nin = 3
    nout = 1
    """Gradient wrt x of the CrossentropySoftmax1Hot Op"""
    def __init__(self, **kwargs):
        Op.__init__(self, **kwargs)
    # Field-less op: all instances compare and hash equal.
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, dy, sm, y_idx):
        """
        Build the Apply node; the gradient output dx has the same type
        as the softmax input sm.
        """
        dy = as_cuda_ndarray_variable(dy)
        sm = as_cuda_ndarray_variable(sm)
        y_idx = as_cuda_ndarray_variable(y_idx)
        return Apply(self, [dy, sm, y_idx], [sm.type()])
    def c_code_cache_version(self):
        # return ()
        return (8,)
    def c_code(self, node, nodename, inp, out, sub):
        """
        Validate ranks/shapes (allowing a scalar or length-1 ``dnll`` to
        broadcast via a zero stride), allocate dx, and launch the
        per-row gradient kernel.
        """
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
        return """
        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
        const npy_intp %(dnll)s_dims0 = (CudaNdarray_NDIM(%(dnll)s) > 0 ?
                                         CudaNdarray_HOST_DIMS(%(dnll)s)[0] :
                                         (npy_intp) 0);

        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
        // or a vector with just one element.
        const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
                                            CudaNdarray_HOST_STRIDES(%(dnll)s)[0] :
                                            (npy_intp) 0);

        if ((CudaNdarray_NDIM(%(dnll)s) > 1)
            || (CudaNdarray_NDIM(%(sm)s) != 2)
            || (CudaNdarray_NDIM(%(y_idx)s) != 1))
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            CudaNdarray_HOST_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
                         %(dnll)s_dims0,
                         CudaNdarray_HOST_DIMS(%(sm)s)[0]);
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_SetString(PyExc_ValueError,
                            "dnll.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(sm)s)[0] !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "sm.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if ((NULL == %(dx)s)
            || (CudaNdarray_HOST_DIMS(%(dx)s)[0] !=
                CudaNdarray_HOST_DIMS(%(sm)s)[0])
            || (CudaNdarray_HOST_DIMS(%(dx)s)[1] !=
                CudaNdarray_HOST_DIMS(%(sm)s)[1]))
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(dx)s)
                || CudaNdarray_alloc_contiguous(%(dx)s, 2,
                                                CudaNdarray_HOST_DIMS(%(sm)s)))
            {
                Py_XDECREF(%(dx)s);
                %(dx)s = NULL;
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[0],
                                    NUM_VECTOR_OP_BLOCKS);
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256);

            kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
                <<<n_blocks, n_threads>>>(
                        CudaNdarray_HOST_DIMS(%(dx)s)[0],
                        CudaNdarray_HOST_DIMS(%(dx)s)[1],

                        CudaNdarray_DEV_DATA(%(dnll)s),
                        %(dnll)s_strides0,

                        CudaNdarray_DEV_DATA(%(sm)s),
                        CudaNdarray_HOST_STRIDES(%(sm)s)[0],
                        CudaNdarray_HOST_STRIDES(%(sm)s)[1],

                        CudaNdarray_DEV_DATA(%(y_idx)s),
                        CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],

                        CudaNdarray_DEV_DATA(%(dx)s),
                        CudaNdarray_HOST_STRIDES(%(dx)s)[0],
                        CudaNdarray_HOST_STRIDES(%(dx)s)[1]
                );
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %%s: %%s.\\n"
                             "The kernel was launched with %%d threads and"
                             " %%d blocks\\n",
                             "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
                             cudaGetErrorString(err), n_threads, n_blocks);
                %(fail)s;
            }
        }
        assert(%(dx)s);
        """ % locals()
    def c_support_code_apply(self, node, nodename):
        """
        Return the gradient kernel: dx = dnll * (sm - 1hot(y_idx)),
        one block per row, threads striding over columns.
        """
        return """
        __global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
           int N, int K,
           const float * dnll, const int dnll_s0,
           const float * sm, const int sm_s0, const int sm_s1,
           const float * y_idx, const int y_idx_s0,
           float * dx, const int dx_s0, const int dx_s1)
        {
            for (int i = blockIdx.x; i < N; i += gridDim.x)
            {
                float dnll_i = dnll[i * dnll_s0];
                int y_i = (int)y_idx[i * y_idx_s0];

                for (int j = threadIdx.x; j < K; j += blockDim.x)
                {
                    if (y_i == j)
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
                    }
                    else
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            dnll_i * sm[i * sm_s0 + j * sm_s1];
                    }
                    //dx[i * dx_s0 + j * dx_s1] =
                    //    dnll_i * sm[i * sm_s0 + j * sm_s1];
                    //dx[i*dx_s0+j*dx_s1] = 0;
                }
            }
        }
        """ % locals()
# Singleton instance: the op is stateless, so one shared instance suffices.
gpu_crossentropy_softmax_1hot_with_bias_dx = \
    GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuOp):
    """
    Implement Softmax on the gpu.
    """
    # Field-less op: all instances compare and hash equal.
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x):
        """Build the Apply node; output has the same type as the input."""
        x = as_cuda_ndarray_variable(x)
        return Apply(self, [x], [x.type()])
    def infer_shape(self, node, shape):
        # Softmax is elementwise over rows: output shape == input shape.
        return shape
    def c_code_cache_version(self):
        # Include the inline_softmax template version in the cache key.
        return (9,) + inline_softmax.code_version
    def c_code(self, node, nodename, inp, out, sub):
        """
        Allocate the output and launch one of two kernels: the
        shared-memory version when two float rows fit in shared memory,
        otherwise the fixed-shared fallback.
        """
        x, = inp
        z, = out
        fail = sub['fail']
        return """
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == %(z)s) ||
            (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
             CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
            (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
             CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(z)s)
                || CudaNdarray_alloc_contiguous(%(z)s, 2,
                                                CudaNdarray_HOST_DIMS(%(x)s)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                                    32 * 1024);
//TODO, detect the maximum number of thread per block.
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
                                     2 * sizeof(float);

            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
            {
              //Those numbers are based on not too recent GPU
              //to make them compatible with more GPU.
              //TODO: read the information from the card.
              if(n_shared_bytes < (32 * 1024 - 500)){
                kSoftmax_%(nodename)s
                    <<<
                        n_blocks,
                        n_threads,
                        n_shared_bytes
                    >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                    );
              }else{
                kSoftmax_fixed_shared%(nodename)s
                    <<<
                        n_blocks,
                        n_threads,
                        n_threads * sizeof(float)
                    >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                    );
              }
              CNDA_THREAD_SYNC;
              cudaError_t err = cudaGetLastError();
              if( cudaSuccess != err)
              {
                  PyErr_Format(PyExc_RuntimeError,
                               "Cuda error: %%s: %%s.\\n Used %%d blocks,"
                               " %%d threads %%d bytes of shared memory",
                               "kSoftmax[_fixed_shared]%(nodename)s",
                               cudaGetErrorString(err),
                               n_blocks, n_threads, n_shared_bytes);
                  %(fail)s;
              }
            }
        }
        assert(%(z)s);
        """ % locals()
    def c_support_code_apply(self, node, nodename):
        """
        Generate the two softmax kernels with ``nvcc_kernel``:
        one copying each row into shared memory, and one using only a
        warpSize-sized shared buffer for wide rows.
        """
        ret1 = nvcc_kernel(
            "kSoftmax_%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
            body=["extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                  "buf2[tx] = buf[tx]", "}", "__syncthreads()",
                  inline_softmax('N',
                                 'buf',
                                 'buf2',
                                 'threadIdx.x',
                                 'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  # This set all value correctly
                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
                  "__syncthreads()", "}", ])
        ret2 = nvcc_kernel(
            "kSoftmax_fixed_shared%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
            body=["extern __shared__ float buf[]",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "const float *x_ptr = &x[blockIDX * sx0]",
                  "float *sm_ptr = &sm[blockIDX * sm_s0]",
                  inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                              'sm_ptr', 'sm_s1',
                                              'threadIdx.x',
                                              'blockDim.x'),
                  "__syncthreads()", "}", ])
        return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuOp):
    """
    Implement SoftmaxWithBias on the gpu.

    The generated kernels add the bias row-wise (``buf[tx] += b[tx*sb0]``)
    before running the inline softmax, i.e. the output is the row-wise
    softmax of ``x + b``.
    """

    # Number of inputs (x, b) and outputs of the op.
    nin = 2
    nout = 1

    def __eq__(self, other):
        # All instances of this parameterless op are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x, b):
        # NOTE(review): only x is converted with as_cuda_ndarray_variable;
        # b is passed through unchanged — presumably callers supply a CUDA
        # variable for b already.  Confirm against the optimizer that
        # introduces this op.
        x = as_cuda_ndarray_variable(x)
        return Apply(self, [x, b], [x.type()])

    def infer_shape(self, node, shape):
        # Output has the same shape as x.
        return [shape[0]]

    def c_code_cache_version(self):
        # Version of the generated C code; also depends on the version of
        # the inline_softmax snippet embedded in the kernels.
        return (9,) + inline_softmax.code_version

    def c_code(self, node, nodename, inp, out, sub):
        x, b = inp
        z, = out
        fail = sub['fail']
        # Validate ranks and matching dims, (re)allocate z if needed, then
        # launch one of the two kernels from c_support_code_apply depending
        # on whether 2*N floats fit in shared memory.
        return """
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error input");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
            %(fail)s;
        }
        if ((CudaNdarray_HOST_DIMS(%(x)s)[1] !=
            CudaNdarray_HOST_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError,
                         "number of columns in x (%%ld)"
                         " does not match length of b (%%ld)",
                         (long int)CudaNdarray_HOST_DIMS(%(x)s)[1],
                         (long int)CudaNdarray_HOST_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
                CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
                CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(z)s)
                || CudaNdarray_alloc_contiguous(%(z)s, 2,
                       CudaNdarray_HOST_DIMS(%(x)s)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
            //TODO, detect the maximum number of thread per block.
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
                                     2 * sizeof(float);
            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
            {
                if(n_shared_bytes < (32 * 1024 - 500)){
                    kSoftmaxWithBias_%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_shared_bytes
                        >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(b)s),
                            CudaNdarray_HOST_STRIDES(%(b)s)[0],
                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                        );
                }else{
                    kSoftmaxWithBias_fixed_shared%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_threads * sizeof(float)
                        >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(b)s),
                            CudaNdarray_HOST_STRIDES(%(b)s)[0],
                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                        );
                }
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "Cuda error: %%s: %%s. n_blocks=%%d,"
                                 " n_threads=%%d, n_shared_bytes=%%d\\n",
                                 "kSoftmaxWithBias_%(nodename)s",
                                 cudaGetErrorString(err),
                                 n_blocks, n_threads, n_shared_bytes);
                    %(fail)s;
                }
            }
        }
        assert(%(z)s);
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        """Return CUDA source for the two softmax-with-bias kernels.

        Same structure as GpuSoftmax's kernels, with the bias added to
        the staged row before the inline softmax.
        """
        ret1 = nvcc_kernel(
            "kSoftmaxWithBias_%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'const float * b', 'const int sb0',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
            body=["extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  " blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                  # Bias is added before the softmax.
                  "buf[tx] += b[tx * sb0]",
                  "buf2[tx] = buf[tx]", "}",
                  "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
                                                   'threadIdx.x',
                                                   'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
                  "__syncthreads()", "}", ])
        ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                           params=['int M', 'int N',
                                   'const float * x',
                                   'const int sx0', 'const int sx1',
                                   'const float * b', 'const int sb0',
                                   'float * sm',
                                   'const int sm_s0', 'const int sm_s1'],
                           body=[
                               "extern __shared__ float buf[]",
                               "for (int blockIDX = blockIdx.x; blockIDX < M;"
                               " blockIDX += gridDim.x){",
                               "const float *x_ptr = &x[blockIDX * sx0]",
                               "float *sm_ptr = &sm[blockIDX * sm_s0]",
                               inline_softmax_fixed_shared('N', 'buf',
                                                           'x_ptr', 'sx1',
                                                           'sm_ptr',
                                                           'sm_s1',
                                                           'threadIdx.x',
                                                           'blockDim.x',
                                                           'b', 'sb0'),
                               "__syncthreads()",
                               "}",
                           ])
        return ret1 + "\n" + ret2
# Module-level singleton instance of the op.
gpu_softmax_with_bias = GpuSoftmaxWithBias()
from __future__ import absolute_import, print_function, division
import distutils
import logging
import os
import subprocess
import sys
from locale import getpreferredencoding
from theano import config
from theano.compat import decode, decode_with
from theano.configdefaults import local_bitwidth
from theano.gof.utils import hash_from_file
from theano.gof.cmodule import (std_libs, std_lib_dirs,
std_include_dirs, dlimport,
Compiler,
get_lib_extension)
from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")

# Path to the nvcc binary.  is_nvcc_available() may rebind this to
# <config.cuda.root>/bin/nvcc when nvcc is not found on $PATH.
nvcc_path = 'nvcc'
# nvcc release string (e.g. "7.5"); set lazily by is_nvcc_available().
nvcc_version = None
def is_nvcc_available():
    """
    Return True iff the nvcc compiler is found.

    Side effects: on success, the module-global ``nvcc_version`` is set
    from ``nvcc --version`` output, and ``nvcc_path`` may be rebound to
    the binary found under ``config.cuda.root``.
    """
    if not config.cuda.enabled:
        return False

    def _detect_version():
        # Parse "... release X.Y" from the last line of `nvcc --version`.
        proc_out = output_subprocess_Popen([nvcc_path, '--version'])
        last_line = decode(proc_out[0]).strip().split('\n')[-1]
        build, version = last_line.split(',')[1].strip().split()
        assert build == 'release'
        global nvcc_version
        nvcc_version = version

    try:
        _detect_version()
    except Exception:
        # nvcc not on $PATH: fall back to cuda.root/bin/nvcc.
        candidate = os.path.join(config.cuda.root, 'bin', 'nvcc')
        if not os.path.exists(candidate):
            return False
        global nvcc_path
        nvcc_path = candidate
        try:
            _detect_version()
        except Exception:
            return False
    return True
# Default runtime search paths embedded into every NVCC link
# (consumed by NVCC_compiler.compile_str via its `rpaths` default).
rpath_defaults = []


def add_standard_rpath(rpath):
    """Record *rpath* as a default -rpath for future NVCC links."""
    rpath_defaults.append(rpath)
class NVCC_compiler(Compiler):
    """Compiler subclass that drives nvcc to build CUDA modules."""

    # amdlibm is a host-compiler feature not supported through nvcc.
    supports_amdlibm = False

    @classmethod
    def try_compile_tmp(cls, src_code, tmp_prefix='', flags=(),
                        try_run=False, output=False, comp_args=False):
        # Delegate to the base-class helper, forcing the nvcc binary.
        return cls._try_compile_tmp(src_code, tmp_prefix, flags,
                                    try_run, output, nvcc_path, comp_args)

    @classmethod
    def try_flags(cls, flag_list, preambule="", body="",
                  try_run=False, output=False, comp_args=False):
        # Delegate to the base-class helper, forcing the nvcc binary.
        return cls._try_flags(flag_list, preambule, body, try_run, output,
                              nvcc_path, comp_args)

    @staticmethod
    def version_str():
        # nvcc_version is populated by is_nvcc_available().
        return "nvcc " + nvcc_version

    @staticmethod
    def compile_args():
        """
        These args will be received by compile_str() in the preargs
        parameter.  They will also be included in the "hard" part of
        the key module.
        """
        flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
        if config.nvcc.fastmath:
            flags.append('-use_fast_math')
        # Tie compiled modules to the exact revision of cuda_ndarray.cuh.
        cuda_ndarray_cuh_hash = hash_from_file(
            os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh'))
        flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)
        # NumPy 1.7 Deprecate the old API.
        # The following macro asserts that we don't bring new code
        # that use the old API.
        flags.append("-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
        # If the user didn't specify architecture flags add them
        if not any(['-arch=sm_' in f for f in flags]):
            # We compile cuda_ndarray.cu during import.
            # We should not add device properties at that time.
            # As the device is not selected yet!
            # TODO: re-compile cuda_ndarray when we bind to a GPU?
            import theano.sandbox.cuda
            if hasattr(theano.sandbox, 'cuda'):
                n = theano.sandbox.cuda.use.device_number
                if n is None:
                    _logger.warn(
                        "We try to get compilation arguments for CUDA"
                        " code, but the GPU device is not initialized."
                        " This is probably caused by an Op that work on"
                        " the GPU that don't inherit from GpuOp."
                        " We Initialize the GPU now.")
                    theano.sandbox.cuda.use(
                        "gpu",
                        force=True,
                        default_to_move_computation_to_gpu=False,
                        move_shared_float32_to_gpu=False,
                        enable_cuda=False)
                    n = theano.sandbox.cuda.use.device_number
                p = theano.sandbox.cuda.device_properties(n)
                flags.append('-arch=sm_' + str(p['major']) +
                             str(p['minor']))
        return flags

    @staticmethod
    def compile_str(
            module_name, src_code,
            location=None, include_dirs=[], lib_dirs=[], libs=[], preargs=[],
            rpaths=rpath_defaults, py_module=True, hide_symbols=True):
        """
        Parameters
        ----------
        module_name: str
            This has been embedded in the src_code.
        src_code
            A complete c or c++ source listing for the module.
        location
            A pre-existing filesystem directory where the
            cpp file and .so will be written.
        include_dirs
            A list of include directory names (each gets prefixed with -I).
        lib_dirs
            A list of library search path directory names (each gets
            prefixed with -L).
        libs
            A list of libraries to link with (each gets prefixed with -l).
        preargs
            A list of extra compiler arguments.
        rpaths
            List of rpaths to use with Xlinker. Defaults to `rpath_defaults`.
        py_module
            If False, compile to a shared library, but
            do not import as a Python module.
        hide_symbols
            If True (the default), hide all symbols from the library symbol
            table unless explicitly exported.

        Returns
        -------
        module
            Dynamically-imported python module of the compiled code.
            (unless py_module is False, in that case returns None.)

        Notes
        -----
        On Windows 7 with nvcc 3.1 we need to compile in the real directory
        Otherwise nvcc never finish.
        """
        # Remove empty string directory
        include_dirs = [d for d in include_dirs if d]
        lib_dirs = [d for d in lib_dirs if d]

        rpaths = list(rpaths)

        if sys.platform == "win32":
            # Remove some compilation args that cl.exe does not understand.
            # cl.exe is the compiler used by nvcc on Windows.
            # NOTE(review): this mutates the caller's list (and the mutable
            # default) in place, and runs before the `preargs is None`
            # check below — on win32 with preargs=None this would raise.
            # Confirm intended before changing.
            for a in ["-Wno-write-strings", "-Wno-unused-label",
                      "-Wno-unused-variable", "-fno-math-errno"]:
                if a in preargs:
                    preargs.remove(a)
        if preargs is None:
            preargs = []
        else:
            # Copy so later edits don't leak back to the caller.
            preargs = list(preargs)
        if sys.platform != 'win32':
            preargs.append('-fPIC')
        if config.cmodule.remove_gxx_opt:
            preargs = [p for p in preargs if not p.startswith('-O')]

        cuda_root = config.cuda.root

        # The include dirs gived by the user should have precedence over
        # the standards ones.
        include_dirs = include_dirs + std_include_dirs()
        if os.path.abspath(os.path.split(__file__)[0]) not in include_dirs:
            include_dirs.append(os.path.abspath(os.path.split(__file__)[0]))

        libs = libs + std_libs()
        if 'cudart' not in libs:
            libs.append('cudart')

        lib_dirs = lib_dirs + std_lib_dirs()

        if sys.platform != 'darwin':
            # config.dnn.include_path add this by default for cudnn in the
            # new back-end. This should not be used in this back-end. So
            # just remove them.
            lib_dirs = [ld for ld in lib_dirs if
                        not(ld == os.path.join(cuda_root, 'lib') or
                            ld == os.path.join(cuda_root, 'lib64'))]

        if sys.platform != 'darwin':
            # sometimes, the linker cannot find -lpython so we need to tell it
            # explicitly where it is located
            # this returns somepath/lib/python2.x
            python_lib = distutils.sysconfig.get_python_lib(plat_specific=1,
                                                            standard_lib=1)
            python_lib = os.path.dirname(python_lib)
            if python_lib not in lib_dirs:
                lib_dirs.append(python_lib)

        # Pick the source extension: plain .cpp compiles much faster when
        # the code has no device code of its own.
        if (config.nvcc.cudafe == 'heuristic' and not
                any(marker in src_code for marker in ("__global__", "__device__",
                                                      "__host__", "<<<",
                                                      "nvmatrix.cuh"))):
            # only calls existing CUDA functions, can compile much faster
            cppfilename = os.path.join(location, 'mod.cpp')
            src_code = ("#include <cuda.h>\n"
                        "#include <cuda_runtime_api.h>\n" +
                        src_code)
        else:
            # contains CUDA host code or device functions, needs .cu extension
            cppfilename = os.path.join(location, 'mod.cu')
        with open(cppfilename, 'w') as cppfile:
            _logger.debug('Writing module C++ code to %s', cppfilename)
            cppfile.write(src_code)

        lib_filename = os.path.join(
            location, '%s.%s' %
            (module_name, get_lib_extension()))

        _logger.debug('Generating shared lib %s', lib_filename)
        # TODO: Why do these args cause failure on gtx285 that has 1.3
        # compute capability? '--gpu-architecture=compute_13',
        # '--gpu-code=compute_13',

        # Split args: preargs1 go to nvcc itself, preargs2 are forwarded
        # to the host compiler via -Xcompiler.
        preargs1 = []
        preargs2 = []
        for pa in preargs:
            if pa.startswith('-Wl,'):
                # the -rpath option is not understood by the Microsoft linker
                if sys.platform != 'win32' or not pa.startswith('-Wl,-rpath'):
                    preargs1.append('-Xlinker')
                    preargs1.append(pa[4:])
                continue
            for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I',
                            '-L', '--fmad', '--ftz', '--maxrregcount',
                            '--prec-div', '--prec-sqrt', '--use_fast_math',
                            '-fmad', '-ftz', '-maxrregcount',
                            '-prec-div', '-prec-sqrt', '-use_fast_math',
                            '--use-local-env', '--cl-version=', '-std=']:
                if pa.startswith(pattern):
                    preargs1.append(pa)
                    break
            else:
                preargs2.append(pa)

        # Don't put -G by default, as it slow things down.
        # We aren't sure if -g slow things down, so we don't put it by default.
        cmd = [nvcc_path, '-shared'] + preargs1
        if config.nvcc.compiler_bindir:
            cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])

        if sys.platform == 'win32':
            # add flags for Microsoft compiler to create .pdb files
            preargs2.extend(['/Zi', '/MD'])
            cmd.extend(['-Xlinker', '/DEBUG'])
            # remove the complaints for the duplication of `double round(double)`
            # in both math_functions.h and pymath.h,
            # by not including the one in pymath.h
            cmd.extend(['-D HAVE_ROUND'])
        else:
            if hide_symbols:
                preargs2.append('-fvisibility=hidden')

        if local_bitwidth() == 64:
            cmd.append('-m64')
        else:
            cmd.append('-m32')

        if len(preargs2) > 0:
            cmd.extend(['-Xcompiler', ','.join(preargs2)])

        # We should not use rpath if possible. If the user provided
        # provided an cuda.root flag, we need to add one, but
        # otherwise, we don't add it. See gh-1540 and
        # https://wiki.debian.org/RpathIssue for details.
        if (not type(config.cuda).root.is_default and
                os.path.exists(os.path.join(config.cuda.root, 'lib'))):

            rpaths.append(os.path.join(config.cuda.root, 'lib'))
            if sys.platform != 'darwin':
                # the CUDA libs are universal (contain both 32-bit and 64-bit)
                rpaths.append(os.path.join(config.cuda.root, 'lib64'))
        if sys.platform != 'win32':
            # the -rpath option is not understood by the Microsoft linker
            for rpath in rpaths:
                cmd.extend(['-Xlinker', ','.join(['-rpath', rpath])])

        # to support path that includes spaces, we need to wrap it with double quotes on Windows
        path_wrapper = "\"" if os.name == 'nt' else ""
        cmd.extend(['-I%s%s%s' % (path_wrapper, idir, path_wrapper) for idir in include_dirs])
        cmd.extend(['-L%s%s%s' % (path_wrapper, ldir, path_wrapper) for ldir in lib_dirs])
        cmd.extend(['-o', lib_filename])
        cmd.append(os.path.split(cppfilename)[-1])
        cmd.extend(['-l%s' % l for l in libs])
        if sys.platform == 'darwin':
            # This tells the compiler to use the already-loaded python
            # symbols (which should always be the right ones).
            cmd.extend(['-Xcompiler', '-undefined,dynamic_lookup'])

        # Remove "-u Symbol" arguments, since they are usually not
        # relevant for the new compilation, even if they were used for
        # compiling python. If they are necessary, the nvcc syntax is
        # "-U Symbol" with a capital U.
        done = False
        while not done:
            try:
                indexof = cmd.index('-u')
                cmd.pop(indexof)  # Remove -u
                cmd.pop(indexof)  # Remove argument to -u
            except ValueError:
                done = True

        # CUDA Toolkit v4.1 Known Issues:
        # Host linker on Mac OS 10.7 (and 10.6 for me) passes -no_pie option
        # to nvcc this option is not recognized and generates an error
        # http://stackoverflow.com/questions/9327265/nvcc-unknown-option-no-pie
        # Passing -Xlinker -pie stops -no_pie from getting passed
        if sys.platform == 'darwin' and nvcc_version >= '4.1':
            cmd.extend(['-Xlinker', '-pie'])

        # cmd.append("--ptxas-options=-v") #uncomment this to see
        # register and shared-mem requirements
        _logger.debug('Running cmd %s', ' '.join(cmd))
        orig_dir = os.getcwd()
        try:
            os.chdir(location)
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2]
            console_encoding = getpreferredencoding()
            nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
            nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
        finally:
            os.chdir(orig_dir)

        # Forward interesting stderr lines, dropping known-noise warnings.
        for eline in nvcc_stderr.split('\n'):
            if not eline:
                continue
            if 'skipping incompatible' in eline:
                # ld is skipping an incompatible library
                continue
            if 'declared but never referenced' in eline:
                continue
            if 'statement is unreachable' in eline:
                continue
            _logger.info("NVCC: %s", eline)

        if p.returncode:
            # On failure, dump the numbered source and the filtered
            # compiler output to ease debugging, then raise.
            for i, l in enumerate(src_code.split('\n')):
                print(i + 1, l, file=sys.stderr)
            print('===============================', file=sys.stderr)
            # filter the output from the compiler
            for l in nvcc_stderr.split('\n'):
                if not l:
                    continue
                # filter out the annoying declaration warnings
                try:
                    if l[l.index(':'):].startswith(': warning: variable'):
                        continue
                    if l[l.index(':'):].startswith(': warning: label'):
                        continue
                except Exception:
                    pass
                print(l, file=sys.stderr)
            print(nvcc_stdout)
            print(cmd)
            raise Exception('nvcc return status', p.returncode,
                            'for cmd', ' '.join(cmd))
        elif config.cmodule.compilation_warning and nvcc_stdout:
            print(nvcc_stdout)

        if nvcc_stdout:
            # On Windows, nvcc print useless stuff by default
            if sys.platform != 'win32':
                # this doesn't happen to my knowledge
                print("DEBUG: nvcc STDOUT", nvcc_stdout, file=sys.stderr)

        if py_module:
            # touch the __init__ file
            open(os.path.join(location, "__init__.py"), 'w').close()
            return dlimport(lib_filename)
This source diff could not be displayed because it is too large. You can view the blob instead.
from __future__ import absolute_import, print_function, division
from functools import wraps
import numpy
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)
# Scalar float32 constant 1.0; cloned by output_merge to replace beta.
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
    """
    Return a CPU scalar variable equivalent to *v* when *v* is an
    nd-dimensional broadcast of a scalar (via DimShuffle/GpuDimShuffle,
    possibly behind a GpuFromHost transfer, or a broadcastable
    Constant).  Return None otherwise.
    """
    owner = v.owner
    if owner is None:
        # No producing node: only a fully-broadcastable constant qualifies.
        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
            return v.dimshuffle(())
        return None
    op = owner.op
    if isinstance(op, GpuDimShuffle) and op.new_order == ('x',) * nd:
        return host_from_gpu(owner.inputs[0])
    if isinstance(op, DimShuffle) and op.new_order == ('x',) * nd:
        return owner.inputs[0]
    if isinstance(op, GpuFromHost):
        # Look through the host->gpu transfer.
        return grab_cpu_scalar(owner.inputs[0], nd=nd)
    return None
def find_node(v, cls, ignore_clients=False):
    """
    Dig through possibly redundant gpu<->host transfers for the node
    whose op is an instance of *cls*; return it, or None.

    Unless *ignore_clients* is True, only follow variables with exactly
    one client.
    """
    if v.owner is None or not (ignore_clients or len(v.clients) == 1):
        return None
    node = v.owner
    if isinstance(node.op, cls):
        return node
    if isinstance(node.op, GpuFromHost):
        inner = node.inputs[0]
        if (inner.owner is not None and
                (ignore_clients or len(inner.clients) == 1) and
                isinstance(inner.owner.op, HostFromGpu)):
            # Skip the round-trip transfer and keep digging.
            # (The recursion uses the default ignore_clients, as the
            # original implementation did.)
            return find_node(inner.owner.inputs[0], cls)
    return None
def is_equal(var, val):
    """
    Return True iff *var* is a constant always equal to *val* (a python
    value); False otherwise, including when *var* is not constant.
    """
    try:
        return get_scalar_constant_value(var) == val
    except NotScalarConstantError:
        return False
def alpha_merge(cls, alpha_in, beta_in):
    """
    Decorator factory for local optimizers that fold a scalar
    multiplication (``lr * Op(...)``) into the op's own alpha/beta
    inputs at positions *alpha_in* and *beta_in*.

    The decorated *maker* is called as ``maker(targ, *inputs)`` with the
    matched node and its rescaled inputs.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Match: elementwise scalar multiplication with two inputs.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.mul and
                    node.nin == 2):
                # One side must be the target op, the other a CPU scalar.
                targ = find_node(node.inputs[0], cls)
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    if targ is None:
                        return
                    lr = grab_cpu_scalar(node.inputs[0],
                                         nd=targ.outputs[0].ndim)
                else:
                    lr = grab_cpu_scalar(node.inputs[1],
                                         nd=targ.outputs[0].ndim)
                if lr is None or targ is None:
                    return None
                inputs = list(targ.inputs)
                try:
                    c = get_scalar_constant_value(lr)
                    if c == 0:
                        # Multiplying by 0 zeroes both coefficients.
                        inputs[alpha_in] = lr
                        inputs[beta_in] = lr
                    elif c == 1:
                        # Multiplying by 1 is a no-op on the coefficients.
                        inputs[alpha_in] = targ.inputs[alpha_in]
                        inputs[beta_in] = targ.inputs[beta_in]
                    else:
                        inputs[alpha_in] = lr * targ.inputs[alpha_in]
                        inputs[beta_in] = lr * targ.inputs[beta_in]
                except NotScalarConstantError:
                    # Non-constant scalar: rescale symbolically.
                    inputs[alpha_in] = lr * targ.inputs[alpha_in]
                    inputs[beta_in] = lr * targ.inputs[beta_in]
                return maker(targ, *inputs)
        return opt
    return wrapper
def output_merge(cls, alpha_in, beta_in, out_in):
    """
    Decorator factory for local optimizers that fold an elementwise
    addition (``Op(...) + W``) into the op's output input at position
    *out_in*, setting beta (position *beta_in*) to 1.

    Only applies when the op's current beta is exactly 0 and W has the
    same broadcastable pattern as the current output input.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Match: elementwise addition with two inputs.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.add and
                    node.nin == 2):
                targ = find_node(node.inputs[0], cls)
                W = node.inputs[1]
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    W = node.inputs[0]
                if targ is None:
                    return None
                if not is_equal(targ.inputs[beta_in], 0.0):
                    # other cases are too complex for now
                    return None
                if W.broadcastable != targ.inputs[out_in].broadcastable:
                    # May change later to do the broadcast, but it's
                    # under discussion.
                    return None
                inputs = list(targ.inputs)
                inputs[out_in] = W
                # beta = 1 so the existing output content is accumulated.
                inputs[beta_in] = _one.clone()
                return maker(targ, *inputs)
        return opt
    return wrapper
def pad_dims(input, leftdims, rightdims):
    """Reshape *input* to exactly ``leftdims + rightdims`` dimensions.

    Used to adapt pooling inputs with arbitrary non-pooling dimensions
    to the fixed dimensionality the GPU pooling ops expect: the last
    *rightdims* axes are kept as-is while the leading axes are either
    padded with singleton dimensions or collapsed into a single batch
    axis so that exactly *leftdims* leading axes remain.  Use
    `unpad_dims` to reshape back.

    Examples
    --------
    - (3, 5, 7) with (2, 2)      -> (3, 1, 5, 7)  (pad)
    - (3, 5, 7, 9) with (2, 2)   -> unchanged
    - (3, 5, 7, 9, 11) with (2, 2) -> (15, 7, 9, 11)  (collapse)
    - (3, 5, 7, 9) with (2, 3)   -> (3, 1, 5, 7, 9)  (pad)
    """
    assert input.ndim >= rightdims
    target_ndim = leftdims + rightdims
    if input.ndim == target_ndim:
        # Already the right rank, nothing to do.
        return input

    trailing_shape = input.shape[-rightdims:]
    leading_ndim = input.ndim - rightdims
    if leading_ndim < leftdims:
        # Too few leading axes: prepend singleton dimensions.
        ones = tensor.as_tensor([1] * (leftdims - leading_ndim))
        new_shape = tensor.join(0, ones,
                                input.shape[:leading_ndim],
                                trailing_shape)
    else:
        # Too many leading axes: merge the first ones into one batch axis.
        merged_ndim = leading_ndim - leftdims + 1
        # prod gives a scalar; pad to a length-1 vector for tensor.join.
        batch = tensor.shape_padright(tensor.prod(input.shape[:merged_ndim]), 1)
        new_shape = tensor.join(0, batch,
                                input.shape[merged_ndim:leading_ndim],
                                trailing_shape)

    new_shape = tensor.cast(new_shape, 'int64')
    return GpuReshape(target_ndim)(input, new_shape)
def unpad_dims(output, input, leftdims, rightdims):
    """Reshape *output* back to *input*'s original dimensionality.

    Inverse of `pad_dims`: keeps the last *rightdims* axes of *output*
    and restores the original leading axes from *input*.
    """
    if output.ndim == input.ndim:
        # pad_dims was a no-op, so is the inverse.
        return output
    restored_shape = tensor.join(0,
                                 input.shape[:-rightdims],
                                 output.shape[-rightdims:])
    return GpuReshape(input.ndim)(output, restored_shape)
from __future__ import absolute_import, print_function, division
import numpy
import theano.gof
from theano.compat import PY3
from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb
from theano.gof import local_optimizer, Variable
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
# Shortcut to the global Theano configuration object.
config = theano.config
class CURAND_Base(GpuOp):
    """
    Base class for a random number generator implemented in CURAND.

    The random number generator itself is an opaque reference managed by
    CURAND. This Op uses a generic-typed shared variable to point to a CObject
    that encapsulates this opaque reference.

    Each random variable is created with a generator of None.
    The actual random number generator is allocated from the seed, on the first
    call to allocate random numbers (see c_code).

    Parameters
    ----------
    output_type
        A theano type (e.g. tensor.fvector).
    seed: int
    destructive
        True or False (on the generator)

    Notes
    -----
    One caveat is that the random number state is simply not serializable.
    Consequently, attempts to serialize functions compiled with these
    random numbers will fail.
    """

    def __init__(self, output_type, seed, destructive):
        theano.gof.Op.__init__(self)
        self.destructive = destructive
        if self.destructive:
            # Declare the in-place update of input 0 (the generator).
            self.destroy_map = {0: [0]}
        self.seed = seed
        self.output_type = output_type
        # Only float32 output is supported by the generated code paths used.
        assert output_type.dtype == "float32"

    def as_destructive(self):
        """
        Return a destructive version of self.
        """
        return self.__class__(self.output_type, self.seed, destructive=True)

    def _config(self):
        """
        Return a tuple of attributes that define the Op.
        """
        return (self.destructive,
                self.output_type,
                self.seed,
                )

    def __eq__(self, other):
        return type(self) == type(other) and self._config() == other._config()

    def __hash__(self):
        return hash((type(self), self._config()))

    def __str__(self):
        return (self.__class__.__name__ + "{inplace=%s, out_dtype=%s}" %
                (self.destructive, self.output_type))

    def make_node(self, generator, size):
        # Outputs: the (possibly updated) generator and the sample tensor.
        return theano.gof.Apply(self, [generator, size],
                                [generator.type(), self.output_type()])

    @classmethod
    def new_auto_update(cls, generator, ndim, dtype, size, seed):
        """
        Return a symbolic sample from generator.

        cls dictates the random variable (e.g. uniform, normal).
        """
        v_size = theano.tensor.as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        self = cls(output_type=CudaNdarrayType((False,) * ndim),
                   seed=seed,
                   destructive=False)
        o_gen, sample = self(generator, cast(v_size, 'int32'))
        sample.generator = generator  # for user
        sample.update = (generator, o_gen)  # for CURAND_RandomStreams
        generator.default_update = o_gen  # for pfunc uses this attribute
        return sample

    def c_headers(self):
        return ["curand.h"]

    def c_libraries(self):
        return ['curand']

    def c_support_code(self):
        # Destructor for the CObject/capsule holding the CURAND generator.
        return """
        #if PY_MAJOR_VERSION >= 3
        void free_generator(PyObject *_gen)
        {
            curandGenerator_t * gen = (curandGenerator_t*)NpyCapsule_AsVoidPtr(_gen);
        #else
        void free_generator(void *_gen)
        {
            curandGenerator_t * gen = (curandGenerator_t*)_gen;
        #endif
            curandStatus_t err = curandDestroyGenerator(*gen);
            if (err != CURAND_STATUS_SUCCESS)
            {
                fprintf(stderr, "Failure (%i) in destroying CURAND generator.\\n",
                        (int)err);
            }
            free(gen);
        }
        """

    def c_code(self, node, nodename, inp, out, sub):
        i_generator, size = inp
        o_generator, o_sample = out
        destructive = int(self.destructive)
        ndim = self.output_type.ndim
        o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']
        seed = self.seed
        # Subclasses supply the actual curandGenerate* call.
        call_string = self._curand_call_str(o_sample=o_sample)
        if self.output_type.dtype == 'float32':
            otype = 'float'
        else:
            otype = 'double'
        code = """
        //////// <code generated by CURAND_Base>
        int odims[%(ndim)s];
        int n_elements = 1;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
                || (CudaNdarray_NDIM(%(o_sample)s) != %(ndim)s));

        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
        if (PyArray_TYPE(%(size)s) != NPY_INT32)
        {
            PyErr_SetString(PyExc_ValueError, "size must be int32");
            %(fail)s
        }
        for (int i = 0; i < %(ndim)s; ++i)
        {
            odims[i] = ((npy_int32*)PyArray_GETPTR1(%(size)s, i))[0];
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!PyCObject_Check(%(i_generator)s))
        {
            // allocate a new generator for o_generator
            Py_XDECREF(%(o_generator)s);
            curandGenerator_t * gen = (curandGenerator_t*)malloc(sizeof(curandGenerator_t));
            assert(gen);
            if (CURAND_STATUS_SUCCESS !=
                    curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT)) {
                PyErr_Format(PyExc_RuntimeError, "Failed to initialize curand generator");
                %(fail)s;
            }
            if (CURAND_STATUS_SUCCESS !=
                    curandSetPseudoRandomGeneratorSeed(*gen,%(seed)s))
            {
                PyErr_Format(PyExc_RuntimeError, "Failed to set curand generator seed");
                %(fail)s;
            }
            %(o_generator)s = PyCObject_FromVoidPtr(gen, &free_generator);
            assert (%(i_generator)s == Py_None);
        }
        else if (%(destructive)s)
        {
            // use i_generator for o_generator
            Py_XDECREF(%(o_generator)s);
            Py_INCREF(%(i_generator)s);
            %(o_generator)s = %(i_generator)s;
        }
        else
        {
            // copy i_generator for o_generator
            PyErr_Format(PyExc_NotImplementedError, "non-destructive CURAND generation");
            %(fail)s;
        }
        {
            curandGenerator_t * gen = (curandGenerator_t*)PyCObject_AsVoidPtr(%(o_generator)s);
            curandStatus_t err = %(call_string)s
            if (err != CURAND_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError, "curand error generating random normals %%i", (int)err);
                %(fail)s;
            }
            cudaThreadSynchronize();
        }
        //////// </ code generated by CURAND_Base>
        """ % locals()
        if PY3:
            # Python 3 has no PyCObject; the support code and this template
            # both switch to the NpyCapsule wrappers.
            code = code.replace("PyCObject", "NpyCapsule")
        return code

    def c_code_cache_version(self):
        return (5,)
class CURAND_Normal(CURAND_Base):
    """
    Op to draw normal numbers using CURAND.

    Samples are standard-normal (mean 0.0, stddev 1.0); scaling and
    shifting are done symbolically by CURAND_RandomStreams.normal().
    """

    def _curand_call_str(self, **kwargs):
        # Spliced into CURAND_Base.c_code as the generation call.
        return """curandGenerateNormal(*gen,
                CudaNdarray_DEV_DATA(%(o_sample)s),
                n_elements,
                0.0, 1.0);
        """ % kwargs
class CURAND_Uniform(CURAND_Base):
    """
    Op to draw uniform numbers using CURAND.

    Samples cover the unit interval; rescaling to [low, high) is done
    symbolically by CURAND_RandomStreams.uniform().
    """

    def _curand_call_str(self, **kwargs):
        # Spliced into CURAND_Base.c_code as the generation call.
        return """ curandGenerateUniform(*gen,
                CudaNdarray_DEV_DATA(%(o_sample)s),
                n_elements);
        """ % kwargs
class CURAND_RandomStreams(object):
    """
    RandomStreams instance that creates CURAND-based random variables.

    One caveat is that generators are not serializable.

    Parameters
    ----------
    seed : int
        Base seed; each random variable created gets its own distinct
        seed derived from it (see next_seed).
    """

    def __init__(self, seed):
        self._start_seed = seed
        self._cur_seed = seed
        self._has_lost_states = False  # True if self.state_updates incomplete
        self.state_updates = []

    def updates(self):
        """
        List of all (old, new) generator update pairs created by this
        instance.
        """
        return list(self.state_updates)

    def next_seed(self):
        """
        Return a unique seed for initializing a random variable.
        """
        self._cur_seed += 1
        return self._cur_seed - 1

    def __getstate__(self):
        rval = dict(self.__dict__)
        # the CObject used to store updates cannot be serialized;
        # record that states were lost so it is detectable after unpickling.
        rval['state_updates'] = []
        rval['_has_lost_states'] = True
        return rval

    @staticmethod
    def _check_size(size):
        # Shared validation for uniform()/normal(): size must be a tuple
        # of ints/Variables, or a 1-d Theano variable.
        msg = "size must be a tuple of int or a Theano variable"
        if isinstance(size, tuple):
            assert all([isinstance(i, int) or isinstance(i, Variable)
                        for i in size]), msg
        else:
            assert isinstance(size, Variable) and size.ndim == 1, msg

    def uniform(self, size, low=0.0, high=1.0, ndim=None,
                dtype=config.floatX):
        """
        Return symbolic tensor of uniform numbers in [low, high).

        Parameters
        ----------
        size
            A tuple of int/Theano variables, or a 1-d Theano variable
            giving the output shape.
        """
        self._check_size(size)
        generator = theano.shared(None)  # makes a generic shared variable
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size,
                                           self.next_seed())
        self.state_updates.append(u.update)
        rval = u * (high - low) + low
        if u.type.broadcastable != rval.type.broadcastable:
            raise NotImplementedError(
                'Increase the size to match the broadcasting pattern of '
                'low and `high` arguments'
            )
        return rval

    def normal(self, size=None, avg=0.0, std=1.0, ndim=None,
               dtype=config.floatX):
        """
        Return symbolic tensor of normally-distributed numbers
        (mean *avg*, standard deviation *std*).

        Parameters
        ----------
        size
            Can be a list of integer or Theano variable (ex: the shape
            of other Theano Variable)
        """
        self._check_size(size)
        generator = theano.shared(None)  # makes a generic shared variable
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
                                          self.next_seed())
        self.state_updates.append(u.update)
        rval = u * std + avg
        if u.type.broadcastable != rval.type.broadcastable:
            # BUG FIX: the message used to name `low`/`high` (copied from
            # uniform) and was missing a space between the two string
            # literals; normal's scale/shift arguments are avg and std.
            raise NotImplementedError(
                'Increase the size to match the broadcasting pattern of '
                '`avg` and `std` arguments'
            )
        return rval
@local_optimizer([CURAND_Base])
def local_destructive(node):
    """
    Graph rewrite: replace a non-destructive CURAND op with its
    in-place (destructive) counterpart.
    """
    op = node.op
    # op might be gpu version
    if not isinstance(op, CURAND_Base) or op.destructive:
        return False
    inplace_op = op.as_destructive()
    return inplace_op.make_node(*node.inputs).outputs
# Register the destructive-replacement rewrite in the global optimizer
# database at position 99 with the 'fast_run' and 'inplace' tags.
optdb.register('CURAND_destructive',
               opt.in2out(local_destructive, ignore_newtrees=True),
               99, 'fast_run', 'inplace')
ctheano.sandbox.cuda.type
CudaNdarray_unpickler
p1
(cnumpy.core.multiarray
_reconstruct
p2
(cnumpy
ndarray
p3
(I0
tS'b'
tRp4
(I1
(I1
tcnumpy
dtype
p5
(S'f4'
I0
I1
tRp6
(I3
S'<'
NNNI-1
I-1
I0
tbI00
S'\x00\x00(\xc2'
tbtR.
\ No newline at end of file
No preview for this file type
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论