Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py

Merge pull request #4244 from ChihebTrabelsi/ccw2.0
b69ad54d · Xavier Bouthillier · 200babca · 58267dc2 · b69ad54d · b69ad54d
--- a/theano/sandbox/cuda/GpuConvGrad3D.py
+++ b/theano/sandbox/cuda/GpuConvGrad3D.py
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = as_cuda_ndarray_variable(dCdH)
-        broad = (False,)*5
+        broad = (False,) * 5
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
        # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
-        outputFilters = dCdH.shape[1]
        outputHeight = dCdH.shape[2]
        outputWidth = dCdH.shape[3]
        outputDur = dCdH.shape[4]
        assert V.shape[0] == batchSize
-        inputFilters = V.shape[1]
-        inputHeight = V.shape[2]
-        inputWidth = V.shape[3]
-        inputDur = V.shape[4]
        dr, dc, dt = d
        dCdW = numpy.zeros(WShape, dtype=V.dtype)
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
                                for p in xrange(0, outputHeight):
                                    for q in xrange(0, outputWidth):
                                        for r in xrange(0, outputDur):
-                                            dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m]
+                                            dCdW[j, z, k, l, m] += dCdH[
+                                                i, j, p, q, r] * \
+                                                V[i, z, dr * p + k,
+                                                  dc * q + l,
+                                                  dt * r + m]
        output_storage[0][0] = dCdW

--- a/theano/sandbox/cuda/GpuConvTransp3D.py
+++ b/theano/sandbox/cuda/GpuConvTransp3D.py
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
        else:
            RShape_ = T.as_tensor_variable([-1, -1, -1])
-        return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
+        return theano.Apply(
-                            outputs=[CudaNdarrayType(dtype=H_.dtype,
+            self, inputs=[W_, b_, d_, H_, RShape_],
-                                                     broadcastable=(False,)*5)()])
+            outputs=[CudaNdarrayType(
+                dtype=H_.dtype, broadcastable=(False,) * 5)()])
    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
        assert dc > 0
        assert dt > 0
-        videoHeight = (outputHeight-1) * dr + filterHeight
+        videoHeight = (outputHeight - 1) * dr + filterHeight
-        videoWidth = (outputWidth-1) * dc + filterWidth
+        videoWidth = (outputWidth - 1) * dc + filterWidth
-        videoDur = (outputDur-1) * dt + filterDur
+        videoDur = (outputDur - 1) * dt + filterDur
        if Rshape is not None and Rshape[0] != -1:
            if Rshape[0] < videoHeight:
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
        # else:
        #    print "No Rshape passed in"
-        # print "video size: "+str((videoHeight, videoWidth, videoDur))
+        # print "video size: " + str((videoHeight, videoWidth, videoDur))
-        R =  numpy.zeros( (batchSize, inputChannels, videoHeight,
+        R = numpy.zeros((batchSize, inputChannels, videoHeight,
-            videoWidth, videoDur ) , dtype=H.dtype)
+                         videoWidth, videoDur),
+                        dtype=H.dtype)
-        # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
+        # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
+        # sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
        for i in xrange(0, batchSize):
            # print '\texample '+str(i+1)+'/'+str(batchSize)
            for j in xrange(0, inputChannels):
-                # print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
+                # print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
                for r in xrange(0, videoHeight):
-                    # print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
+                    # print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
                    for c in xrange(0, videoWidth):
                        for t in xrange(0, videoDur):
                            R[i, j, r, c, t] = b[j]
-                            ftc = max([0, int(numpy.ceil(float(t-filterDur + 1  )/float(dt))) ])
+                            ftc = max(
-                            fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ])
+                                [0,
+                                 int(numpy.ceil(
-                            rc =  max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ])
+                                     float(t - filterDur + 1) / float(dt)
+                                     ))
+                                 ]
+                            )
+                            fcc = max(
+                                [0,
+                                 int(numpy.ceil(
+                                     float(c - filterWidth + 1) / float(dc)
+                                     ))
+                                 ]
+                            )
+                            rc = max(
+                                [0,
+                                 int(numpy.ceil(
+                                     float(r - filterHeight + 1) / float(dr)
+                                     ))
+                                 ]
+                            )
                            while rc < outputHeight:
                                rk = r - rc * dr
                                if rk < 0:
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
                                        if tk < 0:
                                            break
-                                        R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] )
+                                        R[i, j, r, c, t] += numpy.dot(
+                                            W[:, j, rk, ck, tk],
+                                            H[i, :, rc, cc, tc])
                                        tc += 1
                                    ""  # close loop over tc

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
 import copy
 import logging
 import sys
+import warnings
 import numpy
 from six import iteritems
 from six.moves import StringIO, xrange
@@ -12,6 +12,9 @@ from theano import gof, Type, Apply
 from theano import tensor, scalar, config
 from theano.gradient import grad_undefined
 from theano.scalar import Scalar
+from theano.sandbox.cuda import GpuOp
+from theano.sandbox.cuda.type import CudaNdarrayType
+from theano.sandbox.cuda.elemwise import NaiveAlgo
 scal = scalar  # somewhere scalar gets reassigned to be a function
@@ -24,10 +27,6 @@ try:
 except ImportError:
    pass
-from theano.sandbox.cuda import GpuOp
-from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.sandbox.cuda.elemwise import NaiveAlgo
 _logger_name = 'theano.sandbox.cuda.basic_ops'
 _logger = logging.getLogger(_logger_name)
@@ -596,10 +595,8 @@ class GpuCAReduce(GpuOp):
        if self.pre_scalar_op:
            pre = "pre=%s,red=" % str(self.pre_scalar_op)
        return "GpuCAReduce{%s%s}{%s}" % (
-                pre,
+            pre, str(self.scalar_op),
-                str(self.scalar_op),
+            ','.join(str(i) for i in self.reduce_mask))
-                ','.join(str(i) for i in self.reduce_mask)
-                )
    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -775,15 +772,18 @@ class GpuCAReduce(GpuOp):
            # check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
            # TODO: check if we are ccontiguous when we un-dimshuffle
            # TODO: if only some dims are ccontiguous, call version with less dims.
-            print('if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals(), file=sio)
+            print('if(CudaNdarray_is_c_contiguous( %(x)s)){' % locals(),
+                  file=sio)
            self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
            print("}else{", file=sio)
-            getattr(self, 'c_code_reduce_%s'%(''.join(
+            getattr(self, 'c_code_reduce_%s' % (''.join(
-                str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
+                str(i) for i in self.reduce_mask)))(
+                    sio, node, name, x, z, fail)
            print("}", file=sio)
        else:
-            getattr(self, 'c_code_reduce_%s'%(''.join(
+            getattr(self, 'c_code_reduce_%s' % (''.join(
-                str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
+                str(i) for i in self.reduce_mask)))(
+                    sio, node, name, x, z, fail)
        # \end bracket the reduction ...
        print("""
@@ -976,7 +976,7 @@ class GpuCAReduce(GpuOp):
            assert isinstance(self.scalar_op, (scal.Maximum,
                                               scal.Minimum))
            if self.pre_scalar_op:
-                #dtype = node.inputs[0].dtype
+                # dtype = node.inputs[0].dtype
                dtype = 'float32'
                dummy_var = scal.Scalar(dtype=dtype)()
@@ -1834,12 +1834,15 @@ class GpuCAReduce(GpuOp):
        version = [15]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
-        scalar_node = Apply(self.scalar_op,
+        Apply(self.scalar_op,
-                [Scalar(dtype=input.type.dtype)() for input in node.inputs],
+              [Scalar(
-                [Scalar(dtype=output.type.dtype)() for output in node.outputs])
+                  dtype=input.type.dtype)() for input in node.inputs],
+              [Scalar(
+                  dtype=output.type.dtype)() for output in node.outputs])
        version.extend(self.scalar_op.c_code_cache_version())
        for i in node.inputs + node.outputs:
-            version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
+            version.extend(
+                Scalar(dtype=i.type.dtype).c_code_cache_version())
        if all(version):
            return tuple(version)
        else:
@@ -1946,10 +1949,11 @@ class GpuCAReduce(GpuOp):
                %(reducebuf)s
            }
            """ % locals(), file=sio)
-        #01, 011, 0111
+        # 01, 011, 0111
        if (0 == self.reduce_mask[0] and
                all(self.reduce_mask[1:]) and
                nd_in in[2, 3, 4]):
            # this kernel uses one block for each row.
            # threads per block for each element per row.
@@ -2117,10 +2121,10 @@ class GpuCAReduce(GpuOp):
            # this kernel uses one block for multiple column(up to 32TODO),
            # threads per block for each element per column.
-# thread.x = dim 2 contiguous
+            # thread.x = dim 2 contiguous
-# thread.y = dim 1
+            # thread.y = dim 1
-# block.x = dim 0
+            # block.x = dim 0
-# block.y = dim 1 rest
+            # block.y = dim 1 rest
            init = self._k_init(node, nodename)
            decl = self._k_decl(node, nodename, pattern="010_inner")
            reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
@@ -2470,7 +2474,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
                if (x.size % ss) != 0:
                    raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.")
                shp_new = numpy.copy(shp)
-                shp_new[m1_idx] = x.size/ss
+                shp_new[m1_idx] = x.size / ss
                shp = shp_new
            else:
@@ -2721,7 +2725,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
    def perform(self, node, inp, out_):
        # This don't work as CudaNdarray_Subscript() don't support it.
-        #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
+        # super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
        x, idx = inp
        out, = out_
        x_orig = x
@@ -2733,7 +2737,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
        if x.ndim <= 3:
            # CudaNdarray.take only supports ndim <= 3
            if self.perform_using_take is not None:
-                assert self.perform_using_take == True, (
+                assert self.perform_using_take is True, (
                    "GpuAdvancedSubtensor1 used the fast version")
            if idx.dtype != numpy.int64:
                if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
@@ -2762,7 +2766,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
            out[0] = o
        else:
            if self.perform_using_take is not None:
-                assert self.perform_using_take == False, (
+                assert self.perform_using_take is False, (
                    "GpuAdvancedSubtensor1 didn't use the fast version")
            if out_[0][0] is None or out_[0][0].shape != out_shape:
                o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
@@ -3006,8 +3010,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        convert_map = {8: tensor.basic._convert_to_int8,
                       16: tensor.basic._convert_to_int16,
                       32: tensor.basic._convert_to_int32,
-                       64: tensor.basic._convert_to_int64
+                       64: tensor.basic._convert_to_int64}
-        }
        intwidth = theano.configdefaults.python_int_bitwidth()
        ilist_ = convert_map[intwidth](ilist_)
@@ -3354,7 +3357,6 @@ class GpuFlatten(gof.HideC, tensor.Flatten, GpuOp):
        return Apply(self, [x], [out_type()])
 def gpu_flatten(x, outdim=1):
    """
    Implement flatten on the gpu.
@@ -3378,9 +3380,9 @@ def gpu_flatten(x, outdim=1):
    """
    x = as_cuda_ndarray_variable(x)
    if outdim > 1:
-        dims = tuple(x.shape[:outdim-1])+(-1,)
+        dims = tuple(x.shape[:outdim - 1]) + (-1, )
    else:
-        dims = (-1,)
+        dims = (-1, )
    return GpuReshape(outdim)(x, dims)
@@ -3408,12 +3410,11 @@ class GpuJoin(tensor.Join, GpuOp):
        as_tensor_variable_args = [as_cuda_ndarray_variable(x)
                                   for x in tensors]
-        output_maker = \
+        def output_maker(bcast):
-                lambda bcast: CudaNdarrayType(broadcastable=bcast)()
+            return(CudaNdarrayType(broadcastable=bcast)())
-        return tensor.Join._make_node_internal(self,
+        return tensor.Join._make_node_internal(
-                        axis, tensors,
+            self, axis, tensors, as_tensor_variable_args, output_maker)
-                        as_tensor_variable_args, output_maker)
    def perform(self, node, axis_and_tensors, out_):
        out, = out_
@@ -3464,7 +3465,7 @@ class GpuJoin(tensor.Join, GpuOp):
        # except for 'axis'
        def construct_slices(curlen):
-            slices = [slice(None, None, None) for i in \
+            slices = [slice(None, None, None) for i in
                      xrange(len(template_shape))]
            slices[axis] = slice(curpos, curpos + curlen, None)
            return tuple(slices)
@@ -3829,8 +3830,8 @@ class GpuAlloc(GpuAllocEmpty):
                # If the output is a constant, it will have to be deepcopied
                # each time the function is called.  So we do not fold.
                return False
-            elif (  # The following ops work inplace of their input id 0.
+            # Else if the following ops work inplace of their input id 0.
-                  client[1] == 0 and
+            elif(client[1] == 0 and
                 isinstance(client[0].op, (
                     # Ops that will work inplace on the Alloc. So if they
                     # get constant_folded, they would copy the
@@ -3844,8 +3845,7 @@ class GpuAlloc(GpuAllocEmpty):
                     GpuAdvancedIncSubtensor1,
                     theano.sandbox.cuda.blas.GpuGemm,
                     theano.sandbox.cuda.blas.GpuGemv,
-                      theano.sandbox.cuda.blas.GpuGer,
+                     theano.sandbox.cuda.blas.GpuGer,))):
-                  ))):
                return False
            # If the clients is a transfer, we don't want to fold. We
            # let the moving opt finish before deciding what to do.

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
 from __future__ import absolute_import, print_function, division
-import copy
 import os
 import logging
-_logger = logging.getLogger(__name__)
 from six import integer_types
 from six.moves import StringIO, reduce
 import theano
 from theano import Apply
 from theano import tensor
@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           gpu_contiguous)
 from theano.tensor import as_tensor_variable
+_logger = logging.getLogger(__name__)
 class GpuBatchedDot(GpuOp):
@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
            }
        } else {
            // copy inputs if not contiguous
-            """ +
+            """ + ("\n".join("""
-            ("\n".join("""
             if ((   CudaNdarray_HOST_DIMS(%(var)s)[0] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[0] != 1
                  && CudaNdarray_HOST_DIMS(%(var)s)[1] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[1] != 1
                  && CudaNdarray_HOST_DIMS(%(var)s)[2] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[2] != 1)
@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
                 Py_XDECREF(%(var)s);
                 %(var)s = _copy;
             }
-             """ % dict(var=var, fail=fail) for var in (bx, by)))
+             """ % dict(var=var, fail=fail) for var in (bx, by))) + """
-            + """
            // fail if the output is not contiguous; we can't copy it because we
            // need to write to the original memory
@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
            return 'GpuGemm{no_inplace}'
    def __eq__(self, other):
-        return (type(self) == type(other)\
+        return (type(self) == type(other) and
-                and self.inplace == other.inplace)
+                self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
        return (4,)
    def c_code(self, node, name, inputs, outputs, sub):
-        #z_out = alpha * dot(x,y) + beta * z_in
+        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version, set set z_out = z_in
        # not inplace version, we copy z_in to z_out.
        z_in, a, x, y, b = inputs
@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
            return 'GpuGemv{no_inplace}'
    def __eq__(self, other):
-        return (type(self) == type(other)\
+        return (type(self) == type(other) and
-                and self.inplace == other.inplace)
+                self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
        return (3,)
    def c_code(self, node, name, inputs, outputs, sub):
-        #z_out = alpha * dot(x,y) + beta * z_in
+        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version, set set z_out = z_in
        # not inplace version, we copy z_in to z_out.
        z_in, a, x, y, b = inputs
@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
            return 'GpuGer{no_inplace}'
    def __eq__(self, other):
-        return (type(self) == type(other)\
+        return (type(self) == type(other) and
-                and self.inplace == other.inplace)
+                self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
        return (2,)
    def c_code(self, node, name, inputs, outputs, sub):
-        #z_out = alpha * dot(x,y) + beta * z_in
+        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version, set set z_out = z_in
        # not inplace version, we copy z_in to z_out.
        z_in, a, x, y = inputs
@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
-        d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)(
+        d_bottom = GpuCorrMM_gradInputs(
-                weights, top, bottom.shape[-2:])
+            self.border_mode, self.subsample)(weights,
-        d_top = GpuCorrMM(self.border_mode, self.subsample)(
+                                              top,
-                bottom, weights)
+                                              bottom.shape[-2:])
-        d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
+        d_top = GpuCorrMM(
+            self.border_mode, self.subsample)(bottom, weights)
+        d_height_width = (
+            theano.gradient.DisconnectedType()(),
+            ) * 2 if len(inp) == 4 else ()
        return (d_bottom, d_top) + d_height_width
    def connection_pattern(self, node):
@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
-        d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)(
+        d_weights = GpuCorrMM_gradWeights(
+            self.border_mode, self.subsample)(
                bottom, top, weights.shape[-2:])
-        d_top = GpuCorrMM(self.border_mode, self.subsample)(
+        d_top = GpuCorrMM(
-                bottom, weights)
+            self.border_mode, self.subsample)(bottom, weights)
-        d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
+        d_height_width = (
+            theano.gradient.DisconnectedType()(),
+            ) * 2 if len(inp) == 4 else ()
        return (d_weights, d_top) + d_height_width
    def connection_pattern(self, node):
@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
-        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(
+        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
-                weights, top, bottom.shape[-3:])
+                                          self.subsample,
-        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
+                                          self.pad)(weights,
-                bottom, top, weights.shape[-3:])
+                                                    top,
+                                                    bottom.shape[-3:])
+        d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
+                                            self.subsample,
+                                            self.pad)(bottom,
+                                                      top,
+                                                      weights.shape[-3:])
        return d_bottom, d_weights
@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
-        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
+        d_weights = GpuCorr3dMM_gradWeights(
+            self.border_mode, self.subsample, self.pad)(
                bottom, top, weights.shape[-3:])
-        d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
+        d_top = GpuCorr3dMM(
+            self.border_mode, self.subsample, self.pad)(
                bottom, weights)
-        d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
+        d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
+            * 3 if len(inp) == 5 else ()
        return (d_weights, d_top) + d_height_width_depth
    def connection_pattern(self, node):
@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
        return Apply(self, [x], [x.type()])
    # def perform(self, node, input_storage, output_storage):
-        #raise NotImplementedError('only C is implemented')
+        # raise NotImplementedError('only C is implemented')
    def c_code_cache_version(self):
        return (6)

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -108,8 +108,8 @@ class NaiveAlgo(object):
            s = ", ".join(["float * o%i_data" % ipos] +
                          ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)])
            print("\t,", s, file=sio)
-            #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
+            # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
-            #print >> sio, "\t,", "float * o%i_data" % ipos
+            # print >> sio, "\t,", "float * o%i_data" % ipos
        print("\t)\n{", file=sio)
        print("    const int idx = blockIdx.x * blockDim.x + threadIdx.x;", file=sio)
        print("    const int numThreads = blockDim.x * gridDim.x;", file=sio)
@@ -129,7 +129,7 @@ class NaiveAlgo(object):
                print("        const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio)
        for ipos, i in enumerate(node.outputs):
            print("        float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio)
-        for d in xrange(nd-1, -1, -1):
+        for d in xrange(nd - 1, -1, -1):
            if d > 0:
                print("        int pos%i = ii %% dim%i;" % (d, d), file=sio)
                print("        ii = ii / dim%i;" % d, file=sio)
@@ -161,9 +161,9 @@ class NaiveAlgo(object):
            print("ii_o%i_data[0] = o%i_i;" % (ipos, ipos), file=sio)
        print("    }", file=sio)
-        #indent = " "*(4*d+7)
+        # indent = " "*(4*d+7)
        # for ipos, i in enumerate(node.inputs):
-            #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
+        # print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print("}", file=sio)
        # print sio.getvalue()
@@ -211,10 +211,11 @@ class NaiveAlgo(object):
                print("//    Input  ", ipos, str(i.type), file=sio)
            for ipos, i in enumerate(node.outputs):
                print("//    Output ", ipos, str(i.type), file=sio)
-            print("static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
+            print(
-                    self.scalar_op.__class__.__name__,
+                "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
+                (self.scalar_op.__class__.__name__,
                 nodename,
-                    'tiling%i'%nd), file=sio)
+                 'tiling%i' % nd), file=sio)
            if (nd):
                print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
            # declare inputs
@@ -225,15 +226,15 @@ class NaiveAlgo(object):
            for ipos, i in enumerate(node.outputs):
                s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
                print("\t,", s, file=sio)
-                #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
+                # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
-                #print >> sio, "\t,", "float * o%i_data" % ipos
+                # print >> sio, "\t,", "float * o%i_data" % ipos
            print("\t)\n{", file=sio)
            # For each input that is a scalar which has been broadcasted to a tensor,
            #     load it into a local variable
            print("    __shared__ float value0[%i];" % len(node.inputs), file=sio)
            print("    __shared__ int shared_dims[%(nd)s];" % locals(), file=sio)
-            #print >> sio, "    __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
+            # print >> sio, "    __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
            print("    if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
            for ipos, i in enumerate(node.inputs):
                if _logical_scalar(i):
@@ -274,15 +275,18 @@ class NaiveAlgo(object):
            # perform the scalar operation on the input and output references
            # TODO: What if the scalar_op needs support_code??
            task_code = self.scalar_op.c_code(
-                    Apply(self.scalar_op,
+                Apply(
-                        [scalar.Scalar(dtype=input.type.dtype).make_variable()
+                    self.scalar_op,
+                    [scalar.Scalar(
+                        dtype=input.type.dtype).make_variable()
                     for input in node.inputs],
-                        [scalar.Scalar(dtype=output.type.dtype).make_variable()
+                    [scalar.Scalar(
-                         for output in node.outputs])
+                        dtype=output.type.dtype).make_variable()
-                    , nodename + '_scalar_'
+                     for output in node.outputs]),
-                    , get_str_list_logical_scalar(node, value_str='value0[%i]')
+                nodename + '_scalar_',
-                    , ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
+                get_str_list_logical_scalar(node, value_str='value0[%i]'),
-                    , sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
+                ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
+                sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
            print("       ", task_code, file=sio)
            print("    }" * nd, file=sio)
@@ -290,9 +294,9 @@ class NaiveAlgo(object):
            # TODO: insert runtime stride checks that select the best loop order either here, or in
            # the host code that launched the  kernel (host code probably better spot)
-            #indent = " "*(4*d+7)
+            # indent = " "*(4*d+7)
            # for ipos, i in enumerate(node.inputs):
-                #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
+            # print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
            print("}", file=sio)
        print(sio.getvalue())
@@ -319,10 +323,11 @@ class NaiveAlgo(object):
            print("//    Input  ", ipos, str(i.type), file=sio)
        for ipos, i in enumerate(node.outputs):
            print("//    Output ", ipos, str(i.type), file=sio)
-        print("static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
+        print(
-                self.scalar_op.__class__.__name__,
+            "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
+            (self.scalar_op.__class__.__name__,
             nodename,
-                'tiling%i_less_registers'%nd), file=sio)
+             'tiling%i_less_registers' % nd), file=sio)
        if (nd):
            print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
        # declare inputs
@@ -333,8 +338,8 @@ class NaiveAlgo(object):
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print("\t,", s, file=sio)
-            #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
+            # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
-            #print >> sio, "\t,", "float * o%i_data" % ipos
+            # print >> sio, "\t,", "float * o%i_data" % ipos
        print("\t)\n{", file=sio)
        # TODO: Setting these to true makes the function fail SOMETIMES.  I don't know why yet.
@@ -350,6 +355,7 @@ class NaiveAlgo(object):
                return "s%s_str[%i][%i]" % (io, p, d)
            else:
                return "%s%i_str_%i" % (io, p, d)
        def limits(d):
            if use_shared_limits:
                return "limits[%i]" % d
@@ -417,15 +423,19 @@ class NaiveAlgo(object):
        def task_code(d):
            print(self.scalar_op.c_code(
-                Apply(self.scalar_op,
+                Apply(
+                    self.scalar_op,
                    [scalar.Scalar(dtype=input.type.dtype).make_variable()
                     for input in node.inputs],
                    [scalar.Scalar(dtype=output.type.dtype).make_variable()
-                     for output in node.outputs])
+                     for output in node.outputs]),
-                , nodename + '_scalar_'
+                nodename + '_scalar_',
-                , ['i%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.inputs)]
+                ['i%i_data_%i[0]' % (ipos, d) for ipos,
-                , ['o%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.outputs)]
+                 i in enumerate(node.inputs)],
-                , sub=dict(fail='return;')), file=sio)  # TODO: set a failure code somehow!!!
+                ['o%i_data_%i[0]' % (ipos, d) for ipos,
+                 i in enumerate(node.outputs)],
+                sub=dict(fail='return;')), file=sio)
+            # TODO: set a failure code somehow!!!
        if nd == 4:
            decl_shared_stride(n_in, n_out, nd)
@@ -495,16 +505,19 @@ class NaiveAlgo(object):
        for ipos, i in enumerate(node.outputs):
            print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
        task_code = self.scalar_op.c_code(
-                Apply(self.scalar_op,
+            Apply(
+                self.scalar_op,
                [scalar.Scalar(dtype=input.type.dtype).make_variable()
                 for input in node.inputs],
                [scalar.Scalar(dtype=output.type.dtype).make_variable()
-                     for output in node.outputs])
+                 for output in node.outputs]),
-                , nodename + '_scalar_'
+            nodename + '_scalar_',
-                #, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
+            # , ['i%i_data[i]'%ipos for ipos,
-                , get_str_list_logical_scalar(node, data_str='i%i_data[i]')
+            #     i in enumerate(node.inputs)]
-                , ['o%i_i'%ipos for ipos, i in enumerate(node.outputs)]
+            get_str_list_logical_scalar(node, data_str='i%i_data[i]'),
-                , sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
+            ['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
+            sub=dict(fail='return;'))
+        # TODO: set a failure code somehow!!!
        print("       ", task_code, file=sio)
        for ipos, _ in enumerate(node.outputs):
            print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
@@ -539,18 +552,21 @@ class NaiveAlgo(object):
        nb_outputs = len(node.outputs)
        d = dict()
        # input_params and output_params go into the function declaration/definition
-        input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
+        input_params = ", ".join(
+            "const float * i%i_data, const int * i%i_str" % (ipos, ipos)
            for ipos in xrange(len(node.inputs)))
-        output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos)
+        output_params = ", ".join(
+            "float * o%i_data, const int * o%i_str" % (ipos, ipos)
            for ipos in xrange(len(node.outputs)))
        # input_args and output_args go into the recursive call.
-        input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos)
+        input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
                               for ipos in xrange(len(node.inputs)))
-        output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos)
+        output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
                                for ipos in xrange(len(node.outputs)))
-        prod_dims = '*'.join(["dims[%i]"%di for di in xrange(nd)]+['1'])
+        prod_dims = '*'.join(
+            ["dims[%i]" % di for di in xrange(nd)] + ['1'])
        scalar_op = self.scalar_op.__class__.__name__
@@ -578,20 +594,30 @@ class NaiveAlgo(object):
            print("""
                std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s     w numEls" << numEls << " dims"<< d << "\\n";
            """ % locals(), file=sio)
-            print('std::cerr << ' + " << ' ' <<  ".join(['"  "']+list("dims[%i]"%di
+            print(
-                for di in xrange(nd)) + ["'\\n';"]), file=sio)
+                'std::cerr << ' + " << ' ' <<  ".join(
+                    ['"  "'] +
+                    list("dims[%i]" % di for di in xrange(nd)) +
+                    ["'\\n';"]),
+                file=sio)
        if self.verbose > 1:
            for ipos in xrange(len(node.inputs)):
+                istrings = [
+                    "i%s_str[%i]" % (ipos, di) for di in xrange(nd)]
+                ipositions = " << ' ' <<  ".join(
+                    ["i%s_data" % ipos] + istrings)
                print("""
-                std::cerr << "   %(ipos)s data strides" <<
+                std::cerr << "   %(ipos)s data strides" << %(ipositions)s << "\\n";
-                """ % locals() + " << ' ' <<  ".join(["i%s_data"%ipos]
+                """ % dict(ipos=ipos, ipositions=ipositions), file=sio)
-                + list("i%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''', file=sio)
            for ipos in xrange(len(node.outputs)):
                print("""
                std::cerr << "   %(ipos)s data strides" <<
-                """ % locals() + " << ' ' <<  ".join(["o%s_data"%ipos]
+                """ % locals() + " << ' ' <<  ".join(
-                    + list("o%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''', file=sio)
+                    ["o%s_data" % ipos] +
+                    list(
+                        "o%s_str[%i]" % (ipos, di) for di in xrange(nd)
+                        )) +
+                    ''' << "\\n"; ''', file=sio)
    # collapse dimension that are broadcast in all inputs.
    # need to be done before contiguous collapse as it will break it.
    # do the dimensions and the strides
@@ -636,11 +662,19 @@ class NaiveAlgo(object):
            print('std::cerr << "\\n";', file=sio)
            if nd > 0:
                for ipos in xrange(len(node.inputs)):
-                    print('std::cerr << " local_str inputs %(ipos)s: " <<'%locals() + \
+                    print(
-                        ' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio)
+                        'std::cerr << " local_str inputs %(ipos)s: " <<' % locals() +
+                        ' << " " << '.join(["local_str[%s][%s]" % (ipos, x)
+                                            for x in xrange(nd)]) +
+                        '<<"\\n";', file=sio)
                    for ipos in xrange(len(node.outputs)):
-                        print('std::cerr << " local_ostr inputs %(ipos)s: " <<'%locals() + \
+                        print(
-                        ' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio)
+                            'std::cerr << " local_ostr inputs %(ipos)s: " <<' %
+                            locals() +
+                            ' << " " << '.join(
+                                ["local_ostr[%s][%s]" %
+                                 (ipos, x) for x in xrange(nd)]) +
+                            '<<"\\n";', file=sio)
        print("""
        for(int id=0;id<nd_collapse;id++){
@@ -668,35 +702,51 @@ class NaiveAlgo(object):
            nd_collapse--; id--;
          }
        }
-        """%locals(), file=sio)
+        """ % locals(), file=sio)
        if self.verbose > 2:
            print('std::cerr <<"after broadcast collapse\\n";', file=sio)
            print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
            print('std::cerr << "local_dims";', file=sio)
            for d in xrange(nd):
-                print('std::cerr << " " << local_dims[%(d)s]; '%locals(), file=sio)
+                print('std::cerr << " " << local_dims[%(d)s]; ' %
+                      locals(), file=sio)
            print('std::cerr << "\\n";', file=sio)
            if nd > 0:
                for ipos in xrange(len(node.inputs)):
-                    print('std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio)
+                    print('std::cerr << " local_str %(ipos)s: " <<' %
+                          locals() + ' << " " << '.join(
+                              ["local_str[%s][%s]" %
+                               (ipos, x) for x in xrange(nd)]) +
+                          '<<"\\n";', file=sio)
                    for ipos in xrange(len(node.outputs)):
-                        print('std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio)
+                        print(
+                            'std::cerr << " local_ostr %(ipos)s: " <<' %
+                            locals() + ' << " " << '.join(
+                                ["local_ostr[%s][%s]" %
+                                 (ipos, x) for x in xrange(nd)]) +
+                            '<<"\\n";', file=sio)
    # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
    # this is a good idea because we make less index calculation in the gpu.
        if nd > 0:
-            print("int nd_collapse_[%(nd)s] = {"%locals() + ','.join(['1' for x in xrange(nd)]) + "};", file=sio)
+            print("int nd_collapse_[%(nd)s] = {" %
+                  locals() + ','.join(
+                      ['1' for x in xrange(nd)]) + "};", file=sio)
        else:
            print("int *nd_collapse_ = NULL;", file=sio)
        for ipos in xrange(len(node.inputs)):
            if not _logical_scalar(node.inputs[ipos]):
                if nd > 0:
                    print("""
-                        int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() + ','.join(['1' for x in xrange(nd)]) + "};", file=sio)
+                          int nd_collapse_%(ipos)s[%(nd)s] = {""" %
+                          locals() +
+                          ','.join(['1' for x in xrange(nd)]) +
+                          "};", file=sio)
                else:
                    print("""
-                        int *nd_collapse_%(ipos)s = NULL;"""%locals(), file=sio)
+                          int * nd_collapse_%(ipos)s = NULL;""" %
+                          locals(), file=sio)
                print("""
 can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
 for(int i=0;i<nd_collapse;i++){
@@ -707,8 +757,10 @@ nd_collapse_[i]=0;
                if self.verbose > 1:
                    print("""
                    std::cerr<< "nd_collapse_%(ipos)s "<<
-                    """%locals(), file=sio)
+                    """ % locals(), file=sio)
-                    print(' << " " << '.join(["nd_collapse_%s[" % ipos + str(i)+"]" for i in xrange(nd)]), file=sio)
+                    print(' << " " << '.join(["nd_collapse_%s[" %
+                          ipos + str(i) + "]" for i in xrange(nd)]),
+                          file=sio)
                    print('<< "\\n";', file=sio)
    # update the local stride.
@@ -721,7 +773,7 @@ nd_collapse_[i]=0;
                  local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
                }
            }
-            """%locals(), file=sio)
+            """ % locals(), file=sio)
        for ipos in xrange(len(node.outputs)):
            print("""
@@ -732,7 +784,7 @@ nd_collapse_[i]=0;
                  local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
                }
            }
-            """%locals(), file=sio)
+            """ % locals(), file=sio)
    # update the local dims.
        print("""
@@ -743,16 +795,20 @@ nd_collapse_[i]=0;
              local_dims[j-1]=local_dims[j];
          }
        }
-        """%locals(), file=sio)
+        """ % locals(), file=sio)
    # update the new number of dim
        print("""
        for(int i=1, end=nd_collapse;i<end;i++){
          if(nd_collapse_[i]==1)nd_collapse--;
        }
-        if(nd_collapse == 1 """%locals(), file=sio)
+        if(nd_collapse == 1 """ % locals(), file=sio)
-        l = ["local_str[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.inputs)) if not _logical_scalar(node.inputs[ipos])]
+        l = ["local_str[%s][nd_collapse-1]==1 " %
-        l += ["local_ostr[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.outputs)) if not _logical_scalar(node.outputs[ipos])]
+             ipos for ipos in xrange(len(node.inputs)) if not
+             _logical_scalar(node.inputs[ipos])]
+        l += ["local_ostr[%s][nd_collapse-1]==1 " %
+              ipos for ipos in xrange(len(node.outputs)) if not
+              _logical_scalar(node.outputs[ipos])]
        if len(l) > 0:
            print(" && ", " && ".join(l), file=sio)
        print("""){nd_collapse=0;} """, file=sio)
@@ -762,20 +818,31 @@ nd_collapse_[i]=0;
            print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio)
        if self.verbose > 1:
            for d in xrange(nd):
-                print('std::cerr << " " << local_dims[%(d)s]; '%locals(), file=sio)
+                print('std::cerr << " " << local_dims[%(d)s]; ' %
+                      locals(),
+                      file=sio)
            print('std::cerr << "\\n";', file=sio)
            if nd > 0:
                for ipos in xrange(len(node.inputs)):
-                    print('std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio)
+                    print(
+                        'std::cerr << " local_str %(ipos)s: " <<' %
+                        locals() + ' << " " << '.join(
+                            ["local_str[%s][%s]" %
+                             (ipos, x) for x in xrange(nd)]) +
+                        '<<"\\n";', file=sio)
                    for ipos in xrange(len(node.outputs)):
-                        print('std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio)
+                        print('std::cerr << " local_ostr %(ipos)s: " <<' %
+                              locals() + ' << " " << '.join(
+                                  ["local_ostr[%s][%s]" %
+                                   (ipos, x) for x in xrange(nd)]) +
+                              '<<"\\n";', file=sio)
        def launch_Ccontiguous(nodename, scalar_op, sync=True):
            kernel_call_args = ["numEls"]
            for ipos in xrange(len(node.inputs)):
-                kernel_call_args.append("i%i_data"%ipos)
+                kernel_call_args.append("i%i_data" % ipos)
            for ipos in xrange(len(node.outputs)):
-                kernel_call_args.append("o%i_data"%ipos)
+                kernel_call_args.append("o%i_data" % ipos)
            kernel_call_args = ", ".join(kernel_call_args)
            verb = ""
            if self.verbose:
@@ -817,20 +884,27 @@ nd_collapse_[i]=0;
            # kernel_call_args are used to invoke the cuda kernel
            local = "local_"
            kernel_call_args = ["numEls"]
-            kernel_call_args.extend(local+"dims[%i]"%di for di in xrange(force_nd))
+            kernel_call_args.extend(
+                local + "dims[%i]" %
+                di for di in xrange(force_nd))
            for ipos in xrange(len(node.inputs)):
-                kernel_call_args += ["i%i_data"%ipos] + list(local+"str[%i][%i]"%(ipos, di) for di in xrange(force_nd))
+                kernel_call_args += ["i%i_data" % ipos] + list(
-                #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
+                    local + "str[%i][%i]" %
-                #kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
+                    (ipos, di) for di in xrange(force_nd))
+                # strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
+                # kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
            for ipos in xrange(len(node.outputs)):
-                kernel_call_args += ["o%i_data"%ipos] + list(local+"ostr[%i][%i]"%(ipos, di) for di in xrange(force_nd))
+                kernel_call_args += ["o%i_data" % ipos] + list(
-                #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
+                    local + "ostr[%i][%i]" %
-                #kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
+                    (ipos, di) for di in xrange(force_nd))
+                # strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
+                # kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
            if self.verbose:
                print("""
                    std::cerr << "   Running general version with %(force_nd)s  dims\\n";
-                    """%locals(), file=sio)
+                    """ % locals(), file=sio)
-                print("std::cerr << " + ' << " " << '.join(kernel_call_args)+' << "\\n";', file=sio)
+                print("std::cerr << " + ' << " " << '.join(
+                    kernel_call_args) + ' << "\\n";', file=sio)
                # std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;
            kernel_call_args = ", ".join(kernel_call_args)
@@ -866,12 +940,13 @@ nd_collapse_[i]=0;
            else:
                print(" return 0; " % locals(), file=sio)
        print("if(numEls==0) return 0;", file=sio)
-        print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals(), file=sio)
+        print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" %
+              locals(), file=sio)
        print("case 0: {", file=sio)
        launch_Ccontiguous(nodename, scalar_op, self.sync)
        print("        } break;", file=sio)
-        for i in xrange(1, nd+1):
+        for i in xrange(1, nd + 1):
-            print("case "+str(i)+": {", file=sio)
+            print("case " + str(i) + ": {", file=sio)
            launch_General(nodename, scalar_op, i, self.sync)
            print("        } break;", file=sio)
@@ -889,9 +964,10 @@ nd_collapse_[i]=0;
 #define INTMOD_POW2(a, b) (a & ((1<<b)-1))
        """
        kernels = "".join(
-            [self.c_src_kernel(node, nodename, x) for x in xrange(1, nd + 1)]
+            [self.c_src_kernel(node, nodename, x)
-            + [self.c_src_kernel_Ccontiguous(node, nodename)]
+             for x in xrange(1, nd + 1)] +
-            + [self.c_src_callkernel(node, nodename)])
+            [self.c_src_kernel_Ccontiguous(node, nodename)] +
+            [self.c_src_callkernel(node, nodename)])
        return defines + kernels
    def c_support_code(self):

--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -5,9 +5,9 @@ import numpy as np
 import theano
 import theano.tensor as T
+from theano.misc.pycuda_init import pycuda_available
 from theano.sandbox.cuda import cuda_available, GpuOp
 from theano.ifelse import ifelse
-from theano.misc.pycuda_init import pycuda_available
 if cuda_available:
    from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
    # special way because we specify explicitly here
    # how much values are expected.
    if border_mode == 'valid':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
+                             (f1 - 1):(f1 - 1 + i1 - f1 + 1)]
    elif border_mode == 'full':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
+                             (f1 - 1):(f1 - 1 + i1 + f1 - 1)]
    else:
        raise ValueError('invalid mode')
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
                                   input_shape=input_fft_v_shape,
                                   filter_shape=filters_fft_v_shape)
-    #output_fft_s = input_fft_v
+    # output_fft_s = input_fft_v
    # reshape for IFFT
    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    # special way because we specify explicitly here
    # how much values are expected.
    if border_mode == 'valid':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
+                             (f1 - 1):(f1 - 1 + i1 - f1 + 1),
+                             (f2 - 1):(f2 - 1 + i2 - f2 + 1)]
    elif border_mode == 'full':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
+                             (f1 - 1):(f1 - 1 + i1 + f1 - 1),
+                             (f2 - 1):(f2 - 1 + i2 + f2 - 1)]
    else:
        raise ValueError('invalid mode')
-    #output = output_circ[:, :, :, :, :]
+    # output = output_circ[:, :, :, :, :]
    # Rescale manually. This is just a factor that comes in during the
    # trip through FFT and inverse FFT.

--- a/theano/sandbox/cuda/kernel_codegen.py
+++ b/theano/sandbox/cuda/kernel_codegen.py
@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
    We use __i as an int variable in a loop.
    """
-    return [
+    return [  # get max of buf (trashing all but buf[0])
-            # get max of buf (trashing all but buf[0])
        inline_reduce_max(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
-            'for(int __i=' + threadPos + '; __i<' + N +
+        'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
-                  '; __i+=' + threadCount + '){',
+        threadCount + '){',
        buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
-                buf2 + '[__i] = ' + buf + '[__i]',
+        buf2 + '[__i] = ' + buf + '[__i]', '}',
-            '}',
        '__syncthreads()',
        inline_reduce_sum(N, buf, threadPos, threadCount),
        '__syncthreads()',
@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
        # divide each exp() result by the sum to complete the job.
        'for(int __i=' + threadPos + '; __i<' + N +
        '; __i+=' + threadCount + '){',
-                buf + '[__i] = ' + buf2 + '[__i] / row_sum',
+        buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
-            '}',
        '__syncthreads()',
        ]
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
                                                 locals()))
-    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
+    loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
-                          "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))

--- a/theano/sandbox/cuda/neighbours.py
+++ b/theano/sandbox/cuda/neighbours.py
 from __future__ import absolute_import, print_function, division
 # This is work in progress
-from theano import Op, Apply, tensor
+from theano import Apply, tensor
 from theano.gof import local_optimizer
 from theano.sandbox.cuda import cuda_available, GpuOp

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
        """ % locals()
    def c_support_code_apply(self, node, nodename):
-        ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
+        ret1 = nvcc_kernel(
+            "kSoftmax_%s" % nodename,
            params=['int M', 'int N',
-                        'const float * x', 'const int sx0', 'const int sx1',
+                    'const float * x',
-                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                    'const int sx0',
-                body=[
+                    'const int sx1',
-                    "extern __shared__ float buf[]",
+                    'float * sm',
+                    'const int sm_s0',
+                    'const int sm_s1'],
+            body=["extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
-                        "buf2[tx] = buf[tx]",
+                  "buf2[tx] = buf[tx]", "}", "__syncthreads()",
-                      "}",
+                  inline_softmax('N',
-                      "__syncthreads()",
+                                 'buf',
-                      inline_softmax('N', 'buf', 'buf2',
+                                 'buf2',
-                                     'threadIdx.x', 'blockDim.x'),
+                                 'threadIdx.x',
+                                 'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  # This set all value correctly
-                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
+                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
-                      "}",
+                  "__syncthreads()", "}", ])
-                      "__syncthreads()",
+        ret2 = nvcc_kernel(
-                    "}",
+            "kSoftmax_fixed_shared%s" % nodename,
-                ])
-        ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                body=[
+            body=["extern __shared__ float buf[]",
-                    "extern __shared__ float buf[]",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "const float *x_ptr = &x[blockIDX * sx0]",
                  "float *sm_ptr = &sm[blockIDX * sm_s0]",
                  inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                              'sm_ptr', 'sm_s1',
-                                                  'threadIdx.x', 'blockDim.x'),
+                                              'threadIdx.x',
-                      "__syncthreads()",
+                                              'blockDim.x'),
-                    "}",
+                  "__syncthreads()", "}", ])
-                    ])
        return ret1 + "\n" + ret2
 gpu_softmax = GpuSoftmax()
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
                    'const float * x', 'const int sx0', 'const int sx1',
                    'const float * b', 'const int sb0',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-            body=[
+            body=["extern __shared__ float buf[]",
-                    "extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                  "buf[tx] += b[tx * sb0]",
-                         "buf2[tx] = buf[tx]",
+                  "buf2[tx] = buf[tx]", "}",
-                      "}",
+                  "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
-                       "__syncthreads()",
+                                                    'threadIdx.x',
-                       inline_softmax('N', 'buf', 'buf2',
+                                                    'blockDim.x'),
-                                      'threadIdx.x', 'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                         "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
+                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
-                      "}",
+                  "__syncthreads()", "}", ])
-                      "__syncthreads()",
-                    "}",
-            ])
        ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                           params=['int M', 'int N',
                                   'const float * x',
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
                               "float *sm_ptr = &sm[blockIDX * sm_s0]",
                               inline_softmax_fixed_shared('N', 'buf',
                                                           'x_ptr', 'sx1',
-                                                           'sm_ptr', 'sm_s1',
+                                                           'sm_ptr',
+                                                           'sm_s1',
                                                           'threadIdx.x',
                                                           'blockDim.x',
                                                           'b', 'sb0'),

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -4,7 +4,6 @@ import logging
 import os
 import subprocess
 import sys
-import warnings
 from locale import getpreferredencoding
 import numpy
@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
            _logger.debug('Writing module C++ code to %s', cppfilename)
            cppfile.write(src_code)
-        lib_filename = os.path.join(location, '%s.%s' %
+        lib_filename = os.path.join(
+            location, '%s.%s' %
            (module_name, get_lib_extension()))
        _logger.debug('Generating shared lib %s', lib_filename)
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
                indexof = cmd.index('-u')
                cmd.pop(indexof)  # Remove -u
                cmd.pop(indexof)  # Remove argument to -u
-            except ValueError as e:
+            except ValueError:
                done = True
        # CUDA Toolkit v4.1 Known Issues:
@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
            console_encoding = getpreferredencoding()
            nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
            nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
+            p = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        finally:
            os.chdir(orig_dir)

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -10,22 +10,32 @@ import warnings
 import numpy
 from six.moves import reduce, xrange
+from . import dnn
 import theano
 from theano import scalar as scal
 from theano import config, tensor, gof
 import theano.ifelse
+import theano.tensor.signal.pool
+import theano.tensor.nnet
+import theano.tensor.nnet.neighbours
+# Convolution
+from theano.tensor.nnet import conv
+from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
+from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
+# Pooling
+import theano.tensor.signal.pool as pool
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
                        Optimizer, TopoOptimizer, toolbox)
 from theano.gof.opt import LocalMetaOptimizer
+from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
 from theano.sandbox.cuda import as_cuda_ndarray_variable
 from theano.sandbox.cuda.basic_ops import (
    gpu_eye, gpu_contiguous,
    gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
    GpuContiguous,
    GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce,
-    GpuFlatten, gpu_flatten,
+    gpu_flatten,
    GpuSubtensor, GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
    GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
 # This is a partial list of CPU ops that can be in some circonstance
 # moved to the GPU. This list is used by an optimization.
 # Hopefully, we can keep this list up to date.
-import theano.tensor.signal.pool
-import theano.tensor.nnet.neighbours
 cpu_ops_moved_to_gpu = [
    tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
    tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
            if x.type == node.outputs[0].type:
                return [x]
            elif (all([c != "output" and isinstance(c.op, GpuFromHost)
-                      for c, i in node.outputs[0].clients])
+                      for c, i in node.outputs[0].clients]) and
-                  and x.owner and x.owner.op.__class__ in
+                  x.owner and x.owner.op.__class__ in
                  cpu_ops_moved_to_gpu):
                # It is not always good to transfer the reduction to
                # the GPU when the clients are on the GPU but not the
@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
            return [gpu_flatten(host_input.owner.inputs[0], outdim)(
                as_cuda_ndarray_variable(host_input.owner.inputs[0]))]
    if isinstance(node.op, tensor.Flatten):
-        x, = node.inputs
+        x, = node.inputs  # Flatten nodes carry exactly one input
+        outdim = node.op.outdim
        if x.owner and isinstance(x.owner.op, HostFromGpu):
            outdim = node.op.outdim
            gpu_x, = x.owner.inputs
@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
                                                *coords)]
    if isinstance(node.op, tensor.Subtensor):
        x = node.inputs[0]
-        if (x.owner and
+        if (x.owner and x.dtype == "float32" and
-            isinstance(x.owner.op, HostFromGpu) and
+                isinstance(x.owner.op, HostFromGpu)):
-            x.dtype == "float32"):
            gpu_x = x.owner.inputs[0]
-            if (gpu_x.owner and
-                isinstance(gpu_x.owner.op, GpuFromHost) and
-                # And it is a shared var or an input of the graph.
-                not gpu_x.owner.inputs[0].owner):
+            if (gpu_x.owner and isinstance(gpu_x.owner.op, GpuFromHost) and
+                    # And it is a shared var or an input of the graph.
+                    not gpu_x.owner.inputs[0].owner):
                if len(x.clients) == 1:
                    if any([n == 'output' or isinstance(n.op, GpuOp)
@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
                    'least \'0.6\'.', stacklevel=1)
            active_device_no = theano.sandbox.cuda.active_device_number()
            compute_capability = device_properties(active_device_no)['major']
-            if (compute_capability < 2 or
+            if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
-                x.ndim != 2 or
-                y.ndim != 2):
                gpu_op = GpuAdvancedIncSubtensor1(
                    set_instead_of_inc=set_instead_of_inc)
@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
            active_device_no = theano.sandbox.cuda.active_device_number()
            compute_capability = device_properties(active_device_no)['major']
-            if (compute_capability < 2 or
+            if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
-                x.ndim != 2 or
-                y.ndim != 2):
                gpu_op = GpuAdvancedIncSubtensor1(
                    set_instead_of_inc=set_instead_of_inc)
            else:
@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
    # Incrementing a float32 x results in a float32
    # output even if y is float64, so we can downcast
    # y to put it on GPU
-    elif type(node.op) == tensor.IncSubtensor and \
+    elif (type(node.op) == tensor.IncSubtensor and
-       node.inputs[0].dtype == "float32":
+          node.inputs[0].dtype == "float32"):
        x, y = node.inputs[0:2]
        assert isinstance(x.type, tensor.TensorType)
        assert isinstance(y.type, tensor.TensorType)
@@ -1346,8 +1349,6 @@ def cast(x, dtype):
    cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
    return cast_op(x)
-import theano.tensor.nnet
 @register_opt()
 @local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
    return False
-# Convolution
-from theano.tensor.nnet import conv
 def _gpu_conv_to_fftconv(node):
    # shared helper function for local_conv_fft_valid and local_conv_fft_full.
    # we import conv2d_fft locally to avoid pycuda warnings
    from theano.sandbox.cuda.fftconv import conv2d_fft
    kwargs = {'border_mode': node.op.border_mode}
-    if (node.op.imshp is not None and
-        node.op.imshp[-1] is not None and
-        node.op.imshp[-1] % 2 == 1):
+    if (node.op.imshp is not None and node.op.imshp[-1] is not None and
+            node.op.imshp[-1] % 2 == 1):  # None guard must precede the modulo
        kwargs['pad_last_dim'] = True
    # If the user supplied the full nonsymbolic image_shape and
@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
 @local_optimizer([GpuConv])
 def local_conv_fft_valid(node):
    if isinstance(node.op, GpuConv):
-        if (node.op.border_mode == 'valid' and
+        if (node.op.border_mode == 'valid' and node.op.fft_opt and
-            node.op.subsample == (1, 1) and
+                node.op.subsample == (1, 1)):
-            node.op.fft_opt):
            return [_gpu_conv_to_fftconv(node)]
        return False
@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
 @local_optimizer([GpuConv])
 def local_conv_fft_full(node):
    if isinstance(node.op, GpuConv):
-        if (node.op.border_mode == 'full' and
+        if (node.op.border_mode == 'full' and node.op.fft_opt and
-            node.op.subsample == (1, 1) and
+                node.op.subsample == (1, 1)):
-            node.op.fft_opt):
            return [_gpu_conv_to_fftconv(node)]
        return
@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
                       'conv_fft')
 # cuDNN is the second, but only registered if cuDNN is available.
 # It can be disabled by excluding 'conv_dnn' or 'cudnn'.
-from . import dnn
 # We can't check at import if dnn is available, so we must always
 # register it. This do not cause problem as if it is not avail, the
 # opt will do nothing.
@@ -1708,8 +1701,7 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
        shapes = ((node.op.bsize,) + node.op.imshp,
                  (node.op.nkern, nchannels) + node.op.kshp)
        for (var, shape) in zip(vars, shapes):
-            if ((var in inputs) and
+            if ((var in inputs) and (shape is not None) and
-                (shape is not None) and
                    not any(s is None for s in shape)):
                result[var] = theano.shared(
@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
 gpu_optimizer.register("conv3d_fft", local_conv3d_fft)
-from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
 @local_optimizer([ConvGrad3D])
 def local_convgrad3d_fft(node):
@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
 gpu_optimizer.register("convgrad3d_fft", local_convgrad3d_fft)
-from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
 @local_optimizer([ConvTransp3D])
 def local_convtransp3d_fft(node):
@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
 gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
-# Pooling
-import theano.tensor.signal.pool as pool
 @register_opt()
 @local_optimizer([pool.Pool])
 def local_gpu_downsample_factor_max(node):
-    if (isinstance(node.op, pool.Pool)
+    if (isinstance(node.op, pool.Pool) and
-        and node.op.ds == node.op.st):
+            node.op.ds == node.op.st):
        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                     'mode')
@@ -1917,9 +1901,7 @@ def local_gpu_downsample_factor_max(node):
 @register_opt()
 @local_optimizer([pool.MaxPoolGrad])
 def local_gpu_downsample_factor_max_grad(node):
-    if (isinstance(node.op, pool.MaxPoolGrad) and
+    if (isinstance(node.op, pool.MaxPoolGrad) and node.op.ds == node.op.st):
-        node.op.ds == node.op.st):
        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                     'mode')
        if (node.op.padding != (0, 0) or
@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
                                     as_cuda_ndarray_variable(gx)))]
-from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
 @register_opt()
 @local_optimizer([tensor.Join])
 def local_gpu_join(node):
@@ -2310,6 +2289,7 @@ def local_gpu_eye(node):
        if (host_input.owner and
                isinstance(host_input.owner.op, tensor.Eye) and
                host_input.owner.op.dtype == "float32"):
            if tensor.extract_constant(host_input.owner.inputs[2]) != 0:
                return
            return [gpu_eye(*host_input.owner.inputs)]
@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
            return _outputs
    # scan(host_from_gpu) -> host_from_gpu(GPUscan)
-    if (type(node.op) == scan_op.Scan
+    if (type(node.op) == scan_op.Scan and
-        and not node.op.info['gpu']):
+            not node.op.info['gpu']):
        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                for i in node.inputs]):

--- a/theano/sandbox/cuda/rng_curand.py
+++ b/theano/sandbox/cuda/rng_curand.py
-"""
-Define CURAND_RandomStreams - backed by CURAND.
-"""
 from __future__ import absolute_import, print_function, division
-__authors__ = "James Bergstra"
-__copyright__ = "(c) 2011, University of Montreal"
-__license__ = "3-clause BSD License"
-__contact__ = "theano-dev@googlegroups.com"
 import numpy
 import theano.gof
 from theano.compat import PY3
@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
 from theano.compile import optdb
 from theano.gof import local_optimizer, Variable
+__authors__ = "James Bergstra"
+__copyright__ = "(c) 2011, University of Montreal"
+__license__ = "3-clause BSD License"
+__contact__ = "theano-dev@googlegroups.com"
+"""
+Define CURAND_RandomStreams - backed by CURAND.
+"""
 config = theano.config
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
        Return a tuple of attributes that define the Op.
        """
-        return (
+        return (self.destructive,
-                self.destructive,
                self.output_type,
                self.seed,
                )
@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
        v_size = theano.tensor.as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
-        self = cls(
+        self = cls(output_type=CudaNdarrayType((False,) * ndim),
-                output_type=CudaNdarrayType((False,) * ndim),
                   seed=seed,
                   destructive=False)
@@ -386,5 +383,5 @@ def local_destructive(node):
        return new_op.make_node(*node.inputs).outputs
    return False
 optdb.register('CURAND_destructive',
-        opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run',
+               opt.in2out(local_destructive, ignore_newtrees=True),
-                   'inplace')
+               99, 'fast_run', 'inplace')
--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -9,19 +9,20 @@ import numpy
 from six.moves import xrange
 import theano
 import theano.tensor as T
-# Skip test if cuda_ndarray is not available.
-from nose.plugins.skip import SkipTest
-from nose.tools import assert_raises
-import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
-    raise SkipTest('Optional package cuda disabled')
 import theano.sandbox.cuda as tcn
 import theano.sandbox.cuda as cuda
 import theano.sandbox.cuda.basic_ops as B
 from theano.tensor.basic import _allclose
 from theano.tests import unittest_tools as utt
+import theano.tensor.tests.test_basic
+import theano.tensor.tests.test_subtensor
+import theano.tensor.tests.test_sharedvar
+# Skip test if cuda_ndarray is not available.
+from nose.plugins.skip import SkipTest
+import theano.sandbox.cuda as cuda_ndarray
+if cuda_ndarray.cuda_available is False:
+    raise SkipTest('Optional package cuda disabled')
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
@@ -75,8 +76,8 @@ def test_careduce():
            # The following 2 cases could work if the scalar_op.c_code work with float* dtype.
            # Currently we have this error:
            # error: invalid operands of types 'npy_float32' and 'npy_float32' to binary 'operator&'
-            #(theano.scalar.and_, tensor.elemwise.CAReduce),
+            # (theano.scalar.and_, tensor.elemwise.CAReduce),
-            #(theano.scalar.or_, tensor.elemwise.CAReduce),
+            # (theano.scalar.or_, tensor.elemwise.CAReduce),
    ]:
        for shape, pattern in [((1, 1), (1,)),
                               ((1, 0), (1,)),
@@ -113,7 +114,7 @@ def test_careduce():
                               ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]),  # 001
                               ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]),  # 110
                               ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]),  # 011
-                               ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),
+                               ((4100, 4, 3), [0, 2]), ((5, 4100, 3), [0, 2]), ((5, 4, 4100), [0, 2]),
                               ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]),  # 111
                               ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]),  # 111
@@ -127,15 +128,15 @@ def test_careduce():
                               ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]),  # 0011
                               ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]),  # 0101
                               ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]),  # 0110
-                               ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001
+                               ((4100, 4, 3, 2), [0, 3]), ((4, 4100, 3, 2), [0, 3]), ((4, 3, 4100, 2), [0, 3]), ((4, 3, 2, 4100), [0, 3]),  # 1001
-#                               ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
+                               # ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
                               ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]),  # 1100
                               # reduce over 3d
                               # 3d not tested: 1101, 1110, 1111
-                               ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101
+                               ((4100, 4, 3, 2), [0, 1, 3]), ((4, 4100, 3, 2), [0, 1, 3]), ((4, 3, 4100, 2), [0, 1, 3]), ((4, 3, 2, 4100), [0, 1, 3]),  # 1101
                               ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]),  # 1110
-                               ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]),  # ((4,3,2,4100),[0,2,3]),#1011
+                               ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]),  # ((4, 3, 2, 4100), [0, 2, 3]),  # 1011
                               ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]),  # 0111
                               ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]),  # 0111
@@ -148,25 +149,25 @@ def test_careduce():
                               ]:
            op = careduce_op(scalar_op, axis=pattern)
-            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
+            tensor_pattern_to_gpu_pattern(shape, pattern)
            a = tensor.TensorType('float32', (False,) * len(shape))()
-            b = op(a*a)
+            b = op(a * a)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
    #        val = numpy.ones(shape)
    #        val = numpy.arange(numpy.prod(shape)).reshape(shape)
            val = theano._asarray(val, dtype='float32')
            f = theano.function([a], b, mode=mode_with_gpu)
            f2 = theano.function([a], b, mode=mode_without_gpu)
-            assert tcn.GpuCAReduce in [x.op.__class__
+            assert tcn.GpuCAReduce in [
-                                       for x in f.maker.fgraph.toposort()], (
+                x.op.__class__ for x in f.maker.fgraph.toposort()], (
                scalar_op, shape, pattern)
-            if tcn.GpuElemwise in [x.op.__class__
+            if(tcn.GpuElemwise in [
-                                   for x in f.maker.fgraph.toposort()]:
+                    x.op.__class__ for x in f.maker.fgraph.toposort()]):
-                assert tcn.GpuReshape in [x.op.__class__
+                assert tcn.GpuReshape in [
-                                          for x in f.maker.fgraph.toposort()]
+                    x.op.__class__ for x in f.maker.fgraph.toposort()]
-            assert op.__class__ in [x.op.__class__
+            assert op.__class__ in [
-                                    for x in f2.maker.fgraph.toposort()], (
+                x.op.__class__ for x in f2.maker.fgraph.toposort()], (
                scalar_op, shape, pattern)
            f_caused_value_error = False
            try:
@@ -176,7 +177,8 @@ def test_careduce():
                f_caused_value_error = True
            except NotImplementedError:
                if (numpy.prod(shape) == 0 and
-                    getattr(scalar_op, 'identity', None) != 0):
+                    getattr(
+                        scalar_op, 'identity', None) != 0):
                            continue
                raise
@@ -208,9 +210,11 @@ def test_careduce():
                # example in debug mode with unittests.rseed=9275
                orig_rtol = theano.tensor.basic.float32_rtol
                theano.tensor.basic.float32_rtol = 2e-5
-                assert _allclose(f_out, f2_out), ('shape', shape,
+                assert _allclose(f_out, f2_out), (
-                                                    'pattern', pattern,
+                    'shape',
-                                                    scalar_op,
+                    shape,
+                    'pattern',
+                    pattern, scalar_op,
                    sum([shape[i] for i in pattern]),
                    f2(val), f(val), val)
            finally:
@@ -218,34 +222,36 @@ def test_careduce():
            # test with dimshuffle
            # we shuffle the 2 outer dims.
-        for shape, pattern in [  # ((5,),[0]),
+        # for shape, pattern in [((5,), [0]),
-                               ((5, 4), [0, 1]), ((5, 4), [0]),
+        for shape, pattern in [((5, 4), [0, 1]), ((5, 4), [0]),
-                               ((5, 4, 3), [0]), ((5, 4, 3), [0, 1]), ((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]),
+                               ((5, 4, 3), [0]), ((5, 4, 3), [0, 1]),
-                               ((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]),
+                               ((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]),
-                               ((128, 1, 3, 3), [0, 1, 2, 3]),
+                               ((5, 4, 3, 2), [0, 1, 2, 3]),
-        ]:
+                               ((5, 4, 3, 2), [0, 2, 3]),
+                               ((128, 1, 3, 3), [0, 1, 2, 3]), ]:
            op = careduce_op(scalar_op, axis=pattern)
-            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
+            tensor_pattern_to_gpu_pattern(shape, pattern)
            a = tensor.TensorType('float32', (False,) * len(shape))()
            dim_pattern = list(range(len(shape)))
            dim_pattern[0] = 1
            dim_pattern[1] = 0
            a = a.dimshuffle(dim_pattern)
-            b = op(a*a)
+            b = op(a * a)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
    #        val = numpy.ones(shape)
    #        val = numpy.arange(numpy.prod(shape)).reshape(shape)
            val = theano._asarray(val, dtype='float32')
            f = theano.function([a], b, mode=mode_with_gpu)
            f2 = theano.function([a], b, mode=mode_without_gpu)
-            assert tcn.GpuCAReduce in [x.op.__class__
+            assert tcn.GpuCAReduce in [
-                                       for x in f.maker.fgraph.toposort()], (
+                x.op.__class__ for x in f.maker.fgraph.toposort()], (
                scalar_op, shape, pattern)
-            assert tcn.GpuElemwise not in [x.op.__class__
+            assert tcn.GpuElemwise not in [
-                                           for x in f.maker.fgraph.toposort()]
+                x.op.__class__ for x in f.maker.fgraph.toposort()]
-            assert op.__class__ in [x.op.__class__
+            assert op.__class__ in [
-                                    for x in f2.maker.fgraph.toposort()], (
+                x.op.__class__ for x in f2.maker.fgraph.toposort()], (
                scalar_op, shape, pattern)
            assert _allclose(f2(val), f(val)), ('shape', shape,
                                                'pattern', pattern,
@@ -258,16 +264,15 @@ def test_careduce():
                               ((5, 4, 3), [0]), ((5, 4, 3), [0, 1]),
                               ((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]),
                               ((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]),
-                               ((128, 1, 3, 3), [0, 1, 2, 3]),
+                               ((128, 1, 3, 3), [0, 1, 2, 3]), ]:
-        ]:
            op = careduce_op(scalar_op, axis=pattern)
-            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
+            tensor_pattern_to_gpu_pattern(shape, pattern)
            shape = numpy.asarray(shape) * 2
            a = tensor.TensorType('float32', (False,) * len(shape))()
            a2 = tcn.CudaNdarrayType((False,) * len(shape))()
-            b = op(a*a)
+            b = op(a * a)
-            b2 = op(a2*a2)
+            b2 = op(a2 * a2)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
    #        val = numpy.ones(shape)
    #        val = numpy.arange(numpy.prod(shape)).reshape(shape)
@@ -287,8 +292,8 @@ def test_careduce():
                val2 = val2[::2, ::2, ::2, ::2]
            f = theano.function([a], b, mode=mode_without_gpu)
            f2 = theano.function([a2], b2, mode=mode_with_gpu)
-            assert tcn.GpuCAReduce in [x.op.__class__
+            assert tcn.GpuCAReduce in [
-                                       for x in f2.maker.fgraph.toposort()], (
+                x.op.__class__ for x in f2.maker.fgraph.toposort()], (
                scalar_op, shape, pattern)
            assert tcn.GpuElemwise not in [x.op.__class__
                                           for x in f.maker.fgraph.toposort()]
@@ -374,8 +379,10 @@ def test_reshape():
    # Test zero dimensions are allowed
    x = T.vector('x')
-    f_reshp = theano.function([x], x.reshape((0,100)), mode=mode_with_gpu)
+    f_reshp = theano.function(
-    assert f_reshp(numpy.ndarray((0,), dtype='float32')).shape == (0,100)
+        [x], x.reshape((0, 100)), mode=mode_with_gpu)
+    assert f_reshp(
+        numpy.ndarray((0, ), dtype='float32')).shape == (0, 100)
 def test_alloc_empty():
@@ -406,7 +413,7 @@ def test_elemwise_empty():
    b = tensor.fmatrix()
    f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
-    f2 = pfunc([b], [], updates=[(a, a + b)], mode=mode_without_gpu)
+    pfunc([b], [], updates=[(a, a + b)], mode=mode_without_gpu)
    a0 = a.get_value() * 1.0
    f(numpy.ones((0, 0), dtype='float32'))
@@ -424,8 +431,9 @@ def test_elemwise0():
    f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
    # check that we work inplace.
-    assert (list(f.maker.fgraph.toposort()[1].op.destroy_map.items())
+    assert (list(
-            == [(0, [0])])
+        f.maker.fgraph.toposort()[1].op.destroy_map.items()) == [
+            (0, [0])])
    a0 = a.get_value() * 1.0
    f(numpy.ones((4, 4), dtype='float32'))
@@ -495,7 +503,8 @@ def test_elemwise2():
                                               dtype='float32'), 'a')
    b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
    f = pfunc([b], [], updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) *
-        tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))], mode=mode_with_gpu)
+              tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))],
+              mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.fgraph.toposort()):
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
@@ -585,10 +594,11 @@ def test_elemwise_composite_float64():
        return l
    for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
                 mode_with_gpu.excluding('elemwise_fusion')]:
-        f = pfunc([a, b],
+        f = pfunc(
-                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2,
+            [a, b],
-                                               b),
+            tensor.cast(
-                                     'float32'), mode=mode)
+                tensor.lt(tensor.cast(a, 'float64') ** 2, b), 'float32'),
+            mode=mode)
        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))
@@ -648,11 +658,11 @@ def speed_elemwise_collapse():
    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, ::2, :, :]
    v = cuda_ndarray.CudaNdarray(v)
-    t1 = time.time()
+    time.time()
    for i in range(100):
        # let debugmode catch errors
        f(v)
-    t2 = time.time()
+    time.time()
 def speed_elemwise_collapse2():
@@ -672,11 +682,11 @@ def speed_elemwise_collapse2():
    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, :, :, ::2]
    v = cuda_ndarray.CudaNdarray(v)
-    t1 = time.time()
+    time.time()
    for i in range(100):
        # let debugmode catch errors
        f(v)
-    t2 = time.time()
+    time.time()
 def test_elemwise_collapse():
@@ -848,8 +858,8 @@ def test_hostfromgpu_shape_i():
    ca = theano.sandbox.cuda.var.CudaNdarrayType((False, False))()
    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
-    cv = cuda.CudaNdarray(numpy.asarray(numpy.random.rand(5, 4),
+    cv = cuda.CudaNdarray(numpy.asarray(
-                                      dtype='float32'))
+        numpy.random.rand(5, 4), dtype='float32'))
    f = theano.function([a], cuda.basic_ops.gpu_from_host(a), mode=m)
    assert cuda.basic_ops.gpu_from_host in [x.op
@@ -880,7 +890,7 @@ def test_gpujoin_assert_cndas():
    a = theano.shared(_a)
    try:
-        c = cuda.basic_ops.gpu_join(1, a)
+        cuda.basic_ops.gpu_join(1, a)
        # can't "assert False" here, as we want the assertion
        # error from gpu_join
    except TypeError:
@@ -921,12 +931,17 @@ def test_gpujoin_gpualloc():
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')
-    f = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
+    f = theano.function(
+        [a, b],
+        T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
        mode=mode_without_gpu)
-    f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
+    f_gpu = theano.function(
+        [a, b],
+        T.join(0, T.zeros_like(a), T.ones_like(b)),
        mode=mode_with_gpu)
-    f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a),
+    f_gpu2 = theano.function(
-                                           T.ones_like(b)) + 4,
+        [a, b],
+        T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
        mode=mode_with_gpu)
    assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2
@@ -963,9 +978,6 @@ def test_gpualloc_output_to_gpu():
    assert numpy.allclose(f(5), f_gpu(5))
-import theano.tensor.tests.test_basic
 class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
    dtype = "float32"
    mode = mode_with_gpu
@@ -987,7 +999,6 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
        self.shared = cuda.shared_constructor
-import theano.tensor.tests.test_subtensor
 # This is to don't duplicate test.
@@ -1035,7 +1046,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
                                 ((3, 10, 68000), [1, 2], True),
                                 ((3, 69000, 11), [1, 2], True),
                                 # much memory, will be disabled if needed
-                                 ((2*10e7,), [-1, 199999999], True),
+                                 ((2 * 10e7,), [-1, 199999999], True),
                                 ((4, 5), [2, 3], True),
                                 ((4, 2, 3), [0, 3], True),
                                 ((4, 2, 3), [3, 3, 1, 1, 2,
@@ -1047,8 +1058,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
                                 # optimized for that case.
                                 ((4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0,
                                                 -1, -2, -3, -4], False),
-                                 ((1, 10), [0, 0], True),
+                                 ((1, 10), [0, 0], True), ]:
-                             ]:
            # If there is not enough memory on the GPU, skip the test
            size_needed = numpy.prod(shape) * (4 + 1)
            if isinstance(theano.compile.get_default_mode(),
@@ -1106,13 +1116,14 @@ def test_advinc_subtensor1():
        rep[[0, 2]] += yval
        utt.assert_allclose(rval, rep)
 def test_advset_subtensor1():
    """ Test GPU version of set_subtensor on vectors (uses GpuAdvancedIncSubtensor1) """
    shp = (10,)
    shared = cuda.shared_constructor
    xval = numpy.arange(shp[0], dtype='float32').reshape(shp) + 1
-    idxs = numpy.array([0,2,5,7,3], dtype='int32')
+    idxs = numpy.array([0, 2, 5, 7, 3], dtype='int32')
-    yval = numpy.ones(len(idxs), dtype='float32')*10
+    yval = numpy.ones(len(idxs), dtype='float32') * 10
    x = shared(xval, name='x')
    y = T.tensor(dtype='float32', broadcastable=(False,) * len(shp), name='y')
    expr = T.advanced_set_subtensor1(x, y, idxs)
@@ -1124,13 +1135,14 @@ def test_advset_subtensor1():
    rep[idxs] = yval
    utt.assert_allclose(rval, rep)
 def test_advset_subtensor1_2d():
    """ Test GPU version of set_subtensor on matrices (uses GpuAdvancedIncSubtensor1_dev20 if compute capability >= 2.0) """
-    shp = (10,5)
+    shp = (10, 5)
    shared = cuda.shared_constructor
    xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
-    idxs = numpy.array([0,2,5,7,3], dtype='int32')
+    idxs = numpy.array([0, 2, 5, 7, 3], dtype='int32')
-    yval = numpy.ones((len(idxs), shp[1]), dtype='float32')*10
+    yval = numpy.ones((len(idxs), shp[1]), dtype='float32') * 10
    x = shared(xval, name='x')
    y = T.tensor(dtype='float32', broadcastable=(False,) * len(shp), name='y')
    expr = T.advanced_set_subtensor1(x, y, idxs)
@@ -1142,37 +1154,38 @@ def test_advset_subtensor1_2d():
    rep[idxs] = yval
    utt.assert_allclose(rval, rep)
 def test_inc_subtensor():
-    shared = cuda.shared_constructor
+    cuda.shared_constructor
-    #shared = tensor.shared
+    # shared = tensor.shared
    x, y = T.fmatrices('x', 'y')
-    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+    xval = numpy.asarray(
-                      dtype='float32')
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')
-    yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]],
+    yval = numpy.asarray(
-                      dtype='float32')
+        [[10, 10, 10], [10, 10, 10], [10, 10, 10]], dtype='float32')
    expr = T.inc_subtensor(x[:, 1:3], y[:, 1:3])
    f = theano.function([x, y], expr, mode=mode_with_gpu)
    assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
-                node.op.set_instead_of_inc == False
+                node.op.set_instead_of_inc is False
                for node in f.maker.fgraph.toposort()]) == 1
    utt.assert_allclose(f(xval, yval), [[1., 12., 13.],
                                        [4., 15., 16.], [7., 18., 19.]])
 def test_set_subtensor():
-    shared = cuda.shared_constructor
+    cuda.shared_constructor
-    #shared = tensor.shared
+    # shared = tensor.shared
    x, y = T.fmatrices('x', 'y')
-    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+    xval = numpy.asarray(
-                      dtype='float32')
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')
-    yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]],
+    yval = numpy.asarray(
-                      dtype='float32')
+        [[10, 10, 10], [10, 10, 10], [10, 10, 10]], dtype='float32')
    expr = T.set_subtensor(x[:, 1:3], y[:, 1:3])
    f = theano.function([x, y], expr, mode=mode_with_gpu)
    assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
-                node.op.set_instead_of_inc == True
+               node.op.set_instead_of_inc is True
                for node in f.maker.fgraph.toposort()]) == 1
    f(xval, yval)
@@ -1191,7 +1204,7 @@ def test_many_arg_elemwise():
                        for arg in xrange(0, num_args)]
                symb_args = [theano.tensor.TensorType('float32',
-                                                      (False,)*nb_dim)()
+                                                      (False,) * nb_dim)()
                             for arg in xrange(0, num_args)]
                outputs = []
@@ -1313,7 +1326,6 @@ class test_size(unittest.TestCase):
        assert y.size == theano.function([], x.size)()
-import theano.tensor.tests.test_sharedvar
 # This test the case when the shared constructor view an CudaNdarray as input
 test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester(
    shared_constructor_=tcn.shared_constructor,
@@ -1374,7 +1386,7 @@ def speed_reduce10():
 if __name__ == '__main__':
-    #test_many_arg_elemwise()
+    # test_many_arg_elemwise()
-    #test_gpujoin_assert_cndas()
+    # test_gpujoin_assert_cndas()
    test_advset_subtensor1()
    test_advset_subtensor1_2d()
--- a/theano/sandbox/cuda/tests/test_bench_loopfusion.py
+++ b/theano/sandbox/cuda/tests/test_bench_loopfusion.py
@@ -10,7 +10,7 @@ from __future__ import absolute_import, print_function, division
 # so state is ignored
 # since this job is not restartable, channel is also ignored
-import logging, time, sys
+import logging
 import numpy
 from six.moves import xrange
@@ -18,8 +18,12 @@ from six.moves import xrange
 import theano
 from theano.compile import shared, pfunc
 from theano import tensor
-from theano.tensor.nnet import softplus
 from theano.tensor.nnet.nnet import softsign
+try:
+    from PIL import Image
+except ImportError:
+    Image = None
+# from PIL import Image
 _logger = logging.getLogger('theano.sandbox.cuda.tests.test_bench_loopfusion')
@@ -28,7 +32,8 @@ def _shared_uniform(rng, low, high, size, dtype, name=None):
    return shared(
        theano._asarray(
            rng.uniform(low=low, high=high, size=size),
-                dtype=dtype), name)
+            dtype=dtype),
+        name)
 class Kouh2008(object):
@@ -49,8 +54,10 @@ class Kouh2008(object):
        """
        if len(w_list) != len(x_list):
            raise ValueError('w_list must have same len as x_list')
-        output = (sum(w * tensor.pow(x, p) for (w, x) in zip(w_list, x_list)))\
+        output = ((sum(w * tensor.pow(x, p)
-                / (theano._asarray(eps, dtype=k.type.dtype) + k + tensor.pow(sum(tensor.pow(x, q) for x in x_list), r))
+                       for (w, x) in zip(w_list, x_list))) /
+                  (theano._asarray(eps, dtype=k.type.dtype) + k +
+                   tensor.pow(sum(tensor.pow(x, q) for x in x_list), r)))
        assert output.type.ndim == 2
        self.__dict__.update(locals())
@@ -80,9 +87,14 @@ class Kouh2008(object):
            w_sm = theano.tensor.nnet.softmax(w)
            w_list = [w_sm[:, i] for i in xrange(n_terms)]
            w_l1 = abs(w).sum()
-            w_l2_sqr = (w**2).sum()
+            w_l2_sqr = (w ** 2).sum()
        else:
-            w_list = [shared_uniform(low=-2.0/n_terms, high=2.0/n_terms, size=(n_out,), name='w_%i'%i)
+            w_list = [
+                shared_uniform(
+                    low=-2.0 / n_terms,
+                    high=2.0 / n_terms,
+                    size=(n_out,),
+                    name='w_%i' % i)
                for i in xrange(n_terms)]
            w_l1 = sum(abs(wi).sum() for wi in w_list)
            w_l2_sqr = sum((wi**2).sum() for wi in w_list)
@@ -102,18 +114,26 @@ class Kouh2008(object):
        p = tensor.nnet.sigmoid(p_unbounded) * e_range_mag + e_range_low
        q = tensor.nnet.sigmoid(q_unbounded) * e_range_mag + e_range_low
        r = tensor.nnet.sigmoid(r_unbounded) * \
-                theano._asarray(1.0/e_range_low - 1.0/e_range_high, dtype=dtype) \
+            theano._asarray(1.0 / e_range_low - 1.0 / e_range_high,
-                + theano._asarray(1.0/e_range_high, dtype=dtype)
+                            dtype=dtype) + \
+            theano._asarray(1.0 / e_range_high, dtype=dtype)
        k = softsign(k_unbounded)
        if use_softmax_w:
            rval = cls(w_list, x_list, p, q, r, k,
-                    params=[p_unbounded, q_unbounded, r_unbounded, k_unbounded, w] + params,
+                       params=[p_unbounded,
+                               q_unbounded,
+                               r_unbounded,
+                               k_unbounded,
+                               w] + params,
                       updates=updates)
        else:
            rval = cls(w_list, x_list, p, q, r, k,
-                    params=[p_unbounded, q_unbounded, r_unbounded, k_unbounded] + w_list + params,
+                       params=[p_unbounded,
+                               q_unbounded,
+                               r_unbounded,
+                               k_unbounded] + w_list + params,
                       updates=updates)
        rval.p_unbounded = p_unbounded
        rval.q_unbounded = q_unbounded
@@ -126,8 +146,10 @@ class Kouh2008(object):
        return rval
    @classmethod
-    def new_filters_expbounds(cls, rng, input, n_in, n_out, n_terms, dtype=None, eps=1e-1,
+    def new_filters_expbounds(cls, rng, input, n_in, n_out, n_terms,
-            exponent_range=(1.0, 3.0), filter_range=1.0):
+                              dtype=None, eps=1e-1,
+                              exponent_range=(1.0, 3.0),
+                              filter_range=1.0):
        """Return a KouhLayer instance with random parameters
        The parameters are drawn on a range [typically] suitable for fine-tuning by gradient
@@ -161,18 +183,29 @@ class Kouh2008(object):
        def shared_uniform(low, high, size, name):
            return _shared_uniform(rng, low, high, size, dtype, name)
-        f_list = [shared_uniform(low=-2.0/numpy.sqrt(n_in), high=2.0/numpy.sqrt(n_in), size=(n_in, n_out), name='f_%i'%i)
+        f_list = [shared_uniform(low=-2.0 / numpy.sqrt(n_in),
+                                 high=2.0 / numpy.sqrt(n_in),
+                                 size=(n_in, n_out),
+                                 name='f_%i' % i)
                  for i in xrange(n_terms)]
-        b_list = [shared_uniform(low=0, high=.01, size=(n_out,), name='b_%i'%i)
+        b_list = [shared_uniform(low=0,
+                                 high=.01,
+                                 size=(n_out,),
+                                 name='b_%i' % i)
                  for i in xrange(n_terms)]
-        #x_list = [theano._asarray(eps, dtype=dtype)+softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)]
+        # x_list = [theano._asarray(eps, dtype=dtype) + softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)]
        filter_range = theano._asarray(filter_range, dtype=dtype)
-        half_filter_range = theano._asarray(filter_range/2, dtype=dtype)
+        half_filter_range = theano._asarray(filter_range / 2,
-        x_list = [theano._asarray(filter_range + eps, dtype=dtype)+half_filter_range * softsign(tensor.dot(input, f_list[i]) +
+                                            dtype=dtype)
-            b_list[i]) for i in xrange(n_terms)]
+        x_list = [
+            theano._asarray(filter_range + eps, dtype=dtype) +
+            half_filter_range * softsign(
+                tensor.dot(input, f_list[i]) + b_list[i])
+            for i in xrange(n_terms)]
-        rval = cls.new_expbounds(rng, x_list, n_out, dtype=dtype, params=f_list + b_list,
+        rval = cls.new_expbounds(
+            rng, x_list, n_out, dtype=dtype, params=f_list + b_list,
            exponent_range=exponent_range)
        rval.f_list = f_list
        rval.input = input  # add the input to the returned object
@@ -183,6 +216,8 @@ class Kouh2008(object):
    def img_from_weights(self, rows=None, cols=None, row_gap=1, col_gap=1, eps=1e-4):
        """ Return an image that visualizes all the weights in the layer.
        """
+        if Image is None:
+            raise ImportError("No module named PIL")
        n_in, n_out = self.f_list[0].value.shape
@@ -190,10 +225,12 @@ class Kouh2008(object):
            rows = int(numpy.sqrt(n_out))
        if cols is None:
            cols = n_out // rows
-            if n_out % rows: cols += 1
+            if n_out % rows:
+                cols += 1
        if rows is None:
            rows = n_out // cols
-            if n_out % cols: rows += 1
+            if n_out % cols:
+                rows += 1
        filter_shape = self.filter_shape
        height = rows * (row_gap + filter_shape[0]) - row_gap
@@ -203,34 +240,40 @@ class Kouh2008(object):
        w = self.w.value
        w_col = 0
        def pixel_range(x):
            return 255 * (x - x.min()) / (x.max() - x.min() + eps)
        for r in xrange(rows):
-            out_r_low = r*(row_gap + filter_shape[0])
+            out_r_low = r * (row_gap + filter_shape[0])
            out_r_high = out_r_low + filter_shape[0]
            for c in xrange(cols):
-                out_c_low = c*(col_gap + filter_shape[1])
+                out_c_low = c * (col_gap + filter_shape[1])
                out_c_high = out_c_low + filter_shape[1]
-                out_tile = out_array[out_r_low:out_r_high, out_c_low:out_c_high, :]
+                out_tile = out_array[out_r_low:out_r_high,
+                                     out_c_low:out_c_high,
+                                     :]
                if c % 3 == 0:  # linear filter
                    if w_col < w.shape[1]:
-                        out_tile[...] = pixel_range(w[:, w_col]).reshape(filter_shape+(1,))
+                        out_tile[...] = pixel_range(
+                            w[:, w_col]).reshape(filter_shape + (1,))
                        w_col += 1
                if c % 3 == 1:  # E filters
                    if w_col < w.shape[1]:
                        # filters after the 3rd do not get rendered, but are skipped over.
                        #  there are only 3 colour channels.
                        for i in xrange(min(self.n_E_quadratic, 3)):
-                            out_tile[:, :, i] = pixel_range(w[:, w_col+i]).reshape(filter_shape)
+                            out_tile[:, :, i] = pixel_range(
+                                w[:, w_col + i]).reshape(filter_shape)
                        w_col += self.n_E_quadratic
                if c % 3 == 2:  # S filters
                    if w_col < w.shape[1]:
                        # filters after the 3rd do not get rendered, but are skipped over.
                        #  there are only 3 colour channels.
                        for i in xrange(min(self.n_S_quadratic, 3)):
-                            out_tile[:, :, 2-i] = pixel_range(w[:, w_col+i]).reshape(filter_shape)
+                            out_tile[:, :, 2 - i] = pixel_range(
+                                w[:, w_col + i]).reshape(filter_shape)
                        w_col += self.n_S_quadratic
        return Image.fromarray(out_array, 'RGB')
@@ -264,8 +307,9 @@ class Config(object):
    ft_batchsize = 30
    ft_epoch_len = 50000
-    ft_status_interval = 50  # property( lambda s:s.ft_epoch_len/s.ft_batchsize)
+    ft_status_interval = 50  # property(lambda s:s.ft_epoch_len/s.ft_batchsize)
-    ft_validation_interval = property( lambda s: s.ft_epoch_len/s.ft_batchsize)
+    ft_validation_interval = property(
+        lambda s: s.ft_epoch_len / s.ft_batchsize)
    ft_ntrain_limit = 0
    ft_test_lag1 = True
@@ -297,7 +341,8 @@ if 0:
        s_lr = theano.tensor.fscalar()
        if not debug:
            sshape = (None, 784)
-        else: sshape = (None, 3)
+        else:
+            sshape = (None, 3)
        x = theano.tensor.TensorType(dtype=conf.dtype, broadcastable=(0, 0), shape=sshape)()
        y = theano.tensor.lvector()
@@ -315,7 +360,8 @@ if 0:
        print(layer.params)
        gparams = theano.tensor.grad(cost, layer.params)
-        updates = [(p, p - s_lr*gp) for p, gp in zip(layer.params, gparams)]
+        updates = [
+            (p, p - s_lr * gp) for p, gp in zip(layer.params, gparams)]
        train_nll = pfunc([x, y, s_lr], [], updates=updates)

--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -8,31 +8,31 @@ from theano import tensor
 from theano.tests import unittest_tools
 import numpy
-# Skip test if cuda_ndarray is not available.
-from nose.plugins.skip import SkipTest
-import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
-    raise SkipTest('Optional package cuda disabled')
 import theano.sandbox.cuda as tcn
-from theano.tensor.signal.pool import (Pool,
-        PoolGrad, DownsampleFactorMaxGradGrad)
 import theano.compile.mode
 from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
 from theano.sandbox.cuda.blas import batched_dot, GpuBatchedDot
+from theano.tensor.signal.pool import (Pool, PoolGrad, DownsampleFactorMaxGradGrad)
+# Skip test if cuda_ndarray is not available.
+from nose.plugins.skip import SkipTest
+import theano.sandbox.cuda as cuda_ndarray
+if cuda_ndarray.cuda_available is False:
+    raise SkipTest('Optional package cuda disabled')
 if theano.config.mode == 'FAST_COMPILE':
-    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+    mode_with_gpu = theano.compile.mode.get_mode(
+        'FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode(
        'FAST_RUN').excluding('gpu')
 else:
-    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
+    mode_with_gpu = theano.compile.mode.get_default_mode(
-    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
+        ).including('gpu')
+    mode_without_gpu = theano.compile.mode.get_default_mode(
+        ).excluding('gpu')
 # The CPU tests already compare C/Py, so we only check C/GPU
 mode_with_gpu = copy.copy(mode_with_gpu)
@@ -55,73 +55,81 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
            def cmp(a_shp, b_shp):
-                a=numpy.random.randn(*a_shp).astype(numpy.float32)
+                a = numpy.random.randn(* a_shp).astype(numpy.float32)
-                b=numpy.random.randn(*b_shp).astype(numpy.float32)
+                b = numpy.random.randn(* b_shp).astype(numpy.float32)
-                x=tensor.ftensor3()
+                x = tensor.ftensor3()
-                y=tensor.ftensor3()
+                y = tensor.ftensor3()
-                f=theano.function([x,y], batched_dot(x,y), mode=mode_with_gpu)
+                f = theano.function([x, y],
+                                    batched_dot(x, y),
+                                    mode=mode_with_gpu)
-                z0=numpy.asarray(f(a,b))
+                z0 = numpy.asarray(f(a, b))
                ga = cuda_ndarray.CudaNdarray(a)
                gb = cuda_ndarray.CudaNdarray(b)
-                z1=numpy.asarray(f(ga,gb))
+                z1 = numpy.asarray(f(ga, gb))
+                z_test = numpy.sum(
+                    a[:, :, :, None] * b[:, None, :, :], axis=-2)
+                z1 = numpy.asarray(f(ga, gb))
-                z_test = numpy.sum(a[:,:,:,None]*b[:,None,:,:],axis=-2)
+                z_test = numpy.sum(
+                    a[:, :, :, None] * b[:, None, :, :], axis=-2)
                unittest_tools.assert_allclose(z0, z_test)
                unittest_tools.assert_allclose(z1, z_test)
-            cmp((5,4,3), (5,3,2))
+            cmp((5, 4, 3), (5, 3, 2))
-            cmp((5,3,3), (5,3,3))
+            cmp((5, 3, 3), (5, 3, 3))
-            cmp((5,2,6), (5,6,3))
+            cmp((5, 2, 6), (5, 6, 3))
            # Test dimensions of 0
-            cmp((0,2,6), (0,6,3))
+            cmp((0, 2, 6), (0, 6, 3))
-            cmp((5,0,3), (5,3,2))
+            cmp((5, 0, 3), (5, 3, 2))
-            cmp((5,4,0), (5,0,2))
+            cmp((5, 4, 0), (5, 0, 2))
-            cmp((5,4,3), (5,3,0))
+            cmp((5, 4, 3), (5, 3, 0))
-            cmp((0,0,0), (0,0,0))
+            cmp((0, 0, 0), (0, 0, 0))
            # Test dimensions of 1
-            cmp((1,2,6), (1,6,3))
+            cmp((1, 2, 6), (1, 6, 3))
-            cmp((5,1,3), (5,3,2))
+            cmp((5, 1, 3), (5, 3, 2))
-            cmp((5,4,1), (5,1,2))
+            cmp((5, 4, 1), (5, 1, 2))
-            cmp((5,4,3), (5,3,1))
+            cmp((5, 4, 3), (5, 3, 1))
    def test_batched_dot_errors(self):
        def fail(a_shp, b_shp):
-            a=numpy.random.randn(*a_shp).astype(numpy.float32)
+            a = numpy.random.randn(* a_shp).astype(numpy.float32)
-            b=numpy.random.randn(*b_shp).astype(numpy.float32)
+            b = numpy.random.randn(* b_shp).astype(numpy.float32)
-            x=tensor.ftensor3()
+            x = tensor.ftensor3()
-            y=tensor.ftensor3()
+            y = tensor.ftensor3()
-            f=theano.function([x,y], batched_dot(x,y), mode=mode_with_gpu)
+            f = theano.function([x, y],
+                                batched_dot(x, y),
+                                mode=mode_with_gpu)
-            z = f(a,b)
+            f(a, b)
        # Different batch size
-        self.assertRaises(RuntimeError, fail, (5,4,3), (6,3,2))
+        self.assertRaises(RuntimeError, fail, (5, 4, 3), (6, 3, 2))
        # Shape mismatch
-        self.assertRaises(RuntimeError, fail, (5,4,3), (5,2,2))
+        self.assertRaises(RuntimeError, fail, (5, 4, 3), (5, 2, 2))
    def test_batched_dot_gradient(self):
-        for threshold in [0, 100]:
        unittest_tools.verify_grad(
-                GpuBatchedDot(stream_threshold=threshold),
+            batched_dot, [
-                [numpy.random.randn(5,7,2).astype(numpy.float32),
+                numpy.random.randn(5, 7, 2).astype(numpy.float32),
-                 numpy.random.randn(5,2,6).astype(numpy.float32)],
+                numpy.random.randn(5, 2, 6).astype(numpy.float32)],
            mode=mode_with_gpu)
    def test_infer_shape(self):
-        # only matrix/matrix is supported
+        # only matrix / matrix is supported
        admat = tensor.ftensor3()
        bdmat = tensor.ftensor3()
        admat_val = my_rand(7, 4, 5)
@@ -134,22 +142,21 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
 def test_dot22():
    def cmp(a_shp, b_shp):
-        a0 = my_rand(*a_shp)
+        a0 = my_rand(* a_shp)
        a = tcn.shared_constructor(a0, 'a')
        b = tensor.fmatrix()
        f = pfunc([b], [], updates=[(a, tensor.dot(a, b))], mode=mode_with_gpu)
-        bval = my_rand(*b_shp)
+        bval = my_rand(* b_shp)
        f(bval)
        assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
-        a.set_value(
+        a.set_value(a.get_value(borrow=True,
-                a.get_value(borrow=True,
                    return_internal_type=True)[::-1, ::-1],
                    borrow=True)
        f(bval)
@@ -224,7 +231,7 @@ def test_gemm():
        assert any([node.op == tcn.blas.gpu_gemm_inplace
                    for node in f.maker.fgraph.toposort()])
-        bval = my_rand(*b_shp)
+        bval = my_rand(* b_shp)
        cval = my_rand(a_shp[0], b_shp[1])
        f(bval, cval)
@@ -233,8 +240,7 @@ def test_gemm():
        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
-        a.set_value(
+        a.set_value(a.get_value(borrow=True,
-                a.get_value(borrow=True,
                                return_internal_type=True)[::-1, ::-1],
                    borrow=True)
        f(bval, cval)
@@ -250,7 +256,7 @@ def test_gemm():
 def test_gemm_no_inplace():
    def cmp(a_shp, b_shp):
-        a0 = my_rand(*a_shp)
+        a0 = my_rand(* a_shp)
        a = tcn.shared_constructor(a0, 'a')
        cval = my_rand(a_shp[0], b_shp[1])
        c = tcn.shared_constructor(cval.copy(), 'c')
@@ -258,8 +264,7 @@ def test_gemm_no_inplace():
        b = tcn.fmatrix('b')
        b2 = tcn.fmatrix('b2')
-        f = pfunc(
+        f = pfunc([b, b2],
-                [b, b2],
                  [tensor.dot(a, b2) + c],
                  updates=[(a, tensor.dot(a, b) + c)],
                  mode=mode_with_gpu)
@@ -276,7 +281,8 @@ def test_gemm_no_inplace():
        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(
-                a.get_value(borrow=True,
+            a.get_value(
+                borrow=True,
                return_internal_type=True)[::-1, ::-1],
            borrow=True)
        f(bval, bval2)
@@ -303,8 +309,8 @@ if 0:
    def test_maxpool():
        """TODO: test the gpu version!!! """
        for d0, d1, r_true, r_false in [(4, 4, [[[[5, 7], [13, 15]]]], [[[[5, 7], [13, 15]]]]),
-                                        (5, 5, [[[[6, 8], [ 16, 18], [ 21, 23]]]],
+                                        (5, 5, [[[[6, 8], [16, 18], [21, 23]]]],
-                                         [[[[6, 8, 9], [ 16, 18, 19], [ 21, 23, 24]]]])]:
+                                         [[[[6, 8, 9], [16, 18, 19], [21, 23, 24]]]])]:
            for border, ret in [(True, r_true), (False, r_false)]:
                ret = numpy.array(ret)
                a = tcn.blas.Pool((2, 2), border)
@@ -312,7 +318,7 @@ if 0:
                b = dmatrix4()
                f = pfunc([b], [a(b)], mode=mode_with_gpu)
-                bval = numpy.arange(0, d0*d1).reshape(1, 1, d0, d1)
+                bval = numpy.arange(0, d0 * d1).reshape(1, 1, d0, d1)
                r = f(bval)[0]
    #            print bval, bval.shape, border
                # print r, r.shape
@@ -347,8 +353,7 @@ def test_downsample():
            (1, 1, 1025, 10),
            (1, 1, 1023, 10),
            (65536, 1, 10, 10),
-            (1, 65536, 10, 10),
+            (1, 65536, 10, 10), ]
-             ]
    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)
@@ -413,10 +418,11 @@ def test_downsample():
                gg = pfunc([], ggf, mode=gpu_mode)
                gg2 = pfunc([], ggf, mode=ref_mode)
-                assert any([isinstance(node.op,
+                assert any([isinstance(
-                                       tcn.blas.GpuDownsampleFactorMaxGradGrad)
+                    node.op, tcn.blas.GpuDownsampleFactorMaxGradGrad)
                    for node in gg.maker.fgraph.toposort()])
-                assert any([isinstance(node.op, DownsampleFactorMaxGradGrad)
+                assert any([isinstance(
+                    node.op, DownsampleFactorMaxGradGrad)
                    for node in gg2.maker.fgraph.toposort()])
                assert numpy.allclose(gg(), gg2()), shp
@@ -434,6 +440,7 @@ class TestGpuGemv(TestCase, BaseGemv,
    gemv = gpu_gemv_no_inplace
    gemv_inplace = gpu_gemv_inplace
    # Mimic shared constructors registry
    @staticmethod
    def shared(val):
        # If we don't put shared on the GPU, we won't be able to test
@@ -531,7 +538,9 @@ class TestVectorMatrixDot(TestCase):
        gpu_f = theano.function([], v2 + theano.dot(m, v1), mode=mode_with_gpu)
        # gpu_f2 is needed to test the case when the input is not on the gpu
        # but the output is moved to the gpu.
-        gpu_f2 = theano.function([], tcn.gpu_from_host(v2 + theano.dot(m, v1)),
+        gpu_f2 = theano.function(
+            [],
+            tcn.gpu_from_host(v2 + theano.dot(m, v1)),
            mode=mode_with_gpu)
        # Assert they produce the same output
@@ -556,7 +565,8 @@ class TestVectorMatrixDot(TestCase):
                                mode=mode_with_gpu)
        # gpu_f2 is needed to test the case when the input is not on the gpu
        # but the output is moved to the gpu.
-        gpu_f2 = theano.function([], tcn.gpu_from_host(v2 + theano.dot(v1, m)),
+        gpu_f2 = theano.function(
+            [], tcn.gpu_from_host(v2 + theano.dot(v1, m)),
            mode=mode_with_gpu)
        # Assert they produce the same output

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -2,14 +2,16 @@
 Tests for GPU convolution
 """
 from __future__ import absolute_import, print_function, division
-import sys
 import time
 import unittest
-import traceback
+import theano
+from theano import tensor
+from theano.tests.unittest_tools import seed_rng, assert_allclose
+from theano.sandbox import cuda
 import numpy
 from six.moves import xrange
+from theano.sandbox.cuda.dnn import GpuDnnConv, DnnBase, dnn_conv
 from nose.plugins.skip import SkipTest
 from nose.tools import assert_raises
 imported_scipy_convolve2d = False
@@ -19,16 +21,10 @@ try:
 except ImportError:
    pass
-import theano
-from theano import tensor
-from theano.tests.unittest_tools import seed_rng, assert_allclose
 # Skip test if cuda is not available.
-from theano.sandbox import cuda
+if cuda.cuda_available is False:
-if cuda.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')
-from theano.sandbox.cuda.dnn import GpuDnnConv, DnnBase, dnn_conv
 # needed as the gpu conv don't have a perform implementation.
 if theano.config.mode == 'FAST_COMPILE':
@@ -106,11 +102,11 @@ def py_conv(img, kern, mode, subsample):
    if imported_scipy_convolve2d:
        return py_conv_scipy(img, kern, mode, subsample)
    elif mode == 'valid':
-        return py_conv_valid_numpy(img, kern)[:, :, ::subsample[0],
+        return py_conv_valid_numpy(img, kern)[
-                                                      ::subsample[1]]
+            :, :, ::subsample[0], ::subsample[1]]
    elif mode == 'full':
-        return py_conv_full_numpy(img, kern)[:, :, ::subsample[0],
+        return py_conv_full_numpy(img, kern)[
-                                                     ::subsample[1]]
+            :, :, ::subsample[0], ::subsample[1]]
    else:
        raise Exception("Can't execute this kernel.")
@@ -129,7 +125,7 @@ def py_conv_scipy(img, kern, mode, subsample):
    for b in xrange(out.shape[0]):
        for k in xrange(out.shape[1]):
            for s in xrange(img.shape[1]):
-                #convolve2d or correlate
+                # convolve2d or correlate
                out[b, k, :, :] += convolve2d(img[b, s, :, :],
                                              kern[k, s, :, :],
                                              mode)
@@ -168,10 +164,12 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
-        npy_img = theano._asarray(numpy.arange(
+        npy_img = theano._asarray(
-                numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
+            numpy.arange(numpy.prod(ishape)).reshape(ishape),
-        npy_kern = -(theano._asarray(numpy.arange(
+            dtype='float32') + 1
-                    numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
+        npy_kern = -(theano._asarray(
+            numpy.arange(numpy.prod(kshape)).reshape(kshape),
+            dtype='float32') + 1)
    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)
@@ -281,15 +279,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
        ((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride),
        # nkern only
        ((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
-        #batch and nkern
+        # batch and nkern
        ((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
-        #batch and stack
+        # batch and stack
        ((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride),
-        #stack and nkern
+        # stack and nkern
        ((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
-        #batch, nkern and stack
+        # batch, nkern and stack
        ((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
-        #batch, nkern and stack
+        # batch, nkern and stack
        ((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride)
        ]
@@ -345,37 +343,37 @@ def get_valid_shapes():
    shapes += [
        # other test
-              ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
+        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize
+        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
-            , ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image
+        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
-            , ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image, non-square kern
+        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
-            , ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
+        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern, kernsize==imgsize on one dim
-            , ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1))  # a big one
+        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
-            , ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # MNIST LeNET layer 1
+        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
-            , ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1))  # layer 1 backprop to weights
+        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)),  # layer 1 backprop to weights
-            , ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1))  # added a test case that fail from test_nnet.py.test_conv_nnet2
+        ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)),  # added a test case that fail from test_nnet.py.test_conv_nnet2
-            , ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1))  # test precedent but reduced that triger the error
+        ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)),  # reduced version of the previous test that triggers the error
        # Test more than maxThreadsDim0
-            , ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+        ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
-            , ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+        ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
        ]
-    shapes += [ ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 1 layers
+    shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 1 layers
-            , ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 2 layers
+               ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 2 layers
-            , ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 bprop 1 full
+               ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
-            , ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1))  # test_lenet_28 bprop 2 valid
+               ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 2 valid
-#            , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
+               # ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 2 valid
-            , ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 1 layers
+               ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 1 layers
-            , ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 2 layers
+               ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 2 layers
-            , ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 full
+               ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 full
-#            , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
+               # ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 bprop 1
-#            , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
+               # ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))  # test_lenet_64 bprop 2
               ]
    return shapes
@@ -466,47 +464,46 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[],
    shapes += [
        # other test
-              ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
+        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize
+        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
-            , ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image
+        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
-            , ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image, non-square kern
+        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
-            , ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
+        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
-            , ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1))  # a big one
+        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
-            , ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # MNIST LeNET layer 1
+        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
-            , ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1))  # layer 1 backprop to weights
+        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1))  # layer 1 backprop to weights
        ]
    if test_bigger_kernels:
        # Shapes where the kernel is larger than the image in some dimension
        shapes += [
-              ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1))
+            ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1))
+            ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1))
+            ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1))
+            ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)),
-            , ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
+            ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
            ]
-    shapes += [
+    shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 1 layers
-#        ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
+               # ((60, 20, 12, 12),(30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 2 layers
-#            , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
+               ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
-             ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 bprop 1 full
+               # ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 2 valid
-#            , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
+               # ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 2 valid
-#            , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
+               # ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 1 layers
-#            , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
+               # ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 2 layers
-#            , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
+               ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 full
-            , ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 full
+               # ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 bprop 1
-#            , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
+               # ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 bprop 2
-#            , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
               # Test more than maxThreadsDim0
-            , ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+               ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
-            , ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+               ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
-            , ((1, 1, 44800, 1), (6, 1, 1, 1), (1, 1), (1, 1), (1, 1))  # This caused crash
+               ((1, 1, 44800, 1), (6, 1, 1, 1), (1, 1), (1, 1), (1, 1))  # This caused crash
               ]
    verbose = 0
@@ -636,7 +633,6 @@ class TestConv2DGPU(unittest.TestCase):
                              imshp_logical=featshp_logical[1:],
                              kshp_logical=kshp[2:])
    def test_invalid_input_shape(self):
        """
        Tests that when the shape gived at build time is not the same as
@@ -659,7 +655,7 @@ class TestConv2DGPU(unittest.TestCase):
                for mode in ['valid', 'full']:
                    for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
                                   ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
-                                   #((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
+                                   # ((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
                                   # We use only the number of columns.
                                   ]:
@@ -777,8 +773,8 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = numpy.array(f(
            npy_img.transpose(1, 0, 2, 3),
-                npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])).transpose(
+            npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])
-            1, 0, 2, 3)
+            ).transpose(1, 0, 2, 3)
    assert_allclose(cpuval, gpuval, rtol=1e-4)
@@ -892,43 +888,43 @@ def benchmark():
    shapes_valid = [
        # test_lenet_28 shape
-        ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_32 shape
-        , ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_64 shape
-        , ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_108 shape
-        , ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_256 shape
-        , ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1))  # valid
+        ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1))  # valid
+        ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1))  # valid
        ]
    shapes_full = [
        # test_lenet_28 shape
-         ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # full
+        ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full
        # test_lenet_32 shape
-         , ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # full conv_full_patch_stack_padded' N=1
+        ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full conv_full_patch_stack_padded' N=1
        # test_lenet_64 shape
-         , ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # full conv_full_patch_stack_padded' N=3
+        ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full conv_full_patch_stack_padded' N=3
        # test_lenet_108 shape
-         , ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # full 'conv_full_patch_stack_padded' N=9
+        ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full 'conv_full_patch_stack_padded' N=9
        # test_lenet_256 shape
-         , ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1))  # full conv_reference_full
+        ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1))  # full conv_reference_full
        ]
    version = [-1]

--- a/theano/sandbox/cuda/tests/test_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_cuda_ndarray.py
 from __future__ import absolute_import, print_function, division
-import time, copy, sys, unittest
+import copy
+import unittest
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
@@ -32,7 +33,7 @@ def advantage(cpu_dt, gpu_dt):
 def test_host_to_device():
-    #print >>sys.stdout, 'starting test_host_to_dev'
+    # print >>sys.stdout, 'starting test_host_to_dev'
    for shape in ((), (3,), (2, 3), (3, 4, 5, 6)):
        a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
        b = cuda_ndarray.CudaNdarray(a)
@@ -52,8 +53,7 @@ def test_host_to_device():
 def test_add_iadd_idiv():
-    for shapes in (
+    for shapes in ([(5, 5), (5, 1)],
-                  [(5, 5), (5, 1)],
                   [(5, 5), (1, 5)],
                   (), (0,), (3,), (2, 3),
                   (1, 10000000), (10000, 1000), (1000000, 10),
@@ -98,16 +98,10 @@ def test_add_iadd_idiv():
        # add don't support stride
        if shape == shape2:
-            t0 = time.time()
            bsum = b0 + b1
            bsum = b0 + b1
-            t1 = time.time()
-            gpu_dt = t1 - t0
-            t0 = time.time()
            asum = a0 + a1
            asum = a0 + a1
-            t1 = time.time()
-            cpu_dt = t1 - t0
            # print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
            assert numpy.allclose(asum, numpy.asarray(bsum))
@@ -133,23 +127,9 @@ def test_add_iadd_idiv():
            raise Exception("You need to modify this case!")
        # TODO: b0[...,::-1] don't work
-        if shape == shape2:
-            t = False
-            try:
-                _c = _b+b1
-            except TypeError:
-                t = True
-            assert t
        # test inplace version
-        t0 = time.time()
        b0 += b1
-        t1 = time.time()
-        gpu_dt = t1 - t0
-        t0 = time.time()
        a0 += a1
-        t1 = time.time()
-        cpu_dt = t1 - t0
        # print shape, 'adding inplace', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
        assert numpy.allclose(a0, numpy.asarray(b0))
        assert numpy.allclose(a0, a0_orig + a1)
@@ -157,14 +137,14 @@ def test_add_iadd_idiv():
        b0 /= b1
        a0 /= a1
        assert numpy.allclose(a0, numpy.asarray(b0))
-        assert numpy.allclose(a0, (a0_orig + a1)/a1)
+        assert numpy.allclose(a0, (a0_orig + a1) / a1)
        # test inplace version
        # for not contiguous input
        b0 += _b
        a0 += a1[..., ::-1]
        assert numpy.allclose(a0, numpy.asarray(b0))
-        assert numpy.allclose(a0, (a0_orig+a1)/a1+a1[..., ::-1])
+        assert numpy.allclose(a0, (a0_orig + a1) / a1 + a1[..., ::-1])
        b0 /= _b
        a0 /= a1[..., ::-1]
@@ -174,48 +154,42 @@ def test_add_iadd_idiv():
 def test_exp():
-    #print >>sys.stdout, 'starting test_exp'
+    # print >>sys.stdout, 'starting test_exp'
    for shape in ((), (3,), (2, 3),
                  (1, 10000000), (10, 1000000),
                  (100, 100000), (1000, 10000), (10000, 1000)):
        a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
        a1 = a0.copy()
        b0 = cuda_ndarray.CudaNdarray(a0)
-        b1 = cuda_ndarray.CudaNdarray(a1)
+        cuda_ndarray.CudaNdarray(a1)
-        t0 = time.time()
        bsum = b0.exp()
-        t1 = time.time()
-        gpu_dt = t1 - t0
-        t0 = time.time()
        asum = numpy.exp(a1)
-        t1 = time.time()
-        cpu_dt = t1 - t0
        # print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
-        #c = numpy.asarray(b0+b1)
+        # c = numpy.asarray(b0+b1)
        if asum.shape:
            assert numpy.allclose(asum, numpy.asarray(bsum))
 def test_copy():
-    #print >>sys.stdout, 'starting test_copy'
+    # print >>sys.stdout, 'starting test_copy'
    shape = (500, 499)
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
-    #print >>sys.stdout, '.. creating device object'
+    # print >>sys.stdout, '.. creating device object'
    b = cuda_ndarray.CudaNdarray(a)
-    #print >>sys.stdout, '.. copy'
+    # print >>sys.stdout, '.. copy'
    c = copy.copy(b)
-    #print >>sys.stdout, '.. deepcopy'
+    # print >>sys.stdout, '.. deepcopy'
    d = copy.deepcopy(b)
-    #print >>sys.stdout, '.. comparisons'
+    # print >>sys.stdout, '.. comparisons'
    assert numpy.allclose(a, numpy.asarray(b))
    assert numpy.allclose(a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
    b += b
-    assert numpy.allclose(a+a, numpy.asarray(b))
+    assert numpy.allclose(a + a, numpy.asarray(b))
-    assert numpy.allclose(a+a, numpy.asarray(c))
+    assert numpy.allclose(a + a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
@@ -237,8 +211,8 @@ def test_nvcc_bug():
    assert numpy.allclose(a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
    b += b
-    assert numpy.allclose(a+a, numpy.asarray(b))
+    assert numpy.allclose(a + a, numpy.asarray(b))
-    assert numpy.allclose(a+a, numpy.asarray(c))
+    assert numpy.allclose(a + a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
@@ -318,7 +292,7 @@ class test_DimShuffle(unittest.TestCase):
 def test_dot():
-    #print >>sys.stdout, 'starting test_dot'
+    # print >>sys.stdout, 'starting test_dot'
    utt.seed_rng()
    rng = numpy.random.RandomState(utt.fetch_seed())
@@ -347,12 +321,14 @@ def test_dot():
    b0 = cuda_ndarray.CudaNdarray(a0)
    assert _allclose(numpy.dot(a0.T, a1),
-            cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)), b1))
+                     cuda_ndarray.dot(
+                         cuda_ndarray.dimshuffle(b0, (1, 0)), b1))
    a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
    b1 = cuda_ndarray.CudaNdarray(a1)
-    assert _allclose(numpy.dot(a0.T, a1.T),
+    assert _allclose(
+        numpy.dot(a0.T, a1.T),
        cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)),
                         cuda_ndarray.dimshuffle(b1, (1, 0))))
@@ -367,8 +343,8 @@ def test_sum():
    assert numpy.allclose(a0.sum(),
                          numpy.asarray(b0.reduce_sum([1, 1])))
-    a0sum = a0.sum(axis=0)
+    a0.sum(axis=0)
-    b0sum = b0.reduce_sum([1, 0])
+    b0.reduce_sum([1, 0])
    # print 'asum\n',a0sum
    # print 'bsum\n',numpy.asarray(b0sum)
@@ -399,8 +375,7 @@ def test_sum():
 def test_reshape():
-    shapelist = [
+    shapelist = [((1, 2, 3), (1, 2, 3)),
-            ((1, 2, 3), (1, 2, 3)),
                 ((1,), (1,)),
                 ((1, 2, 3), (3, 2, 1)),
                 ((1, 2, 3), (6,)),
@@ -423,7 +398,7 @@ def test_reshape():
    rng = numpy.random.RandomState(utt.fetch_seed())
    def subtest(shape_1, shape_2, rng):
-        #print >> sys.stdout, "INFO: shapes", shape_1, shape_2
+        # print >> sys.stdout, "INFO: shapes", shape_1, shape_2
        a = theano._asarray(rng.randn(*shape_1), dtype='float32')
        b = cuda_ndarray.CudaNdarray(a)
@@ -459,8 +434,8 @@ def test_reshape():
        b = cuda_ndarray.CudaNdarray(a)
        try:
-            bb = b.reshape(shape_2)
+            b.reshape(shape_2)
-        except Exception as ValueError:
+        except Exception:
            return
        assert False
@@ -509,7 +484,7 @@ def test_stride_manipulation():
    b_strides = b._strides
    for i in xrange(len(b.shape)):
-        offset += (b.shape[i]-1) * b_strides[i]
+        offset += (b.shape[i] - 1) * b_strides[i]
        v._set_stride(i, -b_strides[i])
    v._dev_data += offset * sizeof_float
@@ -699,8 +674,8 @@ def test_setitem_matrixvector1():
    assert numpy.allclose(a, numpy.asarray(_a))
    # test direct transfert from numpy
-    _a[:, 1] =  b*100
+    _a[:, 1] = b * 100
-    a[:, 1] =  b*100
+    a[:, 1] = b * 100
    assert numpy.allclose(a, numpy.asarray(_a))
    row = theano._asarray([777, 888, 999], dtype='float32')
@@ -725,8 +700,8 @@ def test_setitem_matrix_tensor3():
    assert numpy.allclose(a, numpy.asarray(_a))
    # test direct transfert from numpy
-    _a[:, 1, 1] = b*100
+    _a[:, 1, 1] = b * 100
-    a[:, 1, 1] = b*100
+    a[:, 1, 1] = b * 100
    assert numpy.allclose(a, numpy.asarray(_a))
    row = theano._asarray([777, 888, 999], dtype='float32')
@@ -752,7 +727,7 @@ def test_setitem_matrix_bad_shape():
        # attempt to assign the ndarray b with setitem
        _a[:, 1, 1] = _b
        assert False
-    except ValueError as e:
+    except ValueError:
        # print e
        assert True
@@ -761,7 +736,7 @@ def test_setitem_matrix_bad_shape():
        # attempt to assign the ndarray b with setitem
        _a[1, 1, :] = b
        assert False
-    except ValueError as e:
+    except ValueError:
        # print e
        assert True
@@ -779,7 +754,7 @@ def test_setitem_matrix_bad_ndim():
        # attempt to assign the ndarray b with setitem
        _a[:, :, 1] = _b
        assert False
-    except ValueError as e:
+    except ValueError:
        # print e
        assert True
@@ -788,7 +763,7 @@ def test_setitem_matrix_bad_ndim():
        # attempt to assign the ndarray b with setitem
        _a[1, :, :] = b
        assert False
-    except ValueError as e:
+    except ValueError:
        # print e
        assert True
@@ -806,7 +781,7 @@ def test_setitem_matrix_bad_type():
        # attempt to assign the ndarray b with setitem
        _a[1, :, :] = b
        assert False
-    except TypeError as e:
+    except TypeError:
        # print e
        assert True
@@ -832,8 +807,8 @@ def test_setitem_assign_to_slice():
    # test direct transfert from numpy
    _d = _a[1, :, :]
-    _d[1, :] = b*10
+    _d[1, :] = b * 10
-    a[1, :, :][1, :] = b*10
+    a[1, :, :][1, :] = b * 10
    assert numpy.allclose(a, numpy.asarray(_a))
@@ -923,7 +898,7 @@ def test_setitem_rightvalue_ndarray_fails():
    b = theano._asarray([7, 8, 9, 10], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)
    b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
-    _b5 = cuda_ndarray.CudaNdarray(b)
+    cuda_ndarray.CudaNdarray(b)
    # attempt to assign the ndarray b with setitem
    _a[:, :, 1] = _b
@@ -941,9 +916,9 @@ def test_setitem_rightvalue_ndarray_fails():
    # without same number of dim
    try:
        _a[0, :, :] = mat
-        #a[0, :, :] = mat
+        # a[0, :, :] = mat
-        #assert numpy.allclose(numpy.asarray(_a), a)
+        # assert numpy.allclose(numpy.asarray(_a), a)
-    except ValueError as e:
+    except ValueError:
        pass
    # test direct transfert from numpy with broadcast
@@ -964,7 +939,7 @@ def test_zeros_basic():
        _n = numpy.zeros(shp, dtype="float32")
        assert numpy.allclose(numpy.asarray(_a), _n)
        assert _a.shape == _n.shape
-        assert all(_a._strides == numpy.asarray(_n.strides)/4)
+        assert all(_a._strides == numpy.asarray(_n.strides) / 4)
    # TODO:The following don't have the same stride!
    #      This should be fixed with the new GpuNdArray.
@@ -1039,10 +1014,7 @@ def test_is_c_contiguous():
    assert not a[::2].is_c_contiguous()
 if __name__ == '__main__':
-    test_zeros_basic_3d_tensor()
-    test_zeros_basic_vector()
    test_setitem_matrixvector1()
    test_setitem_matrix_tensor3()
-    test_setitem_broadcast_must_fail()
    test_setitem_assign_to_slice()
    test_setitem_rightvalue_ndarray_fails()
--- a/theano/sandbox/cuda/tests/test_driver.py
+++ b/theano/sandbox/cuda/tests/test_driver.py
@@ -6,7 +6,7 @@ import theano
 try:
    from nose.plugins.skip import SkipTest
    import theano.sandbox.cuda as cuda_ndarray
-    if cuda_ndarray.cuda_available == False:
+    if cuda_ndarray.cuda_available is False:
        raise SkipTest('Optional package cuda disabled')
 except ImportError:
    # To have the GPU back-end work without nose, we need this file to
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
-        msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
+        msg = '\n\t'.join(
-            'but got:']+[str(app) for app in topo])
+            ['Expected exactly one occurrence of GpuCAReduce ' +
+             'but got:'] + [str(app) for app in topo])
        raise AssertionError(msg)
    if not numpy.allclose(f(), a.sum()):
        raise Exception("The nvidia driver version installed with this OS "

--- a/theano/sandbox/cuda/tests/test_extra_ops.py
+++ b/theano/sandbox/cuda/tests/test_extra_ops.py
@@ -5,24 +5,22 @@ import itertools
 from nose.plugins.skip import SkipTest
 import numpy as np
 from six.moves import xrange
+from theano import tensor as T
+import theano
+from theano.tensor.extra_ops import cumsum, CumsumOp
+from theano.tests import unittest_tools as utt
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available is False:
+if cuda_ndarray.cuda_available:
+    import theano.tensor.tests.test_extra_ops
+    from theano.sandbox.cuda.extra_ops import GpuCumsum
+else:
    raise SkipTest('Optional package cuda disabled')
-import theano.tensor.tests.test_extra_ops
-from theano.sandbox.cuda.extra_ops import GpuCumsum
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
-from theano import tensor as T
-import theano
-from theano.tensor.extra_ops import cumsum, CumsumOp
-from theano.tests import unittest_tools as utt
 class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
    mode = mode_with_gpu
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
        # Use multiple GPU threadblocks
-        a = np.random.random((block_max_size+2,)).astype("float32")
+        a = np.random.random((block_max_size + 2,)).astype("float32")
        utt.assert_allclose(np.cumsum(a), f(a))
        # Use recursive cumsum
-        a = np.ones((block_max_size*(block_max_size+1)+2,),
+        a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                    dtype="float32")
        utt.assert_allclose(np.cumsum(a), f(a))
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks
            a_shape = [5, 5]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks
            a_shape = [4, 4]
-            a_shape[1-shape_axis] = self.max_grid_size1+1
+            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
            # Use recursive cumsum
            a_shape = [3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (
+                block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum3D(self):
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use recursive cumsum (along accumulation axis)
            a_shape = [3, 3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (
+                block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype(
+                "float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum4D(self):

--- a/theano/sandbox/cuda/tests/test_gemmcorr3d.py
+++ b/theano/sandbox/cuda/tests/test_gemmcorr3d.py
 from __future__ import absolute_import, print_function, division
 import unittest
 import numpy
-import copy
 import theano
 from theano.tests import unittest_tools as utt
 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-import theano.sandbox.cuda as cuda_ndarray
-if not cuda_ndarray.cuda_available:
-    raise SkipTest('Optional package cuda not available')
 from theano.sandbox.cuda import float32_shared_constructor as shared
 from theano.sandbox.cuda.blas import (
    GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
 from theano.sandbox.cuda.basic_ops import gpu_contiguous
+import theano.sandbox.cuda as cuda_ndarray
+if not cuda_ndarray.cuda_available:
+    raise SkipTest('Optional package cuda not available')
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
-        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample,
+        conv = theano.tensor.nnet.convTransp3D(W=filters,
+                                               b=bias,
+                                               d=subsample,
                                               H=inputs)
        f_ref = theano.function([], conv)
        res_ref = f_ref()

--- a/theano/sandbox/cuda/tests/test_gradient.py
+++ b/theano/sandbox/cuda/tests/test_gradient.py
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')

--- a/theano/sandbox/cuda/tests/test_memory.py
+++ b/theano/sandbox/cuda/tests/test_memory.py
@@ -11,7 +11,7 @@ from theano import ifelse
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-if cuda.cuda_available == False:
+if cuda.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
        theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
        return ("(n malloc/theano mem allocated in KB)",
                n_mallocs + extra_alloc,
-                int(theano_alloc / 1024) + extra_size)
+                int(theano_alloc / 1024))
    return ("n malloc on the gpu", n_mallocs + extra_alloc)
    # I don't use the following by default as if there is other stuff running
@@ -83,9 +83,12 @@ def test_memory():
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
-        print("Shared took ", np.prod(variables.get_value(
+        print("Shared took ",
+              np.prod(variables.get_value(
                  borrow=True,
-                return_internal_type=True).shape) * 4 / 1024, "kB")
+                  return_internal_type=True).shape) *
+              4 / 1024,
+              "kB")
        mem2 = freemem()
        print("Before compilation", mem2)
@@ -112,7 +115,7 @@ def test_memory():
        del obj
        # print "After deleting function 1", freemem()
-        #assert mem2 == freemem(), (mem2, freemem())
+        # assert mem2 == freemem(), (mem2, freemem())
        del grad
        print("After deleting function 2", freemem())
@@ -155,16 +158,19 @@ def test_memory_lazy():
        derp = ifelse.IfElse(1)(branch_select,
                                derp, some_matrix[:shapes[0]].sum())
        derp += 1
-        print("Shared took ", np.prod(variables.get_value(
+        print("Shared took ",
+              np.prod(variables.get_value(
                  borrow=True,
-                return_internal_type=True).shape) * 4 / 1024, "kB")
+                  return_internal_type=True).shape) *
+              4 / 1024,
+              "kB")
        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        obj = theano.function([some_vector, branch_select], derp,
                              mode=mode_with_gpu)
-        #theano.printing.debugprint(obj, print_type=True)
+        # theano.printing.debugprint(obj, print_type=True)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3)

--- a/theano/sandbox/cuda/tests/test_mlp.py
+++ b/theano/sandbox/cuda/tests/test_mlp.py
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
                   'otherwise it is too slow!')
 # Skip test if cuda_ndarray is not available.
-if tcn.cuda_available == False:
+if tcn.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -147,19 +147,20 @@ def test_run_nnet():
            rtol = 1e-4
            if n_in * n_hid >= 2048 * 4096:
                rtol = 7e-4
-            assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
+            assert numpy.allclose(
+                rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
                ("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
                 rel_diff.max(), n_in, n_hid)
 def test_run_nnet_med():
    utt.seed_rng()
-    rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000)
+    run_nnet(False, 10, 128, 50, 4, n_train=10000)
 def test_run_nnet_small():
    utt.seed_rng()
-    rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000)
+    run_nnet(False, 10, 10, 4, 4, n_train=100000)
 def run_conv_nnet1(use_gpu):
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
    mode = get_mode(use_gpu)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
 #    for i, n in enumerate(train.maker.fgraph.toposort()):
 #        print i, n
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu):  # pretend we are training LeNet for MNIST
    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
    conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
-         logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
+                            logical_hid_shape[1] // 2),
+                           shape_kern1[2:],
+                           n_kern1, n_batch, 1, 1)
    hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
    hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu):  # pretend we are training LeNet for MNIST
    mode = get_mode(use_gpu)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
 #    for i, n in enumerate(train.maker.fgraph.toposort()):
 #        print i, n
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
    if downsample_ops:
        hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
    else:
-        hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')
+        hid = tensor.tanh(
-            ))[:, :, ::2, ::2])
+            (conv_op(x, w0) + b0.dimshuffle(
+                (0, 'x', 'x')))[:, :, ::2, ::2])
    hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
    hid_flat = hid1.reshape((n_batch, n_hid))
    out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
-    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out,
+    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
-         tensor.argmax(y, axis=1)) * lr)
+        out, tensor.argmax(y, axis=1)) * lr)
    # print 'loss type', loss.type
    params = [w0, b0, w1, b1, v, c]
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
    mode = get_mode(use_gpu, check_isfinite)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
    if verbose:
        theano.printing.debugprint(train)
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
    lr = theano._asarray(0.01, dtype='float32')
    rvals = my_zeros(n_train)
-    t0 = time.time()
    for i in xrange(n_train):
        rvals[i] = train(xval, yval, lr)[0]
-    t1 = time.time()
    print_mode(mode)
    if pickle and isinstance(mode, theano.compile.ProfileMode):
@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
            compare = True
        if not compare:
-            return run_conv_nnet2_classif(use_gpu=use_gpu,
+            return run_conv_nnet2_classif(
+                use_gpu=use_gpu,
                seed=seed, isize=isize, ksize=ksize, bsize=bsize,
                n_train=n_train,
                check_isfinite=check_isfinite,
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
    finally:
        theano.tensor.basic.float32_atol = orig_float32_atol
-    if pickle:
-        if isinstance(cpu_mode, theano.compile.ProfileMode):
-            import pickle
-            print("BEGIN CPU profile mode dump")
-            print(pickle.dumps(cpu_mode))
-            print("END CPU profile mode dump")
-        if isinstance(gpu_mode, theano.compile.ProfileMode):
-            import pickle
-            print("BEGIN GPU profile mode dump")
-            print(pickle.dumps(gpu_mode))
-            print("END GPU profile mode dump")
    # print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
    #        (time_cpu, time_gpu, time_cpu/time_gpu))
    # print "Estimated time for one pass through MNIST with CPU: %f" % (

--- a/theano/sandbox/cuda/tests/test_neighbours.py
+++ b/theano/sandbox/cuda/tests/test_neighbours.py
 # Skip test if cuda_ndarray is not available.
 from __future__ import absolute_import, print_function, division
 from nose.plugins.skip import SkipTest
+import unittest
+import theano.tensor.nnet.tests.test_neighbours
+from theano.sandbox.cuda.neighbours import GpuImages2Neibs
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
-import theano.tensor.nnet.tests.test_neighbours
-from theano.sandbox.cuda.neighbours import GpuImages2Neibs
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
 from __future__ import absolute_import, print_function, division
 import operator
 import sys
-import unittest
 import numpy
 # Skip test if cuda_ndarray is not available.
@@ -9,39 +8,28 @@ from nose.plugins.skip import SkipTest
 from nose.tools import assert_raises
 import theano
+import theano.sandbox.cuda.cula as cula
+from theano.sandbox.cuda import basic_ops
+from theano.sandbox.cuda.type import CudaNdarrayType
+from theano.scalar.basic_scipy import erfinv
 from six.moves import reduce
 from theano.compile.pfunc import pfunc
 from theano import config, tensor
 import theano.tensor.tests.test_nlinalg
 import theano.tensor.tests.test_opt as test_opt
+from theano.tensor.nnet.blocksparse import sparse_block_dot
+from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv
+from theano.sandbox.cuda.blocksparse import GpuSparseBlockOuter
 from theano.tests.breakpoint import PdbBreakpoint
 from theano.tests import unittest_tools as utt
+import theano.tests.test_ifelse
 import theano.sandbox.cuda as cuda
 if not cuda.cuda_available:
    raise SkipTest('Optional package cuda disabled')
-import theano.sandbox.cuda.cula as cula
-from theano.sandbox.cuda import basic_ops
-from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.scalar.basic_scipy import erfinv
-from theano.tensor.nnet.blocksparse import sparse_block_dot
-from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
-imported_scipy_special = False
-try:
-    import scipy.special
-    imported_scipy_special = True
-# Importing scipy.special may raise ValueError.
-# See http://projects.scipy.org/scipy/ticket/1739
-except (ImportError, ValueError):
-    pass
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
@@ -152,7 +140,7 @@ def test_local_assert_no_cpu_op():
 def test_int_pow():
    a = CudaNdarrayType([False])()
-    f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
+    f = theano.function([a], (a * 4).sum(), mode=mode_with_gpu)
    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
    assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
@@ -175,23 +163,30 @@ def test_gpualloc():
    x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
    m = (x).dimshuffle(['x', 0])
    v = tensor.alloc(1., *m.shape)
-    f = theano.function([], v + x,
+    f = theano.function([],
-                        mode=mode_with_gpu.excluding("local_elemwise_alloc"))
+                        v + x,
+                        mode=mode_with_gpu.excluding(
+                            "local_elemwise_alloc"))
    l = f.maker.fgraph.toposort()
-    assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
+    assert numpy.any([isinstance(y.op, cuda.GpuAlloc) for y in l])
 def test_gpuallocempty():
-    f_gpu = theano.function([], tensor.AllocEmpty('float32')(2,3),
+    f_gpu = theano.function(
+        [],
+        tensor.AllocEmpty('float32')(2, 3),
        mode=mode_with_gpu)
    l_gpu = f_gpu.maker.fgraph.toposort()
-    assert numpy.any([isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_gpu])
+    assert numpy.any(
+        [isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_gpu])
-    f_cpu = theano.function([], tensor.AllocEmpty('int32')(2,3))
+    f_cpu = theano.function([], tensor.AllocEmpty('int32')(2, 3))
    l_cpu = f_cpu.maker.fgraph.toposort()
-    assert not numpy.any([isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_cpu])
+    assert not numpy.any(
+        [isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_cpu])
 class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
    dtype = 'float32'
@@ -269,7 +264,8 @@ def test_gpuspecifyshape():
    f = theano.function([], updates=[(x, m * numpy.float32(2))],
                        mode=mode_with_gpu)
    l = f.maker.fgraph.toposort()
-    assert not numpy.any([isinstance(x.op, cuda.HostFromGpu) for x in l])
+    assert not numpy.any(
+        [isinstance(y.op, cuda.HostFromGpu) for y in l])
 def test_softmax():
@@ -430,7 +426,7 @@ def test_local_gpu_subtensor():
    # Test multiple use of the input
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
+    f = theano.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any([type(node.op) is tensor.Subtensor for node in topo])
    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
@@ -438,7 +434,7 @@ def test_local_gpu_subtensor():
    # Test multiple use of the input + input as output
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
+    f = theano.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any([type(node.op) is tensor.Subtensor for node in topo])
    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
@@ -446,7 +442,7 @@ def test_local_gpu_subtensor():
    # Test shared forced on CPU end we do computation on the output of
    # the subtensor.
    t = tensor._shared(numpy.zeros(20, "float32"))
-    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
+    f = theano.function([], t[3:4] + 1, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any([type(node.op) is tensor.Subtensor for node in topo])
    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
@@ -507,10 +503,11 @@ def test_local_gpu_split():
 def test_print_op():
    """ Test that print ops don't block gpu optimization"""
    b = tensor.fmatrix()
-    f = theano.function([b], theano.printing.Print()(b)*2, mode=mode_with_gpu)
+    f = theano.function(
+        [b], theano.printing.Print()(b) * 2, mode=mode_with_gpu)
    # theano.printing.debugprint(f)
    # print f.maker.fgraph.toposort()
-#[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]
+    # [GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]
    topo = f.maker.fgraph.toposort()
    assert topo[0].op == cuda.gpu_from_host
    assert isinstance(topo[1].op, theano.printing.Print)
@@ -563,8 +560,10 @@ def test_huge_elemwise_fusion():
        bytes limits.
    """
    shape = (2, 3, 4, 5, 6)
-    ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
+    ttype = tensor.tensor(dtype='float32',
-    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()['gpu_ptr_size']
+                          broadcastable=(False,) * len(shape))
+    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()[
+        'gpu_ptr_size']
    if gpu_ptr_size == 8:
        nb_in = 7
        len_topo = 10
@@ -582,14 +581,19 @@ def test_huge_elemwise_fusion():
    assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
    assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
    # let debugmode catch errors
-    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
+    # gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
+    def gen():
+        return(
+            theano._asarray(numpy.random.rand(*shape), dtype='float32'))
    f(*[gen() for i in range(nb_in)])
    # Test the case where we can't put the computation on the gpu! their is too
    # many dimensions to the input to have 2 inputs to the op!
    shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,)
-    ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
+    ttype = tensor.tensor(
+        dtype='float32', broadcastable=(False,) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(vars, [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] -
                     vars[5] - vars[6]], mode=mode_with_gpu)
@@ -598,7 +602,9 @@ def test_huge_elemwise_fusion():
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
    # let debugmode catch errors
-    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
+    def gen():
+        return(theano._asarray(numpy.random.rand(*shape), dtype='float32'))
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())
    def gen(shape):
@@ -611,9 +617,9 @@ def test_huge_elemwise_fusion():
                  (2, 2, 2, 2),
                  (2, 2, 2, 2, 2),  # 5d
                  (2, 2, 2, 2, 2, 2),
-#                  (2, 2, 2, 2, 2, 2, 2),
+                  # (2, 2, 2, 2, 2, 2, 2),
-#                  (2, 2, 2, 2, 2, 2, 2, 2),
+                  # (2, 2, 2, 2, 2, 2, 2, 2),
-#                  (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
+                  # (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
                  ]:
        vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
        for use_tan in [True, False]:
@@ -676,7 +682,9 @@ def test_local_gpu_elemwise_0():
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite([a_s, b_s, c_s],
-                                    [identity(a_s), identity(c_s), identity(b_s)])
+                                    [identity(a_s),
+                                     identity(c_s),
+                                     identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
@@ -725,9 +733,6 @@ def test_elemwise_fusion():
      theano._asarray(numpy.random.rand(*shape), dtype='float32'))
-import theano.tests.test_ifelse
 class TestIfElse(theano.tests.test_ifelse.test_ifelse):
    dtype = "float32"
    mode = mode_with_gpu
@@ -765,15 +770,17 @@ def test_incsubtensor_mixed():
 def test_erfinvgpu():
    """ Test that local_gpu_elemwise_0 replaces Erfinv with ErfinvGPU """
    x = tensor.fmatrix()
-    f = theano.function([x], tensor.Elemwise(erfinv)(x), mode=mode_with_gpu)
+    f = theano.function([x],
-    f2 = theano.function([x], tensor.Elemwise(erfinv)(x),
+                        tensor.Elemwise(erfinv)(x),
+                        mode=mode_with_gpu)
+    theano.function([x],
+                    tensor.Elemwise(erfinv)(x),
                    mode=mode_without_gpu)
-    assert isinstance(f.maker.fgraph.toposort()[1].op, cuda.GpuElemwise)
+    assert isinstance(f.maker.fgraph.toposort()[1].op,
+                      cuda.GpuElemwise)
    assert isinstance(f.maker.fgraph.toposort()[1].op.scalar_op,
                      cuda.elemwise.ErfinvGPU)
-    xv = numpy.random.rand(7, 8).astype('float32')
+    numpy.random.rand(7, 8).astype('float32')
-    if imported_scipy_special:
-        assert numpy.allclose(f(xv), f2(xv))
 def test_local_gpu_solve():

--- a/theano/sandbox/cuda/tests/test_rng_curand.py
+++ b/theano/sandbox/cuda/tests/test_rng_curand.py
@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
 # The PyCObject that represents the cuda random stream object

--- a/theano/sandbox/cuda/tests/test_tensor_op.py
+++ b/theano/sandbox/cuda/tests/test_tensor_op.py
@@ -2,7 +2,6 @@
 This file test tensor op that should also operate on CudaNdaray.
 """
 from __future__ import absolute_import, print_function, division
-import copy
 from nose.plugins.skip import SkipTest
 import numpy
@@ -14,7 +13,7 @@ import theano.tensor as T
 # Skip test if cuda_ndarray is not available.
 import theano.sandbox.cuda as cuda
 from theano.tensor.nnet.tests import test_conv3d2d
-if cuda.cuda_available == False:
+if cuda.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -57,7 +56,7 @@ def test_softmax_optimizations():
    one_of_n = tensor.lvector('one_of_n')
    op = crossentropy_categorical_1hot
-    xe = op(x, one_of_n)
+    op(x, one_of_n)
    fgraph = theano.gof.FunctionGraph(
        [x, one_of_n],
@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
    # can't test the transpose as ta._strides = is not implemented
    # manual transpose of a
-    #ta = a.reshape((4,3))
+    # ta = a.reshape((4,3))
    # ta._strides = (ta._strides[1],ta._strides[0])#not implemented
-    #elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
+    # elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
-    #ta.gpudata += ta.size*elem_size
+    # ta.gpudata += ta.size*elem_size
    for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
                        (a, na, False), (b, nb, False),
@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
                        (a, va, True), (b, vb, True),
                        (va, b, False), (a, vb, False),
                        (a, ra, True), (b, rb, True),
-                        (ra, b, False), (a, rb, False),
+                        (ra, b, False), (a, rb, False), ]:
-                      ]:
        assert may_share_memory(a_, b_) == rep
        assert may_share_memory(b_, a_) == rep

--- a/theano/sandbox/cuda/tests/test_var.py
+++ b/theano/sandbox/cuda/tests/test_var.py
@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
 from theano.sandbox.cuda import CudaNdarrayType, cuda_available
 import theano.sandbox.cuda as cuda
 # Skip test if cuda_ndarray is not available.
-if cuda_available == False:
+if cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
    # test that broadcastable arg is accepted, and that they
    # don't strictly have to be tuples
-    assert eq(
+    assert eq(f32sc(npy_row,
-            f32sc(npy_row, broadcastable=(True, False)).type,
+                    broadcastable=(True, False)).type,
              CudaNdarrayType((True, False)))
-    assert eq(
+    assert eq(f32sc(npy_row,
-            f32sc(npy_row, broadcastable=[True, False]).type,
+                    broadcastable=[True, False]).type,
              CudaNdarrayType((True, False)))
-    assert eq(
+    assert eq(f32sc(npy_row,
-            f32sc(npy_row, broadcastable=numpy.array([True, False])).type,
+                    broadcastable=numpy.array([True, False])).type,
              CudaNdarrayType([True, False]))
    # test that we can make non-matrix shared vars
-    assert eq(
+    assert eq(f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
-            f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
              CudaNdarrayType((False,) * 4))
@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
        x = tensor.fmatrix('x')
        output_updates = [(output_var, x ** 2)]
        output_givens = {x: data}
-        output_func = theano.function(inputs=[], outputs=[],
+        output_func = theano.function(
+            inputs=[], outputs=[],
            updates=output_updates, givens=output_givens)
        output_func()

--- a/theano/sandbox/cuda/tests/test_viewop.py
+++ b/theano/sandbox/cuda/tests/test_viewop.py
 from __future__ import absolute_import, print_function, division
 import numpy
-import unittest
 from nose.plugins.skip import SkipTest
 import theano
@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
 def test_viewop_gpu():
    from theano.sandbox import cuda
-    if cuda.cuda_available == False:
+    if cuda.cuda_available is False:
        raise SkipTest('Optional package cuda disabled')
    _x = theano.tensor.fvector('x')
    x = cuda.gpu_from_host(_x)

--- a/theano/sandbox/cuda/tests/walltime.py
+++ b/theano/sandbox/cuda/tests/walltime.py
 from __future__ import absolute_import, print_function, division
 from __future__ import print_function
-import sys, time
+import sys
+import time
 from six import iteritems
 from theano.compile.pfunc import pfunc
 from theano import tensor
@@ -35,35 +36,47 @@ def showtimes(times):
 def cmp_sigmoids(shape):
    def numpy_sigmoid(input):
-        rval = 1.0 / (1.0 + numpy.exp(-input))
+        1.0 / (1.0 + numpy.exp(-input))
-    sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))()
+    sinput = tensor.Tensor(
-    shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input')
+        dtype='float32', broadcastable=(0,) * len(shape))()
-    times = compare_fns(
+    shared_input = tcn.shared_constructor(
-            dict( numpy=numpy_sigmoid
+        numpy.random.rand(*shape),
-                , theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput)))
+        'shared_input')
-                , theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + tensor.exp(-shared_input)))])
+    times = compare_fns(dict(
-                ),
+        numpy=numpy_sigmoid,
+        theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))),
+        theano_gpu_onboard=pfunc(
+            [sinput],
+            [],
+            updates=[(
+                shared_input,
+                1.0 / (1.0 + tensor.exp(-shared_input)))])),
        input=shared_input.value)
    showtimes(times)
 def cmp_sigmoids_T(shape):
    def numpy_sigmoid(input):
-        rval = 1.0 / (1.0 + numpy.exp(-input.T))
+        1.0 / (1.0 + numpy.exp(-input.T))
-    sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))()
+    sinput = tensor.Tensor(
-    shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input')
+        dtype='float32', broadcastable=(0,) * len(shape))()
-    times = compare_fns(
+    shared_input = tcn.shared_constructor(
-            dict( numpy=numpy_sigmoid
+        numpy.random.rand(*shape),
-                , theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T)))
+        'shared_input')
-                , theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 +
+    times = compare_fns(dict(
-                    tensor.exp(-shared_input.T)))])
+        numpy=numpy_sigmoid,
-                ),
+        theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T))),
+        theano_gpu_onboard=pfunc(
+            [sinput],
+            [],
+            updates=[(
+                shared_input,
+                1.0 / (1.0 + tensor.exp(-shared_input.T)))])),
        input=shared_input.value)
    showtimes(times)
 if __name__ == '__main__':
    eval(sys.argv[1])
    # cmp_sigmoids((640, 64*64)) # looks great in profiler
-    #cmp_sigmoids((173, 74*49))
+    # cmp_sigmoids((173, 74*49))
-    #cmp_sigmoids_T((173, 74*49))
+    # cmp_sigmoids_T((173, 74*49))
--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
                    'complex64': (complex, 'theano_complex64',
                                  'NPY_COMPLEX64')}[self.dtype]
        except KeyError:
-            raise TypeError("Unsupported dtype for %s: %s" % (
+            raise TypeError("Unsupported dtype for %s: %s" %
-                    self.__class__.__name__, self.dtype))
+                            (self.__class__.__name__, self.dtype))
    def __eq__(self, other):
        """
@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
                other.broadcastable == self.broadcastable)
    def convert_variable(self, var):
-        if (type(self) == type(var.type) and
+        if (isinstance(self, type(var.type)) and
                self.ndim == var.type.ndim and
-            all(sb == ob or ob for sb, ob in zip(self.broadcastable,
+                all(sb == ob or ob for sb, ob in zip(
+                    self.broadcastable,
                    var.type.broadcastable))):
            return theano.tensor.patternbroadcast(var, self.broadcastable)
@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
            return self.name
        else:
            b = self.broadcastable
-            #bcast = str(self.broadcastable)
+            # bcast = str(self.broadcastable)
            if not numpy.any(b):
                s = "%iD" % len(b)
            else:
@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
    def __repr__(self):
        return str(self)
-        #"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
+        # "CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
    def c_declare(self, name, sub, check_input=True):
        return """ CudaNdarray * %(name)s;""" % locals()
@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
    CudaNdarray_HOST_DIMS(%(oname)s)[i]) {
    alloc = true;
    break;
-           }
+    }}
-        }
        if(alloc) {
            Py_XDECREF(%(oname)s);
            %(oname)s = (CudaNdarray*)CudaNdarray_Copy(%(iname)s);
@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
                %(fail)s;
            }
        }
-        """,
+        """, version=3)
-        version=3)
 # THIS WORKS But CudaNdarray instances don't compare equal to one
@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
 # In case cuda is not imported.
 if cuda is not None:
-    copyreg.pickle(cuda.CudaNdarray, CudaNdarray_pickler,
+    copyreg.pickle(
-                    CudaNdarray_unpickler)
+        cuda.CudaNdarray, CudaNdarray_pickler, CudaNdarray_unpickler)
--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -13,7 +13,7 @@ try:
    # We must do those import to be able to create the full doc when nvcc
    # is not available
    from theano.sandbox.cuda import filter as type_support_filter
-    from theano.sandbox.cuda.basic_ops import HostFromGpu, GpuFromHost
+    from theano.sandbox.cuda.basic_ops import HostFromGpu
 except ImportError:
    pass
@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
    def _as_TensorVariable(self):
        return HostFromGpu()(self)
    def _as_CudaNdarrayVariable(self):
        return self
@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
 class CudaNdarrayConstant(_operators, Constant):
    def signature(self):
        return CudaNdarrayConstantSignature((self.type, numpy.asarray(self.data)))
    def __str__(self):
        if self.name is not None:
            return self.name
@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
            data = str(numpy.asarray(self.data))
        except Exception as e:
            data = "error while transferring the value: " + str(e)
-        return "CudaNdarrayConstant{"+data+"}"
+        return "CudaNdarrayConstant{" + data + "}"
 CudaNdarrayType.Constant = CudaNdarrayConstant

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -87,42 +87,8 @@ whitelist_flake8 = [
    "sandbox/tests/test_theano_object.py",
    "sandbox/tests/test_scan.py",
    "sandbox/tests/__init__.py",
-    "sandbox/cuda/var.py",
-    "sandbox/cuda/GpuConvGrad3D.py",
-    "sandbox/cuda/basic_ops.py",
-    "sandbox/cuda/nnet.py",
-    "sandbox/cuda/elemwise.py",
-    "sandbox/cuda/type.py",
    "sandbox/cuda/__init__.py",
-    "sandbox/cuda/opt.py",
-    "sandbox/cuda/blas.py",
-    "sandbox/cuda/blocksparse.py",
-    "sandbox/cuda/rng_curand.py",
-    "sandbox/cuda/fftconv.py",
-    "sandbox/cuda/kernel_codegen.py",
-    "sandbox/cuda/GpuConvTransp3D.py",
-    "sandbox/cuda/nvcc_compiler.py",
-    "sandbox/cuda/neighbours.py",
    "sandbox/cuda/tests/__init__.py",
-    "sandbox/cuda/tests/walltime.py",
-    "sandbox/cuda/tests/test_gradient.py",
-    "sandbox/cuda/tests/test_neighbours.py",
-    "sandbox/cuda/tests/test_conv_cuda_ndarray.py",
-    "sandbox/cuda/tests/test_var.py",
-    "sandbox/cuda/tests/test_opt.py",
-    "sandbox/cuda/tests/test_blas.py",
-    "sandbox/cuda/tests/test_driver.py",
-    "sandbox/cuda/tests/test_rng_curand.py",
-    "sandbox/cuda/tests/test_basic_ops.py",
-    "sandbox/cuda/tests/test_memory.py",
-    "sandbox/cuda/tests/test_mlp.py",
-    "sandbox/cuda/tests/test_bench_loopfusion.py",
-    "sandbox/cuda/tests/test_blocksparse.py",
-    "sandbox/cuda/tests/test_cuda_ndarray.py",
-    "sandbox/cuda/tests/test_tensor_op.py",
-    "sandbox/cuda/tests/test_extra_ops.py",
-    "sandbox/cuda/tests/test_gemmcorr3d.py",
-    "sandbox/cuda/tests/test_viewop.py",
    "sandbox/gpuarray/tests/__init__.py",
    "sandbox/scan_module/scan_utils.py",
    "sandbox/scan_module/scan.py",