Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py

Merge pull request #4244 from ChihebTrabelsi/ccw2.0
b69ad54d · Xavier Bouthillier · 200babca · 58267dc2 · b69ad54d · b69ad54d
--- a/theano/sandbox/cuda/GpuConvGrad3D.py
+++ b/theano/sandbox/cuda/GpuConvGrad3D.py
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = as_cuda_ndarray_variable(dCdH)
-        broad = (False,)*5
+        broad = (False,) * 5
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
        # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
-        outputFilters = dCdH.shape[1]
        outputHeight = dCdH.shape[2]
        outputWidth = dCdH.shape[3]
        outputDur = dCdH.shape[4]
        assert V.shape[0] == batchSize
-        inputFilters = V.shape[1]
-        inputHeight = V.shape[2]
-        inputWidth = V.shape[3]
-        inputDur = V.shape[4]
        dr, dc, dt = d
        dCdW = numpy.zeros(WShape, dtype=V.dtype)
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
                                for p in xrange(0, outputHeight):
                                    for q in xrange(0, outputWidth):
                                        for r in xrange(0, outputDur):
-                                            dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m]
+                                            dCdW[j, z, k, l, m] += dCdH[
+                                                i, j, p, q, r] * \
+                                                V[i, z, dr * p + k,
+                                                  dc * q + l,
+                                                  dt * r + m]
        output_storage[0][0] = dCdW

--- a/theano/sandbox/cuda/GpuConvTransp3D.py
+++ b/theano/sandbox/cuda/GpuConvTransp3D.py
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
        else:
            RShape_ = T.as_tensor_variable([-1, -1, -1])
-        return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
+        return theano.Apply(
-                            outputs=[CudaNdarrayType(dtype=H_.dtype,
+            self, inputs=[W_, b_, d_, H_, RShape_],
-                                                     broadcastable=(False,)*5)()])
+            outputs=[CudaNdarrayType(
+                dtype=H_.dtype, broadcastable=(False,) * 5)()])
    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
        assert dc > 0
        assert dt > 0
-        videoHeight = (outputHeight-1) * dr + filterHeight
+        videoHeight = (outputHeight - 1) * dr + filterHeight
-        videoWidth = (outputWidth-1) * dc + filterWidth
+        videoWidth = (outputWidth - 1) * dc + filterWidth
-        videoDur = (outputDur-1) * dt + filterDur
+        videoDur = (outputDur - 1) * dt + filterDur
        if Rshape is not None and Rshape[0] != -1:
            if Rshape[0] < videoHeight:
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
        # else:
        #    print "No Rshape passed in"
-        # print "video size: "+str((videoHeight, videoWidth, videoDur))
+        # print "video size: " + str((videoHeight, videoWidth, videoDur))
-        R =  numpy.zeros( (batchSize, inputChannels, videoHeight,
+        R = numpy.zeros((batchSize, inputChannels, videoHeight,
-            videoWidth, videoDur ) , dtype=H.dtype)
+                         videoWidth, videoDur),
+                        dtype=H.dtype)
-        # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
+        # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
+        # sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
        for i in xrange(0, batchSize):
            # print '\texample '+str(i+1)+'/'+str(batchSize)
            for j in xrange(0, inputChannels):
-                # print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
+                # print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
                for r in xrange(0, videoHeight):
-                    # print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
+                    # print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
                    for c in xrange(0, videoWidth):
                        for t in xrange(0, videoDur):
                            R[i, j, r, c, t] = b[j]
-                            ftc = max([0, int(numpy.ceil(float(t-filterDur + 1  )/float(dt))) ])
+                            ftc = max(
-                            fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ])
+                                [0,
+                                 int(numpy.ceil(
-                            rc =  max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ])
+                                     float(t - filterDur + 1) / float(dt)
+                                     ))
+                                 ]
+                            )
+                            fcc = max(
+                                [0,
+                                 int(numpy.ceil(
+                                     float(c - filterWidth + 1) / float(dc)
+                                     ))
+                                 ]
+                            )
+                            rc = max(
+                                [0,
+                                 int(numpy.ceil(
+                                     float(r - filterHeight + 1) / float(dr)
+                                     ))
+                                 ]
+                            )
                            while rc < outputHeight:
                                rk = r - rc * dr
                                if rk < 0:
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
                                        if tk < 0:
                                            break
-                                        R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] )
+                                        R[i, j, r, c, t] += numpy.dot(
+                                            W[:, j, rk, ck, tk],
+                                            H[i, :, rc, cc, tc])
                                        tc += 1
                                    ""  # close loop over tc

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
 from __future__ import absolute_import, print_function, division
-import copy
 import os
 import logging
-_logger = logging.getLogger(__name__)
 from six import integer_types
 from six.moves import StringIO, reduce
 import theano
 from theano import Apply
 from theano import tensor
@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           gpu_contiguous)
 from theano.tensor import as_tensor_variable
+_logger = logging.getLogger(__name__)
 class GpuBatchedDot(GpuOp):
@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
            }
        } else {
            // copy inputs if not contiguous
-            """ +
+            """ + ("\n".join("""
-            ("\n".join("""
             if ((   CudaNdarray_HOST_DIMS(%(var)s)[0] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[0] != 1
                  && CudaNdarray_HOST_DIMS(%(var)s)[1] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[1] != 1
                  && CudaNdarray_HOST_DIMS(%(var)s)[2] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[2] != 1)
@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
                 Py_XDECREF(%(var)s);
                 %(var)s = _copy;
             }
-             """ % dict(var=var, fail=fail) for var in (bx, by)))
+             """ % dict(var=var, fail=fail) for var in (bx, by))) + """
-            + """
            // fail if the output is not contiguous; we can't copy it because we
            // need to write to the original memory
@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
            return 'GpuGemm{no_inplace}'
    def __eq__(self, other):
-        return (type(self) == type(other)\
+        return (type(self) == type(other) and
-                and self.inplace == other.inplace)
+                self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
        return (4,)
    def c_code(self, node, name, inputs, outputs, sub):
-        #z_out = alpha * dot(x,y) + beta * z_in
+        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version, set z_out = z_in
        # not inplace version, we copy z_in to z_out.
        z_in, a, x, y, b = inputs
@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
            return 'GpuGemv{no_inplace}'
    def __eq__(self, other):
-        return (type(self) == type(other)\
+        return (type(self) == type(other) and
-                and self.inplace == other.inplace)
+                self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
        return (3,)
    def c_code(self, node, name, inputs, outputs, sub):
-        #z_out = alpha * dot(x,y) + beta * z_in
+        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version, set z_out = z_in
        # not inplace version, we copy z_in to z_out.
        z_in, a, x, y, b = inputs
@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
            return 'GpuGer{no_inplace}'
    def __eq__(self, other):
-        return (type(self) == type(other)\
+        return (type(self) == type(other) and
-                and self.inplace == other.inplace)
+                self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
        return (2,)
    def c_code(self, node, name, inputs, outputs, sub):
-        #z_out = alpha * dot(x,y) + beta * z_in
+        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version, set z_out = z_in
        # not inplace version, we copy z_in to z_out.
        z_in, a, x, y = inputs
@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
-        d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)(
+        d_bottom = GpuCorrMM_gradInputs(
-                weights, top, bottom.shape[-2:])
+            self.border_mode, self.subsample)(weights,
-        d_top = GpuCorrMM(self.border_mode, self.subsample)(
+                                              top,
-                bottom, weights)
+                                              bottom.shape[-2:])
-        d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
+        d_top = GpuCorrMM(
+            self.border_mode, self.subsample)(bottom, weights)
+        d_height_width = (
+            theano.gradient.DisconnectedType()(),
+            ) * 2 if len(inp) == 4 else ()
        return (d_bottom, d_top) + d_height_width
    def connection_pattern(self, node):
@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
-        d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)(
+        d_weights = GpuCorrMM_gradWeights(
+            self.border_mode, self.subsample)(
                bottom, top, weights.shape[-2:])
-        d_top = GpuCorrMM(self.border_mode, self.subsample)(
+        d_top = GpuCorrMM(
-                bottom, weights)
+            self.border_mode, self.subsample)(bottom, weights)
-        d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
+        d_height_width = (
+            theano.gradient.DisconnectedType()(),
+            ) * 2 if len(inp) == 4 else ()
        return (d_weights, d_top) + d_height_width
    def connection_pattern(self, node):
@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
-        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(
+        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
-                weights, top, bottom.shape[-3:])
+                                          self.subsample,
-        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
+                                          self.pad)(weights,
-                bottom, top, weights.shape[-3:])
+                                                    top,
+                                                    bottom.shape[-3:])
+        d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
+                                            self.subsample,
+                                            self.pad)(bottom,
+                                                      top,
+                                                      weights.shape[-3:])
        return d_bottom, d_weights
@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
-        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
+        d_weights = GpuCorr3dMM_gradWeights(
+            self.border_mode, self.subsample, self.pad)(
                bottom, top, weights.shape[-3:])
-        d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
+        d_top = GpuCorr3dMM(
+            self.border_mode, self.subsample, self.pad)(
                bottom, weights)
-        d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
+        d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
+            * 3 if len(inp) == 5 else ()
        return (d_weights, d_top) + d_height_width_depth
    def connection_pattern(self, node):
@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
        return Apply(self, [x], [x.type()])
    # def perform(self, node, input_storage, output_storage):
-        #raise NotImplementedError('only C is implemented')
+        # raise NotImplementedError('only C is implemented')
    def c_code_cache_version(self):
        return (6)

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -5,9 +5,9 @@ import numpy as np
 import theano
 import theano.tensor as T
+from theano.misc.pycuda_init import pycuda_available
 from theano.sandbox.cuda import cuda_available, GpuOp
 from theano.ifelse import ifelse
-from theano.misc.pycuda_init import pycuda_available
 if cuda_available:
    from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
    # special way because we specify explicitly here
    # how many values are expected.
    if border_mode == 'valid':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
+                             (f1 - 1):(f1 - 1 + i1 - f1 + 1)]
    elif border_mode == 'full':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
+                             (f1 - 1):(f1 - 1 + i1 + f1 - 1)]
    else:
        raise ValueError('invalid mode')
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
                                   input_shape=input_fft_v_shape,
                                   filter_shape=filters_fft_v_shape)
-    #output_fft_s = input_fft_v
+    # output_fft_s = input_fft_v
    # reshape for IFFT
    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    # special way because we specify explicitly here
    # how many values are expected.
    if border_mode == 'valid':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
+                             (f1 - 1):(f1 - 1 + i1 - f1 + 1),
+                             (f2 - 1):(f2 - 1 + i2 - f2 + 1)]
    elif border_mode == 'full':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
+                             (f1 - 1):(f1 - 1 + i1 + f1 - 1),
+                             (f2 - 1):(f2 - 1 + i2 + f2 - 1)]
    else:
        raise ValueError('invalid mode')
-    #output = output_circ[:, :, :, :, :]
+    # output = output_circ[:, :, :, :, :]
    # Rescale manually. This is just a factor that comes in during the
    # trip through FFT and inverse FFT.

--- a/theano/sandbox/cuda/kernel_codegen.py
+++ b/theano/sandbox/cuda/kernel_codegen.py
@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
    We use __i as an int variable in a loop.
    """
-    return [
+    return [  # get max of buf (trashing all but buf[0])
-            # get max of buf (trashing all but buf[0])
        inline_reduce_max(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
-            'for(int __i=' + threadPos + '; __i<' + N +
+        'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
-                  '; __i+=' + threadCount + '){',
+        threadCount + '){',
        buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
-                buf2 + '[__i] = ' + buf + '[__i]',
+        buf2 + '[__i] = ' + buf + '[__i]', '}',
-            '}',
        '__syncthreads()',
        inline_reduce_sum(N, buf, threadPos, threadCount),
        '__syncthreads()',
@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
        # divide each exp() result by the sum to complete the job.
        'for(int __i=' + threadPos + '; __i<' + N +
        '; __i+=' + threadCount + '){',
-                buf + '[__i] = ' + buf2 + '[__i] / row_sum',
+        buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
-            '}',
        '__syncthreads()',
        ]
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
                                                 locals()))
-    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
+    loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
-                          "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))

--- a/theano/sandbox/cuda/neighbours.py
+++ b/theano/sandbox/cuda/neighbours.py
 from __future__ import absolute_import, print_function, division
 # This is work in progress
-from theano import Op, Apply, tensor
+from theano import Apply, tensor
 from theano.gof import local_optimizer
 from theano.sandbox.cuda import cuda_available, GpuOp

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
        """ % locals()
    def c_support_code_apply(self, node, nodename):
-        ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
+        ret1 = nvcc_kernel(
+            "kSoftmax_%s" % nodename,
            params=['int M', 'int N',
-                        'const float * x', 'const int sx0', 'const int sx1',
+                    'const float * x',
-                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                    'const int sx0',
-                body=[
+                    'const int sx1',
-                    "extern __shared__ float buf[]",
+                    'float * sm',
+                    'const int sm_s0',
+                    'const int sm_s1'],
+            body=["extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
-                        "buf2[tx] = buf[tx]",
+                  "buf2[tx] = buf[tx]", "}", "__syncthreads()",
-                      "}",
+                  inline_softmax('N',
-                      "__syncthreads()",
+                                 'buf',
-                      inline_softmax('N', 'buf', 'buf2',
+                                 'buf2',
-                                     'threadIdx.x', 'blockDim.x'),
+                                 'threadIdx.x',
+                                 'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  # This sets all values correctly
-                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
+                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
-                      "}",
+                  "__syncthreads()", "}", ])
-                      "__syncthreads()",
+        ret2 = nvcc_kernel(
-                    "}",
+            "kSoftmax_fixed_shared%s" % nodename,
-                ])
-        ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                body=[
+            body=["extern __shared__ float buf[]",
-                    "extern __shared__ float buf[]",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "const float *x_ptr = &x[blockIDX * sx0]",
                  "float *sm_ptr = &sm[blockIDX * sm_s0]",
                  inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                              'sm_ptr', 'sm_s1',
-                                                  'threadIdx.x', 'blockDim.x'),
+                                              'threadIdx.x',
-                      "__syncthreads()",
+                                              'blockDim.x'),
-                    "}",
+                  "__syncthreads()", "}", ])
-                    ])
        return ret1 + "\n" + ret2
 gpu_softmax = GpuSoftmax()
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
                    'const float * x', 'const int sx0', 'const int sx1',
                    'const float * b', 'const int sb0',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-            body=[
+            body=["extern __shared__ float buf[]",
-                    "extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                  "buf[tx] += b[tx * sb0]",
-                         "buf2[tx] = buf[tx]",
+                  "buf2[tx] = buf[tx]", "}",
-                      "}",
+                  "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
-                       "__syncthreads()",
+                                                    'threadIdx.x',
-                       inline_softmax('N', 'buf', 'buf2',
+                                                    'blockDim.x'),
-                                      'threadIdx.x', 'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                         "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
+                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
-                      "}",
+                  "__syncthreads()", "}", ])
-                      "__syncthreads()",
-                    "}",
-            ])
        ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                           params=['int M', 'int N',
                                   'const float * x',
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
                               "float *sm_ptr = &sm[blockIDX * sm_s0]",
                               inline_softmax_fixed_shared('N', 'buf',
                                                           'x_ptr', 'sx1',
-                                                           'sm_ptr', 'sm_s1',
+                                                           'sm_ptr',
+                                                           'sm_s1',
                                                           'threadIdx.x',
                                                           'blockDim.x',
                                                           'b', 'sb0'),

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -4,7 +4,6 @@ import logging
 import os
 import subprocess
 import sys
-import warnings
 from locale import getpreferredencoding
 import numpy
@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
            _logger.debug('Writing module C++ code to %s', cppfilename)
            cppfile.write(src_code)
-        lib_filename = os.path.join(location, '%s.%s' %
+        lib_filename = os.path.join(
+            location, '%s.%s' %
            (module_name, get_lib_extension()))
        _logger.debug('Generating shared lib %s', lib_filename)
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
                indexof = cmd.index('-u')
                cmd.pop(indexof)  # Remove -u
                cmd.pop(indexof)  # Remove argument to -u
-            except ValueError as e:
+            except ValueError:
                done = True
        # CUDA Toolkit v4.1 Known Issues:
@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
            console_encoding = getpreferredencoding()
            nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
            nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
+            p = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        finally:
            os.chdir(orig_dir)

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
--- a/theano/sandbox/cuda/rng_curand.py
+++ b/theano/sandbox/cuda/rng_curand.py
-"""
-Define CURAND_RandomStreams - backed by CURAND.
-"""
 from __future__ import absolute_import, print_function, division
-__authors__ = "James Bergstra"
-__copyright__ = "(c) 2011, University of Montreal"
-__license__ = "3-clause BSD License"
-__contact__ = "theano-dev@googlegroups.com"
 import numpy
 import theano.gof
 from theano.compat import PY3
@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
 from theano.compile import optdb
 from theano.gof import local_optimizer, Variable
+__authors__ = "James Bergstra"
+__copyright__ = "(c) 2011, University of Montreal"
+__license__ = "3-clause BSD License"
+__contact__ = "theano-dev@googlegroups.com"
+"""
+Define CURAND_RandomStreams - backed by CURAND.
+"""
 config = theano.config
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
        Return a tuple of attributes that define the Op.
        """
-        return (
+        return (self.destructive,
-                self.destructive,
                self.output_type,
                self.seed,
                )
@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
        v_size = theano.tensor.as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
-        self = cls(
+        self = cls(output_type=CudaNdarrayType((False,) * ndim),
-                output_type=CudaNdarrayType((False,) * ndim),
                   seed=seed,
                   destructive=False)
@@ -386,5 +383,5 @@ def local_destructive(node):
        return new_op.make_node(*node.inputs).outputs
    return False
 optdb.register('CURAND_destructive',
-        opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run',
+               opt.in2out(local_destructive, ignore_newtrees=True),
-                   'inplace')
+               99, 'fast_run', 'inplace')
--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
--- a/theano/sandbox/cuda/tests/test_bench_loopfusion.py
+++ b/theano/sandbox/cuda/tests/test_bench_loopfusion.py
--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
--- a/theano/sandbox/cuda/tests/test_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_cuda_ndarray.py
--- a/theano/sandbox/cuda/tests/test_driver.py
+++ b/theano/sandbox/cuda/tests/test_driver.py
@@ -6,7 +6,7 @@ import theano
 try:
    from nose.plugins.skip import SkipTest
    import theano.sandbox.cuda as cuda_ndarray
-    if cuda_ndarray.cuda_available == False:
+    if cuda_ndarray.cuda_available is False:
        raise SkipTest('Optional package cuda disabled')
 except ImportError:
    # To have the GPU back-end work without nose, we need this file to
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
-        msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
+        msg = '\n\t'.join(
-            'but got:']+[str(app) for app in topo])
+            ['Expected exactly one occurrence of GpuCAReduce ' +
+             'but got:'] + [str(app) for app in topo])
        raise AssertionError(msg)
    if not numpy.allclose(f(), a.sum()):
        raise Exception("The nvidia driver version installed with this OS "

--- a/theano/sandbox/cuda/tests/test_extra_ops.py
+++ b/theano/sandbox/cuda/tests/test_extra_ops.py
@@ -5,24 +5,22 @@ import itertools
 from nose.plugins.skip import SkipTest
 import numpy as np
 from six.moves import xrange
+from theano import tensor as T
+import theano
+from theano.tensor.extra_ops import cumsum, CumsumOp
+from theano.tests import unittest_tools as utt
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available is False:
+if cuda_ndarray.cuda_available:
+    import theano.tensor.tests.test_extra_ops
+    from theano.sandbox.cuda.extra_ops import GpuCumsum
+else:
    raise SkipTest('Optional package cuda disabled')
-import theano.tensor.tests.test_extra_ops
-from theano.sandbox.cuda.extra_ops import GpuCumsum
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
-from theano import tensor as T
-import theano
-from theano.tensor.extra_ops import cumsum, CumsumOp
-from theano.tests import unittest_tools as utt
 class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
    mode = mode_with_gpu
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
        # Use multiple GPU threadblocks
-        a = np.random.random((block_max_size+2,)).astype("float32")
+        a = np.random.random((block_max_size + 2,)).astype("float32")
        utt.assert_allclose(np.cumsum(a), f(a))
        # Use recursive cumsum
-        a = np.ones((block_max_size*(block_max_size+1)+2,),
+        a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                    dtype="float32")
        utt.assert_allclose(np.cumsum(a), f(a))
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks
            a_shape = [5, 5]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks
            a_shape = [4, 4]
-            a_shape[1-shape_axis] = self.max_grid_size1+1
+            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
            # Use recursive cumsum
            a_shape = [3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (
+                block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum3D(self):
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use recursive cumsum (along accumulation axis)
            a_shape = [3, 3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (
+                block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype(
+                "float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum4D(self):

--- a/theano/sandbox/cuda/tests/test_gemmcorr3d.py
+++ b/theano/sandbox/cuda/tests/test_gemmcorr3d.py
 from __future__ import absolute_import, print_function, division
 import unittest
 import numpy
-import copy
 import theano
 from theano.tests import unittest_tools as utt
 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-import theano.sandbox.cuda as cuda_ndarray
-if not cuda_ndarray.cuda_available:
-    raise SkipTest('Optional package cuda not available')
 from theano.sandbox.cuda import float32_shared_constructor as shared
 from theano.sandbox.cuda.blas import (
    GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
 from theano.sandbox.cuda.basic_ops import gpu_contiguous
+import theano.sandbox.cuda as cuda_ndarray
+if not cuda_ndarray.cuda_available:
+    raise SkipTest('Optional package cuda not available')
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
-        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample,
+        conv = theano.tensor.nnet.convTransp3D(W=filters,
+                                               b=bias,
+                                               d=subsample,
                                               H=inputs)
        f_ref = theano.function([], conv)
        res_ref = f_ref()

--- a/theano/sandbox/cuda/tests/test_gradient.py
+++ b/theano/sandbox/cuda/tests/test_gradient.py
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')

--- a/theano/sandbox/cuda/tests/test_memory.py
+++ b/theano/sandbox/cuda/tests/test_memory.py
@@ -11,7 +11,7 @@ from theano import ifelse
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-if cuda.cuda_available == False:
+if cuda.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
        theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
        return ("(n malloc/theano mem allocated in KB)",
                n_mallocs + extra_alloc,
-                int(theano_alloc / 1024) + extra_size)
+                int(theano_alloc / 1024))
    return ("n malloc on the gpu", n_mallocs + extra_alloc)
    # I don't use the following by default as if there is other stuff running
@@ -83,9 +83,12 @@ def test_memory():
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
-        print("Shared took ", np.prod(variables.get_value(
+        print("Shared took ",
+              np.prod(variables.get_value(
                  borrow=True,
-                return_internal_type=True).shape) * 4 / 1024, "kB")
+                  return_internal_type=True).shape) *
+              4 / 1024,
+              "kB")
        mem2 = freemem()
        print("Before compilation", mem2)
@@ -112,7 +115,7 @@ def test_memory():
        del obj
        # print "After deleting function 1", freemem()
-        #assert mem2 == freemem(), (mem2, freemem())
+        # assert mem2 == freemem(), (mem2, freemem())
        del grad
        print("After deleting function 2", freemem())
@@ -155,16 +158,19 @@ def test_memory_lazy():
        derp = ifelse.IfElse(1)(branch_select,
                                derp, some_matrix[:shapes[0]].sum())
        derp += 1
-        print("Shared took ", np.prod(variables.get_value(
+        print("Shared took ",
+              np.prod(variables.get_value(
                  borrow=True,
-                return_internal_type=True).shape) * 4 / 1024, "kB")
+                  return_internal_type=True).shape) *
+              4 / 1024,
+              "kB")
        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        obj = theano.function([some_vector, branch_select], derp,
                              mode=mode_with_gpu)
-        #theano.printing.debugprint(obj, print_type=True)
+        # theano.printing.debugprint(obj, print_type=True)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3)

--- a/theano/sandbox/cuda/tests/test_mlp.py
+++ b/theano/sandbox/cuda/tests/test_mlp.py
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
                   'otherwise it is too slow!')
 # Skip test if cuda_ndarray is not available.
-if tcn.cuda_available == False:
+if tcn.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -147,19 +147,20 @@ def test_run_nnet():
            rtol = 1e-4
            if n_in * n_hid >= 2048 * 4096:
                rtol = 7e-4
-            assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
+            assert numpy.allclose(
+                rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
                ("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
                 rel_diff.max(), n_in, n_hid)
 def test_run_nnet_med():
    utt.seed_rng()
-    rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000)
+    run_nnet(False, 10, 128, 50, 4, n_train=10000)
 def test_run_nnet_small():
    utt.seed_rng()
-    rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000)
+    run_nnet(False, 10, 10, 4, 4, n_train=100000)
 def run_conv_nnet1(use_gpu):
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
    mode = get_mode(use_gpu)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
 #    for i, n in enumerate(train.maker.fgraph.toposort()):
 #        print i, n
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu):  # pretend we are training LeNet for MNIST
    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
    conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
-         logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
+                            logical_hid_shape[1] // 2),
+                           shape_kern1[2:],
+                           n_kern1, n_batch, 1, 1)
    hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
    hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu):  # pretend we are training LeNet for MNIST
    mode = get_mode(use_gpu)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
 #    for i, n in enumerate(train.maker.fgraph.toposort()):
 #        print i, n
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
    if downsample_ops:
        hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
    else:
-        hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')
+        hid = tensor.tanh(
-            ))[:, :, ::2, ::2])
+            (conv_op(x, w0) + b0.dimshuffle(
+                (0, 'x', 'x')))[:, :, ::2, ::2])
    hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
    hid_flat = hid1.reshape((n_batch, n_hid))
    out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
-    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out,
+    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
-         tensor.argmax(y, axis=1)) * lr)
+        out, tensor.argmax(y, axis=1)) * lr)
    # print 'loss type', loss.type
    params = [w0, b0, w1, b1, v, c]
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
    mode = get_mode(use_gpu, check_isfinite)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
    if verbose:
        theano.printing.debugprint(train)
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
    lr = theano._asarray(0.01, dtype='float32')
    rvals = my_zeros(n_train)
-    t0 = time.time()
    for i in xrange(n_train):
        rvals[i] = train(xval, yval, lr)[0]
-    t1 = time.time()
    print_mode(mode)
    if pickle and isinstance(mode, theano.compile.ProfileMode):
@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
            compare = True
        if not compare:
-            return run_conv_nnet2_classif(use_gpu=use_gpu,
+            return run_conv_nnet2_classif(
+                use_gpu=use_gpu,
                seed=seed, isize=isize, ksize=ksize, bsize=bsize,
                n_train=n_train,
                check_isfinite=check_isfinite,
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
    finally:
        theano.tensor.basic.float32_atol = orig_float32_atol
-    if pickle:
-        if isinstance(cpu_mode, theano.compile.ProfileMode):
-            import pickle
-            print("BEGIN CPU profile mode dump")
-            print(pickle.dumps(cpu_mode))
-            print("END CPU profile mode dump")
-        if isinstance(gpu_mode, theano.compile.ProfileMode):
-            import pickle
-            print("BEGIN GPU profile mode dump")
-            print(pickle.dumps(gpu_mode))
-            print("END GPU profile mode dump")
    # print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
    #        (time_cpu, time_gpu, time_cpu/time_gpu))
    # print "Estimated time for one pass through MNIST with CPU: %f" % (

--- a/theano/sandbox/cuda/tests/test_neighbours.py
+++ b/theano/sandbox/cuda/tests/test_neighbours.py
 # Skip test if cuda_ndarray is not available.
 from __future__ import absolute_import, print_function, division
 from nose.plugins.skip import SkipTest
+import unittest
+import theano.tensor.nnet.tests.test_neighbours
+from theano.sandbox.cuda.neighbours import GpuImages2Neibs
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
-import theano.tensor.nnet.tests.test_neighbours
-from theano.sandbox.cuda.neighbours import GpuImages2Neibs
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
--- a/theano/sandbox/cuda/tests/test_rng_curand.py
+++ b/theano/sandbox/cuda/tests/test_rng_curand.py
@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
 # The PyCObject that represents the cuda random stream object

--- a/theano/sandbox/cuda/tests/test_tensor_op.py
+++ b/theano/sandbox/cuda/tests/test_tensor_op.py
@@ -2,7 +2,6 @@
 This file test tensor op that should also operate on CudaNdaray.
 """
 from __future__ import absolute_import, print_function, division
-import copy
 from nose.plugins.skip import SkipTest
 import numpy
@@ -14,7 +13,7 @@ import theano.tensor as T
 # Skip test if cuda_ndarray is not available.
 import theano.sandbox.cuda as cuda
 from theano.tensor.nnet.tests import test_conv3d2d
-if cuda.cuda_available == False:
+if cuda.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -57,7 +56,7 @@ def test_softmax_optimizations():
    one_of_n = tensor.lvector('one_of_n')
    op = crossentropy_categorical_1hot
-    xe = op(x, one_of_n)
+    op(x, one_of_n)
    fgraph = theano.gof.FunctionGraph(
        [x, one_of_n],
@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
    # can't test the transpose as ta._strides = is not implemented
    # manual transpose of a
-    #ta = a.reshape((4,3))
+    # ta = a.reshape((4,3))
    # ta._strides = (ta._strides[1],ta._strides[0])#not implemented
-    #elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
+    # elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
-    #ta.gpudata += ta.size*elem_size
+    # ta.gpudata += ta.size*elem_size
    for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
                        (a, na, False), (b, nb, False),
@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
                        (a, va, True), (b, vb, True),
                        (va, b, False), (a, vb, False),
                        (a, ra, True), (b, rb, True),
-                        (ra, b, False), (a, rb, False),
+                        (ra, b, False), (a, rb, False), ]:
-                      ]:
        assert may_share_memory(a_, b_) == rep
        assert may_share_memory(b_, a_) == rep

--- a/theano/sandbox/cuda/tests/test_var.py
+++ b/theano/sandbox/cuda/tests/test_var.py
@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
 from theano.sandbox.cuda import CudaNdarrayType, cuda_available
 import theano.sandbox.cuda as cuda
 # Skip test if cuda_ndarray is not available.
-if cuda_available == False:
+if cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
    # test that broadcastable arg is accepted, and that they
    # don't strictly have to be tuples
-    assert eq(
+    assert eq(f32sc(npy_row,
-            f32sc(npy_row, broadcastable=(True, False)).type,
+                    broadcastable=(True, False)).type,
              CudaNdarrayType((True, False)))
-    assert eq(
+    assert eq(f32sc(npy_row,
-            f32sc(npy_row, broadcastable=[True, False]).type,
+                    broadcastable=[True, False]).type,
              CudaNdarrayType((True, False)))
-    assert eq(
+    assert eq(f32sc(npy_row,
-            f32sc(npy_row, broadcastable=numpy.array([True, False])).type,
+                    broadcastable=numpy.array([True, False])).type,
              CudaNdarrayType([True, False]))
    # test that we can make non-matrix shared vars
-    assert eq(
+    assert eq(f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
-            f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
              CudaNdarrayType((False,) * 4))
@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
        x = tensor.fmatrix('x')
        output_updates = [(output_var, x ** 2)]
        output_givens = {x: data}
-        output_func = theano.function(inputs=[], outputs=[],
+        output_func = theano.function(
+            inputs=[], outputs=[],
            updates=output_updates, givens=output_givens)
        output_func()

--- a/theano/sandbox/cuda/tests/test_viewop.py
+++ b/theano/sandbox/cuda/tests/test_viewop.py
 from __future__ import absolute_import, print_function, division
 import numpy
-import unittest
 from nose.plugins.skip import SkipTest
 import theano
@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
 def test_viewop_gpu():
    from theano.sandbox import cuda
-    if cuda.cuda_available == False:
+    if cuda.cuda_available is False:
        raise SkipTest('Optional package cuda disabled')
    _x = theano.tensor.fvector('x')
    x = cuda.gpu_from_host(_x)

--- a/theano/sandbox/cuda/tests/walltime.py
+++ b/theano/sandbox/cuda/tests/walltime.py
 from __future__ import absolute_import, print_function, division
 from __future__ import print_function
-import sys, time
+import sys
+import time
 from six import iteritems
 from theano.compile.pfunc import pfunc
 from theano import tensor
@@ -35,35 +36,47 @@ def showtimes(times):
 def cmp_sigmoids(shape):
    def numpy_sigmoid(input):
-        rval = 1.0 / (1.0 + numpy.exp(-input))
+        1.0 / (1.0 + numpy.exp(-input))
-    sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))()
+    sinput = tensor.Tensor(
-    shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input')
+        dtype='float32', broadcastable=(0,) * len(shape))()
-    times = compare_fns(
+    shared_input = tcn.shared_constructor(
-            dict( numpy=numpy_sigmoid
+        numpy.random.rand(*shape),
-                , theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput)))
+        'shared_input')
-                , theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + tensor.exp(-shared_input)))])
+    times = compare_fns(dict(
-                ),
+        numpy=numpy_sigmoid,
+        theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))),
+        theano_gpu_onboard=pfunc(
+            [sinput],
+            [],
+            updates=[(
+                shared_input,
+                1.0 / (1.0 + tensor.exp(-shared_input)))])),
        input=shared_input.value)
    showtimes(times)
 def cmp_sigmoids_T(shape):
    def numpy_sigmoid(input):
-        rval = 1.0 / (1.0 + numpy.exp(-input.T))
+        1.0 / (1.0 + numpy.exp(-input.T))
-    sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))()
+    sinput = tensor.Tensor(
-    shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input')
+        dtype='float32', broadcastable=(0,) * len(shape))()
-    times = compare_fns(
+    shared_input = tcn.shared_constructor(
-            dict( numpy=numpy_sigmoid
+        numpy.random.rand(*shape),
-                , theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T)))
+        'shared_input')
-                , theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 +
+    times = compare_fns(dict(
-                    tensor.exp(-shared_input.T)))])
+        numpy=numpy_sigmoid,
-                ),
+        theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T))),
+        theano_gpu_onboard=pfunc(
+            [sinput],
+            [],
+            updates=[(
+                shared_input,
+                1.0 / (1.0 + tensor.exp(-shared_input.T)))])),
        input=shared_input.value)
    showtimes(times)
 if __name__ == '__main__':
    eval(sys.argv[1])
    # cmp_sigmoids((640, 64*64)) # looks great in profiler
-    #cmp_sigmoids((173, 74*49))
+    # cmp_sigmoids((173, 74*49))
-    #cmp_sigmoids_T((173, 74*49))
+    # cmp_sigmoids_T((173, 74*49))
--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
                    'complex64': (complex, 'theano_complex64',
                                  'NPY_COMPLEX64')}[self.dtype]
        except KeyError:
-            raise TypeError("Unsupported dtype for %s: %s" % (
+            raise TypeError("Unsupported dtype for %s: %s" %
-                    self.__class__.__name__, self.dtype))
+                            (self.__class__.__name__, self.dtype))
    def __eq__(self, other):
        """
@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
                other.broadcastable == self.broadcastable)
    def convert_variable(self, var):
-        if (type(self) == type(var.type) and
+        if (isinstance(self, type(var.type)) and
                self.ndim == var.type.ndim and
-            all(sb == ob or ob for sb, ob in zip(self.broadcastable,
+                all(sb == ob or ob for sb, ob in zip(
+                    self.broadcastable,
                    var.type.broadcastable))):
            return theano.tensor.patternbroadcast(var, self.broadcastable)
@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
            return self.name
        else:
            b = self.broadcastable
-            #bcast = str(self.broadcastable)
+            # bcast = str(self.broadcastable)
            if not numpy.any(b):
                s = "%iD" % len(b)
            else:
@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
    def __repr__(self):
        return str(self)
-        #"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
+        # "CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
    def c_declare(self, name, sub, check_input=True):
        return """ CudaNdarray * %(name)s;""" % locals()
@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
    CudaNdarray_HOST_DIMS(%(oname)s)[i]) {
    alloc = true;
    break;
-           }
+    }}
-        }
        if(alloc) {
            Py_XDECREF(%(oname)s);
            %(oname)s = (CudaNdarray*)CudaNdarray_Copy(%(iname)s);
@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
                %(fail)s;
            }
        }
-        """,
+        """, version=3)
-        version=3)
 # THIS WORKS But CudaNdarray instances don't compare equal to one
@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
 # In case cuda is not imported.
 if cuda is not None:
-    copyreg.pickle(cuda.CudaNdarray, CudaNdarray_pickler,
+    copyreg.pickle(
-                    CudaNdarray_unpickler)
+        cuda.CudaNdarray, CudaNdarray_pickler, CudaNdarray_unpickler)
--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -13,7 +13,7 @@ try:
    # We must do those import to be able to create the full doc when nvcc
    # is not available
    from theano.sandbox.cuda import filter as type_support_filter
-    from theano.sandbox.cuda.basic_ops import HostFromGpu, GpuFromHost
+    from theano.sandbox.cuda.basic_ops import HostFromGpu
 except ImportError:
    pass
@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
    def _as_TensorVariable(self):
        return HostFromGpu()(self)
    def _as_CudaNdarrayVariable(self):
        return self
@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
 class CudaNdarrayConstant(_operators, Constant):
    def signature(self):
        return CudaNdarrayConstantSignature((self.type, numpy.asarray(self.data)))
    def __str__(self):
        if self.name is not None:
            return self.name
@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
            data = str(numpy.asarray(self.data))
        except Exception as e:
            data = "error while transferring the value: " + str(e)
-        return "CudaNdarrayConstant{"+data+"}"
+        return "CudaNdarrayConstant{" + data + "}"
 CudaNdarrayType.Constant = CudaNdarrayConstant

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -87,42 +87,8 @@ whitelist_flake8 = [
    "sandbox/tests/test_theano_object.py",
    "sandbox/tests/test_scan.py",
    "sandbox/tests/__init__.py",
-    "sandbox/cuda/var.py",
-    "sandbox/cuda/GpuConvGrad3D.py",
-    "sandbox/cuda/basic_ops.py",
-    "sandbox/cuda/nnet.py",
-    "sandbox/cuda/elemwise.py",
-    "sandbox/cuda/type.py",
    "sandbox/cuda/__init__.py",
-    "sandbox/cuda/opt.py",
-    "sandbox/cuda/blas.py",
-    "sandbox/cuda/blocksparse.py",
-    "sandbox/cuda/rng_curand.py",
-    "sandbox/cuda/fftconv.py",
-    "sandbox/cuda/kernel_codegen.py",
-    "sandbox/cuda/GpuConvTransp3D.py",
-    "sandbox/cuda/nvcc_compiler.py",
-    "sandbox/cuda/neighbours.py",
    "sandbox/cuda/tests/__init__.py",
-    "sandbox/cuda/tests/walltime.py",
-    "sandbox/cuda/tests/test_gradient.py",
-    "sandbox/cuda/tests/test_neighbours.py",
-    "sandbox/cuda/tests/test_conv_cuda_ndarray.py",
-    "sandbox/cuda/tests/test_var.py",
-    "sandbox/cuda/tests/test_opt.py",
-    "sandbox/cuda/tests/test_blas.py",
-    "sandbox/cuda/tests/test_driver.py",
-    "sandbox/cuda/tests/test_rng_curand.py",
-    "sandbox/cuda/tests/test_basic_ops.py",
-    "sandbox/cuda/tests/test_memory.py",
-    "sandbox/cuda/tests/test_mlp.py",
-    "sandbox/cuda/tests/test_bench_loopfusion.py",
-    "sandbox/cuda/tests/test_blocksparse.py",
-    "sandbox/cuda/tests/test_cuda_ndarray.py",
-    "sandbox/cuda/tests/test_tensor_op.py",
-    "sandbox/cuda/tests/test_extra_ops.py",
-    "sandbox/cuda/tests/test_gemmcorr3d.py",
-    "sandbox/cuda/tests/test_viewop.py",
    "sandbox/gpuarray/tests/__init__.py",
    "sandbox/scan_module/scan_utils.py",
    "sandbox/scan_module/scan.py",