Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py

Merge pull request #4244 from ChihebTrabelsi/ccw2.0
b69ad54d · Xavier Bouthillier · 200babca · 58267dc2 · b69ad54d · b69ad54d
--- a/theano/sandbox/cuda/GpuConvGrad3D.py
+++ b/theano/sandbox/cuda/GpuConvGrad3D.py
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = as_cuda_ndarray_variable(dCdH)
-        broad = (False,)*5
+        broad = (False,) * 5
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
        # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C / partial H[i,j,p,q,r]) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
-        outputFilters = dCdH.shape[1]
        outputHeight = dCdH.shape[2]
        outputWidth = dCdH.shape[3]
        outputDur = dCdH.shape[4]
        assert V.shape[0] == batchSize
-        inputFilters = V.shape[1]
-        inputHeight = V.shape[2]
-        inputWidth = V.shape[3]
-        inputDur = V.shape[4]
        dr, dc, dt = d
        dCdW = numpy.zeros(WShape, dtype=V.dtype)
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
                                for p in xrange(0, outputHeight):
                                    for q in xrange(0, outputWidth):
                                        for r in xrange(0, outputDur):
-                                            dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m]
+                                            dCdW[j, z, k, l, m] += dCdH[
+                                                i, j, p, q, r] * \
+                                                V[i, z, dr * p + k,
+                                                  dc * q + l,
+                                                  dt * r + m]
        output_storage[0][0] = dCdW
@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp):
        dCdW = outputs[0]
-        codeSource =  """
+        codeSource = """
            ///////////// < code generated by GpuConvGrad3D >
            //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
@@ -285,7 +284,7 @@ if(!work_complete){
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
-        codeSource =  """
+        codeSource = """
 __global__ void
 //thread block size = WShape[4]
 //grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])

--- a/theano/sandbox/cuda/GpuConvTransp3D.py
+++ b/theano/sandbox/cuda/GpuConvTransp3D.py
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
        else:
            RShape_ = T.as_tensor_variable([-1, -1, -1])
-        return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
+        return theano.Apply(
-                            outputs=[CudaNdarrayType(dtype=H_.dtype,
+            self, inputs=[W_, b_, d_, H_, RShape_],
-                                                     broadcastable=(False,)*5)()])
+            outputs=[CudaNdarrayType(
+                dtype=H_.dtype, broadcastable=(False,) * 5)()])
    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
        assert dc > 0
        assert dt > 0
-        videoHeight = (outputHeight-1) * dr + filterHeight
+        videoHeight = (outputHeight - 1) * dr + filterHeight
-        videoWidth = (outputWidth-1) * dc + filterWidth
+        videoWidth = (outputWidth - 1) * dc + filterWidth
-        videoDur = (outputDur-1) * dt + filterDur
+        videoDur = (outputDur - 1) * dt + filterDur
        if Rshape is not None and Rshape[0] != -1:
            if Rshape[0] < videoHeight:
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
        # else:
        #    print "No Rshape passed in"
-        # print "video size: "+str((videoHeight, videoWidth, videoDur))
+        # print "video size: " + str((videoHeight, videoWidth, videoDur))
-        R =  numpy.zeros( (batchSize, inputChannels, videoHeight,
+        R = numpy.zeros((batchSize, inputChannels, videoHeight,
-            videoWidth, videoDur ) , dtype=H.dtype)
+                         videoWidth, videoDur),
+                        dtype=H.dtype)
-        # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
+        # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...}
+        #     sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
        for i in xrange(0, batchSize):
            # print '\texample '+str(i+1)+'/'+str(batchSize)
            for j in xrange(0, inputChannels):
-                # print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
+                # print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
                for r in xrange(0, videoHeight):
-                    # print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
+                    # print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
                    for c in xrange(0, videoWidth):
                        for t in xrange(0, videoDur):
                            R[i, j, r, c, t] = b[j]
-                            ftc = max([0, int(numpy.ceil(float(t-filterDur + 1  )/float(dt))) ])
+                            ftc = max(
-                            fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ])
+                                [0,
+                                 int(numpy.ceil(
-                            rc =  max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ])
+                                     float(t - filterDur + 1) / float(dt)
+                                     ))
+                                 ]
+                            )
+                            fcc = max(
+                                [0,
+                                 int(numpy.ceil(
+                                     float(c - filterWidth + 1) / float(dc)
+                                     ))
+                                 ]
+                            )
+                            rc = max(
+                                [0,
+                                 int(numpy.ceil(
+                                     float(r - filterHeight + 1) / float(dr)
+                                     ))
+                                 ]
+                            )
                            while rc < outputHeight:
                                rk = r - rc * dr
                                if rk < 0:
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
                                        if tk < 0:
                                            break
-                                        R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] )
+                                        R[i, j, r, c, t] += numpy.dot(
+                                            W[:, j, rk, ck, tk],
+                                            H[i, :, rc, cc, tc])
                                        tc += 1
                                    ""  # close loop over tc

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -5,9 +5,9 @@ import numpy as np
 import theano
 import theano.tensor as T
+from theano.misc.pycuda_init import pycuda_available
 from theano.sandbox.cuda import cuda_available, GpuOp
 from theano.ifelse import ifelse
-from theano.misc.pycuda_init import pycuda_available
 if cuda_available:
    from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
            o1 = i1 + 1
            input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
            input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1],
-                                       input)
+                                           input)
        else:
            o1 = i1
            input_padded = input
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
    # special way because we specify explicitly here
    # how many values are expected.
    if border_mode == 'valid':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
+                             (f1 - 1):(f1 - 1 + i1 - f1 + 1)]
    elif border_mode == 'full':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
+                             (f1 - 1):(f1 - 1 + i1 + f1 - 1)]
    else:
        raise ValueError('invalid mode')
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
                                   input_shape=input_fft_v_shape,
                                   filter_shape=filters_fft_v_shape)
-    #output_fft_s = input_fft_v
+    # output_fft_s = input_fft_v
    # reshape for IFFT
    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    # special way because we specify explicitly here
    # how many values are expected.
    if border_mode == 'valid':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
+                             (f1 - 1):(f1 - 1 + i1 - f1 + 1),
+                             (f2 - 1):(f2 - 1 + i2 - f2 + 1)]
    elif border_mode == 'full':
-        output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)]
+        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
+                             (f1 - 1):(f1 - 1 + i1 + f1 - 1),
+                             (f2 - 1):(f2 - 1 + i2 + f2 - 1)]
    else:
        raise ValueError('invalid mode')
-    #output = output_circ[:, :, :, :, :]
+    # output = output_circ[:, :, :, :, :]
    # Rescale manually. This is just a factor that comes in during the
    # trip through FFT and inverse FFT.

--- a/theano/sandbox/cuda/kernel_codegen.py
+++ b/theano/sandbox/cuda/kernel_codegen.py
@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
    rest of the buffer is trashed by this function.
    Notes
-    ----- 
+    -----
    buf should be in gpu shared memory, we access it many times.
    """
@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
    We use __i as an int variable in a loop.
    """
-    return [
+    return [  # get max of buf (trashing all but buf[0])
-            # get max of buf (trashing all but buf[0])
+        inline_reduce_max(N, buf, threadPos, threadCount),
-            inline_reduce_max(N, buf, threadPos, threadCount),
+        '__syncthreads()',
-            '__syncthreads()',
+        'float row_max = ' + buf + '[0]',
-            'float row_max = ' + buf + '[0]',
+        '__syncthreads()',
-            '__syncthreads()',
+        'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
-            'for(int __i=' + threadPos + '; __i<' + N +
+        threadCount + '){',
-                  '; __i+=' + threadCount + '){',
+        buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
-                buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
+        buf2 + '[__i] = ' + buf + '[__i]', '}',
-                buf2 + '[__i] = ' + buf + '[__i]',
+        '__syncthreads()',
-            '}',
+        inline_reduce_sum(N, buf, threadPos, threadCount),
-            '__syncthreads()',
+        '__syncthreads()',
-            inline_reduce_sum(N, buf, threadPos, threadCount),
+        'float row_sum = ' + buf + '[0]',
-            '__syncthreads()',
+        '__syncthreads()',
-            'float row_sum = ' + buf + '[0]',
+        # divide each exp() result by the sum to complete the job.
-            '__syncthreads()',
+        'for(int __i=' + threadPos + '; __i<' + N +
-            # divide each exp() result by the sum to complete the job.
+        '; __i+=' + threadCount + '){',
-            'for(int __i=' + threadPos + '; __i<' + N +
+        buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
-                  '; __i+=' + threadCount + '){',
+        '__syncthreads()',
-                buf + '[__i] = ' + buf2 + '[__i] / row_sum',
+        ]
-            '}',
-            '__syncthreads()',
-            ]
 @code_version((1,))
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
                                                 locals()))
-    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
+    loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
-                          "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))

--- a/theano/sandbox/cuda/neighbours.py
+++ b/theano/sandbox/cuda/neighbours.py
 from __future__ import absolute_import, print_function, division
 # This is work in progress
-from theano import Op, Apply, tensor
+from theano import Apply, tensor
 from theano.gof import local_optimizer
 from theano.sandbox.cuda import cuda_available, GpuOp

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
        """ % locals()
    def c_support_code_apply(self, node, nodename):
-        ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
+        ret1 = nvcc_kernel(
-                params=['int M', 'int N',
+            "kSoftmax_%s" % nodename,
-                        'const float * x', 'const int sx0', 'const int sx1',
+            params=['int M', 'int N',
-                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                    'const float * x',
-                body=[
+                    'const int sx0',
-                    "extern __shared__ float buf[]",
+                    'const int sx1',
-                    "float * buf2 = buf + N",
+                    'float * sm',
-                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
+                    'const int sm_s0',
-                    "     blockIDX += gridDim.x){",
+                    'const int sm_s1'],
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
+            body=["extern __shared__ float buf[]",
-                        "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
+                  "float * buf2 = buf + N",
-                        "buf2[tx] = buf[tx]",
+                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                      "}",
+                  "     blockIDX += gridDim.x){",
-                      "__syncthreads()",
+                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                      inline_softmax('N', 'buf', 'buf2',
+                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
-                                     'threadIdx.x', 'blockDim.x'),
+                  "buf2[tx] = buf[tx]", "}", "__syncthreads()",
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
+                  inline_softmax('N',
-                        # This set all value correctly
+                                 'buf',
-                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
+                                 'buf2',
-                      "}",
+                                 'threadIdx.x',
-                      "__syncthreads()",
+                                 'blockDim.x'),
-                    "}",
+                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                ])
+                  # This sets all values correctly
-        ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
+                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
-                params=['int M', 'int N',
+                  "__syncthreads()", "}", ])
-                        'const float * x', 'const int sx0', 'const int sx1',
+        ret2 = nvcc_kernel(
-                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
+            "kSoftmax_fixed_shared%s" % nodename,
-                body=[
+            params=['int M', 'int N',
-                    "extern __shared__ float buf[]",
+                    'const float * x', 'const int sx0', 'const int sx1',
-                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
+                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                    "     blockIDX += gridDim.x){",
+            body=["extern __shared__ float buf[]",
-                      "const float *x_ptr = &x[blockIDX * sx0]",
+                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                      "float *sm_ptr = &sm[blockIDX * sm_s0]",
+                  "     blockIDX += gridDim.x){",
-                      inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
+                  "const float *x_ptr = &x[blockIDX * sx0]",
-                                                  'sm_ptr', 'sm_s1',
+                  "float *sm_ptr = &sm[blockIDX * sm_s0]",
-                                                  'threadIdx.x', 'blockDim.x'),
+                  inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
-                      "__syncthreads()",
+                                              'sm_ptr', 'sm_s1',
-                    "}",
+                                              'threadIdx.x',
-                    ])
+                                              'blockDim.x'),
+                  "__syncthreads()", "}", ])
        return ret1 + "\n" + ret2
 gpu_softmax = GpuSoftmax()
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
                    'const float * x', 'const int sx0', 'const int sx1',
                    'const float * b', 'const int sb0',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-            body=[
+            body=["extern __shared__ float buf[]",
-                    "extern __shared__ float buf[]",
+                  "float * buf2 = buf + N",
-                    "float * buf2 = buf + N",
+                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
+                  "     blockIDX += gridDim.x){",
-                    "     blockIDX += gridDim.x){",
+                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
+                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
-                         "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
+                  "buf[tx] += b[tx * sb0]",
-                         "buf[tx] += b[tx * sb0]",
+                  "buf2[tx] = buf[tx]", "}",
-                         "buf2[tx] = buf[tx]",
+                  "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
-                      "}",
+                                                    'threadIdx.x',
-                       "__syncthreads()",
+                                                    'blockDim.x'),
-                       inline_softmax('N', 'buf', 'buf2',
+                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                                      'threadIdx.x', 'blockDim.x'),
+                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
+                  "__syncthreads()", "}", ])
-                         "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
-                      "}",
-                      "__syncthreads()",
-                    "}",
-            ])
        ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                           params=['int M', 'int N',
                                   'const float * x',
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
                               "float *sm_ptr = &sm[blockIDX * sm_s0]",
                               inline_softmax_fixed_shared('N', 'buf',
                                                           'x_ptr', 'sx1',
-                                                           'sm_ptr', 'sm_s1',
+                                                           'sm_ptr',
+                                                           'sm_s1',
                                                           'threadIdx.x',
                                                           'blockDim.x',
                                                           'b', 'sb0'),

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -4,7 +4,6 @@ import logging
 import os
 import subprocess
 import sys
-import warnings
 from locale import getpreferredencoding
 import numpy
@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler):
            _logger.debug('Writing module C++ code to %s', cppfilename)
            cppfile.write(src_code)
-        lib_filename = os.path.join(location, '%s.%s' %
+        lib_filename = os.path.join(
-                (module_name, get_lib_extension()))
+            location, '%s.%s' %
+            (module_name, get_lib_extension()))
        _logger.debug('Generating shared lib %s', lib_filename)
        # TODO: Why do these args cause failure on gtx285 that has 1.3
@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler):
                continue
            for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I',
                            '-L', '--fmad', '--ftz', '--maxrregcount',
-                            '--prec-div', '--prec-sqrt',  '--use_fast_math',
+                            '--prec-div', '--prec-sqrt', '--use_fast_math',
                            '-fmad', '-ftz', '-maxrregcount',
                            '-prec-div', '-prec-sqrt', '-use_fast_math',
                            '--use-local-env', '--cl-version=']:
@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler):
        # https://wiki.debian.org/RpathIssue for details.
        if (not type(config.cuda).root.is_default and
-            os.path.exists(os.path.join(config.cuda.root, 'lib'))):
+                os.path.exists(os.path.join(config.cuda.root, 'lib'))):
            rpaths.append(os.path.join(config.cuda.root, 'lib'))
            if sys.platform != 'darwin':
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
                indexof = cmd.index('-u')
                cmd.pop(indexof)  # Remove -u
                cmd.pop(indexof)  # Remove argument to -u
-            except ValueError as e:
+            except ValueError:
                done = True
        # CUDA Toolkit v4.1 Known Issues:
@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler):
        try:
            os.chdir(location)
            p = subprocess.Popen(
-                    cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2]
            console_encoding = getpreferredencoding()
            nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
            nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
+            p = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        finally:
            os.chdir(orig_dir)

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
--- a/theano/sandbox/cuda/rng_curand.py
+++ b/theano/sandbox/cuda/rng_curand.py
-"""
-Define CURAND_RandomStreams - backed by CURAND.
-"""
 from __future__ import absolute_import, print_function, division
-__authors__ = "James Bergstra"
-__copyright__ = "(c) 2011, University of Montreal"
-__license__ = "3-clause BSD License"
-__contact__ = "theano-dev@googlegroups.com"
 import numpy
 import theano.gof
 from theano.compat import PY3
@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt)
 from theano.compile import optdb
 from theano.gof import local_optimizer, Variable
+__authors__ = "James Bergstra"
+__copyright__ = "(c) 2011, University of Montreal"
+__license__ = "3-clause BSD License"
+__contact__ = "theano-dev@googlegroups.com"
+"""
+Define CURAND_RandomStreams - backed by CURAND.
+"""
 config = theano.config
 class CURAND_Base(GpuOp):
-    """ 
+    """
    Base class for a random number generator implemented in CURAND.
    The random number generator itself is an opaque reference managed by
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
        Return a tuple of attributes that define the Op.
        """
-        return (
+        return (self.destructive,
-                self.destructive,
                self.output_type,
                self.seed,
                )
@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp):
    def make_node(self, generator, size):
        return theano.gof.Apply(self, [generator, size],
-                [generator.type(), self.output_type()])
+                                [generator.type(), self.output_type()])
    @classmethod
    def new_auto_update(cls, generator, ndim, dtype, size, seed):
@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp):
        v_size = theano.tensor.as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
-        self = cls(
+        self = cls(output_type=CudaNdarrayType((False,) * ndim),
-                output_type=CudaNdarrayType((False,) * ndim),
+                   seed=seed,
-                seed=seed,
+                   destructive=False)
-                destructive=False)
        o_gen, sample = self(generator, cast(v_size, 'int32'))
@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object):
    RandomStreams instance that creates CURAND-based random variables.
    One caveat is that generators are not serializable.
    Parameters
    ----------
    seed : int
@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object):
        return rval
    def uniform(self, size, low=0.0, high=1.0, ndim=None,
-            dtype=config.floatX):
+                dtype=config.floatX):
        """
        Return symbolic tensor of uniform numbers.
@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object):
        if isinstance(size, tuple):
            msg = "size must be a tuple of int or a Theano variable"
            assert all([isinstance(i, int) or isinstance(i, Variable)
-                for i in size]), msg
+                        for i in size]), msg
        else:
            msg = "size must be a tuple of int or a Theano variable"
            assert isinstance(size, Variable) and size.ndim == 1, msg
        generator = theano.shared(False)  # makes a generic
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size,
-                self.next_seed())
+                                           self.next_seed())
        self.state_updates.append(u.update)
        rval = u * (high - low) + low
        if u.type.broadcastable != rval.type.broadcastable:
@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object):
                'Increase the size to match the broadcasting pattern of '
                'low and `high` arguments'
            )
-        return  rval
+        return rval
    def normal(self, size=None, avg=0.0, std=1.0, ndim=None,
-            dtype=config.floatX):
+               dtype=config.floatX):
        """
        Return symbolic tensor of normally-distributed numbers.
@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object):
        if isinstance(size, tuple):
            msg = "size must be a tuple of int or a Theano variable"
            assert all([isinstance(i, int) or isinstance(i, Variable)
-                for i in size]), msg
+                        for i in size]), msg
        else:
            msg = "size must be a tuple of int or a Theano variable"
            assert isinstance(size, Variable) and size.ndim == 1, msg
        generator = theano.shared(False)  # makes a generic
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
-                self.next_seed())
+                                          self.next_seed())
        self.state_updates.append(u.update)
        rval = u * std + avg
        if u.type.broadcastable != rval.type.broadcastable:
@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object):
                'Increase the size to match the broadcasting pattern of `low`'
                'and `high` arguments'
            )
-        return  rval
+        return rval
 @local_optimizer([CURAND_Base])
@@ -386,5 +383,5 @@ def local_destructive(node):
        return new_op.make_node(*node.inputs).outputs
    return False
 optdb.register('CURAND_destructive',
-        opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run',
+               opt.in2out(local_destructive, ignore_newtrees=True),
-                   'inplace')
+               99, 'fast_run', 'inplace')
--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
--- a/theano/sandbox/cuda/tests/test_bench_loopfusion.py
+++ b/theano/sandbox/cuda/tests/test_bench_loopfusion.py
--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
--- a/theano/sandbox/cuda/tests/test_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_cuda_ndarray.py
--- a/theano/sandbox/cuda/tests/test_driver.py
+++ b/theano/sandbox/cuda/tests/test_driver.py
@@ -6,7 +6,7 @@ import theano
 try:
    from nose.plugins.skip import SkipTest
    import theano.sandbox.cuda as cuda_ndarray
-    if cuda_ndarray.cuda_available == False:
+    if cuda_ndarray.cuda_available is False:
        raise SkipTest('Optional package cuda disabled')
 except ImportError:
    # To have the GPU back-end work without nose, we need this file to
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
-        msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
+        msg = '\n\t'.join(
-            'but got:']+[str(app) for app in topo])
+            ['Expected exactly one occurrence of GpuCAReduce ' +
+             'but got:'] + [str(app) for app in topo])
        raise AssertionError(msg)
    if not numpy.allclose(f(), a.sum()):
        raise Exception("The nvidia driver version installed with this OS "

--- a/theano/sandbox/cuda/tests/test_extra_ops.py
+++ b/theano/sandbox/cuda/tests/test_extra_ops.py
@@ -5,24 +5,22 @@ import itertools
 from nose.plugins.skip import SkipTest
 import numpy as np
 from six.moves import xrange
+from theano import tensor as T
+import theano
+from theano.tensor.extra_ops import cumsum, CumsumOp
+from theano.tests import unittest_tools as utt
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available is False:
+if cuda_ndarray.cuda_available:
+    import theano.tensor.tests.test_extra_ops
+    from theano.sandbox.cuda.extra_ops import GpuCumsum
+else:
    raise SkipTest('Optional package cuda disabled')
-import theano.tensor.tests.test_extra_ops
-from theano.sandbox.cuda.extra_ops import GpuCumsum
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
-from theano import tensor as T
-import theano
-from theano.tensor.extra_ops import cumsum, CumsumOp
-from theano.tests import unittest_tools as utt
 class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
    mode = mode_with_gpu
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
        # Use multiple GPU threadblocks
-        a = np.random.random((block_max_size+2,)).astype("float32")
+        a = np.random.random((block_max_size + 2,)).astype("float32")
        utt.assert_allclose(np.cumsum(a), f(a))
        # Use recursive cumsum
-        a = np.ones((block_max_size*(block_max_size+1)+2,),
+        a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                    dtype="float32")
        utt.assert_allclose(np.cumsum(a), f(a))
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks
            a_shape = [5, 5]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks
            a_shape = [4, 4]
-            a_shape[1-shape_axis] = self.max_grid_size1+1
+            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
            # Use recursive cumsum
            a_shape = [3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (
+                block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum3D(self):
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use recursive cumsum (along accumulation axis)
            a_shape = [3, 3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (
+                block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype(
+                "float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum4D(self):

--- a/theano/sandbox/cuda/tests/test_gemmcorr3d.py
+++ b/theano/sandbox/cuda/tests/test_gemmcorr3d.py
 from __future__ import absolute_import, print_function, division
 import unittest
 import numpy
-import copy
 import theano
 from theano.tests import unittest_tools as utt
 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-import theano.sandbox.cuda as cuda_ndarray
-if not cuda_ndarray.cuda_available:
-    raise SkipTest('Optional package cuda not available')
 from theano.sandbox.cuda import float32_shared_constructor as shared
 from theano.sandbox.cuda.blas import (
    GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
 from theano.sandbox.cuda.basic_ops import gpu_contiguous
+import theano.sandbox.cuda as cuda_ndarray
+if not cuda_ndarray.cuda_available:
+    raise SkipTest('Optional package cuda not available')
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
-        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample,
+        conv = theano.tensor.nnet.convTransp3D(W=filters,
+                                               b=bias,
+                                               d=subsample,
                                               H=inputs)
        f_ref = theano.function([], conv)
        res_ref = f_ref()

--- a/theano/sandbox/cuda/tests/test_gradient.py
+++ b/theano/sandbox/cuda/tests/test_gradient.py
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')

--- a/theano/sandbox/cuda/tests/test_memory.py
+++ b/theano/sandbox/cuda/tests/test_memory.py
@@ -11,7 +11,7 @@ from theano import ifelse
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-if cuda.cuda_available == False:
+if cuda.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
        theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
        return ("(n malloc/theano mem allocated in KB)",
                n_mallocs + extra_alloc,
-                int(theano_alloc / 1024) + extra_size)
+                int(theano_alloc / 1024))
    return ("n malloc on the gpu", n_mallocs + extra_alloc)
    # I don't use the following by default as if there is other stuff running
@@ -83,9 +83,12 @@ def test_memory():
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
-        print("Shared took ", np.prod(variables.get_value(
+        print("Shared took ",
-                borrow=True,
+              np.prod(variables.get_value(
-                return_internal_type=True).shape) * 4 / 1024, "kB")
+                  borrow=True,
+                  return_internal_type=True).shape) *
+              4 / 1024,
+              "kB")
        mem2 = freemem()
        print("Before compilation", mem2)
@@ -112,7 +115,7 @@ def test_memory():
        del obj
        # print "After deleting function 1", freemem()
-        #assert mem2 == freemem(), (mem2, freemem())
+        # assert mem2 == freemem(), (mem2, freemem())
        del grad
        print("After deleting function 2", freemem())
@@ -155,16 +158,19 @@ def test_memory_lazy():
        derp = ifelse.IfElse(1)(branch_select,
                                derp, some_matrix[:shapes[0]].sum())
        derp += 1
-        print("Shared took ", np.prod(variables.get_value(
+        print("Shared took ",
-                borrow=True,
+              np.prod(variables.get_value(
-                return_internal_type=True).shape) * 4 / 1024, "kB")
+                  borrow=True,
+                  return_internal_type=True).shape) *
+              4 / 1024,
+              "kB")
        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        obj = theano.function([some_vector, branch_select], derp,
                              mode=mode_with_gpu)
-        #theano.printing.debugprint(obj, print_type=True)
+        # theano.printing.debugprint(obj, print_type=True)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3)

--- a/theano/sandbox/cuda/tests/test_mlp.py
+++ b/theano/sandbox/cuda/tests/test_mlp.py
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
                   'otherwise it is too slow!')
 # Skip test if cuda_ndarray is not available.
-if tcn.cuda_available == False:
+if tcn.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
@@ -68,7 +68,7 @@ def print_mode(mode):
 def print_diff_mode(a, b):
    if (a is not None and
        isinstance(a, (theano.compile.ProfileMode,)) and
-        isinstance(b, (theano.compile.ProfileMode,))):
+       isinstance(b, (theano.compile.ProfileMode,))):
        a.print_diff_summary(b)
@@ -138,8 +138,8 @@ def test_run_nnet():
            # print "cpu:", rval_cpu
            # print "gpu:", rval_gpu
            abs_diff, rel_diff = \
-                    theano.gradient.numeric_grad.abs_rel_err(rval_gpu,
+                theano.gradient.numeric_grad.abs_rel_err(rval_gpu,
-                                                             rval_cpu)
+                                                         rval_cpu)
            max_abs_diff = abs_diff.max()
            # print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % (
            #    max_abs_diff, rel_diff.max(), n_in, n_hid)
@@ -147,19 +147,20 @@ def test_run_nnet():
            rtol = 1e-4
            if n_in * n_hid >= 2048 * 4096:
                rtol = 7e-4
-            assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
+            assert numpy.allclose(
-                   ("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
+                rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
-                    rel_diff.max(), n_in, n_hid)
+                ("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
+                 rel_diff.max(), n_in, n_hid)
 def test_run_nnet_med():
    utt.seed_rng()
-    rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000)
+    run_nnet(False, 10, 128, 50, 4, n_train=10000)
 def test_run_nnet_small():
    utt.seed_rng()
-    rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000)
+    run_nnet(False, 10, 10, 4, 4, n_train=100000)
 def run_conv_nnet1(use_gpu):
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
    mode = get_mode(use_gpu)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
 #    for i, n in enumerate(train.maker.fgraph.toposort()):
 #        print i, n
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu):  # pretend we are training LeNet for MNIST
    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
    conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
-         logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
+                            logical_hid_shape[1] // 2),
+                           shape_kern1[2:],
+                           n_kern1, n_batch, 1, 1)
    hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
    hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu):  # pretend we are training LeNet for MNIST
    mode = get_mode(use_gpu)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
 #    for i, n in enumerate(train.maker.fgraph.toposort()):
 #        print i, n
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
    if downsample_ops:
        hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
    else:
-        hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')
+        hid = tensor.tanh(
-            ))[:, :, ::2, ::2])
+            (conv_op(x, w0) + b0.dimshuffle(
+                (0, 'x', 'x')))[:, :, ::2, ::2])
    hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
    hid_flat = hid1.reshape((n_batch, n_hid))
    out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
-    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out,
+    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
-         tensor.argmax(y, axis=1)) * lr)
+        out, tensor.argmax(y, axis=1)) * lr)
    # print 'loss type', loss.type
    params = [w0, b0, w1, b1, v, c]
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
    mode = get_mode(use_gpu, check_isfinite)
    # print 'building pfunc ...'
-    train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
+    train = pfunc(
-        g in zip(params, gparams)])
+        [x, y, lr],
+        [loss],
+        mode=mode,
+        updates=[(p, p - g) for p, g in zip(params, gparams)])
    if verbose:
        theano.printing.debugprint(train)
@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
    utt.seed_rng(seed)  # Seeds numpy.random with seed
    train, params, x_shape, y_shape, mode = build_conv_nnet2_classif(
-            use_gpu=use_gpu,
+        use_gpu=use_gpu,
-            isize=isize,
+        isize=isize,
-            ksize=ksize,
+        ksize=ksize,
-            n_batch=bsize,
+        n_batch=bsize,
-            verbose=verbose,
+        verbose=verbose,
-            version=version,
+        version=version,
-            check_isfinite=check_isfinite)
+        check_isfinite=check_isfinite)
    if use_gpu:
        device = 'GPU'
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
    lr = theano._asarray(0.01, dtype='float32')
    rvals = my_zeros(n_train)
-    t0 = time.time()
    for i in xrange(n_train):
        rvals[i] = train(xval, yval, lr)[0]
-    t1 = time.time()
    print_mode(mode)
    if pickle and isinstance(mode, theano.compile.ProfileMode):
@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
            compare = True
        if not compare:
-            return run_conv_nnet2_classif(use_gpu=use_gpu,
+            return run_conv_nnet2_classif(
-                    seed=seed, isize=isize, ksize=ksize, bsize=bsize,
+                use_gpu=use_gpu,
-                    n_train=n_train,
+                seed=seed, isize=isize, ksize=ksize, bsize=bsize,
-                    check_isfinite=check_isfinite,
+                n_train=n_train,
-                    pickle=pickle,
+                check_isfinite=check_isfinite,
-                    verbose=verbose,
+                pickle=pickle,
-                    version=version)
+                verbose=verbose,
+                version=version)
        utt.seed_rng(seed)  # Seeds numpy.random with seed
        train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \
-                build_conv_nnet2_classif(
+            build_conv_nnet2_classif(
-                        use_gpu=False,
+                use_gpu=False,
-                        isize=isize,
+                isize=isize,
-                        ksize=ksize,
+                ksize=ksize,
-                        n_batch=bsize,
+                n_batch=bsize,
-                        verbose=verbose,
+                verbose=verbose,
-                        version=version,
+                version=version,
-                        check_isfinite=check_isfinite)
+                check_isfinite=check_isfinite)
        utt.seed_rng(seed)  # Seeds numpy.random with seed
        train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \
-                build_conv_nnet2_classif(
+            build_conv_nnet2_classif(
-                        use_gpu=True,
+                use_gpu=True,
-                        isize=isize,
+                isize=isize,
-                        ksize=ksize,
+                ksize=ksize,
-                        n_batch=bsize,
+                n_batch=bsize,
-                        verbose=verbose,
+                verbose=verbose,
-                        version=version,
+                version=version,
-                        check_isfinite=check_isfinite)
+                check_isfinite=check_isfinite)
        assert x_shape == x_shape_gpu
        assert y_shape == y_shape_gpu
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
    finally:
        theano.tensor.basic.float32_atol = orig_float32_atol
-    if pickle:
-        if isinstance(cpu_mode, theano.compile.ProfileMode):
-            import pickle
-            print("BEGIN CPU profile mode dump")
-            print(pickle.dumps(cpu_mode))
-            print("END CPU profile mode dump")
-        if isinstance(gpu_mode, theano.compile.ProfileMode):
-            import pickle
-            print("BEGIN GPU profile mode dump")
-            print(pickle.dumps(gpu_mode))
-            print("END GPU profile mode dump")
    # print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
    #        (time_cpu, time_gpu, time_cpu/time_gpu))
    # print "Estimated time for one pass through MNIST with CPU: %f" % (

--- a/theano/sandbox/cuda/tests/test_neighbours.py
+++ b/theano/sandbox/cuda/tests/test_neighbours.py
 # Skip test if cuda_ndarray is not available.
 from __future__ import absolute_import, print_function, division
 from nose.plugins.skip import SkipTest
+import unittest
+import theano.tensor.nnet.tests.test_neighbours
+from theano.sandbox.cuda.neighbours import GpuImages2Neibs
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')
-import theano.tensor.nnet.tests.test_neighbours
-from theano.sandbox.cuda.neighbours import GpuImages2Neibs
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
--- a/theano/sandbox/cuda/tests/test_rng_curand.py
+++ b/theano/sandbox/cuda/tests/test_rng_curand.py
--- a/theano/sandbox/cuda/tests/test_tensor_op.py
+++ b/theano/sandbox/cuda/tests/test_tensor_op.py
--- a/theano/sandbox/cuda/tests/test_var.py
+++ b/theano/sandbox/cuda/tests/test_var.py
--- a/theano/sandbox/cuda/tests/test_viewop.py
+++ b/theano/sandbox/cuda/tests/test_viewop.py
--- a/theano/sandbox/cuda/tests/walltime.py
+++ b/theano/sandbox/cuda/tests/walltime.py
--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py