Merge pull request #3095 from harlouci/flake8_v4

flake8 for tensor/nnet/nnet.py

Merge pull request #3095 from harlouci/flake8_v4
03e77233 · Frédéric Bastien · f4edcc59 · 9b457370 · 03e77233 · 03e77233
--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
 from __future__ import print_function
+import numpy as N
 from six.moves import xrange
 import theano
 from theano.tensor import basic as T
-import numpy as N
+# from util import strutil
-#from util import strutil
 from theano.tensor.blas_headers import blas_header_text, blas_header_version
 from theano.tensor.blas import ldflags
 from theano.misc import strutil
@@ -72,26 +74,28 @@ class Conv3D(theano.Op):
    def grad(self, inputs, output_gradients):
        V, W, b, d = inputs
-        dCdH , = output_gradients
+        dCdH, = output_gradients
        # make all of these ops support broadcasting of scalar b to vector b and eplace the zeros_like in all their grads
        # print dCdH.broadcastable
        # print "dCdH.broadcastable"
        # quit(-1)
-        #dCdH = printing.Print("dCdH = ",["shape"])
+        # dCdH = printing.Print("dCdH = ",["shape"])
        # Make sure the broadcasting pattern of the gradient is the the same
        # as the initial variable
-        dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
+        dCdV = theano.tensor.nnet.convTransp3D(
+            W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
        dCdV = T.patternbroadcast(dCdV, V.broadcastable)
        WShape = W.shape
-        dCdW = ConvGrad3D.convGrad3D(V, d, WShape, dCdH)
+        dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
        dCdW = T.patternbroadcast(dCdW, W.broadcastable)
        dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
        dCdb = T.patternbroadcast(dCdb, b.broadcastable)
-        dCdd = grad_undefined(self, 3, inputs[3],
+        dCdd = grad_undefined(
-                "The gradient of Conv3D with respect to the convolution" +\
+            self, 3, inputs[3],
-                " stride is undefined because Conv3D is only defined for" +\
+            "The gradient of Conv3D with respect to the convolution"
-                " integer strides.")
+            " stride is undefined because Conv3D is only defined for"
+            " integer strides.")
        if 'name' in dir(dCdH) and dCdH.name is not None:
            dCdH_name = dCdH.name
@@ -113,11 +117,13 @@ class Conv3D(theano.Op):
        else:
            b_name = 'anon_b'
-        dCdV.name = 'Conv3D_dCdV(dCdH='+dCdH_name+',V='+V_name+')'
+        dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
-        dCdW.name = 'Conv3D_dCdW(dCdH='+dCdH_name+',V='+V_name+',W='+W_name+')'
+        dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name +
-        dCdb.name = 'Conv3D_dCdb(dCdH='+dCdH_name+',V='+V_name+',W='+W_name+',b='+b_name+')'
+                     ',W=' + W_name + ')')
+        dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name +
+                     ',W=' + W_name + ',b=' + b_name + ')')
-        return [ dCdV, dCdW, dCdb, dCdd ]
+        return [dCdV, dCdW, dCdb, dCdd]
    def perform(self, node, inputs, output_storage):
        V, W, b, d = inputs
@@ -144,7 +150,7 @@ class Conv3D(theano.Op):
        output_width = T.floor((vidWidth - filterWidth) // dc) + 1
        output_dur = T.floor((vidDur - filterDur) // dt) + 1
-        rval = (batch_size,  output_height, output_width, output_dur, output_channels )
+        rval = (batch_size, output_height, output_width, output_dur, output_channels)
        return [rval]
@@ -155,7 +161,7 @@ class Conv3D(theano.Op):
        return ldflags()
    def c_compile_args(self):
-        flags =  ldflags(libs=False, flags=True)
+        flags = ldflags(libs=False, flags=True)
        return flags
    def c_lib_dirs(self):
@@ -170,7 +176,7 @@ class Conv3D(theano.Op):
        H = outputs[0]
-        codeSource =  """
+        codeSource = """
            ///////////// < code generated by Conv3D >
            //printf("\t\t\t\tConv3D c code\\n");
@@ -320,13 +326,13 @@ class Conv3D(theano.Op):
        VV, WV, bv, dv = node.inputs
        HV = node.outputs[0]
        if (theano.config.blas.ldflags and
-            VV.dtype == WV.dtype and HV.dtype == VV.dtype):
+                VV.dtype == WV.dtype and HV.dtype == VV.dtype):
            if VV.dtype == 'float64':
                gemv = 'dgemv_'
            elif VV.dtype == 'float32':
                gemv = 'sgemv_'
            else:
-                raise Exception('Unrecognized dtype for convolution '+V.value.dtype)
+                raise Exception('Unrecognized dtype for convolution ' + V.value.dtype)
            codeSource += """
            if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0)))
@@ -571,7 +577,7 @@ def computeH(V, W, b, d):
    outputChannels = W.shape[0]
    inputChannels = V.shape[4]
    if W.shape[4] != inputChannels:
-        raise Exception("W.shape[4] = "+str(W.shape[4])+" but inputChannels = "+str(inputChannels))
+        raise Exception("W.shape[4] = " + str(W.shape[4]) + " but inputChannels = " + str(inputChannels))
    filterHeight = W.shape[1]
    filterWidth = W.shape[2]
    filterDur = W.shape[3]
@@ -586,12 +592,12 @@ def computeH(V, W, b, d):
    assert dy > 0
    assert dt > 0
-    outputHeight = int( (vidHeight - filterHeight) / dx )+1
+    outputHeight = int((vidHeight - filterHeight) / dx) + 1
-    outputWidth = int( (vidWidth - filterWidth) / dy )+1
+    outputWidth = int((vidWidth - filterWidth) / dy) + 1
-    outputDur = int( (vidDur - filterDur) / dt ) + 1
+    outputDur = int((vidDur - filterDur) / dt) + 1
-    H =  N.zeros( (batchSize,  outputHeight,
+    H = N.zeros((batchSize, outputHeight,
-        outputWidth, outputDur, outputChannels ), dtype=V.dtype )
+                outputWidth, outputDur, outputChannels), dtype=V.dtype)
    # H[i,j,x,y,t] = b_j + sum_k sum_l sum_m sum_z W[j,z,k,l,m] V[i,z, dx*x+k,dy*y+l,dt*t+m]
    for i in xrange(0, H.shape[0]):
@@ -610,12 +616,8 @@ def computeH(V, W, b, d):
                                        # if (i,j,x,y,t) == (0,0,0,0,0):
                                        #    print (( W[j,z,k,l,m] , V[i,z,d[0]*x+k,d[1]*y+l,d[2]*t+m] ), (k,l,m) )
                                        w = W[j, k, l, m, z]
-                                        v = V[i, d[0]*x+k, d[1]*y+l, d[2]*t+m, z]
+                                        v = V[i, d[0] * x + k, d[1] * y + l, d[2] * t + m, z]
                                        # if i == 0 and x == 0 and y == 0 and t == 0 and j == 0:
                                        #    print 'setting H[0] += '+str(w*v)+'   W['+str((j,z,k,l,m))+']='+str(w)+'   V['+str((i,d[0]*x+k,d[1]*y+l,d[2]*t+m,z))+']='+str(v)
                                        H[i, x, y, t, j] += w * v
    return H
-from . import ConvGrad3D
-from . import ConvTransp3D
--- a/theano/tensor/nnet/ConvGrad3D.py
+++ b/theano/tensor/nnet/ConvGrad3D.py
+from six.moves import xrange
+import numpy as N
 import theano
 from theano.tensor import basic as T
 from theano.misc import strutil
-import numpy as N
-from six.moves import xrange
 from theano.gradient import grad_undefined
 from theano.gradient import DisconnectedType
@@ -23,11 +25,15 @@ class ConvGrad3D(theano.Op):
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = T.as_tensor_variable(dCdH)
-        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], outputs=[ T.TensorType(V_.dtype, (False, False, False, False, False))() ] )
+        return theano.Apply(self,
+                            inputs=[V_, d_, WShape_, dCdH_],
+                            outputs=[T.TensorType(
+                                V_.dtype,
+                                (False, False, False, False, False))()])
    def infer_shape(self, node, input_shapes):
        V, d, W_shape, dCdH = node.inputs
-        return [ ( W_shape[0], W_shape[1], W_shape[2], W_shape[3], W_shape[4] ) ]
+        return [(W_shape[0], W_shape[1], W_shape[2], W_shape[3], W_shape[4])]
    def connection_pattern(self, node):
@@ -38,12 +44,12 @@ class ConvGrad3D(theano.Op):
        dLdA, = output_gradients
        z = T.zeros_like(C[0, 0, 0, 0, :])
-        dLdC = convTransp3D(dLdA, z, d, B, C.shape[1:4])
+        dLdC = theano.tensor.nnet.convTransp3D(dLdA, z, d, B, C.shape[1:4])
        # d actually does affect the outputs, so it's not disconnected
        dLdd = grad_undefined(self, 1, d)
        # The shape of the weights doesn't affect the output elements
        dLdWShape = DisconnectedType()()
-        dLdB = conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)
+        dLdB = theano.tensor.nnet.conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)
        return [dLdC, dLdd, dLdWShape, dLdB]
@@ -54,15 +60,10 @@ class ConvGrad3D(theano.Op):
        # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
-        outputFilters = dCdH.shape[4]
        outputHeight = dCdH.shape[1]
        outputWidth = dCdH.shape[2]
        outputDur = dCdH.shape[3]
        assert V.shape[0] == batchSize
-        inputFilters = V.shape[4]
-        inputHeight = V.shape[1]
-        inputWidth = V.shape[2]
-        inputDur = V.shape[3]
        dr, dc, dt = d
        dCdW = N.zeros(WShape, dtype=V.dtype)
@@ -78,7 +79,10 @@ class ConvGrad3D(theano.Op):
                                for r in xrange(0, outputDur):
                                    for j in xrange(0, WShape[0]):
                                        for z in xrange(0, WShape[4]):
-                                            dCdW[j, k, l, m, z] +=  dCdH[i, p, q, r, j] * V[i, dr*p+k, dc*q+l, dt*r+m, z]
+                                            dCdW[j, k, l, m, z] += (
+                                                dCdH[i, p, q, r, j] *
+                                                V[i, dr * p + k, dc * q + l,
+                                                  dt * r + m, z])
        output_storage[0][0] = dCdW
@@ -272,6 +276,3 @@ class ConvGrad3D(theano.Op):
 convGrad3D = ConvGrad3D()
-from theano.tensor.nnet.Conv3D import conv3D
-from theano.tensor.nnet.ConvTransp3D import convTransp3D
--- a/theano/tensor/nnet/ConvTransp3D.py
+++ b/theano/tensor/nnet/ConvTransp3D.py
 from __future__ import print_function
 import numpy as N
 from six.moves import xrange
+import theano
 from theano.tensor import basic as T
 from theano.misc import strutil
-import theano
 from theano.gradient import grad_undefined
 from theano.gradient import DisconnectedType
@@ -31,12 +33,15 @@ class ConvTransp3D(theano.Op):
        else:
            RShape_ = T.as_tensor_variable([-1, -1, -1])
-        return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_], outputs=[ T.TensorType(H_.dtype, (False, False, False, False, False))() ] )
+        return theano.Apply(self,
+                            inputs=[W_, b_, d_, H_, RShape_],
+                            outputs=[T.TensorType(H_.dtype,
+                                     (False, False, False, False, False))()])
    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
        W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
-        return [(H_shape[0],  RShape[0], RShape[1], RShape[2], W_shape[4])]
+        return [(H_shape[0], RShape[0], RShape[1], RShape[2], W_shape[4])]
    def connection_pattern(self, node):
        return [[True], [True], [True], [True], [False]]
@@ -44,9 +49,9 @@ class ConvTransp3D(theano.Op):
    def grad(self, inputs, output_gradients):
        W, b, d, H, RShape = inputs
        dCdR, = output_gradients
-        dCdH = conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
+        dCdH = theano.tensor.nnet.conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
        WShape = W.shape
-        dCdW = convGrad3D(dCdR, d, WShape, H)
+        dCdW = theano.tensor.nnet.convGrad3D(dCdR, d, WShape, H)
        dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
        # not differentiable, since d affects the output elements
        dCdd = grad_undefined(self, 2, d)
@@ -73,11 +78,13 @@ class ConvTransp3D(theano.Op):
        else:
            b_name = 'anon_b'
-        dCdW.name = 'ConvTransp3D_dCdW.H='+H_name+',dCdR='+dCdR_name+',W='+W_name
+        dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
-        dCdb.name = 'ConvTransp3D_dCdb.H='+H_name+',dCdR='+dCdR_name+',W='+W_name+',b='+b_name
+                     ',W=' + W_name)
+        dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
+                     ',W=' + W_name + ',b=' + b_name)
        dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name
-        return [dCdW,  dCdb, dCdd, dCdH, dCdRShape]
+        return [dCdW, dCdb, dCdd, dCdH, dCdRShape]
    def perform(self, node, inputs, output_storage):
        W, b, d, H, RShape = inputs
@@ -335,7 +342,7 @@ def computeR(W, b, d, H, Rshape=None):
    assert len(b.shape) == 1
    assert len(d) == 3
-    outputChannels,  filterHeight, filterWidth, filterDur, \
+    outputChannels, filterHeight, filterWidth, filterDur, \
        inputChannels = W.shape
    batchSize, outputHeight, outputWidth, outputDur, \
        outputChannelsAgain = H.shape
@@ -367,7 +374,7 @@ def computeR(W, b, d, H, Rshape=None):
    # print "video size: "+str((videoHeight, videoWidth, videoDur))
    R = N.zeros((batchSize, videoHeight,
-            videoWidth, videoDur, inputChannels), dtype=H.dtype)
+                videoWidth, videoDur, inputChannels), dtype=H.dtype)
    # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
    for i in xrange(0, batchSize):
@@ -404,8 +411,8 @@ def computeR(W, b, d, H, Rshape=None):
                                    if tk < 0:
                                        break
-                                    R[
+                                    R[i, r, c, t, j] += N.dot(
-                                        i, r, c, t, j] += N.dot(W[:, rk, ck, tk, j], H[i, rc, cc, tc, :] )
+                                        W[:, rk, ck, tk, j], H[i, rc, cc, tc, :])
                                    tc += 1
                                ""  # close loop over tc
@@ -421,7 +428,3 @@ def computeR(W, b, d, H, Rshape=None):
    ""  # close loop over i
    return R
-from theano.tensor.nnet.Conv3D import conv3D
-from theano.tensor.nnet.ConvGrad3D import convGrad3D
--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
-from __future__ import print_function
 """
 Contains an Op for convolving input images with a set of filters. This was
 developed especially for Convolutional Neural Networks.
@@ -9,7 +8,7 @@ tensor.signal and tensor.signal.downsample.
 See especially conv2d().
 """
-__docformat__ = "restructuredtext en"
+from __future__ import print_function
 import logging
@@ -17,12 +16,11 @@ import numpy
 from six.moves import xrange
 import theano
+from theano import OpenMPOp
 from theano.tensor import (as_tensor_variable, blas, get_scalar_constant_value,
                           patternbroadcast, NotScalarConstantError)
-from theano import OpenMPOp, config
 from theano.gof import Apply
-imported_scipy_signal = False
 try:
    # TODO: move these back out to global scope when they no longer
    # cause an atexit error
@@ -30,8 +28,9 @@ try:
    from scipy.signal.sigtools import _convolve2d
    imported_scipy_signal = True
 except ImportError:
-    pass
+    imported_scipy_signal = False
+__docformat__ = "restructuredtext en"
 _logger = logging.getLogger("theano.tensor.nnet.conv")
@@ -103,7 +102,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
                try:
                    image_shape[i] = get_scalar_constant_value(
                        as_tensor_variable(image_shape[i]))
-                except NotScalarConstantError as e:
+                except NotScalarConstantError:
                    raise NotScalarConstantError(
                        "The convolution need that the shape"
                        " information are constant values. We got"
@@ -118,7 +117,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
                try:
                    filter_shape[i] = get_scalar_constant_value(
                        as_tensor_variable(filter_shape[i]))
-                except NotScalarConstantError as e:
+                except NotScalarConstantError:
                    raise NotScalarConstantError(
                        "The convolution need that the shape"
                        " information are constant values. We got"
@@ -267,9 +266,9 @@ class ConvOp(OpenMPOp):
        # with s=1 for mode=='full' and s=-1 for mode=='valid'.
        # To support symbolic shapes, we express this with integer arithmetics.
        return tuple(None if i is None or k is None
-                else ((i - k) // d + 1) if mode == 'valid'
+                     else ((i - k) // d + 1) if mode == 'valid'
-                else ((i + k + d - 2) // d)
+                     else ((i + k + d - 2) // d)
-                for i, k, d in zip(inshp, kshp, stride))
+                     for i, k, d in zip(inshp, kshp, stride))
    def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
                 dx=1, dy=1,
@@ -402,11 +401,11 @@ class ConvOp(OpenMPOp):
        if dy is None:
            dy = 1
-        if  int(dx) != dx:
+        if int(dx) != dx:
            raise TypeError('ConvOp.__init__ param dx must be an int', dx)
        dx = int(dx)
-        if  int(dy) != dy:
+        if int(dy) != dy:
            raise TypeError('ConvOp.__init__ param dy must be an int', dy)
        dy = int(dy)
@@ -509,7 +508,7 @@ class ConvOp(OpenMPOp):
        self.out_mode = output_mode
-        if not self.out_mode in ["valid", "full"]:
+        if self.out_mode not in ["valid", "full"]:
            raise Exception("Mode %s not implemented" % self.out_mode)
        if any((shp is not None) and (shp <= 0) for shp in self.outshp):
@@ -520,9 +519,8 @@ class ConvOp(OpenMPOp):
                            (self.imshp_logical, self.kshp_logical))
        if (self.unroll_kern is None and
-            self.unroll_batch is None and
+                self.unroll_batch is None and
-            self.unroll_patch is None):
+                self.unroll_patch is None):
            # no version specified. Find the faster we have
            if self.bsize is None and self.nkern is None:
                self.unroll_patch = True
@@ -540,7 +538,7 @@ class ConvOp(OpenMPOp):
                time_unroll_batch_kern = 9999999
                for i in xrange(len(self.speed_unroll_batch_kern)):
                    if (bsize % self.speed_unroll_batch_kern[i][0] == 0 and
-                        nkern % self.speed_unroll_batch_kern[i][1] == 0):
+                            nkern % self.speed_unroll_batch_kern[i][1] == 0):
                        if self.speed_unroll_batch_kern[i][2 + mode_idx] < time_unroll_batch_kern:
                            time_unroll_batch_kern = self.speed_unroll_batch_kern[i][2 + mode_idx]
                            time_unroll_batch_kern_idx = i
@@ -613,7 +611,6 @@ class ConvOp(OpenMPOp):
        inputs - 4 dim: batches x stacksize x rows x cols
        kerns - 4 dim: nkern x stackidx x rows x cols
        """
-        outdim = kerns.ndim
        _inputs = as_tensor_variable(inputs)
        _kerns = as_tensor_variable(kerns)
        # TODO: lift this restriction by upcasting either inputs or kerns
@@ -631,7 +628,7 @@ class ConvOp(OpenMPOp):
        output = theano.tensor.tensor(dtype=_inputs.type.dtype,
                                      broadcastable=[_inputs.broadcastable[0],
                                                     _kerns.broadcastable[0]] +
-                                                     bcastable23)
+                                      bcastable23)
        return Apply(self, [_inputs, _kerns], [output])
@@ -778,7 +775,7 @@ class ConvOp(OpenMPOp):
                img2d2[:, :, kshp[0] - 1:kshp[0] - 1 + imshp[1],
                       kshp[1] - 1:kshp[1] - 1 + imshp[2]] = img2d
                img2d = img2d2
-            #N_image_shape = image_data.shape
+            # N_image_shape = image_data.shape
            for b in xrange(bsize):
                for n in xrange(nkern):
@@ -786,8 +783,10 @@ class ConvOp(OpenMPOp):
                    for im0 in xrange(stacklen):
                        for row in xrange(0, zz.shape[2], self.dx):
                            for col in xrange(0, zz.shape[3], self.dy):
-                                zz[b, n, row, col] += (img2d[b, im0, row:row + kshp[0], col:col + kshp[1]] *
+                                zz[b, n, row, col] += (
-                                                            filtersflipped[n, im0, ::-1, ::-1]).sum()
+                                    img2d[b, im0, row:row + kshp[0],
+                                          col:col + kshp[1]] *
+                                    filtersflipped[n, im0, ::-1, ::-1]).sum()
        # We copy it to remove the Stride mismatch warning from DEBUG_MODE.
        # The copy make that we return an object with the same stride as the c version.
@@ -843,8 +842,8 @@ class ConvOp(OpenMPOp):
            # mimic what happens inside theano.grad: get the input gradient
            # of the final cost wrt all variables involved.
-            return theano.gradient.grad(cost=None,
+            return theano.gradient.grad(cost=None, known_grads={node: gz},
-                    known_grads={node: gz}, wrt=[inputs, kerns])
+                                        wrt=[inputs, kerns])
        if self.dx not in (1, 2) or self.dy not in (1, 2):
            raise NotImplementedError(
@@ -858,7 +857,7 @@ class ConvOp(OpenMPOp):
            raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "
                            "the optional shape information")
-        ####### Determine gradient on kernels ########
+        # Determine gradient on kernels ########
        assert inputs.ndim == 4 and kerns.ndim == 4
        newin = inputs.dimshuffle((1, 0, 2, 3))
@@ -943,7 +942,7 @@ class ConvOp(OpenMPOp):
            dw = dw.dimshuffle((1, 0, 2, 3))
            dw = dw[:, :, ::-1, ::-1]
-        ####### Determine gradient on inputs ########
+        # Determine gradient on inputs ########
        mode = 'valid'
        if not self.out_mode == 'full':
            mode = 'full'
@@ -1011,11 +1010,10 @@ using namespace std;
        if self.out_mode == 'valid' and self.dx == 0 and self.dy == 0:
            # We use a faster version in those case.
            if (self.imshp != self.imshp_logical or
-                self.kshp != self.kshp_logical or
+                    self.kshp != self.kshp_logical or
-                self.unroll_patch or
+                    self.unroll_patch or
-                self.unroll_batch > 0 or
+                    self.unroll_batch > 0 or
-                self.unroll_kern > 0):
+                    self.unroll_kern > 0):
                return False
            return True
        return False
@@ -1029,8 +1027,7 @@ using namespace std;
        # when the ksph==(1,1) gcc 4.3.0 segfault during the
        # compilation with -O3.  This don't happen at -O2
        if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
-            self.kshp == (1, 1)):
+                self.kshp == (1, 1)):
            return ['-O3']
        else:
            return []
@@ -1041,7 +1038,7 @@ using namespace std;
        if self.use_blas():
            ret = blas.ldflags(libs=False, flags=True)
        if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
-            self.kshp == (1, 1)):
+                self.kshp == (1, 1)):
            ret += ['-O2']
        # Add the -fopenmp flags
        ret += super(ConvOp, self).c_compile_args()
@@ -1068,7 +1065,7 @@ using namespace std;
        d.update(sub)
        all_shape = (self.has_all_shape(self.imshp, self.kshp,
-                                       self.nkern, self.bsize) and
+                                        self.nkern, self.bsize) and
                     self.has_all_shape(self.imshp_logical, self.kshp_logical))
        d["self_out_mode"] = self.out_mode
@@ -1228,9 +1225,9 @@ if(%(value)s != %(expected)s){
            d["self_kshp_logical_stride_c"] = int(numpy.ceil(
                self.kshp_logical[1] / float(self.kshp[1])))
            d["self_imshp_logical_r"] = self.imshp_logical[1]
-                # numpy.B. 1  not 0
+            # numpy.B. 1  not 0
            d["self_imshp_logical_c"] = self.imshp_logical[2]
-                # numpy.B. 2  not 1
+            # numpy.B. 2  not 1
            d["self_imshp_logical_stride_r"] = int(numpy.ceil(
                self.imshp_logical[1] / float(self.imshp[1])))
            d["self_imshp_logical_stride_c"] = int(numpy.ceil(
@@ -1300,7 +1297,7 @@ if(kerns_dim[1] != img2d_dim[1]){
                              all_shape)
            return _conv_op_code_unroll_patch % d
        if ((self.unroll_batch is not None and self.unroll_batch > 0) or
-            (self.unroll_kern is not None and self.unroll_kern > 0)):
+                (self.unroll_kern is not None and self.unroll_kern > 0)):
            assert self.unroll_batch > 0
            assert self.unroll_kern > 0
            if self.verbose:

--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
@@ -194,13 +194,13 @@ def conv3d(signals, filters,
        _signals_shape_5d[2],
        _signals_shape_5d[3],
        _signals_shape_5d[4],
-        )
+    )
    _filters_shape_4d = (
        _filters_shape_5d[0] * _filters_shape_5d[1],
        _filters_shape_5d[2],
        _filters_shape_5d[3],
        _filters_shape_5d[4],
-        )
+    )
    if border_mode[1] != border_mode[2]:
        raise NotImplementedError('height and width bordermodes must match')
@@ -228,7 +228,7 @@ def conv3d(signals, filters,
            _filters_shape_5d[1],  # Tf
            _signals_shape_5d[3] - _filters_shape_5d[3] + 1,
            _signals_shape_5d[4] - _filters_shape_5d[4] + 1,
-            ))
+        ))
    elif border_mode[1] == 'full':
        out_tmp = out_4d.reshape((
            _signals_shape_5d[0],  # Ns
@@ -237,7 +237,7 @@ def conv3d(signals, filters,
            _filters_shape_5d[1],  # Tf
            _signals_shape_5d[3] + _filters_shape_5d[3] - 1,
            _signals_shape_5d[4] + _filters_shape_5d[4] - 1,
-            ))
+        ))
    elif border_mode[1] == 'same':
        raise NotImplementedError()
    else:
@@ -246,15 +246,15 @@ def conv3d(signals, filters,
    # now sum out along the Tf to get the output
    # but we have to sum on a diagonal through the Tf and Ts submatrix.
    if border_mode[0] == 'valid':
-        if _filters_shape_5d[1]!=1:
+        if _filters_shape_5d[1] != 1:
-          out_5d = diagonal_subtensor(out_tmp, 1, 3).sum(axis=3)
+            out_5d = diagonal_subtensor(out_tmp, 1, 3).sum(axis=3)
-        else: # for Tf==1, no sum along Tf, the Ts-axis of the output is unchanged!
+        else:  # for Tf==1, no sum along Tf, the Ts-axis of the output is unchanged!
-          out_5d = out_tmp.reshape((
+            out_5d = out_tmp.reshape((
-            _signals_shape_5d[0],
+                _signals_shape_5d[0],
-            _signals_shape_5d[1],
+                _signals_shape_5d[1],
-            _filters_shape_5d[0],
+                _filters_shape_5d[0],
-            _signals_shape_5d[3] - _filters_shape_5d[3] + 1,
+                _signals_shape_5d[3] - _filters_shape_5d[3] + 1,
-            _signals_shape_5d[4] - _filters_shape_5d[4] + 1,
+                _signals_shape_5d[4] - _filters_shape_5d[4] + 1,
            ))
    elif border_mode[0] in ('full', 'same'):
        raise NotImplementedError('sequence border mode', border_mode[0])
@@ -316,7 +316,7 @@ if cuda.cuda_available:
 def local_inplace_DiagonalSubtensor(node):
    """ also work for IncDiagonalSubtensor """
    if (isinstance(node.op, (DiagonalSubtensor, IncDiagonalSubtensor)) and
-        not node.op.inplace):
+            not node.op.inplace):
        new_op = node.op.__class__(inplace=True)
        new_node = new_op(*node.inputs)
        return [new_node]

--- a/theano/tensor/nnet/neighbours.py
+++ b/theano/tensor/nnet/neighbours.py
@@ -2,15 +2,15 @@
 TODO: implement Images2Neibs.infer_shape() methods
 """
-from six.moves import xrange
+import numpy
 import theano
 from theano import Op, Apply
 import theano.tensor as T
 from theano.gradient import grad_not_implemented
 from theano.gradient import grad_undefined
-import numpy
 class Images2Neibs(Op):
@@ -206,7 +206,7 @@ class Images2Neibs(Op):
                                z_col = j + d * i
                                z[0][z_row, z_col] = ten4[n, s, ten4_2, ten4_3]
    def infer_shape(self, node, input_shape):
        in_shape = input_shape[0]
        c, d = node.inputs[1]
@@ -223,7 +223,7 @@ class Images2Neibs(Op):
        z_dim0 = grid_c * grid_d * in_shape[1] * in_shape[0]
        z_dim1 = c * d
        return [(z_dim0, z_dim1)]
    def c_code(self, node, name, inp, out, sub):
        ten4, neib_shape, neib_step = inp
        z, = out
@@ -417,21 +417,21 @@ class Images2Neibs(Op):
 def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
-    """ 
+    """
    Function :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
-    allows to apply a sliding window operation to a tensor containing 
+    allows to apply a sliding window operation to a tensor containing
    images
-    or other two-dimensional objects. 
+    or other two-dimensional objects.
-    The sliding window operation loops 
+    The sliding window operation loops
-    over points in input data and stores a rectangular neighbourhood of 
+    over points in input data and stores a rectangular neighbourhood of
-    each point.   
+    each point.
-    It is possible to assign a step of selecting patches (parameter 
+    It is possible to assign a step of selecting patches (parameter
-    `neib_step`). 
+    `neib_step`).
-    :param ten4:     A 4-dimensional tensor which represents 
+    :param ten4:     A 4-dimensional tensor which represents
                     a list of lists of images.a list of lists of images.
                     It should have shape (list 1 dim, list 2 dim,
-                     row, col). The first two dimensions can be 
+                     row, col). The first two dimensions can be
                     useful to store different channels and batches.
    :type ten4:      A 4d tensor-like.
    :param neib_shape: A tuple containing two
@@ -442,20 +442,20 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
    :type neib_shape: A 1d tensor-like of 2 values.
    :param neib_step: (dr,dc) where dr is the number of rows to
                      skip between patch and dc is the number of
-                      columns. The parameter should be a tuple of two elements: 
+                      columns. The parameter should be a tuple of two elements:
-                      number 
+                      number
-                      of rows and number of columns to skip each iteration. 
+                      of rows and number of columns to skip each iteration.
                      Basically, when the step is 1, the neighbourhood of every
-                      first element is taken and every possible rectangular 
+                      first element is taken and every possible rectangular
                      subset is returned. By default it is equal to
                      `neib_shape` in other words, the
-                      patches are disjoint. When the step is greater than 
+                      patches are disjoint. When the step is greater than
                      `neib_shape`, some elements are omitted. When None, this
                      is the same as
                      neib_shape(patch are disjoint)
-                      .. note:: Currently the step size should be chosen in the way that the 
+                      .. note:: Currently the step size should be chosen in the way that the
-                         corresponding dimension :math:`i` (width or height) is equal to 
+                         corresponding dimension :math:`i` (width or height) is equal to
                         :math:`n * step\_size_i + neib\_shape_i` for some :math:`n`
    :type neib_step: A 1d tensor-like of 2 values.
    :param mode:
@@ -489,29 +489,29 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
                                  = flattened version of ten4[i,j,l:l+r,k:k+c]
                             idx += 1
-          .. note:: The operation isn't necessarily implemented internally with 
+          .. note:: The operation isn't necessarily implemented internally with
-             these for loops, they're just the easiest way to describe the 
+             these for loops, they're just the easiest way to describe the
             output pattern.
    Example:
    .. code-block:: python
        # Defining variables
        images = T.tensor4('images')
        neibs = images2neibs(images, neib_shape=(5, 5))
-        # Constructing theano function 
+        # Constructing theano function
        window_function = theano.function([images], neibs)
        # Input tensor (one image 10x10)
        im_val = np.arange(100.).reshape((1, 1, 10, 10))
        # Function application
        neibs_val = window_function(im_val)
-    .. note:: The underlying code will construct a 2D tensor of disjoint 
+    .. note:: The underlying code will construct a 2D tensor of disjoint
-       patches 5x5. The output has shape 4x25. 
+       patches 5x5. The output has shape 4x25.
    """
    return Images2Neibs(mode)(ten4, neib_shape, neib_step)
@@ -524,25 +524,24 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
    the output of :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
    and reconstructs its input.
-    :param neibs: matrix like the one obtained by 
+    :param neibs: matrix like the one obtained by
                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
-    :param neib_shape: `neib_shape` that was used in 
+    :param neib_shape: `neib_shape` that was used in
                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
-    :param original_shape: original shape of the 4d tensor given to 
+    :param original_shape: original shape of the 4d tensor given to
                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
-    :return: Reconstructs the input of 
+    :return: Reconstructs the input of
                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`,
                  a 4d tensor of shape `original_shape`.
    .. note:: Currently, the function doesn't support tensors created with
       `neib_step` different from default value. This means that it may be
-       impossible to compute the gradient of a variable gained by 
+       impossible to compute the gradient of a variable gained by
-       :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` w.r.t. 
+       :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` w.r.t.
-       its inputs in this case, because it uses 
+       its inputs in this case, because it uses
-       :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` for 
+       :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` for
       gradient computation.
    Example, which uses a tensor gained in example for
    :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`:

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -15,6 +15,7 @@ from six.moves import xrange
 import theano
 from theano import gof
+from theano import scalar
 from theano.tensor import basic as tensor
 from theano.tensor import subtensor
 from theano.tensor import elemwise
@@ -27,12 +28,12 @@ from theano.gradient import DisconnectedType
 from theano.gradient import grad_not_implemented
 from theano.tensor.type import values_eq_approx_remove_nan
 ############
 #
 # TENSOR OPS
 #
 class SoftmaxWithBias(gof.Op):
    """
    An L{Op} for the output of neural-net multiclass classifiers.
@@ -299,13 +300,13 @@ class SoftmaxGrad(gof.Op):
    def grad(self, inp, grads):
        dy, sm = inp
        g, = grads
-        tmp = g + tensor.neg(tensor.sum(g*sm, axis=1).dimshuffle((0, 'x')))
+        tmp = g + tensor.neg(tensor.sum(g * sm, axis=1).dimshuffle((0, 'x')))
        g_dy = tmp * sm
-        tmp2 = tensor.sum(dy*sm, axis=1).dimshuffle((0, 'x'))
+        tmp2 = tensor.sum(dy * sm, axis=1).dimshuffle((0, 'x'))
-        g_sm = tmp*dy - g *tmp2
+        g_sm = tmp * dy - g * tmp2
        return g_dy, g_sm
    def infer_shape(self, node, shape):
@@ -571,12 +572,15 @@ class Softmax(gof.Op):
 softmax_op = Softmax()
 def softmax_graph(c):
    return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
 def softmax(c):
    return softmax_op(c)
 @opt.register_specialize('fast_compile_gpu')
 @gof.local_optimizer([softmax_op])
 def local_softmax_with_bias(node):
@@ -593,15 +597,15 @@ def local_softmax_with_bias(node):
                    # tensor.DimShuffle) since specialization comes
                    # relatively late in optimization, we don't want to
                    # put in extra DimShuffles un-necessarily.
-                    if (x_in.owner and isinstance(x_in.owner.op,
+                    if (x_in.owner and
-                                                 tensor.DimShuffle)
+                            isinstance(x_in.owner.op, tensor.DimShuffle) and
-                 and list(x_in.owner.inputs[0].type.broadcastable) == [False]):
+                            list(x_in.owner.inputs[0].type.broadcastable) == [False]):
                        # cut out the DimShuffle that was broadcasting a vector
                        vectors.append(x_in.owner.inputs[0])
                    else:
                        # insert an extra DimShuffle to correct the old one
                        vectors.append(tensor.
-                            DimShuffle((True, False), (1,))(x_in))
+                                       DimShuffle((True, False), (1,))(x_in))
                else:
                    non_vectors.append(x_in)
@@ -658,7 +662,7 @@ def softmax_simplifier(numerators, denominators):
                                                tensor.DimShuffle):
                if denominator.owner.op.new_order == (0, 'x'):
                    z = denominator.owner.inputs[0]
-                          # thing getting dimshuffled
+                    # thing getting dimshuffled
                    if z.owner and isinstance(z.owner.op, tensor.Sum):
                        # print 'ASDF', denominator.owner.op.new_order
                        # print z.owner.op.axis
@@ -673,8 +677,7 @@ def softmax_simplifier(numerators, denominators):
            numerators.append(softmax_op(x))
    return numerators, denominators
-opt.local_mul_canonizer.add_simplifier(softmax_simplifier,
+opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
-     'softmax_simplifier')
 if 0:
    @opt.register_specialize
@@ -694,11 +697,11 @@ if 0:
            # First, prod_term
            for add_in in add_inputs:
                if (add_in.owner and
-                    add_in.owner.op == tensor.mul and
+                        add_in.owner.op == tensor.mul and
-                    prod_term is None):
+                        prod_term is None):
                    mul_inputs = add_in.owner.inputs
                    if (len(mul_inputs) == 2 and
-                        all([mul_in.ndim == 2 for mul_in in mul_inputs])):
+                            all([mul_in.ndim == 2 for mul_in in mul_inputs])):
                        prod_term = add_in
                    else:
                        other_terms.append(add_in)
@@ -724,16 +727,16 @@ if 0:
                        maybe_ds = None
                        for i, mul2_in in enumerate(mul2_inputs):
                            if mul2_in.owner and isinstance(mul2_in.owner.op,
-                                                        elemwise.DimShuffle):
+                                                            elemwise.DimShuffle):
                                maybe_ds = mul2_in
                                maybe_sm = mul2_inputs[1 - i]  # The other one
                        if (maybe_ds is None or
-                            maybe_ds.ndim != 2 or
+                                maybe_ds.ndim != 2 or
-                            maybe_sm.ndim != 2):
+                                maybe_sm.ndim != 2):
                            rest.append(add_in)
                            # print 'maybe_ds =', maybe_ds
                            # if maybe_ds:
-                            #    print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
+                            # print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
                            continue
                        if maybe_sm is mul_inputs[0]:
@@ -755,8 +758,8 @@ if 0:
                            sum_input = ds_input.owner.inputs[0]
                        if ((ds_order != (0, 'x')) or
-                            (axis != (1,)) or
+                                (axis != (1,)) or
-                            (sum_input is not prod_term)):
+                                (sum_input is not prod_term)):
                            rest.append(add_in)
                            # print 'ds_order =', ds_order
                            # print 'axis =', axis
@@ -816,7 +819,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
    nin = 3
    nout = 3
    __props__ = ()
    def __init__(self, **kwargs):
        gof.Op.__init__(self, **kwargs)
@@ -836,7 +839,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
 #       TODO: Is this correct? It used to be y, not y_idx
        nll = tensor.TensorType(x.type.dtype,
-                y_idx.type.broadcastable)()
+                                y_idx.type.broadcastable).make_variable()
 #        nll = TensorType(x.dtype, y.broadcastable)
        sm = x.type()
        am = y_idx.type()
@@ -866,15 +869,14 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        if any(y_idx < 0):
            raise ValueError("y_i value out of bounds")
        sm = numpy.zeros_like(x)  # softmax
-        nll = numpy.zeros(x.shape[0], dtype=node.outputs[0].type.
+        nll = numpy.zeros(x.shape[0], dtype=node.outputs[0].type.dtype)  # nll(y | softmax(x))
-            dtype)  # nll(y | softmax(x))
        am = numpy.zeros_like(y_idx)
        for i in xrange(sm.shape[0]):
            # add the bias vector to the i'th row of x
            row = x[i] + b
            # get the maximum value of i'th row for numerically safe
-            #softmax / nll
+            # softmax / nll
            am[i] = numpy.argmax(row)
            m = row[am[i]]
@@ -956,7 +958,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        # TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
        (init_decl, begin_row_loop, inside_row_loop, end_row_loop) = \
-                SoftmaxWithBias.c_code_template(dtype)
+            SoftmaxWithBias.c_code_template(dtype)
        return (init_decl,
                """
        if (PyArray_NDIM(%(y_idx)s) != 1)
@@ -1038,7 +1040,7 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op):
    nin = 3
    nout = 1
    __props__ = ()
    """Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op"""
    def make_node(self, dy, sm, y_idx, **kwargs):
@@ -1046,13 +1048,13 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op):
        sm = tensor.as_tensor_variable(sm)
        y_idx = tensor.as_tensor_variable(y_idx)
        if (dy.type.ndim > 1 or
-            dy.type.dtype not in tensor.float_dtypes):
+                dy.type.dtype not in tensor.float_dtypes):
            raise ValueError('dy must be {0,1}-d tensor of floats', dy.type)
        if (sm.type.ndim != 2 or
-            sm.type.dtype not in tensor.float_dtypes):
+                sm.type.dtype not in tensor.float_dtypes):
            raise ValueError('sm must be 2-d tensor of floats', sm.type)
        if (y_idx.type.ndim != 1 or
-            y_idx.type.dtype not in tensor.discrete_dtypes):
+                y_idx.type.dtype not in tensor.discrete_dtypes):
            raise ValueError('y_idx must be 1-d tensor of [u]ints', y_idx.type)
        return Apply(self, [dy, sm, y_idx], [sm.type()])
@@ -1082,9 +1084,8 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op):
        # typically we should not need the gradient w.r.t. dy).
        y_idx_range = tensor.arange(y_idx.shape[0])
        g_dy = tensor.sum(
-                g_dx * subtensor.AdvancedIncSubtensor()(
+            g_dx * subtensor.AdvancedIncSubtensor()(
-                    sm, tensor.fill(dy, -1), y_idx_range, y_idx),
+                sm, tensor.fill(dy, -1), y_idx_range, y_idx), axis=1)
-                axis=1)
        g_sm = dy.dimshuffle(0, 'x') * g_dx
        g_y_idx = grad_not_implemented(self, 2, y_idx)
        return [g_dy, g_sm, g_y_idx]
@@ -1226,8 +1227,7 @@ def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs):
    unnecessary? e.g. CrossentropySoftmaxArgmax1HotWithBias should return
    the appropriate information (i.e. the max probability)?
    """
-    (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx,
+    (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
-         **kwargs)
    (max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1)
    return (xent, softmax, max_pr, argmax)
@@ -1239,7 +1239,7 @@ def crossentropy_softmax_max_and_argmax_1hot(x, y_idx, **kwargs):
 class CrossentropyCategorical1HotGrad(gof.Op):
    __props__ = ()
    def make_node(self, g_y, coding_dist, true_one_of_n):
@@ -1251,8 +1251,8 @@ class CrossentropyCategorical1HotGrad(gof.Op):
        g_coding_strg, = out
        g_coding = numpy.zeros_like(coding_dist)
        for i in xrange(len(g_y)):
-            g_coding[i, true_one_of_n[i]] = -g_y[i] / coding_dist[i,
+            g_coding[i, true_one_of_n[i]] = (-g_y[i] /
-                                                        true_one_of_n[i]]
+                                             coding_dist[i, true_one_of_n[i]])
        g_coding_strg[0] = g_coding
    def infer_shape(self, node, in_shapes):
@@ -1297,8 +1297,8 @@ class CrossentropyCategorical1Hot(gof.Op):
                                                   tensor.lvector))
        return Apply(self, [_coding_dist, _true_one_of_n],
-                [tensor.Tensor(dtype=_coding_dist.dtype,
+                     [tensor.Tensor(dtype=_coding_dist.dtype,
-                               broadcastable=[False])()])
+                      broadcastable=[False])()])
    def perform(self, node, inp, out):
        coding, one_of_n = inp
@@ -1346,10 +1346,11 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
                sm, one_of_n = node.inputs
                if sm.owner and sm.owner.op == softmax_with_bias:
                    x, b = sm.owner.inputs
-                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x, b,
+                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(
-                            one_of_n)
+                        x, b, one_of_n)
-                    fgraph.replace_all_validate([(nll, new_nll), (sm, new_sm)],
+                    fgraph.replace_all_validate(
-                reason="crossentropy_to_crossentropy_with_softmax_with_bias")
+                        [(nll, new_nll), (sm, new_sm)],
+                        reason="crossentropy_to_crossentropy_with_softmax_with_bias")
                    return True
        return False
@@ -1381,17 +1382,19 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
                sm, one_of_n = node.inputs
                if sm.owner and sm.owner.op == softmax_op:
                    x, = sm.owner.inputs
-                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x,
+                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(
-                            tensor.zeros_like(x[0]), one_of_n)
+                        x, tensor.zeros_like(x[0]), one_of_n)
-                    fgraph.replace_all_validate([(nll, new_nll), (sm, new_sm)],
+                    fgraph.replace_all_validate(
-                            reason="crossentropy_to_crossentropy_with_softmax")
+                        [(nll, new_nll), (sm, new_sm)],
+                        reason="crossentropy_to_crossentropy_with_softmax")
                    return True
                if sm.owner and sm.owner.op == softmax_with_bias:
                    x, b = sm.owner.inputs
                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x, b,
-                            one_of_n)
+                                                                                         one_of_n)
-                    fgraph.replace_all_validate([(nll, new_nll), (sm, new_sm)],
+                    fgraph.replace_all_validate(
-                            reason="crossentropy_to_crossentropy_with_softmax")
+                        [(nll, new_nll), (sm, new_sm)],
+                        reason="crossentropy_to_crossentropy_with_softmax")
                    return True
        return False
@@ -1413,10 +1416,10 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
    if node.op == softmax_grad:
        g_coding_dist, coding_dist = node.inputs
        if (g_coding_dist.owner and
-            g_coding_dist.owner.op == crossentropy_categorical_1hot_grad):
+                g_coding_dist.owner.op == crossentropy_categorical_1hot_grad):
            g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs
-            dx = crossentropy_softmax_1hot_with_bias_dx(g_nll,
+            dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist,
-                 coding_dist, true_one_of_n)
+                                                        true_one_of_n)
            return [dx]
@@ -1428,16 +1431,17 @@ def local_argmax_pushdown(node):
            (softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
             softmax_with_bias):
        if theano.config.warn.argmax_pushdown_bug:
-            logging.getLogger('theano.tensor.nnet.nnet').warn("WARNING: there "
+            logging.getLogger('theano.tensor.nnet.nnet').warn(
-                    "was a bug in Theano fixed on May 27th, 2010 in this case."
+                "WARNING: there "
-                    " I.E. when we take the max of a softplus, softmax, exp, "
+                "was a bug in Theano fixed on May 27th, 2010 in this case."
-                    "log, tanh, sigmoid, softmax_with_bias op, we were doing "
+                " I.E. when we take the max of a softplus, softmax, exp, "
-                    "the max of the parent of the input. To remove this "
+                "log, tanh, sigmoid, softmax_with_bias op, we were doing "
-                    "warning set the Theano flags 'warn.argmax_pushdown_bug' "
+                "the max of the parent of the input. To remove this "
-                    "to False")
+                "warning set the Theano flags 'warn.argmax_pushdown_bug' "
+                "to False")
    if (node.op == tensor._max_and_argmax and
-        node.inputs[0].owner and len(node.outputs[0].clients) == 0):
+            node.inputs[0].owner and len(node.outputs[0].clients) == 0):
        x_max, x_argmax = node.outputs
        x, axis = node.inputs
        # TODO: Make a list/set of monotonic ops...
@@ -1657,15 +1661,15 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
            if isinstance(denom.owner.op, subtensor.AdvancedSubtensor):
                # Base case
                adv_subtensor = denom
-                #out_grad /= 1.
+                # out_grad /= 1.
            elif denom.owner.op == tensor.mul:
                # Try to find the AdvancedSubtensor node mentionned above,
                # and the output gradient
                for i, input in enumerate(denom.owner.inputs):
                    if input.owner and isinstance(input.owner.op,
                                                  subtensor.AdvancedSubtensor):
-                        other_inputs = [in_ for (j,
+                        other_inputs = [in_ for (j, in_) in
-                             in_) in enumerate(denom.owner.inputs) if j != i]
+                                        enumerate(denom.owner.inputs) if j != i]
                        if len(other_inputs) == 1:
                            rest = other_inputs[0]
                        else:
@@ -1831,8 +1835,8 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
            # `CrossentropySoftmax1HotWithBiasDx`) we do not need to
            # check it at runtime.
            if (dz_broad[0] and
-                not same_shape(sm, dy, dim_x=0, dim_y=0) and
+                    not same_shape(sm, dy, dim_x=0, dim_y=0) and
-                shape_of[dy][0] != 1):
+                    shape_of[dy][0] != 1):
                # If `dz` is broadcastable, we need to check whether the shapes
                # of `dy` and `sm` are the same or whether the shape of `dy` is
                # equal to 1.
@@ -1894,20 +1898,18 @@ def categorical_crossentropy(coding_dist, true_dist):
    """
    if true_dist.ndim == coding_dist.ndim:
-        return -tensor.sum(true_dist * tensor.log(coding_dist), axis=coding_dist.ndim-1)
+        return -tensor.sum(true_dist * tensor.log(coding_dist),
+                           axis=coding_dist.ndim - 1)
    elif true_dist.ndim == coding_dist.ndim - 1:
        return crossentropy_categorical_1hot(coding_dist, true_dist)
    else:
        raise TypeError('rank mismatch between coding and true distributions')
-from theano import scalar
 class Prepend_scalar_constant_to_each_row(gof.Op):
    __props__ = ()
    def __init__(self, val=0):
        if isinstance(val, float):
            val = scalar.constant(val)
@@ -2026,7 +2028,7 @@ local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax_op, 'x')),
 # don't do register_stabilize, this is to make local_log_softmax run
 # only after another more specific optimization that stabilizes cross entropy
-#opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
+# opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
 opt.register_specialize(local_log_softmax, 'fast_compile_gpu', name='local_log_softmax')

--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
@@ -7,7 +7,6 @@ from __future__ import print_function
 import warnings
 import numpy
-from six.moves import xrange
 import theano
 from theano import config, gof, printing, scalar
@@ -92,7 +91,7 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
        x, = inp
        z, = out
        if (not theano.config.lib.amdlibm or
-            node.inputs[0].dtype != node.outputs[0].dtype):
+                node.inputs[0].dtype != node.outputs[0].dtype):
            raise theano.gof.utils.MethodNotDefined()
        dtype = node.inputs[0].dtype
        if dtype == 'float32' and self.amd_float32 is not None:
@@ -129,9 +128,8 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
        """
        This method was used to generate the graph: sigmoid_prec.png in the doc
        """
-        import matplotlib
        data = numpy.arange(-15, 15, .1)
-        val = 1/(1+numpy.exp(-data))
+        val = 1 / (1 + numpy.exp(-data))
        def hard_sigmoid(x):
            return theano.tensor.nnet.hard_sigmoid(x)
@@ -164,10 +162,10 @@ scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
 sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
 sigmoid_inplace = elemwise.Elemwise(
-        ScalarSigmoid(scalar.transfer_type(0)),
+    ScalarSigmoid(scalar.transfer_type(0)),
-        inplace_pattern={0: 0},
+    inplace_pattern={0: 0},
-        name='sigmoid_inplace',
+    name='sigmoid_inplace',
-        )
+)
 pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
@@ -240,7 +238,7 @@ pprint.assign(ultra_fast_sigmoid,
              printing.FunctionPrinter('ultra_fast_sigmoid'))
-#@opt.register_uncanonicalize
+# @opt.register_uncanonicalize
 @gof.local_optimizer([sigmoid])
 def local_ultra_fast_sigmoid(node):
    """
@@ -290,7 +288,7 @@ def hard_sigmoid(x):
    return x
-#@opt.register_uncanonicalize
+# @opt.register_uncanonicalize
 @gof.local_optimizer([sigmoid])
 def local_hard_sigmoid(node):
    if (isinstance(node.op, tensor.Elemwise) and
@@ -412,7 +410,7 @@ def is_1pexp(t):
    """
    if t.owner and t.owner.op == tensor.add:
        scalars, scalar_inputs, nonconsts = \
-                opt.scalarconsts_rest(t.owner.inputs)
+            opt.scalarconsts_rest(t.owner.inputs)
        # scalar_inputs are potentially dimshuffled and fill'd scalars
        if len(nonconsts) == 1:
            maybe_exp = nonconsts[0]
@@ -439,11 +437,12 @@ def is_1pexp(t):
    return None
-AddConfigVar('warn.identify_1pexp_bug',
+AddConfigVar(
-        'Warn if Theano versions prior to 7987b51 (2011-12-18) could have '
+    'warn.identify_1pexp_bug',
-        'yielded a wrong result due to a bug in the is_1pexp function',
+    'Warn if Theano versions prior to 7987b51 (2011-12-18) could have '
-        BoolParam(theano.configdefaults.warn_default('0.4.1')),
+    'yielded a wrong result due to a bug in the is_1pexp function',
-        in_c_key=False)
+    BoolParam(theano.configdefaults.warn_default('0.4.1')),
+    in_c_key=False)
 def is_exp(var):
@@ -778,9 +777,9 @@ def perform_sigm_times_exp(tree, exp_x=None, exp_minus_x=None, sigm_x=None,
        rval = False
        for sub_idx, sub_tree in enumerate(inputs):
            rval |= perform_sigm_times_exp(
-                    tree=sub_tree, parent=tree, child_idx=sub_idx,
+                tree=sub_tree, parent=tree, child_idx=sub_idx,
-                    exp_x=exp_x, exp_minus_x=exp_minus_x, sigm_x=sigm_x,
+                exp_x=exp_x, exp_minus_x=exp_minus_x, sigm_x=sigm_x,
-                    sigm_minus_x=sigm_minus_x, full_tree=full_tree)
+                sigm_minus_x=sigm_minus_x, full_tree=full_tree)
        return rval
    else:
        # Reached a leaf: if it is an exponential or a sigmoid, then we
@@ -867,15 +866,15 @@ def local_inv_1_plus_exp(node):
        inv_arg = node.inputs[0]
        if inv_arg.owner and inv_arg.owner.op == tensor.add:
            scalars, scalar_inputs, nonconsts = \
-                    opt.scalarconsts_rest(inv_arg.owner.inputs)
+                opt.scalarconsts_rest(inv_arg.owner.inputs)
            # scalar_inputs are potentially dimshuffled and fill'd scalars
            if len(nonconsts) == 1:
                if nonconsts[0].owner and nonconsts[0].owner.op == tensor.exp:
                    if scalars and numpy.allclose(numpy.sum(scalars), 1):
                        return opt._fill_chain(
-                                sigmoid(
+                            sigmoid(
-                                    tensor.neg(nonconsts[0].owner.inputs[0])),
+                                tensor.neg(nonconsts[0].owner.inputs[0])),
-                                scalar_inputs)
+                            scalar_inputs)
 # Registration is below, and conditional.
@@ -892,7 +891,7 @@ def local_1msigmoid(node):
        if sub_r.owner and sub_r.owner.op == sigmoid:
            try:
                val_l = opt.get_scalar_constant_value(sub_l)
-            except Exception as e:
+            except Exception:
                return
            if numpy.allclose(numpy.sum(val_l), 1):
                return [sigmoid(-sub_r.owner.inputs[0])]
@@ -921,7 +920,6 @@ if 0:
        print(sigm_canonicalize(node))
    def sigm_canonicalize(node):
-        add = tensor.add
        mul = tensor.mul
        div = tensor.true_div

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -88,15 +88,7 @@ whitelist_flake8 = [
    "tensor/signal/conv.py",
    "tensor/signal/tests/test_conv.py",
    "tensor/signal/tests/test_downsample.py",
-    "tensor/nnet/nnet.py",
-    "tensor/nnet/Conv3D.py",
    "tensor/nnet/__init__.py",
-    "tensor/nnet/ConvTransp3D.py",
-    "tensor/nnet/sigm.py",
-    "tensor/nnet/ConvGrad3D.py",
-    "tensor/nnet/conv3d2d.py",
-    "tensor/nnet/conv.py",
-    "tensor/nnet/neighbours.py",
    "tensor/nnet/tests/test_conv.py",
    "tensor/nnet/tests/test_neighbours.py",
    "tensor/nnet/tests/test_nnet.py",