Merge pull request #3095 from harlouci/flake8_v4

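flake8 for tensor/nnet/nnet.py (this merge also makes Conv3D.py, ConvGrad3D.py, ConvTransp3D.py, conv.py, conv3d2d.py, neighbours.py and sigm.py in tensor/nnet flake8-clean and removes all eight files from whitelist_flake8 in theano/tests/test_flake8.py)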

Merge pull request #3095 from harlouci/flake8_v4
03e77233 · Frédéric Bastien · f4edcc59 · 9b457370 · 03e77233 · 03e77233
--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
 from __future__ import print_function
+import numpy as N
 from six.moves import xrange
 import theano
 from theano.tensor import basic as T
-import numpy as N
+# from util import strutil
-#from util import strutil
 from theano.tensor.blas_headers import blas_header_text, blas_header_version
 from theano.tensor.blas import ldflags
 from theano.misc import strutil
@@ -72,25 +74,27 @@ class Conv3D(theano.Op):
    def grad(self, inputs, output_gradients):
        V, W, b, d = inputs
-        dCdH , = output_gradients
+        dCdH, = output_gradients
        # make all of these ops support broadcasting of scalar b to vector b and eplace the zeros_like in all their grads
        # print dCdH.broadcastable
        # print "dCdH.broadcastable"
        # quit(-1)
-        #dCdH = printing.Print("dCdH = ",["shape"])
+        # dCdH = printing.Print("dCdH = ",["shape"])
        # Make sure the broadcasting pattern of the gradient is the the same
        # as the initial variable
-        dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
+        dCdV = theano.tensor.nnet.convTransp3D(
+            W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
        dCdV = T.patternbroadcast(dCdV, V.broadcastable)
        WShape = W.shape
-        dCdW = ConvGrad3D.convGrad3D(V, d, WShape, dCdH)
+        dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
        dCdW = T.patternbroadcast(dCdW, W.broadcastable)
        dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
        dCdb = T.patternbroadcast(dCdb, b.broadcastable)
-        dCdd = grad_undefined(self, 3, inputs[3],
+        dCdd = grad_undefined(
-                "The gradient of Conv3D with respect to the convolution" +\
+            self, 3, inputs[3],
-                " stride is undefined because Conv3D is only defined for" +\
+            "The gradient of Conv3D with respect to the convolution"
+            " stride is undefined because Conv3D is only defined for"
            " integer strides.")
        if 'name' in dir(dCdH) and dCdH.name is not None:
@@ -113,11 +117,13 @@ class Conv3D(theano.Op):
        else:
            b_name = 'anon_b'
-        dCdV.name = 'Conv3D_dCdV(dCdH='+dCdH_name+',V='+V_name+')'
+        dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
-        dCdW.name = 'Conv3D_dCdW(dCdH='+dCdH_name+',V='+V_name+',W='+W_name+')'
+        dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name +
-        dCdb.name = 'Conv3D_dCdb(dCdH='+dCdH_name+',V='+V_name+',W='+W_name+',b='+b_name+')'
+                     ',W=' + W_name + ')')
+        dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name +
+                     ',W=' + W_name + ',b=' + b_name + ')')
-        return [ dCdV, dCdW, dCdb, dCdd ]
+        return [dCdV, dCdW, dCdb, dCdd]
    def perform(self, node, inputs, output_storage):
        V, W, b, d = inputs
@@ -144,7 +150,7 @@ class Conv3D(theano.Op):
        output_width = T.floor((vidWidth - filterWidth) // dc) + 1
        output_dur = T.floor((vidDur - filterDur) // dt) + 1
-        rval = (batch_size,  output_height, output_width, output_dur, output_channels )
+        rval = (batch_size, output_height, output_width, output_dur, output_channels)
        return [rval]
@@ -326,7 +332,7 @@ class Conv3D(theano.Op):
            elif VV.dtype == 'float32':
                gemv = 'sgemv_'
            else:
-                raise Exception('Unrecognized dtype for convolution '+V.value.dtype)
+                raise Exception('Unrecognized dtype for convolution ' + V.value.dtype)
            codeSource += """
            if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0)))
@@ -571,7 +577,7 @@ def computeH(V, W, b, d):
    outputChannels = W.shape[0]
    inputChannels = V.shape[4]
    if W.shape[4] != inputChannels:
-        raise Exception("W.shape[4] = "+str(W.shape[4])+" but inputChannels = "+str(inputChannels))
+        raise Exception("W.shape[4] = " + str(W.shape[4]) + " but inputChannels = " + str(inputChannels))
    filterHeight = W.shape[1]
    filterWidth = W.shape[2]
    filterDur = W.shape[3]
@@ -586,12 +592,12 @@ def computeH(V, W, b, d):
    assert dy > 0
    assert dt > 0
-    outputHeight = int( (vidHeight - filterHeight) / dx )+1
+    outputHeight = int((vidHeight - filterHeight) / dx) + 1
-    outputWidth = int( (vidWidth - filterWidth) / dy )+1
+    outputWidth = int((vidWidth - filterWidth) / dy) + 1
-    outputDur = int( (vidDur - filterDur) / dt ) + 1
+    outputDur = int((vidDur - filterDur) / dt) + 1
-    H =  N.zeros( (batchSize,  outputHeight,
+    H = N.zeros((batchSize, outputHeight,
-        outputWidth, outputDur, outputChannels ), dtype=V.dtype )
+                outputWidth, outputDur, outputChannels), dtype=V.dtype)
    # H[i,j,x,y,t] = b_j + sum_k sum_l sum_m sum_z W[j,z,k,l,m] V[i,z, dx*x+k,dy*y+l,dt*t+m]
    for i in xrange(0, H.shape[0]):
@@ -610,12 +616,8 @@ def computeH(V, W, b, d):
                                        # if (i,j,x,y,t) == (0,0,0,0,0):
                                        #    print (( W[j,z,k,l,m] , V[i,z,d[0]*x+k,d[1]*y+l,d[2]*t+m] ), (k,l,m) )
                                        w = W[j, k, l, m, z]
-                                        v = V[i, d[0]*x+k, d[1]*y+l, d[2]*t+m, z]
+                                        v = V[i, d[0] * x + k, d[1] * y + l, d[2] * t + m, z]
                                        # if i == 0 and x == 0 and y == 0 and t == 0 and j == 0:
                                        #    print 'setting H[0] += '+str(w*v)+'   W['+str((j,z,k,l,m))+']='+str(w)+'   V['+str((i,d[0]*x+k,d[1]*y+l,d[2]*t+m,z))+']='+str(v)
                                        H[i, x, y, t, j] += w * v
    return H
-from . import ConvGrad3D
-from . import ConvTransp3D
--- a/theano/tensor/nnet/ConvGrad3D.py
+++ b/theano/tensor/nnet/ConvGrad3D.py
+from six.moves import xrange
+import numpy as N
 import theano
 from theano.tensor import basic as T
 from theano.misc import strutil
-import numpy as N
-from six.moves import xrange
 from theano.gradient import grad_undefined
 from theano.gradient import DisconnectedType
@@ -23,11 +25,15 @@ class ConvGrad3D(theano.Op):
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = T.as_tensor_variable(dCdH)
-        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], outputs=[ T.TensorType(V_.dtype, (False, False, False, False, False))() ] )
+        return theano.Apply(self,
+                            inputs=[V_, d_, WShape_, dCdH_],
+                            outputs=[T.TensorType(
+                                V_.dtype,
+                                (False, False, False, False, False))()])
    def infer_shape(self, node, input_shapes):
        V, d, W_shape, dCdH = node.inputs
-        return [ ( W_shape[0], W_shape[1], W_shape[2], W_shape[3], W_shape[4] ) ]
+        return [(W_shape[0], W_shape[1], W_shape[2], W_shape[3], W_shape[4])]
    def connection_pattern(self, node):
@@ -38,12 +44,12 @@ class ConvGrad3D(theano.Op):
        dLdA, = output_gradients
        z = T.zeros_like(C[0, 0, 0, 0, :])
-        dLdC = convTransp3D(dLdA, z, d, B, C.shape[1:4])
+        dLdC = theano.tensor.nnet.convTransp3D(dLdA, z, d, B, C.shape[1:4])
        # d actually does affect the outputs, so it's not disconnected
        dLdd = grad_undefined(self, 1, d)
        # The shape of the weights doesn't affect the output elements
        dLdWShape = DisconnectedType()()
-        dLdB = conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)
+        dLdB = theano.tensor.nnet.conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)
        return [dLdC, dLdd, dLdWShape, dLdB]
@@ -54,15 +60,10 @@ class ConvGrad3D(theano.Op):
        # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
-        outputFilters = dCdH.shape[4]
        outputHeight = dCdH.shape[1]
        outputWidth = dCdH.shape[2]
        outputDur = dCdH.shape[3]
        assert V.shape[0] == batchSize
-        inputFilters = V.shape[4]
-        inputHeight = V.shape[1]
-        inputWidth = V.shape[2]
-        inputDur = V.shape[3]
        dr, dc, dt = d
        dCdW = N.zeros(WShape, dtype=V.dtype)
@@ -78,7 +79,10 @@ class ConvGrad3D(theano.Op):
                                for r in xrange(0, outputDur):
                                    for j in xrange(0, WShape[0]):
                                        for z in xrange(0, WShape[4]):
-                                            dCdW[j, k, l, m, z] +=  dCdH[i, p, q, r, j] * V[i, dr*p+k, dc*q+l, dt*r+m, z]
+                                            dCdW[j, k, l, m, z] += (
+                                                dCdH[i, p, q, r, j] *
+                                                V[i, dr * p + k, dc * q + l,
+                                                  dt * r + m, z])
        output_storage[0][0] = dCdW
@@ -272,6 +276,3 @@ class ConvGrad3D(theano.Op):
 convGrad3D = ConvGrad3D()
-from theano.tensor.nnet.Conv3D import conv3D
-from theano.tensor.nnet.ConvTransp3D import convTransp3D
--- a/theano/tensor/nnet/ConvTransp3D.py
+++ b/theano/tensor/nnet/ConvTransp3D.py
 from __future__ import print_function
 import numpy as N
 from six.moves import xrange
+import theano
 from theano.tensor import basic as T
 from theano.misc import strutil
-import theano
 from theano.gradient import grad_undefined
 from theano.gradient import DisconnectedType
@@ -31,7 +33,10 @@ class ConvTransp3D(theano.Op):
        else:
            RShape_ = T.as_tensor_variable([-1, -1, -1])
-        return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_], outputs=[ T.TensorType(H_.dtype, (False, False, False, False, False))() ] )
+        return theano.Apply(self,
+                            inputs=[W_, b_, d_, H_, RShape_],
+                            outputs=[T.TensorType(H_.dtype,
+                                     (False, False, False, False, False))()])
    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
@@ -44,9 +49,9 @@ class ConvTransp3D(theano.Op):
    def grad(self, inputs, output_gradients):
        W, b, d, H, RShape = inputs
        dCdR, = output_gradients
-        dCdH = conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
+        dCdH = theano.tensor.nnet.conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
        WShape = W.shape
-        dCdW = convGrad3D(dCdR, d, WShape, H)
+        dCdW = theano.tensor.nnet.convGrad3D(dCdR, d, WShape, H)
        dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
        # not differentiable, since d affects the output elements
        dCdd = grad_undefined(self, 2, d)
@@ -73,8 +78,10 @@ class ConvTransp3D(theano.Op):
        else:
            b_name = 'anon_b'
-        dCdW.name = 'ConvTransp3D_dCdW.H='+H_name+',dCdR='+dCdR_name+',W='+W_name
+        dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
-        dCdb.name = 'ConvTransp3D_dCdb.H='+H_name+',dCdR='+dCdR_name+',W='+W_name+',b='+b_name
+                     ',W=' + W_name)
+        dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
+                     ',W=' + W_name + ',b=' + b_name)
        dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name
        return [dCdW, dCdb, dCdd, dCdH, dCdRShape]
@@ -404,8 +411,8 @@ def computeR(W, b, d, H, Rshape=None):
                                    if tk < 0:
                                        break
-                                    R[
+                                    R[i, r, c, t, j] += N.dot(
-                                        i, r, c, t, j] += N.dot(W[:, rk, ck, tk, j], H[i, rc, cc, tc, :] )
+                                        W[:, rk, ck, tk, j], H[i, rc, cc, tc, :])
                                    tc += 1
                                ""  # close loop over tc
@@ -421,7 +428,3 @@ def computeR(W, b, d, H, Rshape=None):
    ""  # close loop over i
    return R
-from theano.tensor.nnet.Conv3D import conv3D
-from theano.tensor.nnet.ConvGrad3D import convGrad3D
--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
-from __future__ import print_function
 """
 Contains an Op for convolving input images with a set of filters. This was
 developed especially for Convolutional Neural Networks.
@@ -9,7 +8,7 @@ tensor.signal and tensor.signal.downsample.
 See especially conv2d().
 """
-__docformat__ = "restructuredtext en"
+from __future__ import print_function
 import logging
@@ -17,12 +16,11 @@ import numpy
 from six.moves import xrange
 import theano
+from theano import OpenMPOp
 from theano.tensor import (as_tensor_variable, blas, get_scalar_constant_value,
                           patternbroadcast, NotScalarConstantError)
-from theano import OpenMPOp, config
 from theano.gof import Apply
-imported_scipy_signal = False
 try:
    # TODO: move these back out to global scope when they no longer
    # cause an atexit error
@@ -30,8 +28,9 @@ try:
    from scipy.signal.sigtools import _convolve2d
    imported_scipy_signal = True
 except ImportError:
-    pass
+    imported_scipy_signal = False
+__docformat__ = "restructuredtext en"
 _logger = logging.getLogger("theano.tensor.nnet.conv")
@@ -103,7 +102,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
                try:
                    image_shape[i] = get_scalar_constant_value(
                        as_tensor_variable(image_shape[i]))
-                except NotScalarConstantError as e:
+                except NotScalarConstantError:
                    raise NotScalarConstantError(
                        "The convolution need that the shape"
                        " information are constant values. We got"
@@ -118,7 +117,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
                try:
                    filter_shape[i] = get_scalar_constant_value(
                        as_tensor_variable(filter_shape[i]))
-                except NotScalarConstantError as e:
+                except NotScalarConstantError:
                    raise NotScalarConstantError(
                        "The convolution need that the shape"
                        " information are constant values. We got"
@@ -509,7 +508,7 @@ class ConvOp(OpenMPOp):
        self.out_mode = output_mode
-        if not self.out_mode in ["valid", "full"]:
+        if self.out_mode not in ["valid", "full"]:
            raise Exception("Mode %s not implemented" % self.out_mode)
        if any((shp is not None) and (shp <= 0) for shp in self.outshp):
@@ -522,7 +521,6 @@ class ConvOp(OpenMPOp):
        if (self.unroll_kern is None and
                self.unroll_batch is None and
                self.unroll_patch is None):
            # no version specified. Find the faster we have
            if self.bsize is None and self.nkern is None:
                self.unroll_patch = True
@@ -613,7 +611,6 @@ class ConvOp(OpenMPOp):
        inputs - 4 dim: batches x stacksize x rows x cols
        kerns - 4 dim: nkern x stackidx x rows x cols
        """
-        outdim = kerns.ndim
        _inputs = as_tensor_variable(inputs)
        _kerns = as_tensor_variable(kerns)
        # TODO: lift this restriction by upcasting either inputs or kerns
@@ -778,7 +775,7 @@ class ConvOp(OpenMPOp):
                img2d2[:, :, kshp[0] - 1:kshp[0] - 1 + imshp[1],
                       kshp[1] - 1:kshp[1] - 1 + imshp[2]] = img2d
                img2d = img2d2
-            #N_image_shape = image_data.shape
+            # N_image_shape = image_data.shape
            for b in xrange(bsize):
                for n in xrange(nkern):
@@ -786,7 +783,9 @@ class ConvOp(OpenMPOp):
                    for im0 in xrange(stacklen):
                        for row in xrange(0, zz.shape[2], self.dx):
                            for col in xrange(0, zz.shape[3], self.dy):
-                                zz[b, n, row, col] += (img2d[b, im0, row:row + kshp[0], col:col + kshp[1]] *
+                                zz[b, n, row, col] += (
+                                    img2d[b, im0, row:row + kshp[0],
+                                          col:col + kshp[1]] *
                                    filtersflipped[n, im0, ::-1, ::-1]).sum()
        # We copy it to remove the Stride mismatch warning from DEBUG_MODE.
@@ -843,8 +842,8 @@ class ConvOp(OpenMPOp):
            # mimic what happens inside theano.grad: get the input gradient
            # of the final cost wrt all variables involved.
-            return theano.gradient.grad(cost=None,
+            return theano.gradient.grad(cost=None, known_grads={node: gz},
-                    known_grads={node: gz}, wrt=[inputs, kerns])
+                                        wrt=[inputs, kerns])
        if self.dx not in (1, 2) or self.dy not in (1, 2):
            raise NotImplementedError(
@@ -858,7 +857,7 @@ class ConvOp(OpenMPOp):
            raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "
                            "the optional shape information")
-        ####### Determine gradient on kernels ########
+        # Determine gradient on kernels ########
        assert inputs.ndim == 4 and kerns.ndim == 4
        newin = inputs.dimshuffle((1, 0, 2, 3))
@@ -943,7 +942,7 @@ class ConvOp(OpenMPOp):
            dw = dw.dimshuffle((1, 0, 2, 3))
            dw = dw[:, :, ::-1, ::-1]
-        ####### Determine gradient on inputs ########
+        # Determine gradient on inputs ########
        mode = 'valid'
        if not self.out_mode == 'full':
            mode = 'full'
@@ -1015,7 +1014,6 @@ using namespace std;
                    self.unroll_patch or
                    self.unroll_batch > 0 or
                    self.unroll_kern > 0):
                return False
            return True
        return False
@@ -1030,7 +1028,6 @@ using namespace std;
        # compilation with -O3.  This don't happen at -O2
        if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
                self.kshp == (1, 1)):
            return ['-O3']
        else:
            return []

--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
@@ -246,7 +246,7 @@ def conv3d(signals, filters,
    # now sum out along the Tf to get the output
    # but we have to sum on a diagonal through the Tf and Ts submatrix.
    if border_mode[0] == 'valid':
-        if _filters_shape_5d[1]!=1:
+        if _filters_shape_5d[1] != 1:
            out_5d = diagonal_subtensor(out_tmp, 1, 3).sum(axis=3)
        else:  # for Tf==1, no sum along Tf, the Ts-axis of the output is unchanged!
            out_5d = out_tmp.reshape((

--- a/theano/tensor/nnet/neighbours.py
+++ b/theano/tensor/nnet/neighbours.py
@@ -2,15 +2,15 @@
 TODO: implement Images2Neibs.infer_shape() methods
 """
-from six.moves import xrange
+import numpy
 import theano
 from theano import Op, Apply
 import theano.tensor as T
 from theano.gradient import grad_not_implemented
 from theano.gradient import grad_undefined
-import numpy
 class Images2Neibs(Op):
@@ -543,7 +543,6 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
       :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` for
       gradient computation.
    Example, which uses a tensor gained in example for
    :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`:

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -15,6 +15,7 @@ from six.moves import xrange
 import theano
 from theano import gof
+from theano import scalar
 from theano.tensor import basic as tensor
 from theano.tensor import subtensor
 from theano.tensor import elemwise
@@ -27,12 +28,12 @@ from theano.gradient import DisconnectedType
 from theano.gradient import grad_not_implemented
 from theano.tensor.type import values_eq_approx_remove_nan
 ############
 #
 # TENSOR OPS
 #
 class SoftmaxWithBias(gof.Op):
    """
    An L{Op} for the output of neural-net multiclass classifiers.
@@ -300,11 +301,11 @@ class SoftmaxGrad(gof.Op):
        dy, sm = inp
        g, = grads
-        tmp = g + tensor.neg(tensor.sum(g*sm, axis=1).dimshuffle((0, 'x')))
+        tmp = g + tensor.neg(tensor.sum(g * sm, axis=1).dimshuffle((0, 'x')))
        g_dy = tmp * sm
-        tmp2 = tensor.sum(dy*sm, axis=1).dimshuffle((0, 'x'))
+        tmp2 = tensor.sum(dy * sm, axis=1).dimshuffle((0, 'x'))
-        g_sm = tmp*dy - g *tmp2
+        g_sm = tmp * dy - g * tmp2
        return g_dy, g_sm
@@ -571,12 +572,15 @@ class Softmax(gof.Op):
 softmax_op = Softmax()
 def softmax_graph(c):
    return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
 def softmax(c):
    return softmax_op(c)
 @opt.register_specialize('fast_compile_gpu')
 @gof.local_optimizer([softmax_op])
 def local_softmax_with_bias(node):
@@ -593,9 +597,9 @@ def local_softmax_with_bias(node):
                    # tensor.DimShuffle) since specialization comes
                    # relatively late in optimization, we don't want to
                    # put in extra DimShuffles un-necessarily.
-                    if (x_in.owner and isinstance(x_in.owner.op,
+                    if (x_in.owner and
-                                                 tensor.DimShuffle)
+                            isinstance(x_in.owner.op, tensor.DimShuffle) and
-                 and list(x_in.owner.inputs[0].type.broadcastable) == [False]):
+                            list(x_in.owner.inputs[0].type.broadcastable) == [False]):
                        # cut out the DimShuffle that was broadcasting a vector
                        vectors.append(x_in.owner.inputs[0])
                    else:
@@ -673,8 +677,7 @@ def softmax_simplifier(numerators, denominators):
            numerators.append(softmax_op(x))
    return numerators, denominators
-opt.local_mul_canonizer.add_simplifier(softmax_simplifier,
+opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
-     'softmax_simplifier')
 if 0:
    @opt.register_specialize
@@ -836,7 +839,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
 #       TODO: Is this correct? It used to be y, not y_idx
        nll = tensor.TensorType(x.type.dtype,
-                y_idx.type.broadcastable)()
+                                y_idx.type.broadcastable).make_variable()
 #        nll = TensorType(x.dtype, y.broadcastable)
        sm = x.type()
        am = y_idx.type()
@@ -866,15 +869,14 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        if any(y_idx < 0):
            raise ValueError("y_i value out of bounds")
        sm = numpy.zeros_like(x)  # softmax
-        nll = numpy.zeros(x.shape[0], dtype=node.outputs[0].type.
+        nll = numpy.zeros(x.shape[0], dtype=node.outputs[0].type.dtype)  # nll(y | softmax(x))
-            dtype)  # nll(y | softmax(x))
        am = numpy.zeros_like(y_idx)
        for i in xrange(sm.shape[0]):
            # add the bias vector to the i'th row of x
            row = x[i] + b
            # get the maximum value of i'th row for numerically safe
-            #softmax / nll
+            # softmax / nll
            am[i] = numpy.argmax(row)
            m = row[am[i]]
@@ -1083,8 +1085,7 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op):
        y_idx_range = tensor.arange(y_idx.shape[0])
        g_dy = tensor.sum(
            g_dx * subtensor.AdvancedIncSubtensor()(
-                    sm, tensor.fill(dy, -1), y_idx_range, y_idx),
+                sm, tensor.fill(dy, -1), y_idx_range, y_idx), axis=1)
-                axis=1)
        g_sm = dy.dimshuffle(0, 'x') * g_dx
        g_y_idx = grad_not_implemented(self, 2, y_idx)
        return [g_dy, g_sm, g_y_idx]
@@ -1226,8 +1227,7 @@ def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs):
    unnecessary? e.g. CrossentropySoftmaxArgmax1HotWithBias should return
    the appropriate information (i.e. the max probability)?
    """
-    (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx,
+    (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
-         **kwargs)
    (max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1)
    return (xent, softmax, max_pr, argmax)
@@ -1251,8 +1251,8 @@ class CrossentropyCategorical1HotGrad(gof.Op):
        g_coding_strg, = out
        g_coding = numpy.zeros_like(coding_dist)
        for i in xrange(len(g_y)):
-            g_coding[i, true_one_of_n[i]] = -g_y[i] / coding_dist[i,
+            g_coding[i, true_one_of_n[i]] = (-g_y[i] /
-                                                        true_one_of_n[i]]
+                                             coding_dist[i, true_one_of_n[i]])
        g_coding_strg[0] = g_coding
    def infer_shape(self, node, in_shapes):
@@ -1346,9 +1346,10 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
                sm, one_of_n = node.inputs
                if sm.owner and sm.owner.op == softmax_with_bias:
                    x, b = sm.owner.inputs
-                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x, b,
+                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(
-                            one_of_n)
+                        x, b, one_of_n)
-                    fgraph.replace_all_validate([(nll, new_nll), (sm, new_sm)],
+                    fgraph.replace_all_validate(
+                        [(nll, new_nll), (sm, new_sm)],
                        reason="crossentropy_to_crossentropy_with_softmax_with_bias")
                    return True
@@ -1381,16 +1382,18 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
                sm, one_of_n = node.inputs
                if sm.owner and sm.owner.op == softmax_op:
                    x, = sm.owner.inputs
-                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x,
+                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(
-                            tensor.zeros_like(x[0]), one_of_n)
+                        x, tensor.zeros_like(x[0]), one_of_n)
-                    fgraph.replace_all_validate([(nll, new_nll), (sm, new_sm)],
+                    fgraph.replace_all_validate(
+                        [(nll, new_nll), (sm, new_sm)],
                        reason="crossentropy_to_crossentropy_with_softmax")
                    return True
                if sm.owner and sm.owner.op == softmax_with_bias:
                    x, b = sm.owner.inputs
                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x, b,
                                                                                         one_of_n)
-                    fgraph.replace_all_validate([(nll, new_nll), (sm, new_sm)],
+                    fgraph.replace_all_validate(
+                        [(nll, new_nll), (sm, new_sm)],
                        reason="crossentropy_to_crossentropy_with_softmax")
                    return True
@@ -1415,8 +1418,8 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
        if (g_coding_dist.owner and
                g_coding_dist.owner.op == crossentropy_categorical_1hot_grad):
            g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs
-            dx = crossentropy_softmax_1hot_with_bias_dx(g_nll,
+            dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist,
-                 coding_dist, true_one_of_n)
+                                                        true_one_of_n)
            return [dx]
@@ -1428,7 +1431,8 @@ def local_argmax_pushdown(node):
            (softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
             softmax_with_bias):
        if theano.config.warn.argmax_pushdown_bug:
-            logging.getLogger('theano.tensor.nnet.nnet').warn("WARNING: there "
+            logging.getLogger('theano.tensor.nnet.nnet').warn(
+                "WARNING: there "
                "was a bug in Theano fixed on May 27th, 2010 in this case."
                " I.E. when we take the max of a softplus, softmax, exp, "
                "log, tanh, sigmoid, softmax_with_bias op, we were doing "
@@ -1657,15 +1661,15 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
            if isinstance(denom.owner.op, subtensor.AdvancedSubtensor):
                # Base case
                adv_subtensor = denom
-                #out_grad /= 1.
+                # out_grad /= 1.
            elif denom.owner.op == tensor.mul:
                # Try to find the AdvancedSubtensor node mentionned above,
                # and the output gradient
                for i, input in enumerate(denom.owner.inputs):
                    if input.owner and isinstance(input.owner.op,
                                                  subtensor.AdvancedSubtensor):
-                        other_inputs = [in_ for (j,
+                        other_inputs = [in_ for (j, in_) in
-                             in_) in enumerate(denom.owner.inputs) if j != i]
+                                        enumerate(denom.owner.inputs) if j != i]
                        if len(other_inputs) == 1:
                            rest = other_inputs[0]
                        else:
@@ -1894,16 +1898,14 @@ def categorical_crossentropy(coding_dist, true_dist):
    """
    if true_dist.ndim == coding_dist.ndim:
-        return -tensor.sum(true_dist * tensor.log(coding_dist), axis=coding_dist.ndim-1)
+        return -tensor.sum(true_dist * tensor.log(coding_dist),
+                           axis=coding_dist.ndim - 1)
    elif true_dist.ndim == coding_dist.ndim - 1:
        return crossentropy_categorical_1hot(coding_dist, true_dist)
    else:
        raise TypeError('rank mismatch between coding and true distributions')
-from theano import scalar
 class Prepend_scalar_constant_to_each_row(gof.Op):
    __props__ = ()
@@ -2026,7 +2028,7 @@ local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax_op, 'x')),
 # don't do register_stabilize, this is to make local_log_softmax run
 # only after another more specific optimization that stabilizes cross entropy
-#opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
+# opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
 opt.register_specialize(local_log_softmax, 'fast_compile_gpu', name='local_log_softmax')

--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
@@ -7,7 +7,6 @@ from __future__ import print_function
 import warnings
 import numpy
-from six.moves import xrange
 import theano
 from theano import config, gof, printing, scalar
@@ -129,9 +128,8 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
        """
        This method was used to generate the graph: sigmoid_prec.png in the doc
        """
-        import matplotlib
        data = numpy.arange(-15, 15, .1)
-        val = 1/(1+numpy.exp(-data))
+        val = 1 / (1 + numpy.exp(-data))
        def hard_sigmoid(x):
            return theano.tensor.nnet.hard_sigmoid(x)
@@ -167,7 +165,7 @@ sigmoid_inplace = elemwise.Elemwise(
    ScalarSigmoid(scalar.transfer_type(0)),
    inplace_pattern={0: 0},
    name='sigmoid_inplace',
-        )
+)
 pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
@@ -240,7 +238,7 @@ pprint.assign(ultra_fast_sigmoid,
              printing.FunctionPrinter('ultra_fast_sigmoid'))
-#@opt.register_uncanonicalize
+# @opt.register_uncanonicalize
 @gof.local_optimizer([sigmoid])
 def local_ultra_fast_sigmoid(node):
    """
@@ -290,7 +288,7 @@ def hard_sigmoid(x):
    return x
-#@opt.register_uncanonicalize
+# @opt.register_uncanonicalize
 @gof.local_optimizer([sigmoid])
 def local_hard_sigmoid(node):
    if (isinstance(node.op, tensor.Elemwise) and
@@ -439,7 +437,8 @@ def is_1pexp(t):
    return None
-AddConfigVar('warn.identify_1pexp_bug',
+AddConfigVar(
+    'warn.identify_1pexp_bug',
    'Warn if Theano versions prior to 7987b51 (2011-12-18) could have '
    'yielded a wrong result due to a bug in the is_1pexp function',
    BoolParam(theano.configdefaults.warn_default('0.4.1')),
@@ -892,7 +891,7 @@ def local_1msigmoid(node):
        if sub_r.owner and sub_r.owner.op == sigmoid:
            try:
                val_l = opt.get_scalar_constant_value(sub_l)
-            except Exception as e:
+            except Exception:
                return
            if numpy.allclose(numpy.sum(val_l), 1):
                return [sigmoid(-sub_r.owner.inputs[0])]
@@ -921,7 +920,6 @@ if 0:
        print(sigm_canonicalize(node))
    def sigm_canonicalize(node):
-        add = tensor.add
        mul = tensor.mul
        div = tensor.true_div

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -88,15 +88,7 @@ whitelist_flake8 = [
    "tensor/signal/conv.py",
    "tensor/signal/tests/test_conv.py",
    "tensor/signal/tests/test_downsample.py",
-    "tensor/nnet/nnet.py",
-    "tensor/nnet/Conv3D.py",
    "tensor/nnet/__init__.py",
-    "tensor/nnet/ConvTransp3D.py",
-    "tensor/nnet/sigm.py",
-    "tensor/nnet/ConvGrad3D.py",
-    "tensor/nnet/conv3d2d.py",
-    "tensor/nnet/conv.py",
-    "tensor/nnet/neighbours.py",
    "tensor/nnet/tests/test_conv.py",
    "tensor/nnet/tests/test_neighbours.py",
    "tensor/nnet/tests/test_nnet.py",