opt.py has been modified in order to respect the flake8 style.

54fe4a7f · Chiheb Trabelsi · 1a3948cc · 54fe4a7f
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -10,22 +10,32 @@ import warnings
 import numpy
 from six.moves import reduce, xrange
+from . import dnn
 import theano
 from theano import scalar as scal
 from theano import config, tensor, gof
 import theano.ifelse
+import theano.tensor.signal.pool
+import theano.tensor.nnet
+import theano.tensor.nnet.neighbours
+# Convolution
+from theano.tensor.nnet import conv
+from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
+from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
+# Pooling
+import theano.tensor.signal.pool as pool
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
                        Optimizer, TopoOptimizer, toolbox)
 from theano.gof.opt import LocalMetaOptimizer
+from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
 from theano.sandbox.cuda import as_cuda_ndarray_variable
 from theano.sandbox.cuda.basic_ops import (
    gpu_eye, gpu_contiguous,
    gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
    GpuContiguous,
    GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce,
-    GpuFlatten, gpu_flatten,
+    gpu_flatten,
    GpuSubtensor, GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
    GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
 # This is a partial list of CPU ops that can, in some circumstances, be
 # moved to the GPU. This list is used by an optimization.
 # Hopefully, we can keep this list up to date.
-import theano.tensor.signal.pool
-import theano.tensor.nnet.neighbours
 cpu_ops_moved_to_gpu = [
    tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
    tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node):
        if y.ndim == 2:
            y_ = y_.dimshuffle(0, 1, "x")
        z = GpuBatchedDot()(as_cuda_ndarray_variable(x_),
-                           as_cuda_ndarray_variable(y_))
+                            as_cuda_ndarray_variable(y_))
        # unpad z shape
        if x.ndim == 2:
            z = z.dimshuffle(0, *range(2, z.ndim))
@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
            if x.type == node.outputs[0].type:
                return [x]
            elif (all([c != "output" and isinstance(c.op, GpuFromHost)
-                      for c, i in node.outputs[0].clients])
+                      for c, i in node.outputs[0].clients]) and
-                  and x.owner and x.owner.op.__class__ in
+                  x.owner and x.owner.op.__class__ in
                  cpu_ops_moved_to_gpu):
                # It is not always good to transfer the reduction to
                # the GPU when the clients are on the GPU but not the
@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node):
        # automatically add more cases, as some, like trigonometric
        # operations with some reduction patterns, will probably result
        # in a slowdown.
-        isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
+       isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
        op = node.op
        inp = node.inputs[0].owner.inputs[0]
@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
            return [gpu_flatten(host_input.owner.inputs[0], outdim)(
                as_cuda_ndarray_variable(host_input.owner.inputs[0]))]
    if isinstance(node.op, tensor.Flatten):
-        x, = node.inputs
+        x, shp = node.inputs
+        outdim = node.op.outdim
        if x.owner and isinstance(x.owner.op, HostFromGpu):
            outdim = node.op.outdim
            gpu_x, = x.owner.inputs
@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
                                                *coords)]
    if isinstance(node.op, tensor.Subtensor):
        x = node.inputs[0]
-        if (x.owner and
+        if (x.owner and x.dtype == "float32" and
-            isinstance(x.owner.op, HostFromGpu) and
+                isinstance(x.owner.op, HostFromGpu)):
-            x.dtype == "float32"):
            gpu_x = x.owner.inputs[0]
-            if (gpu_x.owner and
+            if (gpu_x.owner and  # And it is a shared var or an input of the graph.
-                isinstance(gpu_x.owner.op, GpuFromHost) and
+                    not(gpu_x.owner.inputs[0].owner) and
-                # And it is a shared var or an input of the graph.
+                    isinstance(gpu_x.owner.op, GpuFromHost)):
-                not gpu_x.owner.inputs[0].owner):
                if len(x.clients) == 1:
                    if any([n == 'output' or isinstance(n.op, GpuOp)
@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
                    'least \'0.6\'.', stacklevel=1)
            active_device_no = theano.sandbox.cuda.active_device_number()
            compute_capability = device_properties(active_device_no)['major']
-            if (compute_capability < 2 or
+            if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
-                x.ndim != 2 or
-                y.ndim != 2):
                gpu_op = GpuAdvancedIncSubtensor1(
                    set_instead_of_inc=set_instead_of_inc)
@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
            active_device_no = theano.sandbox.cuda.active_device_number()
            compute_capability = device_properties(active_device_no)['major']
-            if (compute_capability < 2 or
+            if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
-                x.ndim != 2 or
-                y.ndim != 2):
                gpu_op = GpuAdvancedIncSubtensor1(
                    set_instead_of_inc=set_instead_of_inc)
            else:
@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
    # Incrementing a float32 x results in a float32
    # output even if y is float64, so we can downcast
    # y to put it on GPU
-    elif type(node.op) == tensor.IncSubtensor and \
+    elif (type(node.op) == tensor.IncSubtensor and
-       node.inputs[0].dtype == "float32":
+          node.inputs[0].dtype == "float32"):
        x, y = node.inputs[0:2]
        assert isinstance(x.type, tensor.TensorType)
        assert isinstance(y.type, tensor.TensorType)
@@ -1346,8 +1349,6 @@ def cast(x, dtype):
    cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
    return cast_op(x)
-import theano.tensor.nnet
 @register_opt()
 @local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
    return False
-# Convolution
-from theano.tensor.nnet import conv
 def _gpu_conv_to_fftconv(node):
    # shared helper function for local_conv_fft_valid and local_conv_fft_full.
    # we import conv2d_fft locally to avoid pycuda warnings
    from theano.sandbox.cuda.fftconv import conv2d_fft
    kwargs = {'border_mode': node.op.border_mode}
-    if (node.op.imshp is not None and
+    if (node.op.imshp is not None and node.op.imshp[-1] % 2 == 1 and
-        node.op.imshp[-1] is not None and
+            node.op.imshp[-1] is not None):
-        node.op.imshp[-1] % 2 == 1):
        kwargs['pad_last_dim'] = True
    # If the user supplied the full nonsymbolic image_shape and
@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
 @local_optimizer([GpuConv])
 def local_conv_fft_valid(node):
    if isinstance(node.op, GpuConv):
-        if (node.op.border_mode == 'valid' and
+        if (node.op.border_mode == 'valid' and node.op.fft_opt and
-            node.op.subsample == (1, 1) and
+                node.op.subsample == (1, 1)):
-            node.op.fft_opt):
            return [_gpu_conv_to_fftconv(node)]
        return False
@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
 @local_optimizer([GpuConv])
 def local_conv_fft_full(node):
    if isinstance(node.op, GpuConv):
-        if (node.op.border_mode == 'full' and
+        if (node.op.border_mode == 'full' and node.op.fft_opt and
-            node.op.subsample == (1, 1) and
+                node.op.subsample == (1, 1)):
-            node.op.fft_opt):
            return [_gpu_conv_to_fftconv(node)]
        return
@@ -1586,7 +1580,7 @@ def local_gpu_conv(node):
 @local_optimizer([GpuConv])
 def local_conv_gemm(node):
    if (isinstance(node.op, GpuConv) and
-        node.op.border_mode in ['full', 'valid']):
+            node.op.border_mode in ['full', 'valid']):
        img, kern = node.inputs
        border_mode = node.op.border_mode
@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
                       'conv_fft')
 # cuDNN is the second, but only registered if cuDNN is available.
 # It can be disabled by excluding 'conv_dnn' or 'cudnn'.
-from . import dnn
 # We can't check at import if dnn is available, so we must always
 # register it. This does not cause a problem: if it is not available,
 # the opt will do nothing.
@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
        shapes = ((node.op.bsize,) + node.op.imshp,
                  (node.op.nkern, nchannels) + node.op.kshp)
        for (var, shape) in zip(vars, shapes):
-            if ((var in inputs) and
+            if ((var in inputs) and (shape is not None) and
-                (shape is not None) and
+                    not any(s is None for s in shape)):
-                not any(s is None for s in shape)):
                result[var] = theano.shared(
                    # TODO: Use var.type.filter when cuda_ndarray.filter
@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
 gpu_optimizer.register("conv3d_fft", local_conv3d_fft)
-from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
 @local_optimizer([ConvGrad3D])
 def local_convgrad3d_fft(node):
@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node):
    except tensor.NotScalarConstantError:
        return False
    if (isinstance(node.op, ConvGrad3D) and
-        (stride_x, stride_y, stride_z) == (1, 1, 1)):
+            (stride_x, stride_y, stride_z) == (1, 1, 1)):
        # we import conv3d_fft locally to avoid pycuda warnings
        from theano.sandbox.cuda.fftconv import conv3d_fft
@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
 gpu_optimizer.register("convgrad3d_fft", local_convgrad3d_fft)
-from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
 @local_optimizer([ConvTransp3D])
 def local_convtransp3d_fft(node):
@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node):
    except tensor.NotScalarConstantError:
        return False
    if (isinstance(node.op, ConvTransp3D) and
-        (stride_x, stride_y, stride_z) == (1, 1, 1)):
+            (stride_x, stride_y, stride_z) == (1, 1, 1)):
        # we import conv3d_fft locally to avoid pycuda warnings
        from theano.sandbox.cuda.fftconv import conv3d_fft
        # Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t)
@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
 gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
-# Pooling
-import theano.tensor.signal.pool as pool
 @register_opt()
 @local_optimizer([pool.Pool])
 def local_gpu_downsample_factor_max(node):
-    if (isinstance(node.op, pool.Pool)
+    if (isinstance(node.op, pool.Pool) and
-        and node.op.ds == node.op.st):
+            node.op.ds == node.op.st):
        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                     'mode')
@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node):
 @register_opt()
 @local_optimizer([pool.MaxPoolGrad])
 def local_gpu_downsample_factor_max_grad(node):
-    if (isinstance(node.op, pool.MaxPoolGrad) and
+    if (isinstance(node.op, pool.MaxPoolGrad) and node.op.ds == node.op.st):
-        node.op.ds == node.op.st):
        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                     'mode')
        if (node.op.padding != (0, 0) or
-            node.op.mode != 'max' or
+                node.op.mode != 'max' or
-            node.op.st != node.op.ds):
+                node.op.st != node.op.ds):
            return
        x, z, gz = node.inputs
@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
                                     as_cuda_ndarray_variable(gx)))]
-from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
 @register_opt()
 @local_optimizer([tensor.Join])
 def local_gpu_join(node):
@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node):
    if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
        inp = node.inputs[0]
        if (isinstance(inp, CudaNdarrayConstant) and
-            inp.data.size == 1 and
+                inp.data.size == 1 and
-            (numpy.asarray(inp.data) == 0).all()):
+                (numpy.asarray(inp.data) == 0).all()):
            new_out = GpuAlloc(memset_0=True)(*node.inputs)
            old_bcast = node.outputs[0].type.broadcastable
@@ -2308,8 +2287,9 @@ def local_gpu_eye(node):
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
-            isinstance(host_input.owner.op, tensor.Eye) and
+                isinstance(host_input.owner.op, tensor.Eye) and
-            host_input.owner.op.dtype == "float32"):
+                host_input.owner.op.dtype == "float32"):
            if tensor.extract_constant(host_input.owner.inputs[2]) != 0:
                return
            return [gpu_eye(*host_input.owner.inputs)]
@@ -2324,7 +2304,7 @@ def local_gpu_eye(node):
 def safe_to_gpu(x):
    if (isinstance(x.type, tensor.TensorType) and
-        x.type.dtype == 'float32'):
+            x.type.dtype == 'float32'):
        return as_cuda_ndarray_variable(x)
    else:
@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
 def tensor_to_cuda(x):
    if (isinstance(x.type, tensor.TensorType) and
-        x.type.dtype == 'float32'):
+            x.type.dtype == 'float32'):
        y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
        if x.name:
@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node):
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
-            isinstance(host_input.owner.op, scan_op.Scan) and
+                isinstance(host_input.owner.op, scan_op.Scan) and
-            not host_input.owner.op.info['gpu'] and
+                not host_input.owner.op.info['gpu'] and
-            len(host_input.owner.outputs) == 1):
+                len(host_input.owner.outputs) == 1):
            # Note that we are not doing the right thing here !!
            # This is because the local optimizer expects only one
@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
            return _outputs
    # scan(host_from_gpu) -> host_from_gpu(GPUscan)
-    if (type(node.op) == scan_op.Scan
+    if (type(node.op) == scan_op.Scan and
-        and not node.op.info['gpu']):
+            not node.op.info['gpu']):
        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                for i in node.inputs]):
@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node):
        kern = kern.dimshuffle(1, 0, 2, 3)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid', subsample)(
-                gpu_contiguous(kern), gpu_contiguous(img))
+            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node):
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1, 1)) and
-            (node.op.imshp is not None) and
+                (node.op.imshp is not None) and
-            (None not in node.op.imshp[-2:]) and
+                (None not in node.op.imshp[-2:]) and
-            (node.op.kshp is not None) and
+                (node.op.kshp is not None) and
-            (None not in node.op.kshp) and
+                (None not in node.op.kshp) and
-             border_mode != "half"):
+                border_mode != "half"):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1]
            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *