pep8

d87cf8b4 · Kelvin Xu · 99cffe57 · d87cf8b4 · d87cf8b4 · d87cf8b4
--- a/theano/sandbox/cuda/extra_ops.py
+++ b/theano/sandbox/cuda/extra_ops.py
@@ -43,7 +43,7 @@ class GpuCumsum(CumsumOp, GpuOp):
        if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            raise NotImplementedError('Only cumsum on 1D, 2D and 3D array are supported right now!')
-        print(self.axis)
        if self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError('axis(={1}) out of bounds'.format(self.axis))

--- a/theano/sandbox/gpuarray/extra_ops.py
+++ b/theano/sandbox/gpuarray/extra_ops.py
 from __future__ import absolute_import, print_function, division
-import theano
-import numpy
 import os
-from theano import Op, Apply, config
+from theano import Apply
 from theano.tensor.extra_ops import CumsumOp
 try:
-    import pygpu
    from pygpu import gpuarray
 except ImportError:
    pass
 from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
-                        infer_context_name, GpuFromHost, HideC)
+                        infer_context_name, GpuFromHost)
 from .opt import register_opt as register_gpu_opt, op_lifter
-from .type import GpuArrayType
-class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
+class GpuCumsum(GpuKernelBase):
    """
    Parameters
    ----------
@@ -34,7 +30,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
        return "%s{%s}" % (self.__class__.__name__, self.axis)
    def c_code_cache_version_apply(self, node):
-        return None
+        return (1,)
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
@@ -43,9 +39,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
        return [os.path.dirname(__file__)]
    def get_params(self, node):
-        return node.inputs[0].type.context 
+        return node.inputs[0].type.context
-    def make_node(self, x): 
+    def make_node(self, x):
        assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
        x = as_gpuarray_variable(x, infer_context_name(x))
@@ -57,17 +53,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
            raise ValueError('axis(={0}) out of bounds'.format(self.axis))
        return Apply(self, [x], [x.type()])
-    # copied from neighbour.py
-    def perform(self, node, inp, out, ctx):
-        # Disable the perform method from the CPU version
-        Op.perform(self, node, inp, out, ctx)
    def gpu_kernels(self, node, nodename):
        kernels = []
        # cumadd
-        kname = "k_cumadd"   
+        kname = "k_cumadd"
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
@@ -77,7 +66,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
                              ga_ssize inputStrides_y,
                              ga_ssize inputStrides_z,
                              ga_ssize outputStrides_x, ga_ssize outputStrides_y,
-                              ga_ssize outputStrides_z, const int offsetY, const int offsetZ, 
+                              ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
                              const int beforeLastElementIdx, const int lastElementIdx){
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
@@ -90,22 +79,22 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
            output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
            }
        """ % locals()
-        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE, 
+        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
-                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, 
+                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
-                  gpuarray.SSIZE, gpuarray.SSIZE, 
+                  gpuarray.SSIZE, gpuarray.SSIZE,
                  'intc', 'intc',
                  'intc', 'intc',
                  ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        # blockCumSum
-        kname = "k_blockCumSum" 
+        kname = "k_blockCumSum"
        k_var = "k_blockCumSum_" + nodename
        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
-                  'int32', 'int32', gpuarray.GpuArray,]
+                  'int32', 'int32', gpuarray.GpuArray, ]
-        code="""
+        code = """
        // helper functions
        WITHIN_KERNEL
        void k_reductionPhase(float* partialCumSum) {
@@ -199,10 +188,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        # k_finalCumSum
-        kname = "k_finalCumSum" 
+        kname = "k_finalCumSum"
        k_var = "k_finalCumSum_" + nodename
        code = """
-        KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum, 
+        KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
                                               ga_ssize dataStrides_x,  ga_ssize dataStrides_y,  ga_ssize dataStrides_z,
                                               int offsetY, int offsetZ) {
            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
@@ -226,12 +215,11 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
        """
        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
-                  'int32', 'int32',]
+                  'int32', 'int32', ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        return kernels
    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != 'cuda':
            raise NotImplementedError("cuda only")
@@ -257,9 +245,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
            }
            { // Namespace for kernel calls //
-                size_t max_threads_dim0; 
+                size_t max_threads_dim0;
-                size_t max_grid_size1; 
+                size_t max_grid_size1;
-                size_t max_grid_size2; 
+                size_t max_grid_size2;
                int err;
                err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
                if (err != GA_NO_ERROR){
@@ -331,7 +319,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
            }
            // Perform cumsum on array of even size.
            size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
-            // Determine how many elements can be processed in one block. 
+            // Determine how many elements can be processed in one block.
            size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);
            // Determine how many blocks are needed in total.
            size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
@@ -389,7 +377,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
                    size_t sharedBytes = (2*dimBlockX) * sizeof(float);
                    void* kernel_params[] = {(void*) input->ga.data,
                                             (void*) output->ga.data,
-                                             (void*) &nbElementsPerCumsum, 
+                                             (void*) &nbElementsPerCumsum,
                                             (void*) &inputStrides_x,
                                             (void*) &inputStrides_y,
                                             (void*) &inputStrides_z,
@@ -417,7 +405,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {dimBlockX, 1, 1};
                        void* kernel_params[] = {(void*) output->ga.data,
-                                                 (void*) deviceBlockSum->ga.data, 
+                                                 (void*) deviceBlockSum->ga.data,
                                                 (void*) &nbElementsPerCumsum,
                                                 (void*) &outputStrides_x,
                                                 (void*) &outputStrides_y,

--- a/theano/sandbox/gpuarray/tests/test_extra_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_extra_ops.py
@@ -10,6 +10,7 @@ import theano
 import theano.tensor.tests.test_extra_ops
 from theano.tensor.extra_ops import cumsum, CumsumOp
+from theano.tests.unittest_tools import SkipTest
 from theano.tests import unittest_tools as utt
 from .config import mode_with_gpu, test_ctx_name
@@ -28,8 +29,6 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
        self.max_threads_dim0 = test_ctx.maxlsize0
        self.max_grid_size1 = test_ctx.maxgsize2
    def test_Strides1D(self):
        x = T.fvector('x')
@@ -113,16 +112,16 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
        # Use multiple GPU threadblocks
-        a = np.random.random((block_max_size+2,)).astype("float32")
+        a = np.random.random((block_max_size + 2, )).astype("float32")
        utt.assert_allclose(np.cumsum(a), f(a))
        # Use recursive cumsum
-        a = np.ones((block_max_size*(block_max_size+1)+2,),
+        a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                    dtype="float32")
        utt.assert_allclose(np.cumsum(a), f(a))
    def test_GpuCumsum2D(self):
-        block_max_size = self.max_threads_dim0 * 2 
+        block_max_size = self.max_threads_dim0 * 2
        x = T.fmatrix('x')
        for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
@@ -143,21 +142,21 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks
            a_shape = [5, 5]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks
            a_shape = [4, 4]
-            a_shape[1-shape_axis] = self.max_grid_size1+1
+            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
            # Use recursive cumsum
            a_shape = [3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum3D(self):
@@ -182,32 +181,32 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
-            a_shape[shape_axis] = block_max_size+2
+            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            a_shape = [5, 5, 5]
-            a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
+            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
-                a = np.sign(a-0.5).astype("float32")
+                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
            # Use recursive cumsum (along accumulation axis)
            a_shape = [3, 3, 3]
-            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
-            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum4D(self):