提交 d87cf8b4 作者: Kelvin Xu

pep8

上级 99cffe57
...@@ -43,7 +43,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -43,7 +43,7 @@ class GpuCumsum(CumsumOp, GpuOp):
if x.ndim > GpuCumsum.SUPPORTED_NDIMS: if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and 3D array are supported right now!') raise NotImplementedError('Only cumsum on 1D, 2D and 3D array are supported right now!')
print(self.axis)
if self.axis >= x.ndim or self.axis < -x.ndim: if self.axis >= x.ndim or self.axis < -x.ndim:
raise ValueError('axis(={1}) out of bounds'.format(self.axis)) raise ValueError('axis(={1}) out of bounds'.format(self.axis))
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import theano
import numpy
import os import os
from theano import Op, Apply, config from theano import Apply
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp
try: try:
import pygpu
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name, GpuFromHost, HideC) infer_context_name, GpuFromHost)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
class GpuCumsum(GpuKernelBase, HideC, CumsumOp): class GpuCumsum(GpuKernelBase):
""" """
Parameters Parameters
---------- ----------
...@@ -34,7 +30,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -34,7 +30,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
return "%s{%s}" % (self.__class__.__name__, self.axis) return "%s{%s}" % (self.__class__.__name__, self.axis)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
return None return (1,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
...@@ -43,9 +39,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -43,9 +39,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
return [os.path.dirname(__file__)] return [os.path.dirname(__file__)]
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return node.inputs[0].type.context
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
x = as_gpuarray_variable(x, infer_context_name(x)) x = as_gpuarray_variable(x, infer_context_name(x))
...@@ -57,17 +53,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -57,17 +53,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
raise ValueError('axis(={0}) out of bounds'.format(self.axis)) raise ValueError('axis(={0}) out of bounds'.format(self.axis))
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
# copied from neighbour.py
def perform(self, node, inp, out, ctx):
# Disable the perform method from the CPU version
Op.perform(self, node, inp, out, ctx)
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
kernels = [] kernels = []
# cumadd # cumadd
kname = "k_cumadd" kname = "k_cumadd"
k_var = "k_cumadd_" + nodename k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x) flags = Kernel.get_flags(dtype_x)
...@@ -77,7 +66,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -77,7 +66,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
ga_ssize inputStrides_y, ga_ssize inputStrides_y,
ga_ssize inputStrides_z, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ, ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){ const int beforeLastElementIdx, const int lastElementIdx){
int idY = blockIdx.y + offsetY; int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int idZ = blockIdx.z + offsetZ;
...@@ -90,22 +79,22 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -90,22 +79,22 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
output[idx_last_output] = input[idx_last_input] + output[idx_beforelast]; output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
} }
""" % locals() """ % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE, params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
] ]
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
# blockCumSum # blockCumSum
kname = "k_blockCumSum" kname = "k_blockCumSum"
k_var = "k_blockCumSum_" + nodename k_var = "k_blockCumSum_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray,] 'int32', 'int32', gpuarray.GpuArray, ]
code=""" code = """
// helper functions // helper functions
WITHIN_KERNEL WITHIN_KERNEL
void k_reductionPhase(float* partialCumSum) { void k_reductionPhase(float* partialCumSum) {
...@@ -199,10 +188,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -199,10 +188,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
# k_finalCumSum # k_finalCumSum
kname = "k_finalCumSum" kname = "k_finalCumSum"
k_var = "k_finalCumSum_" + nodename k_var = "k_finalCumSum_" + nodename
code = """ code = """
KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum, KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) { int offsetY, int offsetZ) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
...@@ -226,12 +215,11 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -226,12 +215,11 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
""" """
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32',] 'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
return kernels return kernels
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda': if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only") raise NotImplementedError("cuda only")
...@@ -257,9 +245,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -257,9 +245,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
} }
{ // Namespace for kernel calls // { // Namespace for kernel calls //
size_t max_threads_dim0; size_t max_threads_dim0;
size_t max_grid_size1; size_t max_grid_size1;
size_t max_grid_size2; size_t max_grid_size2;
int err; int err;
err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0); err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
...@@ -331,7 +319,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -331,7 +319,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
} }
// Perform cumsum on array of even size. // Perform cumsum on array of even size.
size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2); size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
// Determine how many elements can be processed in one block. // Determine how many elements can be processed in one block.
size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0); size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);
// Determine how many blocks are needed in total. // Determine how many blocks are needed in total.
size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum. size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum.
...@@ -389,7 +377,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -389,7 +377,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
size_t sharedBytes = (2*dimBlockX) * sizeof(float); size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data, void* kernel_params[] = {(void*) input->ga.data,
(void*) output->ga.data, (void*) output->ga.data,
(void*) &nbElementsPerCumsum, (void*) &nbElementsPerCumsum,
(void*) &inputStrides_x, (void*) &inputStrides_x,
(void*) &inputStrides_y, (void*) &inputStrides_y,
(void*) &inputStrides_z, (void*) &inputStrides_z,
...@@ -417,7 +405,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp): ...@@ -417,7 +405,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data, void* kernel_params[] = {(void*) output->ga.data,
(void*) deviceBlockSum->ga.data, (void*) deviceBlockSum->ga.data,
(void*) &nbElementsPerCumsum, (void*) &nbElementsPerCumsum,
(void*) &outputStrides_x, (void*) &outputStrides_x,
(void*) &outputStrides_y, (void*) &outputStrides_y,
......
...@@ -10,6 +10,7 @@ import theano ...@@ -10,6 +10,7 @@ import theano
import theano.tensor.tests.test_extra_ops import theano.tensor.tests.test_extra_ops
from theano.tensor.extra_ops import cumsum, CumsumOp from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests.unittest_tools import SkipTest
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .config import mode_with_gpu, test_ctx_name from .config import mode_with_gpu, test_ctx_name
...@@ -28,8 +29,6 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -28,8 +29,6 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
self.max_threads_dim0 = test_ctx.maxlsize0 self.max_threads_dim0 = test_ctx.maxlsize0
self.max_grid_size1 = test_ctx.maxgsize2 self.max_grid_size1 = test_ctx.maxgsize2
def test_Strides1D(self): def test_Strides1D(self):
x = T.fvector('x') x = T.fvector('x')
...@@ -113,16 +112,16 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -113,16 +112,16 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i])) utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a = np.random.random((block_max_size+2,)).astype("float32") a = np.random.random((block_max_size + 2, )).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
# Use recursive cumsum # Use recursive cumsum
a = np.ones((block_max_size*(block_max_size+1)+2,), a = np.ones((block_max_size * (block_max_size + 1) + 2,),
dtype="float32") dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
def test_GpuCumsum2D(self): def test_GpuCumsum2D(self):
block_max_size = self.max_threads_dim0 * 2 block_max_size = self.max_threads_dim0 * 2
x = T.fmatrix('x') x = T.fmatrix('x')
for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]): for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
...@@ -143,21 +142,21 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -143,21 +142,21 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a_shape = [5, 5] a_shape = [5, 5]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks # Use multiple GPU gridblocks
a_shape = [4, 4] a_shape = [4, 4]
a_shape[1-shape_axis] = self.max_grid_size1+1 a_shape[1 - shape_axis] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5) utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum # Use recursive cumsum
a_shape = [3, 3] a_shape = [3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self): def test_GpuCumsum3D(self):
...@@ -182,32 +181,32 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -182,32 +181,32 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis) # Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2] a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis) # Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis) # Use recursive cumsum (along accumulation axis)
a_shape = [3, 3, 3] a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum4D(self): def test_GpuCumsum4D(self):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论