提交 771d5c27 authored 作者: Marc-Alexandre Cote's avatar Marc-Alexandre Cote

Added unit tests and comments.

上级 2ad36935
......@@ -9,7 +9,7 @@ from theano.sandbox.cuda import GpuFlatten
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host, HostFromGpu
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
......@@ -78,12 +78,13 @@ class GpuCumsum(CumsumOp, GpuOp):
compute_map, no_recycling)
def c_code_cache_version(self):
return (3,)
return (4,)
def c_support_code_apply(self, node, nodename):
return """
__device__
void k_reductionPhase_%(nodename)s(float* partialCumSum) {
// Traverse down from leaves to root building partial sums at internal nodes in the tree.
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
__syncthreads();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
......@@ -95,6 +96,7 @@ class GpuCumsum(CumsumOp, GpuOp):
__device__
void k_reversePhase_%(nodename)s(float* partialCumSum) {
// Traverse back up the tree building the scan from the partial sums
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
__syncthreads();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
......@@ -151,7 +153,7 @@ class GpuCumsum(CumsumOp, GpuOp):
__global__
void k_blockCumSum_%(nodename)s(float* input, float* output, int numElements, dim3 dataStrides, int dataOffset, float* blockSum) {
// Regarding blockIdx and threadIdx, 'Cumsum' is always perform along the X axis.
// Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
// The Y axis will contain all the independent cumsums of the 2D case.
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -166,6 +168,9 @@ class GpuCumsum(CumsumOp, GpuOp):
// Load data in shared memory
k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, dataStrides, dataOffset);
// Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
// The tree is sweeped from the leaves to the root and from the root to the leaves.
// Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
k_reductionPhase_%(nodename)s(partialCumSum);
k_reversePhase_%(nodename)s(partialCumSum);
......@@ -179,7 +184,7 @@ class GpuCumsum(CumsumOp, GpuOp):
}
}
void cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int maxThreads, int axis, int maxGridY) {
int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int maxThreads, int axis, int maxGridY) {
int shape[2] = { 1, 1 };
dim3 dataStrides(0,0,0);
......@@ -195,12 +200,14 @@ class GpuCumsum(CumsumOp, GpuOp):
dataStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
dataStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
break;
default: printf("Only 1D and 2D cumsum is implemented yet.\\n");
default:
printf("Only 1D and 2D cumsum is implemented yet.\\n");
return -1;
}
if (shape[axis] <= 1) {
CudaNdarray_CopyFromCudaNdarray(output, input);
return;
return 0;
}
if (axis == 1) {
......@@ -215,8 +222,7 @@ class GpuCumsum(CumsumOp, GpuOp):
int dimGridY = shape[1-axis]; // Nb. of independent cumsums.
const int shapeBlockSum[2] = { dimGridX, dimGridY };
//CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_ZEROS(2, (int*)shapeBlockSum);
CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
for (int dataOffset = 0; dataOffset < dimGridY; dataOffset += maxGridY){
int localDimGridY = min(dimGridY - dataOffset, maxGridY);
......@@ -224,7 +230,6 @@ class GpuCumsum(CumsumOp, GpuOp):
dim3 dimGrid(dimGridX, localDimGridY, 1);
int sharedBytes = (2*blockSize) * sizeof(float);
cudaThreadSynchronize();
k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
(
CudaNdarray_DEV_DATA(input),
......@@ -237,8 +242,12 @@ class GpuCumsum(CumsumOp, GpuOp):
if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive).
cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, maxThreads, 0, maxGridY);
if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, maxThreads, 0, maxGridY) == -1){
return -1;
}
// Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cumsums of previous blocks to subsequents ones.
dim3 dimGrid(dimGridX, dimGridY, 1);
dim3 dimBlock(blockSize, 1, 1);
k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
......@@ -253,7 +262,7 @@ class GpuCumsum(CumsumOp, GpuOp):
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != numElements){
cudaThreadSynchronize();
CNDA_THREAD_SYNC;
dim3 dimGrid(1, localDimGridY, 1);
dim3 dimBlock(1, 1, 1);
k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
......@@ -269,7 +278,8 @@ class GpuCumsum(CumsumOp, GpuOp):
}
cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum));
cudaThreadSynchronize();
CNDA_THREAD_SYNC;
return 0;
}
""" % locals()
......@@ -321,7 +331,9 @@ class GpuCumsum(CumsumOp, GpuOp):
}
{ // Namespace for kernel calls //
cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, %(axis)s, %(max_grid_size1)s);
if (cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, %(axis)s, %(max_grid_size1)s) == -1){
%(fail)s;
}
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
......@@ -340,7 +352,11 @@ class GpuCumsum(CumsumOp, GpuOp):
@local_optimizer([CumsumOp])
def use_gpu_cumsum(node):
if type(node.op) is CumsumOp and node.inputs[0].dtype == 'float32':
if type(node.op) is CumsumOp \
and node.inputs[0].dtype == 'float32' \
and node.inputs[0].owner \
and isinstance(node.inputs[0].owner.op, HostFromGpu):
axis = node.op.axis
x = node.inputs[0]
......
......@@ -17,12 +17,11 @@ from theano import tensor as T
import numpy as np
import theano
from theano import config
from theano.tensor.extra_ops import cumsum
from theano.tensor.extra_ops import cumsum, CumsumOp
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu
op = GpuCumsum
def setUp(self):
super(TestGpuCumsum, self).setUp()
......@@ -68,8 +67,8 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
# Extensive testing for the first 1k sizes
a = np.ones((int(1e3),), dtype="float32")
# Extensive testing for the first 1025 sizes
a = np.random.random(1025).astype("float32")
for i in xrange(a.shape[0]):
assert np.allclose(np.cumsum(a[:i]), f(a[:i]))
......@@ -86,36 +85,43 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
block_max_size = self.max_threads_dim0 * 2
x = T.fmatrix('x')
for axis in xrange(2):
for shape_axis, axis in zip([0, 1, 0], [0, 1, None]):
f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
# Extensive testing for the first 1k sizes
# Extensive testing for the first 1025 sizes
a_shape = [5, 5]
a_shape[axis] = int(1e3)
a = np.ones(a_shape, dtype="float32")
a_shape[shape_axis] = 1025
a = np.random.random(a_shape).astype("float32")
slices = [slice(None), slice(None)]
for i in xrange(a.shape[axis]):
slices[axis] = slice(i)
for i in xrange(a.shape[shape_axis]):
slices[shape_axis] = slice(i)
fa = f(a[slices])
npa = np.cumsum(a[slices], axis=axis)
assert np.allclose(npa, fa)
# Use multiple GPU threadblocks
a_shape = [5, 5]
a_shape[axis] = block_max_size+2
a = np.ones(a_shape, dtype="float32")
a_shape[shape_axis] = block_max_size+2
a = np.random.random(a_shape).astype("float32")
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks
a_shape = [5, 5]
a_shape[1-axis] = self.max_grid_size1+1
a = np.ones(a_shape, dtype="float32")
a_shape[1-shape_axis] = self.max_grid_size1+1
a = np.random.random(a_shape).astype("float32")
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum
a_shape = [5, 5]
a_shape[axis] = block_max_size*(block_max_size+1)+2
a_shape = [5, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
a = np.ones(a_shape, dtype="float32")
assert np.allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self):
# Should not use the GPU version.
x = T.ftensor3('x')
f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, CumsumOp)]
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论