提交 29e1eb1e authored 作者: Marc-Alexandre Cote's avatar Marc-Alexandre Cote

Fixed bugs, added some unit tests.

上级 8fc116f9
...@@ -25,7 +25,8 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -25,7 +25,8 @@ class GpuCumsum(CumsumOp, GpuOp):
out_type = x.type() out_type = x.type()
if self.axis is None and x.ndim > 1: if self.axis is None and x.ndim > 1:
out_type = CudaNdarrayType(broadcastable=(False,), dtype=x.dtype) out_type = CudaNdarrayType(broadcastable=(False,), dtype=x.dtype)()
return theano.Apply(self, [x], [out_type]) return theano.Apply(self, [x], [out_type])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
...@@ -56,28 +57,38 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -56,28 +57,38 @@ class GpuCumsum(CumsumOp, GpuOp):
axis = self.axis axis = self.axis
return """ return """
__global__ __global__
void finalCumSum_1D_%(nodename)s(float * output, float * blockSum) { void finalCumSum_1D_%(nodename)s(float* output, float* blockSum, int numElements) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
if (globalThreadID < ceil(numElements/2.0)) {
const float currentBlockSum = blockSum[blockIdx.x]; const float currentBlockSum = blockSum[blockIdx.x];
output[globalThreadID * 2] += currentBlockSum; output[globalThreadID*2] += currentBlockSum;
output[(globalThreadID * 2) + 1] += currentBlockSum; output[globalThreadID*2 + 1] += currentBlockSum;
}
} }
__global__ __global__
void blockCumSum_1D_%(nodename)s(float * input, float * output, int numElements, float * blockSum) { void cumadd_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) {
output[lastElementIdx] = input[lastElementIdx] + output[beforeLastElementIdx];
}
__global__
void blockCumSum_1D_%(nodename)s(float* input, float* output, int numElements, float* blockSum) {
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x; int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
if (globalThreadID < numElements/2) { if (globalThreadID < ceil(numElements/2.0)) {
extern __shared__ float partialCumSum[]; extern __shared__ float partialCumSum[];
// Load data in shared memory // Load data in shared memory
partialCumSum[threadIdx.x*2] = input[globalThreadID*2]; partialCumSum[threadIdx.x*2] = input[globalThreadID*2];
partialCumSum[(threadIdx.x *2) +1] = input[(globalThreadID * 2) + 1]; partialCumSum[threadIdx.x*2 + 1] = input[globalThreadID*2 + 1];
// Reduction Phase // Reduction Phase
for (int stride = 1; stride < blockDim.x*2; stride *= 2) { int stride;
for (stride = 1; stride <= blockDim.x; stride *= 2) {
__syncthreads(); __syncthreads();
int index = (threadIdx.x + 1) * (stride * 2) - 1; int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) { if(index < blockDim.x*2) {
...@@ -86,7 +97,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -86,7 +97,7 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
// Reverse Phase // Reverse Phase
for (int stride = blockDim.x*2/2; stride > 0; stride /= 2) { for (; stride > 0; stride /= 2) {
__syncthreads(); __syncthreads();
int index = (threadIdx.x + 1) * (stride * 2) - 1; int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) { if(index + stride < blockDim.x*2) {
...@@ -96,10 +107,13 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -96,10 +107,13 @@ class GpuCumsum(CumsumOp, GpuOp):
// Wtite the final output to global memory // Wtite the final output to global memory
__syncthreads(); __syncthreads();
output[globalThreadID * 2] = partialCumSum[threadIdx.x * 2]; output[globalThreadID*2] = partialCumSum[threadIdx.x*2];
output[(globalThreadID * 2) + 1] = partialCumSum[(threadIdx.x * 2) + 1]; output[globalThreadID*2 + 1] = partialCumSum[threadIdx.x*2 + 1];
if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) { if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x] = partialCumSum[(threadIdx.x * 2) + 1]; blockSum[blockIdx.x] = partialCumSum[threadIdx.x*2 + 1];
}
} }
} }
} }
...@@ -134,46 +148,95 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -134,46 +148,95 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
{ // Namespace for kernel calls // { // Namespace for kernel calls //
int blockSize = min((int)shape[0], %(max_threads_dim0)s/2); int numElements = shape[0] - (shape[0] %% 2);
int blockSize = ceil( min(numElements, 2*%(max_threads_dim0)s) / 2.);
int dimGridX = ceil(shape[0] / (2.0*blockSize)); int dimGridX = ceil(shape[0] / (2.0*blockSize));
npy_intp WARDFRT[1] = { dimGridX }; npy_intp shapeBlockSum[1] = { dimGridX };
CudaNdarray * deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(1, WARDFRT); CudaNdarray * deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(1, shapeBlockSum);
dim3 dimBlock(blockSize, 1, 1); dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid(dimGridX, 1, 1); dim3 dimGrid(dimGridX, 1, 1);
int sharedBytes = (2*blockSize) * sizeof(float);
blockCumSum_1D_%(nodename)s<<<dimGrid, dimBlock, (2*blockSize) * sizeof(float)>>>
/*
printf("N: %%d (%%d)\\n", shape[0], numElements);
printf("max_threads_dim0: %%d\\n", %(max_threads_dim0)s);
printf("N_sum: %%d\\n", shapeBlockSum[0]);
printf("dimGridX: %%d\\n", dimGridX);
printf("dimBlock: (%%d, %%d, %%d)\\n", dimBlock.x, dimBlock.y, dimBlock.z);
printf("dimGrid: (%%d, %%d)\\n", dimGrid.x, dimGrid.y);
printf("sharedBytes: %%d bytes\\n", sharedBytes);
*/
blockCumSum_1D_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
( (
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_DEV_DATA(%(z)s),
shape[0], numElements,
CudaNdarray_DEV_DATA(deviceBlockSum) CudaNdarray_DEV_DATA(deviceBlockSum)
); );
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"blockCumSum_1D_%(nodename)s",
cudaGetErrorString(sts),
dimGrid.x,
dimGrid.y,
dimBlock.x,
dimBlock.y,
dimBlock.z,
sharedBytes);
%(fail)s;
}
if (dimGridX > 1) { if (dimGridX > 1) {
dim3 dimBlockBlockSum(ceil(dimGridX/2.0), 1, 1);
dim3 dimGridBlockSum(1, 1, 1); // Suppose we only had less than 2048 grids initially.
int sharedBytesBlockSum = (2*dimBlockBlockSum.x) * sizeof(float);
//printf("dimBlockBlockSum: (%%d, %%d, %%d)\\n", dimBlockBlockSum.x, dimBlockBlockSum.y, dimBlockBlockSum.z);
//printf("dimGridBlockSum: (%%d, %%d)\\n", dimGridBlockSum.x, dimGridBlockSum.y);
//printf("sharedBytesBlockSum: %%d bytes\\n", sharedBytesBlockSum);
cudaThreadSynchronize(); cudaThreadSynchronize();
dim3 dimGridBlockSum(1, 1, 1);
dim3 dimBlockBlockSum(dimGridX-1, 1, 1); blockCumSum_1D_%(nodename)s<<<dimGridBlockSum, dimBlockBlockSum, sharedBytesBlockSum>>>
blockCumSum_1D_%(nodename)s<<<dimGridBlockSum, dimBlockBlockSum, (2*(dimGridX-1)) * sizeof(float)>>>
( (
CudaNdarray_DEV_DATA(deviceBlockSum), CudaNdarray_DEV_DATA(deviceBlockSum),
CudaNdarray_DEV_DATA(deviceBlockSum), CudaNdarray_DEV_DATA(deviceBlockSum),
dimGridX-1, dimGridX,
NULL NULL
); );
cudaThreadSynchronize(); cudaThreadSynchronize();
dim3 dimGrid(dimGridX-1, 1, 1); dim3 dimGrid(dimGridX-1, 1, 1);
dim3 dimBlock(blockSize, 1, 1); dim3 dimBlock(blockSize, 1, 1);
finalCumSum_1D_%(nodename)s<<<dimGrid, dimBlock>>> finalCumSum_1D_%(nodename)s<<<dimGrid, dimBlock>>>
( (
CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_DEV_DATA(deviceBlockSum) CudaNdarray_DEV_DATA(deviceBlockSum),
numElements
); );
} }
cudaDeviceSynchronize(); // If shape[0] is odd, the last element is compute manually
if (shape[0] != numElements){
cudaThreadSynchronize();
cumadd_%(nodename)s<<<dim3(1,1,1), dim3(1,1,1)>>>
(
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s),
shape[0]-2,
shape[0]-1
);
}
cudaThreadSynchronize();
} }
""" % locals() """ % locals()
...@@ -183,12 +246,17 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -183,12 +246,17 @@ class GpuCumsum(CumsumOp, GpuOp):
def gpu_cumsum(x, axis=None): def gpu_cumsum(x, axis=None):
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
from theano.sandbox.cuda import GpuFlatten
@local_optimizer([CumsumOp]) @local_optimizer([CumsumOp])
def use_gpu_cumsum(node): def use_gpu_cumsum(node):
if type(node.op) is CumsumOp and node.inputs[0].dtype == 'float32': if type(node.op) is CumsumOp and node.inputs[0].dtype == 'float32':
return [host_from_gpu(gpu_cumsum(gpu_from_host(node.inputs[0]),
axis=node.op.axis))] x = gpu_from_host(node.inputs[0])
if node.op.axis is None and x.ndim > 1:
x = GpuFlatten()(x)
return [host_from_gpu(gpu_cumsum(x, axis=node.op.axis))]
if cuda_available: if cuda_available:
register_gpu_opt()(use_gpu_cumsum) register_gpu_opt()(use_gpu_cumsum)
...@@ -13,8 +13,67 @@ if theano.config.mode == 'FAST_COMPILE': ...@@ -13,8 +13,67 @@ if theano.config.mode == 'FAST_COMPILE':
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
from theano import tensor as T
import numpy as np
import theano
from theano import config
from theano.tensor.extra_ops import cumsum
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu mode = mode_with_gpu
op = GpuCumsum op = GpuCumsum
dtypes = ['float32'] dtypes = ['float32']
def test_GpuCumsum(self):
### Test 1D case ###
x = T.vector('x')
f = theano.function([x], cumsum(x))
# Even number of elements
a = np.random.random((18,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# Odd number of elements
a = np.random.random((7,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# Use multiple GPU threadblocks
a = np.random.random((2048+1,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# Use multiple GPU gridblocks
a = np.random.random((2048*2048+1,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# #x = T.tensor3('x')
# #a = np.random.random((3, 5, 2)).astype(config.floatX)
# x = T.vector('x')
# a = np.random.random((6,)).astype(config.floatX)
# f = theano.function([x], cumsum(x))
# a = (np.ones(2048*2048+1)+1).astype(config.floatX)
# print ""
# print f(a)
# print np.cumsum(a)
# assert np.allclose(np.cumsum(a), f(a)) # Test axis=None
# return
# #for i in range(3000-1,3000*2):
# #for i in range(3,2048*100,2048):
# i = 145000;
# while True:
# #a = np.random.random((i,)).astype(config.floatX)
# a = np.ones((i,), dtype=config.floatX)
# fa = f(a)
# npa = np.cumsum(a)
# if not np.allclose(npa, fa):
# print i, np.allclose(npa, fa) # Test axis=None
# print fa
# print npa
# assert False
# if i % 1000 == 0:
# print i
# i += 1
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论