提交 79f9288a authored 作者: Marc-Alexandre Cote's avatar Marc-Alexandre Cote

Make cumsum recursive to handle very large array

上级 29e1eb1e
...@@ -57,7 +57,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -57,7 +57,7 @@ class GpuCumsum(CumsumOp, GpuOp):
axis = self.axis axis = self.axis
return """ return """
__global__ __global__
void finalCumSum_1D_%(nodename)s(float* output, float* blockSum, int numElements) { void k_finalCumSum_1D_%(nodename)s(float* output, float* blockSum, int numElements) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
if (globalThreadID < ceil(numElements/2.0)) { if (globalThreadID < ceil(numElements/2.0)) {
...@@ -70,13 +70,13 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -70,13 +70,13 @@ class GpuCumsum(CumsumOp, GpuOp):
__global__ __global__
void cumadd_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) { void k_cumadd_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) {
output[lastElementIdx] = input[lastElementIdx] + output[beforeLastElementIdx]; output[lastElementIdx] = input[lastElementIdx] + output[beforeLastElementIdx];
} }
__global__ __global__
void blockCumSum_1D_%(nodename)s(float* input, float* output, int numElements, float* blockSum) { void k_blockCumSum_1D_%(nodename)s(float* input, float* output, int numElements, float* blockSum) {
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x; int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
if (globalThreadID < ceil(numElements/2.0)) { if (globalThreadID < ceil(numElements/2.0)) {
...@@ -117,6 +117,62 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -117,6 +117,62 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
} }
} }
// Recursively compute an inclusive cumulative sum over a 1D CudaNdarray.
// `input` and `output` must both hold shape[0] floats; `maxThreads` is the
// device's max threads per block (maxThreadsDim[0]).
// Each thread processes a pair of elements, so each level works on the
// largest even prefix of shape[0]; an odd trailing element is patched up by
// k_cumadd. When more than one block is needed, the per-block totals are
// themselves cumsum'ed by a recursive call, then k_finalCumSum adds
// blockSum[b-1] into every element of block b.
// On failure (allocation), returns with a Python error already set.
void cumSum_1D_%(nodename)s(CudaNdarray* input, CudaNdarray* output, npy_intp* shape, int maxThreads) {
    if (shape[0] <= 1) {
        // Cumsum of zero or one element is the input itself.
        CudaNdarray_CopyFromCudaNdarray(output, input);
        return;
    }

    // Largest even prefix of the array; two elements per thread.
    int numElements = shape[0] - (shape[0] %% 2);
    int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0);
    int dimGridX = ceil(numElements / (2.0*blockSize));

    // Temporary holding the running total of each block, so the blocks can
    // be stitched together afterwards.
    npy_intp shapeBlockSum[1] = { dimGridX };
    CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(1, shapeBlockSum);
    if (deviceBlockSum == NULL) {
        // CudaNdarray_NewDims already set a Python exception.
        return;
    }

    dim3 dimBlock(blockSize, 1, 1);
    dim3 dimGrid(dimGridX, 1, 1);
    int sharedBytes = (2*blockSize) * sizeof(float);

    cudaThreadSynchronize();
    k_blockCumSum_1D_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
    (
        CudaNdarray_DEV_DATA(input),
        CudaNdarray_DEV_DATA(output),
        numElements,
        CudaNdarray_DEV_DATA(deviceBlockSum)
    );

    if (dimGridX > 1) {
        // Cumsum the per-block sums (recursive), then propagate them:
        // block 0 is already correct, so the final pass launches one fewer
        // block and offsets every element of block b by blockSum[b-1].
        cumSum_1D_%(nodename)s(deviceBlockSum, deviceBlockSum, shapeBlockSum, maxThreads);

        // Distinct names to avoid shadowing the launch config declared above.
        dim3 dimGridFinal(dimGridX-1, 1, 1);
        dim3 dimBlockFinal(blockSize, 1, 1);
        k_finalCumSum_1D_%(nodename)s<<<dimGridFinal, dimBlockFinal>>>
        (
            CudaNdarray_DEV_DATA(output),
            CudaNdarray_DEV_DATA(deviceBlockSum),
            numElements
        );
    }

    // If shape[0] is odd, the last element is computed manually.
    if (shape[0] != numElements){
        cudaThreadSynchronize();
        k_cumadd_%(nodename)s<<<dim3(1,1,1), dim3(1,1,1)>>>
        (
            CudaNdarray_DEV_DATA(input),
            CudaNdarray_DEV_DATA(output),
            shape[0]-2,
            shape[0]-1
        );
    }

    // Release the temporary through its Python wrapper. Calling
    // cudaFree(CudaNdarray_DEV_DATA(...)) directly would leak the
    // CudaNdarray object and double-free its buffer when the object is
    // later garbage-collected.
    Py_XDECREF(deviceBlockSum);
    cudaThreadSynchronize();
}
""" % locals() """ % locals()
def c_code(self, node, nodename, inames, onames, sub): def c_code(self, node, nodename, inames, onames, sub):
...@@ -148,95 +204,17 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -148,95 +204,17 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
{ // Namespace for kernel calls // { // Namespace for kernel calls //
int numElements = shape[0] - (shape[0] %% 2); cumSum_1D_%(nodename)s(%(x)s, %(z)s, shape, %(max_threads_dim0)s);
int blockSize = ceil( min(numElements, 2*%(max_threads_dim0)s) / 2.);
int dimGridX = ceil(shape[0] / (2.0*blockSize));
npy_intp shapeBlockSum[1] = { dimGridX };
CudaNdarray * deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(1, shapeBlockSum);
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid(dimGridX, 1, 1);
int sharedBytes = (2*blockSize) * sizeof(float);
/*
printf("N: %%d (%%d)\\n", shape[0], numElements);
printf("max_threads_dim0: %%d\\n", %(max_threads_dim0)s);
printf("N_sum: %%d\\n", shapeBlockSum[0]);
printf("dimGridX: %%d\\n", dimGridX);
printf("dimBlock: (%%d, %%d, %%d)\\n", dimBlock.x, dimBlock.y, dimBlock.z);
printf("dimGrid: (%%d, %%d)\\n", dimGrid.x, dimGrid.y);
printf("sharedBytes: %%d bytes\\n", sharedBytes);
*/
blockCumSum_1D_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
(
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s),
numElements,
CudaNdarray_DEV_DATA(deviceBlockSum)
);
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s. (grid: %%i x %%i;" "Cuda error: %%s: %%s.\\n",
" block: %%i x %%i x %%i; shared: %%i)\\n", "cumSum_1D_%(nodename)s",
"blockCumSum_1D_%(nodename)s", cudaGetErrorString(sts));
cudaGetErrorString(sts),
dimGrid.x,
dimGrid.y,
dimBlock.x,
dimBlock.y,
dimBlock.z,
sharedBytes);
%(fail)s; %(fail)s;
} }
if (dimGridX > 1) {
dim3 dimBlockBlockSum(ceil(dimGridX/2.0), 1, 1);
dim3 dimGridBlockSum(1, 1, 1); // Suppose we only had less than 2048 grids initially.
int sharedBytesBlockSum = (2*dimBlockBlockSum.x) * sizeof(float);
//printf("dimBlockBlockSum: (%%d, %%d, %%d)\\n", dimBlockBlockSum.x, dimBlockBlockSum.y, dimBlockBlockSum.z);
//printf("dimGridBlockSum: (%%d, %%d)\\n", dimGridBlockSum.x, dimGridBlockSum.y);
//printf("sharedBytesBlockSum: %%d bytes\\n", sharedBytesBlockSum);
cudaThreadSynchronize();
blockCumSum_1D_%(nodename)s<<<dimGridBlockSum, dimBlockBlockSum, sharedBytesBlockSum>>>
(
CudaNdarray_DEV_DATA(deviceBlockSum),
CudaNdarray_DEV_DATA(deviceBlockSum),
dimGridX,
NULL
);
cudaThreadSynchronize();
dim3 dimGrid(dimGridX-1, 1, 1);
dim3 dimBlock(blockSize, 1, 1);
finalCumSum_1D_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_DEV_DATA(deviceBlockSum),
numElements
);
}
// If shape[0] is odd, the last element is compute manually
if (shape[0] != numElements){
cudaThreadSynchronize();
cumadd_%(nodename)s<<<dim3(1,1,1), dim3(1,1,1)>>>
(
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s),
shape[0]-2,
shape[0]-1
);
}
cudaThreadSynchronize();
} }
""" % locals() """ % locals()
......
...@@ -19,6 +19,8 @@ import theano ...@@ -19,6 +19,8 @@ import theano
from theano import config from theano import config
from theano.tensor.extra_ops import cumsum from theano.tensor.extra_ops import cumsum
from mlpython.misc.utils import Timer
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu mode = mode_with_gpu
op = GpuCumsum op = GpuCumsum
...@@ -31,27 +33,44 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -31,27 +33,44 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Even number of elements # Even number of elements
a = np.random.random((18,)).astype(config.floatX) a = np.random.random((18,)).astype(config.floatX)
print "\nEven number of elements"
print f(a)
print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a)) assert np.allclose(np.cumsum(a), f(a))
# Odd number of elements # Odd number of elements
a = np.random.random((7,)).astype(config.floatX) a = np.random.random((7,)).astype(config.floatX)
print "\nOdd number of elements"
print f(a)
print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a)) assert np.allclose(np.cumsum(a), f(a))
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a = np.random.random((2048+1,)).astype(config.floatX) a = np.random.random((2048+1,)).astype(config.floatX)
print "\nUse multiple GPU threadblocks"
print f(a)
print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a))
# Use multiple GPU threadblocks
a = np.random.random((2048*80+1,)).astype(config.floatX)
print "\nUse multiple GPU threadblocks 2"
print f(a)
print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a)) assert np.allclose(np.cumsum(a), f(a))
# Use multiple GPU gridblocks # Use multiple GPU gridblocks
a = np.random.random((2048*2048+1,)).astype(config.floatX) #a = (np.random.random((2048*2048+1,)).astype(config.floatX) - 0.5) * 10.
a = np.floor(np.random.random((2048*2048+1,)) * 10).astype(config.floatX)
#a = np.ones((2048*2048+1,)).astype(config.floatX)
print "\nUse multiple GPU gridblocks"
print f(a)
print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a)) assert np.allclose(np.cumsum(a), f(a))
# #x = T.tensor3('x')
# #a = np.random.random((3, 5, 2)).astype(config.floatX)
# x = T.vector('x') # x = T.vector('x')
# a = np.random.random((6,)).astype(config.floatX)
# f = theano.function([x], cumsum(x)) # f = theano.function([x], cumsum(x))
# a = (np.ones(2048*2048+1)+1).astype(config.floatX) # a = (np.ones(0)+1).astype(config.floatX)
# print "" # print ""
# print f(a) # print f(a)
# print np.cumsum(a) # print np.cumsum(a)
...@@ -60,10 +79,13 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -60,10 +79,13 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# #for i in range(3000-1,3000*2): # #for i in range(3000-1,3000*2):
# #for i in range(3,2048*100,2048): # #for i in range(3,2048*100,2048):
# i = 145000; # i = 150000;
# import time
# start = time.time()
# while True: # while True:
# #a = np.random.random((i,)).astype(config.floatX) # #a = np.random.random((i,)).astype(config.floatX)
# a = np.ones((i,), dtype=config.floatX) # a = np.ones((i,), dtype=config.floatX)
# fa = f(a) # fa = f(a)
# npa = np.cumsum(a) # npa = np.cumsum(a)
...@@ -75,5 +97,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -75,5 +97,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# if i % 1000 == 0: # if i % 1000 == 0:
# print i # print i
# print time.time() - start
# start = time.time()
# #i-=1000
# if i == 80000:
# f = theano.function([x], cumsum(x))
# i += 1 # i += 1
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论