提交 65f5d0c7 authored 作者: Marc-Alexandre Cote's avatar Marc-Alexandre Cote

Add a 2D version cumsum

上级 6dbb2457
...@@ -46,6 +46,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -46,6 +46,7 @@ class GpuCumsum(CumsumOp, GpuOp):
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id) prop = cuda_ndarray.device_properties(device_id)
node_.op.max_threads_dim0 = prop['maxThreadsDim0'] node_.op.max_threads_dim0 = prop['maxThreadsDim0']
node_.op.max_grid_size1 = prop['maxGridSize1']
return super(GpuCumsum, node_.op).make_thunk(node_, storage_map, return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling) compute_map, no_recycling)
...@@ -56,149 +57,74 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -56,149 +57,74 @@ class GpuCumsum(CumsumOp, GpuOp):
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
axis = self.axis axis = self.axis
return """ return """
__global__ __device__
void k_finalCumSum_1D_%(nodename)s(float* output, float* blockSum, int numElements) { void k_reductionPhase_%(nodename)s(float* partialCumSum) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
if (globalThreadID < ceil(numElements/2.0)) {
const float currentBlockSum = blockSum[blockIdx.x];
output[globalThreadID*2] += currentBlockSum;
output[globalThreadID*2 + 1] += currentBlockSum;
}
}
__global__
void k_cumadd_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) {
output[lastElementIdx] = input[lastElementIdx] + output[beforeLastElementIdx];
}
__global__
void k_blockCumSum_1D_%(nodename)s(float* input, float* output, int numElements, float* blockSum) {
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
if (globalThreadID < ceil(numElements/2.0)) {
extern __shared__ float partialCumSum[];
// Load data in shared memory
partialCumSum[threadIdx.x*2] = input[globalThreadID*2];
partialCumSum[threadIdx.x*2 + 1] = input[globalThreadID*2 + 1];
// Reduction Phase
int stride;
for (stride = 1; stride <= blockDim.x; stride *= 2) {
__syncthreads(); __syncthreads();
int index = (threadIdx.x + 1) * (stride * 2) - 1; unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) { if(index < blockDim.x*2) {
partialCumSum[index] += partialCumSum[index - stride]; partialCumSum[index] += partialCumSum[index - stride];
} }
} }
}
// Reverse Phase __device__
for (; stride > 0; stride /= 2) { void k_reversePhase_%(nodename)s(float* partialCumSum) {
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
__syncthreads(); __syncthreads();
int index = (threadIdx.x + 1) * (stride * 2) - 1; unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) { if(index + stride < blockDim.x*2) {
partialCumSum[index + stride] += partialCumSum[index]; partialCumSum[index + stride] += partialCumSum[index];
} }
} }
// Write the final output to global memory
__syncthreads();
output[globalThreadID*2] = partialCumSum[threadIdx.x*2];
output[globalThreadID*2 + 1] = partialCumSum[threadIdx.x*2 + 1];
if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x] = partialCumSum[threadIdx.x*2 + 1];
}
}
}
} }
void cumSum_1D_%(nodename)s(CudaNdarray* input, CudaNdarray* output, npy_intp* shape, int maxThreads) { __device__
void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID, dim3 dataStrides, int dataOffset) {
if (shape[0] <= 1) { // blockIdx.y represents the # of the current independent cumsum
CudaNdarray_CopyFromCudaNdarray(output, input); partialCumSum[threadIdx.x*2] = input[(globalThreadID*2 ) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y];
return; partialCumSum[threadIdx.x*2 + 1] = input[(globalThreadID*2 + 1) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y];
} }
int numElements = shape[0] - (shape[0] %% 2); __device__
int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0); void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID, dim3 dataStrides, int dataOffset) {
int dimGridX = ceil(numElements / (2.0*blockSize)); __syncthreads();
npy_intp shapeBlockSum[1] = { dimGridX }; // blockIdx.y represents the # of the current independent cumsum
CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(1, shapeBlockSum); output[(globalThreadID*2 ) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y] = partialCumSum[threadIdx.x*2];
output[(globalThreadID*2 + 1) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y] = partialCumSum[threadIdx.x*2 + 1];
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid(dimGridX, 1, 1);
int sharedBytes = (2*blockSize) * sizeof(float);
cudaThreadSynchronize();
k_blockCumSum_1D_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
numElements,
CudaNdarray_DEV_DATA(deviceBlockSum)
);
if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive).
cumSum_1D_%(nodename)s(deviceBlockSum, deviceBlockSum, shapeBlockSum, maxThreads);
dim3 dimGrid(dimGridX-1, 1, 1);
dim3 dimBlock(blockSize, 1, 1);
k_finalCumSum_1D_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(output),
CudaNdarray_DEV_DATA(deviceBlockSum),
numElements
);
}
// If shape[0] is odd, the last element is compute manually
if (shape[0] != numElements){
cudaThreadSynchronize();
k_cumadd_%(nodename)s<<<dim3(1,1,1), dim3(1,1,1)>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
shape[0]-2,
shape[0]-1
);
} }
cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum)); __global__
cudaThreadSynchronize(); void k_cumadd_%(nodename)s(float* input, float* output, dim3 dataStrides, int dataOffset, int beforeLastElementIdx, int lastElementIdx) {
int dataOffsetY = (blockIdx.y + dataOffset) * dataStrides.y;
output[lastElementIdx*dataStrides.x + dataOffsetY] = input[lastElementIdx*dataStrides.x + dataOffsetY]
+ output[beforeLastElementIdx*dataStrides.x + dataOffsetY];
} }
__global__ __global__
void k_finalCumSum_2D_axis1_%(nodename)s(float* output, float* blockSum, int numElements, dim3 dataStrides) { void k_finalCumSum_%(nodename)s(float* output, float* blockSum, int numElements, dim3 dataStrides, int dataOffset) {
int globalThreadID = (blockIdx.y + 1) * blockDim.y + threadIdx.y; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process. // Check if current has data to process.
if (globalThreadID >= ceil(numElements/2.0)) { if (globalThreadID >= ceil(numElements/2.0)) {
return; return;
} }
const float currentBlockSum = blockSum[blockIdx.x*gridDim.y + blockIdx.y]; const float currentBlockSum = blockSum[blockIdx.x*gridDim.y + blockIdx.y + dataOffset];
output[globalThreadID*2 + blockIdx.x*dataStrides.x] += currentBlockSum; int dataOffsetY = (blockIdx.y + dataOffset) * dataStrides.y;
output[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x] += currentBlockSum; output[(globalThreadID*2 ) * dataStrides.x + dataOffsetY] += currentBlockSum;
output[(globalThreadID*2 + 1) * dataStrides.x + dataOffsetY] += currentBlockSum;
} }
__global__ __global__
void k_cumadd_2D_axis1_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) { void k_blockCumSum_%(nodename)s(float* input, float* output, int numElements, dim3 dataStrides, int dataOffset, float* blockSum) {
output[blockIdx.x*(lastElementIdx+1) + lastElementIdx] = input[blockIdx.x*(lastElementIdx+1) + lastElementIdx] // Regarding blockIdx and threadIdx, 'Cumsum' is always perform along the X axis.
+ output[blockIdx.x*(lastElementIdx+1) + beforeLastElementIdx]; // The Y axis will contain all the independent cumsums of the 2D case.
}
__global__ int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
void k_blockCumSum_2D_axis1_%(nodename)s(float* input, float* output, int numElements, dim3 dataStrides, float* blockSum) {
int globalThreadID = blockIdx.y * blockDim.y + threadIdx.y;
// Check if current has data to process. // Check if current thread has data to process.
if (globalThreadID >= ceil(numElements/2.0)) { if (globalThreadID >= ceil(numElements/2.0)) {
return; return;
} }
...@@ -206,102 +132,109 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -206,102 +132,109 @@ class GpuCumsum(CumsumOp, GpuOp):
extern __shared__ float partialCumSum[]; extern __shared__ float partialCumSum[];
// Load data in shared memory // Load data in shared memory
partialCumSum[threadIdx.y*2] = input[globalThreadID*2 + blockIdx.x*dataStrides.x]; k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, dataStrides, dataOffset);
partialCumSum[threadIdx.y*2 + 1] = input[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x];
// Reduction Phase k_reductionPhase_%(nodename)s(partialCumSum);
int stride; k_reversePhase_%(nodename)s(partialCumSum);
for (stride = 1; stride <= blockDim.y; stride *= 2) {
__syncthreads();
int index = (threadIdx.y + 1) * (stride * 2) - 1;
if(index < blockDim.y*2) {
partialCumSum[index] += partialCumSum[index - stride];
}
}
// Reverse Phase
for (; stride > 0; stride /= 2) {
__syncthreads();
int index = (threadIdx.y + 1) * (stride * 2) - 1;
if(index + stride < blockDim.y*2) {
partialCumSum[index + stride] += partialCumSum[index];
}
}
// Write the final output to global memory // Write the final output to global memory
__syncthreads(); k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, dataStrides, dataOffset);
output[globalThreadID*2 + blockIdx.x*dataStrides.x] = partialCumSum[threadIdx.y*2];
output[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x] = partialCumSum[threadIdx.y*2 + 1];
if (blockSum != NULL){ if (blockSum != NULL){
if (threadIdx.y == blockDim.y - 1) { if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x*gridDim.y + blockIdx.y] = partialCumSum[threadIdx.y*2 + 1]; blockSum[blockIdx.x*gridDim.y + blockIdx.y + dataOffset] = partialCumSum[threadIdx.x*2 + 1];
} }
} }
} }
void cumSum_2D_axis1_%(nodename)s(CudaNdarray* input, CudaNdarray* output, const int* shape, int maxThreads) { void cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int maxThreads, int axis, int maxGridY) {
int axis = 1; // Convert into a parameter int shape[2] = { 1, 1 };
dim3 dataStrides(0,0,0);
switch (CudaNdarray_NDIM(input))
{
case 1:
shape[0] = CudaNdarray_HOST_DIMS(input)[0];
dataStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
break;
case 2:
shape[0] = CudaNdarray_HOST_DIMS(input)[0];
shape[1] = CudaNdarray_HOST_DIMS(input)[1];
dataStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
dataStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
break;
default: printf("Only 1D and 2D cumsum is implemented yet.\\n");
}
if (shape[axis] <= 1) { if (shape[axis] <= 1) {
CudaNdarray_CopyFromCudaNdarray(output, input); CudaNdarray_CopyFromCudaNdarray(output, input);
return; return;
} }
if (axis == 1) {
int tmp = dataStrides.x;
dataStrides.x = dataStrides.y;
dataStrides.y = tmp;
}
int numElements = shape[axis] - (shape[axis] %% 2); int numElements = shape[axis] - (shape[axis] %% 2);
int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0); int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0);
int dimGridX = shape[0]; int dimGridX = ceil(numElements / (2.0*blockSize)); // Nb. of elements to perform cumsum on.
int dimGridY = ceil(numElements / (2.0*blockSize)); int dimGridY = shape[1-axis]; // Nb. of independent cumsums.
const int shapeBlockSum[2] = { dimGridX, dimGridY }; const int shapeBlockSum[2] = { dimGridX, dimGridY };
//CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum); //CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_ZEROS(2, (int*)shapeBlockSum); CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_ZEROS(2, (int*)shapeBlockSum);
for (int dataOffset = 0; dataOffset < dimGridY; dataOffset += maxGridY){
dim3 dimBlock(1, blockSize, 1); int localDimGridY = min(dimGridY - dataOffset, maxGridY);
dim3 dimGrid(dimGridX, dimGridY, 1); dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid(dimGridX, localDimGridY, 1);
int sharedBytes = (2*blockSize) * sizeof(float); int sharedBytes = (2*blockSize) * sizeof(float);
dim3 dataStrides(CudaNdarray_HOST_STRIDES(input)[0], CudaNdarray_HOST_STRIDES(input)[1], 0);
cudaThreadSynchronize(); cudaThreadSynchronize();
k_blockCumSum_2D_axis1_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>> k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
( (
CudaNdarray_DEV_DATA(input), CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output), CudaNdarray_DEV_DATA(output),
numElements, numElements,
dataStrides, dataStrides,
dataOffset,
CudaNdarray_DEV_DATA(deviceBlockSum) CudaNdarray_DEV_DATA(deviceBlockSum)
); );
if (dimGridY > 1) { if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive). // Do a cumsum over the blockSum (recursive).
cumSum_2D_axis1_%(nodename)s(deviceBlockSum, deviceBlockSum, shapeBlockSum, maxThreads); cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, maxThreads, 0, maxGridY);
dim3 dimGrid(dimGridX, dimGridY, 1); dim3 dimGrid(dimGridX, dimGridY, 1);
dim3 dimBlock(1, blockSize, 1); dim3 dimBlock(blockSize, 1, 1);
k_finalCumSum_2D_axis1_%(nodename)s<<<dimGrid, dimBlock>>> k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
( (
CudaNdarray_DEV_DATA(output), CudaNdarray_DEV_DATA(output),
CudaNdarray_DEV_DATA(deviceBlockSum), CudaNdarray_DEV_DATA(deviceBlockSum),
numElements, numElements,
dataStrides dataStrides,
dataOffset
); );
} }
// If shape[axis] is odd, the last element is compute manually // If shape[axis] is odd, the last element is compute manually
if (shape[axis] != numElements){ if (shape[axis] != numElements){
cudaThreadSynchronize(); cudaThreadSynchronize();
dim3 dimGrid(dimGridX, 1, 1); dim3 dimGrid(1, localDimGridY, 1);
dim3 dimBlock(1, 1, 1); dim3 dimBlock(1, 1, 1);
k_cumadd_2D_axis1_%(nodename)s<<<dimGrid, dimBlock>>> k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
( (
CudaNdarray_DEV_DATA(input), CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output), CudaNdarray_DEV_DATA(output),
dataStrides,
dataOffset,
shape[axis]-2, shape[axis]-2,
shape[axis]-1 shape[axis]-1
); );
} }
}
cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum)); cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum));
cudaThreadSynchronize(); cudaThreadSynchronize();
...@@ -316,18 +249,17 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -316,18 +249,17 @@ class GpuCumsum(CumsumOp, GpuOp):
sub = sub.copy() sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0 max_threads_dim0 = self.max_threads_dim0
if max_threads_dim0 is None: max_grid_size1 = self.max_grid_size1
if max_threads_dim0 is None or max_grid_size1 is None:
raise NotImplementedError("GpuConv.c_code should not be called " raise NotImplementedError("GpuConv.c_code should not be called "
"directly. It should be called by " "directly. It should be called by "
"make_thunk() that add some information " "make_thunk() that add some information "
"related to the selected GPU.") "related to the selected GPU.")
sub.update(locals()) sub.update(locals())
#Right now, only the 1D case works.
if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1): if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1):
code = """ code = """
npy_intp shape[1] = { CudaNdarray_SIZE(%(x)s) }; const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
if(! (%(z)s && CudaNdarray_HOST_DIMS(%(z)s)[0] == shape[0]) ) { if(! (%(z)s && CudaNdarray_HOST_DIMS(%(z)s)[0] == shape[0]) ) {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*) CudaNdarray_NewDims(1, shape); %(z)s = (CudaNdarray*) CudaNdarray_NewDims(1, shape);
...@@ -338,7 +270,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -338,7 +270,7 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
{ // Namespace for kernel calls // { // Namespace for kernel calls //
cumSum_1D_%(nodename)s(%(x)s, %(z)s, shape, %(max_threads_dim0)s); cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, 0, %(max_grid_size1)s);
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
...@@ -351,7 +283,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -351,7 +283,7 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
} }
""" % locals() """ % locals()
elif node.inputs[0].ndim == 2 and self.axis == 1: elif node.inputs[0].ndim == 2:
code = """ code = """
const int* shape = CudaNdarray_HOST_DIMS(%(x)s); const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
bool needAllocation = !%(z)s || CudaNdarray_NDIM(%(x)s) != CudaNdarray_NDIM(%(z)s); bool needAllocation = !%(z)s || CudaNdarray_NDIM(%(x)s) != CudaNdarray_NDIM(%(z)s);
...@@ -375,7 +307,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -375,7 +307,7 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
{ // Namespace for kernel calls // { // Namespace for kernel calls //
cumSum_2D_axis1_%(nodename)s(%(x)s, %(z)s, shape, %(max_threads_dim0)s); cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, %(axis)s, %(max_grid_size1)s);
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
......
...@@ -17,7 +17,7 @@ from theano import tensor as T ...@@ -17,7 +17,7 @@ from theano import tensor as T
import numpy as np import numpy as np
import theano import theano
from theano import config from theano import config
from theano.tensor.extra_ops import cumsum from theano.tensor.extra_ops import cumsum, diff
from mlpython.misc.utils import Timer from mlpython.misc.utils import Timer
...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
op = GpuCumsum op = GpuCumsum
dtypes = ['float32'] dtypes = ['float32']
def test_GpuCumsum(self): def test_benchmark_1D_vs_2D(self):
### Test 1D case ### print "\nBenchmark:"
x = T.vector('x')
f = theano.function([x], cumsum(x))
# # Even number of elements from theano import sandbox, Out
# a = np.random.random((18,)).astype(config.floatX) import time
# assert np.allclose(np.cumsum(a), f(a))
# # Odd number of elements vlen = 40 * 1024 * 2048 # 10 x # cores x # threads per core
# a = np.random.random((7,)).astype(config.floatX) iters = 25
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks x = theano.shared(np.ones((vlen,), dtype=config.floatX), borrow=False)
# a = np.random.random((2048+2,)).astype(config.floatX) res = Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x)), borrow=True)
# assert np.allclose(np.cumsum(a), f(a)) f = theano.function([], res)
# # Use multiple GPU threadblocks print f.maker.fgraph.toposort()
# a = np.random.random((2048*75+2,)).astype(config.floatX) t0 = time.time()
# assert np.allclose(np.cumsum(a), f(a)) for i in xrange(iters):
r = f()
t1 = time.time()
# # Use multiple GPU gridblocks print 'Looping %d times took' % iters, t1 - t0, 'seconds'
# a = np.ones((2048*2048+2,)).astype(config.floatX) print 'Result is', r
# assert np.allclose(np.cumsum(a), f(a)) print 'Numpy result is', np.asarray(r)
print "\nBenchmark:"
import timeit as t # x = theano.shared(np.ones((1,vlen), dtype=config.floatX), borrow=True)
#theano_time = t.timeit("np.ones((100,))", "import numpy as np", number=1000) # f = theano.function([], Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x,axis=1)), borrow=True))
# print f.maker.fgraph.toposort()
# t0 = time.time()
# for i in xrange(iters):
# r = f()
# t1 = time.time()
stmt = "f(a)" # print 'Looping %d times took' % iters, t1 - t0, 'seconds'
setup = """ # print 'Result is', r
import numpy as np # print 'Numpy result is', np.asarray(r)
import theano
import theano.tensor as T # print 'Used the', config.device
from theano.tensor.extra_ops import cumsum
from theano import config
def test_GpuCumsum(self):
### Test 1D case ###
x = T.vector('x') x = T.vector('x')
f = theano.function([x], cumsum(x)) f = theano.function([x], cumsum(x))
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "") # Even number of elements
theano_time = t.timeit(stmt, setup, number=1000) a = np.random.random((18,)).astype(config.floatX)
print "Theano:\t", theano_time print f(a)
print np.cumsum(a)
stmt = "np.cumsum(a)" assert np.allclose(np.cumsum(a), f(a))
setup = """
import numpy as np # Odd number of elements
from theano import config a = np.random.random((7,)).astype(config.floatX)
a = np.ones((100000,), dtype=config.floatX) assert np.allclose(np.cumsum(a), f(a))
""".replace(" ", "")
numpy_time = t.timeit(stmt, setup, number=1000) # Use multiple GPU threadblocks
print "Numpy:\t", numpy_time a = np.random.random((2048+2,)).astype(config.floatX)
print "Speedup: {0}x".format(numpy_time/theano_time) assert np.allclose(np.cumsum(a), f(a))
# Use multiple GPU threadblocks
# # Extensive testing a = np.random.random((2048*75+2,)).astype(config.floatX)
# i = 0; assert np.allclose(np.cumsum(a), f(a))
# while True:
# a = np.ones((i,), dtype=config.floatX) # Use multiple GPU gridblocks
a = np.ones((2048*2048+2,)).astype(config.floatX)
# fa = f(a) assert np.allclose(np.cumsum(a), f(a))
# npa = np.cumsum(a)
# if not np.allclose(npa, fa): # Extensive testing
# print i, np.allclose(npa, fa) # Test axis=None for i in xrange(int(1e3)*5):
# print fa a = np.ones((i,), dtype=config.floatX)
# print npa
# assert False fa = f(a)
npa = np.cumsum(a)
# if i % 1000 == 0:
# print i if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
# i += 1 print fa
print npa
assert False
# ### Test 2D case - axis=1 ###
# x = T.matrix('x') if i % 1000 == 0:
# f = theano.function([x], cumsum(x, axis=1)) print i
# # # Even number of elements
# # print "\n# Even number of elements" #for axis in xrange(2):
# # a = np.random.random((18,18)).astype(config.floatX) for axis in xrange(2):
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) ### Test 2D case - axis=1 ###
x = T.matrix('x')
# # # Odd number of elements f = theano.function([x], cumsum(x, axis=axis))
# # print "\n# Odd number of elements"
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) # Even number of elements
print "\n# Even number of elements (axis={0})".format(axis)
# # # Use multiple GPU threadblocks a = np.random.random((18,18)).astype(config.floatX)
# # print "\n# Use multiple GPU threadblocks" assert np.allclose(np.cumsum(a, axis=axis), f(a))
# # a = np.random.random((2048+2,2048+2)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) # Odd number of elements
print "\n# Odd number of elements (axis={0})".format(axis)
# # # Use multiple GPU threadblocks a = np.random.random((21,21)).astype(config.floatX)
# # print "\n# Use multiple GPU threadblocks" assert np.allclose(np.cumsum(a, axis=axis), f(a))
# # a = np.ones((10,2048*75+3)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) # Use two GPU threadblocks
print "\n# Use two GPU threadblocks (axis={0})".format(axis)
# # # Use multiple GPU gridblocks a = np.random.random((2048+2,2048+2)).astype(config.floatX)
# # print "\n# Use multiple GPU gridblocks" assert np.allclose(np.cumsum(a, axis=axis), f(a))
# # a = np.ones((11,2048*2048+3)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) # Use multiple GPU threadblocks
print "\n# Use multiple GPU threadblocks (axis={0})".format(axis)
# # Extensive testing a = np.ones((10,2048*75+3)).astype(config.floatX)
# i = 19000; assert np.allclose(np.cumsum(a, axis=axis), f(a))
# while True:
# a = np.ones((11,i), dtype=config.floatX) a = np.ones((2048*75+3,10)).astype(config.floatX)
# fa = f(a) assert np.allclose(np.cumsum(a, axis=axis), f(a))
# npa = np.cumsum(a, axis=1)
# Use multiple GPU gridblocks
# if not np.allclose(npa, fa): print "\n# Use multiple GPU gridblocks (axis={0})".format(axis)
# print i, np.allclose(npa, fa) # Test axis=None a = np.ones((11,2048*2048+3)).astype(config.floatX)
# print fa assert np.allclose(np.cumsum(a, axis=axis), f(a))
# print npa a = np.ones((2048*2048+3,11)).astype(config.floatX)
# assert False assert np.allclose(np.cumsum(a, axis=axis), f(a))
# if i % 1000 == 0: # Extensive testing for the first 10k sizes
# print i for i in xrange(int(1e3)*5):
a = np.ones((11,i), dtype=config.floatX)
# i += 1 fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
a = np.ones((i,11), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
if i % 1000 == 0:
print i
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论