提交 65f5d0c7 authored 作者: Marc-Alexandre Cote's avatar Marc-Alexandre Cote

Add a 2D version cumsum

上级 6dbb2457
...@@ -46,6 +46,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -46,6 +46,7 @@ class GpuCumsum(CumsumOp, GpuOp):
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id) prop = cuda_ndarray.device_properties(device_id)
node_.op.max_threads_dim0 = prop['maxThreadsDim0'] node_.op.max_threads_dim0 = prop['maxThreadsDim0']
node_.op.max_grid_size1 = prop['maxGridSize1']
return super(GpuCumsum, node_.op).make_thunk(node_, storage_map, return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling) compute_map, no_recycling)
...@@ -56,149 +57,74 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -56,149 +57,74 @@ class GpuCumsum(CumsumOp, GpuOp):
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
axis = self.axis axis = self.axis
return """ return """
__global__ __device__
void k_finalCumSum_1D_%(nodename)s(float* output, float* blockSum, int numElements) { void k_reductionPhase_%(nodename)s(float* partialCumSum) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
__syncthreads();
if (globalThreadID < ceil(numElements/2.0)) { unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
const float currentBlockSum = blockSum[blockIdx.x]; if(index < blockDim.x*2) {
partialCumSum[index] += partialCumSum[index - stride];
output[globalThreadID*2] += currentBlockSum; }
output[globalThreadID*2 + 1] += currentBlockSum;
} }
} }
__global__ __device__
void k_cumadd_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) { void k_reversePhase_%(nodename)s(float* partialCumSum) {
output[lastElementIdx] = input[lastElementIdx] + output[beforeLastElementIdx]; for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
}
__global__
void k_blockCumSum_1D_%(nodename)s(float* input, float* output, int numElements, float* blockSum) {
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
if (globalThreadID < ceil(numElements/2.0)) {
extern __shared__ float partialCumSum[];
// Load data in shared memory
partialCumSum[threadIdx.x*2] = input[globalThreadID*2];
partialCumSum[threadIdx.x*2 + 1] = input[globalThreadID*2 + 1];
// Reduction Phase
int stride;
for (stride = 1; stride <= blockDim.x; stride *= 2) {
__syncthreads();
int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) {
partialCumSum[index] += partialCumSum[index - stride];
}
}
// Reverse Phase
for (; stride > 0; stride /= 2) {
__syncthreads();
int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) {
partialCumSum[index + stride] += partialCumSum[index];
}
}
// Write the final output to global memory
__syncthreads(); __syncthreads();
output[globalThreadID*2] = partialCumSum[threadIdx.x*2]; unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
output[globalThreadID*2 + 1] = partialCumSum[threadIdx.x*2 + 1]; if(index + stride < blockDim.x*2) {
partialCumSum[index + stride] += partialCumSum[index];
if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x] = partialCumSum[threadIdx.x*2 + 1];
}
} }
} }
} }
void cumSum_1D_%(nodename)s(CudaNdarray* input, CudaNdarray* output, npy_intp* shape, int maxThreads) { __device__
void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID, dim3 dataStrides, int dataOffset) {
if (shape[0] <= 1) { // blockIdx.y represents the # of the current independent cumsum
CudaNdarray_CopyFromCudaNdarray(output, input); partialCumSum[threadIdx.x*2] = input[(globalThreadID*2 ) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y];
return; partialCumSum[threadIdx.x*2 + 1] = input[(globalThreadID*2 + 1) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y];
} }
int numElements = shape[0] - (shape[0] %% 2);
int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0);
int dimGridX = ceil(numElements / (2.0*blockSize));
npy_intp shapeBlockSum[1] = { dimGridX };
CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(1, shapeBlockSum);
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid(dimGridX, 1, 1);
int sharedBytes = (2*blockSize) * sizeof(float);
cudaThreadSynchronize();
k_blockCumSum_1D_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
numElements,
CudaNdarray_DEV_DATA(deviceBlockSum)
);
if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive).
cumSum_1D_%(nodename)s(deviceBlockSum, deviceBlockSum, shapeBlockSum, maxThreads);
dim3 dimGrid(dimGridX-1, 1, 1);
dim3 dimBlock(blockSize, 1, 1);
k_finalCumSum_1D_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(output),
CudaNdarray_DEV_DATA(deviceBlockSum),
numElements
);
}
// If shape[0] is odd, the last element is compute manually
if (shape[0] != numElements){
cudaThreadSynchronize();
k_cumadd_%(nodename)s<<<dim3(1,1,1), dim3(1,1,1)>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
shape[0]-2,
shape[0]-1
);
}
cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum)); __device__
cudaThreadSynchronize(); void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID, dim3 dataStrides, int dataOffset) {
__syncthreads();
// blockIdx.y represents the # of the current independent cumsum
output[(globalThreadID*2 ) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y] = partialCumSum[threadIdx.x*2];
output[(globalThreadID*2 + 1) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y] = partialCumSum[threadIdx.x*2 + 1];
} }
__global__
void k_cumadd_%(nodename)s(float* input, float* output, dim3 dataStrides, int dataOffset, int beforeLastElementIdx, int lastElementIdx) {
int dataOffsetY = (blockIdx.y + dataOffset) * dataStrides.y;
output[lastElementIdx*dataStrides.x + dataOffsetY] = input[lastElementIdx*dataStrides.x + dataOffsetY]
+ output[beforeLastElementIdx*dataStrides.x + dataOffsetY];
}
__global__ __global__
void k_finalCumSum_2D_axis1_%(nodename)s(float* output, float* blockSum, int numElements, dim3 dataStrides) { void k_finalCumSum_%(nodename)s(float* output, float* blockSum, int numElements, dim3 dataStrides, int dataOffset) {
int globalThreadID = (blockIdx.y + 1) * blockDim.y + threadIdx.y; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process. // Check if current has data to process.
if (globalThreadID >= ceil(numElements/2.0)) { if (globalThreadID >= ceil(numElements/2.0)) {
return; return;
} }
const float currentBlockSum = blockSum[blockIdx.x*gridDim.y + blockIdx.y]; const float currentBlockSum = blockSum[blockIdx.x*gridDim.y + blockIdx.y + dataOffset];
output[globalThreadID*2 + blockIdx.x*dataStrides.x] += currentBlockSum; int dataOffsetY = (blockIdx.y + dataOffset) * dataStrides.y;
output[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x] += currentBlockSum; output[(globalThreadID*2 ) * dataStrides.x + dataOffsetY] += currentBlockSum;
output[(globalThreadID*2 + 1) * dataStrides.x + dataOffsetY] += currentBlockSum;
} }
__global__ __global__
void k_cumadd_2D_axis1_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) { void k_blockCumSum_%(nodename)s(float* input, float* output, int numElements, dim3 dataStrides, int dataOffset, float* blockSum) {
output[blockIdx.x*(lastElementIdx+1) + lastElementIdx] = input[blockIdx.x*(lastElementIdx+1) + lastElementIdx] // Regarding blockIdx and threadIdx, 'Cumsum' is always perform along the X axis.
+ output[blockIdx.x*(lastElementIdx+1) + beforeLastElementIdx]; // The Y axis will contain all the independent cumsums of the 2D case.
}
__global__ int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
void k_blockCumSum_2D_axis1_%(nodename)s(float* input, float* output, int numElements, dim3 dataStrides, float* blockSum) {
int globalThreadID = blockIdx.y * blockDim.y + threadIdx.y;
// Check if current has data to process. // Check if current thread has data to process.
if (globalThreadID >= ceil(numElements/2.0)) { if (globalThreadID >= ceil(numElements/2.0)) {
return; return;
} }
...@@ -206,101 +132,108 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -206,101 +132,108 @@ class GpuCumsum(CumsumOp, GpuOp):
extern __shared__ float partialCumSum[]; extern __shared__ float partialCumSum[];
// Load data in shared memory // Load data in shared memory
partialCumSum[threadIdx.y*2] = input[globalThreadID*2 + blockIdx.x*dataStrides.x]; k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, dataStrides, dataOffset);
partialCumSum[threadIdx.y*2 + 1] = input[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x];
k_reductionPhase_%(nodename)s(partialCumSum);
// Reduction Phase k_reversePhase_%(nodename)s(partialCumSum);
int stride;
for (stride = 1; stride <= blockDim.y; stride *= 2) {
__syncthreads();
int index = (threadIdx.y + 1) * (stride * 2) - 1;
if(index < blockDim.y*2) {
partialCumSum[index] += partialCumSum[index - stride];
}
}
// Reverse Phase
for (; stride > 0; stride /= 2) {
__syncthreads();
int index = (threadIdx.y + 1) * (stride * 2) - 1;
if(index + stride < blockDim.y*2) {
partialCumSum[index + stride] += partialCumSum[index];
}
}
// Write the final output to global memory // Write the final output to global memory
__syncthreads(); k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, dataStrides, dataOffset);
output[globalThreadID*2 + blockIdx.x*dataStrides.x] = partialCumSum[threadIdx.y*2];
output[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x] = partialCumSum[threadIdx.y*2 + 1];
if (blockSum != NULL){ if (blockSum != NULL){
if (threadIdx.y == blockDim.y - 1) { if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x*gridDim.y + blockIdx.y] = partialCumSum[threadIdx.y*2 + 1]; blockSum[blockIdx.x*gridDim.y + blockIdx.y + dataOffset] = partialCumSum[threadIdx.x*2 + 1];
} }
} }
} }
void cumSum_2D_axis1_%(nodename)s(CudaNdarray* input, CudaNdarray* output, const int* shape, int maxThreads) { void cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int maxThreads, int axis, int maxGridY) {
int axis = 1; // Convert into a parameter int shape[2] = { 1, 1 };
dim3 dataStrides(0,0,0);
switch (CudaNdarray_NDIM(input))
{
case 1:
shape[0] = CudaNdarray_HOST_DIMS(input)[0];
dataStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
break;
case 2:
shape[0] = CudaNdarray_HOST_DIMS(input)[0];
shape[1] = CudaNdarray_HOST_DIMS(input)[1];
dataStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
dataStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
break;
default: printf("Only 1D and 2D cumsum is implemented yet.\\n");
}
if (shape[axis] <= 1) { if (shape[axis] <= 1) {
CudaNdarray_CopyFromCudaNdarray(output, input); CudaNdarray_CopyFromCudaNdarray(output, input);
return; return;
} }
if (axis == 1) {
int tmp = dataStrides.x;
dataStrides.x = dataStrides.y;
dataStrides.y = tmp;
}
int numElements = shape[axis] - (shape[axis] %% 2); int numElements = shape[axis] - (shape[axis] %% 2);
int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0); int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0);
int dimGridX = shape[0]; int dimGridX = ceil(numElements / (2.0*blockSize)); // Nb. of elements to perform cumsum on.
int dimGridY = ceil(numElements / (2.0*blockSize)); int dimGridY = shape[1-axis]; // Nb. of independent cumsums.
const int shapeBlockSum[2] = { dimGridX, dimGridY }; const int shapeBlockSum[2] = { dimGridX, dimGridY };
//CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum); //CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_ZEROS(2, (int*)shapeBlockSum); CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_ZEROS(2, (int*)shapeBlockSum);
for (int dataOffset = 0; dataOffset < dimGridY; dataOffset += maxGridY){
int localDimGridY = min(dimGridY - dataOffset, maxGridY);
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid(dimGridX, localDimGridY, 1);
int sharedBytes = (2*blockSize) * sizeof(float);
dim3 dimBlock(1, blockSize, 1);
dim3 dimGrid(dimGridX, dimGridY, 1);
int sharedBytes = (2*blockSize) * sizeof(float);
dim3 dataStrides(CudaNdarray_HOST_STRIDES(input)[0], CudaNdarray_HOST_STRIDES(input)[1], 0);
cudaThreadSynchronize();
k_blockCumSum_2D_axis1_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
numElements,
dataStrides,
CudaNdarray_DEV_DATA(deviceBlockSum)
);
if (dimGridY > 1) {
// Do a cumsum over the blockSum (recursive).
cumSum_2D_axis1_%(nodename)s(deviceBlockSum, deviceBlockSum, shapeBlockSum, maxThreads);
dim3 dimGrid(dimGridX, dimGridY, 1);
dim3 dimBlock(1, blockSize, 1);
k_finalCumSum_2D_axis1_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(output),
CudaNdarray_DEV_DATA(deviceBlockSum),
numElements,
dataStrides
);
}
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != numElements){
cudaThreadSynchronize(); cudaThreadSynchronize();
dim3 dimGrid(dimGridX, 1, 1); k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
dim3 dimBlock(1, 1, 1);
k_cumadd_2D_axis1_%(nodename)s<<<dimGrid, dimBlock>>>
( (
CudaNdarray_DEV_DATA(input), CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output), CudaNdarray_DEV_DATA(output),
shape[axis]-2, numElements,
shape[axis]-1 dataStrides,
dataOffset,
CudaNdarray_DEV_DATA(deviceBlockSum)
); );
if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive).
cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, maxThreads, 0, maxGridY);
dim3 dimGrid(dimGridX, dimGridY, 1);
dim3 dimBlock(blockSize, 1, 1);
k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(output),
CudaNdarray_DEV_DATA(deviceBlockSum),
numElements,
dataStrides,
dataOffset
);
}
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != numElements){
cudaThreadSynchronize();
dim3 dimGrid(1, localDimGridY, 1);
dim3 dimBlock(1, 1, 1);
k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
dataStrides,
dataOffset,
shape[axis]-2,
shape[axis]-1
);
}
} }
cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum)); cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum));
...@@ -316,18 +249,17 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -316,18 +249,17 @@ class GpuCumsum(CumsumOp, GpuOp):
sub = sub.copy() sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0 max_threads_dim0 = self.max_threads_dim0
if max_threads_dim0 is None: max_grid_size1 = self.max_grid_size1
if max_threads_dim0 is None or max_grid_size1 is None:
raise NotImplementedError("GpuConv.c_code should not be called " raise NotImplementedError("GpuConv.c_code should not be called "
"directly. It should be called by " "directly. It should be called by "
"make_thunk() that add some information " "make_thunk() that add some information "
"related to the selected GPU.") "related to the selected GPU.")
sub.update(locals()) sub.update(locals())
#Right now, only the 1D case works.
if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1): if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1):
code = """ code = """
npy_intp shape[1] = { CudaNdarray_SIZE(%(x)s) }; const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
if(! (%(z)s && CudaNdarray_HOST_DIMS(%(z)s)[0] == shape[0]) ) { if(! (%(z)s && CudaNdarray_HOST_DIMS(%(z)s)[0] == shape[0]) ) {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*) CudaNdarray_NewDims(1, shape); %(z)s = (CudaNdarray*) CudaNdarray_NewDims(1, shape);
...@@ -338,7 +270,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -338,7 +270,7 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
{ // Namespace for kernel calls // { // Namespace for kernel calls //
cumSum_1D_%(nodename)s(%(x)s, %(z)s, shape, %(max_threads_dim0)s); cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, 0, %(max_grid_size1)s);
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
...@@ -351,7 +283,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -351,7 +283,7 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
} }
""" % locals() """ % locals()
elif node.inputs[0].ndim == 2 and self.axis == 1: elif node.inputs[0].ndim == 2:
code = """ code = """
const int* shape = CudaNdarray_HOST_DIMS(%(x)s); const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
bool needAllocation = !%(z)s || CudaNdarray_NDIM(%(x)s) != CudaNdarray_NDIM(%(z)s); bool needAllocation = !%(z)s || CudaNdarray_NDIM(%(x)s) != CudaNdarray_NDIM(%(z)s);
...@@ -375,7 +307,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -375,7 +307,7 @@ class GpuCumsum(CumsumOp, GpuOp):
} }
{ // Namespace for kernel calls // { // Namespace for kernel calls //
cumSum_2D_axis1_%(nodename)s(%(x)s, %(z)s, shape, %(max_threads_dim0)s); cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, %(axis)s, %(max_grid_size1)s);
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
......
...@@ -17,7 +17,7 @@ from theano import tensor as T ...@@ -17,7 +17,7 @@ from theano import tensor as T
import numpy as np import numpy as np
import theano import theano
from theano import config from theano import config
from theano.tensor.extra_ops import cumsum from theano.tensor.extra_ops import cumsum, diff
from mlpython.misc.utils import Timer from mlpython.misc.utils import Timer
...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
op = GpuCumsum op = GpuCumsum
dtypes = ['float32'] dtypes = ['float32']
def test_GpuCumsum(self): def test_benchmark_1D_vs_2D(self):
### Test 1D case ###
x = T.vector('x')
f = theano.function([x], cumsum(x))
# # Even number of elements
# a = np.random.random((18,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Odd number of elements
# a = np.random.random((7,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048*75+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU gridblocks
# a = np.ones((2048*2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
print "\nBenchmark:" print "\nBenchmark:"
import timeit as t from theano import sandbox, Out
#theano_time = t.timeit("np.ones((100,))", "import numpy as np", number=1000) import time
stmt = "f(a)"
setup = """
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cumsum
from theano import config
x = T.vector('x')
f = theano.function([x], cumsum(x))
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
theano_time = t.timeit(stmt, setup, number=1000)
print "Theano:\t", theano_time
stmt = "np.cumsum(a)"
setup = """
import numpy as np
from theano import config
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
numpy_time = t.timeit(stmt, setup, number=1000)
print "Numpy:\t", numpy_time
print "Speedup: {0}x".format(numpy_time/theano_time)
vlen = 40 * 1024 * 2048 # 10 x # cores x # threads per core
iters = 25
# # Extensive testing x = theano.shared(np.ones((vlen,), dtype=config.floatX), borrow=False)
# i = 0; res = Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x)), borrow=True)
# while True: f = theano.function([], res)
# a = np.ones((i,), dtype=config.floatX)
# fa = f(a) print f.maker.fgraph.toposort()
# npa = np.cumsum(a) t0 = time.time()
for i in xrange(iters):
r = f()
t1 = time.time()
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
print 'Numpy result is', np.asarray(r)
# if not np.allclose(npa, fa): # x = theano.shared(np.ones((1,vlen), dtype=config.floatX), borrow=True)
# print i, np.allclose(npa, fa) # Test axis=None # f = theano.function([], Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x,axis=1)), borrow=True))
# print fa
# print npa
# assert False
# if i % 1000 == 0: # print f.maker.fgraph.toposort()
# print i # t0 = time.time()
# for i in xrange(iters):
# r = f()
# t1 = time.time()
# print 'Looping %d times took' % iters, t1 - t0, 'seconds'
# print 'Result is', r
# print 'Numpy result is', np.asarray(r)
# i += 1 # print 'Used the', config.device
# ### Test 2D case - axis=1 ### def test_GpuCumsum(self):
# x = T.matrix('x') ### Test 1D case ###
# f = theano.function([x], cumsum(x, axis=1)) x = T.vector('x')
f = theano.function([x], cumsum(x))
# # # Even number of elements # Even number of elements
# # print "\n# Even number of elements" a = np.random.random((18,)).astype(config.floatX)
# # a = np.random.random((18,18)).astype(config.floatX) print f(a)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a))
# # # Odd number of elements
# # print "\n# Odd number of elements" # Odd number of elements
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) a = np.random.random((7,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# # # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks" # Use multiple GPU threadblocks
# # a = np.random.random((2048+2,2048+2)).astype(config.floatX) a = np.random.random((2048+2,)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) assert np.allclose(np.cumsum(a), f(a))
# # # Use multiple GPU threadblocks # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks" a = np.random.random((2048*75+2,)).astype(config.floatX)
# # a = np.ones((10,2048*75+3)).astype(config.floatX) assert np.allclose(np.cumsum(a), f(a))
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# Use multiple GPU gridblocks
# # # Use multiple GPU gridblocks a = np.ones((2048*2048+2,)).astype(config.floatX)
# # print "\n# Use multiple GPU gridblocks" assert np.allclose(np.cumsum(a), f(a))
# # a = np.ones((11,2048*2048+3)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# Extensive testing
# # Extensive testing for i in xrange(int(1e3)*5):
# i = 19000; a = np.ones((i,), dtype=config.floatX)
# while True:
# a = np.ones((11,i), dtype=config.floatX) fa = f(a)
# fa = f(a) npa = np.cumsum(a)
# npa = np.cumsum(a, axis=1)
if not np.allclose(npa, fa):
# if not np.allclose(npa, fa): print i, np.allclose(npa, fa) # Test axis=None
# print i, np.allclose(npa, fa) # Test axis=None print fa
# print fa print npa
# print npa assert False
# assert False
if i % 1000 == 0:
# if i % 1000 == 0: print i
# print i
# i += 1 #for axis in xrange(2):
for axis in xrange(2):
### Test 2D case - axis=1 ###
x = T.matrix('x')
f = theano.function([x], cumsum(x, axis=axis))
# Even number of elements
print "\n# Even number of elements (axis={0})".format(axis)
a = np.random.random((18,18)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Odd number of elements
print "\n# Odd number of elements (axis={0})".format(axis)
a = np.random.random((21,21)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use two GPU threadblocks
print "\n# Use two GPU threadblocks (axis={0})".format(axis)
a = np.random.random((2048+2,2048+2)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU threadblocks
print "\n# Use multiple GPU threadblocks (axis={0})".format(axis)
a = np.ones((10,2048*75+3)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
a = np.ones((2048*75+3,10)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks
print "\n# Use multiple GPU gridblocks (axis={0})".format(axis)
a = np.ones((11,2048*2048+3)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
a = np.ones((2048*2048+3,11)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Extensive testing for the first 10k sizes
for i in xrange(int(1e3)*5):
a = np.ones((11,i), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
a = np.ones((i,11), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
if i % 1000 == 0:
print i
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论