提交 c3e013da authored 作者: Frederic Bastien's avatar Frederic Bastien

Added a faster GpuSum case for summing over all dimensions of a c-contiguous tensor.

上级 b2cccdd2
...@@ -473,6 +473,19 @@ class GpuSum(Op): ...@@ -473,6 +473,19 @@ class GpuSum(Op):
# #
# Now perform the reduction # Now perform the reduction
# #
if all(i==1 for i in self.reduce_mask):
#check if the tensor is c-contiguous; if true, use the c_code_reduce_ccontig code.
#TODO: check if we are still c-contiguous after we un-dimshuffle
#TODO: if only some dims are c-contiguous, call a version with fewer dims.
print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals()
self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print >> sio, "}else{"
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
print >> sio, "}"
else:
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
return sio.getvalue() return sio.getvalue()
...@@ -639,6 +652,37 @@ class GpuSum(Op): ...@@ -639,6 +652,37 @@ class GpuSum(Op):
} }
""" %locals() """ %locals()
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
    """Emit host-side CUDA C code reducing a c-contiguous tensor to a scalar.

    Prints into `sio` the launch code for the
    ``kernel_reduce_sum_ccontig_%(name)s`` kernel (generated in
    ``c_support_code_apply``): a single block (``n_blocks(1)``) of up to
    ``NUM_VECTOR_OP_THREADS_PER_BLOCK`` threads sums all
    ``CudaNdarray_SIZE(x)`` elements of the input, using one float of
    shared memory per thread for the reduction buffer.  Because only one
    block is launched, this path uses a single multiprocessor and is
    intended for modestly sized inputs.

    :param sio: StringIO-like stream the generated C code is printed to.
    :param node: the Apply node (unused here; kept so all the
        ``c_code_reduce_*`` methods share one signature).
    :param name: unique node name used to suffix the kernel symbol.
    :param x: C variable name of the input CudaNdarray (must be
        c-contiguous -- the caller guards on CudaNdarray_is_c_contiguous).
    :param z: C variable name of the output CudaNdarray.
    :param fail: C code fragment to run when the kernel launch fails
        (the Python exception is set via PyErr_Format just before it).
    """
    # The template below is %-interpolated with locals(), so literal C
    # format specifiers are escaped as %%s / %%i.
    print >> sio, """
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_SIZE(%(x)s),
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_SIZE(%(x)s),//need SIZE here as we use this kernel for ccontiguous tensor
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
""" %locals()
def c_code_reduce_1(self, sio, node, name, x, z, fail): def c_code_reduce_1(self, sio, node, name, x, z, fail):
print >> sio, """ print >> sio, """
{ {
...@@ -935,11 +979,38 @@ class GpuSum(Op): ...@@ -935,11 +979,38 @@ class GpuSum(Op):
def c_code_cache_version(self):
    """Version tag for the generated C code cache.

    Bumped to 9 by this commit because the code generation changed
    (the new c-contiguous reduction kernel/path); returning an empty
    tuple instead (the commented-out line) would disable caching.
    """
    # The side-by-side diff scrape duplicated each line ("return (8,)
    # return (9,)"); this is the deduplicated post-commit version.
    #return ()
    return (9,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO() sio = StringIO.StringIO()
if all(i==1 for i in self.reduce_mask):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """
static __global__ void kernel_reduce_sum_ccontig_%(nodename)s(
const unsigned int d0,
const float *A,
float * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
mysum += A[i0];
}
%(reducebuf)s
}
""" %locals()
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
......
...@@ -52,6 +52,29 @@ def test_sum(): ...@@ -52,6 +52,29 @@ def test_sum():
assert numpy.allclose(f2(val),f(val)) assert numpy.allclose(f2(val),f(val))
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32',(False,)*len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0]=1
dim_pattern[1]=0
a = a.dimshuffle(dim_pattern)
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val,dtype='float32')
f = theano.function([a],b, mode=mode_with_gpu)
f2 = theano.function([a],b, mode=mode_without_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
assert numpy.allclose(f2(val),f(val))
#test with broadcast #test with broadcast
for shape, pattern in [((5,),[0]), for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]), ((5,4),[0,1]),((5,4),[0]),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论