提交 e2909177 authored 作者: Frederic's avatar Frederic

Make reduction on the columns of a matrix in c order faster on the GPU.

上级 5f2ed7aa
...@@ -1265,8 +1265,65 @@ class GpuCAReduce(GpuOp): ...@@ -1265,8 +1265,65 @@ class GpuCAReduce(GpuOp):
def c_code_reduce_10(self, sio, node, name, x, z, fail): def c_code_reduce_10(self, sio, node, name, x, z, fail):
print >> sio, """ print >> sio, """
{ {
int verbose = 0; int verbose = 0;
if(CudaNdarray_HOST_STRIDES(%(x)s)[0] >
CudaNdarray_HOST_STRIDES(%(x)s)[1]){
// If there are a lot of summations to do, then we can use simple parallelization -
// use each thread to do one sum.
// we might as well launch blocks of 32 threads because that's the warp size.
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = 1;
int B = CudaNdarray_HOST_DIMS(%(x)s)[0];
int C = CudaNdarray_HOST_DIMS(%(x)s)[1];
int D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS;
if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
CudaNdarray_DEV_DATA(%(x)s),
1,
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s),
1,
CudaNdarray_HOST_STRIDES(%(z)s)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_10_AD%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}else{
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
...@@ -1279,7 +1336,7 @@ class GpuCAReduce(GpuOp): ...@@ -1279,7 +1336,7 @@ class GpuCAReduce(GpuOp):
n_blocks.x, n_blocks.x,
n_blocks.y); n_blocks.y);
} }
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]); assert(CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1, 1,
...@@ -1310,6 +1367,7 @@ class GpuCAReduce(GpuOp): ...@@ -1310,6 +1367,7 @@ class GpuCAReduce(GpuOp):
%(fail)s; %(fail)s;
} }
} }
}
""" % locals() """ % locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail): def c_code_reduce_010(self, sio, node, name, x, z, fail):
...@@ -1640,7 +1698,7 @@ class GpuCAReduce(GpuOp): ...@@ -1640,7 +1698,7 @@ class GpuCAReduce(GpuOp):
""" % locals() """ % locals()
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [9] # the version corresponding to the c code in this Op version = [10] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -1874,7 +1932,7 @@ class GpuCAReduce(GpuOp): ...@@ -1874,7 +1932,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
"X[a * sX0 + b * sX1 + c * sX2]", "X[a * sX0 + b * sX1 + c * sX2]",
{}, True) {}, True)
......
...@@ -1269,6 +1269,15 @@ def speed_adv_sub1(): ...@@ -1269,6 +1269,15 @@ def speed_adv_sub1():
print "ProfileMode with batch size", batch_size print "ProfileMode with batch size", batch_size
mode_with_gpu.print_summary() mode_with_gpu.print_summary()
def speed_reduce10():
data = numpy.random.rand(1000, 1000).astype("float32")
m = theano.tensor.fmatrix()
f = theano.function([m], [m.sum(axis=0), m.T.sum(axis=0)],
mode=mode_with_gpu)
f(data)
if __name__ == '__main__': if __name__ == '__main__':
test_many_arg_elemwise() test_many_arg_elemwise()
test_gpujoin_assert_cndas() test_gpujoin_assert_cndas()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论