Commit e66dbf76 authored by James Bergstra

GpuSum - added 010 case, and support for high-dimensional sums via reshape

Parent 55dde830
...@@ -851,16 +851,19 @@ class GpuSum(Op): ...@@ -851,16 +851,19 @@ class GpuSum(Op):
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); dim3 n_blocks(1,CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
1,
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_DEV_DATA(%(z)s),
1,
CudaNdarray_HOST_STRIDES(%(z)s)[0] CudaNdarray_HOST_STRIDES(%(z)s)[0]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -868,7 +871,45 @@ class GpuSum(Op): ...@@ -868,7 +871,45 @@ class GpuSum(Op):
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_10_%(name)s", "kernel_reduce_sum_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
""" %locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail):
print >> sio, """
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]);
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
n_blocks.y, n_blocks.y,
...@@ -1095,7 +1136,8 @@ class GpuSum(Op): ...@@ -1095,7 +1136,8 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self):
    """Return the version key for Theano's C-code compilation cache.

    Bumped from (13,) to (14,) in this commit because the generated CUDA
    source changed (the 1,0 reduction kernel was generalized to handle the
    0,1,0 reduce pattern), so previously cached modules must be rebuilt.
    """
    # Uncomment to disable caching while debugging the generated C code:
    #return ()
    return (14,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -1226,20 +1268,21 @@ class GpuSum(Op): ...@@ -1226,20 +1268,21 @@ class GpuSum(Op):
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,0): if self.reduce_mask == (0,1,0) or self.reduce_mask == (1,0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is #TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous # c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column). # memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]') reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0 + blockIdx.y*sZ1]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_10_%(nodename)s( static __global__ void kernel_reduce_sum_010_%(nodename)s(
const int d0, const int d0,
const int d1, const int d1,
const float *A, const int sA0, const int sA1, const int d2,
float * Z, const int sZ0) const float *A, const int sA0, const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
...@@ -1253,8 +1296,7 @@ class GpuSum(Op): ...@@ -1253,8 +1296,7 @@ class GpuSum(Op):
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{ {
float Ai = A[i0 * sA0 + blockIdx.x * sA1]; mysum += A[blockIdx.x * sA0 + i0 * sA1 + blockIdx.y * sA2];
mysum += Ai;
} }
%(reducebuf)s %(reducebuf)s
} }
......
...@@ -228,6 +228,33 @@ def local_gpu_sum(node): ...@@ -228,6 +228,33 @@ def local_gpu_sum(node):
if hasattr(gsum, 'c_code_reduce_%s'%pattern): if hasattr(gsum, 'c_code_reduce_%s'%pattern):
return [host_from_gpu(gsum(gpu_from_host(x)))] return [host_from_gpu(gsum(gpu_from_host(x)))]
else: else:
# Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have the same value in
# the reduce_mask, then we can reshape to make them a single dimension, do
# the sum, and then reshape to get them back.
shape_of = node.env.shape_feature.shape_of
x_shape = shape_of[x]
new_in_shp = [x_shape[0]]
new_mask = [reduce_mask[0]]
for i in range(1, x.type.ndim):
if reduce_mask[i] == reduce_mask[i-1]:
new_in_shp[-1] *= x_shape[i]
else:
new_mask.append(reduce_mask[i])
new_in_shp.append(x_shape[i])
pattern=(''.join(str(i) for i in new_mask))
new_gsum = GpuSum(new_mask)
if hasattr(new_gsum, 'c_code_reduce_%s'%pattern):
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
sum_reshaped_x = host_from_gpu(new_gsum(gpu_from_host(reshaped_x)))
unreshaped_sum = sum_reshaped_x.reshape(tensor.stack(*shape_of[node.outputs[0]]))
return [unreshaped_sum]
raise Exception("GpuSum don't have implemented the pattern",pattern) raise Exception("GpuSum don't have implemented the pattern",pattern)
return False return False
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论