提交 45b32281 authored 作者: James Bergstra's avatar James Bergstra

GpuSum - fixed bug in calculation of n_blocks for the 10 pattern

上级 e6876da1
......@@ -453,7 +453,6 @@ class GpuSum(Op):
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)s;
}
}
""" %locals()
......@@ -824,8 +823,16 @@ class GpuSum(Op):
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1,CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
dim3 n_blocks(1,
std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_BLOCKS));
if (verbose) {
fprintf(stderr,
"running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
}
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
......@@ -1173,9 +1180,7 @@ class GpuSum(Op):
""" %locals()
def c_code_cache_version(self):
#return ()
return (19,)
return (20,)
def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论