Commit 45b32281 authored by James Bergstra

GpuSum - fixed bug in calculation of n_blocks for the 10 pattern

Parent e6876da1
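The fix clamps the second grid dimension instead of launching one block per element along dimension 1. Below is a minimal standalone sketch of that clamping pattern, assuming hypothetical stand-ins THREADS_PER_BLOCK and MAX_GRID_Y for Theano's NUM_VECTOR_OP_THREADS_PER_BLOCK and NUM_VECTOR_OP_BLOCKS, and a simplified kernel in place of the generated kernel_reduce_sum_010; it illustrates the technique, it is not the GpuSum code itself.

// reduce_sum_010_sketch.cu -- illustration only; constants and names are
// hypothetical stand-ins, not the identifiers used by Theano's GpuSum.
#include <algorithm>

#define THREADS_PER_BLOCK 256   // power of two, so the tree reduction below is valid
#define MAX_GRID_Y        4096  // stand-in for NUM_VECTOR_OP_BLOCKS

// Sum a row-major (d0, d1) matrix over axis 0, writing the d1 column sums to z.
__global__ void reduce_sum_010(const float *x, float *z, int d0, int d1)
{
    __shared__ float buf[THREADS_PER_BLOCK];
    // After clamping, gridDim.y may be smaller than d1, so each block strides
    // over the columns it owns instead of assuming one block per column.
    for (int j = blockIdx.y; j < d1; j += gridDim.y)
    {
        float s = 0.0f;
        for (int i = threadIdx.x; i < d0; i += blockDim.x)
            s += x[i * d1 + j];
        buf[threadIdx.x] = s;
        __syncthreads();
        // Tree reduction of the per-thread partial sums.
        for (int stride = blockDim.x / 2; stride > 0; stride /= 2)
        {
            if (threadIdx.x < stride)
                buf[threadIdx.x] += buf[threadIdx.x + stride];
            __syncthreads();
        }
        if (threadIdx.x == 0)
            z[j] = buf[0];
        __syncthreads();   // keep fast threads from reusing buf for the next column too early
    }
}

void launch_sum_over_rows(const float *x_dev, float *z_dev, int d0, int d1)
{
    dim3 n_threads(THREADS_PER_BLOCK);
    // Buggy form: dim3 n_blocks(1, d1);  -- one block per column, with no upper bound.
    // Fixed form: clamp the grid's y dimension to the block budget.
    dim3 n_blocks(1, std::min(d1, MAX_GRID_Y));
    reduce_sum_010<<<n_blocks, n_threads>>>(x_dev, z_dev, d0, d1);
}

With the clamp in place, correctness relies on the kernel striding over columns by gridDim.y as in the sketch; launching fewer blocks than columns then only affects performance, not the result.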
@@ -453,7 +453,6 @@ class GpuSum(Op):
                 PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
                 %(fail)s;
             }
         }
         """ %locals()
@@ -824,8 +823,16 @@ class GpuSum(Op):
             dim3 n_threads(
                     std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                              NUM_VECTOR_OP_THREADS_PER_BLOCK));
-            dim3 n_blocks(1,CudaNdarray_HOST_DIMS(%(x)s)[1]);
-            if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
+            dim3 n_blocks(1,
+                    std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
+                             NUM_VECTOR_OP_BLOCKS));
+            if (verbose) {
+                fprintf(stderr,
+                    "running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
+                    n_blocks.x,
+                    n_blocks.y);
+            }
+            assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
             int n_shared = sizeof(float) * n_threads.x;
             kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                     1,
@@ -1173,9 +1180,7 @@ class GpuSum(Op):
         """ %locals()

     def c_code_cache_version(self):
-        #return ()
-        return (19,)
+        return (20,)

     def c_support_code_apply(self, node, nodename):
         sio = StringIO.StringIO()