提交 a9f2a795 作者: Frederic Bastien

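Make verbose mode more verbose: the kernel_reduce_sum_ccontig launch message now also prints n_threads.x, the array size, and ndim.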

上级 4e1e0f8a
...@@ -673,10 +673,11 @@ class GpuSum(Op): ...@@ -673,10 +673,11 @@ class GpuSum(Op):
std::min(CudaNdarray_SIZE(%(x)s), std::min(CudaNdarray_SIZE(%(x)s),
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1); dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s n_threads.x=%%d, size=%%d, ndim=%%d\\n",
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd);
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_SIZE(%(x)s),//need SIZE here as we use this kernel for ccontiguous tensor CudaNdarray_SIZE(%(x)s),
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s)); CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请注册或者登录后发表评论