提交 5001ae40，作者：Frederic

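Make CudaNdarray_CopyFromCudaNdarray faster by using more threads per block: threads_per_block is now chosen first (capped at NUM_VECTOR_OP_THREADS_PER_BLOCK) and n_blocks is derived from it (capped at NUM_VECTOR_OP_BLOCKS).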

上级 1bcecc38
...@@ -2880,8 +2880,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2880,8 +2880,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
assert (cudaSuccess == cudaGetLastError()); assert (cudaSuccess == cudaGetLastError());
if (verbose) fprintf(stderr, "Copying with default version unbroadcast=%d\n", unbroadcast); if (verbose) fprintf(stderr, "Copying with default version unbroadcast=%d\n", unbroadcast);
// call worker routine // call worker routine
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int threads_per_block = std::min(size,
unsigned int threads_per_block = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
(unsigned int)NUM_VECTOR_OP_BLOCKS);
const CudaNdarray * cuda_dims = other; const CudaNdarray * cuda_dims = other;
if(unbroadcast) if(unbroadcast)
cuda_dims = self; cuda_dims = self;
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论