Commit 317688d9 authored by James Bergstra

GpuElemwise - better block-count and thread-count heuristic for kernel call

Parent 151a5dc8
@@ -36,8 +36,8 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', data_str='ii_i%i
 class NaiveAlgo(object):
     verbose = 0  # 1, 2 or 3 for more verbose output.
-    cache_version = ('debug', 7, verbose)
     cache_version = ()
+    cache_version = ('debug', 9, verbose)

     def __init__(self, scalar_op, sync=True):
         """
@@ -703,8 +703,15 @@ nd_collapse_[i]=0;
         if self.verbose:
             verb = 'std::cerr << " Running ccontiguous version\\n";'
         print >> sio, """
-        int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
-        int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
+        //first use at least a full warp
+        int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
+        //next start adding multiprocessors
+        int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
+        // next start adding more warps per multiprocessor
+        if (threads_per_block * n_blocks < numEls)
+            threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
         kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
         //std::cerr << "calling callkernel returned\\n";
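The heuristic above reads in three steps: start with at most a single warp of threads, spread the work across more blocks up to the hard-coded 30 (the multiprocessor count this commit assumes for its target hardware), and only then widen each block toward NUM_VECTOR_OP_THREADS_PER_BLOCK. Below is a minimal Python sketch of the same arithmetic, assuming num_els >= 1 and using 256 as an illustrative stand-in for NUM_VECTOR_OP_THREADS_PER_BLOCK (the real cap comes from the surrounding module):

def choose_launch_dims(num_els, max_threads=256, warp_size=32, n_multiproc=30):
    # 1) first use at least a full warp
    threads_per_block = min(num_els, warp_size)
    # 2) next start adding multiprocessors: ceil(num_els / threads), capped at 30
    n_blocks = min(num_els // threads_per_block
                   + (1 if num_els % threads_per_block else 0),
                   n_multiproc)
    # 3) next start adding more warps per multiprocessor
    if threads_per_block * n_blocks < num_els:
        threads_per_block = min(num_els // n_blocks, max_threads)
    return n_blocks, threads_per_block

print(choose_launch_dims(100))    # -> (4, 32): four single-warp blocks
print(choose_launch_dims(10000))  # -> (30, 256): all 30 blocks, wider blocks

Note that 30 * 256 = 7680 threads still undershoots 10000 elements; the grid is capped at 30 blocks regardless, which only works because the generated kernels loop over elements rather than assuming one thread per element.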
@@ -715,7 +722,10 @@ nd_collapse_[i]=0;
         cudaError_t err = cudaGetLastError();
         if( cudaSuccess != err)
         {
-            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "Elemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err));
+            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
+                         "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
+                         n_blocks, threads_per_block,
+                         "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
             return -1;
         }
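The enriched error path now reports the launch geometry and the text of the failing call rather than only the CUDA error string. One subtlety worth spelling out: this template is substituted once with Python's % operator, so %%s and %%i collapse to the %s and %i that PyErr_Format expects at run time, while %(nodename)s and %(scalar_op)s are filled in at code-generation time. A small illustrative sketch of that two-stage substitution (the placeholder values are invented):

template = ('PyErr_Format(PyExc_RuntimeError,\n'
            '    "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n",\n'
            '    "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),\n'
            '    n_blocks, threads_per_block);')
print(template % {'nodename': 'node_5', 'scalar_op': 'Add'})
# The emitted C holds single % specifiers and the literal names, e.g.:
#   "Cuda error: %s: %s.\n n_blocks=%i threads_per_block=%i"
#   "GpuElemwise node_5 Add"

The same four-line message is emitted again for the general-rank kernel in the hunk below; note that its Call: string still quotes the Ccontiguous launch verbatim.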
@@ -748,8 +758,16 @@ nd_collapse_[i]=0;
         kernel_call_args = ", ".join(kernel_call_args)
         print >> sio, """
-        int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
-        int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
+        //first use at least a full warp
+        int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
+        //next start adding multiprocessors
+        int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
+        // next start adding more warps per multiprocessor
+        if (threads_per_block * n_blocks < numEls)
+            threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
         kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
         """ %locals()
         if sync:
@@ -758,7 +776,10 @@ nd_collapse_[i]=0;
         cudaError_t err = cudaGetLastError();
         if( cudaSuccess != err)
         {
-            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "Elemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err));
+            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
+                         "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
+                         n_blocks, threads_per_block,
+                         "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
             return -1;
         }
@@ -776,6 +797,7 @@ nd_collapse_[i]=0;
         print >> sio, " } break;"
         print >> sio, "}"#end case
+        print >> sio, "return -2;"  # should not get to this point
         print >> sio, "}"#end fct
         #N.B. cudaGetLastError is called by c_code
...
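The trailing return -2; turns falling off the end of the generated switch into an explicit error code, distinct from the -1 returned on CUDA launch errors above. A condensed sketch of the emission pattern, in the file's own Python 2 idiom (the function name, signature, and set of ranks here are illustrative, not taken from the source):

import StringIO

sio = StringIO.StringIO()
print >> sio, "int callkernel(...) {"   # simplified, made-up signature
print >> sio, "switch (nd) {"
for nd in (1, 2, 3):                    # illustrative supported ranks
    print >> sio, "  case %i: {" % nd
    print >> sio, "    // launch the rank-%i kernel; return -1 on CUDA error" % nd
    print >> sio, "  } break;"
print >> sio, "}"           # end switch
print >> sio, "return -2;"  # should not get to this point
print >> sio, "}"           # end fct
print sio.getvalue()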