Commit 89f584bc, authored by Sean Lee

Use the CUDA driver API for CUDA gpuarray operations.

Instead of mixing the CUDA driver API and the runtime API in the generated code, use only the CUDA driver API. GPU programs for CUDA gpuarray operations (except conv operations) are now generated as a string that is passed to the python interface of libgpuarray. libgpuarray then generates a cubin bytearray, which is embedded in the generated code. The generated code then uses the CUDA driver API via the C++ interface of libgpuarray to load and launch the GPU program. This has at least two benefits: (1) This approach does not use the nvcc offline compiler to compile the generated code into the shared library. It uses the host compiler directly, which is likely to be faster. Note that, for cubin generation, libgpuarray still uses the nvcc offline compiler, but an improvement is being made to use NVRTC and ptxas instead of nvcc, which should be, again, faster. (2) Mixing the CUDA driver API and the runtime API is typically discouraged.
Parent commit: 7852531c
...@@ -145,11 +145,18 @@ class GpuKernelBase(object): ...@@ -145,11 +145,18 @@ class GpuKernelBase(object):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar) return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
ceil_intdiv = """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels) bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels) codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels) vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
return '\n'.join([bins, codes, vars]) return '\n'.join([ceil_intdiv, bins, codes, vars])
def _generate_kernel_init(self, k, err): def _generate_kernel_init(self, k, err):
if PY3: if PY3:
......
...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc. ...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc.
""" """
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body): def nvcc_kernel(name, params, body):
""" """
Return the c code of a kernel function. Return the c code of a kernel function.
...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body): ...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body):
else: else:
yield b yield b
bodystr = ';\n'.join(flatbody()) bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s) return """KERNEL void %(name)s (%(paramstr)s)
{ {
%(bodystr)s; %(bodystr)s;
} }
...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
We use __i as an int variable in a loop. We use __i as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount), inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
# divide each exp() result by the sum to complete the job. # divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos)) r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos)) r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """ return """
{ {
// This function trashes buf[1..n_threads], // This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
npy_%(dtype)s red = %(init)s; %(ctype)s red = %(init)s;
#pragma unroll 16 #pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){ for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s; red = %(loop_line)s;
...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
We use tx as an int variable in a loop. We use tx as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
b, stride_b, load_b, b, stride_b, load_b,
dtype), dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount, threadPos, threadCount,
...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
lambda a: "exp(%s - row_max)" % a, lambda a: "exp(%s - row_max)" % a,
b, stride_b, load_b, dtype), b, stride_b, load_b, dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
] ]
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论