提交 89f584bc authored 作者: Sean Lee's avatar Sean Lee

Use the CUDA driver API for CUDA gpuarray operations.

Instead of mixing the CUDA driver API and the runtime API in the generated code, use only the CUDA driver API. GPU programs for CUDA gpuarray operations (except conv operations) are now generated as a string that is passed to the python interface of libgpuarray. libgpuarray then generates a cubin bytearray, which is embedded in the generated code. The generated code then uses the CUDA driver API via the C++ interface of libgpuarray to load and launch the GPU program. This has at least two benefits: (1) This approach does not use the nvcc offline compiler to compile the generated code into the shared library. It uses the host compiler directly, which is likely to be faster. Note that, for cubin generation, libgpuarray still uses the nvcc offline compiler, but an improvement is being made to use NVRTC and ptxas instead of nvcc, which should be, again, faster. (2) Mixing the CUDA driver API and the runtime API is typically discouraged.
上级 7852531c
...@@ -145,11 +145,18 @@ class GpuKernelBase(object): ...@@ -145,11 +145,18 @@ class GpuKernelBase(object):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar) return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
ceil_intdiv = """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels) bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels) codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels) vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
return '\n'.join([bins, codes, vars]) return '\n'.join([ceil_intdiv, bins, codes, vars])
def _generate_kernel_init(self, k, err): def _generate_kernel_init(self, k, err):
if PY3: if PY3:
......
...@@ -8,6 +8,7 @@ from theano import Apply, scalar, config ...@@ -8,6 +8,7 @@ from theano import Apply, scalar, config
from theano import scalar as scal from theano import scalar as scal
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype) from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
...@@ -23,7 +24,6 @@ except ImportError: ...@@ -23,7 +24,6 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, HideC, from .basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase, Kernel) GpuKernelBase, Kernel)
from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
...@@ -57,7 +57,7 @@ def as_C_string_const(s): ...@@ -57,7 +57,7 @@ def as_C_string_const(s):
for l in s.split('\n')) for l in s.split('\n'))
class GpuElemwise(HideC, Elemwise): class GpuElemwise(GpuKernelBase, HideC, Elemwise):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
_f16_ok = True _f16_ok = True
...@@ -150,39 +150,7 @@ class GpuElemwise(HideC, Elemwise): ...@@ -150,39 +150,7 @@ class GpuElemwise(HideC, Elemwise):
code.append('}') code.append('}')
kop = '\n'.join(code) kop = '\n'.join(code)
# Translate types for scalar composite ops (except complex). support_code = ""
# NB: OpenCL implicitly has 'stdint' defs at the kernel
# compilation stage
support_code = "" if pygpu.get_default_context().kind == 'opencl' else """
#ifdef _MSC_VER
#define signed __int8 int8_t
#define unsigned __int8 uint8_t
#define signed __int16 int16_t
#define unsigned __int16 uint16_t
#define signed __int32 int32_t
#define unsigned __int32 uint32_t
#define signed __int64 int64_t
#define unsigned __int64 uint64_t
#else
#include <stdint.h>
#endif
"""
# Translate ga_ pseudo-types into their specific realizations
support_code += """
#define ga_bool uint8_t
#define ga_byte int8_t
#define ga_ubyte uint8_t
#define ga_short int16_t
#define ga_ushort uint16_t
#define ga_int int32_t
#define ga_uint uint32_t
#define ga_long int64_t
#define ga_ulong uint64_t
#define ga_float float
#define ga_double double
#define ga_half uint16_t
"""
try: try:
# We accept only some c_support_code(). # We accept only some c_support_code().
# This filter is done in the make_node() # This filter is done in the make_node()
...@@ -204,60 +172,65 @@ class GpuElemwise(HideC, Elemwise): ...@@ -204,60 +172,65 @@ class GpuElemwise(HideC, Elemwise):
kop = kop.replace(npy, ga) kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', cuda_root = config.cuda.root
'<gpuarray/ext_cuda.h>'] if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self): def c_compiler(self):
return GCC_compiler
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
return NVCC_compiler return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_support_code(self): def c_support_code(self):
return self.scalar_op.c_support_code() return self.scalar_op.c_support_code()
def c_support_code_apply(self, node, nodename): def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code # This is useless by itself, but will serve an eventual c_code
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim nd = node.inputs[0].type.ndim
CLUDA_PREAMBLE = """ res = []
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = [CLUDA_PREAMBLE]
for i in range(0, nd + 1): for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';') res.append(k.contig_src + ';')
return '\n'.join(res) return '\n'.join(res)
def gpu_kernels(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
src = self._gpu_kernel_code(node, nodename)
nd = node.outputs[0].ndim
params = ['uintp']
params.extend('uintp' for _ in range(nd))
num_inputs = len(node.inputs)
num_outputs = len(node.outputs)
for n in range(num_inputs + num_outputs):
if (n - len(node.inputs)) in self.inplace_pattern:
continue
params.extend([gpuarray.GpuArray, 'uintp'])
params.extend('intp' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return [Kernel(code=src, name="elem_%d" % nd, params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='elem_%d_%s' % (nd, nodename))]
def c_init_code(self): def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
...@@ -273,11 +246,15 @@ class GpuElemwise(HideC, Elemwise): ...@@ -273,11 +246,15 @@ class GpuElemwise(HideC, Elemwise):
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
code = """ code = """
int n_blocks = 0; size_t n_blocks = 0;
int threads_per_block = 0; size_t threads_per_block = 0;
size_t numEls = 0; size_t numEls = 0;
""" const ssize_t zero = 0;
void *kernel_params[%(num_kernel_params)d] = {0};
int err;
""" % locals()
if nd > 0: if nd > 0:
code += """ code += """
size_t dims[%(nd)s] = {%(initial_dims)s}; size_t dims[%(nd)s] = {%(initial_dims)s};
...@@ -416,23 +393,41 @@ class GpuElemwise(HideC, Elemwise): ...@@ -416,23 +393,41 @@ class GpuElemwise(HideC, Elemwise):
//std::cerr << "calling callkernel returned\\n"; //std::cerr << "calling callkernel returned\\n";
""" % locals() """ % locals()
code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals() kname = 'elem_%d_%s' % (nd, name)
param = [] param = ["(void *)&numEls"]
for i in range(nd): for i in range(nd):
param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0], param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i)) i=i))
for n, (name, var) in enumerate(zip(inputs + outputs, for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)): node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern: if (n - len(inputs)) in self.inplace_pattern:
continue continue
dtype = dtype_to_ctype(var.dtype) dtype = dtype_to_ctype(var.dtype)
param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals()) param.append("(void *)%(name)s->ga.data" % locals())
param.append("%(name)s->ga.offset" % locals()) param.append("(void *)&%(name)s->ga.offset" % locals())
for i in range(nd): for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals()) param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? (void *)&zero: (void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
code += ',\n'.join(param) + ");\n" for n, p in enumerate(param):
code += "kernel_params[%(n)d] = %(p)s;\n" % locals()
code += """
err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
if config.gpuarray.sync: if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z) code += """
err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % locals()
return str(code) return str(code)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
...@@ -573,7 +568,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -573,7 +568,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (4,) return (4,)
class GpuCAReduceCuda(HideC, CAReduceDtype): class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" """
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op. GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
...@@ -737,12 +732,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -737,12 +732,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
return False return False
return True return True
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self): def c_init_code(self):
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
...@@ -840,7 +838,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -840,7 +838,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# \begin bracket the reduction in a check that there is # \begin bracket the reduction in a check that there is
# actually work to do # actually work to do
if getattr(self.scalar_op, 'identity', None) == 0: if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals() zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1: # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else: else:
scalar_op = self.scalar_op scalar_op = self.scalar_op
...@@ -891,28 +889,24 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -891,28 +889,24 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c .. code-block:: c
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
if (verbose) if (verbose)
printf("running kernel_reduce_10_%(name)s\\n"); printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
kernel_reduce_10_%(name)s<<<n_blocks, n_threads, void *kernel_params[] = {
n_shared>>>( (void *)&PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[0], (void *)&PyGpuArray_DIMS(%(x)s)[1],
PyGpuArray_DIMS(%(x)s)[1], (void *)%(x)s->ga.data,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), (void *)&%(x)s->ga.offset,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), (void *)&stride_A0,
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), (void *)&stride_A1,
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)%(z)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s) (void *)&%(z)s->ga.offset,
); (void *)&stride_Z0};
[ int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
if config.gpuarray.sync: %(err_check)s
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
]
if (cudaSuccess != cudaGetLastError())
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
%(fail)s;
}
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
...@@ -923,64 +917,66 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -923,64 +917,66 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
ndim = len(self.reduce_mask) ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask) nd_out = ndim - sum(self.reduce_mask)
shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim) shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim)
shapes_data = ",".join(["(unsigned long long) PyGpuArray_DIMS(%s)[%d]" % (x, i) shapes_data = ",".join(["(size_t) PyGpuArray_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)]) for i in range(node.inputs[0].ndim)])
k_var = "kernel_reduce_%(pattern)s_%(name)s" % locals()
params = []
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
" nb_block=%%d, n_shared=%%d, %(shapes_format)s\\n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared, %(shapes_data)s);
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
print(""" params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
PyGpuArray_DIMS(%(x)s)[%(i)s], params.append("(void *)%(x)s->ga.data" % locals())
""" % locals(), file=sio) params.append("(void *)&%(x)s->ga.offset" % locals())
print("""
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
""" % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
print(""" print("""
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s) ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio)
print("""
,(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
""" % locals(), file=sio) """ % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals())
params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals())
for i in xrange(nd_out): for i in xrange(nd_out):
print(""" print("""
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s) ssize_t stride_Z%(i)d = PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s);
""" % locals(), file=sio) """ % locals(), file=sio)
params.append("(void *)&stride_Z%(i)d" % locals())
kernel_params = ', '.join(params)
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = { %(kernel_params)s };
if (verbose>1)
printf("n_threads[0]=%%lu, n_threads[1]=%%lu, "
"n_threads[2]=%%lu, n_threads=%%lu, "
"n_blocks[0]=%%lu, n_blocks[1]=%%lu, n_blocks[2]=%%lu, "
"n_blocks=%%lu, n_shared=%%d, %(shapes_format)s\\n",
n_threads[0],n_threads[1],
n_threads[2],
n_threads[0]*n_threads[1]*
n_threads[2],
n_blocks[0],n_blocks[1],n_blocks[2],
n_blocks[0]*n_blocks[1]*n_blocks[2],
n_shared, %(shapes_data)s);
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
""" % locals(), file=sio)
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
);
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)"
" %(shapes_format)s \\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
%(shapes_data)s);
%(fail)s;
}
""" % locals(), file=sio) """ % locals(), file=sio)
return sio.getvalue() return sio.getvalue()
...@@ -993,66 +989,86 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -993,66 +989,86 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c .. code-block:: c
static __global__ void kernel_reduce_110_%(nodename)s( KERNEL void kernel_reduce_110_%(nodename)s(
const int d0, const ga_size d0,
const int d1, const ga_size d1,
const int d2, const ga_size d2,
const %(in_dtype)s *A, const %(in_type)s *A,
const int sA0, const ga_size offset_A,
const int sA1, const ga_ssize sA0,
const int sA2, const ga_ssize sA1,
%(out_dtype)s * Z, const ga_ssize sA2,
const int sZ0) %(out_type)s * Z,
const ga_size offset_Z,
const ga_ssize sZ0)
Since the nodename is unique, we don't need to put the name Since the nodename is unique, we don't need to put the name
of the scalar_op in here. of the scalar_op in here.
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = node.outputs[0].dtype
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
if reduce_mask is None: if reduce_mask is None:
reduce_mask = self.reduce_mask reduce_mask = self.reduce_mask
if ndim is None: if ndim is None:
ndim = len(reduce_mask) ndim = len(reduce_mask)
if pattern is None: if pattern is None:
pattern = ''.join(str(i) for i in reduce_mask) pattern = ''.join(str(i) for i in reduce_mask)
kname = "kernel_reduce_%(pattern)s" % locals()
k_var = "kernel_reduce_%(pattern)s_%(nodename)s" % locals()
params = []
sio = StringIO() sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_%(pattern)s_%(nodename)s( KERNEL void %(kname)s(
""" % locals(), file=sio) """ % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
params.append('uintp')
print(""" print("""
const int d%(i)s, const ga_size d%(i)s,
""" % locals(), file=sio) """ % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print(""" print("""
const %(in_dtype)s *A, const %(in_type)s *A, const ga_size offset_A,
""" % locals(), file=sio) """ % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
params.append('intp')
print(""" print("""
const int sA%(i)s, const ga_ssize sA%(i)s,
""" % locals(), file=sio) """ % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print(""" print("""
%(out_dtype)s * Z %(out_type)s * Z, const ga_size offset_Z
""" % locals(), file=sio) """ % locals(), file=sio)
for i in xrange(ndim - sum(reduce_mask)): for i in xrange(ndim - sum(reduce_mask)):
params.append('intp')
print(""" print("""
, const int sZ%(i)s , const ga_ssize sZ%(i)s
""" % locals(), file=sio) """ % locals(), file=sio)
print(")", file=sio) print(")", file=sio)
return sio.getvalue() return sio.getvalue(), kname, params, k_var
def _k_init(self, node, nodename): def _k_init(self, node, nodename):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype) acc_dtype = self._acc_dtype(node.inputs[0].dtype)
# We need to use theano_complex* and not npy_complex* # We need to use theano_complex* and not npy_complex*
acc_dtype = theano.scalar.basic.Scalar(acc_dtype).dtype_specs()[1] in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
return """ return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x; + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = 0; %(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep //This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this. //it here to ease finding code that rely on this.
...@@ -1315,7 +1331,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1315,7 +1331,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0: if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals() zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1: # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else: else:
zero_shp = """ zero_shp = """
...@@ -1325,44 +1341,43 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1325,44 +1341,43 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
""" % locals() """ % locals()
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_ccontig_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
if(PyGpuArray_SIZE(%(x)s)==0){ if(PyGpuArray_SIZE(%(x)s)==0){
%(zero_shp)s; %(zero_shp)s;
}else{ }else{
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t numEls = PyGpuArray_SIZE(%(x)s);
std::min(PyGpuArray_SIZE(%(x)s), size_t n_threads = std::min(numEls, (size_t) 256);
(size_t) 256)); size_t n_blocks = 1;
dim3 n_blocks(1); void *kernel_params[] = {(void *)&numEls,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
if (verbose) printf("running kernel_reduce_ccontig_%(name)s" if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n", " n_threads=%%lu, size=%%lu, ndim=%%d\\n",
n_threads.x,PyGpuArray_SIZE(%(x)s), n_threads,numEls,
PyGpuArray_NDIM(%(x)s)); PyGpuArray_NDIM(%(x)s));
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>( int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
PyGpuArray_SIZE(%(x)s), %(err_check)s
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1372,10 +1387,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1372,10 +1387,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {1, 1, 1};
(size_t) 256));
dim3 n_blocks(1);
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1385,15 +1398,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1385,15 +1398,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1], size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
(size_t) 256)); while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
while (n_threads.y * n_threads.x <= 256) ++n_threads.y; n_threads[1] -= 1;
n_threads.y -= 1; if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0]) n_threads[1] = PyGpuArray_DIMS(%(x)s)[0];
n_threads.y = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {1, 1, 1};
dim3 n_blocks(1);
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1421,25 +1433,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1421,25 +1433,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
threads_y = """ threads_y = """
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= 256) while (n_threads[0] * (n_threads[1]+1) <= 256)
{ {
if (n_threads.y < PyGpuArray_DIMS(%(x)s)[%(N)s-1]) if (n_threads[1] < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads.y += 1; n_threads[1] += 1;
else else
break; break;
}""" % locals() }""" % locals()
threads_z = """ threads_z = """
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
{ {
if (n_threads.z < PyGpuArray_DIMS(%(x)s)[%(N)s-2]) if (n_threads[2] < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads.z += 1; n_threads[2] += 1;
else else
break; break;
} }
//Maximum for Fermi GPU on that dimensions. //Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64); n_threads[2] = std::min(n_threads[2], (size_t)64);
""" % locals() """ % locals()
if len(self.reduce_mask) == 2: if len(self.reduce_mask) == 2:
...@@ -1452,13 +1464,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1452,13 +1464,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[%(N)s],
(size_t) 256));
%(threads_y)s %(threads_y)s
%(threads_z)s %(threads_z)s
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
(size_t) 4096));
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1476,9 +1485,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1476,9 +1485,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_10_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
...@@ -1491,95 +1512,71 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1491,95 +1512,71 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but // we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads // the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware. // on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1); size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D: // We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E // the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32 // where C <= D*E < C+32
// where E==32 // where E==32
int A = 1; GpuKernel *%(k_var)s = &kernel_reduce_010_AD_%(name)s;
int B = PyGpuArray_DIMS(%(x)s)[0]; size_t A = 1;
int C = PyGpuArray_DIMS(%(x)s)[1]; size_t B = PyGpuArray_DIMS(%(x)s)[0];
int D = C/32; size_t C = PyGpuArray_DIMS(%(x)s)[1];
size_t D = C/32;
if (32*D < C) D+= 1; if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32)); assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make // The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big. // sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D); size_t n_blocks[3] = {A, D, 1};
if (n_blocks.x > 4096) n_blocks.x = 4096; if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x; if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads>>>( ssize_t stride_A0 = 1;
A,B,C,D, ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
1, ssize_t stride_Z0 = 1;
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), void *kernel_params[] = {
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)&A, (void *)&B, (void *)&C, (void *)&D,
1, (void *)%(x)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s) (void *)&%(x)s->ga.offset,
); (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
%(sync)s (void *)&%(z)s->ga.offset,
cudaError_t sts = cudaGetLastError(); (void *)&stride_Z0, (void *)&stride_Z1};
if (cudaSuccess != sts) int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
{ %(err_check)s
PyErr_Format(PyExc_RuntimeError, %(sync)s
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_10_AD%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}else{ }else{
dim3 n_threads( GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
(size_t) 256)); size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
dim3 n_blocks(1,
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 4096));
if (verbose) { if (verbose) {
fprintf(stderr, fprintf(stderr,
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n", "running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x, n_blocks[0],
n_blocks.y); n_blocks[1]);
} }
assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]); assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>( size_t dim_0 = 1;
1, ssize_t stride_A0 = 1;
PyGpuArray_DIMS(%(x)s)[0], ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
PyGpuArray_DIMS(%(x)s)[1], ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_Z0 = 1;
1, ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), void *kernel_params[] = {
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), (void *)&dim_0,
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)&PyGpuArray_DIMS(%(x)s)[0],
1, (void *)&PyGpuArray_DIMS(%(x)s)[1],
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s) (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
); (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1591,9 +1588,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1591,9 +1588,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
pattern = ''.join(str(i) for i in self.reduce_mask) pattern = ''.join(str(i) for i in self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2]; //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
...@@ -1608,108 +1617,82 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1608,108 +1617,82 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but // we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads // the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware. // on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1); size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D: // We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E // the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32 // where C <= D*E < C+32
// where E==32 // where E==32
int A = PyGpuArray_DIMS(%(x)s)[0]; size_t A = PyGpuArray_DIMS(%(x)s)[0];
int B = PyGpuArray_DIMS(%(x)s)[1]; size_t B = PyGpuArray_DIMS(%(x)s)[1];
int C = PyGpuArray_DIMS(%(x)s)[2]; size_t C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32; size_t D = C/32;
if (32*D < C) D+= 1; if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32)); assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make // The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big. // sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D); size_t n_blocks[3] = {A, D, 1};
if (n_blocks.x > 4096) n_blocks.x = 4096; if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x; if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
int n_shared = 0; ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>( ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
A,B,C,D, ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), void *kernel_params[] = {
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s), (void *)&A, (void *)&B, (void *)&C, (void *)&D,
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)%(x)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s), (void *)&%(x)s->ga.offset,
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s) (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
); (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
else else
{ {
int verbose = 2; int verbose = 2;
dim3 n_threads(std::min((size_t) 32, size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
PyGpuArray_DIMS(%(x)s)[2])); while( (n_threads[0]*(n_threads[1]+1)<=256)
while( (n_threads.x*(n_threads.y+1)<=256) && (n_threads[1]<PyGpuArray_DIMS(%(x)s)[1])){
&& (n_threads.y<PyGpuArray_DIMS(%(x)s)[1])){ n_threads[1]++;
n_threads.y++;
} }
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096), 1, 1};
(size_t)4096)); n_blocks[1] = std::min(
n_blocks.y = std::min(
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2], ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x), (size_t)n_threads[0]),
(size_t)(4096 / n_blocks.x) (size_t)(4096 / n_blocks[0])
); );
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)), PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)) PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s) ==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2], && n_blocks[1]==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x)){ (size_t)n_threads[0])){
if(verbose>1) if(verbose>1)
printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n", printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
PyGpuArray_DIMS(%(x)s)[0],4096, PyGpuArray_DIMS(%(x)s)[0],4096,
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads.x), ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads[0]),
(size_t)(4096 / n_blocks.x)); (size_t)(4096 / n_blocks[0]));
assert(n_threads.x<=32); assert(n_threads[0]<=32);
%(makecall_inner)s %(makecall_inner)s
}else{ }else{
n_threads.x = std::min(PyGpuArray_DIMS(%(x)s)[1], n_threads[0] = std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256); (size_t) 256);
n_blocks.x = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096); n_blocks[0] = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks.y = std::min( n_blocks[1] = std::min(
PyGpuArray_DIMS(%(x)s)[2], PyGpuArray_DIMS(%(x)s)[2],
(size_t)(4096 / n_blocks.x) (size_t)(4096 / n_blocks[0])
); );
%(makecall)s %(makecall)s
} }
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1719,16 +1702,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1719,16 +1702,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[3], while (n_threads[0] * n_threads[1] <= 256)
(size_t) 256));
while (n_threads.x * n_threads.y <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) break; if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2]); size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2], 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1738,7 +1719,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1738,7 +1719,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
sync = bool(config.gpuarray.sync) k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
# use threadIdx.x for i0 # use threadIdx.x for i0
# use blockIdx.x for i1 # use blockIdx.x for i1
# use blockIdx.y for i2 # use blockIdx.y for i2
...@@ -1747,15 +1742,12 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1747,15 +1742,12 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
int verbose = 0; int verbose = 0;
if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){ if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
printf("slow\\n"); printf("slow\\n");
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
(size_t) 256)); while (n_blocks[0] * (n_blocks[1]+1) <= 4096 &&
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[1], n_blocks[1] <= PyGpuArray_DIMS(%(x)s)[2])
(size_t)4096));
while (n_blocks.x * (n_blocks.y+1) <= 4096 &&
n_blocks.y <= PyGpuArray_DIMS(%(x)s)[2])
{ {
n_blocks.y += 1; n_blocks[1] += 1;
} }
%(makecall)s %(makecall)s
} }
...@@ -1763,50 +1755,38 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1763,50 +1755,38 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ // reuse 010_AD kernel, we transpose the 2 first dim { // reuse 010_AD kernel, we transpose the 2 first dim
// See the reduction for the real 010_AD kernel for // See the reduction for the real 010_AD kernel for
// explanation. We do this to get coalesced read. // explanation. We do this to get coalesced read.
dim3 n_threads(32,1,1); size_t n_threads[3] = {32, 1, 1};
int A = PyGpuArray_DIMS(%(x)s)[1]; size_t A = PyGpuArray_DIMS(%(x)s)[1];
int B = PyGpuArray_DIMS(%(x)s)[0]; size_t B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[2]; size_t C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32; size_t D = C/32;
if (32*D < C) D+= 1; if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32)); assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make // The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big. // sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D); size_t n_blocks[3] = {A, D, 1};
if (n_blocks.x > 4096) n_blocks.x = 4096; if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x; if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
int n_shared = 0; size_t n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>( ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
A,B,C,D, ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s), void *kernel_params[] = {
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)&A, (void *)&B, (void *)&C, (void *)&D,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s), (void *)%(x)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s) (void *)&%(x)s->ga.offset,
); (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
if (%(sync)d) (void *)%(z)s->ga.data,
GpuArray_sync(&%(z)s->ga); (void *)&%(z)s->ga.offset,
cudaError_t sts = cudaGetLastError(); (void *)&stride_Z0, (void *)&stride_Z1};
if (cudaSuccess != sts) int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
{ %(err_check)s
PyErr_Format(PyExc_RuntimeError, %(sync)s
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1815,18 +1795,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1815,18 +1795,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[1], while (n_threads[0]*n_threads[1] <= 256)
(size_t) 256));
while (n_threads.x*n_threads.y <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
break; break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[2]); size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[2], 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1836,19 +1814,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1836,19 +1814,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[2], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
(size_t) 256)); while (n_blocks[0] * n_blocks[1] <= 4096)
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
while (n_blocks.x * n_blocks.y <= 4096)
{ {
if (n_blocks.y > PyGpuArray_DIMS(%(x)s)[1]) if (n_blocks[1] > PyGpuArray_DIMS(%(x)s)[1])
break; break;
n_blocks.y += 1; n_blocks[1] += 1;
} }
n_blocks.y -= 1; n_blocks[1] -= 1;
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1858,31 +1832,29 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1858,31 +1832,29 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256) while (n_threads[0] * n_threads[1] <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break; break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256) while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{ {
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break; break;
n_threads.z += 1; n_threads[2] += 1;
} }
n_threads.z -= 1; n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions. //Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64); n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1); size_t n_blocks[3] = {1, 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1896,24 +1868,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1896,24 +1868,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
int verbose = 0; int verbose = 0;
dim3 n_blocks( size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
while (n_blocks.x * n_blocks.y <= 4096 && while (n_blocks[0] * n_blocks[1] <= 4096 &&
n_blocks.y < PyGpuArray_DIMS(%(x)s)[1]) n_blocks[1] < PyGpuArray_DIMS(%(x)s)[1])
{ {
n_blocks.y += 1; n_blocks[1] += 1;
} }
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[3], while (n_threads[0] * n_threads[1] <= 256
(size_t) 256)); && n_threads[1] < PyGpuArray_DIMS(%(x)s)[2]
while (n_threads.x * n_threads.y <= 256 && n_threads[0] * n_threads[1] * sizeof(%(acc_dtype)s) <=(15*1024-200))
&& n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads.x * n_threads.y * sizeof(%(acc_dtype)s) <=(15*1024-200))
{ {
n_threads.y += 1; n_threads[1] += 1;
} }
%(makecall)s %(makecall)s
...@@ -1925,32 +1893,30 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1925,32 +1893,30 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256) while (n_threads[0] * n_threads[1] <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break; break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256) while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{ {
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break; break;
n_threads.z += 1; n_threads[2] += 1;
} }
n_threads.z -= 1; n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions. //Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64); n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1); size_t n_blocks[3] = {1, 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1960,27 +1926,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1960,27 +1926,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * (n_threads.y+1) <= 256) ++n_threads.y; while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[2]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads.y = PyGpuArray_DIMS(%(x)s)[2]; n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) ++n_threads.z; while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256) ++n_threads[2];
if (n_threads.z > 64) if (n_threads[2] > 64)
n_threads.z = 64; n_threads[2] = 64;
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads.z = PyGpuArray_DIMS(%(x)s)[0]; n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[1]); size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [15] # the version corresponding to the c code in this Op version = [16] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -1994,14 +1958,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1994,14 +1958,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
else: else:
return () return ()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
sio = StringIO()
nd_in = len(self.reduce_mask) nd_in = len(self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = self._acc_dtype(node.inputs[0].dtype)
load_in = load_w(node.inputs[0].dtype) flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
write_out = write_w(node.outputs[0].dtype) in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
load_in = load_w(in_dtype)
write_out = write_w(out_dtype)
kernels = []
if all(i == 1 for i in self.reduce_mask): if all(i == 1 for i in self.reduce_mask):
# this kernel is ok for up to a few thousand elements, but # this kernel is ok for up to a few thousand elements, but
...@@ -2011,16 +1979,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2011,16 +1979,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0])", load_in + "(A[i0])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_ccontig_%(nodename)s( KERNEL void %(kname)s(
const unsigned int d0, const ga_size d0,
const %(in_dtype)s *A, const %(in_type)s *A, const ga_size offset_A,
%(out_dtype)s * Z) %(out_type)s *Z, const ga_size offset_Z)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2034,6 +2007,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2034,6 +2007,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
# this kernel is ok for up to a few thousand elements, but # this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
...@@ -2042,16 +2022,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2042,16 +2022,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0])", load_in + "(A[i0 * sA0])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_1_%(nodename)s( KERNEL void %(kname)s(
const unsigned int d0, const ga_size d0,
const %(in_dtype)s *A, const int sA0, const %(in_type)s *A, const ga_size offset_A,
%(out_dtype)s * Z) const ga_ssize sA0,
%(out_type)s * Z, const ga_size offset_Z)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2065,6 +2051,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2065,6 +2051,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1): if self.reduce_mask == (1, 1):
# this kernel is ok for up to a few thousand elements, but # this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
...@@ -2073,17 +2067,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2073,17 +2067,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1])", load_in + "(A[i0 * sA0 + i1 * sA1])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_11_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const %(in_dtype)s *A, const int sA0, const int sA1, const ga_ssize sA0, const ga_ssize sA1,
%(out_dtype)s * Z) %(out_type)s * Z, const ga_size offset_Z)
{ {
const int threadCount = blockDim.x * blockDim.y; const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x; const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2100,6 +2099,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2100,6 +2099,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111 #01, 011, 0111
if (0 == self.reduce_mask[0] and if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and all(self.reduce_mask[1:]) and
...@@ -2144,17 +2151,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2144,17 +2151,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node, reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node,
nodename, sub={}) nodename, sub={})
param_dim = ",".join(["const int d%d" % i param_dim = ",".join(["const ga_size d%d" % i
for i in xrange(nd_in)]) for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%d" % i param_strides = ",".join(["const ga_ssize sA%d" % i
for i in xrange(nd_in)]) for i in xrange(nd_in)])
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals()) reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals())
reduce_fct = self._assign_reduce( reduce_fct = self._assign_reduce(
node, nodename, "myresult", node, nodename, "myresult",
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])", load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True) {}, True)
sio = StringIO()
print(""" print("""
%(decl)s{ %(decl)s{
%(init)s %(init)s
...@@ -2171,6 +2179,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2171,6 +2179,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -2184,18 +2194,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2184,18 +2194,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])")
kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_010_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1, const ga_size d2,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
const %(in_dtype)s *A, const int sA0, %(out_type)s * Z, const ga_size offset_Z,
const int sA1, const int sA2, const ga_ssize sZ0, const ga_ssize sZ1)
%(out_dtype)s * Z, const int sZ0, const int sZ1)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2207,7 +2221,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2207,7 +2221,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{ {
%(reduce_fct)s; %(reduce_fct)s;
...@@ -2218,25 +2232,36 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2218,25 +2232,36 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]: if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]:
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(X[a * sX0 + b * sX1 + c * sX2])", load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])") reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])")
kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_010_AD_%(nodename)s( KERNEL void %(kname)s(
const int A, const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const int B, const %(in_type)s *X, const ga_size offset_X,
const int C, const ga_ssize sX0, const ga_ssize sX1, const ga_ssize sX2,
const int D, %(out_type)s * Z, const ga_size offset_Z,
//const int E, // THIS is 32 const ga_ssize sZ0, const ga_ssize sZ1)
const %(in_dtype)s *X, const int sX0,
const int sX1, const int sX2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
%(acc_dtype)s myresult = 0; %(acc_type)s myresult = 0;
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2262,6 +2287,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2262,6 +2287,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0):
# #
# This kernel is optimized when the inner most dimensions # This kernel is optimized when the inner most dimensions
...@@ -2275,7 +2309,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2275,7 +2309,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# block.x = dim 0 # block.x = dim 0
# block.y = dim 1 rest # block.y = dim 1 rest
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner") decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
node, nodename, node, nodename,
'blockDim.x') 'blockDim.x')
...@@ -2283,6 +2317,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2283,6 +2317,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2307,6 +2342,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2307,6 +2342,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 0): if self.reduce_mask == (1, 1, 0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -2319,19 +2356,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2319,19 +2356,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])")
kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_110_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1, const ga_size d2,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
const %(in_dtype)s *A, const int sA0, %(out_type)s * Z, const ga_size offset_Z,
const int sA1, const int sA2, const ga_ssize sZ0)
%(out_dtype)s * Z, const int sZ0)
{ {
const int threadCount = blockDim.x * blockDim.y; const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x; const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2351,15 +2392,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2351,15 +2392,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 0): if self.reduce_mask == (1, 0, 0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]', reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2378,15 +2429,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2378,15 +2429,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1): if self.reduce_mask == (1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node, reducebuf = self._k_reduce_buf('Z[0]', node,
nodename, sub={}) nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2405,6 +2459,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2405,6 +2459,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1): if self.reduce_mask == (0, 0, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
...@@ -2414,18 +2470,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2414,18 +2470,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_001_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1, const ga_size d2,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
const %(in_dtype)s *A, const int sA0, %(out_type)s * Z, const ga_size offset_Z,
const int sA1, const int sA2, const ga_ssize sZ0, const ga_ssize sZ1)
%(out_dtype)s * Z, const int sZ0, const int sZ1)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2436,7 +2496,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2436,7 +2496,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{ {
%(reduce_fct)s; %(reduce_fct)s;
...@@ -2446,17 +2506,27 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2446,17 +2506,27 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1): if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]', reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2466,7 +2536,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2466,7 +2536,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{ {
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
...@@ -2479,17 +2549,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2479,17 +2549,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0, 1): if self.reduce_mask == (0, 1, 0, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]', reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2499,7 +2572,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2499,7 +2572,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y) for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{ {
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
...@@ -2512,15 +2585,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2512,15 +2585,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1, 1): if self.reduce_mask == (1, 1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
sub={}) sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2540,6 +2616,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2540,6 +2616,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1): if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]', reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={}) node, nodename, sub={})
...@@ -2547,20 +2625,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2547,20 +2625,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_1011_%(nodename)s( KERNEL void %(kname)s(
const unsigned int d0, const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const unsigned int d1, const %(in_type)s *A, const ga_size offset_A,
const unsigned int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2, const ga_ssize sA3,
const unsigned int d3, %(out_type)s * Z, const ga_size offset_Z,
const %(in_dtype)s *A, const int sA0, const int sA1, const ga_ssize sZ0)
const int sA2, const int sA3,
%(out_dtype)s * Z, const int sZ0)
{ {
const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2580,14 +2661,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2580,14 +2661,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
print(""" params = [
template <typename T> 'uintp', 'uintp', 'uintp', 'uintp',
static T ceil_intdiv(T a, T b) gpuarray.GpuArray, 'uintp',
{ 'intp', 'intp', 'intp', 'intp',
return (a/b) + ((a % b) ? 1: 0); gpuarray.GpuArray, 'uintp',
} 'intp'
""", file=sio) ]
return sio.getvalue() kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
return kernels
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...@@ -2820,8 +2903,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2820,8 +2903,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(output)s = tmp; %(output)s = tmp;
} }
if (%(sync)d) if (%(sync)d) {
GpuArray_sync(&%(output)s->ga); err = GpuArray_sync(&%(output)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync), """ % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input, ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype)) cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
......
...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc. ...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc.
""" """
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body): def nvcc_kernel(name, params, body):
""" """
Return the c code of a kernel function. Return the c code of a kernel function.
...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body): ...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body):
else: else:
yield b yield b
bodystr = ';\n'.join(flatbody()) bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s) return """KERNEL void %(name)s (%(paramstr)s)
{ {
%(bodystr)s; %(bodystr)s;
} }
...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
We use __i as an int variable in a loop. We use __i as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount), inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
# divide each exp() result by the sum to complete the job. # divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos)) r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos)) r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """ return """
{ {
// This function trashes buf[1..n_threads], // This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
npy_%(dtype)s red = %(init)s; %(ctype)s red = %(init)s;
#pragma unroll 16 #pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){ for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s; red = %(loop_line)s;
...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
We use tx as an int variable in a loop. We use tx as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
b, stride_b, load_b, b, stride_b, load_b,
dtype), dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount, threadPos, threadCount,
...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
lambda a: "exp(%s - row_max)" % a, lambda a: "exp(%s - row_max)" % a,
b, stride_b, load_b, dtype), b, stride_b, load_b, dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
] ]
......
import os
import numpy import numpy
from theano import Op, Apply, config from theano import Op, Apply, config
...@@ -12,13 +13,14 @@ except ImportError: ...@@ -12,13 +13,14 @@ except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host) host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType from .type import GpuArrayType
from .comp import NVCC_compiler from .comp import NVCC_compiler
class GpuImages2Neibs(Images2Neibs, Op): class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
if mode not in ['valid', 'ignore_borders', 'wrap_centered']: if mode not in ['valid', 'ignore_borders', 'wrap_centered']:
raise NotImplementedError("Only the mode valid, ignore_borders" raise NotImplementedError("Only the mode valid, ignore_borders"
...@@ -43,25 +45,42 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -43,25 +45,42 @@ class GpuImages2Neibs(Images2Neibs, Op):
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (9, 1) return (10,1)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self): def c_header_dirs(self):
return NVCC_compiler if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self): def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode mode = self.mode
return """ kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
//a version that use less register but don't work in all case. //a version that use less register but don't work in all case.
static __global__ void k_multi_warp_less_%(nodename)s( KERNEL void %(kname)s(
const int nb_batch, const int nb_batch,
const int nb_stack, const int nb_stack,
const int height, const int height,
...@@ -72,15 +91,17 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -72,15 +91,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y, const int step_y,
const int grid_c, const int grid_c,
const int grid_d, const int grid_d,
const int stride0, const int stride1, const size_t stride0, const size_t stride1,
const int stride2, const int stride3, const size_t stride2, const size_t stride3,
npy_%(dtype_ten4)s * global_ten4, const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const int out_s0, const int out_s1, const size_t out_s0, const size_t out_s1,
npy_%(dtype_z)s * global_out %(type_z)s * global_out, const size_t offset_out
) )
{ {
const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2; const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d; tblock<nb_batch*nb_stack*grid_c*grid_d;
...@@ -131,9 +152,22 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -131,9 +152,22 @@ class GpuImages2Neibs(Images2Neibs, Op):
} }
} }
} }
} }""" % locals()
params = [
static __global__ void k_multi_warp_%(nodename)s( 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
KERNEL void %(kname)s(
const int nb_batch, const int nb_batch,
const int nb_stack, const int nb_stack,
const int height, const int height,
...@@ -144,15 +178,17 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -144,15 +178,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y, const int step_y,
const int grid_c, const int grid_c,
const int grid_d, const int grid_d,
const int stride0, const int stride1, const size_t stride0, const size_t stride1,
const int stride2, const int stride3, const size_t stride2, const size_t stride3,
npy_%(dtype_ten4)s * global_ten4, const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const int out_s0, const int out_s1, const size_t out_s0, const size_t out_s1,
npy_%(dtype_z)s * global_out %(type_z)s * global_out, const size_t offset_out
) )
{ {
const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2; const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d; tblock<nb_batch*nb_stack*grid_c*grid_d;
...@@ -207,6 +243,17 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -207,6 +243,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
} }
} }
""" % locals() """ % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
...@@ -220,15 +267,21 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -220,15 +267,21 @@ class GpuImages2Neibs(Images2Neibs, Op):
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
mode = self.mode mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: *fptr: %%s.",
GpuKernel_error(fptr, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(z)s->ga);" % dict(z=z) sync = """
else: err = GpuArray_sync(&%(z)s->ga);
cnda_thread_sync = "" %(err_check)s
""" % locals()
return """ return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
#endif
int grid_c = -1; int grid_c = -1;
int grid_d = -1; int grid_d = -1;
...@@ -281,10 +334,10 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -281,10 +334,10 @@ class GpuImages2Neibs(Images2Neibs, Op):
PyGpuArray_DIMS(%(ten4)s)[3]); PyGpuArray_DIMS(%(ten4)s)[3]);
%(fail)s; %(fail)s;
} }
grid_c = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[2]), grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
step_x); (size_t)step_x);
grid_d = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[3]), grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
step_y); (size_t)step_y);
}else if ( "%(mode)s" == "valid") { }else if ( "%(mode)s" == "valid") {
...@@ -367,75 +420,57 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -367,75 +420,57 @@ class GpuImages2Neibs(Images2Neibs, Op):
const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 1); PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1); size_t threads_per_block[3] = {d, c, 1};
//Their is a max of 512 threads per blocks //Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--; while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[1]>1)threads_per_block[1]--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--; while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[0]>1)threads_per_block[0]--;
//Make bigger block to have better memory access pattern and //Make bigger block to have better memory access pattern and
//a higher core utilisation. for smaller patch size //a higher core utilisation. for smaller patch size
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
n_threads.z<PyGpuArray_DIMS(%(z)s)[0]){ threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
n_threads.z++; threads_per_block[2]++;
} }
int nb_block; int nb_block;
if (PyGpuArray_DIMS(%(z)s)[0] %% n_threads.z == 0) if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / n_threads.z; nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
else else
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / n_threads.z) + 1; nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
dim3 n_blocks(std::min(32*1024,nb_block)); size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};
int n_shared = 0;
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
npy_%(dtype_ten4)s*,
int, int,
npy_%(dtype_z)s*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
f<<<n_blocks, n_threads, n_shared>>>( GpuKernel *fptr;
nb_batch, if(threads_per_block[0]==d && threads_per_block[1]==c){
nb_stack, fptr = &k_multi_warp_less_%(name)s;
height, width, }else{
c, d, step_x, step_y, fptr = &k_multi_warp_%(name)s;
grid_c, grid_d,
PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s,
(npy_%(dtype_ten4)s*)(
((char *)cuda_get_ptr(%(ten4)s->ga.data)) +
%(ten4)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset)
);
%(cnda_thread_sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "GpuImages2Neibs:"
" Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
} }
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
void *kernel_params[] = {(void *)&nb_batch,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
(void *)&step_x, (void *)&step_y,
(void *)&grid_c, (void *)&grid_d,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)&stride_A2,
(void *)&stride_A3,
(void *)%(ten4)s->ga.data,
(void *)&%(ten4)s->ga.offset,
(void *)&stride_Z0,
(void *)&stride_Z1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
int err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
......
...@@ -10,16 +10,15 @@ try: ...@@ -10,16 +10,15 @@ try:
except ImportError: except ImportError:
pass pass
from .basic_ops import as_gpuarray_variable from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel, from .kernel_codegen import (nvcc_kernel,
inline_softmax, inline_softmax,
inline_softmax_fixed_shared) inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
...@@ -41,10 +40,19 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -41,10 +40,19 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am = y_idx.type() am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am]) return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>'] return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
...@@ -54,28 +62,48 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -54,28 +62,48 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
load_b = load_w(dtype_b) load_b = load_w(dtype_b)
write_x = write_w(dtype_x) write_x = write_w(dtype_x)
write_b = write_w(dtype_b) write_b = write_w(dtype_b)
return """ flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N, type_x = gpuarray.dtype_to_ctype(work_x)
const npy_%(dtype_x)s* x_data, int xs0, int xs1, type_b = gpuarray.dtype_to_ctype(work_b)
const npy_%(dtype_b)s* b, int bs0, type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0, kname = "k_xent_sm_1hot_bias"
npy_%(dtype_x)s* nll_data, int nlls0, k_var = "k_xent_sm_1hot_bias_" + nodename
npy_%(dtype_x)s* sm_data, int sms0, int sms1, sio = StringIO()
npy_%(dtype_y_idx)s* am_data, int ams0) print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
{ {
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){ for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype_x)s* x = x_data + xs0 * row; const %(type_x)s* x = x_data + xs0 * row;
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0]; const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row; %(type_x)s* sm = sm_data + sms0 * row;
npy_%(work_x)s sum = 0.0; %(type_x)s sum = 0.0;
int row_max_j = 0; int row_max_j = 0;
npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]); %(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j) for (int j = 1; j < N; ++j)
{ {
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) + %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]); %(load_b)s(b[j*bs0]);
//todo: store to shared memory //todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j; row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max; row_max = (row_ij > row_max) ? row_ij : row_max;
...@@ -83,16 +111,16 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -83,16 +111,16 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp //compute the exp
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) + %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]); %(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max); %(type_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij; sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij); sm[j * sms1] = %(write_x)s(sm_ij);
} }
npy_%(work_x)s sum_inv = 1.0 / sum; %(type_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]); %(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv; __tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp); sm[j * sms1] = %(write_x)s(__tmp);
} }
...@@ -111,12 +139,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -111,12 +139,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am_data[row*ams0] = row_max_j; am_data[row*ams0] = row_max_j;
} }
} }
""" % locals(), file=sio)
CUdeviceptr (*cuda_get_ptr)(gpudata *g); params = [
""" % locals() 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
def c_init_code(self): gpuarray.GpuArray, 'uintp', 'intp',
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'] gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype) typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
...@@ -138,6 +172,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -138,6 +172,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_am = node.outputs[2].dtype dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__ classname = self.__class__.__name__
fail = sub['fail'] fail = sub['fail']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
sio = StringIO() sio = StringIO()
print(""" print("""
if (PyGpuArray_NDIM(%(y_idx)s) != 1) if (PyGpuArray_NDIM(%(y_idx)s) != 1)
...@@ -219,62 +268,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -219,62 +268,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
} }
} }
{ {
int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {1, 1, 1};
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
//TODO: launch more threads per row and do parallel sum and max reductions //TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1; void *kernel_params[] = {
int n_shared_bytes = 0; //n_threads * sizeof(dtype); (void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>( (void *)&stride_X0, (void *)&stride_X1,
PyGpuArray_DIMS(%(x)s)[0], (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
PyGpuArray_DIMS(%(x)s)[1], (void *)&stride_B0,
(npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
%(x)s->ga.offset), (void *)&stride_YIDX0,
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s, (void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s, (void *)&stride_NLL0,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
%(b)s->ga.offset), (void *)&stride_SM0, (void *)&stride_SM1,
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s, (void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) + (void *)&stride_AM0};
%(y_idx)s->ga.offset), int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s, %(err_check)s
(npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) + %(sync)s
%(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
} }
""" % locals(), file=sio) """ % locals(), file=sio)
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
def c_compiler(self):
return NVCC_compiler
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(Op): class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
...@@ -294,13 +328,19 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -294,13 +328,19 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (10,)
def c_headers(self): def c_header_dirs(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>'] if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_compiler(self): def c_headers(self):
return NVCC_compiler return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
...@@ -312,20 +352,36 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -312,20 +352,36 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(numpy.intp)
dnll, sm, y_idx = inp dnll, sm, y_idx = inp
dx, = out dx, = out
fail = sub['fail'] fail = sub['fail']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """ return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar. // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const npy_intp %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ? const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] : PyGpuArray_DIMS(%(dnll)s)[0] :
(npy_intp) 0); (ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element. // or a vector with just one element.
const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ? const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] : PyGpuArray_STRIDES(%(dnll)s)[0] :
(npy_intp) 0); (ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1) if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2) || (PyGpuArray_NDIM(%(sm)s) != 2)
...@@ -373,48 +429,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -373,48 +429,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
} }
} }
{ {
int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
<<<n_blocks, n_threads>>>( ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
PyGpuArray_DIMS(%(dx)s)[0], ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
PyGpuArray_DIMS(%(dx)s)[1], ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
(npy_%(dtype_dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) + void *kernel_params[] = {
%(dnll)s->ga.offset), (void *)&PyGpuArray_DIMS(%(dx)s)[0],
%(dnll)s_strides0 / %(itemsize_dnll)s, (void *)&PyGpuArray_DIMS(%(dx)s)[1],
(void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) + (void *)&stride_DNLL0,
%(sm)s->ga.offset), (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s, (void *)&stride_SM0, (void *)&stride_SM1,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s, (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) + (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
%(y_idx)s->ga.offset), (void *)&stride_DX0, (void *)&stride_DX1};
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s, int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
(npy_%(dtype_dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) + %(sync)s
%(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
} }
assert(%(dx)s); assert(%(dx)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_dnll = node.inputs[0].dtype dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
...@@ -423,18 +464,35 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -423,18 +464,35 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
load_dnll = load_w(dtype_dnll) load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm) load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx) write_dx = write_w(dtype_dx)
return """ flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s( type_dnll = gpuarray.dtype_to_ctype(work_dnll)
int N, int K, type_sm = gpuarray.dtype_to_ctype(dtype_sm)
const npy_%(dtype_dnll)s* dnll, const int dnll_s0, type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1, type_dx = gpuarray.dtype_to_ctype(dtype_dx)
const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0, kname = "kCrossEntropySoftmax1HotWithBiasDx"
npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1) k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
const %(type_dnll)s* dnll, const ga_size offset_dnll,
const ga_ssize dnll_s0,
const %(type_sm)s* sm, const ga_size offset_sm,
const ga_ssize sm_s0, const ga_ssize sm_s1,
const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
const ga_ssize y_idx_s0,
%(type_dx)s* dx, const ga_size offset_dx,
const ga_ssize dx_s0, const ga_ssize dx_s1)
{ {
dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
dx = (%(type_dx)s *)(((char *)dx)+offset_dx);
for (int i = blockIdx.x; i < N; i += gridDim.x) for (int i = blockIdx.x; i < N; i += gridDim.x)
{ {
npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]); %(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0]; %(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x) for (int j = threadIdx.x; j < K; j += blockDim.x)
{ {
...@@ -453,17 +511,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -453,17 +511,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
} }
} }
} }
""" % locals(), file=sio)
CUdeviceptr (*cuda_get_ptr)(gpudata *g); params = [
""" % locals() 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp',
def c_init_code(self): gpuarray.GpuArray, 'uintp', 'intp', 'intp',
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'] gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (Op): class GpuSoftmax (GpuKernelBase, Op):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
...@@ -482,12 +544,17 @@ class GpuSoftmax (Op): ...@@ -482,12 +544,17 @@ class GpuSoftmax (Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) + inline_softmax.code_version return (13,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self): def c_init_code(self):
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
...@@ -502,10 +569,21 @@ class GpuSoftmax (Op): ...@@ -502,10 +569,21 @@ class GpuSoftmax (Op):
x, = inp x, = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
else: else:
cnda_thread_sync = "" sync = ""
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -528,97 +606,82 @@ class GpuSoftmax (Op): ...@@ -528,97 +606,82 @@ class GpuSoftmax (Op):
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
(size_t)(32 * 1024));
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512); size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s); 2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
//Those numbers are based on not too recent GPU //Those numbers are based on not too recent GPU
//to make them compatible with more GPU. //to make them compatible with more GPU.
//TODO: read the information from the card. //TODO: read the information from the card.
if(n_shared_bytes < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
kSoftmax_%(nodename)s err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
<<< threads_per_block, n_blocks, shmem_sz,
n_blocks, kernel_params);
n_threads, fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
n_shared_bytes msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}else{ }else{
kSoftmax_fixed_shared%(nodename)s err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
<<< threads_per_block, n_blocks,
n_blocks, threads_per_block[0] * sizeof(npy_%(work_x)s),
n_threads, kernel_params);
n_threads * sizeof(npy_%(work_x)s) fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
>>>( msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax[_fixed_shared]%(nodename)s",
cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
} }
%(err_check)s
%(sync)s
} }
} }
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_sm = node.outputs[0].dtype dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype) load_x = load_w(dtype_x)
write_sm = write_w(node.outputs[0].dtype) write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype) work_sm = work_dtype(dtype_sm)
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, flags = Kernel.get_flags(dtype_x, dtype_sm)
params=['int M', 'int N', type_x = gpuarray.dtype_to_ctype(dtype_x)
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', type_sm = gpuarray.dtype_to_ctype(work_sm)
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmax"
k_var= "kSoftmax_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ npy_%(work_sm)s buf[]", "extern __shared__ %s buf[]" % type_sm,
"npy_%(work_sm)s * buf2 = buf + N", "%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])", "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
...@@ -626,21 +689,29 @@ class GpuSoftmax (Op): ...@@ -626,21 +689,29 @@ class GpuSoftmax (Op):
'threadIdx.x', 'blockDim.x', work_sm), 'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly # This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])", "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename, kernels.append(Kernel(code=code, name=kname, params=params,
params=['int M', 'int N', flags=flags, objvar=k_var))
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', kname = "kSoftmax_fixed_shared"
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], k_var= "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ npy_%(work_sm)s buf[]", "extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x, load_x,
'sm_ptr', 'sm_s1', write_sm, 'sm_ptr', 'sm_s1', write_sm,
...@@ -649,12 +720,14 @@ class GpuSoftmax (Op): ...@@ -649,12 +720,14 @@ class GpuSoftmax (Op):
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
return (ret1 + "\n" + ret2) % locals() kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op): class GpuSoftmaxWithBias (GpuKernelBase, Op):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
...@@ -676,12 +749,19 @@ class GpuSoftmaxWithBias (Op): ...@@ -676,12 +749,19 @@ class GpuSoftmaxWithBias (Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) + inline_softmax.code_version return (12,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self): def c_init_code(self):
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
...@@ -698,10 +778,19 @@ class GpuSoftmaxWithBias (Op): ...@@ -698,10 +778,19 @@ class GpuSoftmaxWithBias (Op):
x, b = inp x, b = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) sync = """
else: err = GpuArray_sync(&%(z)s->ga);
cnda_thread_sync = "" msg = "sync error";
%(err_check)s
""" % locals()
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -739,82 +828,51 @@ class GpuSoftmaxWithBias (Op): ...@@ -739,82 +828,51 @@ class GpuSoftmaxWithBias (Op):
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)); size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512); size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s); 2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
if(n_shared_bytes < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
kSoftmaxWithBias_%(nodename)s err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
<<< threads_per_block, n_blocks, shmem_sz,
n_blocks, kernel_params);
n_threads, fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
n_shared_bytes msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}else{ }else{
kSoftmaxWithBias_fixed_shared%(nodename)s err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
<<< 3, threads_per_block, n_blocks,
n_blocks, threads_per_block[0] * sizeof(npy_%(work_x)s),
n_threads, kernel_params);
n_threads * sizeof(npy_%(work_x)s) fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
>>>( msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
} }
%(cnda_thread_sync)s %(err_check)s
cudaError_t err = cudaGetLastError(); %(sync)s
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
} }
} }
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype dtype_sm = node.outputs[0].dtype
...@@ -822,55 +880,80 @@ class GpuSoftmaxWithBias (Op): ...@@ -822,55 +880,80 @@ class GpuSoftmaxWithBias (Op):
load_b = load_w(node.inputs[1].dtype) load_b = load_w(node.inputs[1].dtype)
write_sm = write_w(node.outputs[0].dtype) write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype) work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
params=['int M', 'int N', type_x = gpuarray.dtype_to_ctype(dtype_x)
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', type_b = gpuarray.dtype_to_ctype(dtype_b)
'const npy_%(dtype_b)s * b', 'const int sb0', type_sm = gpuarray.dtype_to_ctype(work_sm)
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ npy_%(work_sm)s buf[]", "extern __shared__ %s buf[]" % type_sm,
"npy_%(work_sm)s * buf2 = buf + N", "%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])", "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %(load_b)s(b[tx * sb0])", "buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm), 'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])", "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, kernels.append(Kernel(code=code, name=kname, params=params,
params=['int M', 'int N', flags=flags, objvar=k_var))
'const npy_%(dtype_x)s * x', kname = "kSoftmaxWithBias_fixed_shared"
'const int sx0', 'const int sx1', k_var = "kSoftmaxWithBias_fixed_shared" + nodename
'const npy_%(dtype_b)s * b', 'const int sb0', code = nvcc_kernel(kname,
'npy_%(dtype_sm)s * sm', params=['const ga_size M', 'const ga_size N',
'const int sm_s0', 'const int sm_s1'], 'const %s * x' % type_x, 'const ga_size offset_x',
body=[ 'const ga_ssize sx0', 'const ga_ssize sx1',
"extern __shared__ npy_%(work_sm)s buf[]", 'const %s * b' % type_b, 'const ga_size offset_b',
"for (int blockIDX = blockIdx.x; blockIDX < M;" 'const ga_ssize sb0',
" blockIDX += gridDim.x){", '%s * sm' % type_sm, 'const ga_size offset_sm',
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", body=[
inline_softmax_fixed_shared('N', 'buf', "extern __shared__ %s buf[]" % type_sm,
'x_ptr', 'sx1', "x = (const %s *)(((char *)x)+offset_x)" % type_x,
load_x, "b = (const %s *)(((char *)b)+offset_b)" % type_b,
'sm_ptr', 'sm_s1', "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
write_sm, "for (int blockIDX = blockIdx.x; blockIDX < M;"
'threadIdx.x', " blockIDX += gridDim.x){",
'blockDim.x', "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
'b', 'sb0', load_b, "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
work_sm), inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
"__syncthreads()", load_x,
"}", 'sm_ptr', 'sm_s1', write_sm,
]) 'threadIdx.x', 'blockDim.x',
return (ret1 + "\n" + ret2) % locals() 'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax_with_bias = GpuSoftmaxWithBias() gpu_softmax_with_bias = GpuSoftmaxWithBias()
...@@ -3,7 +3,7 @@ import copy ...@@ -3,7 +3,7 @@ import copy
import numpy import numpy
import theano import theano
from theano import tensor, gof, Op from theano import tensor, gof, Op, config
from six.moves import StringIO from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace import theano.tensor.inplace
...@@ -15,7 +15,7 @@ except ImportError: ...@@ -15,7 +15,7 @@ except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable, HideC from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .comp import NVCC_compiler from .comp import NVCC_compiler
...@@ -159,7 +159,7 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -159,7 +159,7 @@ class GpuSubtensor(HideC, Subtensor):
return (6,) return (6,)
class GpuIncSubtensor(IncSubtensor): class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
...@@ -177,6 +177,14 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -177,6 +177,14 @@ class GpuIncSubtensor(IncSubtensor):
def _f16_ok(self): def _f16_ok(self):
return self.iadd_node.op._f16_ok return self.iadd_node.op._f16_ok
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self): def c_headers(self):
return self.iadd_node.op.c_headers() return self.iadd_node.op.c_headers()
...@@ -186,6 +194,10 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -186,6 +194,10 @@ class GpuIncSubtensor(IncSubtensor):
def c_init_code(self): def c_init_code(self):
return self.iadd_node.op.c_init_code() return self.iadd_node.op.c_init_code()
def gpu_kernels(self, node, nodename):
subname = nodename + "_add_to_zview"
return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
...@@ -486,7 +498,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -486,7 +498,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
k(x[i], reshaped_y, broadcast=True) k(x[i], reshaped_y, broadcast=True)
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu, but use function Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent. only avail on compute capability 2.0 and more recent.
...@@ -525,16 +537,25 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -525,16 +537,25 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self): def c_header_dirs(self):
return NVCC_compiler if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_init_code(self): def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
...@@ -569,7 +590,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -569,7 +590,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
} }
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype dtype_ind = node.inputs[2].dtype
...@@ -578,7 +599,14 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -578,7 +599,14 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_y = numpy.dtype(dtype_y).itemsize itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize
return """ flags=Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
/* /*
* This is a version of atomicAdd that works for half-floats. It may * This is a version of atomicAdd that works for half-floats. It may
...@@ -587,37 +615,43 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -587,37 +615,43 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
* will not be modified. * will not be modified.
*/ */
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) { __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2); ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
npy_uint32 old, assumed, sum, new_; ga_uint old, assumed, sum, new_;
old = *base; old = *base;
do { do {
assumed = old; assumed = old;
sum = __float2half_rn( sum = __float2half_rn(
__half2float(val) + __half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0, __half2float((ga_half)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410))); ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_); old = atomicCAS(base, assumed, new_);
} while (assumed != old); } while (assumed != old);
return (npy_float16)__byte_perm(old, 0, return (ga_half)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410); ((ga_size)addr & 2) ? 0x4432 : 0x4410);
} }
__global__ void k_vector_add_fast(int numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
int numColsX, const ga_size numColsX,
int stridesX0, const ga_ssize stridesX0,
int stridesX1, const ga_ssize stridesX1,
npy_%(dtype_x)s *X, %(type_x)s *X,
int numRowsY, const ga_size offset_X,
int numColsY, const ga_size numRowsY,
int stridesY0, const ga_size numColsY,
int stridesY1, const ga_ssize stridesY0,
npy_%(dtype_y)s *Y, const ga_ssize stridesY1,
int numIndices, %(type_y)s *Y,
int stridesIndices, const ga_size offset_Y,
npy_%(dtype_ind)s *indices_arr) const ga_size numIndices,
const ga_ssize stridesIndices,
%(type_ind)s *indices_arr,
const ga_size offset_indices_arr)
{ {
X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y);
indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x) for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{ {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
...@@ -631,41 +665,71 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) { ...@@ -631,41 +665,71 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
} }
return; return;
} }
""" % locals()
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp'
]
return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)]
def c_support_code_apply(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
void GpuArray_vector_add_fast(PyGpuArrayObject* py_self, void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr) PyGpuArrayObject *indices_arr)
{ {
int num_threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
(size_t)256); size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
int num_blocks = std::min(PyGpuArray_SIZE(indices_arr), if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
(size_t)4096); ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
dim3 n_blocks(num_blocks); ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
dim3 n_threads(num_threads_per_block); ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
k_vector_add_fast<<<n_blocks, n_threads>>>( void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
PyGpuArray_DIM(py_self, 0), (void *)&PyGpuArray_DIMS(py_self)[1],
PyGpuArray_DIM(py_self, 1), (void *)&stride_X0,
PyGpuArray_STRIDE(py_self, 0) / %(itemsize_x)s, (void *)&stride_X1,
PyGpuArray_STRIDE(py_self, 1) / %(itemsize_x)s, (void *)py_self->ga.data,
(npy_%(dtype_x)s*)( (void *)&py_self->ga.offset,
((char *)cuda_get_ptr(py_self->ga.data)) + (void *)&PyGpuArray_DIMS(py_other)[0],
py_self->ga.offset), (void *)&PyGpuArray_DIMS(py_other)[1],
PyGpuArray_DIM(py_other, 0), (void *)&stride_Y0,
PyGpuArray_DIM(py_other, 1), (void *)&stride_Y1,
PyGpuArray_DIM(py_other, 0) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 0) / %(itemsize_y)s, (void *)py_other->ga.data,
PyGpuArray_DIM(py_other, 1) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 1) / %(itemsize_y)s, (void *)&py_other->ga.offset,
(npy_%(dtype_x)s*)( (void *)&PyGpuArray_DIMS(indices_arr)[0],
((char *)cuda_get_ptr(py_other->ga.data)) + (void *)&stride_ind,
py_other->ga.offset), (void *)indices_arr->ga.data,
PyGpuArray_DIMS(indices_arr)[0], (void *)&indices_arr->ga.offset};
PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s, int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
(npy_%(dtype_ind)s*)( %(err_check)s
((char *)cuda_get_ptr(indices_arr->ga.data)) + %(sync)s
indices_arr->ga.offset) }
);
return;
} }
""" % locals() """ % locals()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论