提交 b980a8ee authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6317 from abergeron/gpuarray_07

Work to integrate libgpuarray 0.7 changes.
v0.6.9 v0.7.1
\ No newline at end of file \ No newline at end of file
...@@ -46,23 +46,36 @@ def init_dev(dev, name=None, preallocate=None): ...@@ -46,23 +46,36 @@ def init_dev(dev, name=None, preallocate=None):
global pygpu_activated global pygpu_activated
if not config.cxx: if not config.cxx:
raise RuntimeError("The new gpu-backend need a c++ compiler.") raise RuntimeError("The new gpu-backend need a c++ compiler.")
if (pygpu.version.major, pygpu.version.minor, pygpu.version.patch) < (0, 6, 1): if (pygpu.version.major != 0 or pygpu.version.minor != 7 or
pygpu.version.patch < 0):
raise ValueError( raise ValueError(
"Your installed version of pygpu is too old, please upgrade to 0.6.1 or later") "Your installed version of pygpu(%s) is too old, please upgrade to 0.7.0 or later" %
pygpu.version.fullversion)
# This is for the C headers API, we need to match the exact version. # This is for the C headers API, we need to match the exact version.
if pygpu.gpuarray.api_version()[0] != 1: gpuarray_version_major_supported = 2
gpuarray_version_major_detected = pygpu.gpuarray.api_version()[0]
if gpuarray_version_major_detected != gpuarray_version_major_supported:
raise ValueError( raise ValueError(
"Your installed libgpuarray is not in sync, please make sure to have the appropriate version") "Your installed version oflibgpuarray is not in sync with the current Theano"
" version. The installed libgpuarray version support API version %d,"
" while current Theano support API version %d. Change the version of"
" libgpuarray or Theano to fix this problem.",
gpuarray_version_major_detected,
gpuarray_version_major_supported)
if dev not in init_dev.devmap: if dev not in init_dev.devmap:
args = dict()
if config.gpuarray.cache_path != '': if config.gpuarray.cache_path != '':
os.environ['GPUARRAY_CACHE_PATH'] = config.gpuarray.cache_path args['kernel_cache_path'] = config.gpuarray.cache_path
if preallocate is None: if preallocate is None:
preallocate = config.gpuarray.preallocate preallocate = config.gpuarray.preallocate
if preallocate < 0:
args['max_cache_size'] = 0
else:
args['initial_cache_size'] = preallocate
context = pygpu.init( context = pygpu.init(
dev, dev,
disable_alloc_cache=preallocate < 0, sched=config.gpuarray.sched,
single_stream=config.gpuarray.single_stream, **args)
sched=config.gpuarray.sched)
context.dev = dev context.dev = dev
init_dev.devmap[dev] = context init_dev.devmap[dev] = context
reg_context(name, context) reg_context(name, context)
...@@ -115,12 +128,12 @@ def init_dev(dev, name=None, preallocate=None): ...@@ -115,12 +128,12 @@ def init_dev(dev, name=None, preallocate=None):
# This will map the context name to the real context object. # This will map the context name to the real context object.
if config.print_active_device: if config.print_active_device:
try: try:
pcibusid = '(' + context.pcibusid + ')' unique_id = '(' + context.unique_id + ')'
except pygpu.gpuarray.UnsupportedException: except pygpu.gpuarray.UnsupportedException:
pcibusid = '' unique_id = ''
print("Mapped name %s to device %s: %s %s" % print("Mapped name %s to device %s: %s %s" %
(name, dev, context.devname, pcibusid), (name, dev, context.devname, unique_id),
file=sys.stderr) file=sys.stderr)
pygpu_activated = True pygpu_activated = True
...@@ -207,5 +220,5 @@ else: ...@@ -207,5 +220,5 @@ else:
config.device.startswith('opencl') or config.device.startswith('opencl') or
config.device.startswith('cuda') or config.device.startswith('cuda') or
config.contexts != ''): config.contexts != ''):
error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)", error("pygpu was configured but could not be imported or is too old (version 0.7 or higher required)",
exc_info=True) exc_info=True)
...@@ -158,7 +158,7 @@ class Kernel(object): ...@@ -158,7 +158,7 @@ class Kernel(object):
the `params` list consists of C typecodes the `params` list consists of C typecodes
It can also have the key `cflags` which is a string of C flag It can also have the key `cflags` which is a string of C flag
values like this `"GA_USE_DOUBLE|GA_USE_CLUDA"`. values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.
Parameters Parameters
---------- ----------
...@@ -216,7 +216,7 @@ class Kernel(object): ...@@ -216,7 +216,7 @@ class Kernel(object):
else: else:
raise TypeError("can't get a dtype from %s" % (type(t),)) raise TypeError("can't get a dtype from %s" % (type(t),))
dtypes = [get_dtype(t) for t in types] dtypes = [get_dtype(t) for t in types]
flags = dict(cluda=True) flags = dict()
if any(d == np.float64 for d in dtypes): if any(d == np.float64 for d in dtypes):
flags['have_double'] = True flags['have_double'] = True
if any(d.itemsize < 4 for d in dtypes): if any(d.itemsize < 4 for d in dtypes):
...@@ -231,8 +231,6 @@ class Kernel(object): ...@@ -231,8 +231,6 @@ class Kernel(object):
res = [] res = []
if self.flags.get('cflags', '') != '': if self.flags.get('cflags', '') != '':
res.append(self.flags['cflags']) res.append(self.flags['cflags'])
if self.flags.get('cluda', False):
res.append('GA_USE_CLUDA')
if self.flags.get('have_double', False): if self.flags.get('have_double', False):
res.append('GA_USE_DOUBLE') res.append('GA_USE_DOUBLE')
if self.flags.get('have_small', False): if self.flags.get('have_small', False):
...@@ -241,15 +239,16 @@ class Kernel(object): ...@@ -241,15 +239,16 @@ class Kernel(object):
res.append('GA_USE_COMPLEX') res.append('GA_USE_COMPLEX')
if self.flags.get('have_half', False): if self.flags.get('have_half', False):
res.append('GA_USE_HALF') res.append('GA_USE_HALF')
return '|'.join(res) res = '|'.join(res)
if not res:
return '0'
return res
def _get_py_flags(self): def _get_py_flags(self):
res = dict(self.flags) res = dict(self.flags)
cflags = res.pop('cflags', '') cflags = res.pop('cflags', '')
for fl in cflags.split('|'): for fl in cflags.split('|'):
fl = fl.strip() fl = fl.strip()
if fl == 'GA_USE_CLUDA':
res['cluda'] = True
if fl == 'GA_USE_DOUBLE': if fl == 'GA_USE_DOUBLE':
res['have_double'] = True res['have_double'] = True
if fl == 'GA_USE_SMALL': if fl == 'GA_USE_SMALL':
...@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase): ...@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
kflags = splt2[2].strip() kflags = splt2[2].strip()
kcode = def_macros + '\n' + kcode + '\n' + undef_macros kcode = def_macros + '\n' + kcode + '\n' + undef_macros
res.append(Kernel(kcode, ktypes, kname, res.append(Kernel(kcode, ktypes, kname,
flags=dict(cluda=True, cflags=kflags))) flags=dict(cflags=kflags)))
n += 2 n += 2
self._cached_kernels = res self._cached_kernels = res
return res return res
...@@ -703,39 +702,35 @@ class GpuFromHost(Op): ...@@ -703,39 +702,35 @@ class GpuFromHost(Op):
if (%(name)s_tmp == NULL) if (%(name)s_tmp == NULL)
%(fail)s %(fail)s
if (%(out)s != NULL && GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) && if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp), !theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(name)s_tmp), (size_t *)PyArray_DIMS(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) { get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
Py_BEGIN_ALLOW_THREADS
err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
PyArray_NBYTES(%(name)s_tmp));
Py_END_ALLOW_THREADS
Py_DECREF(%(name)s_tmp);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
%(fail)s;
}
} else {
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
// This method will release the GIL when needed. %(out)s = pygpu_empty(PyArray_NDIM(%(name)s_tmp),
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp), (size_t *)PyArray_DIMS(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)), get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(name)s_tmp), GA_C_ORDER, %(ctx)s, Py_None);
(size_t *)PyArray_DIMS(%(name)s_tmp),
(ssize_t *)PyArray_STRIDES(%(name)s_tmp),
%(ctx)s,
Py_None);
Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s Py_DECREF(%(name)s_tmp);
%(fail)s;
} }
} }
Py_BEGIN_ALLOW_THREADS
err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
PyArray_NBYTES(%(name)s_tmp));
Py_END_ALLOW_THREADS
Py_DECREF(%(name)s_tmp);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
%(fail)s;
}
""" % {'name': name, 'inp': inputs[0], 'ctx': sub['params'], """ % {'name': name, 'inp': inputs[0], 'ctx': sub['params'],
'out': outputs[0], 'fail': sub['fail']} 'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (10,)
class GpuToGpu(Op): class GpuToGpu(Op):
...@@ -1619,7 +1614,8 @@ class GpuEye(GpuKernelBase, Op): ...@@ -1619,7 +1614,8 @@ class GpuEye(GpuKernelBase, Op):
for i in xrange(3)] for i in xrange(3)]
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
code = """ code = """#include "cluda.h"
KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off, KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
ga_size n, ga_size m, ga_ssize k) { ga_size n, ga_size m, ga_ssize k) {
a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off); a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
......
#section kernels #section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// TODO check kernel flags #include "cluda.h"
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/); // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of // sources are clearly marked. Below we reproduce the original license of
// the Caffe software. // the Caffe software.
...@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n, ...@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
} }
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void im3d2col_kernel(const ga_size n, KERNEL void im3d2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im, const ga_size offset_im,
...@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n, ...@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void dilated_col2im3d_kernel(const ga_size n, KERNEL void dilated_col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size offset_col, const ga_size offset_col,
...@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n, ...@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
} }
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void col2im3d_kernel(const ga_size n, KERNEL void col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
......
#section kernels #section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// TODO check kernel flags // TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/); // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of // sources are clearly marked. Below we reproduce the original license of
...@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
} }
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void im2col_kernel(const ga_size n, KERNEL void im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
...@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void dilated_col2im_kernel(const ga_size n, KERNEL void dilated_col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
...@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n, ...@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
} }
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void col2im_kernel(const ga_size n, KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
......
...@@ -199,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -199,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
// check out cache // check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups); hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty()) { if (hashkey.empty()) {
......
...@@ -168,7 +168,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -168,7 +168,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
// check out cache // check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups); hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty()) { if (hashkey.empty()) {
......
...@@ -155,7 +155,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -155,7 +155,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
// check out cache // check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups); hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
if (hashkey.empty()) { if (hashkey.empty()) {
......
#section kernels #section kernels
#kernel tril_kernel : size, size, size, *: #kernel tril_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
...@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols, ...@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
} }
#kernel triu_kernel : size, size, size, *: #kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
......
#section kernels #section kernels
#kernel triu_kernel : size, size, size, *: #kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
......
#section kernels #section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_kernel(const ga_size nthreads, KERNEL void max_pool2d_kernel(const ga_size nthreads,
...@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads, ...@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_kernel(const ga_size nthreads, KERNEL void max_pool3d_kernel(const ga_size nthreads,
...@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
} }
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size: #kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_kernel(const ga_size nthreads, KERNEL void ave_pool2d_kernel(const ga_size nthreads,
...@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads, ...@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
} }
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size : #kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_kernel(const ga_size nthreads, KERNEL void ave_pool3d_kernel(const ga_size nthreads,
......
#section kernels #section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size : #kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
...@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, ...@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
} }
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size : #kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
......
#section kernels #section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
...@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, ...@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
......
#section kernels #section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
...@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, ...@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
......
#section kernels #section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
...@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, ...@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads, KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
......
...@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_ccontig" kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d0,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_1" kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename k_var = "kernel_reduce_1_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d0,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_11" kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename k_var = "kernel_reduce_11_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d0, const ga_size d1,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])", load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True) {}, True)
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s{ %(decl)s{
%(init)s %(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
...@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_010" kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename k_var = "kernel_reduce_010_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_010_AD" kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size A, const ga_size B, const ga_size C, const ga_size D, const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const %(in_type)s *X, const ga_size offset_X, const %(in_type)s *X, const ga_size offset_X,
...@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_110" kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename k_var = "kernel_reduce_110_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_001" kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename k_var = "kernel_reduce_001_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_1011" kname = "kernel_reduce_1011"
k_var = "kernel_reduce_1011_" + nodename k_var = "kernel_reduce_1011_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3, const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -2502,15 +2516,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2502,15 +2516,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels # Some OpenCL compilers do not accept no-arguments empty kernels
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}" src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
params = ['float32'] params = ['float32']
else: else:
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size, _, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim) node.inputs[0].ndim)
nd = node.inputs[0].ndim nd = node.inputs[0].ndim
params = ['uint32', gpuarray.GpuArray] params = ['uint32', gpuarray.GpuArray, 'uint32']
params.extend('uint32' for _ in range(nd)) params.extend('uint32' for _ in range(nd))
params.append(gpuarray.GpuArray) params.append(gpuarray.GpuArray)
params.append('uint32') params.append('uint32')
...@@ -2617,9 +2631,10 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2617,9 +2631,10 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code += """ code += """
args[0] = &n; args[0] = &n;
args[1] = tmp->ga.data; args[1] = tmp->ga.data;
args[2] = &tmp->ga.offset;
""" % dict(output=output) """ % dict(output=output)
p = 2 p = 3
for i in range(node.inputs[0].ndim): for i in range(node.inputs[0].ndim):
code += """ code += """
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s]; proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
...@@ -2677,7 +2692,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2677,7 +2692,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
return code return code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
return (3, self.kernel_version(node)) return (4, self.kernel_version(node))
def generate_kernel(self, node, odtype, redux): def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add): if isinstance(self.scalar_op, scalar.basic.Add):
......
...@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
k_var = "k_cumadd_" + nodename k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x) flags = Kernel.get_flags(dtype_x)
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s(float* input, ga_size input_offset, KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset, float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z, ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
...@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE] 'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """ code = """#include "cluda.h"
// helper functions // helper functions
WITHIN_KERNEL WITHIN_KERNEL
void k_reductionPhase(float* partialCumOp) { void k_reductionPhase(float* partialCumOp) {
...@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
# k_finalCumOp # k_finalCumOp
kname = "k_finalCumOp" kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename k_var = "k_finalCumOp_" + nodename
code = """ code = """#include "cluda.h"
KERNEL void k_finalCumOp(float* output, ga_size output_offset, KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset, float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp, size_t nbElementsPerCumOp,
......
...@@ -22,7 +22,7 @@ def load_w(dtype): ...@@ -22,7 +22,7 @@ def load_w(dtype):
""" """
if dtype == 'float16': if dtype == 'float16':
return '__half2float' return 'ga_half2float'
else: else:
return '' return ''
...@@ -37,6 +37,6 @@ def write_w(dtype): ...@@ -37,6 +37,6 @@ def write_w(dtype):
""" """
if dtype == 'float16': if dtype == 'float16':
return '__float2half_rn' return 'ga_float2half'
else: else:
return '' return ''
...@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body): ...@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
else: else:
yield b yield b
bodystr = ';\n'.join(flatbody()) bodystr = ';\n'.join(flatbody())
return """KERNEL void %(name)s (%(paramstr)s) return """#include "cluda.h"
KERNEL void %(name)s (%(paramstr)s)
{ {
%(bodystr)s; %(bodystr)s;
} }
......
...@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op): ...@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype)) work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
write_out_ctype = write_w(node.outputs[0].dtype) write_out_ctype = write_w(node.outputs[0].dtype)
load_in_ctype = load_w(node.inputs[0].dtype) load_in_ctype = load_w(node.inputs[0].dtype)
code = """ code = """#include "cluda.h"
KERNEL void k_multi_warp_multinomial( KERNEL void k_multi_warp_multinomial(
const ga_size nb_multi, const ga_size nb_multi,
const ga_size nb_outcomes, const ga_size nb_outcomes,
...@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op): ...@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
replace = int(self.replace) replace = int(self.replace)
code = """ code = """#include "cluda.h"
KERNEL void k_multi_warp_multinomial_wor( KERNEL void k_multi_warp_multinomial_wor(
const ga_size nb_multi, const ga_size nb_multi,
const ga_size nb_outcomes, const ga_size nb_outcomes,
......
...@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kernels = [] kernels = []
kname = "k_multi_warp_less" kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename k_var = "k_multi_warp_less_" + nodename
code = """ code = """#include "cluda.h"
// a version that uses less registers but doesn't work in all cases. // a version that uses less registers but doesn't work in all cases.
%(mode_constants)s %(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
...@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp" kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename k_var = "k_multi_warp_" + nodename
code = """ code = """#include "cluda.h"
%(mode_constants)s %(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_int mode, const ga_int mode,
...@@ -500,7 +502,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -500,7 +502,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t threads_per_block[3] = {d, c, 1}; size_t threads_per_block[3] = {d, c, 1};
//get the max threads per blocks //get the max threads per blocks
size_t max_threads_dim; size_t max_threads_dim;
int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim); int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
%(fail)s; %(fail)s;
......
...@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
] ]
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s(const ga_size M, const ga_size N, KERNEL void %(kname)s(const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
...@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
] ]
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size N, const ga_size K, const ga_size N, const ga_size K,
GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0, GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
...@@ -495,7 +497,7 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -495,7 +497,7 @@ class GpuSoftmax(GpuKernelBase, Op):
{ {
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] * size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s); 2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s; ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
...@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
kernels = [] kernels = []
kname = "kSoftmax" kname = "kSoftmax"
k_var = "kSoftmax_" + nodename k_var = "kSoftmax_" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
...@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared" kname = "kSoftmax_fixed_shared"
k_var = "kSoftmax_fixed_shared" + nodename k_var = "kSoftmax_fixed_shared" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
...@@ -788,7 +792,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -788,7 +792,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
{ {
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] * size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s); 2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s; ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
...@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
kernels = [] kernels = []
kname = "kSoftmaxWithBias" kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename k_var = "kSoftmaxWithBias_" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
...@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared" kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
......
...@@ -1110,17 +1110,11 @@ def local_gpua_advanced_boolean_subtensor(op, context_name, inputs, outputs): ...@@ -1110,17 +1110,11 @@ def local_gpua_advanced_boolean_subtensor(op, context_name, inputs, outputs):
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs): def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
context = get_context(context_name)
# This is disabled on non-cuda contexts
if context.kind != b'cuda':
return None
x, y, ilist = inputs x, y, ilist = inputs
set_instead_of_inc = op.set_instead_of_inc set_instead_of_inc = op.set_instead_of_inc
compute_capability = int(context.bin_id[-2]) if (x.ndim == 1 and y.ndim == 0 and
if (compute_capability >= 2 and x.ndim == 1 and y.ndim == 0 and
config.deterministic == 'default'): config.deterministic == 'default'):
x = x.dimshuffle(0, 'x') x = x.dimshuffle(0, 'x')
y = y.dimshuffle('x', 'x') y = y.dimshuffle('x', 'x')
...@@ -1128,7 +1122,7 @@ def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs): ...@@ -1128,7 +1122,7 @@ def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
set_instead_of_inc=set_instead_of_inc)(x, y, ilist) set_instead_of_inc=set_instead_of_inc)(x, y, ilist)
ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret) ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret)
return ret return ret
elif (compute_capability < 2 or x.ndim != 2 or y.ndim != 2 or elif (x.ndim != 2 or y.ndim != 2 or
config.deterministic == 'more'): config.deterministic == 'more'):
return GpuAdvancedIncSubtensor1( return GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
......
...@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else: else:
raise ValueError('Unsupported data type for output', raise ValueError('Unsupported data type for output',
self.output_type.dtype) self.output_type.dtype)
code = """ code = """#include "cluda.h"
KERNEL void mrg_uniform( KERNEL void mrg_uniform(
GLOBAL_MEM %(otype)s *sample_data, GLOBAL_MEM %(otype)s *sample_data,
ga_size sample_offset, ga_size sample_offset,
......
...@@ -353,7 +353,7 @@ int sub_setarray(GpuArray *dst, GpuArray *src) { ...@@ -353,7 +353,7 @@ int sub_setarray(GpuArray *dst, GpuArray *src) {
int err; int err;
err = GpuArray_setarray(dst, src); err = GpuArray_setarray(dst, src);
if (err != GA_NO_ERROR) if (err != GA_NO_ERROR)
PyErr_SetString(PyExc_RuntimeError, "setarray failed"); PyErr_SetString(PyExc_RuntimeError, GpuArray_error(src, err));
return err; return err;
} }
""" """
...@@ -1037,8 +1037,7 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -1037,8 +1037,7 @@ class GpuAdvancedIncSubtensor1(Op):
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
GpuAdvancedIncSubtensor1): GpuAdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu, but use function Implement AdvancedIncSubtensor1 on the gpu with atomics
only avail on compute capability 2.0 and more recent.
""" """
_f16_ok = True _f16_ok = True
...@@ -1089,12 +1088,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -1089,12 +1088,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return [gpuarray_helper_inc_dir()] return [gpuarray_helper_inc_dir()]
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
ctx = self.get_params(node).context
if ctx.kind != b'cuda':
raise NotImplementedError("cuda only")
if (node.inputs[0].ndim != node.inputs[1].ndim or if (node.inputs[0].ndim != node.inputs[1].ndim or
node.inputs[0].ndim != 2 or node.inputs[0].ndim != 2):
int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
return """ return """
...@@ -1125,110 +1120,33 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of ...@@ -1125,110 +1120,33 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast" kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
code = """ code = """#include "cluda.h"
/*
* This is an atomicAdd that works for doubles since that is not provided
* natively by cuda before arch 6.0.
*/
#if __CUDA_ARCH__ < 600
__device__ ga_double atomicAdd(ga_double* address, ga_double val) {
ga_ulong *address_as_ull = (ga_ulong *)address;
ga_ulong old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
__device__ ga_double atomicExch(ga_double *address, ga_double val) {
return atomicExch((ga_ulong *)address,
__double_as_longlong(val));
}
/* GA_LONG */
__device__ ga_long atomicAdd(ga_long* address, ga_long val) {
ga_ulong *address_as_ull = (ga_ulong *)address;
ga_ulong old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
(ga_ulong)(val + (ga_long)assumed));
} while (assumed != old);
return (ga_long)old;
}
__device__ ga_long atomicExch(ga_long *address, ga_long val) {
return (ga_long)atomicExch((ga_ulong *)address, (ga_ulong)val);
}
/* GA_HALF */
/*
* This may read and write 2 bytes more than the size of the array
* if the array has an uneven number of elements. The actual value
* at that spot will not be modified.
*/
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, new_;
old = *base;
do {
assumed = old;
new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
KERNEL void k_vector_add_fast(const ga_size numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX, const ga_size numColsX,
const ga_ssize stridesX0, const ga_ssize stridesX0,
const ga_ssize stridesX1, const ga_ssize stridesX1,
%(type_x)s *X, GLOBAL_MEM %(type_x)s *X,
const ga_size offset_X, const ga_size offset_X,
const ga_size numRowsY, const ga_size numRowsY,
const ga_size numColsY, const ga_size numColsY,
const ga_ssize stridesY0, const ga_ssize stridesY0,
const ga_ssize stridesY1, const ga_ssize stridesY1,
%(type_y)s *Y, GLOBAL_MEM %(type_y)s *Y,
const ga_size offset_Y, const ga_size offset_Y,
const ga_size numIndices, const ga_size numIndices,
const ga_ssize stridesIndices, const ga_ssize stridesIndices,
%(type_ind)s *indices_arr, GLOBAL_MEM %(type_ind)s *indices_arr,
const ga_size offset_indices_arr, const ga_size offset_indices_arr,
const int set_instead_of_inc, const ga_int set_instead_of_inc,
ga_int *err) GLOBAL_MEM ga_int *err)
{ {
X = (%(type_x)s *)(((char *)X)+offset_X); X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y); Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr); indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
{ {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
{ {
ga_ssize x_row = indices_arr[i * stridesIndices]; ga_ssize x_row = indices_arr[i * stridesIndices];
if (x_row < 0) if (x_row < 0)
...@@ -1236,10 +1154,10 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1236,10 +1154,10 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
ga_ssize y_row = i; ga_ssize y_row = i;
if (x_row < numRowsX && x_row >= 0) { if (x_row < numRowsX && x_row >= 0) {
if (set_instead_of_inc) { if (set_instead_of_inc) {
atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)], atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]); Y[(y_row * stridesY0) + (j * stridesY1)]);
} else { } else {
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]); Y[(y_row * stridesY0) + (j * stridesY1)]);
} }
} else { } else {
...@@ -1249,11 +1167,13 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1249,11 +1167,13 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
} }
return; return;
} }
""" % dict(type_x=type_x, type_y=type_y, type_ind=type_ind) """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
tc=np.dtype(dtype_x).char)
from pygpu.gpuarray import SIZE, SSIZE
params = [ params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'int', SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32',
gpuarray.GpuArray] gpuarray.GpuArray]
return [Kernel(code=code, name=kname, params=params, return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
...@@ -1265,15 +1185,15 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1265,15 +1185,15 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
PyGpuArrayObject* indices_arr, PyGpuArrayObject* indices_arr,
const int set_instead_of_inc) const int set_instead_of_inc)
{ {
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1}; size_t threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256);
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1}; size_t n_blocks = std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096);
gpudata *errbuf; gpudata *errbuf;
int err, kerr = 0; int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga); size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga); size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga); size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) { if (threads_per_block > 0 && n_blocks > 0) {
err = gpudata_property(py_self->ga.data, err = gpudata_property(py_self->ga.data,
GA_CTX_PROP_ERRBUF, &errbuf); GA_CTX_PROP_ERRBUF, &errbuf);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -1281,30 +1201,27 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1281,30 +1201,27 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1; return 1;
} }
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x; err = k_vector_add_fast_call(
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x; 1, &n_blocks, &threads_per_block, 0,
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y; PyGpuArray_DIMS(py_self)[0],
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y; PyGpuArray_DIMS(py_self)[1],
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind; PyGpuArray_STRIDES(py_self)[0] / itemsize_x,
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0], PyGpuArray_STRIDES(py_self)[1] / itemsize_x,
(void *)&PyGpuArray_DIMS(py_self)[1], py_self->ga.data,
(void *)&stride_X0, py_self->ga.offset,
(void *)&stride_X1, PyGpuArray_DIMS(py_other)[0],
(void *)py_self->ga.data, PyGpuArray_DIMS(py_other)[1],
(void *)&py_self->ga.offset, PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
(void *)&PyGpuArray_DIMS(py_other)[0], PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
(void *)&PyGpuArray_DIMS(py_other)[1], py_other->ga.data,
(void *)&stride_Y0, py_other->ga.offset,
(void *)&stride_Y1, PyGpuArray_DIMS(indices_arr)[0],
(void *)py_other->ga.data, PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind,
(void *)&py_other->ga.offset, indices_arr->ga.data,
(void *)&PyGpuArray_DIMS(indices_arr)[0], indices_arr->ga.offset,
(void *)&stride_ind, set_instead_of_inc,
(void *)indices_arr->ga.data, errbuf);
(void *)&indices_arr->ga.offset,
(void *)&set_instead_of_inc,
(void *)errbuf};
err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.", "gpuarray error: %(k_var)s: %%s.",
......
#section kernels #section kernels
#kernel eye : *, size, size, size : #kernel eye : *, size, size, size :
#include <cluda.h>
/* The eye name will be used to generate supporting objects. The only /* The eye name will be used to generate supporting objects. The only
you probably need to care about is the kernel object which will be you probably need to care about is the kernel object which will be
named 'k_' + <the name above> (k_eye in this case). This name also named 'k_' + <the name above> (k_eye in this case). This name also
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论