提交 b80a7d12 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron 提交者: Frederic Bastien

Add cluda include to all kernels.

上级 28ffda75
...@@ -158,7 +158,7 @@ class Kernel(object): ...@@ -158,7 +158,7 @@ class Kernel(object):
the `params` list consists of C typecodes the `params` list consists of C typecodes
It can also have the key `cflags` which is a string of C flag It can also have the key `cflags` which is a string of C flag
values like this `"GA_USE_DOUBLE|GA_USE_CLUDA"`. values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.
Parameters Parameters
---------- ----------
...@@ -216,7 +216,7 @@ class Kernel(object): ...@@ -216,7 +216,7 @@ class Kernel(object):
else: else:
raise TypeError("can't get a dtype from %s" % (type(t),)) raise TypeError("can't get a dtype from %s" % (type(t),))
dtypes = [get_dtype(t) for t in types] dtypes = [get_dtype(t) for t in types]
flags = dict(cluda=True) flags = dict()
if any(d == np.float64 for d in dtypes): if any(d == np.float64 for d in dtypes):
flags['have_double'] = True flags['have_double'] = True
if any(d.itemsize < 4 for d in dtypes): if any(d.itemsize < 4 for d in dtypes):
...@@ -231,8 +231,6 @@ class Kernel(object): ...@@ -231,8 +231,6 @@ class Kernel(object):
res = [] res = []
if self.flags.get('cflags', '') != '': if self.flags.get('cflags', '') != '':
res.append(self.flags['cflags']) res.append(self.flags['cflags'])
if self.flags.get('cluda', False):
res.append('GA_USE_CLUDA')
if self.flags.get('have_double', False): if self.flags.get('have_double', False):
res.append('GA_USE_DOUBLE') res.append('GA_USE_DOUBLE')
if self.flags.get('have_small', False): if self.flags.get('have_small', False):
...@@ -241,15 +239,16 @@ class Kernel(object): ...@@ -241,15 +239,16 @@ class Kernel(object):
res.append('GA_USE_COMPLEX') res.append('GA_USE_COMPLEX')
if self.flags.get('have_half', False): if self.flags.get('have_half', False):
res.append('GA_USE_HALF') res.append('GA_USE_HALF')
return '|'.join(res) res = '|'.join(res)
if not res:
return '0'
return res
def _get_py_flags(self): def _get_py_flags(self):
res = dict(self.flags) res = dict(self.flags)
cflags = res.pop('cflags', '') cflags = res.pop('cflags', '')
for fl in cflags.split('|'): for fl in cflags.split('|'):
fl = fl.strip() fl = fl.strip()
if fl == 'GA_USE_CLUDA':
res['cluda'] = True
if fl == 'GA_USE_DOUBLE': if fl == 'GA_USE_DOUBLE':
res['have_double'] = True res['have_double'] = True
if fl == 'GA_USE_SMALL': if fl == 'GA_USE_SMALL':
...@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase): ...@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
kflags = splt2[2].strip() kflags = splt2[2].strip()
kcode = def_macros + '\n' + kcode + '\n' + undef_macros kcode = def_macros + '\n' + kcode + '\n' + undef_macros
res.append(Kernel(kcode, ktypes, kname, res.append(Kernel(kcode, ktypes, kname,
flags=dict(cluda=True, cflags=kflags))) flags=dict(cflags=kflags)))
n += 2 n += 2
self._cached_kernels = res self._cached_kernels = res
return res return res
...@@ -1619,7 +1618,8 @@ class GpuEye(GpuKernelBase, Op): ...@@ -1619,7 +1618,8 @@ class GpuEye(GpuKernelBase, Op):
for i in xrange(3)] for i in xrange(3)]
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
code = """ code = """#include "cluda.h"
KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off, KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
ga_size n, ga_size m, ga_ssize k) { ga_size n, ga_size m, ga_ssize k) {
a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off); a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
......
#section kernels #section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// TODO check kernel flags #include "cluda.h"
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/); // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of // sources are clearly marked. Below we reproduce the original license of
// the Caffe software. // the Caffe software.
...@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n, ...@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
} }
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void im3d2col_kernel(const ga_size n, KERNEL void im3d2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im, const ga_size offset_im,
...@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n, ...@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void dilated_col2im3d_kernel(const ga_size n, KERNEL void dilated_col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size offset_col, const ga_size offset_col,
...@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n, ...@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
} }
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void col2im3d_kernel(const ga_size n, KERNEL void col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
......
#section kernels #section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// TODO check kernel flags // TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/); // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of // sources are clearly marked. Below we reproduce the original license of
...@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
} }
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void im2col_kernel(const ga_size n, KERNEL void im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
...@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void dilated_col2im_kernel(const ga_size n, KERNEL void dilated_col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
...@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n, ...@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
} }
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size : #kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void col2im_kernel(const ga_size n, KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
......
#section kernels #section kernels
#kernel tril_kernel : size, size, size, *: #kernel tril_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
...@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols, ...@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
} }
#kernel triu_kernel : size, size, size, *: #kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
......
#section kernels #section kernels
#kernel triu_kernel : size, size, size, *: #kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
......
#section kernels #section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_kernel(const ga_size nthreads, KERNEL void max_pool2d_kernel(const ga_size nthreads,
...@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads, ...@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_kernel(const ga_size nthreads, KERNEL void max_pool3d_kernel(const ga_size nthreads,
...@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
} }
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size: #kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_kernel(const ga_size nthreads, KERNEL void ave_pool2d_kernel(const ga_size nthreads,
...@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads, ...@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
} }
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size : #kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_kernel(const ga_size nthreads, KERNEL void ave_pool3d_kernel(const ga_size nthreads,
......
#section kernels #section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size : #kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
...@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, ...@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
} }
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size : #kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
......
#section kernels #section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
...@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, ...@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
......
#section kernels #section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
...@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, ...@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
......
#section kernels #section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size : #kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
...@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, ...@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
} }
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size : #kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads, KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
......
...@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_ccontig" kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d0,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_1" kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename k_var = "kernel_reduce_1_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d0,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_11" kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename k_var = "kernel_reduce_11_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d0, const ga_size d1,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])", load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True) {}, True)
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s{ %(decl)s{
%(init)s %(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
...@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_010" kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename k_var = "kernel_reduce_010_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_010_AD" kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size A, const ga_size B, const ga_size C, const ga_size D, const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const %(in_type)s *X, const ga_size offset_X, const %(in_type)s *X, const ga_size offset_X,
...@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_110" kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename k_var = "kernel_reduce_110_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_001" kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename k_var = "kernel_reduce_001_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
%(decl)s %(decl)s
{ {
%(init)s %(init)s
...@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_1011" kname = "kernel_reduce_1011"
k_var = "kernel_reduce_1011_" + nodename k_var = "kernel_reduce_1011_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3, const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const %(in_type)s *A, const ga_size offset_A, const %(in_type)s *A, const ga_size offset_A,
...@@ -2502,8 +2516,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2502,8 +2516,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels # Some OpenCL compilers do not accept no-arguments empty kernels
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}" src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
params = ['float32'] params = ['float32']
else: else:
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
......
...@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
k_var = "k_cumadd_" + nodename k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x) flags = Kernel.get_flags(dtype_x)
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s(float* input, ga_size input_offset, KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset, float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z, ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
...@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE] 'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """ code = """#include "cluda.h"
// helper functions // helper functions
WITHIN_KERNEL WITHIN_KERNEL
void k_reductionPhase(float* partialCumOp) { void k_reductionPhase(float* partialCumOp) {
...@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
# k_finalCumOp # k_finalCumOp
kname = "k_finalCumOp" kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename k_var = "k_finalCumOp_" + nodename
code = """ code = """#include "cluda.h"
KERNEL void k_finalCumOp(float* output, ga_size output_offset, KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset, float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp, size_t nbElementsPerCumOp,
......
...@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body): ...@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
else: else:
yield b yield b
bodystr = ';\n'.join(flatbody()) bodystr = ';\n'.join(flatbody())
return """KERNEL void %(name)s (%(paramstr)s) return """#include "cluda.h"
KERNEL void %(name)s (%(paramstr)s)
{ {
%(bodystr)s; %(bodystr)s;
} }
......
...@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op): ...@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype)) work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
write_out_ctype = write_w(node.outputs[0].dtype) write_out_ctype = write_w(node.outputs[0].dtype)
load_in_ctype = load_w(node.inputs[0].dtype) load_in_ctype = load_w(node.inputs[0].dtype)
code = """ code = """#include "cluda.h"
KERNEL void k_multi_warp_multinomial( KERNEL void k_multi_warp_multinomial(
const ga_size nb_multi, const ga_size nb_multi,
const ga_size nb_outcomes, const ga_size nb_outcomes,
...@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op): ...@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
replace = int(self.replace) replace = int(self.replace)
code = """ code = """#include "cluda.h"
KERNEL void k_multi_warp_multinomial_wor( KERNEL void k_multi_warp_multinomial_wor(
const ga_size nb_multi, const ga_size nb_multi,
const ga_size nb_outcomes, const ga_size nb_outcomes,
......
...@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kernels = [] kernels = []
kname = "k_multi_warp_less" kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename k_var = "k_multi_warp_less_" + nodename
code = """ code = """#include "cluda.h"
// a version that uses less registers but doesn't work in all cases. // a version that uses less registers but doesn't work in all cases.
%(mode_constants)s %(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
...@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp" kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename k_var = "k_multi_warp_" + nodename
code = """ code = """#include "cluda.h"
%(mode_constants)s %(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_int mode, const ga_int mode,
......
...@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
] ]
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s(const ga_size M, const ga_size N, KERNEL void %(kname)s(const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
...@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
] ]
sio = StringIO() sio = StringIO()
print(""" print("""#include "cluda.h"
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_size N, const ga_size K, const ga_size N, const ga_size K,
GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0, GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
...@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
kernels = [] kernels = []
kname = "kSoftmax" kname = "kSoftmax"
k_var = "kSoftmax_" + nodename k_var = "kSoftmax_" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
...@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared" kname = "kSoftmax_fixed_shared"
k_var = "kSoftmax_fixed_shared" + nodename k_var = "kSoftmax_fixed_shared" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
...@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
kernels = [] kernels = []
kname = "kSoftmaxWithBias" kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename k_var = "kSoftmaxWithBias_" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
...@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared" kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = """ code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N, KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
......
...@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else: else:
raise ValueError('Unsupported data type for output', raise ValueError('Unsupported data type for output',
self.output_type.dtype) self.output_type.dtype)
code = """ code = """#include "cluda.h"
KERNEL void mrg_uniform( KERNEL void mrg_uniform(
GLOBAL_MEM %(otype)s *sample_data, GLOBAL_MEM %(otype)s *sample_data,
ga_size sample_offset, ga_size sample_offset,
......
...@@ -1121,7 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of ...@@ -1121,7 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast" kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
code = """#include <cluda.h> code = """#include "cluda.h"
KERNEL void k_vector_add_fast(const ga_size numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX, const ga_size numColsX,
const ga_ssize stridesX0, const ga_ssize stridesX0,
...@@ -1211,7 +1211,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of ...@@ -1211,7 +1211,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
PyGpuArray_DIMS(py_other)[0], PyGpuArray_DIMS(py_other)[0],
PyGpuArray_DIMS(py_other)[1], PyGpuArray_DIMS(py_other)[1],
PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y, PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
py_other->ga.data, py_other->ga.data,
py_other->ga.offset, py_other->ga.offset,
PyGpuArray_DIMS(indices_arr)[0], PyGpuArray_DIMS(indices_arr)[0],
......
#section kernels #section kernels
#kernel eye : *, size, size, size : #kernel eye : *, size, size, size :
#include <cluda.h>
/* The eye name will be used to generate supporting objects. The only /* The eye name will be used to generate supporting objects. The only
you probably need to care about is the kernel object which will be you probably need to care about is the kernel object which will be
named 'k_' + <the name above> (k_eye in this case). This name also named 'k_' + <the name above> (k_eye in this case). This name also
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论