提交 b80a7d12 · 作者: Arnaud Bergeron · 提交者: Frederic Bastien

Add cluda include to all kernels.

上级 28ffda75
......@@ -158,7 +158,7 @@ class Kernel(object):
the `params` list consists of C typecodes
It can also have the key `cflags` which is a string of C flag
values like this `"GA_USE_DOUBLE|GA_USE_CLUDA"`.
values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.
Parameters
----------
......@@ -216,7 +216,7 @@ class Kernel(object):
else:
raise TypeError("can't get a dtype from %s" % (type(t),))
dtypes = [get_dtype(t) for t in types]
flags = dict(cluda=True)
flags = dict()
if any(d == np.float64 for d in dtypes):
flags['have_double'] = True
if any(d.itemsize < 4 for d in dtypes):
......@@ -231,8 +231,6 @@ class Kernel(object):
res = []
if self.flags.get('cflags', '') != '':
res.append(self.flags['cflags'])
if self.flags.get('cluda', False):
res.append('GA_USE_CLUDA')
if self.flags.get('have_double', False):
res.append('GA_USE_DOUBLE')
if self.flags.get('have_small', False):
......@@ -241,15 +239,16 @@ class Kernel(object):
res.append('GA_USE_COMPLEX')
if self.flags.get('have_half', False):
res.append('GA_USE_HALF')
return '|'.join(res)
res = '|'.join(res)
if not res:
return '0'
return res
def _get_py_flags(self):
res = dict(self.flags)
cflags = res.pop('cflags', '')
for fl in cflags.split('|'):
fl = fl.strip()
if fl == 'GA_USE_CLUDA':
res['cluda'] = True
if fl == 'GA_USE_DOUBLE':
res['have_double'] = True
if fl == 'GA_USE_SMALL':
......@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
kflags = splt2[2].strip()
kcode = def_macros + '\n' + kcode + '\n' + undef_macros
res.append(Kernel(kcode, ktypes, kname,
flags=dict(cluda=True, cflags=kflags)))
flags=dict(cflags=kflags)))
n += 2
self._cached_kernels = res
return res
......@@ -1619,7 +1618,8 @@ class GpuEye(GpuKernelBase, Op):
for i in xrange(3)]
def gpu_kernels(self, node, name):
code = """
code = """#include "cluda.h"
KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
ga_size n, ga_size m, ga_ssize k) {
a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
......
#section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// TODO check kernel flags
#include "cluda.h"
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
......@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
}
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void im3d2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
......@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,
// GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void dilated_col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size offset_col,
......@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
}
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
......
#section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
......@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
}
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
......@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,
// GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void dilated_col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels,
......@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
}
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
......
#section kernels
#kernel tril_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
......@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
}
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
......
#section kernels
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
......
#section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_kernel(const ga_size nthreads,
......@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
}
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_kernel(const ga_size nthreads,
......@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
}
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_kernel(const ga_size nthreads,
......@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
}
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_kernel(const ga_size nthreads,
......
#section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
......@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
}
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
......
#section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height,
......@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
}
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
......
#section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
......@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
}
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
......
#section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
......@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
......
......@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
......@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
......@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1,
const %(in_type)s *A, const ga_size offset_A,
......@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True)
sio = StringIO()
print("""
print("""#include "cluda.h"
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
......@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
......@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const %(in_type)s *X, const ga_size offset_X,
......@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
print("""#include "cluda.h"
%(decl)s
{
%(init)s
......@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
......@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
print("""#include "cluda.h"
%(decl)s
{
%(init)s
......@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
print("""#include "cluda.h"
%(decl)s
{
%(init)s
......@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
......@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO()
print("""
print("""#include "cluda.h"
%(decl)s
{
%(init)s
......@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO()
print("""
print("""#include "cluda.h"
%(decl)s
{
%(init)s
......@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
print("""#include "cluda.h"
%(decl)s
{
%(init)s
......@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname = "kernel_reduce_1011"
k_var = "kernel_reduce_1011_" + nodename
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const %(in_type)s *A, const ga_size offset_A,
......@@ -2502,8 +2516,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
# Some OpenCL compilers do not accept no-arguments empty kernels
src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
params = ['float32']
else:
k = self.get_kernel_cache(node)
......
......@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = """
code = """#include "cluda.h"
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
......@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """
code = """#include "cluda.h"
// helper functions
WITHIN_KERNEL
void k_reductionPhase(float* partialCumOp) {
......@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
# k_finalCumOp
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = """
code = """#include "cluda.h"
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
......
......@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
else:
yield b
bodystr = ';\n'.join(flatbody())
return """KERNEL void %(name)s (%(paramstr)s)
return """#include "cluda.h"
KERNEL void %(name)s (%(paramstr)s)
{
%(bodystr)s;
}
......
......@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
write_out_ctype = write_w(node.outputs[0].dtype)
load_in_ctype = load_w(node.inputs[0].dtype)
code = """
code = """#include "cluda.h"
KERNEL void k_multi_warp_multinomial(
const ga_size nb_multi,
const ga_size nb_outcomes,
......@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
def gpu_kernels(self, node, name):
replace = int(self.replace)
code = """
code = """#include "cluda.h"
KERNEL void k_multi_warp_multinomial_wor(
const ga_size nb_multi,
const ga_size nb_outcomes,
......
......@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
code = """#include "cluda.h"
// a version that uses less registers but doesn't work in all cases.
%(mode_constants)s
KERNEL void %(kname)s(
......@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
code = """#include "cluda.h"
%(mode_constants)s
KERNEL void %(kname)s(
const ga_int mode,
......
......@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
]
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
......@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
]
sio = StringIO()
print("""
print("""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
......@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
kernels = []
kname = "kSoftmax"
k_var = "kSoftmax_" + nodename
code = """
code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
......@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared"
k_var = "kSoftmax_fixed_shared" + nodename
code = """
code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
......@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = """
code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
......@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = """
code = """#include "cluda.h"
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
......
......@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else:
raise ValueError('Unsupported data type for output',
self.output_type.dtype)
code = """
code = """#include "cluda.h"
KERNEL void mrg_uniform(
GLOBAL_MEM %(otype)s *sample_data,
ga_size sample_offset,
......
......@@ -1121,7 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """#include <cluda.h>
code = """#include "cluda.h"
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_ssize stridesX0,
......@@ -1211,7 +1211,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
PyGpuArray_DIMS(py_other)[0],
PyGpuArray_DIMS(py_other)[1],
PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y
PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
py_other->ga.data,
py_other->ga.offset,
PyGpuArray_DIMS(indices_arr)[0],
......
#section kernels
#kernel eye : *, size, size, size :
#include <cluda.h>
/* The eye name will be used to generate supporting objects. The only
one you probably need to care about is the kernel object which will be
named 'k_' + <the name above> (k_eye in this case). This name also
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论