提交 d6886bcc authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron 提交者: Frederic Bastien

Remove atomics definitions in advincsub_dev20.

上级 ba114bd6
...@@ -22,7 +22,7 @@ def load_w(dtype): ...@@ -22,7 +22,7 @@ def load_w(dtype):
""" """
if dtype == 'float16': if dtype == 'float16':
return '__half2float' return 'ga_half2float'
else: else:
return '' return ''
...@@ -37,6 +37,6 @@ def write_w(dtype): ...@@ -37,6 +37,6 @@ def write_w(dtype):
""" """
if dtype == 'float16': if dtype == 'float16':
return '__float2half_rn' return 'ga_float2half'
else: else:
return '' return ''
...@@ -1037,8 +1037,7 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -1037,8 +1037,7 @@ class GpuAdvancedIncSubtensor1(Op):
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
GpuAdvancedIncSubtensor1): GpuAdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu, but use function Implement AdvancedIncSubtensor1 on the gpu with atomics
only avail on compute capability 2.0 and more recent.
""" """
_f16_ok = True _f16_ok = True
...@@ -1090,11 +1089,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -1090,11 +1089,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
ctx = self.get_params(node).context ctx = self.get_params(node).context
if ctx.kind != b'cuda':
raise NotImplementedError("cuda only")
if (node.inputs[0].ndim != node.inputs[1].ndim or if (node.inputs[0].ndim != node.inputs[1].ndim or
node.inputs[0].ndim != 2 or node.inputs[0].ndim != 2):
int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
return """ return """
...@@ -1125,85 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of ...@@ -1125,85 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast" kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
code = """ code = """#include <cluda.h>
/*
* This is an atomicAdd that works for doubles since that is not provided
* natively by cuda before arch 6.0.
*/
#if __CUDA_ARCH__ < 600
__device__ ga_double atomicAdd(ga_double* address, ga_double val) {
ga_ulong *address_as_ull = (ga_ulong *)address;
ga_ulong old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
__device__ ga_double atomicExch(ga_double *address, ga_double val) {
return atomicExch((ga_ulong *)address,
__double_as_longlong(val));
}
/* GA_LONG */
__device__ ga_long atomicAdd(ga_long* address, ga_long val) {
ga_ulong *address_as_ull = (ga_ulong *)address;
ga_ulong old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
(ga_ulong)(val + (ga_long)assumed));
} while (assumed != old);
return (ga_long)old;
}
__device__ ga_long atomicExch(ga_long *address, ga_long val) {
return (ga_long)atomicExch((ga_ulong *)address, (ga_ulong)val);
}
/* GA_HALF */
/*
* This may read and write 2 bytes more than the size of the array
* if the array has an uneven number of elements. The actual value
* at that spot will not be modified.
*/
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, new_;
old = *base;
do {
assumed = old;
new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
KERNEL void k_vector_add_fast(const ga_size numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX, const ga_size numColsX,
const ga_ssize stridesX0, const ga_ssize stridesX0,
...@@ -1236,10 +1154,10 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1236,10 +1154,10 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
ga_ssize y_row = i; ga_ssize y_row = i;
if (x_row < numRowsX && x_row >= 0) { if (x_row < numRowsX && x_row >= 0) {
if (set_instead_of_inc) { if (set_instead_of_inc) {
atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)], atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]); Y[(y_row * stridesY0) + (j * stridesY1)]);
} else { } else {
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]); Y[(y_row * stridesY0) + (j * stridesY1)]);
} }
} else { } else {
...@@ -1249,7 +1167,8 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1249,7 +1167,8 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
} }
return; return;
} }
""" % dict(type_x=type_x, type_y=type_y, type_ind=type_ind) """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
tc=numpy.dtype(dtype_x).char)
params = [ params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
...@@ -1265,15 +1184,15 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1265,15 +1184,15 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
PyGpuArrayObject* indices_arr, PyGpuArrayObject* indices_arr,
const int set_instead_of_inc) const int set_instead_of_inc)
{ {
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1}; size_t threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256);
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1}; size_t n_blocks = std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096);
gpudata *errbuf; gpudata *errbuf;
int err, kerr = 0; int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga); size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga); size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga); size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) { if (threads_per_block > 0 && n_blocks > 0) {
err = gpudata_property(py_self->ga.data, err = gpudata_property(py_self->ga.data,
GA_CTX_PROP_ERRBUF, &errbuf); GA_CTX_PROP_ERRBUF, &errbuf);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -1281,30 +1200,27 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1281,30 +1200,27 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1; return 1;
} }
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x; err = k_vector_add_fast_call(
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x; 1, &n_blocks, &threads_per_block, 0,
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y; PyGpuArray_DIMS(py_self)[0],
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y; PyGpuArray_DIMS(py_self)[1],
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind; PyGpuArray_STRIDES(py_self)[0] / itemsize_x,
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0], PyGpuArray_STRIDES(py_self)[1] / itemsize_x,
(void *)&PyGpuArray_DIMS(py_self)[1], py_self->ga.data,
(void *)&stride_X0, py_self->ga.offset,
(void *)&stride_X1, PyGpuArray_DIMS(py_other)[0],
(void *)py_self->ga.data, PyGpuArray_DIMS(py_other)[1],
(void *)&py_self->ga.offset, PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
(void *)&PyGpuArray_DIMS(py_other)[0], PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y
(void *)&PyGpuArray_DIMS(py_other)[1], py_other->ga.data,
(void *)&stride_Y0, py_other->ga.offset,
(void *)&stride_Y1, PyGpuArray_DIMS(indices_arr)[0],
(void *)py_other->ga.data, PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind,
(void *)&py_other->ga.offset, indices_arr->ga.data,
(void *)&PyGpuArray_DIMS(indices_arr)[0], indices_arr->ga.offset,
(void *)&stride_ind, set_instead_of_inc,
(void *)indices_arr->ga.data, errbuf);
(void *)&indices_arr->ga.offset,
(void *)&set_instead_of_inc,
(void *)errbuf};
err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.", "gpuarray error: %(k_var)s: %%s.",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论