提交 ebc0b40d authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix GpuAdvancedIncSubtensor1_dev20 c_code so that it raises IndexError and works correctly.

上级 85c63450
from __future__ import print_function from __future__ import print_function
import copy import copy
import os
import numpy import numpy
import os import os
...@@ -18,7 +21,6 @@ except ImportError: ...@@ -18,7 +21,6 @@ except ImportError:
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel) from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .comp import NVCC_compiler
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
...@@ -605,20 +607,22 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -605,20 +607,22 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<numpy_compat.h>', '<gpuarray/ext_cuda.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>'] '<gpuarray/types.h>']
def c_header_dirs(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root cuda_root = config.cuda.root
res = [os.path.dirname(__file__)]
if cuda_root: if cuda_root:
return [os.path.join(cuda_root, 'include')] res.append(os.path.join(cuda_root, 'include'))
return res
def c_init_code(self): def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
...@@ -642,19 +646,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -642,19 +646,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
fail = sub['fail'] fail = sub['fail']
inplace = int(self.inplace) inplace = int(self.inplace)
return """ return """
Py_XDECREF(%(out)s); int err;
if (!%(inplace)s) { if (%(inplace)s) {
%(out)s = (PyGpuArrayObject*)pygpu_copy(%(x)s, GA_C_ORDER); Py_XDECREF(%(out)s);
} else { %(out)s = %(x)s;
%(out)s = %(x)s; Py_INCREF(%(out)s);
Py_XINCREF(%(out)s); } else {
} %(out)s = theano_try_copy(%(out)s, %(x)s);
}
GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s); if (!%(out)s) {
%(fail)s
if (!%(out)s) { }
%(fail)s if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s)) {
} %(fail)s
}
""" % locals() """ % locals()
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
...@@ -730,7 +735,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -730,7 +735,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
const ga_size numIndices, const ga_size numIndices,
const ga_ssize stridesIndices, const ga_ssize stridesIndices,
%(type_ind)s *indices_arr, %(type_ind)s *indices_arr,
const ga_size offset_indices_arr) const ga_size offset_indices_arr,
ga_int *err)
{ {
X = (%(type_x)s *)(((char *)X)+offset_X); X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y); Y = (%(type_y)s *)(((char *)Y)+offset_Y);
...@@ -739,11 +745,15 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -739,11 +745,15 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
{ {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{ {
int x_row = indices_arr[i * stridesIndices]; ssize_t x_row = indices_arr[i * stridesIndices];
if(x_row < 0) if (x_row < 0)
x_row += numRowsX; x_row += numRowsX;
int y_row = i; ssize_t y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); if (x_row < numRowsX && x_row >= 0) {
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
*err = 1;
}
} }
} }
return; return;
...@@ -752,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -752,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
params = [ params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp' 'uintp', 'intp', gpuarray.GpuArray, 'uintp', gpuarray.GpuArray
] ]
return [Kernel(code=code, name=kname, params=params, return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
...@@ -767,52 +777,64 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -767,52 +777,64 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
itemsize_ind = numpy.dtype(dtype_ind).itemsize itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """ return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
void GpuArray_vector_add_fast(PyGpuArrayObject* py_self, int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr) PyGpuArrayObject *indices_arr)
{ {
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1}; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf;
int err, kerr = 0;
if (threads_per_block[0] > 0 && n_blocks[0] > 0) { if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s; err = py_self->ga.ops->property(NULL, py_self->ga.data, NULL,
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s; GA_CTX_PROP_ERRBUF, &errbuf);
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s; if (err != GA_NO_ERROR) {
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s; PyErr_SetString(PyExc_RuntimeError, "Can't fetch error buffer");
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s; return 1;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0], }
(void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0, ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
(void *)&stride_X1, ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
(void *)py_self->ga.data, ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
(void *)&py_self->ga.offset, ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
(void *)&PyGpuArray_DIMS(py_other)[0], ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
(void *)&PyGpuArray_DIMS(py_other)[1], void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&stride_Y0, (void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_Y1, (void *)&stride_X0,
(void *)py_other->ga.data, (void *)&stride_X1,
(void *)&py_other->ga.offset, (void *)py_self->ga.data,
(void *)&PyGpuArray_DIMS(indices_arr)[0], (void *)&py_self->ga.offset,
(void *)&stride_ind, (void *)&PyGpuArray_DIMS(py_other)[0],
(void *)indices_arr->ga.data, (void *)&PyGpuArray_DIMS(py_other)[1],
(void *)&indices_arr->ga.offset}; (void *)&stride_Y0,
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); (void *)&stride_Y1,
%(err_check)s (void *)py_other->ga.data,
%(sync)s (void *)&py_other->ga.offset,
(void *)&PyGpuArray_DIMS(indices_arr)[0],
(void *)&stride_ind,
(void *)indices_arr->ga.data,
(void *)&indices_arr->ga.offset,
(void *)errbuf};
err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
return 1;
}
err = py_self->ga.ops->buffer_read(&kerr, errbuf, 0, sizeof(int));
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Can't read error buffer");
return 1;
}
if (kerr != 0) {
PyErr_SetString(PyExc_IndexError, "Index out of bounds");
return 1;
}
} }
return 0;
} }
""" % locals() """ % locals()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论