提交 f40bf432 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3169 from nouiz/GpuAdvancedIncSubtensor1_dev20

[WIP, BUG, CRASH] Fix GpuAdvancedIncSubtensor1_dev20 with negative index
......@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self):
return (3,)
return (6,)
def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number()
......@@ -3023,15 +3023,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int stridesY1,
float *Y ,
long *d_indices_arr,
int num)
int num,
int* err)
{
for (int i = (blockIdx.x); i < num; i += gridDim.x)
{
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{
int x_row = d_indices_arr[i];
if(x_row < 0)
x_row += numRowsX;
int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
if(x_row < numRowsX && x_row >= 0){
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
*err = 1;
}
}
}
return;
......@@ -3039,64 +3047,78 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
CudaNdarray* py_other, PyArrayObject *indices_arr)
{
const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self);
const int *strY = CudaNdarray_HOST_STRIDES(py_other);
unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
if(size == 0){
return 0;
}
unsigned int numcolsX = shapeX[1];
unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int num_blocks = std::min(size ,(unsigned int)NUM_VECTOR_OP_BLOCKS);
dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block);
long *d_indices_arr = NULL;
PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
if(!d_indices_arr)
return -1;
cudaError_t err = cudaMemcpy(d_indices_arr,
PyArray_DATA(cpu_indices_arr),
PyArray_NBYTES(cpu_indices_arr),
cudaMemcpyHostToDevice);
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cudaMemcpy returned an error: %%s",
cudaGetErrorString(err));
return -1;
}
k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
shapeX[1],
strX[0],
strX[1],
CudaNdarray_DEV_DATA(py_self),
shapeY[0],
shapeY[1],
strY[0],
strY[1],
CudaNdarray_DEV_DATA(py_other),
d_indices_arr,
PyArray_SIZE(indices_arr)
);
device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr);
err = cudaGetLastError();
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
cudaGetErrorString(err));
return -1;
}
{
if(init_err_var()!= 0) return -1;
const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self);
const int *strY = CudaNdarray_HOST_STRIDES(py_other);
unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
if(size == 0){
return 0;
}
unsigned int numcolsX = shapeX[1];
unsigned int num_threads_per_block = std::min(
numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int num_blocks = std::min(
size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block);
long *d_indices_arr = NULL;
PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(
indices_arr);
d_indices_arr = (long*)device_malloc(
PyArray_NBYTES(cpu_indices_arr));
if(!d_indices_arr)
return -1;
cudaError_t err = cudaMemcpy(d_indices_arr,
PyArray_DATA(cpu_indices_arr),
PyArray_NBYTES(cpu_indices_arr),
cudaMemcpyHostToDevice);
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20:"
" cudaMemcpy returned an error: %%s",
cudaGetErrorString(err));
return -1;
}
k_vector_add_fast<<<n_blocks, n_threads>>>(
shapeX[0],
shapeX[1],
strX[0],
strX[1],
CudaNdarray_DEV_DATA(py_self),
shapeY[0],
shapeY[1],
strY[0],
strY[1],
CudaNdarray_DEV_DATA(py_other),
d_indices_arr,
PyArray_SIZE(indices_arr),
err_var
);
int index_err = check_err_var();
device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr);
if(index_err != 0) return -1;
err = cudaGetLastError();
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
cudaGetErrorString(err));
return -1;
}
return 0;
}
""" % locals()
......
......@@ -27,22 +27,8 @@
//if you want this to work.
#define PRECHECK_ERROR 0
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
cublasHandle_t handle = NULL;
int* err_var = NULL;
/////////////////////////
// Alloc and Free
......@@ -976,13 +962,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
}
}
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
static int* err_var = NULL;
// We try to be similar to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip mode then raise(clip, wrap)
......@@ -1163,30 +1142,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
k3 = k_take_3<CPY>;
// Create the memory place that will store the error information.
if (err_var == NULL) {
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
cudaError_t err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
// Currently this returns the same thing as err, but if in future
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
}
if(init_err_var() != 0) return NULL;
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
if(CudaNdarray_HOST_DIMS(out)[0] == 0){
......@@ -1298,47 +1254,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out);
return NULL;
}
//-10 could be any value different then 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits for all
// kernels (on all streams) to be finished before starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s: %s when trying to get the error value.\n",
"CudaNdarray_TakeFrom",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"CudaNdarray_TakeFrom: One of the index value is out of bound. Error code: %i.\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError, "Error setting device error code to 0 after having an index error. %s", cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
Py_DECREF(indices);
int index_err = check_err_var();
Py_DECREF(indices);
if (index_err != 0) {
Py_DECREF(out);
return NULL;
}
Py_DECREF(indices);
if (verbose) printf("TAKE SUCCEDED\n");
return (PyObject *)out;
}
......
......@@ -76,6 +76,21 @@ typedef float real;
#define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
#ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024)
......@@ -101,6 +116,76 @@ DllExport void * device_malloc(size_t size, int verbose);
DllExport int device_free(void * ptr);
DllExport void *get_work_mem(size_t sz);
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom and in an op
// to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
extern DllExport int* err_var;
static inline int init_err_var(){
if (err_var == NULL) {
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
return -1;
}
}
return 0;
}
static inline int check_err_var(){
//-10 could be any value different then 0.
int cpu_err_var=-10;
cudaError_t err;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits
// for all kernels (on all streams) to be finished before
// starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s when trying to get the error"
" value.\\n",
cudaGetErrorString(err));
return -1;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"One of the index value is out of bound. Error code: %i.\\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError,
"Error setting device error code to 0 after having"
" an index error. %s", cudaGetErrorString(err));
return -1;
}
return -1;
}
return 0;
}
template <typename T>
static T ceil_intdiv(T a, T b)
{
......
......@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node):
return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \
node.inputs[0].dtype == "float32":
if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
node.inputs[0].dtype == "float32" and
node.inputs[1].dtype == "float32"):
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
......
......@@ -489,7 +489,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self):
return (3,)
return (4,)
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
......@@ -587,6 +587,8 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{
int x_row = indices_arr[i * stridesIndices];
if(x_row < 0)
x_row += numRowsX;
int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
}
......
......@@ -437,7 +437,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
def test_ok_list(self):
for data, idx in [(rand(4), [1, 0]),
(rand(4, 5), [2, 3]),
(rand(4, 5), [2, 3, -1]),
(rand(4, 2, 3), [0, 3]),
(rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0]),
(rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0,
......@@ -479,6 +479,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
out2 = test_out[0][0]
assert out1 is out2
# test the grad
gn = theano.grad(t.sum(), n)
g = self.function([], gn, op=self.adv_incsub1)
utt.verify_grad(lambda m: m[[1, 3]],
[numpy.random.rand(5, 5).astype(self.dtype)])
g_0 = g()
utt.verify_grad(lambda m: m[idx],
[data])
def test_err_invalid_list(self):
n = self.shared(numpy.asarray(5, dtype=self.dtype))
self.assertRaises(TypeError, n.__getitem__, [0, 0])
......@@ -495,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
f = self.function([l], t, op=self.adv_sub1)
topo = f.maker.fgraph.toposort()
topo_ = [node for node in topo if not isinstance(node.op,
self.ignore_topo)]
assert len(topo_) == 1
self.assertTrue(isinstance(topo_[0].op, self.adv_sub1))
# the grad
g = self.function([l],
inc_subtensor(t, numpy.asarray([[1.]], self.dtype)),
op=self.adv_incsub1)
for shp in [[0, 4], [0, -3], [-10]]:
self.assertRaises(IndexError, f, shp)
self.assertRaises(IndexError, g, shp)
def test_adv_sub1_broadcast(self):
ones = numpy.ones((1, 3), dtype=self.dtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论