提交 f40bf432 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3169 from nouiz/GpuAdvancedIncSubtensor1_dev20

[WIP, BUG, CRASH] Fix GpuAdvancedIncSubtensor1_dev20 with negative index
...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()]) return Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (6,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
...@@ -3023,15 +3023,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3023,15 +3023,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int stridesY1, int stridesY1,
float *Y , float *Y ,
long *d_indices_arr, long *d_indices_arr,
int num) int num,
int* err)
{ {
for (int i = (blockIdx.x); i < num; i += gridDim.x) for (int i = (blockIdx.x); i < num; i += gridDim.x)
{ {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{ {
int x_row = d_indices_arr[i]; int x_row = d_indices_arr[i];
if(x_row < 0)
x_row += numRowsX;
int y_row = i; int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); if(x_row < numRowsX && x_row >= 0){
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
*err = 1;
}
} }
} }
return; return;
...@@ -3039,64 +3047,78 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3039,64 +3047,78 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int CudaNdarray_vector_add_fast(CudaNdarray* py_self, int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
CudaNdarray* py_other, PyArrayObject *indices_arr) CudaNdarray* py_other, PyArrayObject *indices_arr)
{ {
const int *shapeX = CudaNdarray_HOST_DIMS(py_self); if(init_err_var()!= 0) return -1;
const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self); const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *strY = CudaNdarray_HOST_STRIDES(py_other); const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
unsigned int size = (unsigned int)PyArray_SIZE(indices_arr); const int *strX = CudaNdarray_HOST_STRIDES(py_self);
if(size == 0){ const int *strY = CudaNdarray_HOST_STRIDES(py_other);
return 0; unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
} if(size == 0){
unsigned int numcolsX = shapeX[1];
unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int num_blocks = std::min(size ,(unsigned int)NUM_VECTOR_OP_BLOCKS);
dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block);
long *d_indices_arr = NULL;
PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
if(!d_indices_arr)
return -1;
cudaError_t err = cudaMemcpy(d_indices_arr,
PyArray_DATA(cpu_indices_arr),
PyArray_NBYTES(cpu_indices_arr),
cudaMemcpyHostToDevice);
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cudaMemcpy returned an error: %%s",
cudaGetErrorString(err));
return -1;
}
k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
shapeX[1],
strX[0],
strX[1],
CudaNdarray_DEV_DATA(py_self),
shapeY[0],
shapeY[1],
strY[0],
strY[1],
CudaNdarray_DEV_DATA(py_other),
d_indices_arr,
PyArray_SIZE(indices_arr)
);
device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr);
err = cudaGetLastError();
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
cudaGetErrorString(err));
return -1;
}
return 0; return 0;
}
unsigned int numcolsX = shapeX[1];
unsigned int num_threads_per_block = std::min(
numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int num_blocks = std::min(
size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block);
long *d_indices_arr = NULL;
PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(
indices_arr);
d_indices_arr = (long*)device_malloc(
PyArray_NBYTES(cpu_indices_arr));
if(!d_indices_arr)
return -1;
cudaError_t err = cudaMemcpy(d_indices_arr,
PyArray_DATA(cpu_indices_arr),
PyArray_NBYTES(cpu_indices_arr),
cudaMemcpyHostToDevice);
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20:"
" cudaMemcpy returned an error: %%s",
cudaGetErrorString(err));
return -1;
}
k_vector_add_fast<<<n_blocks, n_threads>>>(
shapeX[0],
shapeX[1],
strX[0],
strX[1],
CudaNdarray_DEV_DATA(py_self),
shapeY[0],
shapeY[1],
strY[0],
strY[1],
CudaNdarray_DEV_DATA(py_other),
d_indices_arr,
PyArray_SIZE(indices_arr),
err_var
);
int index_err = check_err_var();
device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr);
if(index_err != 0) return -1;
err = cudaGetLastError();
if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
cudaGetErrorString(err));
return -1;
}
return 0;
} }
""" % locals() """ % locals()
......
...@@ -27,22 +27,8 @@ ...@@ -27,22 +27,8 @@
//if you want this to work. //if you want this to work.
#define PRECHECK_ERROR 0 #define PRECHECK_ERROR 0
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
cublasHandle_t handle = NULL; cublasHandle_t handle = NULL;
int* err_var = NULL;
///////////////////////// /////////////////////////
// Alloc and Free // Alloc and Free
...@@ -976,13 +962,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2, ...@@ -976,13 +962,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
} }
} }
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
static int* err_var = NULL;
// We try to be similar to the PyArray_TakeFrom function // We try to be similar to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip mode then raise(clip, wrap) //TODO: support other clip mode then raise(clip, wrap)
...@@ -1163,30 +1142,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1163,30 +1142,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
k3 = k_take_3<CPY>; k3 = k_take_3<CPY>;
// Create the memory place that will store the error information. // Create the memory place that will store the error information.
if (err_var == NULL) { if(init_err_var() != 0) return NULL;
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
cudaError_t err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
// Currently this returns the same thing as err, but if in future
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
}
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
if(CudaNdarray_HOST_DIMS(out)[0] == 0){ if(CudaNdarray_HOST_DIMS(out)[0] == 0){
...@@ -1298,47 +1254,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1298,47 +1254,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;
} }
//-10 could be any value different then 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS int index_err = check_err_var();
// As we execute cudaMemcpy on the default stream, it waits for all Py_DECREF(indices);
// kernels (on all streams) to be finished before starting to copy if (index_err != 0) {
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s: %s when trying to get the error value.\n",
"CudaNdarray_TakeFrom",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"CudaNdarray_TakeFrom: One of the index value is out of bound. Error code: %i.\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError, "Error setting device error code to 0 after having an index error. %s", cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
Py_DECREF(indices);
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;
} }
Py_DECREF(indices);
if (verbose) printf("TAKE SUCCEDED\n"); if (verbose) printf("TAKE SUCCEDED\n");
return (PyObject *)out; return (PyObject *)out;
} }
......
...@@ -76,6 +76,21 @@ typedef float real; ...@@ -76,6 +76,21 @@ typedef float real;
#define CNDA_THREAD_SYNC cudaThreadSynchronize(); #define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif #endif
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
#ifndef SHARED_SIZE #ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024) #define SHARED_SIZE (16*1024)
...@@ -101,6 +116,76 @@ DllExport void * device_malloc(size_t size, int verbose); ...@@ -101,6 +116,76 @@ DllExport void * device_malloc(size_t size, int verbose);
DllExport int device_free(void * ptr); DllExport int device_free(void * ptr);
DllExport void *get_work_mem(size_t sz); DllExport void *get_work_mem(size_t sz);
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom and in an op
// to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
extern DllExport int* err_var;
static inline int init_err_var(){
if (err_var == NULL) {
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
return -1;
}
}
return 0;
}
static inline int check_err_var(){
//-10 could be any value different then 0.
int cpu_err_var=-10;
cudaError_t err;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits
// for all kernels (on all streams) to be finished before
// starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s when trying to get the error"
" value.\\n",
cudaGetErrorString(err));
return -1;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"One of the index value is out of bound. Error code: %i.\\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError,
"Error setting device error code to 0 after having"
" an index error. %s", cudaGetErrorString(err));
return -1;
}
return -1;
}
return 0;
}
template <typename T> template <typename T>
static T ceil_intdiv(T a, T b) static T ceil_intdiv(T a, T b)
{ {
......
...@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node):
return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)] return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \ if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
node.inputs[0].dtype == "float32": node.inputs[0].dtype == "float32" and
node.inputs[1].dtype == "float32"):
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
coords = node.inputs[2:] coords = node.inputs[2:]
go_gpu = False go_gpu = False
......
...@@ -489,7 +489,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -489,7 +489,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
...@@ -587,6 +587,8 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) { ...@@ -587,6 +587,8 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{ {
int x_row = indices_arr[i * stridesIndices]; int x_row = indices_arr[i * stridesIndices];
if(x_row < 0)
x_row += numRowsX;
int y_row = i; int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
} }
......
...@@ -437,7 +437,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -437,7 +437,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
def test_ok_list(self): def test_ok_list(self):
for data, idx in [(rand(4), [1, 0]), for data, idx in [(rand(4), [1, 0]),
(rand(4, 5), [2, 3]), (rand(4, 5), [2, 3, -1]),
(rand(4, 2, 3), [0, 3]), (rand(4, 2, 3), [0, 3]),
(rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0]), (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0]),
(rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0,
...@@ -479,6 +479,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -479,6 +479,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
out2 = test_out[0][0] out2 = test_out[0][0]
assert out1 is out2 assert out1 is out2
# test the grad
gn = theano.grad(t.sum(), n)
g = self.function([], gn, op=self.adv_incsub1)
utt.verify_grad(lambda m: m[[1, 3]],
[numpy.random.rand(5, 5).astype(self.dtype)])
g_0 = g()
utt.verify_grad(lambda m: m[idx],
[data])
def test_err_invalid_list(self): def test_err_invalid_list(self):
n = self.shared(numpy.asarray(5, dtype=self.dtype)) n = self.shared(numpy.asarray(5, dtype=self.dtype))
self.assertRaises(TypeError, n.__getitem__, [0, 0]) self.assertRaises(TypeError, n.__getitem__, [0, 0])
...@@ -495,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -495,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1)) self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
f = self.function([l], t, op=self.adv_sub1) f = self.function([l], t, op=self.adv_sub1)
topo = f.maker.fgraph.toposort()
topo_ = [node for node in topo if not isinstance(node.op, # the grad
self.ignore_topo)] g = self.function([l],
assert len(topo_) == 1 inc_subtensor(t, numpy.asarray([[1.]], self.dtype)),
self.assertTrue(isinstance(topo_[0].op, self.adv_sub1)) op=self.adv_incsub1)
for shp in [[0, 4], [0, -3], [-10]]: for shp in [[0, 4], [0, -3], [-10]]:
self.assertRaises(IndexError, f, shp) self.assertRaises(IndexError, f, shp)
self.assertRaises(IndexError, g, shp)
def test_adv_sub1_broadcast(self): def test_adv_sub1_broadcast(self):
ones = numpy.ones((1, 3), dtype=self.dtype) ones = numpy.ones((1, 3), dtype=self.dtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论