提交 574e7965 authored 作者: Frederic's avatar Frederic

Make GpuAdvancedIncSubtensor1_dev20 return user index error

上级 ebbaae5b
...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()]) return Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
...@@ -3023,7 +3023,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3023,7 +3023,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int stridesY1, int stridesY1,
float *Y , float *Y ,
long *d_indices_arr, long *d_indices_arr,
int num) int num,
int* err)
{ {
for (int i = (blockIdx.x); i < num; i += gridDim.x) for (int i = (blockIdx.x); i < num; i += gridDim.x)
{ {
...@@ -3033,7 +3034,12 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3033,7 +3034,12 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
if(x_row < 0) if(x_row < 0)
x_row += numRowsX; x_row += numRowsX;
int y_row = i; int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); if(x_row < numRowsX && x_row >= 0){
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
*err = 1;
}
} }
} }
return; return;
...@@ -3042,6 +3048,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3042,6 +3048,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int CudaNdarray_vector_add_fast(CudaNdarray* py_self, int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
CudaNdarray* py_other, PyArrayObject *indices_arr) CudaNdarray* py_other, PyArrayObject *indices_arr)
{ {
if (err_var == NULL) {
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %%s",
cudaGetErrorString(err));
return -1;
}
}
const int *shapeX = CudaNdarray_HOST_DIMS(py_self); const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other); const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self); const int *strX = CudaNdarray_HOST_STRIDES(py_self);
...@@ -3086,8 +3109,47 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3086,8 +3109,47 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
strY[1], strY[1],
CudaNdarray_DEV_DATA(py_other), CudaNdarray_DEV_DATA(py_other),
d_indices_arr, d_indices_arr,
PyArray_SIZE(indices_arr) PyArray_SIZE(indices_arr),
err_var
); );
//-10 could be any value different then 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits
// for all kernels (on all streams) to be finished before
// starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %%s: %%s when trying to get the error"
" value.\\n",
"GpuAdvancedIncSubtensor1_dev20",
cudaGetErrorString(err));
return NULL;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"GpuAdvancedIncSubtensor1_dev20: One of the index"
" value is out of bound. Error code: %%i.\\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError,
"Error setting device error code to 0 after having"
" an index error. %%s", cudaGetErrorString(err));
return -1;
}
return -1;
}
device_free(d_indices_arr); device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr); Py_XDECREF(cpu_indices_arr);
err = cudaGetLastError(); err = cudaGetLastError();
......
...@@ -27,21 +27,6 @@ ...@@ -27,21 +27,6 @@
//if you want this to work. //if you want this to work.
#define PRECHECK_ERROR 0 #define PRECHECK_ERROR 0
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
cublasHandle_t handle = NULL; cublasHandle_t handle = NULL;
///////////////////////// /////////////////////////
...@@ -976,13 +961,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2, ...@@ -976,13 +961,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
} }
} }
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
static int* err_var = NULL;
// We try to be similar to the PyArray_TakeFrom function // We try to be similar to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip mode then raise(clip, wrap) //TODO: support other clip mode then raise(clip, wrap)
......
...@@ -76,6 +76,21 @@ typedef float real; ...@@ -76,6 +76,21 @@ typedef float real;
#define CNDA_THREAD_SYNC cudaThreadSynchronize(); #define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif #endif
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
#ifndef SHARED_SIZE #ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024) #define SHARED_SIZE (16*1024)
...@@ -101,6 +116,14 @@ DllExport void * device_malloc(size_t size, int verbose); ...@@ -101,6 +116,14 @@ DllExport void * device_malloc(size_t size, int verbose);
DllExport int device_free(void * ptr); DllExport int device_free(void * ptr);
DllExport void *get_work_mem(size_t sz); DllExport void *get_work_mem(size_t sz);
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom and in an op
// to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
static int* err_var = NULL;
template <typename T> template <typename T>
static T ceil_intdiv(T a, T b) static T ceil_intdiv(T a, T b)
{ {
......
...@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node):
return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)] return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \ if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
node.inputs[0].dtype == "float32": node.inputs[0].dtype == "float32" and
node.inputs[1].dtype == "float32"):
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
coords = node.inputs[2:] coords = node.inputs[2:]
go_gpu = False go_gpu = False
......
...@@ -482,7 +482,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -482,7 +482,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
# test the grad # test the grad
gn = theano.grad(t.sum(), n) gn = theano.grad(t.sum(), n)
g = self.function([], gn, op=self.adv_incsub1) g = self.function([], gn, op=self.adv_incsub1)
theano.printing.debugprint(g)
utt.verify_grad(lambda m: m[[1, 3]], utt.verify_grad(lambda m: m[[1, 3]],
[numpy.random.rand(5, 5).astype(self.dtype)]) [numpy.random.rand(5, 5).astype(self.dtype)])
g_0 = g() g_0 = g()
...@@ -505,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -505,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1)) self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
f = self.function([l], t, op=self.adv_sub1) f = self.function([l], t, op=self.adv_sub1)
topo = f.maker.fgraph.toposort()
topo_ = [node for node in topo if not isinstance(node.op, # the grad
self.ignore_topo)] g = self.function([l],
assert len(topo_) == 1 inc_subtensor(t, numpy.asarray([[1.]], self.dtype)),
self.assertTrue(isinstance(topo_[0].op, self.adv_sub1)) op=self.adv_incsub1)
for shp in [[0, 4], [0, -3], [-10]]: for shp in [[0, 4], [0, -3], [-10]]:
self.assertRaises(IndexError, f, shp) self.assertRaises(IndexError, f, shp)
self.assertRaises(IndexError, g, shp)
def test_adv_sub1_broadcast(self): def test_adv_sub1_broadcast(self):
ones = numpy.ones((1, 3), dtype=self.dtype) ones = numpy.ones((1, 3), dtype=self.dtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论