提交 574e7965 authored 作者: Frederic's avatar Frederic

Make GpuAdvancedIncSubtensor1_dev20 return user index error

上级 ebbaae5b
......@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self):
return (4,)
return (5,)
def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number()
......@@ -3023,7 +3023,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int stridesY1,
float *Y ,
long *d_indices_arr,
int num)
int num,
int* err)
{
for (int i = (blockIdx.x); i < num; i += gridDim.x)
{
......@@ -3033,7 +3034,12 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
if(x_row < 0)
x_row += numRowsX;
int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
if(x_row < numRowsX && x_row >= 0){
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
*err = 1;
}
}
}
return;
......@@ -3041,7 +3047,24 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
CudaNdarray* py_other, PyArrayObject *indices_arr)
{
{
if (err_var == NULL) {
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %%s",
cudaGetErrorString(err));
return -1;
}
}
const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self);
......@@ -3086,8 +3109,47 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
strY[1],
CudaNdarray_DEV_DATA(py_other),
d_indices_arr,
PyArray_SIZE(indices_arr)
PyArray_SIZE(indices_arr),
err_var
);
//-10 could be any value different from 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits
// for all kernels (on all streams) to be finished before
// starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %%s: %%s when trying to get the error"
" value.\\n",
"GpuAdvancedIncSubtensor1_dev20",
cudaGetErrorString(err));
return NULL;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"GpuAdvancedIncSubtensor1_dev20: One of the index"
" value is out of bound. Error code: %%i.\\n",
cpu_err_var);
// Must reset it to 0 so we don't have to reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError,
"Error setting device error code to 0 after having"
" an index error. %%s", cudaGetErrorString(err));
return -1;
}
return -1;
}
device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr);
err = cudaGetLastError();
......
......@@ -27,21 +27,6 @@
//if you want this to work.
#define PRECHECK_ERROR 0
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is negligible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
cublasHandle_t handle = NULL;
/////////////////////////
......@@ -976,13 +961,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
}
}
// Pointer to 1 int on the device
// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevents us from having to set it to 0 before each use
static int* err_var = NULL;
// We try to be similar to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip modes than raise (clip, wrap)
......
......@@ -76,6 +76,21 @@ typedef float real;
#define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is negligible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
#ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024)
......@@ -101,6 +116,14 @@ DllExport void * device_malloc(size_t size, int verbose);
DllExport int device_free(void * ptr);
DllExport void *get_work_mem(size_t sz);
// Pointer to 1 int on the device
// Used in CudaNdarray_TakeFrom and in an op
// to tell that there is an out of bound error
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevents us from having to set it to 0 before each use
static int* err_var = NULL;
template <typename T>
static T ceil_intdiv(T a, T b)
{
......
......@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node):
return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \
node.inputs[0].dtype == "float32":
if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
node.inputs[0].dtype == "float32" and
node.inputs[1].dtype == "float32"):
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
......
......@@ -482,7 +482,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
# test the grad
gn = theano.grad(t.sum(), n)
g = self.function([], gn, op=self.adv_incsub1)
theano.printing.debugprint(g)
utt.verify_grad(lambda m: m[[1, 3]],
[numpy.random.rand(5, 5).astype(self.dtype)])
g_0 = g()
......@@ -505,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
f = self.function([l], t, op=self.adv_sub1)
topo = f.maker.fgraph.toposort()
topo_ = [node for node in topo if not isinstance(node.op,
self.ignore_topo)]
assert len(topo_) == 1
self.assertTrue(isinstance(topo_[0].op, self.adv_sub1))
# the grad
g = self.function([l],
inc_subtensor(t, numpy.asarray([[1.]], self.dtype)),
op=self.adv_incsub1)
for shp in [[0, 4], [0, -3], [-10]]:
self.assertRaises(IndexError, f, shp)
self.assertRaises(IndexError, g, shp)
def test_adv_sub1_broadcast(self):
ones = numpy.ones((1, 3), dtype=self.dtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论