提交 d08a40fa authored 作者: Frederic's avatar Frederic

refactore the new code

上级 574e7965
...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()]) return Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
...@@ -3048,23 +3048,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3048,23 +3048,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int CudaNdarray_vector_add_fast(CudaNdarray* py_self, int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
CudaNdarray* py_other, PyArrayObject *indices_arr) CudaNdarray* py_other, PyArrayObject *indices_arr)
{ {
if (err_var == NULL) { if(init_err_var()!= 0) return -1;
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %%s",
cudaGetErrorString(err));
return -1;
}
}
const int *shapeX = CudaNdarray_HOST_DIMS(py_self); const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other); const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self); const int *strX = CudaNdarray_HOST_STRIDES(py_self);
...@@ -3112,46 +3097,13 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3112,46 +3097,13 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
PyArray_SIZE(indices_arr), PyArray_SIZE(indices_arr),
err_var err_var
); );
int index_err = check_err_var();
//-10 could be any value different then 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits
// for all kernels (on all streams) to be finished before
// starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %%s: %%s when trying to get the error"
" value.\\n",
"GpuAdvancedIncSubtensor1_dev20",
cudaGetErrorString(err));
return NULL;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"GpuAdvancedIncSubtensor1_dev20: One of the index"
" value is out of bound. Error code: %%i.\\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError,
"Error setting device error code to 0 after having"
" an index error. %%s", cudaGetErrorString(err));
return -1;
}
return -1;
}
device_free(d_indices_arr); device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr); Py_XDECREF(cpu_indices_arr);
if(index_err != 0) return -1;
err = cudaGetLastError(); err = cudaGetLastError();
if(err != cudaSuccess){ if(err != cudaSuccess){
PyErr_Format( PyErr_Format(
......
...@@ -1141,30 +1141,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1141,30 +1141,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
k3 = k_take_3<CPY>; k3 = k_take_3<CPY>;
// Create the memory place that will store the error information. // Create the memory place that will store the error information.
if (err_var == NULL) { if(init_err_var() != 0) return NULL;
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
cudaError_t err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
// Currently this returns the same thing as err, but if in future
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
}
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
if(CudaNdarray_HOST_DIMS(out)[0] == 0){ if(CudaNdarray_HOST_DIMS(out)[0] == 0){
...@@ -1276,47 +1253,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1276,47 +1253,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;
} }
//-10 could be any value different then 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits for all
// kernels (on all streams) to be finished before starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s: %s when trying to get the error value.\n",
"CudaNdarray_TakeFrom",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
if (cpu_err_var != 0) { int index_err = check_err_var();
PyErr_Format(
PyExc_IndexError,
"CudaNdarray_TakeFrom: One of the index value is out of bound. Error code: %i.\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError, "Error setting device error code to 0 after having an index error. %s", cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
Py_DECREF(indices); Py_DECREF(indices);
if (index_err != 0) {
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;
} }
Py_DECREF(indices);
if (verbose) printf("TAKE SUCCEDED\n"); if (verbose) printf("TAKE SUCCEDED\n");
return (PyObject *)out; return (PyObject *)out;
} }
......
...@@ -122,7 +122,69 @@ DllExport void *get_work_mem(size_t sz); ...@@ -122,7 +122,69 @@ DllExport void *get_work_mem(size_t sz);
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This saves us from having to reset it to 0 before each use
// Device-side flag used by index kernels to report out-of-bound
// accesses.  Invariant: once allocated it always holds 0; any code
// that raises after observing a non-zero value resets it to 0 before
// raising, so callers never need to clear it before use.
// NOTE(review): the diff changed this to
//   extern DllExport int* err_var = NULL;
// but `extern` combined with an initializer is a *definition* — in a
// header included by several translation units that produces
// duplicate-symbol link errors (and compilers warn "'err_var'
// initialized and declared 'extern'").  Internal linkage is the safe
// form for a header-resident variable.
static int* err_var = NULL;
// Lazily allocate the device-side error flag and zero it.
// Idempotent: a second call with the flag already allocated is a no-op
// (the invariant guarantees it already holds 0).
// Returns 0 on success, -1 with a Python exception set on failure.
DllExport inline int init_err_var(){
    if (err_var != NULL)
        return 0;  // already allocated, guaranteed to hold 0

    err_var = (int*)device_malloc(sizeof(int));
    if (err_var == NULL)
        return -1;  // PyErr already set by device_malloc

    cudaError_t err = cudaMemset((void*)err_var, 0, sizeof(int));
    if (err != cudaSuccess) {
        // cudaMemset does not clear the sticky error flag; do it here
        // so the failure does not poison later CUDA calls.
        cudaGetLastError();
        PyErr_Format(
            PyExc_RuntimeError,
            "Error setting device error code to 0. %s",
            cudaGetErrorString(err));
        return -1;
    }
    return 0;
}
// Copy the device-side error flag back to the host and translate a
// non-zero value into a Python IndexError.  The flag is reset to 0 on
// the device before raising, preserving the "always 0 when idle"
// invariant so later calls can reuse it without clearing it first.
// Returns 0 if no error was flagged, -1 with a Python exception set
// otherwise.  Assumes init_err_var() succeeded earlier in this run.
DllExport inline int check_err_var(){
    //-10 could be any value different than 0.
    int cpu_err_var = -10;
    cudaError_t err;
    CNDA_BEGIN_ALLOW_THREADS
    // As we execute cudaMemcpy on the default stream, it waits
    // for all kernels (on all streams) to be finished before
    // starting to copy
    err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
                     cudaMemcpyDeviceToHost);
    CNDA_END_ALLOW_THREADS
    if (cudaSuccess != err) {
        // BUG FIX: the string ended in "\\n" (a literal backslash-n),
        // an escaping leftover from the Python-embedded c_code this
        // helper was extracted from; in a plain header it must be "\n"
        // (the unescaped "%s" on the next line confirms this is not a
        // Python format string).
        PyErr_Format(
            PyExc_RuntimeError,
            "Cuda error: %s when trying to get the error"
            " value.\n",
            cudaGetErrorString(err));
        return -1;
    }
    if (cpu_err_var != 0) {
        // Same "\\n" -> "\n" fix as above.
        PyErr_Format(
            PyExc_IndexError,
            "One of the index value is out of bound. Error code: %i.\n",
            cpu_err_var);
        // Must reset it to 0 now, so callers don't have to reset it
        // before each use.
        err = cudaMemset((void*)err_var, 0, sizeof(int));
        if (cudaSuccess != err) {
            PyErr_Format(PyExc_MemoryError,
                "Error setting device error code to 0 after having"
                " an index error. %s", cudaGetErrorString(err));
            return -1;
        }
        return -1;
    }
    return 0;
}
template <typename T> template <typename T>
static T ceil_intdiv(T a, T b) static T ceil_intdiv(T a, T b)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论