提交 d146cf82，作者：Frederic

Change assert() to raise a Python exception and handle shape=0 case.

上级 59225276
...@@ -2756,7 +2756,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2756,7 +2756,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()]) return Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
...@@ -2782,7 +2782,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2782,7 +2782,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
Py_XINCREF(%(out)s); Py_XINCREF(%(out)s);
} }
CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s); if (CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s) != 0){
%(fail)s
}
if (!%(out)s) { if (!%(out)s) {
%(fail)s %(fail)s
...@@ -2817,14 +2819,17 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2817,14 +2819,17 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return; return;
} }
void CudaNdarray_vector_add_fast(CudaNdarray* py_self, CudaNdarray* py_other, PyArrayObject *indices_arr) int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
CudaNdarray* py_other, PyArrayObject *indices_arr)
{ {
const int *shapeX = CudaNdarray_HOST_DIMS(py_self); const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other); const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self); const int *strX = CudaNdarray_HOST_STRIDES(py_self);
const int *strY = CudaNdarray_HOST_STRIDES(py_other); const int *strY = CudaNdarray_HOST_STRIDES(py_other);
unsigned int size = (unsigned int)PyArray_SIZE(indices_arr); unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
if(size == 0){
return 0;
}
unsigned int numcolsX = shapeX[1]; unsigned int numcolsX = shapeX[1];
unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int num_blocks = std::min(size ,(unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int num_blocks = std::min(size ,(unsigned int)NUM_VECTOR_OP_BLOCKS);
...@@ -2832,18 +2837,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2832,18 +2837,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
dim3 n_blocks(num_blocks); dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block); dim3 n_threads(num_threads_per_block);
long *d_indices_arr = NULL; long *d_indices_arr = NULL;
PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr); PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr)); d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
assert(d_indices_arr);
if(!d_indices_arr)
return -1;
cudaError_t err = cudaMemcpy(d_indices_arr, cudaError_t err = cudaMemcpy(d_indices_arr,
PyArray_DATA(cpu_indices_arr), PyArray_DATA(cpu_indices_arr),
PyArray_NBYTES(cpu_indices_arr), PyArray_NBYTES(cpu_indices_arr),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
if(err != cudaSuccess){
assert(err == cudaSuccess); PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cudaMemcpy returned an error: %%s",
cudaGetErrorString(err));
return -1;
}
k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0], k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
shapeX[1], shapeX[1],
...@@ -2858,12 +2868,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2858,12 +2868,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
d_indices_arr, d_indices_arr,
PyArray_SIZE(indices_arr) PyArray_SIZE(indices_arr)
); );
device_free(d_indices_arr); device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr); Py_XDECREF(cpu_indices_arr);
return; err = cudaGetLastError();
} if(err != cudaSuccess){
PyErr_Format(
PyExc_RuntimeError,
"GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
cudaGetErrorString(err));
return -1;
}
return 0;
}
""" %locals() """ % locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论