提交 bb533b5e authored 作者: abergeron's avatar abergeron

Merge pull request #3219 from nouiz/2g

cuda bugfix with array over 2g
...@@ -625,17 +625,20 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args) ...@@ -625,17 +625,20 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
npy_intp rval_size = PyArray_SIZE(rval); npy_intp rval_size = PyArray_SIZE(rval);
void *rval_data = PyArray_DATA(rval); void *rval_data = PyArray_DATA(rval);
cublasStatus_t err; cudaError_t err;
CNDA_BEGIN_ALLOW_THREADS CNDA_BEGIN_ALLOW_THREADS;
err = cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1,
rval_data, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasGetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != err) err = cudaMemcpy(rval_data, contiguous_self->devdata,
rval_size * sizeof(real),
cudaMemcpyDeviceToHost
);
//CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
CNDA_END_ALLOW_THREADS;
if (cudaSuccess != err)
{ {
PyErr_SetString(PyExc_RuntimeError, "error copying data to host"); PyErr_Format(PyExc_RuntimeError, "error (%s)copying data to host",
cudaGetErrorString(err));
Py_DECREF(rval); Py_DECREF(rval);
rval = NULL; rval = NULL;
} }
...@@ -3754,20 +3757,19 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj) ...@@ -3754,20 +3757,19 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
} }
npy_intp py_src_size = PyArray_SIZE(py_src); npy_intp py_src_size = PyArray_SIZE(py_src);
void *py_src_data = PyArray_DATA(py_src); void *py_src_data = PyArray_DATA(py_src);
cublasStatus_t cerr; cudaError_t cerr;
CNDA_BEGIN_ALLOW_THREADS CNDA_BEGIN_ALLOW_THREADS;
cerr = cublasSetVector(py_src_size, cerr = cudaMemcpy(self->devdata, py_src_data,
sizeof(real), py_src_size * sizeof(real),
py_src_data, 1, cudaMemcpyHostToDevice);
self->devdata, 1); //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
//CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway CNDA_END_ALLOW_THREADS;
CNDA_END_ALLOW_THREADS if (cudaSuccess != cerr)
if (CUBLAS_STATUS_SUCCESS != cerr)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"CUBLAS error '%s' while copying %lli data element" "Cuda error '%s' while copying %lli data element"
" to device memory", " to device memory",
cublasGetErrorString(cerr), cudaGetErrorString(cerr),
(long long)py_src_size); (long long)py_src_size);
Py_DECREF(py_src); Py_DECREF(py_src);
return -1; return -1;
......
...@@ -813,10 +813,10 @@ class Subtensor(Op): ...@@ -813,10 +813,10 @@ class Subtensor(Op):
assert (slicelength <= length); assert (slicelength <= length);
xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start * xview_offset += (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii]
%(strides_mul)s; * start * %(strides_mul)s;
xview_dims[inner_ii] = slicelength; xview_dims[inner_ii] = slicelength;
xview_strides[inner_ii] = %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step; xview_strides[inner_ii] = (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step;
inner_ii += 1; inner_ii += 1;
spec_pos += 3; spec_pos += 3;
...@@ -829,7 +829,7 @@ class Subtensor(Op): ...@@ -829,7 +829,7 @@ class Subtensor(Op):
{ {
if (idx < %(c_prefix)s_DIMS(%(x)s)[outer_ii]) if (idx < %(c_prefix)s_DIMS(%(x)s)[outer_ii])
{ {
xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx * xview_offset += (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
%(strides_mul)s; %(strides_mul)s;
} }
else else
...@@ -863,7 +863,7 @@ class Subtensor(Op): ...@@ -863,7 +863,7 @@ class Subtensor(Op):
@staticmethod @staticmethod
def helper_c_code_cache_version(): def helper_c_code_cache_version():
return (8,) return (9,)
def c_code(self, node, name, inputs, outputs, sub): # DEBUG def c_code(self, node, name, inputs, outputs, sub): # DEBUG
if not isinstance(node.inputs[0].type, theano.tensor.TensorType): if not isinstance(node.inputs[0].type, theano.tensor.TensorType):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论