提交 bb533b5e authored 作者: abergeron

Merge pull request #3219 from nouiz/2g

cuda bugfix with array over 2g
......@@ -625,17 +625,20 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
npy_intp rval_size = PyArray_SIZE(rval);
void *rval_data = PyArray_DATA(rval);
cublasStatus_t err;
CNDA_BEGIN_ALLOW_THREADS
err = cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1,
rval_data, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasGetVector is blocking anyway
CNDA_END_ALLOW_THREADS
cudaError_t err;
CNDA_BEGIN_ALLOW_THREADS;
if (CUBLAS_STATUS_SUCCESS != err)
err = cudaMemcpy(rval_data, contiguous_self->devdata,
rval_size * sizeof(real),
cudaMemcpyDeviceToHost
);
//CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
CNDA_END_ALLOW_THREADS;
if (cudaSuccess != err)
{
PyErr_SetString(PyExc_RuntimeError, "error copying data to host");
PyErr_Format(PyExc_RuntimeError, "error (%s)copying data to host",
cudaGetErrorString(err));
Py_DECREF(rval);
rval = NULL;
}
......@@ -3754,20 +3757,19 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
}
npy_intp py_src_size = PyArray_SIZE(py_src);
void *py_src_data = PyArray_DATA(py_src);
cublasStatus_t cerr;
CNDA_BEGIN_ALLOW_THREADS
cerr = cublasSetVector(py_src_size,
sizeof(real),
py_src_data, 1,
self->devdata, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cerr)
cudaError_t cerr;
CNDA_BEGIN_ALLOW_THREADS;
cerr = cudaMemcpy(self->devdata, py_src_data,
py_src_size * sizeof(real),
cudaMemcpyHostToDevice);
//CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
CNDA_END_ALLOW_THREADS;
if (cudaSuccess != cerr)
{
PyErr_Format(PyExc_RuntimeError,
"CUBLAS error '%s' while copying %lli data element"
"Cuda error '%s' while copying %lli data element"
" to device memory",
cublasGetErrorString(cerr),
cudaGetErrorString(cerr),
(long long)py_src_size);
Py_DECREF(py_src);
return -1;
......
......@@ -813,10 +813,10 @@ class Subtensor(Op):
assert (slicelength <= length);
xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
%(strides_mul)s;
xview_offset += (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii]
* start * %(strides_mul)s;
xview_dims[inner_ii] = slicelength;
xview_strides[inner_ii] = %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step;
xview_strides[inner_ii] = (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step;
inner_ii += 1;
spec_pos += 3;
......@@ -829,7 +829,7 @@ class Subtensor(Op):
{
if (idx < %(c_prefix)s_DIMS(%(x)s)[outer_ii])
{
xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
xview_offset += (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
%(strides_mul)s;
}
else
......@@ -863,7 +863,7 @@ class Subtensor(Op):
@staticmethod
def helper_c_code_cache_version():
return (8,)
return (9,)
def c_code(self, node, name, inputs, outputs, sub): # DEBUG
if not isinstance(node.inputs[0].type, theano.tensor.TensorType):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论