Commit 7adc858b, authored by Arnaud Bergeron

Some fixes to the code all around and make sure to initialize cublas along with the device.

Parent: 02cfe1b9
@@ -539,14 +539,15 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
     npy_intp rval_size = PyArray_SIZE(rval);
     void *rval_data = PyArray_DATA(rval);
+    cublasStatus_t err;
     CNDA_BEGIN_ALLOW_THREADS
-    cublasGetVector(rval_size, sizeof(real),
+    err = cublasGetVector(rval_size, sizeof(real),
                     contiguous_self->devdata, 1,
                     rval_data, 1);
     //CNDA_THREAD_SYNC;  // unneeded because cublasGetVector is blocking anyway
     CNDA_END_ALLOW_THREADS
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+    if (CUBLAS_STATUS_SUCCESS != err)
     {
         PyErr_SetString(PyExc_RuntimeError, "error copying data to host");
         Py_DECREF(rval);
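
The hunk above replaces the deprecated cuBLAS v1 polling call cublasGetError() with the status value that cublasGetVector itself returns under the v2 API. A minimal standalone sketch of the same pattern (the function name and error handling here are illustrative, not the commit's):

    #include <cublas_v2.h>
    #include <cstdio>

    // Copy n floats from device to host, checking the status code that
    // cublasGetVector returns directly (cuBLAS v2 style).
    static int copy_to_host(const float *devdata, float *hostdata, int n)
    {
        cublasStatus_t err = cublasGetVector(n, sizeof(float),
                                             devdata, 1, hostdata, 1);
        if (err != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "error copying data to host (status %d)\n", (int)err);
            return -1;
        }
        return 0;  // cublasGetVector blocks, so no extra synchronization is needed
    }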
@@ -3009,7 +3010,7 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
                             "CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
     }
     get_gpu_ptr_size<<<1,1>>>(gpu_data);
-    if (cudaSuccess != cublasGetError()){
+    if (cudaSuccess != cudaGetLastError()){
         device_free(gpu_data);
         return PyErr_Format(PyExc_RuntimeError,
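
The old line compared cudaSuccess against a cuBLAS status, which are different enums, and cublasGetError() never reported kernel-launch failures anyway; those come from the CUDA runtime. A hedged sketch of the corrected pattern (the kernel and names are illustrative stand-ins, not the file's):

    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void probe_sizes(int *out)  // stand-in for get_gpu_ptr_size
    {
        out[0] = sizeof(float*);
        out[1] = sizeof(int);
    }

    static int launch_and_check(int *gpu_data)
    {
        probe_sizes<<<1, 1>>>(gpu_data);
        cudaError_t cerr = cudaGetLastError();  // reports the launch status
        if (cerr != cudaSuccess) {
            fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cerr));
            return -1;
        }
        return 0;
    }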
@@ -3018,16 +3019,19 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
     // Transfer the result to cpu
     int gpu_sizes[] = {-1,-1};
-    cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
+    cublasStatus_t err;
+    err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
     device_free(gpu_data);
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
+    if (CUBLAS_STATUS_SUCCESS != err){
         PyErr_SetString(PyExc_RuntimeError, "error copying data to from memory");
         return NULL;
     }
     return Py_BuildValue("iiii", gpu_sizes[0], sizeof(float*), sizeof(int), gpu_sizes[1]);
 }

+static int cublas_init();
+static int cublas_shutdown();
 // Initialize the gpu.
 // Takes one optional parameter, the device number.
 // If provided, it sets that device to be the active device.
@@ -3094,6 +3098,11 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
         }
     }

+    // Initialize cublas
+    if (handle != NULL)
+        cublas_shutdown();
+    cublas_init();
+
     Py_INCREF(Py_None);
     return Py_None;
 }
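
This is the "initialize cublas along with the device" part of the commit message: a cuBLAS handle is bound to whichever device was current when cublasCreate ran, so selecting a new device means tearing the old handle down first. A sketch of that lifecycle, assuming the file's global cublasHandle_t named handle (the select_device wrapper is hypothetical):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    static cublasHandle_t handle = NULL;

    static int cublas_init()
    {
        // Creates a handle bound to the currently selected device.
        return (CUBLAS_STATUS_SUCCESS == cublasCreate(&handle)) ? 0 : -1;
    }

    static int cublas_shutdown()
    {
        int rval = (CUBLAS_STATUS_SUCCESS == cublasDestroy(handle)) ? 0 : -1;
        handle = NULL;
        return rval;
    }

    static int select_device(int devnum)  // hypothetical caller, mirroring gpu_init
    {
        if (handle != NULL)
            cublas_shutdown();            // the old handle belongs to the old device
        if (cudaSuccess != cudaSetDevice(devnum))
            return -1;
        return cublas_init();             // rebind cuBLAS to the new device
    }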
@@ -3537,7 +3546,7 @@ CudaNdarray_New(int nd)
 //
 //////////////////////////////
-int
+static int
 cublas_init()
 {
     if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
@@ -3555,7 +3564,7 @@ cublas_init()
     return 0;
 }

-int
+static int
 cublas_shutdown()
 {
     if (CUBLAS_STATUS_SUCCESS != cublasDestroy(handle))
@@ -3587,15 +3596,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
     }
     npy_intp py_src_size = PyArray_SIZE(py_src);
     void *py_src_data = PyArray_DATA(py_src);
-    cublasStatus_t err;
+    cublasStatus_t cerr;
     CNDA_BEGIN_ALLOW_THREADS
-    err= cublasSetVector(py_src_size,
+    cerr = cublasSetVector(py_src_size,
                          sizeof(real),
                          py_src_data, 1,
                          self->devdata, 1);
     //CNDA_THREAD_SYNC;  // unneeded because cublasSetVector is blocking anyway
     CNDA_END_ALLOW_THREADS
-    if (CUBLAS_STATUS_SUCCESS != err)
+    if (CUBLAS_STATUS_SUCCESS != cerr)
     {
         PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory");
         Py_DECREF(py_src);
@@ -4058,7 +4067,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
     if (sy == 0){sy = 1;}\
     if (sz == 0){sz = 1;}\
     if ((sx > 0) && (sy > 0) && (sz > 0)) { \
-        err = cublasSgemm(handle, T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
+        err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
     } else { \
         PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
         Py_XDECREF(A_new);\
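
Under cuBLAS v2, Sgemm takes alpha and beta by pointer rather than by value, which is what the &a/&b fix supplies. A minimal column-major sketch under the default host pointer mode (the function name and dimensions are illustrative):

    #include <cublas_v2.h>

    // Computes z = a*x*y + b*z for column-major m-by-k x, k-by-n y, m-by-n z.
    static cublasStatus_t sgemm_sketch(cublasHandle_t handle,
                                       const float *x, const float *y, float *z,
                                       int m, int n, int k)
    {
        const float a = 1.0f, b = 0.0f;  // host scalars, read through the pointers
        return cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                           m, n, k,
                           &a,            // v2 API: pointer, not value
                           x, m,          // leading dimensions for column-major data
                           y, k,
                           &b,
                           z, m);
    }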
@@ -4088,7 +4097,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
     {
         PyErr_Format(PyExc_RuntimeError,
                      "cublasSgemm failed (%i) %s\n"
-                     " unit=%h N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%f, b=%f, c=%f"
+                     " unit=%h N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
                      " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
                      err, cublasGetErrorString(err),
                      unit, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1],
@@ -4189,10 +4198,10 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
     {
         err = cublasSgemv(handle, CUBLAS_OP_N,
                           CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
-                          alpha,
+                          &alpha,
                           CudaNdarray_DEV_DATA(A), sa_1,
                           CudaNdarray_DEV_DATA(B), sb_0,
-                          beta,
+                          &beta,
                           CudaNdarray_DEV_DATA(C), sc_0);
     }
     else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
@@ -4201,10 +4210,10 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
     {
         err = cublasSgemv(handle, CUBLAS_OP_T,
                           CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
-                          alpha,
+                          &alpha,
                           CudaNdarray_DEV_DATA(A), sa_0,
                           CudaNdarray_DEV_DATA(B), sb_0,
-                          beta,
+                          &beta,
                           CudaNdarray_DEV_DATA(C), sc_0);
     }
     else
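
The reason these scalars become &alpha/&beta: cuBLAS v2 dereferences them through a pointer whose location is controlled by cublasSetPointerMode, which defaults to host memory. A sketch making that default explicit (not part of the commit; names are illustrative):

    #include <cublas_v2.h>

    static cublasStatus_t sgemv_sketch(cublasHandle_t handle,
                                       const float *A, const float *x, float *y,
                                       int m, int n)
    {
        // Host pointer mode is the default; set it explicitly for clarity.
        cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
        float alpha = 1.0f, beta = 0.0f;
        // y = alpha * A * x + beta * y, for column-major m-by-n A
        return cublasSgemv(handle, CUBLAS_OP_N, m, n,
                           &alpha, A, m, x, 1, &beta, y, 1);
    }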
@@ -4297,14 +4306,14 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
     int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
                                                  : CudaNdarray_HOST_DIMS(A)[0];
-    cublasStatus err;
+    cublasStatus_t err;
     if(CudaNdarray_SIZE(A)){
         // If A is in col-major
         if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
             || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
                 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
         {
-            err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], alpha,
+            err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
                              CudaNdarray_DEV_DATA(x), x_strides,
                              CudaNdarray_DEV_DATA(y), y_strides,
                              CudaNdarray_DEV_DATA(A), sa_1);
@@ -4314,7 +4323,7 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
             || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
                 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
         {
-            err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
+            err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
                              CudaNdarray_DEV_DATA(y), y_strides,
                              CudaNdarray_DEV_DATA(x), x_strides,
                              CudaNdarray_DEV_DATA(A), sa_0);
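
The second branch above handles a row-major A by swapping x and y, since a row-major matrix is the transpose of what column-major cuBLAS sees. A standalone sketch of the trick (our names; unit strides assumed):

    #include <cublas_v2.h>

    // Rank-1 update A += alpha * x * y^T for a row-major nx-by-ny A.
    // Column-major cuBLAS sees the buffer as A^T, and A^T += alpha * y * x^T.
    static cublasStatus_t sger_rowmajor(cublasHandle_t handle, float alpha,
                                        const float *x, int nx,
                                        const float *y, int ny,
                                        float *A)
    {
        return cublasSger(handle, ny, nx, &alpha,
                          y, 1,
                          x, 1,
                          A, ny);  // the leading dimension of A^T is ny
    }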
@@ -368,8 +368,8 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
         //Detect overflow on unsigned integer
         if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
             PyErr_Format(PyExc_AssertionError,
-                         "Can't store in size_t for the bytes requested",
-                         size);
+                         "Can't store in size_t for the bytes requested %llu",
+                         (unsigned long long)size);
             return -1;
         }
         size = size * dim[i];
@@ -385,8 +385,8 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
         //Detect overflow on unsigned integer
         if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
             PyErr_Format(PyExc_AssertionError,
-                         "Can't store in size_t for the bytes requested",
-                         size);
+                         "Can't store in size_t for the bytes requested %llu",
+                         (unsigned long long)size);
             return -1;
         }
         size = size * dim[i];
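
Both hunks fix the same defect: the format string had no conversion specifier for the size argument, so the value was silently dropped; %llu plus a cast prints a size_t portably. The overflow guard itself, as a self-contained sketch (the function name is ours):

    #include <stdint.h>
    #include <stdio.h>

    // Multiply out dimensions into a size_t without overflowing:
    // size * dim[i] fits only if size <= SIZE_MAX / dim[i].
    static int total_elements(const int *dim, int nd, size_t *out)
    {
        size_t size = 1;
        for (int i = 0; i < nd; ++i) {
            if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
                fprintf(stderr, "can't store the bytes requested in size_t: %llu * %d\n",
                        (unsigned long long)size, dim[i]);
                return -1;
            }
            size = size * dim[i];
        }
        *out = size;
        return 0;
    }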
@@ -586,23 +586,29 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other,
 DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
                                       const int * dims, int fortran = 0);

-DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
-    if(CUBLAS_STATUS_SUCCESS == err)
+DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus_t err){
+    switch(err) {
+    case CUBLAS_STATUS_SUCCESS:
         return "success";
-    else if(CUBLAS_STATUS_NOT_INITIALIZED == err)
+    case CUBLAS_STATUS_NOT_INITIALIZED:
         return "the library was not initialized";
-    else if(CUBLAS_STATUS_ALLOC_FAILED == err)
+    case CUBLAS_STATUS_ALLOC_FAILED:
         return "the resource allocation failed";
-    else if(CUBLAS_STATUS_INVALID_VALUE == err)
+    case CUBLAS_STATUS_INVALID_VALUE:
         return "the parameters n<0 or incx,incy=0";
-    else if(CUBLAS_STATUS_MAPPING_ERROR == err)
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+        return "required device feature not present";
+    case CUBLAS_STATUS_MAPPING_ERROR:
         return "an access to GPU memory space failed";
-    else if(CUBLAS_STATUS_EXECUTION_FAILED == err)
+    case CUBLAS_STATUS_EXECUTION_FAILED:
         return "the function failed to launch on the GPU";
-    else if(CUBLAS_STATUS_INTERNAL_ERROR == err)
+    case CUBLAS_STATUS_INTERNAL_ERROR:
         return "an internal operation failed";
-    else
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+        return "unsupported function";
+    default:
         return "unknow code";
+    }
 }
 #endif
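
With cublasGetErrorString now covering every cublasStatus_t case, including the v2-only CUBLAS_STATUS_ARCH_MISMATCH and CUBLAS_STATUS_NOT_SUPPORTED codes, call sites can turn any status into a readable message. A hypothetical checking macro built on it (not part of the header):

    #include <cublas_v2.h>
    #include <cstdio>

    // Wrap a cuBLAS call and print a human-readable diagnostic on failure.
    #define CHECK_CUBLAS(call)                                      \
        do {                                                        \
            cublasStatus_t s_ = (call);                             \
            if (s_ != CUBLAS_STATUS_SUCCESS)                        \
                fprintf(stderr, "%s failed: %s\n", #call,           \
                        cublasGetErrorString(s_));                  \
        } while (0)

    // Example: CHECK_CUBLAS(cublasSetVector(n, sizeof(float), src, 1, dst, 1));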