提交 0f07b4a5 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1966 from abergeron/cublas_v2

Switch to cublas v2
......@@ -41,6 +41,8 @@
#define CNDA_END_ALLOW_THREADS
#endif
cublasHandle_t handle;
/////////////////////////
// Alloc and Free
/////////////////////////
......@@ -537,14 +539,15 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
npy_intp rval_size = PyArray_SIZE(rval);
void *rval_data = PyArray_DATA(rval);
cublasStatus_t err;
CNDA_BEGIN_ALLOW_THREADS
cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1,
rval_data, 1);
err = cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1,
rval_data, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasGetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_SetString(PyExc_RuntimeError, "error copying data to host");
Py_DECREF(rval);
......@@ -1182,7 +1185,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
CudaNdarray_DEV_DATA(out),
CudaNdarray_HOST_STRIDES(out)[0], //strides
CudaNdarray_HOST_STRIDES(out)[1],
1,
1,
CudaNdarray_DEV_DATA(self),
CudaNdarray_HOST_DIMS(self)[0], //For indices check
CudaNdarray_HOST_STRIDES(self)[0], //strides
......@@ -1223,7 +1226,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
PyErr_SetString(PyExc_NotImplementedError,
"CudaNdarray_TakeFrom: only input with 1, 2 or 3"
" dimensions are currently supported");
}
free(dims);
CNDA_THREAD_SYNC;
......@@ -1256,7 +1259,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out);
return NULL;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
......@@ -1274,11 +1277,11 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
Py_DECREF(indices);
if (verbose) printf("TAKE SUCCEDED\n");
return (PyObject *)out;
}
......@@ -3007,7 +3010,7 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
"CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
}
get_gpu_ptr_size<<<1,1>>>(gpu_data);
if (cudaSuccess != cublasGetError()){
if (cudaSuccess != cudaGetLastError()){
device_free(gpu_data);
return PyErr_Format(PyExc_RuntimeError,
......@@ -3016,16 +3019,19 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
// Transfer the result to cpu
int gpu_sizes[] = {-1,-1};
cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
cublasStatus_t err;
err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
device_free(gpu_data);
if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
if (CUBLAS_STATUS_SUCCESS != err){
PyErr_SetString(PyExc_RuntimeError, "error copying data to from memory");
return NULL;
}
return Py_BuildValue("iiii", gpu_sizes[0], sizeof(float*), sizeof(int), gpu_sizes[1]);
}
static int cublas_init();
static int cublas_shutdown();
// Initialize the gpu.
// Takes one optional parameter, the device number.
// If provided, it sets that device to be the active device.
......@@ -3092,6 +3098,11 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
}
}
// Initialize cublas
if (handle != NULL)
cublas_shutdown();
cublas_init();
Py_INCREF(Py_None);
return Py_None;
}
......@@ -3535,22 +3546,28 @@ CudaNdarray_New(int nd)
//
//////////////////////////////
// Create the global cuBLAS v2 handle (`handle`, declared at file scope) and
// configure its default behaviour. Must be called before any cublas* call
// that takes the handle; pair with cublas_shutdown().
//
// Returns 0 on success. On failure sets a Python RuntimeError and returns -1.
static int
cublas_init()
{
    if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
    {
        PyErr_SetString(PyExc_RuntimeError, "error initializing device");
        return -1;
    }
    // Execute on the default stream (this is already cublas' default; set
    // explicitly for clarity).
    cublasSetStream(handle, NULL);
    // Scalar arguments (alpha/beta) are passed as pointers to host memory
    // (also the default).
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    // Allow atomics in kernels to speed up some operations (NOT the default).
    // This may lead to a slight variance from run to run in some operations.
    cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
    return 0;
}
int
static int
cublas_shutdown()
{
cublasShutdown();
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
if (CUBLAS_STATUS_SUCCESS != cublasDestroy(handle))
{
PyErr_SetString(PyExc_RuntimeError, "error shutting down device");
return -1;
......@@ -3579,14 +3596,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
}
npy_intp py_src_size = PyArray_SIZE(py_src);
void *py_src_data = PyArray_DATA(py_src);
cublasStatus_t cerr;
CNDA_BEGIN_ALLOW_THREADS
cublasSetVector(py_src_size,
sizeof(real),
py_src_data, 1,
self->devdata, 1);
cerr = cublasSetVector(py_src_size,
sizeof(real),
py_src_data, 1,
self->devdata, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
if (CUBLAS_STATUS_SUCCESS != cerr)
{
PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory");
Py_DECREF(py_src);
......@@ -3750,11 +3768,12 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
if (verbose)
fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1,
CudaNdarray_DEV_DATA(self), 1);
cublasStatus_t err;
err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC;
Py_XDECREF(new_other);
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
return -1;
......@@ -3920,22 +3939,6 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
}
#if PRECHECK_ERROR
cublasStatus prevErr = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != prevErr)
{
//I don't know why, but I need to remove the cuda error too.
//Otherwise, the clean up before raising the Python error cause error too!
//So we don't see this python error.
fprintf(stderr,
"CudaNdarray_sgemm: Prev cublas error %s",
cublasGetErrorString(prevErr));
PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_sgemm: Prev cublas error %s",
cublasGetErrorString(prevErr));
return -1;
}
#endif
// We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
|| (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
......@@ -4055,8 +4058,8 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
float* a = CudaNdarray_DEV_DATA(A);
float* b = CudaNdarray_DEV_DATA(B);
float* c = CudaNdarray_DEV_DATA(C);
char N = 'N';
char T = 'T';
cublasOperation_t N = CUBLAS_OP_N;
cublasOperation_t T = CUBLAS_OP_T;
//std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
// There should be no negative stride at that point
#define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
......@@ -4064,7 +4067,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if (sy == 0){sy = 1;}\
if (sz == 0){sz = 1;}\
if ((sx > 0) && (sy > 0) && (sz > 0)) { \
cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
} else { \
PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
Py_XDECREF(A_new);\
......@@ -4072,6 +4075,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1; \
}
cublasStatus_t err;
switch(unit)
{
case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break;
......@@ -4089,12 +4093,11 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
Py_XDECREF(A_new);
Py_XDECREF(B_new);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError,
"cublasSgemm failed (%i) %s\n"
" unit=%h N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%f, b=%f, c=%f"
" unit=%h N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
" sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
err, cublasGetErrorString(err),
unit, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1],
......@@ -4187,29 +4190,31 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if (sa_1 == 0)
sa_1 = 1;
// This is important because we can end up not calling Sgemv at all
cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
if (CudaNdarray_SIZE(C)) {
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{
cublasSgemv('N',
err = cublasSgemv(handle, CUBLAS_OP_N,
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha,
&alpha,
CudaNdarray_DEV_DATA(A), sa_1,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
&beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{
cublasSgemv('T',
err = cublasSgemv(handle, CUBLAS_OP_T,
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
alpha,
&alpha,
CudaNdarray_DEV_DATA(A), sa_0,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
&beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else
......@@ -4235,7 +4240,6 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
Py_XDECREF(A_new);
Py_XDECREF(B_new);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError,
......@@ -4303,13 +4307,15 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
: CudaNdarray_HOST_DIMS(A)[0];
// This is important because we can end up not calling Sger at all
cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
if(CudaNdarray_SIZE(A)){
// If A is in col-major
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], alpha,
err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
CudaNdarray_DEV_DATA(x), x_strides,
CudaNdarray_DEV_DATA(y), y_strides,
CudaNdarray_DEV_DATA(A), sa_1);
......@@ -4319,7 +4325,7 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
CudaNdarray_DEV_DATA(y), y_strides,
CudaNdarray_DEV_DATA(x), x_strides,
CudaNdarray_DEV_DATA(A), sa_0);
......@@ -4338,7 +4344,6 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
Py_XDECREF(x_new);
Py_XDECREF(y_new);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError,
......@@ -4973,14 +4978,12 @@ cnda_copy_structure_to_device(const CudaNdarray * self)
}
}
}
cublasSetVector(cnda_structure_size(self->nd),
sizeof(int),
self->host_structure,
1,
self->dev_structure,
1);
//CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
if (cublasSetVector(cnda_structure_size(self->nd),
sizeof(int),
self->host_structure,
1,
self->dev_structure,
1) != CUBLAS_STATUS_SUCCESS)
{
PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
return -1;
......
......@@ -40,7 +40,7 @@
#endif
#include <cublas.h>
#include <cublas_v2.h>
#ifdef _WIN32
#ifdef _CUDA_NDARRAY_C
......@@ -81,6 +81,9 @@ typedef float real;
#define VERBOSE_DEVICE_MALLOC 1
#define NO_VERBOSE_DEVICE_MALLOC 0
/* Use this handle to make cublas calls */
extern cublasHandle_t handle;
/**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
*
......@@ -365,8 +368,8 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
//Detect overflow on unsigned integer
if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
PyErr_Format(PyExc_AssertionError,
"Can't store in size_t for the bytes requested",
size);
"Can't store in size_t for the bytes requested %llu",
(unsigned long long)size);
return -1;
}
size = size * dim[i];
......@@ -382,8 +385,8 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
//Detect overflow on unsigned integer
if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
PyErr_Format(PyExc_AssertionError,
"Can't store in size_t for the bytes requested",
size);
"Can't store in size_t for the bytes requested %llu",
(unsigned long long)size);
return -1;
}
size = size * dim[i];
......@@ -583,23 +586,33 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims, int fortran = 0);
// Map a cuBLAS v2 status code to a short human-readable description.
// Newer status codes are guarded by #ifdef so the file still builds against
// cuBLAS versions that predate them. Never returns NULL.
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus_t err){
    switch(err) {
    case CUBLAS_STATUS_SUCCESS:
        return "success";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "the library was not initialized";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "the resource allocation failed";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "the parameters n<0 or incx,incy=0";
#ifdef CUBLAS_STATUS_ARCH_MISMATCH
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "required device feature not present";
#endif
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "an access to GPU memory space failed";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "the function failed to launch on the GPU";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "an internal operation failed";
#ifdef CUBLAS_STATUS_NOT_SUPPORTED
    case CUBLAS_STATUS_NOT_SUPPORTED:
        return "unsupported function";
#endif
    default:
        // Typo fix: was "unknow code".
        return "unknown code";
    }
}
#endif
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论