提交 7eb58359 authored 作者: Frederic's avatar Frederic

First cumem version

上级 03e77233
...@@ -9,6 +9,9 @@ ...@@ -9,6 +9,9 @@
#include "cuda_ndarray.cuh" #include "cuda_ndarray.cuh"
#include "cumem.h"
#include "cumem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0 #define COMPUTE_GPU_MEM_USED 0
...@@ -67,6 +70,41 @@ void * device_malloc(size_t size) ...@@ -67,6 +70,41 @@ void * device_malloc(size_t size)
return device_malloc(size, VERBOSE_DEVICE_MALLOC); return device_malloc(size, VERBOSE_DEVICE_MALLOC);
} }
///@TODO: thejaswi: link this option to a theano config variable?
// When true, device_malloc/device_free route through the cumem pool
// allocator instead of raw cudaMalloc/cudaFree.
static bool g_use_cumem = true;
// Capacity of the stack-allocated device-descriptor array in initCumem().
static const int g_max_devices = 8;

// Initialize the cumem pool allocator for the visible CUDA devices.
// Idempotent: after the first successful call, later calls return 0
// immediately without re-initializing.
// Returns 0 on success, -1 on failure (diagnostics go to stderr).
int initCumem() {
    static bool cumemInitialized = false;
    if(cumemInitialized) {
        return 0;
    }
    printf("Initializing cumem...\n");
    int numDevices = 0;
    cumemDevice_t devices[g_max_devices];
    if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
        fprintf(stderr, "initCumem: 'cudaGetDeviceCount' failed! Reason=%s\n",
                cudaGetErrorString(cudaGetLastError()));
        return -1;
    }
    // Clamp to the descriptor array's capacity: the original code indexed
    // devices[i] up to numDevices with no bound check, overflowing the
    // stack buffer on machines with more than g_max_devices GPUs.
    if(numDevices > g_max_devices) {
        fprintf(stderr,
                "initCumem: %d devices found, only initializing the first %d\n",
                numDevices, g_max_devices);
        numDevices = g_max_devices;
    }
    for(int i=0;i<numDevices;++i) {
        devices[i].device = i;
        ///@TODO: thejaswi: support for choosing mem size to be allocated before-hand?
        devices[i].size = 0;  // 0 = let cumem pick the initial pool size
        ///@TODO: thejaswi: add support for multiple streams
        devices[i].numStreams = 0;
        devices[i].streams = NULL;
    }
    ///@TODO: thejaswi: passing custom cumem flags?
    cumemStatus_t status = cumemInit(numDevices, devices, CUMEM_FLAGS_DEFAULT);
    if(status != CUMEM_STATUS_SUCCESS) {
        fprintf(stderr, "initCumem: cumemInit call failed! Reason=%s\n",
                cumemGetErrorString(status));
        return -1;
    }
    cumemInitialized = true;
    return 0;
}
void * device_malloc(size_t size, int verbose) void * device_malloc(size_t size, int verbose)
{ {
#if PRECHECK_ERROR #if PRECHECK_ERROR
...@@ -81,6 +119,16 @@ void * device_malloc(size_t size, int verbose) ...@@ -81,6 +119,16 @@ void * device_malloc(size_t size, int verbose)
} }
#endif #endif
void * rval=NULL; void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams?
if(g_use_cumem) {
cumemStatus_t status = cumemMalloc(&rval, size, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_malloc: cumemMallocAysnc call failed! Reason=%s\n",
cumemGetErrorString(status));
return NULL;
}
}
else {
cudaError_t err = cudaMalloc(&rval, size); cudaError_t err = cudaMalloc(&rval, size);
if (cudaSuccess != err) if (cudaSuccess != err)
{ {
...@@ -118,6 +166,7 @@ void * device_malloc(size_t size, int verbose) ...@@ -118,6 +166,7 @@ void * device_malloc(size_t size, int verbose)
size, cudaGetErrorString(err)); size, cudaGetErrorString(err));
return NULL; return NULL;
} }
}
if (rval != NULL){ if (rval != NULL){
// Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr? // Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr?
// Could this be what happen if size is 0? // Could this be what happen if size is 0?
...@@ -202,6 +251,15 @@ int device_free(void *ptr) ...@@ -202,6 +251,15 @@ int device_free(void *ptr)
return 0; return 0;
} }
///@TODO: thejaswi: multi-stream support
if(g_use_cumem) {
cumemStatus_t status = cumemFree(ptr, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cumemFree call failed! Reason=%s\n",
cumemGetErrorString(status));
}
}
else {
// We need sync as the Theano's GC could remove intermediate variable that // We need sync as the Theano's GC could remove intermediate variable that
// are still needed as the gpu kernel are running or in the queue. // are still needed as the gpu kernel are running or in the queue.
CNDA_BEGIN_ALLOW_THREADS CNDA_BEGIN_ALLOW_THREADS
...@@ -259,6 +317,7 @@ int device_free(void *ptr) ...@@ -259,6 +317,7 @@ int device_free(void *ptr)
cudaGetErrorString(err)); cudaGetErrorString(err));
return -1; return -1;
} }
}
_outstanding_mallocs[0] -= (ptr != NULL); _outstanding_mallocs[0] -= (ptr != NULL);
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
int i=0; int i=0;
...@@ -3096,6 +3155,10 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3096,6 +3155,10 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
if (cublas_init() == -1) if (cublas_init() == -1)
return NULL; return NULL;
} }
if(g_use_cumem) {
if(initCumem() == -1)
return NULL;
}
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
...@@ -3126,8 +3189,21 @@ PyObject * ...@@ -3126,8 +3189,21 @@ PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here // Don't handle errors here
cublas_shutdown(); cublas_shutdown();
cudaThreadExit();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
if(g_use_cumem) {
printf("Shutting down cumem...\n");
cumemStatus_t status = cumemFinalize();
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cumemFinalize failed! Reason=%s\n",
cumemGetErrorString(status));
if(status == CUMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
}
}
}
cudaThreadExit();
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
......
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论