提交 130d2ce9 authored 作者: Julien Demouth's avatar Julien Demouth 提交者: Frederic

Add support for CNMeM library.

Update the code to support CNMeM (formerly known as Cumem).
上级 389c4aba
...@@ -2,6 +2,7 @@ global-include *.txt ...@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c global-include *.c
global-include *.cu global-include *.cu
global-include *.cuh global-include *.cuh
global-include *.cpp
global-include *.h global-include *.h
global-include *.sh global-include *.sh
global-include *.pkl global-include *.pkl
......
...@@ -164,7 +164,7 @@ def do_setup(): ...@@ -164,7 +164,7 @@ def do_setup():
install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'], install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'],
package_data={ package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl', '': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'*.h', 'ChangeLog'], '*.h', '*.cpp', 'ChangeLog'],
'theano.misc': ['*.sh'] 'theano.misc': ['*.sh']
}, },
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'], scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
...@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib', ...@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""", """Name of the cuda blas library for the linker.""",
StrParam('cublas')) StrParam('cublas'))
AddConfigVar('lib.cumem', AddConfigVar('lib.cnmem',
"""Do we enable cumem or not.""", """Do we enable cnmem or not.""",
# We should not mix both allocators, so we can't override # We should not mix both allocators, so we can't override
BoolParam(False, allow_override=False), BoolParam(False, allow_override=False),
in_c_key=False) in_c_key=False)
...@@ -385,7 +385,7 @@ def use(device, ...@@ -385,7 +385,7 @@ def use(device,
try: try:
if (device != 'gpu') and not pycuda_init_dev: if (device != 'gpu') and not pycuda_init_dev:
assert isinstance(device, int) assert isinstance(device, int)
gpu_init(device, config.lib.cumem) gpu_init(device, config.lib.cnmem)
use.device_number = device use.device_number = device
assert active_device_number() == device assert active_device_number() == device
else: else:
...@@ -398,7 +398,7 @@ def use(device, ...@@ -398,7 +398,7 @@ def use(device,
cuda_ndarray.cuda_ndarray.select_a_gpu() cuda_ndarray.cuda_ndarray.select_a_gpu()
use.device_number = active_device_number() use.device_number = active_device_number()
# This is needed to initialize the cublas handle. # This is needed to initialize the cublas handle.
gpu_init(use.device_number, config.lib.cumem) gpu_init(use.device_number, config.lib.cnmem)
if test_driver: if test_driver:
import theano.sandbox.cuda.tests.test_driver import theano.sandbox.cuda.tests.test_driver
...@@ -411,8 +411,9 @@ def use(device, ...@@ -411,8 +411,9 @@ def use(device,
" this property") " this property")
if config.print_active_device: if config.print_active_device:
print("Using gpu device %d: %s" % ( cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
active_device_number(), active_device_name()), file=sys.stderr) print("Using gpu device %d: %s (cnmem is %s)" % (
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr)
if device_properties(use.device_number)['regsPerBlock'] < 16384: if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too many registers per block at many places # We will try to use too many registers per block at many places
# when there are only 8k registers per multi-processor. # when there are only 8k registers per multi-processor.
......
差异被折叠。
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
#include "cuda_ndarray.cuh" #include "cuda_ndarray.cuh"
#include "cumem.h" #include "cnmem.h"
#include "cumem.cpp" #include "cnmem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0 #define COMPUTE_GPU_MEM_USED 0
...@@ -71,20 +71,20 @@ void * device_malloc(size_t size) ...@@ -71,20 +71,20 @@ void * device_malloc(size_t size)
} }
///@TODO: thejaswi: link this option to a theano config variable? ///@TODO: thejaswi: link this option to a theano config variable?
static bool g_use_cumem = false; static bool g_use_cnmem = false;
static const int g_max_devices = 8; static const int g_max_devices = 8;
int initCumem(int card_number_provided, int card_nb) { int initCnmem(int card_number_provided, int card_nb) {
static bool cumemInitialized = false; static bool cnmemInitialized = false;
if(cumemInitialized) { if(cnmemInitialized) {
return 0; return 0;
} }
// On stderr to be at the same place as "Using gpu device..." // On stderr to be at the same place as "Using gpu device..."
fprintf(stderr, "Initializing cumem...\n"); fprintf(stderr, "Initializing cnmem...\n");
int numDevices = 0; int numDevices = 0;
cumemDevice_t devices[g_max_devices]; cnmemDevice_t devices[g_max_devices];
if(cudaGetDeviceCount(&numDevices) != cudaSuccess) { if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"initCumem: 'cudaGetDeviceCount' failed! Reason=%s\n", "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
return -1; return -1;
} }
...@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) { ...@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
devices[i].granularity = 0;
}else{ }else{
for(int i=0;i<numDevices;++i) { for(int i=0;i<numDevices;++i) {
...@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) { ...@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
devices[i].granularity = 0;
} }
} }
///@TODO: thejaswi: passing custom cumem flags? ///@TODO: thejaswi: passing custom cnmem flags?
cumemStatus_t status = cumemInit(numDevices, devices, CUMEM_FLAGS_DEFAULT); cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"initCumem: cumemInit call failed! Reason=%s. numdev=%d\n", "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
cumemGetErrorString(status), numDevices); cnmemGetErrorString(status), numDevices);
return -1; return -1;
} }
cumemInitialized = true; cnmemInitialized = true;
return 0; return 0;
} }
...@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose) ...@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose)
#endif #endif
void * rval=NULL; void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams? ///@TODO: thejaswi: support for multiple-streams?
if(g_use_cumem) { if(g_use_cnmem) {
cumemStatus_t status = cumemMalloc(&rval, size, NULL); cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
if(status != CUMEM_STATUS_SUCCESS) { if( size != 0 ) {
status = cnmemMalloc(&rval, size, NULL);
}
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, PyErr_Format(PyExc_MemoryError,
"Error allocating %zd bytes of device memory (%s).", "Error allocating %zd bytes of device memory (%s).",
size, cumemGetErrorString(status)); size, cnmemGetErrorString(status));
return NULL; return NULL;
} }
} }
...@@ -271,11 +272,11 @@ int device_free(void *ptr) ...@@ -271,11 +272,11 @@ int device_free(void *ptr)
} }
///@TODO: thejaswi: multi-stream support ///@TODO: thejaswi: multi-stream support
if(g_use_cumem) { if(g_use_cnmem) {
cumemStatus_t status = cumemFree(ptr, NULL); cnmemStatus_t status = cnmemFree(ptr, NULL);
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cumemFree call failed! Reason=%s\n", fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
cumemGetErrorString(status)); cnmemGetErrorString(status));
} }
} }
else { else {
...@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init(); static int cublas_init();
static void cublas_shutdown(); static void cublas_shutdown();
// Initialize the gpu. // Initialize the gpu.
// Takes two optional parameters, the device number and if we should use cumem. // Takes two optional parameters, the device number and if we should use cnmem.
// If the device number is provided, it sets that device to be the active device. // If the device number is provided, it sets that device to be the active device.
// If not provided (usually just to test whether the gpu is available at all), // If not provided (usually just to test whether the gpu is available at all),
// it does not set an active device. // it does not set an active device.
// Raises EnvironmentError or ValueError (as appropriate) if the initialization failed. // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
// cumem is treated like a bool. If converted to 0, don't use cumem. Otherwise, use it. // cnmem is treated like a bool. If converted to 0, don't use cnmem. Otherwise, use it.
PyObject * PyObject *
CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{ {
int card_nb = 0; int card_nb = 0;
int card_number_provided = 1; int card_number_provided = 1;
int cumem = 0; // 0 False, 1 True int cnmem = 0; // 0 False, 1 True
// if we're given something wildly invalid, this will throw a TypeError // if we're given something wildly invalid, this will throw a TypeError
PyArg_ParseTuple(args, "|ii", &card_nb, &cumem); PyArg_ParseTuple(args, "|ii", &card_nb, &cnmem);
if(cumem) if(cnmem)
g_use_cumem = true; g_use_cnmem = true;
if(PyTuple_Size(args) == 0) { if(PyTuple_Size(args) == 0) {
card_number_provided = 0; card_number_provided = 0;
...@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
if (cublas_init() == -1) if (cublas_init() == -1)
return NULL; return NULL;
} }
if(card_number_provided && g_use_cumem) { if(card_number_provided && g_use_cnmem) {
if(initCumem(card_number_provided, card_nb) == -1){ if(initCnmem(card_number_provided, card_nb) == -1){
return NULL; return NULL;
} }
} }
...@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { ...@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here // Don't handle errors here
cublas_shutdown(); cublas_shutdown();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
if(g_use_cumem) { if(g_use_cnmem) {
fprintf(stderr, "Shutting down cumem...\n"); fprintf(stderr, "Shutting down cnmem...\n");
cumemStatus_t status = cumemFinalize(); cnmemStatus_t status = cnmemFinalize();
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS && status != CNMEM_STATUS_MEMORY_LEAK) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cumemFinalize failed! Reason=%s\n", fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
cumemGetErrorString(status)); cnmemGetErrorString(status));
if(status == CUMEM_STATUS_CUDA_ERROR) { if(status == CNMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n", fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
} }
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论