提交 78689343 authored 作者: Frederic's avatar Frederic

Make lib.cnmem set the initial size (in MB) of the CNMeM memory pool

上级 fe6afee9
...@@ -13,7 +13,8 @@ from theano.compile import optdb ...@@ -13,7 +13,8 @@ from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB from theano.gof import EquilibriumDB, SequenceDB
from theano.gof.cmodule import get_lib_extension from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock from theano.gof.compilelock import get_lock, release_lock
from theano.configparser import config, AddConfigVar, StrParam, BoolParam from theano.configparser import (
config, AddConfigVar, BoolParam, IntParam, StrParam)
from . import nvcc_compiler from . import nvcc_compiler
# ignore_newtrees is to speed the optimization as this is the pattern # ignore_newtrees is to speed the optimization as this is the pattern
...@@ -55,9 +56,16 @@ AddConfigVar('cublas.lib', ...@@ -55,9 +56,16 @@ AddConfigVar('cublas.lib',
StrParam('cublas')) StrParam('cublas'))
AddConfigVar('lib.cnmem', AddConfigVar('lib.cnmem',
"""Do we enable cnmem or not.""", """Do we enable CNMeM or not (a faster memory allocator).
The number (in MB) represent the start size of the memory pool.
0: not enabled.
-1: use half GPU memory.
>0: use that number of MB of memory.""",
# The two allocators must not be mixed, so this flag cannot be overridden # The two allocators must not be mixed, so this flag cannot be overridden
BoolParam(False, allow_override=False), # BoolParam(False, allow_override=False),
IntParam(0, lambda i: i >= 0 or i == -1, allow_override=False),
in_c_key=False) in_c_key=False)
# is_nvcc_available called here to initialize global vars in # is_nvcc_available called here to initialize global vars in
...@@ -412,7 +420,7 @@ def use(device, ...@@ -412,7 +420,7 @@ def use(device,
if config.print_active_device: if config.print_active_device:
cnmem_enabled = "enabled" if config.lib.cnmem else "disabled" cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
print("Using gpu device %d: %s (cnmem is %s)" % ( print("Using gpu device %d: %s (CNMeM is %s)" % (
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr) active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr)
if device_properties(use.device_number)['regsPerBlock'] < 16384: if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too many registers per block in many places # We will try to use too many registers per block in many places
......
...@@ -73,13 +73,12 @@ void * device_malloc(size_t size) ...@@ -73,13 +73,12 @@ void * device_malloc(size_t size)
///@TODO: thejaswi: link this option to a theano config variable? ///@TODO: thejaswi: link this option to a theano config variable?
static bool g_use_cnmem = false; static bool g_use_cnmem = false;
static const int g_max_devices = 8; static const int g_max_devices = 8;
int initCnmem(int card_number_provided, int card_nb) { int initCnmem(int card_number_provided, int card_nb, size_t mem) {
static bool cnmemInitialized = false; static bool cnmemInitialized = false;
if(cnmemInitialized) { if(cnmemInitialized) {
return 0; return 0;
} }
// On stderr to be at the same place as "Using gpu device..." // On stderr to be at the same place as "Using gpu device..."
fprintf(stderr, "Initializing cnmem...\n");
int numDevices = 0; int numDevices = 0;
cnmemDevice_t devices[g_max_devices]; cnmemDevice_t devices[g_max_devices];
if(cudaGetDeviceCount(&numDevices) != cudaSuccess) { if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
...@@ -92,8 +91,7 @@ int initCnmem(int card_number_provided, int card_nb) { ...@@ -92,8 +91,7 @@ int initCnmem(int card_number_provided, int card_nb) {
numDevices = 1; numDevices = 1;
int i = 0; int i = 0;
devices[i].device = card_nb; devices[i].device = card_nb;
///@TODO: thejaswi: support for choosing mem size to be allocated before-hand? devices[i].size = mem;
devices[i].size = 0;
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
...@@ -101,8 +99,7 @@ int initCnmem(int card_number_provided, int card_nb) { ...@@ -101,8 +99,7 @@ int initCnmem(int card_number_provided, int card_nb) {
}else{ }else{
for(int i=0;i<numDevices;++i) { for(int i=0;i<numDevices;++i) {
devices[i].device = i; devices[i].device = i;
///@TODO: thejaswi: support for choosing mem size to be allocated before-hand? devices[i].size = mem;
devices[i].size = 0;
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
...@@ -3144,7 +3141,7 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3144,7 +3141,7 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{ {
int card_nb = 0; int card_nb = 0;
int card_number_provided = 1; int card_number_provided = 1;
int cnmem = 0; // 0 False, 1 True int cnmem = 0; // start qt memory in MB.
// if we're given something wildly invalid, this will throw a TypeError // if we're given something wildly invalid, this will throw a TypeError
PyArg_ParseTuple(args, "|ii", &card_nb, &cnmem); PyArg_ParseTuple(args, "|ii", &card_nb, &cnmem);
if(cnmem) if(cnmem)
...@@ -3204,7 +3201,16 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3204,7 +3201,16 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
return NULL; return NULL;
} }
if(card_number_provided && g_use_cnmem) { if(card_number_provided && g_use_cnmem) {
if(initCnmem(card_number_provided, card_nb) == -1){ size_t mem = 0;
if (cnmem > 0)
mem = cnmem * 1024 * 1024;
else if (cnmem != -1){
return PyErr_Format(
PyExc_EnvironmentError,
"CNMeM init: The config flag must be 0 (disabled),"
" -1: use half the GPU memory, > 0: that memory in MB.");
}
if(initCnmem(card_number_provided, card_nb, mem) == -1){
return NULL; return NULL;
} }
} }
...@@ -3240,7 +3246,6 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { ...@@ -3240,7 +3246,6 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
cublas_shutdown(); cublas_shutdown();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
if(g_use_cnmem) { if(g_use_cnmem) {
fprintf(stderr, "Shutting down cnmem...\n");
cnmemStatus_t status = cnmemFinalize(); cnmemStatus_t status = cnmemFinalize();
if(status != CNMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n", fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论