Commit 130d2ce9, authored by Julien Demouth, committed by Frederic

Add support for CNMeM library.

Update the code to support CNMeM (formerly known as Cumem).
Parent: 389c4aba
...@@ -2,6 +2,7 @@ global-include *.txt ...@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c global-include *.c
global-include *.cu global-include *.cu
global-include *.cuh global-include *.cuh
global-include *.cpp
global-include *.h global-include *.h
global-include *.sh global-include *.sh
global-include *.pkl global-include *.pkl
......
...@@ -164,7 +164,7 @@ def do_setup(): ...@@ -164,7 +164,7 @@ def do_setup():
install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'], install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'],
package_data={ package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl', '': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'*.h', 'ChangeLog'], '*.h', '*.cpp', 'ChangeLog'],
'theano.misc': ['*.sh'] 'theano.misc': ['*.sh']
}, },
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'], scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
...@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib', ...@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""", """Name of the cuda blas library for the linker.""",
StrParam('cublas')) StrParam('cublas'))
AddConfigVar('lib.cumem', AddConfigVar('lib.cnmem',
"""Do we enable cumem or not.""", """Do we enable cnmem or not.""",
# We should not mix both allocator, so we can't override # We should not mix both allocator, so we can't override
BoolParam(False, allow_override=False), BoolParam(False, allow_override=False),
in_c_key=False) in_c_key=False)
...@@ -385,7 +385,7 @@ def use(device, ...@@ -385,7 +385,7 @@ def use(device,
try: try:
if (device != 'gpu') and not pycuda_init_dev: if (device != 'gpu') and not pycuda_init_dev:
assert isinstance(device, int) assert isinstance(device, int)
gpu_init(device, config.lib.cumem) gpu_init(device, config.lib.cnmem)
use.device_number = device use.device_number = device
assert active_device_number() == device assert active_device_number() == device
else: else:
...@@ -398,7 +398,7 @@ def use(device, ...@@ -398,7 +398,7 @@ def use(device,
cuda_ndarray.cuda_ndarray.select_a_gpu() cuda_ndarray.cuda_ndarray.select_a_gpu()
use.device_number = active_device_number() use.device_number = active_device_number()
# This is needed to initialize the cublas handle. # This is needed to initialize the cublas handle.
gpu_init(use.device_number, config.lib.cumem) gpu_init(use.device_number, config.lib.cnmem)
if test_driver: if test_driver:
import theano.sandbox.cuda.tests.test_driver import theano.sandbox.cuda.tests.test_driver
...@@ -411,8 +411,9 @@ def use(device, ...@@ -411,8 +411,9 @@ def use(device,
" this property") " this property")
if config.print_active_device: if config.print_active_device:
print("Using gpu device %d: %s" % ( cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
active_device_number(), active_device_name()), file=sys.stderr) print("Using gpu device %d: %s (cnmem is %s)" % (
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr)
if device_properties(use.device_number)['regsPerBlock'] < 16384: if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too much register per bloc at many places # We will try to use too much register per bloc at many places
# when there is only 8k register per multi-processor. # when there is only 8k register per multi-processor.
......
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cnmem.h"
#include <cstddef>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
// Allocation granularity in bytes; block sizes are meant to be rounded up to
// a multiple of this. NOTE(review): not referenced anywhere in the visible
// portion of this file -- confirm it is used elsewhere or remove it.
#define CNMEM_GRANULARITY 512
///////////////////////////////////////////////////////////////////////////////////////////////////
///
/// \brief Translate a cnmemStatus_t into its symbolic name.
///
/// Any value outside the known set maps to "CNMEM_STATUS_UNKNOWN_ERROR".
/// The returned pointer refers to a string literal and must not be freed.
///
extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
    if( status == CNMEM_STATUS_SUCCESS ) {
        return "CNMEM_STATUS_SUCCESS";
    }
    if( status == CNMEM_STATUS_CUDA_ERROR ) {
        return "CNMEM_STATUS_CUDA_ERROR";
    }
    if( status == CNMEM_STATUS_INVALID_ARGUMENT ) {
        return "CNMEM_STATUS_INVALID_ARGUMENT";
    }
    if( status == CNMEM_STATUS_MEMORY_LEAK ) {
        return "CNMEM_STATUS_MEMORY_LEAK";
    }
    if( status == CNMEM_STATUS_NOT_INITIALIZED ) {
        return "CNMEM_STATUS_NOT_INITIALIZED";
    }
    if( status == CNMEM_STATUS_OUT_OF_MEMORY ) {
        return "CNMEM_STATUS_OUT_OF_MEMORY";
    }
    return "CNMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// --- Debug logging --------------------------------------------------------
// Flip the "#if 0" below to "#if 1" to enable error logging (with a stack
// backtrace on POSIX). By default CNMEM_DEBUG_ERROR compiles to nothing.
#if 0
#ifdef WIN32
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
} while(0)
#else
#include <execinfo.h>
// Print the current call stack to stderr (POSIX-only debug helper).
static inline void printBacktrace() {
void *stackBuffer[64];
int numAddresses = backtrace((void**) &stackBuffer, 64);
char **addresses = backtrace_symbols(stackBuffer, numAddresses);
for( int i = 0 ; i < numAddresses ; ++i ) {
fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
}
free(addresses);
}
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "Backtrace:\n"); \
printBacktrace(); \
} while(0)
#endif
#else
#define CNMEM_DEBUG_ERROR(...)
#endif
// Verbose tracing of cudaMalloc/cudaFree calls; disabled by default.
#if 0
#define CNMEM_DEBUG_INFO printf
#else
#define CNMEM_DEBUG_INFO(...)
#endif
#if 0 // Enable/disable assertions
#include <cassert>
#define CNMEM_ASSERT assert
#else
#define CNMEM_ASSERT(...)
#endif
// --- Error-propagation helpers ---------------------------------------------
// File-wide convention: every fallible call is wrapped by one of the macros
// below so that a failure logs (in debug builds) and returns a cnmemStatus_t
// from the enclosing function.
// Return "error" from the enclosing function unless "cond" holds.
#define CNMEM_CHECK_TRUE(cond, error) do { \
if( !(cond) ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \
return error; \
} \
} while(0)
// Propagate a failing cnmemStatus_t from "call".
#define CNMEM_CHECK(call) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
return status; \
} \
} while(0)
// Same as CNMEM_CHECK, but releases "mutex" before returning on failure.
#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
(mutex).unlock(); \
return status; \
} \
} while(0)
// Map a CUDA error to a cnmemStatus_t: allocation failures become
// CNMEM_STATUS_OUT_OF_MEMORY, anything else CNMEM_STATUS_CUDA_ERROR.
#define CNMEM_CHECK_CUDA(call) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
// CUDA check that also releases "mutex" before returning on failure.
#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
#ifdef WIN32
// Run a Win32 call and translate a set GetLastError() into "error_code".
#define CNMEM_CHECK_WIN32(call, error_code) do { \
SetLastError(0); /* Clean the flag. */ \
call; \
DWORD status = GetLastError(); \
if( status ) \
return error_code; \
} while(0)
#else
// Translate a non-zero pthread return code into "error_code".
#define CNMEM_CHECK_PTHREAD(call, error_code) do { \
int status = call; \
if( status ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \
return error_code; \
} \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cnmem {
/// Round "m" up to the nearest multiple of "n" (n must be positive).
static inline std::size_t ceilInt(std::size_t m, std::size_t n) {
    CNMEM_ASSERT(n > 0);
    std::size_t quotient = (m + n - 1) / n;
    return quotient * n;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A thin platform abstraction over a mutual-exclusion lock: a
/// CRITICAL_SECTION on Windows, a pthread mutex elsewhere. All methods
/// report failure through a cnmemStatus_t instead of throwing. initialize()
/// must be called before lock()/unlock(), and finalize() when done.
class Mutex {
#ifdef WIN32
CRITICAL_SECTION mCriticalSection;
#else
pthread_mutex_t mMutex;
#endif
public:
/// Initialize the mutex.
cnmemStatus_t initialize();
/// Finalize the mutex.
cnmemStatus_t finalize();
/// Lock the mutex.
cnmemStatus_t lock() const;
/// Unlock the mutex.
cnmemStatus_t unlock() const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Create the underlying OS lock. Must run before any lock()/unlock(); the
/// Manager constructor calls it. The disabled "#if 0" branch would create a
/// recursive pthread mutex instead of the default non-recursive one.
cnmemStatus_t Mutex::initialize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
#if 0
pthread_mutexattr_t attr;
CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Destroy the OS lock. No lock()/unlock() calls may follow this.
cnmemStatus_t Mutex::finalize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Acquire the lock. lock() is logically const, so both branches must cast
/// away const on the member before handing it to the OS primitive.
cnmemStatus_t Mutex::lock() const {
#ifdef WIN32
    // Fix: the cast was missing here (unlike initialize()/finalize() and the
    // pthread branch); in a const member function "&mCriticalSection" is a
    // const pointer and EnterCriticalSection() would not compile on MSVC.
    CNMEM_CHECK_WIN32(EnterCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release the lock. See lock() for why the const-stripping casts are needed.
cnmemStatus_t Mutex::unlock() const {
#ifdef WIN32
    // Fix: missing const-stripping cast (same Windows compile error as lock()).
    CNMEM_CHECK_WIN32(LeaveCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
///
/// \brief A node in a singly linked list describing one device memory region.
///
/// Blocks live either in a manager's "used" list or in its "free" list. A
/// block flagged as "head" marks the start of an underlying raw allocation
/// (cudaMalloc or parent->allocate); free blocks are never merged across a
/// head boundary.
///
class Block {
    char *mData;        ///< Pointer to the memory region on the device.
    std::size_t mSize;  ///< Size of the region in bytes.
    Block *mNext;       ///< Successor in the linked list (NULL at the tail).
    bool mIsHead;       ///< True if this block starts a raw allocation.

public:
    /// Build a block describing the half-open range [data, data+size).
    Block(char *data, std::size_t size, Block *next, bool isHead)
        : mData(data), mSize(size), mNext(next), mIsHead(isHead) {}

    /// Read-only access to the device pointer.
    inline const char* getData() const { return mData; }
    /// Mutable access to the device pointer.
    inline char* getData() { return mData; }
    /// The size of the region in bytes.
    inline std::size_t getSize() const { return mSize; }
    /// The successor block (read-only).
    inline const Block* getNext() const { return mNext; }
    /// The successor block (mutable).
    inline Block* getNext() { return mNext; }
    /// Does this block start a raw allocation?
    inline bool isHead() const { return mIsHead; }

    /// Re-link this block to a new successor.
    inline void setNext(Block *next) { mNext = next; }
    /// Resize the region (used when splitting or merging blocks).
    inline void setSize(std::size_t size) { mSize = size; }
    /// Mark or unmark this block as the start of a raw allocation.
    inline void setHeadFlag(bool isHead) { mIsHead = isHead; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
///
/// \brief Pool allocator for GPU device memory.
///
/// Managers form a tree: one root per device (see getRootManager), with
/// children attached per CUDA stream. A child obtains memory from its
/// parent and may "steal" free memory from siblings through the parent.
/// Every manager owns its own mutex; members whose names end in "Unsafe"
/// leave locking to the caller.
///
class Manager {
/// The parent manager.
Manager *mParent;
/// The children managers.
std::vector<Manager*> mChildren;
/// The GPU device where the memory is allocated.
int mDevice;
/// The stream this manager is associated with. It could be NULL.
cudaStream_t mStream;
/// Is the stream blocking?
bool mIsStreamBlocking;
/// The list of used blocks.
Block *mUsedBlocks;
/// The list of free blocks.
Block *mFreeBlocks;
/// The managed memory size.
std::size_t mSize;
/// The flags.
unsigned mFlags;
/// To support multi-threading. Each manager has its own mutex.
Mutex mMutex;
public:
/// The root manager for a given device.
static inline Manager& getRootManager(int device) { return getRootManagers()[device]; }
/// The list of all the root managers.
static std::vector<Manager>& getRootManagers();
public:
/// Create an unitialized manager.
Manager();
/// Dtor.
~Manager();
/// Allocate a block of memory.
cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
/// Release a block of memory.
cnmemStatus_t release(void *ptr);
/// Release memory. It returns true if we have no memory leak.
cnmemStatus_t releaseAllUnsafe(bool &memoryLeak);
/// Reserve memory for a manager.
cnmemStatus_t reserve(std::size_t size);
/// Steal memory from another manager.
cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
/// Print the full memory state.
cnmemStatus_t printMemoryState(FILE *file) const;
/// The amount of used memory.
inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
return getMemoryUnsafe(usedMemory, mUsedBlocks);
}
/// The amount of free memory.
inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
return getMemoryUnsafe(freeMemory, mFreeBlocks);
}
/// Get a specific child based on the stream id.
cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
/// Get a specific child based on its index.
cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
/// Add a new child.
cnmemStatus_t addChild(Manager *manager);
/// The number of children.
cnmemStatus_t getNumChildren(std::size_t &numChildren) const;
/// The associated device.
inline int getDevice() const { return mDevice; }
/// The flags.
inline unsigned getFlags() const { return mFlags; }
/// Get the mutex.
inline const Mutex* getMutex() const { return &mMutex; }
/// The size allocated to that manager.
inline std::size_t getSize() const { return mSize; }
/// The CUDA stream.
inline cudaStream_t getStream() const { return mStream; }
/// Define the parent.
inline void setParent(Manager *parent) { mParent = parent; }
/// Define the device.
inline void setDevice(int device) { mDevice = device; }
/// Define the stream. Also records whether the stream synchronizes with the
/// legacy NULL stream (mIsStreamBlocking); allocateBlockUnsafe forwards that
/// flag when requesting memory from the parent.
inline cnmemStatus_t setStream(cudaStream_t stream) {
mStream = stream;
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
mIsStreamBlocking = false;
#else
unsigned flags = 0;
CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
#endif
return CNMEM_STATUS_SUCCESS;
}
/// Define the flags.
inline void setFlags(unsigned flags) { mFlags = flags; }
private:
/// The member functions below which are marked "Unsafe" are not thread-safe when called on a
/// same Manager object. Make sure they are called by a single thread in that case.
/// Allocate a new block and add it to the free list.
cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Release a block from the active list.
cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
/// Find the best free node based on the size.
cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Extract a node from the list of free blocks.
cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen);
/// Give a free block from that manager.
cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// Steal a block from another manager.
cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// The memory consumption of a list.
cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const;
/// Print an internal linked list.
cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Build an empty, unattached manager (no device, no parent, empty lists).
/// NOTE(review): the status returned by mMutex.initialize() is discarded, so
/// a failed mutex initialization goes unnoticed.
Manager::Manager()
: mParent(NULL)
, mChildren()
, mDevice(-1)
, mStream(NULL)
, mIsStreamBlocking(false)
, mUsedBlocks(NULL)
, mFreeBlocks(NULL)
, mSize(0)
, mFlags(CNMEM_FLAGS_DEFAULT)
, mMutex() {
mMutex.initialize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release everything the manager still holds. Cleanup is skipped entirely
/// when the manager was never attached to a device (mDevice == -1) or when
/// the device cannot be made current; any detected leak is ignored here.
Manager::~Manager() {
if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it.
return;
}
bool memoryLeak;
releaseAllUnsafe(memoryLeak);
mMutex.finalize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Register a child manager (thread-safe). The child is owned by this
/// manager from now on: releaseAllUnsafe() deletes all registered children.
cnmemStatus_t Manager::addChild(Manager *manager) {
CNMEM_CHECK(mMutex.lock());
mChildren.push_back(manager);
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Allocate "size" bytes and return the pointer in "ptr" (thread-safe).
/// Strategy: best-fit from the free list; if nothing fits and growth is
/// allowed (no CNMEM_FLAGS_CANNOT_GROW), obtain a new block from the parent
/// or cudaMalloc. On failure "ptr" is NULL and CNMEM_STATUS_OUT_OF_MEMORY is
/// returned. When "isBlocking" is false the stream is synchronized first so
/// the returned buffer cannot still be in flight.
cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) {
CNMEM_CHECK(mMutex.lock());
// If the client is not blocking, we have to explicitly synchronize before giving one buffer.
if( !isBlocking ) {
CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex);
}
// Find the best fit.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex);
// If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) {
CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex);
}
// Make sure we do have a block or quit.
if( !best ) {
ptr = NULL;
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Split the free block if needed.
CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex);
// Push the node to the list of used nodes.
best->setNext(mUsedBlocks);
mUsedBlocks = best;
// Return the new pointer into memory.
ptr = mUsedBlocks->getData();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Grow the pool: obtain a fresh "head" block of "size" bytes from the
/// parent manager (or straight from cudaMalloc at the root) and insert it
/// into the free list, which is kept sorted by device address. On success
/// "curr" points at the new block and "prev" at its predecessor (NULL when
/// the block becomes the new list head).
cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) {
// Reset the outputs.
curr = prev = NULL;
// Try to allocate data from the parent or the device.
void *data = NULL;
if( mParent ) {
CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking));
}
else {
CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size);
CNMEM_CHECK_CUDA(cudaMalloc(&data, size));
CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data);
}
// If it failed, there's an unexpected issue.
CNMEM_ASSERT(data);
// We have data, we now need to add it to the list of free nodes. We keep the list sorted.
Block *next = mFreeBlocks;
for( ; next && next->getData() < data ; next = next->getNext() ) {
prev = next;
}
// NOTE(review): with a throwing operator new this NULL check is dead code;
// kept as-is since the file checks "new" results throughout.
curr = new Block((char*) data, size, next, true);
if( !curr ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Remove "curr" (whose free-list predecessor is "prev") from the free list,
/// keeping exactly "size" bytes in it. When the block is larger, the tail
/// remainder stays in the free list as a new block whose head flag is set to
/// "stolen" (so a stolen remainder is never merged back across the split).
cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) {
// We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
Block *next;
if( curr->getSize() == size ) {
next = curr->getNext();
}
else {
std::size_t remaining = curr->getSize()-size;
Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen);
if( !newBlock ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
next = newBlock;
curr->setSize(size);
}
// Redo the "branching" in the nodes.
if( prev ) {
prev->setNext(next);
}
else {
mFreeBlocks = next;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Best-fit search: walk the free list and pick the smallest block whose
/// size is at least "size". On return "best" is the chosen block and "prev"
/// its predecessor in the list; both are NULL when nothing is big enough.
/// Always reports success -- "no block found" is not an error here.
cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) {
    best = NULL;
    prev = NULL;
    Block *before = NULL;
    for( Block *candidate = mFreeBlocks ; candidate ; candidate = candidate->getNext() ) {
        bool bigEnough = candidate->getSize() >= size;
        bool tighterFit = !best || candidate->getSize() < best->getSize();
        if( bigEnough && tighterFit ) {
            best = candidate;
            prev = before;
        }
        before = candidate;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Find the child manager associated with "stream" (thread-safe). Returns
/// CNMEM_STATUS_INVALID_ARGUMENT -- leaving "manager" untouched -- when no
/// child uses that stream.
cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const {
CNMEM_CHECK(mMutex.lock());
std::size_t i = 0, numChildren = mChildren.size();
for( ; i < numChildren ; ++i ) {
if( mChildren[i]->mStream == stream ) {
manager = mChildren[i];
break;
}
}
CNMEM_CHECK(mMutex.unlock());
return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Fetch the i-th child (thread-safe). "manager" is written only when the
/// index is in range; otherwise CNMEM_STATUS_INVALID_ARGUMENT is returned.
cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const {
    CNMEM_CHECK(mMutex.lock());
    const bool inRange = i < mChildren.size();
    if( inRange ) {
        manager = mChildren[i];
    }
    CNMEM_CHECK(mMutex.unlock());
    return inRange ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Sum the sizes of all blocks in the list starting at "head" into "size".
/// Not synchronized on its own -- callers hold the mutex (hence "Unsafe").
cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const {
    size = 0;
    // Iterate through const pointers: the original cast to Block* existed
    // only to reach the non-const getNext(); the const overload suffices.
    for( const Block *curr = head ; curr ; curr = curr->getNext() ) {
        size += curr->getSize();
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
// NOTE(review): disabled dead code. As written it would not even compile:
// CNMEM_CHECK_OR_UNLOCK is missing its mutex argument and "status" is never
// declared. Repair before re-enabling, or delete the block outright.
cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const {
CNMEM_CHECK(mMutex.lock());
CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head));
CNMEM_CHECK(mMutex.unlock());
return status;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Thread-safe count of the registered children.
cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const {
CNMEM_CHECK(mMutex.lock());
numChildren = mChildren.size();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// The per-process table of root managers, indexed by device id (see
/// getRootManager). Built lazily as a function-local static, so it is
/// destroyed at process exit, running ~Manager for every device.
std::vector<Manager>& Manager::getRootManagers() {
static std::vector<Manager> managers;
return managers;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Donate a free region of at least "size" bytes (the "steal" path). The
/// stream is synchronized first so the donated memory is no longer in
/// flight; a best-fit block is then removed from the free list and its
/// data/size returned. The Block node itself is deleted because the memory
/// is handed over to the caller's bookkeeping.
cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) {
// Make sure the block is not in use any more. It could be too coarse grain and we may change
// it in the future.
CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream));
// Init the returned values to 0.
blockData = NULL;
blockSize = 0;
// Find the best node to steal and reserve it.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK(findBestBlockUnsafe(best, prev, size));
if( !best ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true));
blockData = best->getData();
blockSize = best->getSize();
// Release the memory used by that block.
delete best;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dump one block list (used or free) to "file": a header with the list
/// name and total byte count, then one line per node.
cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const {
    // Total size of the list: reuse the existing accumulation helper instead
    // of duplicating its loop here.
    std::size_t size = 0;
    CNMEM_CHECK(getMemoryUnsafe(size, head));
    fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
    // Const iteration: the casts to Block* in the original were only needed
    // to call the non-const accessors.
    for( const Block *curr = head ; curr ; curr = curr->getNext() ) {
        fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
            (std::size_t) curr,
            (std::size_t) curr->getData(),
            (std::size_t) curr->getSize(),
            (std::size_t) curr->getNext(),
            (std::size_t) curr->isHead ());
    }
    fprintf(file, "|\n");
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dump this manager's state to "file" (thread-safe): device, stream,
/// used/free byte totals and both block lists, then recurse up to the
/// parent so the whole chain is printed.
cnmemStatus_t Manager::printMemoryState(FILE *file) const {
CNMEM_CHECK(mMutex.lock());
std::size_t streamCode = (std::size_t) mStream;
std::size_t usedMemory, freeMemory;
CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex);
CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex);
fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
mParent ? "child" : "root",
mDevice,
streamCode,
usedMemory,
freeMemory);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex);
fprintf(file, "\n");
CNMEM_CHECK(mMutex.unlock());
if( mParent ) {
CNMEM_CHECK(mParent->printMemoryState(file));
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Return the buffer at "ptr" to the free list (thread-safe). Releasing
/// NULL is a successful no-op; a pointer that does not start a used block
/// yields CNMEM_STATUS_INVALID_ARGUMENT.
cnmemStatus_t Manager::release(void *ptr) {
// Skip if ptr is NULL.
if( ptr == NULL ) {
return CNMEM_STATUS_SUCCESS;
}
// Lock to make sure only one thread execute that fragment of code.
CNMEM_CHECK(mMutex.lock());
// Find the node in the list of used blocks.
Block *curr = mUsedBlocks, *prev = NULL;
for( ; curr && curr->getData() != ptr ; curr = curr->getNext() ) {
prev = curr;
}
// Make sure we have found a node.
if( curr == NULL ) {
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// We have the node so release it.
cnmemStatus_t result = releaseBlockUnsafe(curr, prev);
CNMEM_CHECK(mMutex.unlock());
return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Tear everything down: recursively release and delete all children, then
/// free every block this manager owns. "memoryLeaks" is set to true when
/// any block was still in use, i.e. the client never released it.
cnmemStatus_t Manager::releaseAllUnsafe(bool &memoryLeaks) {
// Destroy the children if any.
bool ok = true;
for( std::size_t i = 0; i < mChildren.size(); ++i ) {
Manager *child = mChildren[i];
bool tmp;
// NOTE(review): an early CNMEM_CHECK return here leaves later children
// undeleted -- acceptable in this panic path, but worth confirming.
CNMEM_CHECK(child->releaseAllUnsafe(tmp));
ok = ok && !tmp;
delete child;
}
mChildren.clear();
// We have some issues when integrating into some libraries. This has to fixed in the libs.
memoryLeaks = !ok || mUsedBlocks;
// Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
if( !mParent ) {
while( mUsedBlocks ) {
CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL));
}
}
// We should be having only free blocks that are head blocks. Release those blocks.
while( mFreeBlocks ) {
if( mParent ) {
CNMEM_CHECK(mParent->release(mFreeBlocks->getData()));
}
else if( mFreeBlocks->isHead() ) {
void *data = mFreeBlocks->getData();
CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data);
CNMEM_CHECK_CUDA(cudaFree(data));
CNMEM_DEBUG_INFO(">> success\n");
}
Block *block = mFreeBlocks;
mFreeBlocks = mFreeBlocks->getNext();
delete block;
}
// We shouldn't have any used block left. Or, it means the user is causing memory leaks!
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Move "curr" (whose used-list predecessor is "prev") from the used list
/// back into the free list, which stays sorted by device address. The block
/// is coalesced with adjacent free neighbors where possible, but never
/// across a "head" boundary (the start of a distinct raw allocation).
cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) {
// The current node cannot be NULL!
CNMEM_ASSERT(curr != NULL);
// Change the connection of the node.
if( prev ) {
prev->setNext(curr->getNext());
}
else {
mUsedBlocks = curr->getNext();
}
// Find the location where this block should be added to the free list.
prev = NULL;
Block *iter = mFreeBlocks;
for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) {
prev = iter;
}
// Keep track of the successor of pred. We may lose track of it in the following "else".
Block *next = prev ? prev->getNext() : mFreeBlocks;
// We first check if we can merge the block with its predecessor in the list and curr can be merged.
if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) {
prev->setSize(prev->getSize() + curr->getSize());
delete curr;
curr = prev;
}
else if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
// Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) {
curr->setSize(curr->getSize() + next->getSize());
curr->setNext(next->getNext());
delete next;
}
else {
curr->setNext(next);
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Pre-allocate "size" bytes for this manager (thread-safe): one new head
/// block is created and parked in the free list, and mSize records the
/// reserved amount.
cnmemStatus_t Manager::reserve(std::size_t size) {
    CNMEM_CHECK(mMutex.lock());
    Block *freshBlock = NULL;
    Block *predecessor = NULL;
    CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(freshBlock, predecessor, size), mMutex);
    mSize = size;
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Last-resort allocation: obtain "size" bytes by stealing free memory from
/// the children (or, for a child, from the parent's other children). The
/// stolen region is recorded in this manager's used list as a new head
/// block. Fails with CNMEM_STATUS_INVALID_ARGUMENT when stealing is
/// disabled via CNMEM_FLAGS_CANNOT_STEAL.
cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) {
// If we cannot steal, don't even try.
if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) {
stolen = NULL;
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// The stolen block.
void *data = NULL; std::size_t dataSize = 0;
if( !mChildren.empty() ) {
CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size));
}
else if( mParent ) {
CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size));
}
// Make sure we do have a block of memory or quit.
if( !data ) {
stolen = NULL;
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Push the block in the used list.
mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true);
if( !mUsedBlocks ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Return the new pointer into memory.
stolen = data;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Take a free region of "size" bytes from one of the children: each child
/// is asked in turn to give up a block (giveBlockUnsafe), then this
/// manager's own "used" bookkeeping is updated by splitting the used node
/// that contains the stolen range into up to three nodes (before / stolen /
/// after). Fixes vs. original: the signature now spells std::size_t like
/// the in-class declaration (it used ::size_t), and the dead locals "prev"
/// and "result" (written but never read) are removed.
cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size) {
    // No block found and no room to grow. Try to steal from a children (if we have any).
    data = NULL;
    for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
        Manager *child = mChildren[i];
        if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
            break;
        }
    }
    // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
    if( !data ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // We have got a node from a children. We need to update our "used" list before we can do
    // anything with it.
    Block *curr = mUsedBlocks;
    for( ; curr ; curr = curr->getNext() ) {
        if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
            break;
        }
    }
    // Curr points to the node which contains that memory region.
    CNMEM_ASSERT(curr);
    // If it is exactly the same memory region, we are done!!!
    if( curr->getData() == data && curr->getSize() == dataSize ) {
        return CNMEM_STATUS_SUCCESS;
    }
    // Track the block after curr.
    Block *next = curr->getNext();
    // We may have up to 3 blocks: before the stolen range, the range itself, and after it.
    std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
    std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
    // If we have no space between curr->getData and block->getData.
    if( sizeBefore == 0 ) {
        curr->setSize(dataSize);
    }
    else {
        curr->setSize(sizeBefore);
        Block *block = new Block((char*) data, dataSize, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
        // NOTE(review): this advances the output parameters past the stolen
        // range; the caller (stealUnsafe) then records data/dataSize in its
        // used list -- verify this mutation is intended.
        data = (char*) data + dataSize;
        dataSize = sizeAfter;
    }
    // We have space at the end so we may need to add a new node.
    if( sizeAfter > 0 ) {
        Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cnmem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize the library: create one root manager per listed device (sized either explicitly
// or to half the device's global memory) and one child manager per named stream.
// Not thread-safe; must be called once before any other cnmem function.
// NOTE(review): on an early error return via CNMEM_CHECK/CNMEM_CHECK_CUDA inside the loop,
// the caller's current device is NOT restored and an already-allocated child may leak —
// acceptable here since init failure is fatal, but worth confirming.
cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
    // Make sure we have at least one device declared.
    CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
    // Find the largest ID of the device.
    int maxDevice = 0;
    for( int i = 0 ; i < numDevices ; ++i ) {
        if( devices[i].device > maxDevice ) {
            maxDevice = devices[i].device;
        }
    }
    // Allocate enough managers. The vector is indexed by device ID, so IDs may be sparse.
    CNMEM_CHECK_TRUE(maxDevice >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
    std::vector<cnmem::Manager> &managers = cnmem::Manager::getRootManagers();
    managers.resize(maxDevice+1);
    // Create a root manager for each device and create the children.
    int oldDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice));
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device));
        std::size_t size = devices[i].size;
        // size == 0 means "choose for me": reserve half of the device's global memory.
        if( size == 0 ) {
            cudaDeviceProp props;
            CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT);
        cnmem::Manager &manager = cnmem::Manager::getRootManager(devices[i].device);
        manager.setDevice(devices[i].device);
        manager.setFlags(flags);
        // Round the reservation up to the allocation granularity before reserving.
        size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
        CNMEM_CHECK(manager.reserve(size));
        // One child manager per named stream; children may always grow (they borrow from
        // the root), hence CNMEM_FLAGS_CANNOT_GROW is stripped from their flags.
        for( int j = 0 ; j < devices[i].numStreams ; ++j ) {
            cnmem::Manager *child = new cnmem::Manager;
            child->setParent(&manager);
            child->setDevice(devices[i].device);
            child->setStream(devices[i].streams[j]);
            child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW);
            // Pre-reserve per-stream memory only when the caller asked for it.
            if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) {
                CNMEM_CHECK(child->reserve(devices[i].streamSizes[j]));
            }
            CNMEM_CHECK(manager.addChild(child));
        }
    }
    // Restore the device that was current when we were called.
    CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Register a new named stream with the current device's root manager by creating a child
// manager for it. Thread-safety is delegated to Manager::addChild.
// Returns CNMEM_STATUS_NOT_INITIALIZED if cnmemInit was not called,
// CNMEM_STATUS_INVALID_ARGUMENT for a NULL stream, or any error from addChild.
cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    cnmem::Manager &root = cnmem::Manager::getRootManager(device);
    cnmem::Manager *child = new cnmem::Manager;
    child->setParent(&root);
    child->setDevice(device);
    child->setStream(stream);
    // Children may always grow (they borrow from the root), so strip CANNOT_GROW.
    child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW);
    // Fix: propagate addChild failures instead of silently discarding the status —
    // consistent with how cnmemInit registers its children.
    CNMEM_CHECK(root.addChild(child));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Tear the library down: release every root manager's memory (visiting each device in turn),
// drop all managers, and restore the caller's current device. Not thread-safe.
// Returns CNMEM_STATUS_MEMORY_LEAK if any manager still held outstanding allocations.
cnmemStatus_t cnmemFinalize() {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    // Remember the caller's device so we can put it back afterwards.
    int previousDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&previousDevice));
    std::vector<cnmem::Manager> &roots = cnmem::Manager::getRootManagers();
    bool leakDetected = false;
    for( std::size_t idx = 0 ; idx < roots.size() ; ++idx ) {
        // Each manager must be released with its own device active.
        CNMEM_CHECK_CUDA(cudaSetDevice(roots[idx].getDevice()));
        bool leakOnDevice;
        CNMEM_CHECK(roots[idx].releaseAllUnsafe(leakOnDevice));
        leakDetected = leakDetected || leakOnDevice;
    }
    roots.clear();
    CNMEM_CHECK_CUDA(cudaSetDevice(previousDevice));
    return leakDetected ? CNMEM_STATUS_MEMORY_LEAK : CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Allocate "size" bytes from the manager attached to "stream" (the root manager when stream
// is NULL). If the manager is out of memory, fall back to stealing a block from another
// manager, which requires locking the root and every child. Thread-safe.
cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    // A NULL ptr together with size 0 is treated as a no-op request.
    if( !ptr && !size )
        return CNMEM_STATUS_SUCCESS;
    CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT);
    CNMEM_CHECK_TRUE(size, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Resolve the manager serving this stream (root for the NULL stream).
    cnmem::Manager &root = cnmem::Manager::getRootManager(device);
    cnmem::Manager *manager = &root;
    if( stream ) {
        CNMEM_CHECK(root.getChildFromStream(manager, stream));
    }
    CNMEM_ASSERT(manager);
    // Requests are rounded up to the allocation granularity.
    size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
    cnmemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
        // Try to acquire locks on all the children.
        std::size_t numChildren;
        CNMEM_CHECK(root.getNumChildren(numChildren));
        std::vector<const cnmem::Mutex*> mutexes(numChildren);
        std::size_t numLocked = 0;
        for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
            cnmem::Manager *child;
            CNMEM_CHECK(root.getChild(child, i));
            mutexes[numLocked] = child->getMutex();
            if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
                break;
            }
        }
        // One lock failed, quit. Reduce the damage as much as possible, though:
        // release every lock we did acquire before bailing out.
        if( numLocked != numChildren ) {
            for( std::size_t i = 0 ; i < numLocked ; ++i ) {
                // Best-effort unlock; the status is deliberately ignored here since we are
                // already on an error path.
                cnmemStatus_t lockStatus = mutexes[i]->unlock();
            }
            return CNMEM_STATUS_UNKNOWN_ERROR;
        }
        // Grab the lock on the root, first.
        // NOTE(review): children are locked before the root here — confirm every other code
        // path acquires these mutexes in the same relative order, otherwise deadlock is possible.
        const cnmem::Mutex *rootMutex = root.getMutex();
        CNMEM_CHECK(rootMutex->lock());
        // We acquired all the lock so we try to steal a node from another child.
        if( numLocked == mutexes.size() ) {
            result = manager->stealUnsafe(ptr[0], size);
        }
        for( std::size_t i = 0 ; i < numLocked ; ++i ) {
            cnmemStatus_t lockStatus = mutexes[i]->unlock();
            if( lockStatus != CNMEM_STATUS_SUCCESS ) {
                // Starting from now we are panicking!!! One lock failed to be released, we try
                // we others. We could also give up because we are already screwed. I don't know
                // what's best! Comment are welcome.
                result = lockStatus;
            }
        }
        CNMEM_CHECK(rootMutex->unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Release a pointer previously returned by cnmemMalloc for the same stream. Freeing NULL is
// a no-op, mirroring free()/cudaFree semantics. Thread-safe.
cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    if( ptr == NULL ) {
        return CNMEM_STATUS_SUCCESS;
    }
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    // Route the request to the manager owning that stream (root for the NULL stream).
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *target = &rootManager;
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Report the free and total memory managed by the manager attached to "stream" (the root
// manager for the NULL stream). Total is derived as used + free, both read under the
// manager's lock. Linear in the number of blocks — avoid in hot paths.
cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT);
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    // Resolve the manager serving this stream.
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *target = &rootManager;
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    // Query both counters under the manager's lock so they are consistent with each other.
    const cnmem::Mutex *guard = target->getMutex();
    CNMEM_CHECK(guard->lock());
    CNMEM_CHECK_OR_UNLOCK(target->getFreeMemoryUnsafe(*freeMem), *guard);
    size_t usedSize;
    CNMEM_CHECK_OR_UNLOCK(target->getUsedMemoryUnsafe(usedSize), *guard);
    CNMEM_CHECK(guard->unlock());
    *totalMem = usedSize + *freeMem;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dump the block lists of the manager attached to "stream" (root for the NULL stream) to
// "file", for debugging. Thread-safety is delegated to Manager::printMemoryState.
cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    // Resolve the manager for the requested stream.
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *target = &rootManager;
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->printMemoryState(file);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
...@@ -35,16 +35,16 @@ ...@@ -35,16 +35,16 @@
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32) #if defined(_MSC_VER) || defined(WIN32)
#ifdef CUMEM_DLLEXPORT #ifdef CNMEM_DLLEXPORT
#define CUMEM_API __declspec(dllexport) #define CNMEM_API __declspec(dllexport)
#else #else
#define CUMEM_API __declspec(dllimport) #define CNMEM_API __declspec(dllimport)
#endif #endif
#else #else
#define CUMEM_API #define CNMEM_API
#endif #endif
#define CUMEM_VERSION 100 // It corresponds to 1.0.0 #define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
...@@ -54,29 +54,27 @@ extern "C" { ...@@ -54,29 +54,27 @@ extern "C" {
typedef enum typedef enum
{ {
CUMEM_STATUS_SUCCESS = 0, CNMEM_STATUS_SUCCESS = 0,
CUMEM_STATUS_CUDA_ERROR, CNMEM_STATUS_CUDA_ERROR,
CUMEM_STATUS_INVALID_ARGUMENT, CNMEM_STATUS_INVALID_ARGUMENT,
CUMEM_STATUS_MEMORY_LEAK, CNMEM_STATUS_MEMORY_LEAK,
CUMEM_STATUS_NOT_INITIALIZED, CNMEM_STATUS_NOT_INITIALIZED,
CUMEM_STATUS_OUT_OF_MEMORY, CNMEM_STATUS_OUT_OF_MEMORY,
CUMEM_STATUS_UNKNOWN_ERROR CNMEM_STATUS_UNKNOWN_ERROR
} cumemStatus_t; } cnmemStatus_t;
/* ********************************************************************************************* */ /* ********************************************************************************************* */
typedef enum typedef enum
{ {
CUMEM_FLAGS_DEFAULT = 0, /// Default flags. CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CUMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CUMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory. CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
CUMEM_FLAGS_USE_UNIFIED_MEM = 4, /// Use Managed Memory for allocating memory. } cnmemManagerFlags_t;
CUMEM_FLAGS_MEM_ATTACH_HOST = 8 /// Host-only visible unified memory. (valid only for unified memory!)
} cumemManagerFlags_t;
/* ********************************************************************************************* */ /* ********************************************************************************************* */
typedef struct cumemDevice_t_ typedef struct cnmemDevice_t_
{ {
/** The device number. */ /** The device number. */
int device; int device;
...@@ -86,10 +84,10 @@ typedef struct cumemDevice_t_ ...@@ -86,10 +84,10 @@ typedef struct cumemDevice_t_
int numStreams; int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */ /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams; cudaStream_t *streams;
/** The memory allocation granularity (in multiples of 512 B). If 0, the implementation chooses the size */ /** The size reserved for each streams. It can be 0. */
size_t granularity; size_t *streamSizes;
} cumemDevice_t; } cnmemDevice_t;
/** /**
* \brief Initialize the library and allocate memory on the listed devices. * \brief Initialize the library and allocate memory on the listed devices.
...@@ -99,47 +97,58 @@ typedef struct cumemDevice_t_ ...@@ -99,47 +97,58 @@ typedef struct cumemDevice_t_
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root * memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream. * manager for the device and a list of children, one for each named stream.
* *
* This function must be before any other function in the library. It has to be called by a * This function must be called before any other function in the library. It has to be called
* single thread since it is not thread-safe. * by a single thread since it is not thread-safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CUMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory, * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function. * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/ */
cumemStatus_t CUMEM_API cumemInit(int numDevices, const cumemDevice_t *devices, unsigned flags); cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/** /**
* \brief Release all the allocated memory. * \brief Release all the allocated memory.
* *
* This function must be called by a single thread and after all threads that called * This function must be called by a single thread and after all threads that called
* cumemMalloc/cumemFree have joined. This function is not thread-safe. * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_MEMORY_LEAK, if there are unreleased blocks in the memory queues, * CNMEM_STATUS_MEMORY_LEAK, if there are unreleased blocks in the memory queues,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemFinalize(); cnmemStatus_t CNMEM_API cnmemFinalize();
/** /**
* \brief Allocate memory. * \brief Allocate memory.
* *
* This function allocates memory and initializes a pointer to device memory. If no memory * This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CUMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe. * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
* *
* The behavior of that function is the following: * The behavior of that function is the following:
* *
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of * memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow * free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CUMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls * its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not * cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless * allowed to grow, the manager attempts to steal memory from one of its children (unless
* CUMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns * CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CUMEM_STATUS_OUT_OF_MEMORY. * CNMEM_STATUS_OUT_OF_MEMORY.
* *
* - If the stream is a named stream, the initial request goes to the memory manager associated * - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned. * with that stream. If a free node is available in the lists of that manager, it is returned.
...@@ -148,17 +157,18 @@ cumemStatus_t CUMEM_API cumemFinalize(); ...@@ -148,17 +157,18 @@ cumemStatus_t CUMEM_API cumemFinalize();
* *
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to * mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and it cannot be executed by more * make sure no kernel uses a given buffer before stealing it) and the execution is
* than one thread at the time (per device). * sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CUMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available, * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream); cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/** /**
* \brief Release memory. * \brief Release memory.
...@@ -167,31 +177,31 @@ cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream ...@@ -167,31 +177,31 @@ cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream
* thread safe. * thread safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemFree(void *ptr, cudaStream_t stream); cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */ /* ********************************************************************************************* */
/* Utility functions. */ /* Utility functions. */
/* ********************************************************************************************* */ /* ********************************************************************************************* */
/** /**
* \brief Returns the amount of memory managed by the root memory manager on devices. * \brief Returns the amount of memory managed by the memory manager associated with a stream.
* *
* The pointers used_mem and free_mem must point to memory regions of numDevices*sizeof(size_t) * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
* bytes. At the moment, this function has a complexity linear in the number of allocated blocks * xity linear in the number of allocated blocks so do not call it in performance critical
* so do not call it in performance critical sections. * sections.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem); cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/** /**
* \brief Print a list of nodes to a file. * \brief Print a list of nodes to a file.
...@@ -200,18 +210,18 @@ cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem); ...@@ -200,18 +210,18 @@ cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem);
* behaviour of the memory managers/application. It is thread safe. * behaviour of the memory managers/application. It is thread safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
* or free_mem == 0, * or free_mem == 0,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemPrintMemoryState(FILE *file); cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/** /**
* \brief Converts a cumemStatus_t value to a string. * \brief Converts a cnmemStatus_t value to a string.
*/ */
const char* cumemGetErrorString(cumemStatus_t status); const char* cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */ /* ********************************************************************************************* */
......
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
#include "cuda_ndarray.cuh" #include "cuda_ndarray.cuh"
#include "cumem.h" #include "cnmem.h"
#include "cumem.cpp" #include "cnmem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0 #define COMPUTE_GPU_MEM_USED 0
...@@ -71,20 +71,20 @@ void * device_malloc(size_t size) ...@@ -71,20 +71,20 @@ void * device_malloc(size_t size)
} }
///@TODO: thejaswi: link this option to a theano config variable? ///@TODO: thejaswi: link this option to a theano config variable?
static bool g_use_cumem = false; static bool g_use_cnmem = false;
static const int g_max_devices = 8; static const int g_max_devices = 8;
int initCumem(int card_number_provided, int card_nb) { int initCnmem(int card_number_provided, int card_nb) {
static bool cumemInitialized = false; static bool cnmemInitialized = false;
if(cumemInitialized) { if(cnmemInitialized) {
return 0; return 0;
} }
// On stderr to be at the same place as "Using gpu device..." // On stderr to be at the same place as "Using gpu device..."
fprintf(stderr, "Initializing cumem...\n"); fprintf(stderr, "Initializing cnmem...\n");
int numDevices = 0; int numDevices = 0;
cumemDevice_t devices[g_max_devices]; cnmemDevice_t devices[g_max_devices];
if(cudaGetDeviceCount(&numDevices) != cudaSuccess) { if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"initCumem: 'cudaGetDeviceCount' failed! Reason=%s\n", "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
return -1; return -1;
} }
...@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) { ...@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
devices[i].granularity = 0;
}else{ }else{
for(int i=0;i<numDevices;++i) { for(int i=0;i<numDevices;++i) {
...@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) { ...@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
devices[i].granularity = 0;
} }
} }
///@TODO: thejaswi: passing custom cumem flags? ///@TODO: thejaswi: passing custom cnmem flags?
cumemStatus_t status = cumemInit(numDevices, devices, CUMEM_FLAGS_DEFAULT); cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"initCumem: cumemInit call failed! Reason=%s. numdev=%d\n", "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
cumemGetErrorString(status), numDevices); cnmemGetErrorString(status), numDevices);
return -1; return -1;
} }
cumemInitialized = true; cnmemInitialized = true;
return 0; return 0;
} }
...@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose) ...@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose)
#endif #endif
void * rval=NULL; void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams? ///@TODO: thejaswi: support for multiple-streams?
if(g_use_cumem) { if(g_use_cnmem) {
cumemStatus_t status = cumemMalloc(&rval, size, NULL); cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
if(status != CUMEM_STATUS_SUCCESS) { if( size != 0 ) {
status = cnmemMalloc(&rval, size, NULL);
}
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, PyErr_Format(PyExc_MemoryError,
"Error allocating %zd bytes of device memory (%s).", "Error allocating %zd bytes of device memory (%s).",
size, cumemGetErrorString(status)); size, cnmemGetErrorString(status));
return NULL; return NULL;
} }
} }
...@@ -271,11 +272,11 @@ int device_free(void *ptr) ...@@ -271,11 +272,11 @@ int device_free(void *ptr)
} }
///@TODO: thejaswi: multi-stream support ///@TODO: thejaswi: multi-stream support
if(g_use_cumem) { if(g_use_cnmem) {
cumemStatus_t status = cumemFree(ptr, NULL); cnmemStatus_t status = cnmemFree(ptr, NULL);
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cumemFree call failed! Reason=%s\n", fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
cumemGetErrorString(status)); cnmemGetErrorString(status));
} }
} }
else { else {
...@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init(); static int cublas_init();
static void cublas_shutdown(); static void cublas_shutdown();
// Initialize the gpu. // Initialize the gpu.
// Takes two optional parameters, the device number and if we should use cumem. // Takes two optional parameters, the device number and if we should use cnmem.
// If the device number is provided, it sets that device to be the active device. // If the device number is provided, it sets that device to be the active device.
// If not provided (usually just to test whether the gpu is available at all), // If not provided (usually just to test whether the gpu is available at all),
// it does not set an active device. // it does not set an active device.
// Raises EnvironmentError or ValueError (as appropriate) if the initialization failed. // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
// cumem is threaded like a bool. If converted to 0, don't use cumem. Otherwise, use it. // cnmem is threaded like a bool. If converted to 0, don't use cnmem. Otherwise, use it.
PyObject * PyObject *
CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{ {
int card_nb = 0; int card_nb = 0;
int card_number_provided = 1; int card_number_provided = 1;
int cumem = 0; // 0 False, 1 True int cnmem = 0; // 0 False, 1 True
// if we're given something wildly invalid, this will throw a TypeError // if we're given something wildly invalid, this will throw a TypeError
PyArg_ParseTuple(args, "|ii", &card_nb, &cumem); PyArg_ParseTuple(args, "|ii", &card_nb, &cnmem);
if(cumem) if(cnmem)
g_use_cumem = true; g_use_cnmem = true;
if(PyTuple_Size(args) == 0) { if(PyTuple_Size(args) == 0) {
card_number_provided = 0; card_number_provided = 0;
...@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
if (cublas_init() == -1) if (cublas_init() == -1)
return NULL; return NULL;
} }
if(card_number_provided && g_use_cumem) { if(card_number_provided && g_use_cnmem) {
if(initCumem(card_number_provided, card_nb) == -1){ if(initCnmem(card_number_provided, card_nb) == -1){
return NULL; return NULL;
} }
} }
...@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { ...@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here // Don't handle errors here
cublas_shutdown(); cublas_shutdown();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
if(g_use_cumem) { if(g_use_cnmem) {
fprintf(stderr, "Shutting down cumem...\n"); fprintf(stderr, "Shutting down cnmem...\n");
cumemStatus_t status = cumemFinalize(); cnmemStatus_t status = cnmemFinalize();
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS && status != CNMEM_STATUS_MEMORY_LEAK) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cumemFinalize failed! Reason=%s\n", fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
cumemGetErrorString(status)); cnmemGetErrorString(status));
if(status == CUMEM_STATUS_CUDA_ERROR) { if(status == CNMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n", fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
} }
......
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cumem.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
// Default allocation granularity (bytes). All requests are rounded up to a
// multiple of this value; cumemInit() requires granularity % 512 == 0.
#define CUMEM_DEFAULT_GRANULARITY 512
// Uncomment to trace every cudaMalloc/cudaMallocManaged/cudaFree call.
//#define CUMEM_DEBUG
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cumem {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Propagate any non-success cumemStatus_t to the caller.
#define CUMEM_CHECK(call) do { \
    cumemStatus_t status = call; \
    if( status != CUMEM_STATUS_SUCCESS ) { \
        return status; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Same as CUMEM_CHECK but releases the given Lock before returning the error,
// so a failing call inside a critical section does not leave the mutex held.
#define CUMEM_CHECK_OR_UNLOCK_AND_RETURN(call, lock) do { \
    cumemStatus_t status = call; \
    if( status != CUMEM_STATUS_SUCCESS ) { \
        lock.unlock(); \
        return status; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Argument validation: return `error` when `cond` does not hold.
#define CHECK(cond, error) do { \
    if( !(cond) ) { \
        return error; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Translate CUDA runtime errors: allocation failures map to OUT_OF_MEMORY,
// anything else to CUMEM_STATUS_CUDA_ERROR (cudaGetLastError holds details).
#define CHECK_CUDA(call) do { \
    cudaError_t cuda_error = call; \
    if( cuda_error == cudaErrorMemoryAllocation ) { \
        return CUMEM_STATUS_OUT_OF_MEMORY; \
    } \
    else if( cuda_error != cudaSuccess ) { \
        return CUMEM_STATUS_CUDA_ERROR; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef WIN32
// Win32 critical-section APIs return void; failures are detected via GetLastError.
#define CHECK_WIN32(call, error_code) do { \
    SetLastError(0); /* Clean the flag. */ \
    call; \
    DWORD status = GetLastError(); \
    if( status ) \
        return error_code; \
} while(0)
#else
// pthread calls report failure through a non-zero int return value.
#define CHECK_PTHREAD(call, error_code) do { \
    int status = call; \
    if( status ) { \
        return error_code; \
    } \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Thin portability wrapper over a Win32 CRITICAL_SECTION or a pthread mutex.
/// Not recursive. initialize() must be called before lock()/unlock() and
/// finalize() before destruction.
/// NOTE(review): no caller of Mutex::initialize()/finalize() is visible in this
/// file (Manager default-constructs its m_mutex member) — verify that the OS
/// primitives are usable without explicit initialization on the target platforms.
class Mutex
{
#ifdef WIN32
    CRITICAL_SECTION m_critical_section;
#else
    pthread_mutex_t m_mutex;
#endif
public:
    /// Initialize the mutex.
    cumemStatus_t initialize();
    /// Finalize the mutex.
    cumemStatus_t finalize();
    /// Lock the mutex.
    cumemStatus_t lock();
    /// Unlock the mutex.
    cumemStatus_t unlock();
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Create the underlying OS mutex. Must be called before lock()/unlock().
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::initialize()
{
#ifdef WIN32
    CHECK_WIN32(InitializeCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    // A plain (non-recursive) mutex; a recursive one could be created with
    // pthread_mutexattr_settype(PTHREAD_MUTEX_RECURSIVE) if ever needed.
    CHECK_PTHREAD(pthread_mutex_init(&m_mutex, NULL), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Destroy the underlying OS mutex. The mutex must not be held.
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::finalize()
{
#ifdef WIN32
    CHECK_WIN32(DeleteCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_destroy(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Acquire the mutex (blocking).
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::lock()
{
#ifdef WIN32
    CHECK_WIN32(EnterCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_lock(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release the mutex. Must be called by the thread that holds it.
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::unlock()
{
#ifdef WIN32
    CHECK_WIN32(LeaveCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_unlock(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper that pairs a Mutex with explicit lock()/unlock() calls.
/// NOTE: this is NOT an RAII scope guard — the destructor does not unlock, and
/// calling unlock() before lock() dereferences a NULL m_mutex. Callers in this
/// file always call lock() first and unlock() on every exit path.
class Lock
{
    /// The mutex.
    Mutex *m_mutex;
public:
    /// Ctor.
    Lock() : m_mutex(NULL) {}
    /// Lock the mutex.
    inline cumemStatus_t lock(Mutex *mutex) { m_mutex = mutex; return m_mutex->lock(); }
    /// Unlock the mutex.
    inline cumemStatus_t unlock() { return m_mutex->unlock(); }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A node in a singly-linked list describing one contiguous region of device
/// memory. Managers keep two such lists (used and free), sorted by data
/// address. A "head" block is one whose data pointer came directly from
/// cudaMalloc/cudaMallocManaged (or from the parent manager) — coalescing in
/// release_block() never merges across a head boundary, and only head blocks
/// are handed to cudaFree.
class Block
{
    /// The pointer to the memory region on the device.
    char *m_data;
    /// The size of the memory buffer.
    std::size_t m_size;
    /// The prev/next blocks in the linked list of blocks.
    Block *m_next;
    /// Is it a head node (i.e. a node obtained from parent->allocate or cudaMalloc).
    bool m_is_head;
public:
    /// Create a block.
    Block(char *data, std::size_t size, Block *next, bool is_head)
        : m_data(data)
        , m_size(size)
        , m_next(next)
        , m_is_head(is_head)
    {}
    /// The data.
    inline const char* get_data() const { return m_data; }
    /// The data (mutable).
    inline char* get_data() { return m_data; }
    /// The size of the block.
    inline std::size_t get_size() const { return m_size; }
    /// The next block in the linked list.
    inline const Block* get_next() const { return m_next; }
    /// The next block in the linked list (mutable).
    inline Block* get_next() { return m_next; }
    /// Is it a head block.
    inline bool is_head() const { return m_is_head; }
    /// Change the next block.
    inline void set_next(Block *next) { m_next = next; }
    /// Change the size of the block.
    inline void set_size(std::size_t size) { m_size = size; }
    /// Set the head flag.
    inline void set_head_flag(bool is_head) { m_is_head = is_head; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A memory pool for one (device, stream) pair. Managers form a two-level
/// tree: one root per device (see get_root_managers) plus one child per CUDA
/// stream. Each manager owns a sorted free list and a used list of Block
/// nodes, guarded by its own mutex; children can steal memory from siblings
/// through their common root when they run out.
class Manager
{
    /// The parent manager.
    Manager *m_parent;
    /// The children managers.
    std::vector<Manager> m_children;
    /// The GPU device where the memory is allocated.
    int m_device;
    /// The stream this manager is associated with. It could be NULL.
    cudaStream_t m_stream;
    /// The list of used blocks.
    Block *m_used_blocks;
    /// The list of free blocks.
    Block *m_free_blocks;
    /// The managed memory size.
    std::size_t m_size;
    /// The memory allocation granularity
    std::size_t m_granularity;
    /// The flags.
    unsigned m_flags;
    /// To support multi-threading. Each manager has its own mutex.
    Mutex m_mutex;
public:
    /// Create an unitialized manager.
    Manager();
    /// Dtor.
    ~Manager();

    /// Allocate a block of memory.
    cumemStatus_t allocate(void *&ptr, std::size_t size);
    /// Release a block of memory.
    cumemStatus_t release(void *ptr);
    /// Release memory. It returns true if we have no memory leak.
    cumemStatus_t release_all(bool &memory_leak);
    /// Reserve memory for a manager.
    cumemStatus_t reserve(std::size_t size);
    /// Steal memory from another manager.
    cumemStatus_t steal(void *&ptr, std::size_t size);

    /// Print the list of free blocks.
    inline std::size_t print_free_blocks(FILE *file) const
    {
        return print_list(file, "free", m_free_blocks);
    }
    /// Print the list of used blocks.
    inline std::size_t print_used_blocks(FILE *file) const
    {
        return print_list(file, "used", m_used_blocks);
    }

    /// The root manager for a given device.
    static inline Manager& get_root_manager(int device) { return get_root_managers()[device]; }
    /// The list of all the root managers.
    static std::vector<Manager>& get_root_managers();

    /// The amount of used memory.
    inline std::size_t get_used_memory() const { return get_memory(m_used_blocks); }
    /// The amount of used memory.
    inline std::size_t get_free_memory() const { return get_memory(m_free_blocks); }
    /// The children.
    inline std::vector<Manager>& get_children() { return m_children; }
    /// The children.
    inline const std::vector<Manager>& get_children() const { return m_children; }
    /// Get a specific child based on the stream id.
    cumemStatus_t get_child(Manager *&manager, cudaStream_t stream);
    /// The associated device.
    inline int get_device() const { return m_device; }
    /// The flags.
    inline unsigned get_flags() const { return m_flags; }
    /// Get the mutex.
    inline Mutex* get_mutex() { return &m_mutex; }
    /// The size allocated to that manager.
    inline std::size_t get_size() const { return m_size; }
    /// The CUDA stream.
    inline cudaStream_t get_stream() const { return m_stream; }
    /// The allocation granularity.
    inline size_t get_granularity() const { return m_granularity; }

    /// Define the parent.
    inline void set_parent(Manager *parent) { m_parent = parent; }
    /// Define the device.
    inline void set_device(int device) { m_device = device; }
    /// Define the stream.
    inline void set_stream(cudaStream_t stream) { m_stream = stream; }
    /// Define the flags.
    inline void set_flags(unsigned flags) { m_flags = flags; }
    /// Define the granularity
    inline void set_granularity(unsigned granularity) { m_granularity = granularity; }

private:
    /// Allocate a new block and add it to the free list.
    cumemStatus_t allocate_block(Block *&curr, Block *&prev, std::size_t size);
    /// Release a block from the active list.
    cumemStatus_t release_block(Block *curr, Block *prev);
    /// Find the best free node based on the size.
    cumemStatus_t find_best_block(Block *&curr, Block *&prev, std::size_t size);
    /// Extract a node from the list of free blocks.
    cumemStatus_t extract_block(Block *curr, Block *prev, std::size_t size, bool stolen);
    /// Give a free block from that manager.
    cumemStatus_t give_block(void *&data, std::size_t &data_size, std::size_t size);
    /// Steal a block from another manager.
    cumemStatus_t steal_block(void *&data, std::size_t &data_size, std::size_t size);
    /// The memory consumption of a list.
    std::size_t get_memory(const Block *head) const;
    /// Print an internal linked list.
    std::size_t print_list(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Build an empty, unbound manager: no parent, no blocks, device -1.
/// cumemInit() later assigns the device, stream, flags and granularity.
/// NOTE(review): m_mutex is default-constructed; Mutex::initialize() is not
/// called here (or anywhere visible in this file) — confirm this is intended.
Manager::Manager()
    : m_parent(NULL)
    , m_children()
    , m_device(-1)
    , m_stream(NULL)
    , m_used_blocks(NULL)
    , m_free_blocks(NULL)
    , m_size(0)
    , m_granularity(CUMEM_DEFAULT_GRANULARITY)
    , m_flags(CUMEM_FLAGS_DEFAULT)
    , m_mutex()
{}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Best-effort cleanup: release every block this manager still owns. The
/// status and the leak flag from release_all are deliberately ignored because
/// a destructor has no way to report failure.
Manager::~Manager()
{
    bool memory_leak;
    release_all(memory_leak);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Allocate \p size bytes from this manager's pool and store the pointer in
/// \p ptr. Best-fit search of the free list; if nothing fits and the
/// CUMEM_FLAGS_CANNOT_GROW flag is not set, a fresh head block is obtained
/// from the parent (or directly from CUDA). On failure ptr is set to NULL and
/// CUMEM_STATUS_OUT_OF_MEMORY is returned. Thread-safe via m_mutex.
cumemStatus_t Manager::allocate(void *&ptr, std::size_t size)
{
    // Lock to make sure only one thread execute that fragment of code.
    Lock lock;
    CUMEM_CHECK(lock.lock(&m_mutex));

    // Find the best fit.
    Block *best = NULL, *prev = NULL;
    CUMEM_CHECK_OR_UNLOCK_AND_RETURN(find_best_block(best, prev, size), lock);

    // If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
    if( best == NULL && !(m_flags & CUMEM_FLAGS_CANNOT_GROW) )
    {
        CUMEM_CHECK_OR_UNLOCK_AND_RETURN(allocate_block(best, prev, size), lock);
    }

    // Make sure we do have a block or quit.
    if( !best )
    {
        CUMEM_CHECK(lock.unlock());
        ptr = NULL;
        return CUMEM_STATUS_OUT_OF_MEMORY;
    }

    // Split the free block if needed.
    CUMEM_CHECK_OR_UNLOCK_AND_RETURN(extract_block(best, prev, size, false), lock);

    // Push the node to the list of used nodes.
    best->set_next(m_used_blocks);
    m_used_blocks = best;

    // Return the new pointer into memory.
    CUMEM_CHECK(lock.unlock());
    ptr = m_used_blocks->get_data();
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Grow the pool: obtain a brand new region of \p size bytes (from the parent
/// manager if there is one, otherwise from cudaMalloc/cudaMallocManaged) and
/// insert it into the free list as a head block, keeping the list sorted by
/// address. \p curr receives the new block and \p prev its predecessor.
cumemStatus_t Manager::allocate_block(Block *&curr, Block *&prev, std::size_t size)
{
    // Reset the outputs.
    curr = prev = NULL;

    // Try to allocate data from the parent or the device.
    void *data = NULL;
    if( m_parent )
        CUMEM_CHECK(m_parent->allocate(data, size));
    else
    {
        if( m_flags & CUMEM_FLAGS_USE_UNIFIED_MEM )
        {
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaMallocManaged of size " << size << "B" << std::endl;
#endif
            // Managed (unified) memory; attachment policy is controlled by flags.
            if( m_flags & CUMEM_FLAGS_MEM_ATTACH_HOST )
            {
                CHECK_CUDA(cudaMallocManaged(&data, size, cudaMemAttachHost));
            }
            else
            {
                CHECK_CUDA(cudaMallocManaged(&data, size, cudaMemAttachGlobal));
            }
#ifdef CUMEM_DEBUG
            std::cout << "cudaMallocManaged of size " << size << "B address=" << (void*)data << std::endl;
#endif
        }
        else
        {
            // Plain device memory on this manager's device.
            CHECK_CUDA(cudaSetDevice(m_device));
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaMalloc of size " << size << "B" << std::endl;
#endif
            CHECK_CUDA(cudaMalloc(&data, size));
#ifdef CUMEM_DEBUG
            std::cout << "cudaMalloc of size " << size << "B address=" << (void*)data << std::endl;
#endif
        }
    }

    // If it failed, there's an unexpected issue. CHECK_CUDA/CUMEM_CHECK above
    // already returned on failure, so data must be non-NULL here.
    assert(data);

    // We have data, we now need to add it to the list of free nodes. We keep the list sorted.
    Block *next = m_free_blocks;
    for( ; next && next->get_data() < data ; next = next->get_next() )
        prev = next;
    curr = new Block((char*) data, size, next, true);
    if( prev )
        prev->set_next(curr);
    else
        m_free_blocks = curr;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Unlink free block \p curr (whose predecessor is \p prev, NULL if it is the
/// list head) and trim it to exactly \p size bytes. If curr is larger, the
/// tail remainder is re-inserted into the free list at curr's position as a
/// new block whose head flag is set to \p stolen.
cumemStatus_t Manager::extract_block(Block *curr, Block *prev, std::size_t size, bool stolen)
{
    // We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
    Block *next;
    if( curr->get_size() == size )
        next = curr->get_next();
    else
    {
        // Split: curr keeps the first `size` bytes, the remainder becomes a new
        // free block starting right after it.
        std::size_t remaining = curr->get_size()-size;
        Block *new_block = new Block(curr->get_data() + size, remaining, curr->get_next(), stolen);
        next = new_block;
        curr->set_size(size);
    }

    // Redo the "branching" in the nodes: bypass curr in the free list.
    if( prev )
        prev->set_next(next);
    else
        m_free_blocks = next;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Best-fit scan of the free list: select the smallest free block of at least
/// \p size bytes. \p best receives the chosen block and \p prev its
/// predecessor (so the caller can unlink it); both stay NULL when no block is
/// large enough. Always returns CUMEM_STATUS_SUCCESS.
cumemStatus_t Manager::find_best_block(Block *&best, Block *&prev, std::size_t size)
{
    best = NULL;
    prev = NULL;
    Block *candidate = m_free_blocks;
    Block *before_candidate = NULL;
    while( candidate )
    {
        bool large_enough = candidate->get_size() >= size;
        bool tighter_fit = !best || candidate->get_size() < best->get_size();
        if( large_enough && tighter_fit )
        {
            best = candidate;
            prev = before_candidate;
        }
        before_candidate = candidate;
        candidate = candidate->get_next();
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Find the child manager bound to the given CUDA stream and store it in
/// \p manager. Returns CUMEM_STATUS_INVALID_ARGUMENT (leaving \p manager
/// untouched) when no child is registered for that stream.
cumemStatus_t Manager::get_child(Manager *&manager, cudaStream_t stream)
{
    std::size_t idx = 0;
    while( idx < m_children.size() )
    {
        if( m_children[idx].m_stream == stream )
        {
            manager = &m_children[idx];
            return CUMEM_STATUS_SUCCESS;
        }
        ++idx;
    }
    return CUMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
std::size_t Manager::get_memory(const Block *head) const
{
std::size_t size = 0;
for( Block *curr = (Block*) head ; curr ; curr = curr->get_next() )
size += curr->get_size();
return size;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Global registry of root managers, one slot per CUDA device id (sized by
/// cumemInit, cleared by cumemFinalize). Function-local static so the vector
/// is created on first use. NOTE(review): access to the vector itself is not
/// synchronized — callers rely on init/finalize not racing with other calls.
std::vector<Manager>& Manager::get_root_managers()
{
    static std::vector<Manager> managers;
    return managers;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Donate a free region of at least \p size bytes to a sibling manager:
/// synchronize this manager's stream (so the memory is no longer in flight),
/// pick the best-fit free block, unlink it and hand its data pointer and size
/// to the caller. Returns CUMEM_STATUS_OUT_OF_MEMORY when nothing fits.
/// NOTE: caller must hold this manager's mutex (see cumemMalloc).
cumemStatus_t Manager::give_block(void *&block_data, std::size_t &block_size, std::size_t size)
{
    // Make sure the block is not in use any more. It could be too coarse grain and we may change
    // it in the future.
    CHECK_CUDA(cudaStreamSynchronize(m_stream));

    // Init the returned values to 0.
    block_data = NULL;
    block_size = 0;

    // Find the best node to steal and reserve it.
    Block *best = NULL, *prev = NULL;
    CUMEM_CHECK(find_best_block(best, prev, size));
    if( !best )
        return CUMEM_STATUS_OUT_OF_MEMORY;
    CUMEM_CHECK(extract_block(best, prev, size, true));
    block_data = best->get_data();
    block_size = best->get_size();

    // Release the memory used by that block (the node only — the device memory
    // now belongs to the receiving manager).
    delete best;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Debug helper: dump one linked list (labelled \p name) to \p file and
/// return the total number of bytes it covers.
/// NOTE(review): the "%lu"/"%016lx" specifiers assume size_t == unsigned long,
/// which does not hold on 64-bit Windows — verify if this path matters there.
std::size_t Manager::print_list(FILE *file, const char *name, const Block *head) const
{
    // First pass: total size for the list header line.
    std::size_t size = 0;
    for( Block *curr = (Block*) head; curr; curr = curr->get_next() )
        size += curr->get_size();
    fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
    // Second pass: one line per node.
    for( Block *curr = (Block*) head ; curr ; curr = curr->get_next() )
    {
        fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
            (std::size_t) curr,
            (std::size_t) curr->get_data(),
            (std::size_t) curr->get_size(),
            (std::size_t) curr->get_next(),
            (std::size_t) curr->is_head ());
    }
    fprintf(file, "|\n");
    return size;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Return the block starting at \p ptr to this manager's free list.
/// Releasing NULL is a no-op that succeeds (mirrors free(NULL)); a pointer
/// that is not the start of a used block yields CUMEM_STATUS_INVALID_ARGUMENT.
/// Thread-safe via m_mutex.
cumemStatus_t Manager::release(void *ptr)
{
    // Skip if ptr is NULL.
    if( ptr == NULL )
    {
        printf("release(NULL)\n");
        // BUG FIX: the original had a bare "CUMEM_STATUS_SUCCESS;" statement
        // with no "return", so a NULL release fell through the lookup below
        // and was misreported as CUMEM_STATUS_INVALID_ARGUMENT.
        return CUMEM_STATUS_SUCCESS;
    }

    // Lock to make sure only one thread execute that fragment of code.
    Lock lock;
    CUMEM_CHECK(lock.lock(&m_mutex));

    // Find the node in the list of used blocks.
    Block *curr = m_used_blocks, *prev = NULL;
    for( ; curr && curr->get_data() != ptr ; curr = curr->get_next() )
        prev = curr;

    // Make sure we have found a node.
    if( curr == NULL )
    {
        CUMEM_CHECK(lock.unlock());
        return CUMEM_STATUS_INVALID_ARGUMENT;
    }

    // We have the node so release it (unlink from the used list, coalesce into
    // the free list).
    cumemStatus_t result = release_block(curr, prev);
    CUMEM_CHECK(lock.unlock());
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Tear down this manager and all its children. \p memory_leaks is set to
/// true if any child reported a leak. Root managers also force-release any
/// still-used blocks (panic mode) and cudaFree their head blocks; child
/// managers hand their free blocks back to the parent.
cumemStatus_t Manager::release_all(bool &memory_leaks)
{
    // Destroy the children if any.
    bool ok = true;
    for( std::size_t i = 0; i < m_children.size(); ++i )
    {
        bool tmp;
        CUMEM_CHECK(m_children[i].release_all(tmp));
        ok = ok && !tmp;
    }

    // TODO: thejaswi: HACK! HACK! HACK!
    // We have some issues when integrating into some libraries. This has to fixed in the libs.
    // memory_leaks = !ok || m_used_blocks;
    // (i.e. a non-empty used list at this level is deliberately NOT counted as a leak.)
    memory_leaks = !ok;

    // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
    if( !m_parent )
        while( m_used_blocks )
            CUMEM_CHECK(release_block(m_used_blocks, NULL));

    // We should be having only free blocks that are head blocks. Release those blocks.
    while( m_free_blocks )
    {
        if( m_parent )
            CUMEM_CHECK(m_parent->release(m_free_blocks->get_data()));
        else if( m_free_blocks->is_head() )
        {
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaFree of size " << m_free_blocks->get_size()
                      << "B address=" << (void*)m_free_blocks->get_data() << std::endl;
#endif
            CHECK_CUDA(cudaFree(m_free_blocks->get_data()));
#ifdef CUMEM_DEBUG
            std::cout << "cudaFree of size " << m_free_blocks->get_size() << "B successful" << std::endl;
#endif
        }
        Block *block = m_free_blocks;
        m_free_blocks = m_free_blocks->get_next();
        delete block;
    }

    // We shouldn't have any used block left. Or, it means the user is causing memory leaks!
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Move \p curr (whose predecessor in the used list is \p prev, NULL if it is
/// the used-list head) from the used list into the address-sorted free list,
/// coalescing it with adjacent free blocks. Merging never crosses a head
/// boundary, so each cudaMalloc'd region can later be freed as one block.
cumemStatus_t Manager::release_block(Block *curr, Block *prev)
{
    // The current node cannot be NULL!
    assert(curr != NULL);

    // Change the connection of the node: unlink curr from the used list.
    if( prev )
        prev->set_next(curr->get_next());
    else
        m_used_blocks = curr->get_next();

    // Find the location where this block should be added to the free list.
    prev = NULL;
    Block *iter = m_free_blocks;
    for( ; iter && iter->get_data() < curr->get_data() ; iter = iter->get_next() )
        prev = iter;

    // Keep track of the successor of pred. We may lose track of it in the following "else".
    Block *next = prev ? prev->get_next() : m_free_blocks;

    // We first check if we can merge the block with its predecessor in the list and curr can be merged.
    if( prev && prev->get_data() + prev->get_size() == curr->get_data() && !curr->is_head() )
    {
        // prev absorbs curr.
        prev->set_size(prev->get_size() + curr->get_size());
        delete curr;
        curr = prev;
    }
    else if( prev )
        prev->set_next(curr);
    else
        m_free_blocks = curr;

    // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
    if( next && curr->get_data() + curr->get_size() == next->get_data() && !next->is_head() )
    {
        // curr absorbs next.
        curr->set_size(curr->get_size() + next->get_size());
        curr->set_next(next->get_next());
        delete next;
    }
    else
        curr->set_next(next);
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Pre-allocate \p size bytes for this manager (one head block placed in the
/// free list) and record the pool size in m_size.
cumemStatus_t Manager::reserve(std::size_t size)
{
    Block *curr, *prev;
    CUMEM_CHECK(allocate_block(curr, prev, size));
    m_size = size;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Acquire \p size bytes by taking memory from another manager in the tree
/// (a sibling via the root, or the parent's children). On success the stolen
/// region is recorded in this manager's used list and returned via \p stolen.
/// Fails immediately with CUMEM_STATUS_INVALID_ARGUMENT when stealing is
/// disabled by CUMEM_FLAGS_CANNOT_STEAL.
/// NOTE: caller must already hold the mutexes of every involved manager
/// (see the locking dance in cumemMalloc).
cumemStatus_t Manager::steal(void *&stolen, std::size_t size)
{
    // If we cannot steal, don't even try.
    if( m_flags & CUMEM_FLAGS_CANNOT_STEAL )
    {
        stolen = NULL;
        return CUMEM_STATUS_INVALID_ARGUMENT;
    }

    // The stolen block.
    void *data = NULL; std::size_t data_size = 0;
    if( !m_children.empty() )
        CUMEM_CHECK(steal_block(data, data_size, size));
    else if( m_parent )
        CUMEM_CHECK(m_parent->steal_block(data, data_size, size));

    // Make sure we do have a block of memory or quit.
    if( !data )
    {
        stolen = NULL;
        return CUMEM_STATUS_OUT_OF_MEMORY;
    }

    // Push the block in the used list.
    m_used_blocks = new Block((char*) data, data_size, m_used_blocks, true);

    // Return the new pointer into memory.
    stolen = data;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Ask each child in turn for a free region of at least \p size bytes (via
/// give_block), then carve the corresponding range out of THIS manager's used
/// list: the used block that contains the donated range is split into up to
/// three pieces (before / donated / after) so the bookkeeping stays exact.
/// NOTE(review): the bare "::size_t" in the signature is presumably a typo
/// for std::size_t — the types are identical in practice, so it compiles.
cumemStatus_t Manager::steal_block(void *&data, std::size_t &data_size, ::size_t size)
{
    // No block found and no room to grow. Try to steal from a children (if we have any).
    data = NULL;
    for( std::size_t i = 0 ; !data && i < m_children.size() ; ++i )
        if( m_children[i].give_block(data, data_size, size) == CUMEM_STATUS_SUCCESS )
            break;

    // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
    if( !data )
        return CUMEM_STATUS_OUT_OF_MEMORY;

    // We have got a node from a children. We need to update our "used" list before we can do
    // anything with it.
    Block *curr = m_used_blocks, *prev = NULL;
    for( ; curr ; curr = curr->get_next() )
    {
        if( curr->get_data() <= data && data < curr->get_data()+curr->get_size() )
            break;
        prev = curr;
    }

    // Curr points to the node which contains that memory region.
    assert(curr);

    // If it is exactly the same memory region, we are done!!!
    if( curr->get_data() == data && curr->get_size() == data_size )
        return CUMEM_STATUS_SUCCESS;

    // Track the blocks before and after curr.
    Block *next = curr->get_next();

    // We may have up to 3 blocks.
    std::size_t size_before = (std::size_t) ((char*) data - curr->get_data());
    std::size_t size_after  = (curr->get_size() - size_before - data_size);

    // The resulting block.
    Block *result = curr;

    // If we have no space between curr->get_data and block->get_data.
    if( size_before == 0 )
        curr->set_size(data_size);
    else
    {
        // Shrink curr to the leading piece and insert a new block for the
        // donated range right after it.
        curr->set_size(size_before);
        Block *block = new Block((char*) data, data_size, next, false);
        curr->set_next(block);
        curr = block;
        data = (char*) data + data_size;
        data_size = size_after;
        result = block;
    }

    // We have space at the end so we may need to add a new node.
    if( size_after > 0 )
    {
        Block *block = new Block(curr->get_data() + curr->get_size(), size_after, next, false);
        curr->set_next(block);
        curr = block;
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Debug helper: recursively dump a manager's summary line plus its used and
/// free lists, then do the same for every child manager.
static void print_blocks(FILE *file, const Manager &manager)
{
    fprintf(file, "device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
        manager.get_device(),
        (std::size_t) manager.get_stream(),
        manager.get_used_memory(),
        manager.get_free_memory());
    manager.print_used_blocks(file);
    manager.print_free_blocks(file);
    fprintf(file, "\n");
    for( std::size_t i = 0 ; i < manager.get_children().size() ; ++i )
        print_blocks(file, manager.get_children()[i]);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cumem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: initialize the allocator for \p numDevices devices. For each
/// device: a root manager reserves `size` bytes (defaulting to half the
/// device's global memory, rounded up to the granularity, which must be a
/// positive multiple of 512), plus one child manager per declared stream,
/// each reserving size/(numStreams+1) bytes from the root.
/// NOTE(review): the managers' mutexes are never passed through
/// Mutex::initialize() anywhere in this file — verify locking is safe on all
/// supported platforms.
cumemStatus_t cumemInit(int numDevices, const cumemDevice_t *devices, unsigned flags)
{
    // Make sure we have at least one device declared.
    CHECK(numDevices > 0, CUMEM_STATUS_INVALID_ARGUMENT);

    // Find the largest ID of the device.
    int max_device = 0;
    for( int i = 0 ; i < numDevices ; ++i )
        if( devices[i].device > max_device )
            max_device = devices[i].device;

    // Allocate enough managers (slot index == device id, so gaps may exist).
    CHECK(max_device >= 0, CUMEM_STATUS_INVALID_ARGUMENT);
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    managers.resize(max_device+1);

    // Create a root manager for each device and create the children.
    for( int i = 0 ; i < numDevices ; ++i )
    {
        std::size_t size = devices[i].size;
        // size == 0 means "use half the device's global memory".
        if( size == 0 )
        {
            cudaDeviceProp props;
            CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CHECK(size > 0, CUMEM_STATUS_INVALID_ARGUMENT);
        std::size_t granularity = devices[i].granularity;
        if( granularity == 0 )
        {
            granularity = CUMEM_DEFAULT_GRANULARITY;
        }
        CHECK(granularity > 0, CUMEM_STATUS_INVALID_ARGUMENT);
        CHECK(((granularity % 512) == 0), CUMEM_STATUS_INVALID_ARGUMENT);
        cumem::Manager &manager = cumem::Manager::get_root_manager(devices[i].device);
        manager.set_device(devices[i].device);
        manager.set_flags(flags);
        manager.set_granularity(granularity);
        // Round the pool size up to the granularity before reserving it.
        size = ((size + granularity - 1) / granularity) * granularity;
        CUMEM_CHECK(manager.reserve(size));
        std::vector<cumem::Manager> &children = manager.get_children();
        children.resize(devices[i].numStreams);
        for( int j = 0 ; j < devices[i].numStreams ; ++j )
        {
            children[j].set_parent(&manager);
            children[j].set_device(devices[i].device);
            children[j].set_stream(devices[i].streams[j]);
            // Children are pinned to their reserved share: they may steal but
            // not grow on their own.
            children[j].set_flags(flags & ~CUMEM_FLAGS_CANNOT_GROW);
            CUMEM_CHECK(children[j].reserve(size / (devices[i].numStreams + 1)));
        }
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: release everything owned by every root manager and empty the
/// registry. Returns CUMEM_STATUS_NOT_INITIALIZED if cumemInit was never
/// called (or already finalized), CUMEM_STATUS_MEMORY_LEAK if any manager
/// reported leaked allocations, CUMEM_STATUS_SUCCESS otherwise.
cumemStatus_t cumemFinalize()
{
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    if( managers.empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    bool memory_leaks = false;
    for( std::size_t i = 0; i < managers.size(); ++i )
    {
        bool manager_leaked;
        CUMEM_CHECK(managers[i].release_all(manager_leaked));
        if( manager_leaked )
            memory_leaks = true;
    }
    managers.clear();
    return memory_leaks ? CUMEM_STATUS_MEMORY_LEAK : CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: allocate \p size bytes (rounded up to the manager's
/// granularity) on the current device, from the root manager or from the
/// child bound to \p stream. If the owning manager is out of memory, attempt
/// to steal from the other managers on the device — this requires locking
/// every child and then the root, in that fixed order, before stealing.
cumemStatus_t cumemMalloc(void **ptr, std::size_t size, cudaStream_t stream)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    // (NULL, 0) is an accepted no-op; otherwise both arguments must be valid.
    if( !ptr && !size )
        return CUMEM_STATUS_SUCCESS;
    if( !ptr )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    if( !size )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    int device;
    CHECK_CUDA(cudaGetDevice(&device));
    cumem::Manager &root = cumem::Manager::get_root_manager(device);
    cumem::Manager *manager = &root;
    if( stream )
        CUMEM_CHECK(root.get_child(manager, stream));
    assert(manager);
    // Round the request up to the allocation granularity.
    size_t granularity = manager->get_granularity();
    size = ((size + granularity - 1) / granularity) * granularity;
    cumemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CUMEM_STATUS_OUT_OF_MEMORY )
    {
        // We need to acquire all the locks to all the managers to be able to steal memory. It's costly!
        typedef std::vector<cumem::Manager>::iterator Iterator;

        // Try to acquire locks on all the children.
        std::vector<cumem::Manager> &children = root.get_children();
        std::vector<cumem::Lock> locks(children.size() + 1);
        std::size_t num_locked = 0;
        for( Iterator it = children.begin() ; it != children.end() ; ++it, ++num_locked )
        {
            cumem::Mutex *mutex = it->get_mutex();
            if( locks[num_locked].lock(mutex) != CUMEM_STATUS_SUCCESS )
                break;
        }
        // We locked all the children, so we try to lock the root.
        if( num_locked == children.size() )
        {
            cumemStatus_t tmp_status = locks.back().lock(root.get_mutex());
            if( tmp_status == CUMEM_STATUS_SUCCESS )
                num_locked++;
        }
        // We acquired so we try to steal a node from another child.
        if( num_locked == locks.size() )
            result = manager->steal(ptr[0], size);
        // Release every mutex we managed to take, even on a partial failure.
        for( std::size_t i = 0 ; i < num_locked ; ++i )
            CUMEM_CHECK(locks[i].unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: return \p ptr to the manager that allocated it on the current
/// device (the root, or the child bound to \p stream). Freeing NULL succeeds
/// as a no-op, mirroring free().
cumemStatus_t cumemFree(void *ptr, cudaStream_t stream)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    if( !ptr )
        return CUMEM_STATUS_SUCCESS;
    int device;
    CHECK_CUDA(cudaGetDevice(&device));
    cumem::Manager &root = cumem::Manager::get_root_manager(device);
    cumem::Manager *owner = &root;
    if( stream )
    {
        CUMEM_CHECK(root.get_child(owner, stream));
    }
    assert(owner);
    return owner->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: fill the caller-provided arrays with the used/free byte counts
/// of each initialized root manager. Entries are written consecutively
/// (index j), skipping registry slots whose device id is -1 (never
/// initialized), so the arrays must be sized for the number of declared
/// devices. Each manager is locked while it is sampled.
cumemStatus_t cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    if( !used_mem || !free_mem )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    for( std::size_t i = 0, j = 0 ; i < cumem::Manager::get_root_managers().size() ; ++i )
    {
        cumem::Manager &manager = cumem::Manager::get_root_managers()[i];
        // Skip the slots that were never bound to a device by cumemInit.
        if( manager.get_device() == -1 )
            continue;
        cumem::Lock lock;
        CUMEM_CHECK(lock.lock(manager.get_mutex()));
        used_mem[j] = manager.get_used_memory();
        free_mem[j] = manager.get_free_memory();
        j++;
        CUMEM_CHECK(lock.unlock());
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: dump the full block lists of every initialized root manager
/// (and, recursively, their children) to \p file, locking each root while it
/// is printed.
cumemStatus_t cumemPrintMemoryState(FILE *file)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    for( std::size_t i = 0 ; i < cumem::Manager::get_root_managers().size() ; ++i )
    {
        cumem::Manager &manager = cumem::Manager::get_root_managers()[i];
        // Skip the slots that were never bound to a device by cumemInit.
        if( manager.get_device() == -1 )
            continue;
        cumem::Lock lock;
        CUMEM_CHECK(lock.lock(manager.get_mutex()));
        print_blocks(file, manager);
        CUMEM_CHECK(lock.unlock());
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: map a status code to its symbolic name. The returned string is
/// a static literal and must not be freed. Unrecognized values map to
/// "CUMEM_STATUS_UNKNOWN_ERROR".
const char* cumemGetErrorString(cumemStatus_t status)
{
    if( status == CUMEM_STATUS_SUCCESS )
        return "CUMEM_STATUS_SUCCESS";
    if( status == CUMEM_STATUS_CUDA_ERROR )
        return "CUMEM_STATUS_CUDA_ERROR";
    if( status == CUMEM_STATUS_INVALID_ARGUMENT )
        return "CUMEM_STATUS_INVALID_ARGUMENT";
    if( status == CUMEM_STATUS_MEMORY_LEAK )
        return "CUMEM_STATUS_MEMORY_LEAK";
    if( status == CUMEM_STATUS_NOT_INITIALIZED )
        return "CUMEM_STATUS_NOT_INITIALIZED";
    if( status == CUMEM_STATUS_OUT_OF_MEMORY )
        return "CUMEM_STATUS_OUT_OF_MEMORY";
    return "CUMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论