Commit 130d2ce9 authored by Julien Demouth, committed by Frederic

Add support for CNMeM library.

Update the code to support CNMeM (formerly known as Cumem).
Parent: 389c4aba
......@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c
global-include *.cu
global-include *.cuh
global-include *.cpp
global-include *.h
global-include *.sh
global-include *.pkl
......
......@@ -164,7 +164,7 @@ def do_setup():
install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'],
package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'*.h', 'ChangeLog'],
'*.h', '*.cpp', 'ChangeLog'],
'theano.misc': ['*.sh']
},
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
......@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""",
StrParam('cublas'))
AddConfigVar('lib.cumem',
"""Do we enable cumem or not.""",
AddConfigVar('lib.cnmem',
"""Do we enable cnmem or not.""",
# We should not mix both allocator, so we can't override
BoolParam(False, allow_override=False),
in_c_key=False)
......@@ -385,7 +385,7 @@ def use(device,
try:
if (device != 'gpu') and not pycuda_init_dev:
assert isinstance(device, int)
gpu_init(device, config.lib.cumem)
gpu_init(device, config.lib.cnmem)
use.device_number = device
assert active_device_number() == device
else:
......@@ -398,7 +398,7 @@ def use(device,
cuda_ndarray.cuda_ndarray.select_a_gpu()
use.device_number = active_device_number()
# This is needed to initialize the cublas handle.
gpu_init(use.device_number, config.lib.cumem)
gpu_init(use.device_number, config.lib.cnmem)
if test_driver:
import theano.sandbox.cuda.tests.test_driver
......@@ -411,8 +411,9 @@ def use(device,
" this property")
if config.print_active_device:
print("Using gpu device %d: %s" % (
active_device_number(), active_device_name()), file=sys.stderr)
cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
print("Using gpu device %d: %s (cnmem is %s)" % (
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr)
if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too much register per bloc at many places
# when there is only 8k register per multi-processor.
......
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cnmem.h"
#include <cstddef>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
#define CNMEM_GRANULARITY 512
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
    // Translate a status code into its printable constant name. Any value that is
    // not a known cnmemStatus_t enumerator maps to a generic "unknown" label, so
    // the function never returns NULL.
    if( status == CNMEM_STATUS_SUCCESS ) {
        return "CNMEM_STATUS_SUCCESS";
    }
    if( status == CNMEM_STATUS_CUDA_ERROR ) {
        return "CNMEM_STATUS_CUDA_ERROR";
    }
    if( status == CNMEM_STATUS_INVALID_ARGUMENT ) {
        return "CNMEM_STATUS_INVALID_ARGUMENT";
    }
    if( status == CNMEM_STATUS_MEMORY_LEAK ) {
        return "CNMEM_STATUS_MEMORY_LEAK";
    }
    if( status == CNMEM_STATUS_NOT_INITIALIZED ) {
        return "CNMEM_STATUS_NOT_INITIALIZED";
    }
    if( status == CNMEM_STATUS_OUT_OF_MEMORY ) {
        return "CNMEM_STATUS_OUT_OF_MEMORY";
    }
    return "CNMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// -------------------------------------------------------------------------------------------------
// Debug instrumentation. All three knobs below are compiled out by default (#if 0):
//  - CNMEM_DEBUG_ERROR: report the failing line (and, on POSIX, a stack backtrace) on stderr.
//  - CNMEM_DEBUG_INFO : trace cudaMalloc/cudaFree traffic.
//  - CNMEM_ASSERT     : internal sanity checks.
// Flip the corresponding "#if 0" to "#if 1" to enable one of them.
#if 0
#ifdef WIN32
// Windows flavor: line number + formatted message, no backtrace support here.
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
} while(0)
#else
#include <execinfo.h>
// Dump up to 64 stack frames of the calling thread to stderr (glibc backtrace API).
static inline void printBacktrace() {
void *stackBuffer[64];
int numAddresses = backtrace((void**) &stackBuffer, 64);
char **addresses = backtrace_symbols(stackBuffer, numAddresses);
for( int i = 0 ; i < numAddresses ; ++i ) {
fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
}
free(addresses);
}
// POSIX flavor: line number + formatted message + backtrace.
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "Backtrace:\n"); \
printBacktrace(); \
} while(0)
#endif
#else
// Default build: error reporting is compiled out entirely.
#define CNMEM_DEBUG_ERROR(...)
#endif
#if 0
#define CNMEM_DEBUG_INFO printf
#else
// Default build: info tracing is compiled out entirely.
#define CNMEM_DEBUG_INFO(...)
#endif
#if 0 // Enable/disable assertions
#include <cassert>
#define CNMEM_ASSERT assert
#else
// Default build: assertions are compiled out entirely.
#define CNMEM_ASSERT(...)
#endif
// -------------------------------------------------------------------------------------------------
// Error-propagation helpers. Each macro evaluates its argument, and on failure logs through
// CNMEM_DEBUG_ERROR and executes `return` in the *enclosing* function — so they may only be
// used inside functions returning cnmemStatus_t. The *_OR_UNLOCK variants additionally
// release the given mutex before returning, keeping lock/unlock pairs balanced on error paths.
// Return `error` from the enclosing function unless `cond` holds.
#define CNMEM_CHECK_TRUE(cond, error) do { \
if( !(cond) ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \
return error; \
} \
} while(0)
// Propagate a non-success cnmemStatus_t from `call`.
#define CNMEM_CHECK(call) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
return status; \
} \
} while(0)
// Like CNMEM_CHECK, but unlock `mutex` before propagating the failure.
#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
(mutex).unlock(); \
return status; \
} \
} while(0)
// Translate a CUDA runtime error into a cnmem status: allocation failures become
// CNMEM_STATUS_OUT_OF_MEMORY, everything else CNMEM_STATUS_CUDA_ERROR.
#define CNMEM_CHECK_CUDA(call) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
// Like CNMEM_CHECK_CUDA, but unlock `mutex` before propagating the failure.
#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
#ifdef WIN32
// Run a Win32 call that reports errors via GetLastError(); return `error_code` on failure.
#define CNMEM_CHECK_WIN32(call, error_code) do { \
SetLastError(0); /* Clean the flag. */ \
call; \
DWORD status = GetLastError(); \
if( status ) \
return error_code; \
} while(0)
#else
// Run a pthread call that returns 0 on success; return `error_code` on failure.
#define CNMEM_CHECK_PTHREAD(call, error_code) do { \
int status = call; \
if( status ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \
return error_code; \
} \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cnmem {
static inline std::size_t ceilInt(std::size_t m, std::size_t n) {
    // Round `m` up to the next multiple of `n`. Precondition: n > 0
    // (the original guarded this with CNMEM_ASSERT, which is a no-op in this build).
    const std::size_t quotient = (m + n - 1) / n;
    return quotient * n;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A minimal cross-platform mutual-exclusion primitive: a CRITICAL_SECTION on Windows,
/// a pthread mutex elsewhere. Operations report failures through cnmemStatus_t instead
/// of throwing; initialize() must be called before, and finalize() after, any use.
class Mutex {
#ifdef WIN32
CRITICAL_SECTION mCriticalSection;
#else
pthread_mutex_t mMutex;
#endif
public:
/// Initialize the mutex. Must be called before any lock()/unlock().
cnmemStatus_t initialize();
/// Finalize (destroy) the mutex. No lock()/unlock() may be called afterwards.
cnmemStatus_t finalize();
/// Lock the mutex. Declared const so it can be taken in const member functions.
cnmemStatus_t lock() const;
/// Unlock the mutex.
cnmemStatus_t unlock() const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Create the underlying OS primitive. Returns CNMEM_STATUS_UNKNOWN_ERROR on failure.
/// The disabled (#if 0) variant would create a recursive pthread mutex instead of the
/// default, non-recursive one that is currently used.
cnmemStatus_t Mutex::initialize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
#if 0
pthread_mutexattr_t attr;
CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Destroy the underlying OS primitive. The mutex must not be locked when this is called.
cnmemStatus_t Mutex::finalize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Acquire the mutex. Declared const so it can be taken inside const member functions;
/// the underlying primitive is mutated through an explicit cast, as the other Mutex
/// methods already do.
cnmemStatus_t Mutex::lock() const {
#ifdef WIN32
    // Fix: the original passed `&mCriticalSection`, which inside a const member function is a
    // pointer-to-const and does not convert to the non-const LPCRITICAL_SECTION that
    // EnterCriticalSection expects. Cast it away, mirroring initialize()/finalize() and the
    // pthread branch below.
    CNMEM_CHECK_WIN32(EnterCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release the mutex. Declared const for the same reason as lock().
cnmemStatus_t Mutex::unlock() const {
#ifdef WIN32
    // Fix: same missing const-cast as in lock() — LeaveCriticalSection takes a non-const
    // LPCRITICAL_SECTION, and `&mCriticalSection` is pointer-to-const in a const method.
    CNMEM_CHECK_WIN32(LeaveCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A node of an intrusive singly-linked list describing one contiguous region of device
/// memory. A Block never owns the memory it describes — the Manager does; it only records
/// the pointer, the extent, the successor in the list and whether the region starts an
/// allocation obtained from cudaMalloc / the parent manager ("head" block).
class Block {
    char *mPtr;            // device pointer of the described region
    std::size_t mNumBytes; // extent of the region in bytes
    Block *mNextBlock;     // successor node; NULL terminates the list
    bool mHeadFlag;        // true when the region starts a cudaMalloc/parent allocation
public:
    /// Build a descriptor for the region [data, data+size).
    Block(char *data, std::size_t size, Block *next, bool isHead) {
        mPtr = data;
        mNumBytes = size;
        mNextBlock = next;
        mHeadFlag = isHead;
    }
    /// The device pointer (read-only view).
    inline const char* getData() const { return mPtr; }
    /// The device pointer (mutable view).
    inline char* getData() { return mPtr; }
    /// The extent of the region in bytes.
    inline std::size_t getSize() const { return mNumBytes; }
    /// The successor node (read-only view).
    inline const Block* getNext() const { return mNextBlock; }
    /// The successor node (mutable view).
    inline Block* getNext() { return mNextBlock; }
    /// Does this block start a head allocation?
    inline bool isHead() const { return mHeadFlag; }
    /// Re-link the successor.
    inline void setNext(Block *next) { mNextBlock = next; }
    /// Resize the described region.
    inline void setSize(std::size_t size) { mNumBytes = size; }
    /// Mark or unmark the block as a head allocation.
    inline void setHeadFlag(bool isHead) { mHeadFlag = isHead; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A pool allocator for one CUDA device. Managers form a two-level tree: one root manager
/// per device (see getRootManagers) holding the cudaMalloc'd pool, and one child manager per
/// registered stream, which carves its memory out of the root. Each manager keeps two
/// intrusive linked lists of Block descriptors (used and free) and its own mutex; the
/// "Unsafe" members bypass that mutex.
class Manager {
/// The parent manager.
Manager *mParent;
/// The children managers.
std::vector<Manager*> mChildren;
/// The GPU device where the memory is allocated.
int mDevice;
/// The stream this manager is associated with. It could be NULL.
cudaStream_t mStream;
/// Is the stream blocking?
bool mIsStreamBlocking;
/// The list of used blocks.
Block *mUsedBlocks;
/// The list of free blocks.
Block *mFreeBlocks;
/// The managed memory size.
std::size_t mSize;
/// The flags.
unsigned mFlags;
/// To support multi-threading. Each manager has its own mutex.
Mutex mMutex;
public:
/// The root manager for a given device.
static inline Manager& getRootManager(int device) { return getRootManagers()[device]; }
/// The list of all the root managers.
static std::vector<Manager>& getRootManagers();
public:
/// Create an uninitialized manager.
Manager();
/// Dtor.
~Manager();
/// Allocate a block of memory.
cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
/// Release a block of memory.
cnmemStatus_t release(void *ptr);
/// Release memory. It returns true if we have no memory leak.
cnmemStatus_t releaseAllUnsafe(bool &memoryLeak);
/// Reserve memory for a manager.
cnmemStatus_t reserve(std::size_t size);
/// Steal memory from another manager.
cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
/// Print the full memory state.
cnmemStatus_t printMemoryState(FILE *file) const;
/// The amount of used memory.
inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
return getMemoryUnsafe(usedMemory, mUsedBlocks);
}
/// The amount of free memory.
inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
return getMemoryUnsafe(freeMemory, mFreeBlocks);
}
/// Get a specific child based on the stream id.
cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
/// Get a specific child based on its index.
cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
/// Add a new child.
cnmemStatus_t addChild(Manager *manager);
/// The number of children.
cnmemStatus_t getNumChildren(std::size_t &numChildren) const;
/// The associated device.
inline int getDevice() const { return mDevice; }
/// The flags.
inline unsigned getFlags() const { return mFlags; }
/// Get the mutex.
inline const Mutex* getMutex() const { return &mMutex; }
/// The size allocated to that manager.
inline std::size_t getSize() const { return mSize; }
/// The CUDA stream.
inline cudaStream_t getStream() const { return mStream; }
/// Define the parent.
inline void setParent(Manager *parent) { mParent = parent; }
/// Define the device.
inline void setDevice(int device) { mDevice = device; }
/// Define the stream, and record whether it synchronizes with the NULL stream
/// (a NULL stream or one created without cudaStreamNonBlocking is "blocking").
inline cnmemStatus_t setStream(cudaStream_t stream) {
mStream = stream;
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
mIsStreamBlocking = false;
#else
unsigned flags = 0;
CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
#endif
return CNMEM_STATUS_SUCCESS;
}
/// Define the flags.
inline void setFlags(unsigned flags) { mFlags = flags; }
private:
/// The member functions below which are marked "Unsafe" are not thread-safe when called on a
/// same Manager object. Make sure they are called by a single thread in that case.
/// Allocate a new block and add it to the free list.
cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Release a block from the active list.
cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
/// Find the best free node based on the size.
cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Extract a node from the list of free blocks.
cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen);
/// Give a free block from that manager.
cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// Steal a block from another manager.
cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// The memory consumption of a list.
cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const;
/// Print an internal linked list.
cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Build an empty manager: no parent, invalid device (-1), NULL stream, empty block lists,
/// default flags.
/// NOTE(review): the status returned by mMutex.initialize() is discarded here; a failed
/// mutex init would only surface later, when lock()/unlock() fail.
Manager::Manager()
: mParent(NULL)
, mChildren()
, mDevice(-1)
, mStream(NULL)
, mIsStreamBlocking(false)
, mUsedBlocks(NULL)
, mFreeBlocks(NULL)
, mSize(0)
, mFlags(CNMEM_FLAGS_DEFAULT)
, mMutex() {
mMutex.initialize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Tear the manager down. If the device was never set, or can no longer be made current,
/// the manager was never really used and cleanup is skipped entirely.
/// NOTE(review): leaks reported by releaseAllUnsafe are silently ignored in the destructor,
/// as are errors from the release itself and from mMutex.finalize().
Manager::~Manager() {
if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it.
return;
}
bool memoryLeak;
releaseAllUnsafe(memoryLeak);
mMutex.finalize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Register a child manager (one per stream) under the lock. Ownership of `manager`
/// transfers to this object: children are deleted by releaseAllUnsafe.
cnmemStatus_t Manager::addChild(Manager *manager) {
CNMEM_CHECK(mMutex.lock());
mChildren.push_back(manager);
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Hand out `size` bytes from this manager's pool through `ptr`.
/// Strategy: best-fit search of the free list; if nothing fits and growing is allowed,
/// obtain a fresh block (from the parent or cudaMalloc); split the chosen block down to
/// `size` and move it to the used list. On failure `ptr` is set to NULL and
/// CNMEM_STATUS_OUT_OF_MEMORY is returned. Thread-safe (holds mMutex throughout).
cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) {
CNMEM_CHECK(mMutex.lock());
// If the client is not blocking, we have to explicitly synchronize before giving one buffer.
if( !isBlocking ) {
CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex);
}
// Find the best fit.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex);
// If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) {
CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex);
}
// Make sure we do have a block or quit.
if( !best ) {
ptr = NULL;
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Split the free block if needed.
CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex);
// Push the node to the list of used nodes.
best->setNext(mUsedBlocks);
mUsedBlocks = best;
// Return the new pointer into memory.
ptr = mUsedBlocks->getData();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Grow the pool: obtain `size` fresh bytes from the parent manager (child managers) or from
/// cudaMalloc (root manager), and insert them as a new head block into the address-sorted
/// free list. On success `curr` is the new block and `prev` its predecessor in the free list
/// (NULL when it becomes the list head). Not thread-safe: caller must hold mMutex.
cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) {
// Reset the outputs.
curr = prev = NULL;
// Try to allocate data from the parent or the device.
void *data = NULL;
if( mParent ) {
CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking));
}
else {
CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size);
CNMEM_CHECK_CUDA(cudaMalloc(&data, size));
CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data);
}
// If it failed, there's an unexpected issue.
CNMEM_ASSERT(data);
// We have data, we now need to add it to the list of free nodes. We keep the list sorted.
Block *next = mFreeBlocks;
for( ; next && next->getData() < data ; next = next->getNext() ) {
prev = next;
}
// NOTE(review): this NULL check is dead code unless a nothrow new is used — plain `new`
// throws std::bad_alloc instead of returning NULL.
curr = new Block((char*) data, size, next, true);
if( !curr ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Unlink `curr` (whose free-list predecessor is `prev`) from the free list, keeping only
/// `size` bytes in it. When the block is larger than `size`, the tail remainder stays in the
/// free list as a new block; `stolen` sets the remainder's head flag (used when the bytes
/// were donated by another manager). Not thread-safe: caller must hold mMutex.
cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) {
// We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
Block *next;
if( curr->getSize() == size ) {
next = curr->getNext();
}
else {
std::size_t remaining = curr->getSize()-size;
Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen);
if( !newBlock ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
next = newBlock;
curr->setSize(size);
}
// Redo the "branching" in the nodes.
if( prev ) {
prev->setNext(next);
}
else {
mFreeBlocks = next;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) {
    // Best-fit scan of the free list: select the smallest free block that still holds
    // `size` bytes, and remember its predecessor so the caller can unlink it. Both
    // outputs stay NULL when no block is large enough. Caller must hold mMutex.
    best = NULL;
    prev = NULL;
    Block *before = NULL;
    for( Block *candidate = mFreeBlocks ; candidate ; candidate = candidate->getNext() ) {
        const bool bigEnough = candidate->getSize() >= size;
        const bool tighterFit = !best || candidate->getSize() < best->getSize();
        if( bigEnough && tighterFit ) {
            best = candidate;
            prev = before;
        }
        before = candidate;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Find the child manager registered for `stream`. Returns CNMEM_STATUS_INVALID_ARGUMENT
/// when no child matches; in that case `manager` is left untouched. Thread-safe.
cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const {
CNMEM_CHECK(mMutex.lock());
std::size_t i = 0, numChildren = mChildren.size();
for( ; i < numChildren ; ++i ) {
if( mChildren[i]->mStream == stream ) {
manager = mChildren[i];
break;
}
}
CNMEM_CHECK(mMutex.unlock());
return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const {
    // Fetch the i-th child under the lock. Out-of-range indices are rejected with
    // CNMEM_STATUS_INVALID_ARGUMENT and leave `manager` untouched.
    CNMEM_CHECK(mMutex.lock());
    const bool inRange = i < mChildren.size();
    if( inRange ) {
        manager = mChildren[i];
    }
    CNMEM_CHECK(mMutex.unlock());
    return inRange ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const {
    // Sum the sizes of every block in the list starting at `head` (used or free list).
    // Iterating through a const pointer picks the const accessor overloads, so the
    // cast to non-const Block* in the original is unnecessary. Caller must hold mMutex.
    size = 0;
    for( const Block *node = head ; node ; node = node->getNext() ) {
        size += node->getSize();
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dead code kept for reference: a locked wrapper around getMemoryUnsafe. As written it would
// not compile — CNMEM_CHECK_OR_UNLOCK requires a mutex argument and `status` is undefined —
// so it must be repaired before re-enabling.
#if 0
cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const {
CNMEM_CHECK(mMutex.lock());
CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head));
CNMEM_CHECK(mMutex.unlock());
return status;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Report the number of registered children through `numChildren`. Thread-safe.
cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const {
CNMEM_CHECK(mMutex.lock());
numChildren = mChildren.size();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// The process-wide table of root managers, indexed by device ID. Created lazily on first
/// use as a function-local static; cnmemInit resizes it to cover the largest declared device.
/// NOTE(review): construction of the static vector is thread-safe only with C++11 "magic
/// statics" — confirm all targeted compilers guarantee that.
std::vector<Manager>& Manager::getRootManagers() {
static std::vector<Manager> managers;
return managers;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Donate a free block of at least `size` bytes to another manager (the "steal" path).
/// The manager's stream is synchronized first so the donated bytes are no longer in flight;
/// the best-fit free block is then unlinked and its descriptor deleted, with the raw region
/// handed back through blockData/blockSize. Outputs are zeroed and
/// CNMEM_STATUS_OUT_OF_MEMORY returned when nothing fits. Caller must hold mMutex.
cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) {
// Make sure the block is not in use any more. It could be too coarse grain and we may change
// it in the future.
CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream));
// Init the returned values to 0.
blockData = NULL;
blockSize = 0;
// Find the best node to steal and reserve it.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK(findBestBlockUnsafe(best, prev, size));
if( !best ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true));
blockData = best->getData();
blockSize = best->getSize();
// Release the memory used by that block.
delete best;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Debug helper: dump one linked list (used or free) to `file`, one line per block.
/// NOTE(review): the initial size loop duplicates getMemoryUnsafe; also, printing
/// std::size_t with "%lu" is correct on LP64 platforms but technically wrong on 64-bit
/// Windows (LLP64) — confirm the build targets before relying on the output.
cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const {
std::size_t size = 0;
for( Block *curr = (Block*) head; curr; curr = curr->getNext() ) {
size += curr->getSize();
}
fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) {
fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
(std::size_t) curr,
(std::size_t) curr->getData(),
(std::size_t) curr->getSize(),
(std::size_t) curr->getNext(),
(std::size_t) curr->isHead ());
}
fprintf(file, "|\n");
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Debug helper: print this manager's summary plus its used and free lists, then recurse
/// into the parent so a child dump also shows the root pool. Thread-safe per manager.
cnmemStatus_t Manager::printMemoryState(FILE *file) const {
CNMEM_CHECK(mMutex.lock());
std::size_t streamCode = (std::size_t) mStream;
std::size_t usedMemory, freeMemory;
CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex);
CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex);
fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
mParent ? "child" : "root",
mDevice,
streamCode,
usedMemory,
freeMemory);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex);
fprintf(file, "\n");
CNMEM_CHECK(mMutex.unlock());
if( mParent ) {
CNMEM_CHECK(mParent->printMemoryState(file));
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Return the buffer starting at `ptr` to the pool. NULL is accepted and ignored (like
/// free()). Returns CNMEM_STATUS_INVALID_ARGUMENT when `ptr` does not match the start of
/// any block handed out by this manager. Thread-safe.
cnmemStatus_t Manager::release(void *ptr) {
// Skip if ptr is NULL.
if( ptr == NULL ) {
return CNMEM_STATUS_SUCCESS;
}
// Lock to make sure only one thread execute that fragment of code.
CNMEM_CHECK(mMutex.lock());
// Find the node in the list of used blocks.
Block *curr = mUsedBlocks, *prev = NULL;
for( ; curr && curr->getData() != ptr ; curr = curr->getNext() ) {
prev = curr;
}
// Make sure we have found a node.
if( curr == NULL ) {
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// We have the node so release it.
cnmemStatus_t result = releaseBlockUnsafe(curr, prev);
CNMEM_CHECK(mMutex.unlock());
return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Tear down the whole pool: recursively release and delete all children, force-release any
/// still-used blocks (roots only — a panic mode so device memory is not leaked), then walk
/// the free list handing head blocks back to the parent or cudaFree. On return,
/// `memoryLeaks` is true when this manager or any child still had used blocks, i.e. the
/// client leaked allocations. Not thread-safe: callers must serialize externally.
cnmemStatus_t Manager::releaseAllUnsafe(bool &memoryLeaks) {
// Destroy the children if any.
bool ok = true;
for( std::size_t i = 0; i < mChildren.size(); ++i ) {
Manager *child = mChildren[i];
bool tmp;
CNMEM_CHECK(child->releaseAllUnsafe(tmp));
ok = ok && !tmp;
delete child;
}
mChildren.clear();
// We have some issues when integrating into some libraries. This has to fixed in the libs.
memoryLeaks = !ok || mUsedBlocks;
// Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
if( !mParent ) {
while( mUsedBlocks ) {
CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL));
}
}
// We should be having only free blocks that are head blocks. Release those blocks.
while( mFreeBlocks ) {
if( mParent ) {
CNMEM_CHECK(mParent->release(mFreeBlocks->getData()));
}
else if( mFreeBlocks->isHead() ) {
void *data = mFreeBlocks->getData();
CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data);
CNMEM_CHECK_CUDA(cudaFree(data));
CNMEM_DEBUG_INFO(">> success\n");
}
Block *block = mFreeBlocks;
mFreeBlocks = mFreeBlocks->getNext();
delete block;
}
// We shouldn't have any used block left. Or, it means the user is causing memory leaks!
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Move `curr` (used-list predecessor `prev`) from the used list back into the
/// address-sorted free list, coalescing it with adjacent free neighbours. Merging never
/// crosses a head-block boundary, so regions obtained by distinct cudaMalloc/parent calls
/// stay separate and can later be freed individually. Caller must hold mMutex.
cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) {
// The current node cannot be NULL!
CNMEM_ASSERT(curr != NULL);
// Change the connection of the node.
if( prev ) {
prev->setNext(curr->getNext());
}
else {
mUsedBlocks = curr->getNext();
}
// Find the location where this block should be added to the free list.
prev = NULL;
Block *iter = mFreeBlocks;
for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) {
prev = iter;
}
// Keep track of the successor of pred. We may lose track of it in the following "else".
Block *next = prev ? prev->getNext() : mFreeBlocks;
// We first check if we can merge the block with its predecessor in the list and curr can be merged.
if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) {
prev->setSize(prev->getSize() + curr->getSize());
delete curr;
curr = prev;
}
else if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
// Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) {
curr->setSize(curr->getSize() + next->getSize());
curr->setNext(next->getNext());
delete next;
}
else {
curr->setNext(next);
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Pre-allocate `size` bytes for this manager (from the parent or cudaMalloc) and record it
/// as the managed size. Thread-safe.
/// NOTE(review): calling reserve twice overwrites mSize rather than accumulating it, even
/// though both allocations remain in the free list.
cnmemStatus_t Manager::reserve(std::size_t size) {
CNMEM_CHECK(mMutex.lock());
Block *curr, *prev;
CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(curr, prev, size), mMutex);
mSize = size;
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Last-resort allocation path: take `size` bytes away from this manager's children (roots)
/// or from the parent's other children (stream managers), and record the stolen region as a
/// used head block of this manager. Fails with CNMEM_STATUS_INVALID_ARGUMENT when stealing
/// is disabled via CNMEM_FLAGS_CANNOT_STEAL, or CNMEM_STATUS_OUT_OF_MEMORY when nobody can
/// donate. `stolen` is set to NULL on failure. Not thread-safe on this manager.
cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) {
// If we cannot steal, don't even try.
if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) {
stolen = NULL;
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// The stolen block.
void *data = NULL; std::size_t dataSize = 0;
if( !mChildren.empty() ) {
CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size));
}
else if( mParent ) {
CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size));
}
// Make sure we do have a block of memory or quit.
if( !data ) {
stolen = NULL;
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Push the block in the used list.
mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true);
if( !mUsedBlocks ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Return the new pointer into memory.
stolen = data;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Obtain a region of at least `size` bytes by asking the children to give up free memory.
/// On success, `data`/`dataSize` describe the donated region, and this manager's used list
/// is re-partitioned so the region is covered by its own (non-head) block — the containing
/// block is split into up to three pieces. Returns CNMEM_STATUS_OUT_OF_MEMORY when no child
/// can donate. Not thread-safe: caller must serialize access to this manager.
///
/// Fix: the parameter was declared with the stray global `::size_t`; it aliases the same
/// type, but the in-class declaration spells `std::size_t`, so use the qualified form
/// consistently. The unused local `result` was also removed.
cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size) {
    // No block found and no room to grow. Try to steal from a child (if we have any).
    data = NULL;
    for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
        Manager *child = mChildren[i];
        if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
            break;
        }
    }
    // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
    if( !data ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // We got a region from a child. Find the used block of *this* manager that contains it
    // so we can split that block around the donated region.
    Block *curr = mUsedBlocks, *prev = NULL;
    for( ; curr ; curr = curr->getNext() ) {
        if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
            break;
        }
        prev = curr;
    }
    // Curr points to the node which contains that memory region.
    CNMEM_ASSERT(curr);
    // If it is exactly the same memory region, we are done!!!
    if( curr->getData() == data && curr->getSize() == dataSize ) {
        return CNMEM_STATUS_SUCCESS;
    }
    // Track the block after curr.
    Block *next = curr->getNext();
    // The donated region may leave up to two fragments inside curr: one before, one after.
    std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
    std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
    // If there is no gap between curr->getData() and the donated region, shrink curr in place.
    if( sizeBefore == 0 ) {
        curr->setSize(dataSize);
    }
    else {
        // Leading fragment: keep it in curr and insert a new block for the donated region.
        curr->setSize(sizeBefore);
        Block *block = new Block((char*) data, dataSize, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
        // NOTE(review): these two statements advance the reference out-params past the
        // donated region before returning; preserved from the original — confirm callers
        // really want the mutated values rather than the region found above.
        data = (char*) data + dataSize;
        dataSize = sizeAfter;
    }
    // We have space at the end so we may need to add a new node.
    if( sizeAfter > 0 ) {
        Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cnmem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize the library: create one root manager (with a reserved memory arena) per
// listed device and one child manager per named stream. This function is NOT
// thread-safe and must be called before any other cnmem function.
cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
    // Make sure we have at least one device declared and a valid descriptor array.
    // Fix: a NULL `devices` pointer with numDevices > 0 previously crashed; it is now
    // reported as an invalid argument.
    CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
    CNMEM_CHECK_TRUE(devices, CNMEM_STATUS_INVALID_ARGUMENT);
    // Find the largest ID of the device. Fix: reject negative device IDs up front
    // instead of letting them reach cudaSetDevice.
    int maxDevice = 0;
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_TRUE(devices[i].device >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
        if( devices[i].device > maxDevice ) {
            maxDevice = devices[i].device;
        }
    }
    // Allocate enough managers (the root managers are indexed by device ID).
    CNMEM_CHECK_TRUE(maxDevice >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
    std::vector<cnmem::Manager> &managers = cnmem::Manager::getRootManagers();
    managers.resize(maxDevice+1);
    // Create a root manager for each device and create the children. Remember the
    // caller's active device so it can be restored on the way out.
    int oldDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice));
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device));
        std::size_t size = devices[i].size;
        // A requested size of 0 means "use half of the device's global memory".
        if( size == 0 ) {
            cudaDeviceProp props;
            CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT);
        cnmem::Manager &manager = cnmem::Manager::getRootManager(devices[i].device);
        manager.setDevice(devices[i].device);
        manager.setFlags(flags);
        // Round the arena up to the allocation granularity before reserving it.
        size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
        CNMEM_CHECK(manager.reserve(size));
        // One child manager per named stream. Children are always allowed to grow:
        // they fall back to their parent when they run out of memory.
        // NOTE(review): assumes devices[i].streams is non-NULL whenever
        // devices[i].numStreams > 0 — confirm against callers.
        for( int j = 0 ; j < devices[i].numStreams ; ++j ) {
            cnmem::Manager *child = new cnmem::Manager;
            child->setParent(&manager);
            child->setDevice(devices[i].device);
            child->setStream(devices[i].streams[j]);
            child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW);
            if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) {
                CNMEM_CHECK(child->reserve(devices[i].streamSizes[j]));
            }
            CNMEM_CHECK(manager.addChild(child));
        }
    }
    CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Register a new (non-NULL) stream with the root manager of the current device. A
// child manager is created for the stream so that subsequent allocations on it are
// served from that child.
cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    cnmem::Manager &root = cnmem::Manager::getRootManager(device);
    cnmem::Manager *child = new cnmem::Manager;
    child->setParent(&root);
    child->setDevice(device);
    child->setStream(stream);
    // Children are always allowed to grow: they fall back to the root when empty.
    child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW);
    // Fix: the status of addChild was silently dropped (cnmemInit checks it).
    // Propagate the failure and free the child so it does not leak.
    // NOTE(review): assumes addChild does not retain the pointer on failure.
    cnmemStatus_t status = root.addChild(child);
    if( status != CNMEM_STATUS_SUCCESS ) {
        delete child;
        return status;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Tear the library down: release the memory held by every root manager and clear the
// global list. Must be called by a single thread after all users have joined.
// Returns CNMEM_STATUS_MEMORY_LEAK if any manager still had outstanding blocks.
cnmemStatus_t cnmemFinalize() {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    // Remember the caller's active device so we can restore it afterwards.
    int previousDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&previousDevice));
    std::vector<cnmem::Manager> &rootManagers = cnmem::Manager::getRootManagers();
    bool leaksDetected = false;
    for( std::size_t idx = 0 ; idx < rootManagers.size() ; ++idx ) {
        // Each manager must be released with its own device active.
        CNMEM_CHECK_CUDA(cudaSetDevice(rootManagers[idx].getDevice()));
        bool managerLeaks;
        CNMEM_CHECK(rootManagers[idx].releaseAllUnsafe(managerLeaks));
        leaksDetected = leaksDetected || managerLeaks;
    }
    rootManagers.clear();
    CNMEM_CHECK_CUDA(cudaSetDevice(previousDevice));
    if( leaksDetected ) {
        return CNMEM_STATUS_MEMORY_LEAK;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Allocate `size` bytes from the manager associated with `stream` (the root manager
// when stream is NULL). On out-of-memory, attempts to steal a free block from a
// sibling manager, which requires locking every child plus the root. Thread safe.
cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    // ptr == NULL together with size == 0 is treated as a no-op.
    if( !ptr && !size )
        return CNMEM_STATUS_SUCCESS;
    CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT);
    CNMEM_CHECK_TRUE(size, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    cnmem::Manager &root = cnmem::Manager::getRootManager(device);
    cnmem::Manager *manager = &root;
    // A named stream routes the request to the child manager registered for it.
    if( stream ) {
        CNMEM_CHECK(root.getChildFromStream(manager, stream));
    }
    CNMEM_ASSERT(manager);
    // All requests are rounded up to the allocation granularity.
    size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
    cnmemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
        // Try to acquire locks on all the children.
        std::size_t numChildren;
        CNMEM_CHECK(root.getNumChildren(numChildren));
        std::vector<const cnmem::Mutex*> mutexes(numChildren);
        std::size_t numLocked = 0;
        for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
            cnmem::Manager *child;
            CNMEM_CHECK(root.getChild(child, i));
            mutexes[numLocked] = child->getMutex();
            if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
                break;
            }
        }
        // One lock failed, quit. Reduce the damage as much as possible, though.
        // (Unlock failures here are deliberately ignored: we are already bailing out.)
        if( numLocked != numChildren ) {
            for( std::size_t i = 0 ; i < numLocked ; ++i ) {
                cnmemStatus_t lockStatus = mutexes[i]->unlock();
            }
            return CNMEM_STATUS_UNKNOWN_ERROR;
        }
        // Grab the lock on the root, first.
        // NOTE(review): if this CNMEM_CHECK returns on a lock failure, the children
        // mutexes acquired above are still held — confirm that is acceptable.
        const cnmem::Mutex *rootMutex = root.getMutex();
        CNMEM_CHECK(rootMutex->lock());
        // We acquired all the lock so we try to steal a node from another child.
        // (At this point numLocked == numChildren, so the condition always holds.)
        if( numLocked == mutexes.size() ) {
            result = manager->stealUnsafe(ptr[0], size);
        }
        for( std::size_t i = 0 ; i < numLocked ; ++i ) {
            cnmemStatus_t lockStatus = mutexes[i]->unlock();
            if( lockStatus != CNMEM_STATUS_SUCCESS ) {
                // Starting from now we are panicking!!! One lock failed to be released, we try
                // we others. We could also give up because we are already screwed. I don't know
                // what's best! Comment are welcome.
                result = lockStatus;
            }
        }
        CNMEM_CHECK(rootMutex->unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return a block previously obtained from cnmemMalloc to the manager associated with
// `stream` (the root manager when stream is NULL). Freeing NULL is a no-op, like
// free(). Thread safe.
cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    if( !ptr ) {
        return CNMEM_STATUS_SUCCESS;
    }
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *targetManager = &rootManager;
    // A named stream routes the release to the child manager registered for it.
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(targetManager, stream));
    }
    CNMEM_ASSERT(targetManager);
    return targetManager->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Report the free and total amount of memory managed by the manager associated with
// `stream` (the root manager when stream is NULL). Linear in the number of allocated
// blocks — avoid calling it in performance-critical sections. Thread safe.
cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT);
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *targetManager = &rootManager;
    // A named stream routes the query to the child manager registered for it.
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(targetManager, stream));
    }
    CNMEM_ASSERT(targetManager);
    // Hold the manager's mutex while walking its block lists.
    const cnmem::Mutex *managerMutex = targetManager->getMutex();
    CNMEM_CHECK(managerMutex->lock());
    CNMEM_CHECK_OR_UNLOCK(targetManager->getFreeMemoryUnsafe(freeMem[0]), *managerMutex);
    size_t memInUse;
    CNMEM_CHECK_OR_UNLOCK(targetManager->getUsedMemoryUnsafe(memInUse), *managerMutex);
    CNMEM_CHECK(managerMutex->unlock());
    // Total = what is currently in use plus what is still free.
    *totalMem = memInUse + *freeMem;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dump the block lists of the manager associated with `stream` (the root manager when
// stream is NULL) to `file`. Debugging aid; thread safe.
cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *targetManager = &rootManager;
    // A named stream routes the request to the child manager registered for it.
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(targetManager, stream));
    }
    CNMEM_ASSERT(targetManager);
    return targetManager->printMemoryState(file);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
......@@ -35,16 +35,16 @@
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CUMEM_DLLEXPORT
#define CUMEM_API __declspec(dllexport)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CUMEM_API __declspec(dllimport)
#define CNMEM_API __declspec(dllimport)
#endif
#else
#define CUMEM_API
#define CNMEM_API
#endif
#define CUMEM_VERSION 100 // It corresponds to 1.0.0
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
......@@ -54,29 +54,27 @@ extern "C" {
typedef enum
{
CUMEM_STATUS_SUCCESS = 0,
CUMEM_STATUS_CUDA_ERROR,
CUMEM_STATUS_INVALID_ARGUMENT,
CUMEM_STATUS_MEMORY_LEAK,
CUMEM_STATUS_NOT_INITIALIZED,
CUMEM_STATUS_OUT_OF_MEMORY,
CUMEM_STATUS_UNKNOWN_ERROR
} cumemStatus_t;
CNMEM_STATUS_SUCCESS = 0,
CNMEM_STATUS_CUDA_ERROR,
CNMEM_STATUS_INVALID_ARGUMENT,
CNMEM_STATUS_MEMORY_LEAK,
CNMEM_STATUS_NOT_INITIALIZED,
CNMEM_STATUS_OUT_OF_MEMORY,
CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;
/* ********************************************************************************************* */
typedef enum
{
CUMEM_FLAGS_DEFAULT = 0, /// Default flags.
CUMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CUMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
CUMEM_FLAGS_USE_UNIFIED_MEM = 4, /// Use Managed Memory for allocating memory.
CUMEM_FLAGS_MEM_ATTACH_HOST = 8 /// Host-only visible unified memory. (valid only for unified memory!)
} cumemManagerFlags_t;
CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
typedef struct cumemDevice_t_
typedef struct cnmemDevice_t_
{
/** The device number. */
int device;
......@@ -86,10 +84,10 @@ typedef struct cumemDevice_t_
int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams;
/** The memory allocation granularity (in multiples of 512 B). If 0, the implementation chooses the size */
size_t granularity;
/** The size reserved for each streams. It can be 0. */
size_t *streamSizes;
} cumemDevice_t;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
......@@ -99,47 +97,58 @@ typedef struct cumemDevice_t_
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be before any other function in the library. It has to be called by a
* single thread since it is not thread-safe.
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CUMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CUMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cumemStatus_t CUMEM_API cumemInit(int numDevices, const cumemDevice_t *devices, unsigned flags);
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cumemMalloc/cumemFree have joined. This function is not thread-safe.
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CUMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called,
* CUMEM_STATUS_MEMORY_LEAK, if there are unreleased blocks in the memory queues,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_MEMORY_LEAK, if there are unreleased blocks in the memory queues,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cumemStatus_t CUMEM_API cumemFinalize();
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CUMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CUMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CUMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CUMEM_STATUS_OUT_OF_MEMORY.
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
......@@ -148,17 +157,18 @@ cumemStatus_t CUMEM_API cumemFinalize();
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and it cannot be executed by more
* than one thread at the time (per device).
* make sure no kernel uses a given buffer before stealing it) and it the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CUMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CUMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream);
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
......@@ -167,31 +177,31 @@ cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream
* thread safe.
*
* \return
* CUMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cumemStatus_t CUMEM_API cumemFree(void *ptr, cudaStream_t stream);
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the root memory manager on devices.
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
* The pointers used_mem and free_mem must point to memory regions of numDevices*sizeof(size_t)
* bytes. At the moment, this function has a complexity linear in the number of allocated blocks
* so do not call it in performance critical sections.
 * The pointers totalMem and freeMem must be valid. At the moment, this function has a
 * complexity linear in the number of allocated blocks, so do not call it in performance
 * critical sections.
*
* \return
* CUMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem);
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
......@@ -200,18 +210,18 @@ cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem);
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CUMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
* or free_mem == 0,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cumemStatus_t CUMEM_API cumemPrintMemoryState(FILE *file);
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cumemStatus_t value to a string.
* \brief Converts a cnmemStatus_t value to a string.
*/
const char* cumemGetErrorString(cumemStatus_t status);
const char* cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
......
......@@ -9,8 +9,8 @@
#include "cuda_ndarray.cuh"
#include "cumem.h"
#include "cumem.cpp"
#include "cnmem.h"
#include "cnmem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0
......@@ -71,20 +71,20 @@ void * device_malloc(size_t size)
}
///@TODO: thejaswi: link this option to a theano config variable?
static bool g_use_cumem = false;
static bool g_use_cnmem = false;
static const int g_max_devices = 8;
int initCumem(int card_number_provided, int card_nb) {
static bool cumemInitialized = false;
if(cumemInitialized) {
int initCnmem(int card_number_provided, int card_nb) {
static bool cnmemInitialized = false;
if(cnmemInitialized) {
return 0;
}
// On stderr to be at the same place as "Using gpu device..."
fprintf(stderr, "Initializing cumem...\n");
fprintf(stderr, "Initializing cnmem...\n");
int numDevices = 0;
cumemDevice_t devices[g_max_devices];
cnmemDevice_t devices[g_max_devices];
if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"initCumem: 'cudaGetDeviceCount' failed! Reason=%s\n",
"initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
return -1;
}
......@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0;
devices[i].streams = NULL;
devices[i].granularity = 0;
}else{
for(int i=0;i<numDevices;++i) {
......@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0;
devices[i].streams = NULL;
devices[i].granularity = 0;
}
}
///@TODO: thejaswi: passing custom cumem flags?
cumemStatus_t status = cumemInit(numDevices, devices, CUMEM_FLAGS_DEFAULT);
if(status != CUMEM_STATUS_SUCCESS) {
///@TODO: thejaswi: passing custom cnmem flags?
cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"initCumem: cumemInit call failed! Reason=%s. numdev=%d\n",
cumemGetErrorString(status), numDevices);
"initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
cnmemGetErrorString(status), numDevices);
return -1;
}
cumemInitialized = true;
cnmemInitialized = true;
return 0;
}
......@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose)
#endif
void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams?
if(g_use_cumem) {
cumemStatus_t status = cumemMalloc(&rval, size, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
if(g_use_cnmem) {
cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
if( size != 0 ) {
status = cnmemMalloc(&rval, size, NULL);
}
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"Error allocating %zd bytes of device memory (%s).",
size, cumemGetErrorString(status));
size, cnmemGetErrorString(status));
return NULL;
}
}
......@@ -271,11 +272,11 @@ int device_free(void *ptr)
}
///@TODO: thejaswi: multi-stream support
if(g_use_cumem) {
cumemStatus_t status = cumemFree(ptr, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cumemFree call failed! Reason=%s\n",
cumemGetErrorString(status));
if(g_use_cnmem) {
cnmemStatus_t status = cnmemFree(ptr, NULL);
if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
cnmemGetErrorString(status));
}
}
else {
......@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init();
static void cublas_shutdown();
// Initialize the gpu.
// Takes two optional parameters, the device number and if we should use cumem.
// Takes two optional parameters, the device number and if we should use cnmem.
// If the device number is provided, it sets that device to be the active device.
// If not provided (usually just to test whether the gpu is available at all),
// it does not set an active device.
// Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
// cumem is threaded like a bool. If converted to 0, don't use cumem. Otherwise, use it.
// cnmem is treated like a bool. If it converts to 0, don't use cnmem. Otherwise, use it.
PyObject *
CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{
int card_nb = 0;
int card_number_provided = 1;
int cumem = 0; // 0 False, 1 True
int cnmem = 0; // 0 False, 1 True
// if we're given something wildly invalid, this will throw a TypeError
PyArg_ParseTuple(args, "|ii", &card_nb, &cumem);
if(cumem)
g_use_cumem = true;
PyArg_ParseTuple(args, "|ii", &card_nb, &cnmem);
if(cnmem)
g_use_cnmem = true;
if(PyTuple_Size(args) == 0) {
card_number_provided = 0;
......@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
if (cublas_init() == -1)
return NULL;
}
if(card_number_provided && g_use_cumem) {
if(initCumem(card_number_provided, card_nb) == -1){
if(card_number_provided && g_use_cnmem) {
if(initCnmem(card_number_provided, card_nb) == -1){
return NULL;
}
}
......@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here
cublas_shutdown();
g_gpu_context_active = 0; // context has now been closed down
if(g_use_cumem) {
fprintf(stderr, "Shutting down cumem...\n");
cumemStatus_t status = cumemFinalize();
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cumemFinalize failed! Reason=%s\n",
cumemGetErrorString(status));
if(status == CUMEM_STATUS_CUDA_ERROR) {
if(g_use_cnmem) {
fprintf(stderr, "Shutting down cnmem...\n");
cnmemStatus_t status = cnmemFinalize();
if(status != CNMEM_STATUS_SUCCESS && status != CNMEM_STATUS_MEMORY_LEAK) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
cnmemGetErrorString(status));
if(status == CNMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
}
......
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cumem.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
#define CUMEM_DEFAULT_GRANULARITY 512
//#define CUMEM_DEBUG
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cumem {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Evaluate a cumem call and return early from the enclosing function with the
// error status if the call did not succeed.
#define CUMEM_CHECK(call) do { \
    cumemStatus_t status = call; \
    if( status != CUMEM_STATUS_SUCCESS ) { \
        return status; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Same as CUMEM_CHECK, but releases `lock` before returning so the error path
// does not leave the mutex held.
#define CUMEM_CHECK_OR_UNLOCK_AND_RETURN(call, lock) do { \
    cumemStatus_t status = call; \
    if( status != CUMEM_STATUS_SUCCESS ) { \
        lock.unlock(); \
        return status; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return `error` from the enclosing function when `cond` does not hold.
#define CHECK(cond, error) do { \
    if( !(cond) ) { \
        return error; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Translate a CUDA runtime error into a cumem status: allocation failures map to
// CUMEM_STATUS_OUT_OF_MEMORY, any other failure to CUMEM_STATUS_CUDA_ERROR.
#define CHECK_CUDA(call) do { \
    cudaError_t cuda_error = call; \
    if( cuda_error == cudaErrorMemoryAllocation ) { \
        return CUMEM_STATUS_OUT_OF_MEMORY; \
    } \
    else if( cuda_error != cudaSuccess ) { \
        return CUMEM_STATUS_CUDA_ERROR; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef WIN32
// Win32 variant: several Win32 synchronization calls return void, so failure is
// detected through GetLastError() after clearing it first.
#define CHECK_WIN32(call, error_code) do { \
    SetLastError(0); /* Clean the flag. */ \
    call; \
    DWORD status = GetLastError(); \
    if( status ) \
        return error_code; \
} while(0)
#else
// POSIX variant: pthread functions report failure via a non-zero return code.
#define CHECK_PTHREAD(call, error_code) do { \
    int status = call; \
    if( status ) { \
        return error_code; \
    } \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Minimal portable mutex wrapping a Win32 CRITICAL_SECTION or a POSIX
/// pthread_mutex_t. All operations report failures through cumemStatus_t
/// instead of throwing. The mutex is NOT recursive (the recursive pthread
/// setup is commented out in Mutex::initialize).
class Mutex
{
#ifdef WIN32
    /// The Win32 critical section backing this mutex.
    CRITICAL_SECTION m_critical_section;
#else
    /// The POSIX mutex backing this mutex.
    pthread_mutex_t m_mutex;
#endif
public:
    /// Initialize the mutex. Must be called before lock/unlock/finalize.
    cumemStatus_t initialize();
    /// Finalize the mutex. No thread may hold it when this is called.
    cumemStatus_t finalize();
    /// Lock the mutex (blocking).
    cumemStatus_t lock();
    /// Unlock the mutex.
    cumemStatus_t unlock();
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize the underlying OS primitive. The mutex is NOT recursive: a thread
// that locks it twice will deadlock (see the commented-out recursive variant).
cumemStatus_t Mutex::initialize()
{
#ifdef WIN32
    CHECK_WIN32(InitializeCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    // pthread_mutexattr_t attr;
    // CHECK_PTHREAD_OR_THROW(pthread_mutexattr_init(&attr), CUMEM_STATUS_UNKNOWN_ERROR);
    // CHECK_PTHREAD_OR_THROW(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CUMEM_STATUS_UNKNOWN_ERROR);
    // CHECK_PTHREAD_OR_THROW(pthread_mutex_init(&m_mutex, &attr), CUMEM_STATUS_UNKNOWN_ERROR);
    CHECK_PTHREAD(pthread_mutex_init(&m_mutex, NULL), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // Fix: the function previously fell off the end without returning a value,
    // which is undefined behavior for a non-void function. Report success.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Destroy the underlying OS primitive. The mutex must not be held by any thread.
cumemStatus_t Mutex::finalize()
{
#ifdef WIN32
    CHECK_WIN32(DeleteCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_destroy(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // Fix: the function previously fell off the end without returning a value,
    // which is undefined behavior for a non-void function. Report success.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Acquire the mutex, blocking until it becomes available.
cumemStatus_t Mutex::lock()
{
#ifdef WIN32
    CHECK_WIN32(EnterCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_lock(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // Fix: the function previously fell off the end without returning a value,
    // which is undefined behavior for a non-void function. Report success.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Release the mutex. The calling thread must currently hold it.
cumemStatus_t Mutex::unlock()
{
#ifdef WIN32
    CHECK_WIN32(LeaveCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_unlock(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // Fix: the function previously fell off the end without returning a value,
    // which is undefined behavior for a non-void function. Report success.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
class Lock
{
/// The mutex.
Mutex *m_mutex;
public:
/// Ctor.
Lock() : m_mutex(NULL) {}
/// Lock the mutex.
inline cumemStatus_t lock(Mutex *mutex) { m_mutex = mutex; return m_mutex->lock(); }
/// Unlock the mutex.
inline cumemStatus_t unlock() { return m_mutex->unlock(); }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A node of the singly linked lists that track memory regions on the device.
/// Each block describes a contiguous range [data, data+size) and whether the
/// range was obtained directly from the parent allocator ("head" block).
class Block
{
    /// Device pointer to the first byte of the region tracked by this block.
    char *m_ptr;
    /// Number of bytes in the region.
    std::size_t m_num_bytes;
    /// Following block in the singly linked list (NULL at the tail).
    Block *m_link;
    /// Whether the region came straight from parent->allocate or cudaMalloc.
    bool m_head;
public:
    /// Build a block covering [data, data+size), linked in front of next.
    Block(char *data, std::size_t size, Block *next, bool is_head)
        : m_ptr(data)
        , m_num_bytes(size)
        , m_link(next)
        , m_head(is_head)
    {}
    /// The data.
    inline const char* get_data() const { return m_ptr; }
    /// The data (mutable).
    inline char* get_data() { return m_ptr; }
    /// The size of the block.
    inline std::size_t get_size() const { return m_num_bytes; }
    /// The next block in the linked list.
    inline const Block* get_next() const { return m_link; }
    /// The next block in the linked list (mutable).
    inline Block* get_next() { return m_link; }
    /// Is it a head block.
    inline bool is_head() const { return m_head; }
    /// Change the next block.
    inline void set_next(Block *next) { m_link = next; }
    /// Change the size of the block.
    inline void set_size(std::size_t size) { m_num_bytes = size; }
    /// Set the head flag.
    inline void set_head_flag(bool is_head) { m_head = is_head; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A memory manager. Root managers (one per device) own memory obtained from CUDA; child
/// managers (one per stream) draw from their root. Each manager keeps an address-sorted free
/// list and a used list of Block nodes, protected by its own mutex.
class Manager
{
    /// The parent manager (NULL for a per-device root manager).
    Manager *m_parent;
    /// The children managers (one per stream registered at init time).
    std::vector<Manager> m_children;
    /// The GPU device where the memory is allocated.
    int m_device;
    /// The stream this manager is associated with. It could be NULL.
    cudaStream_t m_stream;
    /// The list of used blocks.
    Block *m_used_blocks;
    /// The list of free blocks.
    Block *m_free_blocks;
    /// The managed memory size.
    std::size_t m_size;
    /// The memory allocation granularity.
    std::size_t m_granularity;
    /// The flags.
    unsigned m_flags;
    /// To support multi-threading. Each manager has its own mutex.
    Mutex m_mutex;
public:
    /// Create an uninitialized manager.
    Manager();
    /// Dtor. Releases everything the manager still owns.
    ~Manager();
    /// Allocate a block of memory.
    cumemStatus_t allocate(void *&ptr, std::size_t size);
    /// Release a block of memory.
    cumemStatus_t release(void *ptr);
    /// Release memory. It returns true if we have no memory leak.
    cumemStatus_t release_all(bool &memory_leak);
    /// Reserve memory for a manager.
    cumemStatus_t reserve(std::size_t size);
    /// Steal memory from another manager.
    cumemStatus_t steal(void *&ptr, std::size_t size);
    /// Print the list of free blocks.
    inline std::size_t print_free_blocks(FILE *file) const
    {
        return print_list(file, "free", m_free_blocks);
    }
    /// Print the list of used blocks.
    inline std::size_t print_used_blocks(FILE *file) const
    {
        return print_list(file, "used", m_used_blocks);
    }
    /// The root manager for a given device.
    static inline Manager& get_root_manager(int device) { return get_root_managers()[device]; }
    /// The list of all the root managers.
    static std::vector<Manager>& get_root_managers();
    /// The amount of used memory.
    inline std::size_t get_used_memory() const { return get_memory(m_used_blocks); }
    /// The amount of free memory.
    inline std::size_t get_free_memory() const { return get_memory(m_free_blocks); }
    /// The children.
    inline std::vector<Manager>& get_children() { return m_children; }
    /// The children.
    inline const std::vector<Manager>& get_children() const { return m_children; }
    /// Get a specific child based on the stream id.
    cumemStatus_t get_child(Manager *&manager, cudaStream_t stream);
    /// The associated device.
    inline int get_device() const { return m_device; }
    /// The flags.
    inline unsigned get_flags() const { return m_flags; }
    /// Get the mutex.
    inline Mutex* get_mutex() { return &m_mutex; }
    /// The size allocated to that manager.
    inline std::size_t get_size() const { return m_size; }
    /// The CUDA stream.
    inline cudaStream_t get_stream() const { return m_stream; }
    /// The allocation granularity.
    inline std::size_t get_granularity() const { return m_granularity; }
    /// Define the parent.
    inline void set_parent(Manager *parent) { m_parent = parent; }
    /// Define the device.
    inline void set_device(int device) { m_device = device; }
    /// Define the stream.
    inline void set_stream(cudaStream_t stream) { m_stream = stream; }
    /// Define the flags.
    inline void set_flags(unsigned flags) { m_flags = flags; }
    /// Define the granularity. Bug fix: the parameter was "unsigned", which silently truncated
    /// the std::size_t granularity computed by cumemInit; it now matches m_granularity's type.
    inline void set_granularity(std::size_t granularity) { m_granularity = granularity; }
private:
    /// Allocate a new block and add it to the free list.
    cumemStatus_t allocate_block(Block *&curr, Block *&prev, std::size_t size);
    /// Release a block from the active list.
    cumemStatus_t release_block(Block *curr, Block *prev);
    /// Find the best free node based on the size.
    cumemStatus_t find_best_block(Block *&curr, Block *&prev, std::size_t size);
    /// Extract a node from the list of free blocks.
    cumemStatus_t extract_block(Block *curr, Block *prev, std::size_t size, bool stolen);
    /// Give a free block from that manager.
    cumemStatus_t give_block(void *&data, std::size_t &data_size, std::size_t size);
    /// Steal a block from another manager.
    cumemStatus_t steal_block(void *&data, std::size_t &data_size, std::size_t size);
    /// The memory consumption of a list.
    std::size_t get_memory(const Block *head) const;
    /// Print an internal linked list.
    std::size_t print_list(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Default constructor: build an uninitialized manager. The device is -1 (no device bound yet),
/// both block lists are empty and the granularity/flags take their compile-time defaults;
/// callers configure the manager through the set_* accessors before using it.
Manager::Manager()
    : m_parent(NULL)
    , m_children()
    , m_device(-1)
    , m_stream(NULL)
    , m_used_blocks(NULL)
    , m_free_blocks(NULL)
    , m_size(0)
    , m_granularity(CUMEM_DEFAULT_GRANULARITY)
    , m_flags(CUMEM_FLAGS_DEFAULT)
    , m_mutex()
{}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Destructor: give back every block this manager still owns. The memory-leak flag is
/// deliberately ignored — a destructor has no way to report the condition to the caller.
Manager::~Manager()
{
    bool memory_leak;
    release_all(memory_leak);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Allocate "size" bytes from this manager's pool, under the manager's mutex. The best-fitting
/// free block is taken (growing the pool if allowed by the flags), split to the requested size,
/// and pushed onto the used list; "ptr" then points at its data. On the out-of-memory path
/// "ptr" is set to NULL and CUMEM_STATUS_OUT_OF_MEMORY is returned.
cumemStatus_t Manager::allocate(void *&ptr, std::size_t size)
{
    // Lock to make sure only one thread execute that fragment of code.
    Lock lock;
    CUMEM_CHECK(lock.lock(&m_mutex));
    // Find the best fit.
    Block *best = NULL, *prev = NULL;
    CUMEM_CHECK_OR_UNLOCK_AND_RETURN(find_best_block(best, prev, size), lock);
    // If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
    if( best == NULL && !(m_flags & CUMEM_FLAGS_CANNOT_GROW) )
    {
        CUMEM_CHECK_OR_UNLOCK_AND_RETURN(allocate_block(best, prev, size), lock);
    }
    // Make sure we do have a block or quit.
    if( !best )
    {
        CUMEM_CHECK(lock.unlock());
        ptr = NULL;
        return CUMEM_STATUS_OUT_OF_MEMORY;
    }
    // Split the free block if needed.
    CUMEM_CHECK_OR_UNLOCK_AND_RETURN(extract_block(best, prev, size, false), lock);
    // Push the node to the list of used nodes.
    best->set_next(m_used_blocks);
    m_used_blocks = best;
    // Return the new pointer into memory.
    CUMEM_CHECK(lock.unlock());
    ptr = m_used_blocks->get_data();
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Grow the pool by "size" bytes: child managers draw from their parent, root managers call
/// cudaMallocManaged or cudaMalloc depending on the flags. The new region is inserted into the
/// free list, which is kept sorted by address; on success "curr" points to the new head block
/// and "prev" to its predecessor in the free list (NULL when the block comes first).
cumemStatus_t Manager::allocate_block(Block *&curr, Block *&prev, std::size_t size)
{
    // Reset the outputs.
    curr = prev = NULL;
    // Try to allocate data from the parent or the device.
    void *data = NULL;
    if( m_parent )
        CUMEM_CHECK(m_parent->allocate(data, size));
    else
    {
        if( m_flags & CUMEM_FLAGS_USE_UNIFIED_MEM )
        {
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaMallocManaged of size " << size << "B" << std::endl;
#endif
            // Attach mode is chosen from the flags: host-attached memory vs globally visible.
            if( m_flags & CUMEM_FLAGS_MEM_ATTACH_HOST )
            {
                CHECK_CUDA(cudaMallocManaged(&data, size, cudaMemAttachHost));
            }
            else
            {
                CHECK_CUDA(cudaMallocManaged(&data, size, cudaMemAttachGlobal));
            }
#ifdef CUMEM_DEBUG
            std::cout << "cudaMallocManaged of size " << size << "B address=" << (void*)data << std::endl;
#endif
        }
        else
        {
            // Make sure the allocation lands on this manager's device.
            CHECK_CUDA(cudaSetDevice(m_device));
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaMalloc of size " << size << "B" << std::endl;
#endif
            CHECK_CUDA(cudaMalloc(&data, size));
#ifdef CUMEM_DEBUG
            std::cout << "cudaMalloc of size " << size << "B address=" << (void*)data << std::endl;
#endif
        }
    }
    // If it failed, there's an unexpected issue.
    assert(data);
    // We have data, we now need to add it to the list of free nodes. We keep the list sorted.
    Block *next = m_free_blocks;
    for( ; next && next->get_data() < data ; next = next->get_next() )
        prev = next;
    curr = new Block((char*) data, size, next, true);
    if( prev )
        prev->set_next(curr);
    else
        m_free_blocks = curr;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Remove "size" bytes from the free block "curr" (whose predecessor in the free list is
/// "prev"). When curr is larger than requested it is split: curr is shrunk to "size" and a new
/// block for the remainder takes its place in the free list, with "stolen" used as the
/// remainder's head flag. After this call curr is no longer linked into the free list.
cumemStatus_t Manager::extract_block(Block *curr, Block *prev, std::size_t size, bool stolen)
{
    // We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
    Block *next;
    if( curr->get_size() == size )
        next = curr->get_next();
    else
    {
        std::size_t remaining = curr->get_size()-size;
        Block *new_block = new Block(curr->get_data() + size, remaining, curr->get_next(), stolen);
        next = new_block;
        curr->set_size(size);
    }
    // Redo the "branching" in the nodes.
    if( prev )
        prev->set_next(next);
    else
        m_free_blocks = next;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Best-fit search: scan the free list for the smallest block that can hold "size" bytes.
/// "best" receives the chosen block (NULL when nothing fits) and "prev" its predecessor in
/// the free list (NULL when best is the first node).
cumemStatus_t Manager::find_best_block(Block *&best, Block *&prev, std::size_t size)
{
    best = NULL;
    prev = NULL;
    Block *candidate = m_free_blocks;
    Block *candidate_prev = NULL;
    while( candidate )
    {
        bool fits = candidate->get_size() >= size;
        bool better = !best || candidate->get_size() < best->get_size();
        if( fits && better )
        {
            best = candidate;
            prev = candidate_prev;
        }
        candidate_prev = candidate;
        candidate = candidate->get_next();
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Find the child manager bound to the given stream. Returns CUMEM_STATUS_INVALID_ARGUMENT
/// when no child is associated with that stream (the output is left untouched in that case).
cumemStatus_t Manager::get_child(Manager *&manager, cudaStream_t stream)
{
    std::size_t num_children = m_children.size();
    for( std::size_t idx = 0 ; idx < num_children ; ++idx )
    {
        if( m_children[idx].m_stream == stream )
        {
            manager = &m_children[idx];
            return CUMEM_STATUS_SUCCESS;
        }
    }
    return CUMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
std::size_t Manager::get_memory(const Block *head) const
{
std::size_t size = 0;
for( Block *curr = (Block*) head ; curr ; curr = curr->get_next() )
size += curr->get_size();
return size;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Return the process-wide list of root managers (one slot per device id). The vector is a
/// function-local static, so it is created on first use.
std::vector<Manager>& Manager::get_root_managers()
{
    static std::vector<Manager> managers;
    return managers;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Donate a free region of at least "size" bytes to another manager. The best-fitting free
/// block is extracted from this manager's free list (its Block node is deleted) and its data
/// pointer and actual size are returned through block_data/block_size. The manager's stream is
/// synchronized first so the donated memory is no longer in flight.
cumemStatus_t Manager::give_block(void *&block_data, std::size_t &block_size, std::size_t size)
{
    // Make sure the block is not in use any more. It could be too coarse grain and we may change
    // it in the future.
    CHECK_CUDA(cudaStreamSynchronize(m_stream));
    // Init the returned values to 0.
    block_data = NULL;
    block_size = 0;
    // Find the best node to steal and reserve it.
    Block *best = NULL, *prev = NULL;
    CUMEM_CHECK(find_best_block(best, prev, size));
    if( !best )
        return CUMEM_STATUS_OUT_OF_MEMORY;
    CUMEM_CHECK(extract_block(best, prev, size, true));
    block_data = best->get_data();
    block_size = best->get_size();
    // Release the memory used by that block.
    delete best;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
std::size_t Manager::print_list(FILE *file, const char *name, const Block *head) const
{
std::size_t size = 0;
for( Block *curr = (Block*) head; curr; curr = curr->get_next() )
size += curr->get_size();
fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
for( Block *curr = (Block*) head ; curr ; curr = curr->get_next() )
{
fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
(std::size_t) curr,
(std::size_t) curr->get_data(),
(std::size_t) curr->get_size(),
(std::size_t) curr->get_next(),
(std::size_t) curr->is_head ());
}
fprintf(file, "|\n");
return size;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Give a block of memory back to this manager.
///
/// \param ptr A pointer previously handed out by allocate(). Releasing NULL is a no-op.
/// \return CUMEM_STATUS_SUCCESS on success, CUMEM_STATUS_INVALID_ARGUMENT when ptr does not
///         belong to this manager's used list.
cumemStatus_t Manager::release(void *ptr)
{
    // Skip if ptr is NULL.
    if( ptr == NULL )
    {
        printf("release(NULL)\n");
        // Bug fix: the "return" keyword was missing, so the status was a no-op expression
        // statement and releasing NULL fell through to report CUMEM_STATUS_INVALID_ARGUMENT.
        return CUMEM_STATUS_SUCCESS;
    }
    // Lock to make sure only one thread execute that fragment of code.
    Lock lock;
    CUMEM_CHECK(lock.lock(&m_mutex));
    // Find the node in the list of used blocks.
    Block *curr = m_used_blocks, *prev = NULL;
    for( ; curr && curr->get_data() != ptr ; curr = curr->get_next() )
        prev = curr;
    // Make sure we have found a node.
    if( curr == NULL )
    {
        CUMEM_CHECK(lock.unlock());
        return CUMEM_STATUS_INVALID_ARGUMENT;
    }
    // We have the node so release it.
    cumemStatus_t result = release_block(curr, prev);
    CUMEM_CHECK(lock.unlock());
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release everything this manager holds: recursively tear down the children, force-release
/// any block still sitting in the used list (roots only), then free all remaining free blocks
/// — through the parent for child managers, through cudaFree for root-owned head blocks.
/// "memory_leaks" is set when any child reported a leak; the check of this manager's own used
/// list is currently disabled (see the HACK note below).
cumemStatus_t Manager::release_all(bool &memory_leaks)
{
    // Destroy the children if any.
    bool ok = true;
    for( std::size_t i = 0; i < m_children.size(); ++i )
    {
        bool tmp;
        CUMEM_CHECK(m_children[i].release_all(tmp));
        ok = ok && !tmp;
    }
    // TODO: thejaswi: HACK! HACK! HACK!
    // We have some issues when integrating into some libraries. This has to fixed in the libs.
    // memory_leaks = !ok || m_used_blocks;
    memory_leaks = !ok;
    // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
    if( !m_parent )
        while( m_used_blocks )
            CUMEM_CHECK(release_block(m_used_blocks, NULL));
    // We should be having only free blocks that are head blocks. Release those blocks.
    while( m_free_blocks )
    {
        if( m_parent )
            CUMEM_CHECK(m_parent->release(m_free_blocks->get_data()));
        else if( m_free_blocks->is_head() )
        {
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaFree of size " << m_free_blocks->get_size()
                      << "B address=" << (void*)m_free_blocks->get_data() << std::endl;
#endif
            CHECK_CUDA(cudaFree(m_free_blocks->get_data()));
#ifdef CUMEM_DEBUG
            std::cout << "cudaFree of size " << m_free_blocks->get_size() << "B successful" << std::endl;
#endif
        }
        Block *block = m_free_blocks;
        m_free_blocks = m_free_blocks->get_next();
        delete block;
    }
    // We shouldn't have any used block left. Or, it means the user is causing memory leaks!
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Move "curr" (whose predecessor in the used list is "prev") from the used list back into the
/// free list, which is kept sorted by address. The freed block is coalesced with its free
/// neighbors when they are contiguous in memory, but never across an allocation boundary
/// (a block whose head flag is set starts a separate cudaMalloc/parent allocation).
cumemStatus_t Manager::release_block(Block *curr, Block *prev)
{
    // The current node cannot be NULL!
    assert(curr != NULL);
    // Change the connection of the node.
    if( prev )
        prev->set_next(curr->get_next());
    else
        m_used_blocks = curr->get_next();
    // Find the location where this block should be added to the free list.
    prev = NULL;
    Block *iter = m_free_blocks;
    for( ; iter && iter->get_data() < curr->get_data() ; iter = iter->get_next() )
        prev = iter;
    // Keep track of the successor of pred. We may lose track of it in the following "else".
    Block *next = prev ? prev->get_next() : m_free_blocks;
    // We first check if we can merge the block with its predecessor in the list and curr can be merged.
    if( prev && prev->get_data() + prev->get_size() == curr->get_data() && !curr->is_head() )
    {
        prev->set_size(prev->get_size() + curr->get_size());
        delete curr;
        curr = prev;
    }
    else if( prev )
        prev->set_next(curr);
    else
        m_free_blocks = curr;
    // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
    if( next && curr->get_data() + curr->get_size() == next->get_data() && !next->is_head() )
    {
        curr->set_size(curr->get_size() + next->get_size());
        curr->set_next(next->get_next());
        delete next;
    }
    else
        curr->set_next(next);
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Pre-allocate "size" bytes for this manager and record that amount as the managed size.
cumemStatus_t Manager::reserve(std::size_t size)
{
    Block *new_block = NULL;
    Block *prev_block = NULL;
    CUMEM_CHECK(allocate_block(new_block, prev_block, size));
    m_size = size;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Try to satisfy an allocation of "size" bytes with memory taken from a related manager:
/// from one of our own children when we have any, otherwise through our parent's children.
/// The stolen region is pushed on our used list as a head block. Fails immediately when the
/// CUMEM_FLAGS_CANNOT_STEAL flag is set.
cumemStatus_t Manager::steal(void *&stolen, std::size_t size)
{
    // If we cannot steal, don't even try.
    if( m_flags & CUMEM_FLAGS_CANNOT_STEAL )
    {
        stolen = NULL;
        return CUMEM_STATUS_INVALID_ARGUMENT;
    }
    // The stolen block.
    void *data = NULL; std::size_t data_size = 0;
    if( !m_children.empty() )
        CUMEM_CHECK(steal_block(data, data_size, size));
    else if( m_parent )
        CUMEM_CHECK(m_parent->steal_block(data, data_size, size));
    // Make sure we do have a block of memory or quit.
    if( !data )
    {
        stolen = NULL;
        return CUMEM_STATUS_OUT_OF_MEMORY;
    }
    // Push the block in the used list.
    m_used_blocks = new Block((char*) data, data_size, m_used_blocks, true);
    // Return the new pointer into memory.
    stolen = data;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Ask the children, one by one, for a free region of at least "size" bytes (via give_block).
/// The donated region lies inside one of this manager's used blocks, so that block is split
/// into up to three pieces so the region [data, data+data_size) gets its own node in the used
/// list; the fragments created by the split are non-head blocks.
cumemStatus_t Manager::steal_block(void *&data, std::size_t &data_size, ::size_t size)
{
    // No block found and no room to grow. Try to steal from a children (if we have any).
    data = NULL;
    for( std::size_t i = 0 ; !data && i < m_children.size() ; ++i )
        if( m_children[i].give_block(data, data_size, size) == CUMEM_STATUS_SUCCESS )
            break;
    // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
    if( !data )
        return CUMEM_STATUS_OUT_OF_MEMORY;
    // We have got a node from a children. We need to update our "used" list before we can do
    // anything with it.
    Block *curr = m_used_blocks, *prev = NULL;
    for( ; curr ; curr = curr->get_next() )
    {
        if( curr->get_data() <= data && data < curr->get_data()+curr->get_size() )
            break;
        prev = curr;
    }
    // Curr points to the node which contains that memory region.
    assert(curr);
    // If it is exactly the same memory region, we are done!!!
    if( curr->get_data() == data && curr->get_size() == data_size )
        return CUMEM_STATUS_SUCCESS;
    // Track the blocks before and after curr.
    Block *next = curr->get_next();
    // We may have up to 3 blocks.
    std::size_t size_before = (std::size_t) ((char*) data - curr->get_data());
    std::size_t size_after = (curr->get_size() - size_before - data_size);
    // The resulting block.
    Block *result = curr;
    // If we have no space between curr->get_data and block->get_data.
    if( size_before == 0 )
        curr->set_size(data_size);
    else
    {
        curr->set_size(size_before);
        Block *block = new Block((char*) data, data_size, next, false);
        curr->set_next(block);
        curr = block;
        data = (char*) data + data_size;
        data_size = size_after;
        result = block;
    }
    // We have space at the end so we may need to add a new node.
    if( size_after > 0 )
    {
        Block *block = new Block(curr->get_data() + curr->get_size(), size_after, next, false);
        curr->set_next(block);
        curr = block;
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
static void print_blocks(FILE *file, const Manager &manager)
{
fprintf(file, "device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
manager.get_device(),
(std::size_t) manager.get_stream(),
manager.get_used_memory(),
manager.get_free_memory());
manager.print_used_blocks(file);
manager.print_free_blocks(file);
fprintf(file, "\n");
for( std::size_t i = 0 ; i < manager.get_children().size() ; ++i )
print_blocks(file, manager.get_children()[i]);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cumem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Initialize the library: create one root manager per device listed in "devices" (the managed
/// size defaults to half the device's global memory when devices[i].size is 0, and is rounded
/// up to a multiple of the granularity, which must be a positive multiple of 512), plus one
/// child manager per registered stream, each pre-reserving a share of the root's pool.
cumemStatus_t cumemInit(int numDevices, const cumemDevice_t *devices, unsigned flags)
{
    // Make sure we have at least one device declared.
    CHECK(numDevices > 0, CUMEM_STATUS_INVALID_ARGUMENT);
    // Find the largest ID of the device.
    int max_device = 0;
    for( int i = 0 ; i < numDevices ; ++i )
        if( devices[i].device > max_device )
            max_device = devices[i].device;
    // Allocate enough managers.
    CHECK(max_device >= 0, CUMEM_STATUS_INVALID_ARGUMENT);
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    managers.resize(max_device+1);
    // Create a root manager for each device and create the children.
    for( int i = 0 ; i < numDevices ; ++i )
    {
        std::size_t size = devices[i].size;
        if( size == 0 )
        {
            // Default to half of the device's global memory.
            cudaDeviceProp props;
            CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CHECK(size > 0, CUMEM_STATUS_INVALID_ARGUMENT);
        std::size_t granularity = devices[i].granularity;
        if( granularity == 0 )
        {
            granularity = CUMEM_DEFAULT_GRANULARITY;
        }
        CHECK(granularity > 0, CUMEM_STATUS_INVALID_ARGUMENT);
        CHECK(((granularity % 512) == 0), CUMEM_STATUS_INVALID_ARGUMENT);
        cumem::Manager &manager = cumem::Manager::get_root_manager(devices[i].device);
        manager.set_device(devices[i].device);
        manager.set_flags(flags);
        manager.set_granularity(granularity);
        // Round the reserved size up to a multiple of the granularity.
        size = ((size + granularity - 1) / granularity) * granularity;
        CUMEM_CHECK(manager.reserve(size));
        std::vector<cumem::Manager> &children = manager.get_children();
        children.resize(devices[i].numStreams);
        for( int j = 0 ; j < devices[i].numStreams ; ++j )
        {
            // Each per-stream child draws from the root's pool and is allowed to grow.
            children[j].set_parent(&manager);
            children[j].set_device(devices[i].device);
            children[j].set_stream(devices[i].streams[j]);
            children[j].set_flags(flags & ~CUMEM_FLAGS_CANNOT_GROW);
            CUMEM_CHECK(children[j].reserve(size / (devices[i].numStreams + 1)));
        }
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Tear down every root manager created by cumemInit and report whether any manager leaked
/// memory. Calling it without a prior cumemInit returns CUMEM_STATUS_NOT_INITIALIZED.
cumemStatus_t cumemFinalize()
{
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    if( managers.empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    // Release all the managers and accumulate the leak reports.
    bool memory_leaks = false;
    for( std::size_t i = 0; i < managers.size(); ++i )
    {
        bool manager_leaked;
        CUMEM_CHECK(managers[i].release_all(manager_leaked));
        if( manager_leaked )
            memory_leaks = true;
    }
    managers.clear();
    return memory_leaks ? CUMEM_STATUS_MEMORY_LEAK : CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Allocate "size" bytes (rounded up to the manager's granularity) on the current device,
/// through the device's root manager or the child bound to "stream". When the chosen manager
/// is out of memory, every manager on the device is locked and a block is stolen from one of
/// the others.
cumemStatus_t cumemMalloc(void **ptr, std::size_t size, cudaStream_t stream)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    // ptr==NULL with size==0 is accepted as a no-op; otherwise both must be valid.
    if( !ptr && !size )
        return CUMEM_STATUS_SUCCESS;
    if( !ptr )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    if( !size )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    int device;
    CHECK_CUDA(cudaGetDevice(&device));
    cumem::Manager &root = cumem::Manager::get_root_manager(device);
    cumem::Manager *manager = &root;
    if( stream )
        CUMEM_CHECK(root.get_child(manager, stream));
    assert(manager);
    // Round the request up to the allocation granularity.
    size_t granularity = manager->get_granularity();
    size = ((size + granularity - 1) / granularity) * granularity;
    cumemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CUMEM_STATUS_OUT_OF_MEMORY )
    {
        // We need to acquire all the locks to all the managers to be able to steal memory. It's costly!
        typedef std::vector<cumem::Manager>::iterator Iterator;
        // Try to acquire locks on all the children.
        std::vector<cumem::Manager> &children = root.get_children();
        std::vector<cumem::Lock> locks(children.size() + 1);
        std::size_t num_locked = 0;
        for( Iterator it = children.begin() ; it != children.end() ; ++it, ++num_locked )
        {
            cumem::Mutex *mutex = it->get_mutex();
            if( locks[num_locked].lock(mutex) != CUMEM_STATUS_SUCCESS )
                break;
        }
        // We locked all the children, so we try to lock the root.
        if( num_locked == children.size() )
        {
            cumemStatus_t tmp_status = locks.back().lock(root.get_mutex());
            if( tmp_status == CUMEM_STATUS_SUCCESS )
                num_locked++;
        }
        // We acquired so we try to steal a node from another child.
        if( num_locked == locks.size() )
            result = manager->steal(ptr[0], size);
        // Release every lock we managed to take, even on the failure paths above.
        for( std::size_t i = 0 ; i < num_locked ; ++i )
            CUMEM_CHECK(locks[i].unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Give memory back to the pool of the current device, routing through the root manager or the
/// child bound to "stream". Freeing NULL is a success, mirroring free/cudaFree semantics.
cumemStatus_t cumemFree(void *ptr, cudaStream_t stream)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    if( ptr == NULL )
        return CUMEM_STATUS_SUCCESS;
    int device;
    CHECK_CUDA(cudaGetDevice(&device));
    cumem::Manager &root = cumem::Manager::get_root_manager(device);
    cumem::Manager *manager = &root;
    if( stream )
        CUMEM_CHECK(root.get_child(manager, stream));
    assert(manager);
    return manager->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Fill used_mem/free_mem with the per-device byte counts, one slot per initialized root
/// manager (slots that were never bound to a device are skipped). Each manager's counters are
/// read under its own lock.
cumemStatus_t cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    if( !used_mem || !free_mem )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    std::size_t slot = 0;
    for( std::size_t idx = 0 ; idx < managers.size() ; ++idx )
    {
        cumem::Manager &manager = managers[idx];
        // Skip slots that were never bound to a device.
        if( manager.get_device() == -1 )
            continue;
        // Read the counters under the manager's lock.
        cumem::Lock lock;
        CUMEM_CHECK(lock.lock(manager.get_mutex()));
        used_mem[slot] = manager.get_used_memory();
        free_mem[slot] = manager.get_free_memory();
        slot++;
        CUMEM_CHECK(lock.unlock());
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dump the internal state (used/free lists) of every initialized root manager, and of their
/// children, to "file". Each manager's lists are walked under its own lock.
cumemStatus_t cumemPrintMemoryState(FILE *file)
{
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    if( managers.empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    for( std::size_t idx = 0 ; idx < managers.size() ; ++idx )
    {
        cumem::Manager &manager = managers[idx];
        // Skip slots that were never bound to a device.
        if( manager.get_device() == -1 )
            continue;
        // Walk the lists under the manager's lock.
        cumem::Lock lock;
        CUMEM_CHECK(lock.lock(manager.get_mutex()));
        print_blocks(file, manager);
        CUMEM_CHECK(lock.unlock());
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Map a status code to its printable name. Unrecognized codes are reported as unknown errors.
const char* cumemGetErrorString(cumemStatus_t status)
{
    if( status == CUMEM_STATUS_SUCCESS )
        return "CUMEM_STATUS_SUCCESS";
    if( status == CUMEM_STATUS_CUDA_ERROR )
        return "CUMEM_STATUS_CUDA_ERROR";
    if( status == CUMEM_STATUS_INVALID_ARGUMENT )
        return "CUMEM_STATUS_INVALID_ARGUMENT";
    if( status == CUMEM_STATUS_MEMORY_LEAK )
        return "CUMEM_STATUS_MEMORY_LEAK";
    if( status == CUMEM_STATUS_NOT_INITIALIZED )
        return "CUMEM_STATUS_NOT_INITIALIZED";
    if( status == CUMEM_STATUS_OUT_OF_MEMORY )
        return "CUMEM_STATUS_OUT_OF_MEMORY";
    return "CUMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"