Commit 130d2ce9, authored by Julien Demouth, committed by Frederic

Add support for CNMeM library.

Update the code to support CNMeM (formerly known as Cumem).
Parent: 389c4aba
...@@ -2,6 +2,7 @@ global-include *.txt ...@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c global-include *.c
global-include *.cu global-include *.cu
global-include *.cuh global-include *.cuh
global-include *.cpp
global-include *.h global-include *.h
global-include *.sh global-include *.sh
global-include *.pkl global-include *.pkl
......
...@@ -164,7 +164,7 @@ def do_setup(): ...@@ -164,7 +164,7 @@ def do_setup():
install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'], install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'],
package_data={ package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl', '': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'*.h', 'ChangeLog'], '*.h', '*.cpp', 'ChangeLog'],
'theano.misc': ['*.sh'] 'theano.misc': ['*.sh']
}, },
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'], scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
...@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib', ...@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""", """Name of the cuda blas library for the linker.""",
StrParam('cublas')) StrParam('cublas'))
AddConfigVar('lib.cumem', AddConfigVar('lib.cnmem',
"""Do we enable cumem or not.""", """Do we enable cnmem or not.""",
# We should not mix both allocator, so we can't override # We should not mix both allocator, so we can't override
BoolParam(False, allow_override=False), BoolParam(False, allow_override=False),
in_c_key=False) in_c_key=False)
...@@ -385,7 +385,7 @@ def use(device, ...@@ -385,7 +385,7 @@ def use(device,
try: try:
if (device != 'gpu') and not pycuda_init_dev: if (device != 'gpu') and not pycuda_init_dev:
assert isinstance(device, int) assert isinstance(device, int)
gpu_init(device, config.lib.cumem) gpu_init(device, config.lib.cnmem)
use.device_number = device use.device_number = device
assert active_device_number() == device assert active_device_number() == device
else: else:
...@@ -398,7 +398,7 @@ def use(device, ...@@ -398,7 +398,7 @@ def use(device,
cuda_ndarray.cuda_ndarray.select_a_gpu() cuda_ndarray.cuda_ndarray.select_a_gpu()
use.device_number = active_device_number() use.device_number = active_device_number()
# This is needed to initialize the cublas handle. # This is needed to initialize the cublas handle.
gpu_init(use.device_number, config.lib.cumem) gpu_init(use.device_number, config.lib.cnmem)
if test_driver: if test_driver:
import theano.sandbox.cuda.tests.test_driver import theano.sandbox.cuda.tests.test_driver
...@@ -411,8 +411,9 @@ def use(device, ...@@ -411,8 +411,9 @@ def use(device,
" this property") " this property")
if config.print_active_device: if config.print_active_device:
print("Using gpu device %d: %s" % ( cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
active_device_number(), active_device_name()), file=sys.stderr) print("Using gpu device %d: %s (cnmem is %s)" % (
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr)
if device_properties(use.device_number)['regsPerBlock'] < 16384: if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too much register per bloc at many places # We will try to use too much register per bloc at many places
# when there is only 8k register per multi-processor. # when there is only 8k register per multi-processor.
......
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cnmem.h"
#include <cstddef>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
// Allocation granularity in bytes; block sizes are meant to be rounded up to
// a multiple of this. NOTE(review): not referenced anywhere in the visible
// portion of this file -- confirm it is used elsewhere or remove it.
#define CNMEM_GRANULARITY 512
///////////////////////////////////////////////////////////////////////////////////////////////////
///
/// \brief Translate a cnmemStatus_t into its symbolic name.
///
/// Any value outside the known set maps to "CNMEM_STATUS_UNKNOWN_ERROR".
/// The returned pointer refers to a string literal and must not be freed.
///
extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
    if( status == CNMEM_STATUS_SUCCESS ) {
        return "CNMEM_STATUS_SUCCESS";
    }
    if( status == CNMEM_STATUS_CUDA_ERROR ) {
        return "CNMEM_STATUS_CUDA_ERROR";
    }
    if( status == CNMEM_STATUS_INVALID_ARGUMENT ) {
        return "CNMEM_STATUS_INVALID_ARGUMENT";
    }
    if( status == CNMEM_STATUS_MEMORY_LEAK ) {
        return "CNMEM_STATUS_MEMORY_LEAK";
    }
    if( status == CNMEM_STATUS_NOT_INITIALIZED ) {
        return "CNMEM_STATUS_NOT_INITIALIZED";
    }
    if( status == CNMEM_STATUS_OUT_OF_MEMORY ) {
        return "CNMEM_STATUS_OUT_OF_MEMORY";
    }
    return "CNMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// --- Debug logging --------------------------------------------------------
// Flip the "#if 0" below to "#if 1" to enable error logging (with a stack
// backtrace on POSIX). By default CNMEM_DEBUG_ERROR compiles to nothing.
#if 0
#ifdef WIN32
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
} while(0)
#else
#include <execinfo.h>
// Print the current call stack to stderr (POSIX-only debug helper).
static inline void printBacktrace() {
void *stackBuffer[64];
int numAddresses = backtrace((void**) &stackBuffer, 64);
char **addresses = backtrace_symbols(stackBuffer, numAddresses);
for( int i = 0 ; i < numAddresses ; ++i ) {
fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
}
free(addresses);
}
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "Backtrace:\n"); \
printBacktrace(); \
} while(0)
#endif
#else
#define CNMEM_DEBUG_ERROR(...)
#endif
// Verbose tracing of cudaMalloc/cudaFree calls; disabled by default.
#if 0
#define CNMEM_DEBUG_INFO printf
#else
#define CNMEM_DEBUG_INFO(...)
#endif
#if 0 // Enable/disable assertions
#include <cassert>
#define CNMEM_ASSERT assert
#else
#define CNMEM_ASSERT(...)
#endif
// --- Error-propagation helpers ---------------------------------------------
// File-wide convention: every fallible call is wrapped by one of the macros
// below so that a failure logs (in debug builds) and returns a cnmemStatus_t
// from the enclosing function.
// Return "error" from the enclosing function unless "cond" holds.
#define CNMEM_CHECK_TRUE(cond, error) do { \
if( !(cond) ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \
return error; \
} \
} while(0)
// Propagate a failing cnmemStatus_t from "call".
#define CNMEM_CHECK(call) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
return status; \
} \
} while(0)
// Same as CNMEM_CHECK, but releases "mutex" before returning on failure.
#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
(mutex).unlock(); \
return status; \
} \
} while(0)
// Map a CUDA error to a cnmemStatus_t: allocation failures become
// CNMEM_STATUS_OUT_OF_MEMORY, anything else CNMEM_STATUS_CUDA_ERROR.
#define CNMEM_CHECK_CUDA(call) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
// CUDA check that also releases "mutex" before returning on failure.
#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
#ifdef WIN32
// Run a Win32 call and translate a set GetLastError() into "error_code".
#define CNMEM_CHECK_WIN32(call, error_code) do { \
SetLastError(0); /* Clean the flag. */ \
call; \
DWORD status = GetLastError(); \
if( status ) \
return error_code; \
} while(0)
#else
// Translate a non-zero pthread return code into "error_code".
#define CNMEM_CHECK_PTHREAD(call, error_code) do { \
int status = call; \
if( status ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \
return error_code; \
} \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cnmem {
/// Round "m" up to the nearest multiple of "n" (n must be positive).
static inline std::size_t ceilInt(std::size_t m, std::size_t n) {
    CNMEM_ASSERT(n > 0);
    std::size_t quotient = (m + n - 1) / n;
    return quotient * n;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A thin platform abstraction over a mutual-exclusion lock: a
/// CRITICAL_SECTION on Windows, a pthread mutex elsewhere. All methods
/// report failure through a cnmemStatus_t instead of throwing. initialize()
/// must be called before lock()/unlock(), and finalize() when done.
class Mutex {
#ifdef WIN32
CRITICAL_SECTION mCriticalSection;
#else
pthread_mutex_t mMutex;
#endif
public:
/// Initialize the mutex.
cnmemStatus_t initialize();
/// Finalize the mutex.
cnmemStatus_t finalize();
/// Lock the mutex.
cnmemStatus_t lock() const;
/// Unlock the mutex.
cnmemStatus_t unlock() const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Create the underlying OS lock. Must run before any lock()/unlock(); the
/// Manager constructor calls it. The disabled "#if 0" branch would create a
/// recursive pthread mutex instead of the default non-recursive one.
cnmemStatus_t Mutex::initialize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
#if 0
pthread_mutexattr_t attr;
CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Destroy the OS lock. No lock()/unlock() calls may follow this.
cnmemStatus_t Mutex::finalize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Acquire the lock. lock() is logically const, so both branches must cast
/// away const on the member before handing it to the OS primitive.
cnmemStatus_t Mutex::lock() const {
#ifdef WIN32
    // Fix: the cast was missing here (unlike initialize()/finalize() and the
    // pthread branch); in a const member function "&mCriticalSection" is a
    // const pointer and EnterCriticalSection() would not compile on MSVC.
    CNMEM_CHECK_WIN32(EnterCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release the lock. See lock() for why the const-stripping casts are needed.
cnmemStatus_t Mutex::unlock() const {
#ifdef WIN32
    // Fix: missing const-stripping cast (same Windows compile error as lock()).
    CNMEM_CHECK_WIN32(LeaveCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
///
/// \brief A node in a singly linked list describing one device memory region.
///
/// Blocks live either in a manager's "used" list or in its "free" list. A
/// block flagged as "head" marks the start of an underlying raw allocation
/// (cudaMalloc or parent->allocate); free blocks are never merged across a
/// head boundary.
///
class Block {
    char *mData;        ///< Pointer to the memory region on the device.
    std::size_t mSize;  ///< Size of the region in bytes.
    Block *mNext;       ///< Successor in the linked list (NULL at the tail).
    bool mIsHead;       ///< True if this block starts a raw allocation.

public:
    /// Build a block describing the half-open range [data, data+size).
    Block(char *data, std::size_t size, Block *next, bool isHead)
        : mData(data), mSize(size), mNext(next), mIsHead(isHead) {}

    /// Read-only access to the device pointer.
    inline const char* getData() const { return mData; }
    /// Mutable access to the device pointer.
    inline char* getData() { return mData; }
    /// The size of the region in bytes.
    inline std::size_t getSize() const { return mSize; }
    /// The successor block (read-only).
    inline const Block* getNext() const { return mNext; }
    /// The successor block (mutable).
    inline Block* getNext() { return mNext; }
    /// Does this block start a raw allocation?
    inline bool isHead() const { return mIsHead; }

    /// Re-link this block to a new successor.
    inline void setNext(Block *next) { mNext = next; }
    /// Resize the region (used when splitting or merging blocks).
    inline void setSize(std::size_t size) { mSize = size; }
    /// Mark or unmark this block as the start of a raw allocation.
    inline void setHeadFlag(bool isHead) { mIsHead = isHead; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
///
/// \brief Pool allocator for GPU device memory.
///
/// Managers form a tree: one root per device (see getRootManager), with
/// children attached per CUDA stream. A child obtains memory from its
/// parent and may "steal" free memory from siblings through the parent.
/// Every manager owns its own mutex; members whose names end in "Unsafe"
/// leave locking to the caller.
///
class Manager {
/// The parent manager.
Manager *mParent;
/// The children managers.
std::vector<Manager*> mChildren;
/// The GPU device where the memory is allocated.
int mDevice;
/// The stream this manager is associated with. It could be NULL.
cudaStream_t mStream;
/// Is the stream blocking?
bool mIsStreamBlocking;
/// The list of used blocks.
Block *mUsedBlocks;
/// The list of free blocks.
Block *mFreeBlocks;
/// The managed memory size.
std::size_t mSize;
/// The flags.
unsigned mFlags;
/// To support multi-threading. Each manager has its own mutex.
Mutex mMutex;
public:
/// The root manager for a given device.
static inline Manager& getRootManager(int device) { return getRootManagers()[device]; }
/// The list of all the root managers.
static std::vector<Manager>& getRootManagers();
public:
/// Create an unitialized manager.
Manager();
/// Dtor.
~Manager();
/// Allocate a block of memory.
cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
/// Release a block of memory.
cnmemStatus_t release(void *ptr);
/// Release memory. It returns true if we have no memory leak.
cnmemStatus_t releaseAllUnsafe(bool &memoryLeak);
/// Reserve memory for a manager.
cnmemStatus_t reserve(std::size_t size);
/// Steal memory from another manager.
cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
/// Print the full memory state.
cnmemStatus_t printMemoryState(FILE *file) const;
/// The amount of used memory.
inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
return getMemoryUnsafe(usedMemory, mUsedBlocks);
}
/// The amount of free memory.
inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
return getMemoryUnsafe(freeMemory, mFreeBlocks);
}
/// Get a specific child based on the stream id.
cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
/// Get a specific child based on its index.
cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
/// Add a new child.
cnmemStatus_t addChild(Manager *manager);
/// The number of children.
cnmemStatus_t getNumChildren(std::size_t &numChildren) const;
/// The associated device.
inline int getDevice() const { return mDevice; }
/// The flags.
inline unsigned getFlags() const { return mFlags; }
/// Get the mutex.
inline const Mutex* getMutex() const { return &mMutex; }
/// The size allocated to that manager.
inline std::size_t getSize() const { return mSize; }
/// The CUDA stream.
inline cudaStream_t getStream() const { return mStream; }
/// Define the parent.
inline void setParent(Manager *parent) { mParent = parent; }
/// Define the device.
inline void setDevice(int device) { mDevice = device; }
/// Define the stream. Also records whether the stream synchronizes with the
/// legacy NULL stream (mIsStreamBlocking); allocateBlockUnsafe forwards that
/// flag when requesting memory from the parent.
inline cnmemStatus_t setStream(cudaStream_t stream) {
mStream = stream;
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
mIsStreamBlocking = false;
#else
unsigned flags = 0;
CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
#endif
return CNMEM_STATUS_SUCCESS;
}
/// Define the flags.
inline void setFlags(unsigned flags) { mFlags = flags; }
private:
/// The member functions below which are marked "Unsafe" are not thread-safe when called on a
/// same Manager object. Make sure they are called by a single thread in that case.
/// Allocate a new block and add it to the free list.
cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Release a block from the active list.
cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
/// Find the best free node based on the size.
cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Extract a node from the list of free blocks.
cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen);
/// Give a free block from that manager.
cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// Steal a block from another manager.
cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// The memory consumption of a list.
cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const;
/// Print an internal linked list.
cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Build an empty, unattached manager (no device, no parent, empty lists).
/// NOTE(review): the status returned by mMutex.initialize() is discarded, so
/// a failed mutex initialization goes unnoticed.
Manager::Manager()
: mParent(NULL)
, mChildren()
, mDevice(-1)
, mStream(NULL)
, mIsStreamBlocking(false)
, mUsedBlocks(NULL)
, mFreeBlocks(NULL)
, mSize(0)
, mFlags(CNMEM_FLAGS_DEFAULT)
, mMutex() {
mMutex.initialize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release everything the manager still holds. Cleanup is skipped entirely
/// when the manager was never attached to a device (mDevice == -1) or when
/// the device cannot be made current; any detected leak is ignored here.
Manager::~Manager() {
if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it.
return;
}
bool memoryLeak;
releaseAllUnsafe(memoryLeak);
mMutex.finalize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Register a child manager (thread-safe). The child is owned by this
/// manager from now on: releaseAllUnsafe() deletes all registered children.
cnmemStatus_t Manager::addChild(Manager *manager) {
CNMEM_CHECK(mMutex.lock());
mChildren.push_back(manager);
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Allocate "size" bytes and return the pointer in "ptr" (thread-safe).
/// Strategy: best-fit from the free list; if nothing fits and growth is
/// allowed (no CNMEM_FLAGS_CANNOT_GROW), obtain a new block from the parent
/// or cudaMalloc. On failure "ptr" is NULL and CNMEM_STATUS_OUT_OF_MEMORY is
/// returned. When "isBlocking" is false the stream is synchronized first so
/// the returned buffer cannot still be in flight.
cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) {
CNMEM_CHECK(mMutex.lock());
// If the client is not blocking, we have to explicitly synchronize before giving one buffer.
if( !isBlocking ) {
CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex);
}
// Find the best fit.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex);
// If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) {
CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex);
}
// Make sure we do have a block or quit.
if( !best ) {
ptr = NULL;
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Split the free block if needed.
CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex);
// Push the node to the list of used nodes.
best->setNext(mUsedBlocks);
mUsedBlocks = best;
// Return the new pointer into memory.
ptr = mUsedBlocks->getData();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Grow the pool: obtain a fresh "head" block of "size" bytes from the
/// parent manager (or straight from cudaMalloc at the root) and insert it
/// into the free list, which is kept sorted by device address. On success
/// "curr" points at the new block and "prev" at its predecessor (NULL when
/// the block becomes the new list head).
cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) {
// Reset the outputs.
curr = prev = NULL;
// Try to allocate data from the parent or the device.
void *data = NULL;
if( mParent ) {
CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking));
}
else {
CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size);
CNMEM_CHECK_CUDA(cudaMalloc(&data, size));
CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data);
}
// If it failed, there's an unexpected issue.
CNMEM_ASSERT(data);
// We have data, we now need to add it to the list of free nodes. We keep the list sorted.
Block *next = mFreeBlocks;
for( ; next && next->getData() < data ; next = next->getNext() ) {
prev = next;
}
// NOTE(review): with a throwing operator new this NULL check is dead code;
// kept as-is since the file checks "new" results throughout.
curr = new Block((char*) data, size, next, true);
if( !curr ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Remove "curr" (whose free-list predecessor is "prev") from the free list,
/// keeping exactly "size" bytes in it. When the block is larger, the tail
/// remainder stays in the free list as a new block whose head flag is set to
/// "stolen" (so a stolen remainder is never merged back across the split).
cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) {
// We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
Block *next;
if( curr->getSize() == size ) {
next = curr->getNext();
}
else {
std::size_t remaining = curr->getSize()-size;
Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen);
if( !newBlock ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
next = newBlock;
curr->setSize(size);
}
// Redo the "branching" in the nodes.
if( prev ) {
prev->setNext(next);
}
else {
mFreeBlocks = next;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Best-fit search: walk the free list and pick the smallest block whose
/// size is at least "size". On return "best" is the chosen block and "prev"
/// its predecessor in the list; both are NULL when nothing is big enough.
/// Always reports success -- "no block found" is not an error here.
cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) {
    best = NULL;
    prev = NULL;
    Block *before = NULL;
    for( Block *candidate = mFreeBlocks ; candidate ; candidate = candidate->getNext() ) {
        bool bigEnough = candidate->getSize() >= size;
        bool tighterFit = !best || candidate->getSize() < best->getSize();
        if( bigEnough && tighterFit ) {
            best = candidate;
            prev = before;
        }
        before = candidate;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Find the child manager associated with "stream" (thread-safe). Returns
/// CNMEM_STATUS_INVALID_ARGUMENT -- leaving "manager" untouched -- when no
/// child uses that stream.
cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const {
CNMEM_CHECK(mMutex.lock());
std::size_t i = 0, numChildren = mChildren.size();
for( ; i < numChildren ; ++i ) {
if( mChildren[i]->mStream == stream ) {
manager = mChildren[i];
break;
}
}
CNMEM_CHECK(mMutex.unlock());
return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Fetch the i-th child (thread-safe). "manager" is written only when the
/// index is in range; otherwise CNMEM_STATUS_INVALID_ARGUMENT is returned.
cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const {
    CNMEM_CHECK(mMutex.lock());
    const bool inRange = i < mChildren.size();
    if( inRange ) {
        manager = mChildren[i];
    }
    CNMEM_CHECK(mMutex.unlock());
    return inRange ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Sum the sizes of all blocks in the list starting at "head" into "size".
/// Not synchronized on its own -- callers hold the mutex (hence "Unsafe").
cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const {
    size = 0;
    // Iterate through const pointers: the original cast to Block* existed
    // only to reach the non-const getNext(); the const overload suffices.
    for( const Block *curr = head ; curr ; curr = curr->getNext() ) {
        size += curr->getSize();
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
// NOTE(review): disabled dead code. As written it would not even compile:
// CNMEM_CHECK_OR_UNLOCK is missing its mutex argument and "status" is never
// declared. Repair before re-enabling, or delete the block outright.
cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const {
CNMEM_CHECK(mMutex.lock());
CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head));
CNMEM_CHECK(mMutex.unlock());
return status;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Thread-safe count of the registered children.
cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const {
CNMEM_CHECK(mMutex.lock());
numChildren = mChildren.size();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// The per-process table of root managers, indexed by device id (see
/// getRootManager). Built lazily as a function-local static, so it is
/// destroyed at process exit, running ~Manager for every device.
std::vector<Manager>& Manager::getRootManagers() {
static std::vector<Manager> managers;
return managers;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Donate a free region of at least "size" bytes (the "steal" path). The
/// stream is synchronized first so the donated memory is no longer in
/// flight; a best-fit block is then removed from the free list and its
/// data/size returned. The Block node itself is deleted because the memory
/// is handed over to the caller's bookkeeping.
cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) {
// Make sure the block is not in use any more. It could be too coarse grain and we may change
// it in the future.
CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream));
// Init the returned values to 0.
blockData = NULL;
blockSize = 0;
// Find the best node to steal and reserve it.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK(findBestBlockUnsafe(best, prev, size));
if( !best ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true));
blockData = best->getData();
blockSize = best->getSize();
// Release the memory used by that block.
delete best;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dump one block list (used or free) to "file": a header with the list
/// name and total byte count, then one line per node.
cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const {
    // Total size of the list: reuse the existing accumulation helper instead
    // of duplicating its loop here.
    std::size_t size = 0;
    CNMEM_CHECK(getMemoryUnsafe(size, head));
    fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
    // Const iteration: the casts to Block* in the original were only needed
    // to call the non-const accessors.
    for( const Block *curr = head ; curr ; curr = curr->getNext() ) {
        fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
            (std::size_t) curr,
            (std::size_t) curr->getData(),
            (std::size_t) curr->getSize(),
            (std::size_t) curr->getNext(),
            (std::size_t) curr->isHead ());
    }
    fprintf(file, "|\n");
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dump this manager's state to "file" (thread-safe): device, stream,
/// used/free byte totals and both block lists, then recurse up to the
/// parent so the whole chain is printed.
cnmemStatus_t Manager::printMemoryState(FILE *file) const {
CNMEM_CHECK(mMutex.lock());
std::size_t streamCode = (std::size_t) mStream;
std::size_t usedMemory, freeMemory;
CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex);
CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex);
fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
mParent ? "child" : "root",
mDevice,
streamCode,
usedMemory,
freeMemory);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex);
fprintf(file, "\n");
CNMEM_CHECK(mMutex.unlock());
if( mParent ) {
CNMEM_CHECK(mParent->printMemoryState(file));
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Return the buffer at "ptr" to the free list (thread-safe). Releasing
/// NULL is a successful no-op; a pointer that does not start a used block
/// yields CNMEM_STATUS_INVALID_ARGUMENT.
cnmemStatus_t Manager::release(void *ptr) {
// Skip if ptr is NULL.
if( ptr == NULL ) {
return CNMEM_STATUS_SUCCESS;
}
// Lock to make sure only one thread execute that fragment of code.
CNMEM_CHECK(mMutex.lock());
// Find the node in the list of used blocks.
Block *curr = mUsedBlocks, *prev = NULL;
for( ; curr && curr->getData() != ptr ; curr = curr->getNext() ) {
prev = curr;
}
// Make sure we have found a node.
if( curr == NULL ) {
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// We have the node so release it.
cnmemStatus_t result = releaseBlockUnsafe(curr, prev);
CNMEM_CHECK(mMutex.unlock());
return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Tear everything down: recursively release and delete all children, then
/// free every block this manager owns. "memoryLeaks" is set to true when
/// any block was still in use, i.e. the client never released it.
cnmemStatus_t Manager::releaseAllUnsafe(bool &memoryLeaks) {
// Destroy the children if any.
bool ok = true;
for( std::size_t i = 0; i < mChildren.size(); ++i ) {
Manager *child = mChildren[i];
bool tmp;
// NOTE(review): an early CNMEM_CHECK return here leaves later children
// undeleted -- acceptable in this panic path, but worth confirming.
CNMEM_CHECK(child->releaseAllUnsafe(tmp));
ok = ok && !tmp;
delete child;
}
mChildren.clear();
// We have some issues when integrating into some libraries. This has to fixed in the libs.
memoryLeaks = !ok || mUsedBlocks;
// Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
if( !mParent ) {
while( mUsedBlocks ) {
CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL));
}
}
// We should be having only free blocks that are head blocks. Release those blocks.
while( mFreeBlocks ) {
if( mParent ) {
CNMEM_CHECK(mParent->release(mFreeBlocks->getData()));
}
else if( mFreeBlocks->isHead() ) {
void *data = mFreeBlocks->getData();
CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data);
CNMEM_CHECK_CUDA(cudaFree(data));
CNMEM_DEBUG_INFO(">> success\n");
}
Block *block = mFreeBlocks;
mFreeBlocks = mFreeBlocks->getNext();
delete block;
}
// We shouldn't have any used block left. Or, it means the user is causing memory leaks!
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Move "curr" (whose used-list predecessor is "prev") from the used list
/// back into the free list, which stays sorted by device address. The block
/// is coalesced with adjacent free neighbors where possible, but never
/// across a "head" boundary (the start of a distinct raw allocation).
cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) {
// The current node cannot be NULL!
CNMEM_ASSERT(curr != NULL);
// Change the connection of the node.
if( prev ) {
prev->setNext(curr->getNext());
}
else {
mUsedBlocks = curr->getNext();
}
// Find the location where this block should be added to the free list.
prev = NULL;
Block *iter = mFreeBlocks;
for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) {
prev = iter;
}
// Keep track of the successor of pred. We may lose track of it in the following "else".
Block *next = prev ? prev->getNext() : mFreeBlocks;
// We first check if we can merge the block with its predecessor in the list and curr can be merged.
if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) {
prev->setSize(prev->getSize() + curr->getSize());
delete curr;
curr = prev;
}
else if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
// Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) {
curr->setSize(curr->getSize() + next->getSize());
curr->setNext(next->getNext());
delete next;
}
else {
curr->setNext(next);
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Pre-allocate "size" bytes for this manager (thread-safe): one new head
/// block is created and parked in the free list, and mSize records the
/// reserved amount.
cnmemStatus_t Manager::reserve(std::size_t size) {
    CNMEM_CHECK(mMutex.lock());
    Block *freshBlock = NULL;
    Block *predecessor = NULL;
    CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(freshBlock, predecessor, size), mMutex);
    mSize = size;
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Last-resort allocation: obtain "size" bytes by stealing free memory from
/// the children (or, for a child, from the parent's other children). The
/// stolen region is recorded in this manager's used list as a new head
/// block. Fails with CNMEM_STATUS_INVALID_ARGUMENT when stealing is
/// disabled via CNMEM_FLAGS_CANNOT_STEAL.
cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) {
// If we cannot steal, don't even try.
if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) {
stolen = NULL;
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// The stolen block.
void *data = NULL; std::size_t dataSize = 0;
if( !mChildren.empty() ) {
CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size));
}
else if( mParent ) {
CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size));
}
// Make sure we do have a block of memory or quit.
if( !data ) {
stolen = NULL;
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Push the block in the used list.
mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true);
if( !mUsedBlocks ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Return the new pointer into memory.
stolen = data;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Take a free region of "size" bytes from one of the children: each child
/// is asked in turn to give up a block (giveBlockUnsafe), then this
/// manager's own "used" bookkeeping is updated by splitting the used node
/// that contains the stolen range into up to three nodes (before / stolen /
/// after). Fixes vs. original: the signature now spells std::size_t like
/// the in-class declaration (it used ::size_t), and the dead locals "prev"
/// and "result" (written but never read) are removed.
cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size) {
    // No block found and no room to grow. Try to steal from a children (if we have any).
    data = NULL;
    for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
        Manager *child = mChildren[i];
        if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
            break;
        }
    }
    // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
    if( !data ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // We have got a node from a children. We need to update our "used" list before we can do
    // anything with it.
    Block *curr = mUsedBlocks;
    for( ; curr ; curr = curr->getNext() ) {
        if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
            break;
        }
    }
    // Curr points to the node which contains that memory region.
    CNMEM_ASSERT(curr);
    // If it is exactly the same memory region, we are done!!!
    if( curr->getData() == data && curr->getSize() == dataSize ) {
        return CNMEM_STATUS_SUCCESS;
    }
    // Track the block after curr.
    Block *next = curr->getNext();
    // We may have up to 3 blocks: before the stolen range, the range itself, and after it.
    std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
    std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
    // If we have no space between curr->getData and block->getData.
    if( sizeBefore == 0 ) {
        curr->setSize(dataSize);
    }
    else {
        curr->setSize(sizeBefore);
        Block *block = new Block((char*) data, dataSize, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
        // NOTE(review): this advances the output parameters past the stolen
        // range; the caller (stealUnsafe) then records data/dataSize in its
        // used list -- verify this mutation is intended.
        data = (char*) data + dataSize;
        dataSize = sizeAfter;
    }
    // We have space at the end so we may need to add a new node.
    if( sizeAfter > 0 ) {
        Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cnmem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize the library: create one root manager per listed device (sized either explicitly
// or to half the device's global memory) and one child manager per named stream.
// Not thread-safe; must be called once before any other cnmem function.
// NOTE(review): on an early error return via CNMEM_CHECK/CNMEM_CHECK_CUDA inside the loop,
// the caller's current device is NOT restored and an already-allocated child may leak —
// acceptable here since init failure is fatal, but worth confirming.
cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
    // Make sure we have at least one device declared.
    CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
    // Find the largest ID of the device.
    int maxDevice = 0;
    for( int i = 0 ; i < numDevices ; ++i ) {
        if( devices[i].device > maxDevice ) {
            maxDevice = devices[i].device;
        }
    }
    // Allocate enough managers. The vector is indexed by device ID, so IDs may be sparse.
    CNMEM_CHECK_TRUE(maxDevice >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
    std::vector<cnmem::Manager> &managers = cnmem::Manager::getRootManagers();
    managers.resize(maxDevice+1);
    // Create a root manager for each device and create the children.
    int oldDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice));
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device));
        std::size_t size = devices[i].size;
        // size == 0 means "choose for me": reserve half of the device's global memory.
        if( size == 0 ) {
            cudaDeviceProp props;
            CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT);
        cnmem::Manager &manager = cnmem::Manager::getRootManager(devices[i].device);
        manager.setDevice(devices[i].device);
        manager.setFlags(flags);
        // Round the reservation up to the allocation granularity before reserving.
        size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
        CNMEM_CHECK(manager.reserve(size));
        // One child manager per named stream; children may always grow (they borrow from
        // the root), hence CNMEM_FLAGS_CANNOT_GROW is stripped from their flags.
        for( int j = 0 ; j < devices[i].numStreams ; ++j ) {
            cnmem::Manager *child = new cnmem::Manager;
            child->setParent(&manager);
            child->setDevice(devices[i].device);
            child->setStream(devices[i].streams[j]);
            child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW);
            // Pre-reserve per-stream memory only when the caller asked for it.
            if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) {
                CNMEM_CHECK(child->reserve(devices[i].streamSizes[j]));
            }
            CNMEM_CHECK(manager.addChild(child));
        }
    }
    // Restore the device that was current when we were called.
    CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Register a new named stream with the current device's root manager by creating a child
// manager for it. Thread-safety is delegated to Manager::addChild.
// Returns CNMEM_STATUS_NOT_INITIALIZED if cnmemInit was not called,
// CNMEM_STATUS_INVALID_ARGUMENT for a NULL stream, or any error from addChild.
cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    cnmem::Manager &root = cnmem::Manager::getRootManager(device);
    cnmem::Manager *child = new cnmem::Manager;
    child->setParent(&root);
    child->setDevice(device);
    child->setStream(stream);
    // Children may always grow (they borrow from the root), so strip CANNOT_GROW.
    child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW);
    // Fix: propagate addChild failures instead of silently discarding the status —
    // consistent with how cnmemInit registers its children.
    CNMEM_CHECK(root.addChild(child));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Tear the library down: release every root manager's memory (visiting each device in turn),
// drop all managers, and restore the caller's current device. Not thread-safe.
// Returns CNMEM_STATUS_MEMORY_LEAK if any manager still held outstanding allocations.
cnmemStatus_t cnmemFinalize() {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    // Remember the caller's device so we can put it back afterwards.
    int previousDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&previousDevice));
    std::vector<cnmem::Manager> &roots = cnmem::Manager::getRootManagers();
    bool leakDetected = false;
    for( std::size_t idx = 0 ; idx < roots.size() ; ++idx ) {
        // Each manager must be released with its own device active.
        CNMEM_CHECK_CUDA(cudaSetDevice(roots[idx].getDevice()));
        bool leakOnDevice;
        CNMEM_CHECK(roots[idx].releaseAllUnsafe(leakOnDevice));
        leakDetected = leakDetected || leakOnDevice;
    }
    roots.clear();
    CNMEM_CHECK_CUDA(cudaSetDevice(previousDevice));
    return leakDetected ? CNMEM_STATUS_MEMORY_LEAK : CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Allocate "size" bytes from the manager attached to "stream" (the root manager when stream
// is NULL). If the manager is out of memory, fall back to stealing a block from another
// manager, which requires locking the root and every child. Thread-safe.
cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    // A NULL ptr together with size 0 is treated as a no-op request.
    if( !ptr && !size )
        return CNMEM_STATUS_SUCCESS;
    CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT);
    CNMEM_CHECK_TRUE(size, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Resolve the manager serving this stream (root for the NULL stream).
    cnmem::Manager &root = cnmem::Manager::getRootManager(device);
    cnmem::Manager *manager = &root;
    if( stream ) {
        CNMEM_CHECK(root.getChildFromStream(manager, stream));
    }
    CNMEM_ASSERT(manager);
    // Requests are rounded up to the allocation granularity.
    size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
    cnmemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
        // Try to acquire locks on all the children.
        std::size_t numChildren;
        CNMEM_CHECK(root.getNumChildren(numChildren));
        std::vector<const cnmem::Mutex*> mutexes(numChildren);
        std::size_t numLocked = 0;
        for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
            cnmem::Manager *child;
            CNMEM_CHECK(root.getChild(child, i));
            mutexes[numLocked] = child->getMutex();
            if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
                break;
            }
        }
        // One lock failed, quit. Reduce the damage as much as possible, though:
        // release every lock we did acquire before bailing out.
        if( numLocked != numChildren ) {
            for( std::size_t i = 0 ; i < numLocked ; ++i ) {
                // Best-effort unlock; the status is deliberately ignored here since we are
                // already on an error path.
                cnmemStatus_t lockStatus = mutexes[i]->unlock();
            }
            return CNMEM_STATUS_UNKNOWN_ERROR;
        }
        // Grab the lock on the root, first.
        // NOTE(review): children are locked before the root here — confirm every other code
        // path acquires these mutexes in the same relative order, otherwise deadlock is possible.
        const cnmem::Mutex *rootMutex = root.getMutex();
        CNMEM_CHECK(rootMutex->lock());
        // We acquired all the lock so we try to steal a node from another child.
        if( numLocked == mutexes.size() ) {
            result = manager->stealUnsafe(ptr[0], size);
        }
        for( std::size_t i = 0 ; i < numLocked ; ++i ) {
            cnmemStatus_t lockStatus = mutexes[i]->unlock();
            if( lockStatus != CNMEM_STATUS_SUCCESS ) {
                // Starting from now we are panicking!!! One lock failed to be released, we try
                // we others. We could also give up because we are already screwed. I don't know
                // what's best! Comment are welcome.
                result = lockStatus;
            }
        }
        CNMEM_CHECK(rootMutex->unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Release a pointer previously returned by cnmemMalloc for the same stream. Freeing NULL is
// a no-op, mirroring free()/cudaFree semantics. Thread-safe.
cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    if( ptr == NULL ) {
        return CNMEM_STATUS_SUCCESS;
    }
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    // Route the request to the manager owning that stream (root for the NULL stream).
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *target = &rootManager;
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Report the free and total memory managed by the manager attached to "stream" (the root
// manager for the NULL stream). Total is derived as used + free, both read under the
// manager's lock. Linear in the number of blocks — avoid in hot paths.
cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT);
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    // Resolve the manager serving this stream.
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *target = &rootManager;
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    // Query both counters under the manager's lock so they are consistent with each other.
    const cnmem::Mutex *guard = target->getMutex();
    CNMEM_CHECK(guard->lock());
    CNMEM_CHECK_OR_UNLOCK(target->getFreeMemoryUnsafe(*freeMem), *guard);
    size_t usedSize;
    CNMEM_CHECK_OR_UNLOCK(target->getUsedMemoryUnsafe(usedSize), *guard);
    CNMEM_CHECK(guard->unlock());
    *totalMem = usedSize + *freeMem;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dump the block lists of the manager attached to "stream" (root for the NULL stream) to
// "file", for debugging. Thread-safety is delegated to Manager::printMemoryState.
cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(!cnmem::Manager::getRootManagers().empty(), CNMEM_STATUS_NOT_INITIALIZED);
    int currentDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&currentDevice));
    // Resolve the manager for the requested stream.
    cnmem::Manager &rootManager = cnmem::Manager::getRootManager(currentDevice);
    cnmem::Manager *target = &rootManager;
    if( stream ) {
        CNMEM_CHECK(rootManager.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->printMemoryState(file);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
...@@ -35,16 +35,16 @@ ...@@ -35,16 +35,16 @@
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32) #if defined(_MSC_VER) || defined(WIN32)
#ifdef CUMEM_DLLEXPORT #ifdef CNMEM_DLLEXPORT
#define CUMEM_API __declspec(dllexport) #define CNMEM_API __declspec(dllexport)
#else #else
#define CUMEM_API __declspec(dllimport) #define CNMEM_API __declspec(dllimport)
#endif #endif
#else #else
#define CUMEM_API #define CNMEM_API
#endif #endif
#define CUMEM_VERSION 100 // It corresponds to 1.0.0 #define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
...@@ -54,29 +54,27 @@ extern "C" { ...@@ -54,29 +54,27 @@ extern "C" {
typedef enum typedef enum
{ {
CUMEM_STATUS_SUCCESS = 0, CNMEM_STATUS_SUCCESS = 0,
CUMEM_STATUS_CUDA_ERROR, CNMEM_STATUS_CUDA_ERROR,
CUMEM_STATUS_INVALID_ARGUMENT, CNMEM_STATUS_INVALID_ARGUMENT,
CUMEM_STATUS_MEMORY_LEAK, CNMEM_STATUS_MEMORY_LEAK,
CUMEM_STATUS_NOT_INITIALIZED, CNMEM_STATUS_NOT_INITIALIZED,
CUMEM_STATUS_OUT_OF_MEMORY, CNMEM_STATUS_OUT_OF_MEMORY,
CUMEM_STATUS_UNKNOWN_ERROR CNMEM_STATUS_UNKNOWN_ERROR
} cumemStatus_t; } cnmemStatus_t;
/* ********************************************************************************************* */ /* ********************************************************************************************* */
typedef enum typedef enum
{ {
CUMEM_FLAGS_DEFAULT = 0, /// Default flags. CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CUMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CUMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory. CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
CUMEM_FLAGS_USE_UNIFIED_MEM = 4, /// Use Managed Memory for allocating memory. } cnmemManagerFlags_t;
CUMEM_FLAGS_MEM_ATTACH_HOST = 8 /// Host-only visible unified memory. (valid only for unified memory!)
} cumemManagerFlags_t;
/* ********************************************************************************************* */ /* ********************************************************************************************* */
typedef struct cumemDevice_t_ typedef struct cnmemDevice_t_
{ {
/** The device number. */ /** The device number. */
int device; int device;
...@@ -86,10 +84,10 @@ typedef struct cumemDevice_t_ ...@@ -86,10 +84,10 @@ typedef struct cumemDevice_t_
int numStreams; int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */ /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams; cudaStream_t *streams;
/** The memory allocation granularity (in multiples of 512 B). If 0, the implementation chooses the size */ /** The size reserved for each streams. It can be 0. */
size_t granularity; size_t *streamSizes;
} cumemDevice_t; } cnmemDevice_t;
/** /**
* \brief Initialize the library and allocate memory on the listed devices. * \brief Initialize the library and allocate memory on the listed devices.
...@@ -99,47 +97,58 @@ typedef struct cumemDevice_t_ ...@@ -99,47 +97,58 @@ typedef struct cumemDevice_t_
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root * memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream. * manager for the device and a list of children, one for each named stream.
* *
* This function must be before any other function in the library. It has to be called by a * This function must be called before any other function in the library. It has to be called
* single thread since it is not thread-safe. * by a single thread since it is not thread-safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CUMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory, * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function. * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/ */
cumemStatus_t CUMEM_API cumemInit(int numDevices, const cumemDevice_t *devices, unsigned flags); cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/** /**
* \brief Release all the allocated memory. * \brief Release all the allocated memory.
* *
* This function must be called by a single thread and after all threads that called * This function must be called by a single thread and after all threads that called
* cumemMalloc/cumemFree have joined. This function is not thread-safe. * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_MEMORY_LEAK, if there are unreleased blocks in the memory queues, * CNMEM_STATUS_MEMORY_LEAK, if there are unreleased blocks in the memory queues,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemFinalize(); cnmemStatus_t CNMEM_API cnmemFinalize();
/** /**
* \brief Allocate memory. * \brief Allocate memory.
* *
* This function allocates memory and initializes a pointer to device memory. If no memory * This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CUMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe. * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
* *
* The behavior of that function is the following: * The behavior of that function is the following:
* *
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of * memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow * free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CUMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls * its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not * cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless * allowed to grow, the manager attempts to steal memory from one of its children (unless
* CUMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns * CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CUMEM_STATUS_OUT_OF_MEMORY. * CNMEM_STATUS_OUT_OF_MEMORY.
* *
* - If the stream is a named stream, the initial request goes to the memory manager associated * - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned. * with that stream. If a free node is available in the lists of that manager, it is returned.
...@@ -148,17 +157,18 @@ cumemStatus_t CUMEM_API cumemFinalize(); ...@@ -148,17 +157,18 @@ cumemStatus_t CUMEM_API cumemFinalize();
* *
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to * mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and it cannot be executed by more * make sure no kernel uses a given buffer before stealing it) and the execution is
* than one thread at the time (per device). * sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CUMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available, * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream); cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/** /**
* \brief Release memory. * \brief Release memory.
...@@ -167,31 +177,31 @@ cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream ...@@ -167,31 +177,31 @@ cumemStatus_t CUMEM_API cumemMalloc(void **ptr, size_t size, cudaStream_t stream
* thread safe. * thread safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemFree(void *ptr, cudaStream_t stream); cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */ /* ********************************************************************************************* */
/* Utility functions. */ /* Utility functions. */
/* ********************************************************************************************* */ /* ********************************************************************************************* */
/** /**
* \brief Returns the amount of memory managed by the root memory manager on devices. * \brief Returns the amount of memory managed by the memory manager associated with a stream.
* *
* The pointers used_mem and free_mem must point to memory regions of numDevices*sizeof(size_t) * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
* bytes. At the moment, this function has a complexity linear in the number of allocated blocks * xity linear in the number of allocated blocks so do not call it in performance critical
* so do not call it in performance critical sections. * sections.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem); cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/** /**
* \brief Print a list of nodes to a file. * \brief Print a list of nodes to a file.
...@@ -200,18 +210,18 @@ cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem); ...@@ -200,18 +210,18 @@ cumemStatus_t CUMEM_API cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem);
* behaviour of the memory managers/application. It is thread safe. * behaviour of the memory managers/application. It is thread safe.
* *
* \return * \return
* CUMEM_STATUS_SUCCESS, if everything goes fine, * CNMEM_STATUS_SUCCESS, if everything goes fine,
* CUMEM_STATUS_NOT_INITIALIZED, if the ::cumemInit function has not been called, * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CUMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
* or free_mem == 0, * or free_mem == 0,
* CUMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/ */
cumemStatus_t CUMEM_API cumemPrintMemoryState(FILE *file); cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/** /**
* \brief Converts a cumemStatus_t value to a string. * \brief Converts a cnmemStatus_t value to a string.
*/ */
const char* cumemGetErrorString(cumemStatus_t status); const char* cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */ /* ********************************************************************************************* */
......
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
#include "cuda_ndarray.cuh" #include "cuda_ndarray.cuh"
#include "cumem.h" #include "cnmem.h"
#include "cumem.cpp" #include "cnmem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0 #define COMPUTE_GPU_MEM_USED 0
...@@ -71,20 +71,20 @@ void * device_malloc(size_t size) ...@@ -71,20 +71,20 @@ void * device_malloc(size_t size)
} }
///@TODO: thejaswi: link this option to a theano config variable? ///@TODO: thejaswi: link this option to a theano config variable?
static bool g_use_cumem = false; static bool g_use_cnmem = false;
static const int g_max_devices = 8; static const int g_max_devices = 8;
int initCumem(int card_number_provided, int card_nb) { int initCnmem(int card_number_provided, int card_nb) {
static bool cumemInitialized = false; static bool cnmemInitialized = false;
if(cumemInitialized) { if(cnmemInitialized) {
return 0; return 0;
} }
// On stderr to be at the same place as "Using gpu device..." // On stderr to be at the same place as "Using gpu device..."
fprintf(stderr, "Initializing cumem...\n"); fprintf(stderr, "Initializing cnmem...\n");
int numDevices = 0; int numDevices = 0;
cumemDevice_t devices[g_max_devices]; cnmemDevice_t devices[g_max_devices];
if(cudaGetDeviceCount(&numDevices) != cudaSuccess) { if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"initCumem: 'cudaGetDeviceCount' failed! Reason=%s\n", "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
return -1; return -1;
} }
...@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) { ...@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
devices[i].granularity = 0;
}else{ }else{
for(int i=0;i<numDevices;++i) { for(int i=0;i<numDevices;++i) {
...@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) { ...@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams ///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0; devices[i].numStreams = 0;
devices[i].streams = NULL; devices[i].streams = NULL;
devices[i].granularity = 0;
} }
} }
///@TODO: thejaswi: passing custom cumem flags? ///@TODO: thejaswi: passing custom cnmem flags?
cumemStatus_t status = cumemInit(numDevices, devices, CUMEM_FLAGS_DEFAULT); cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"initCumem: cumemInit call failed! Reason=%s. numdev=%d\n", "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
cumemGetErrorString(status), numDevices); cnmemGetErrorString(status), numDevices);
return -1; return -1;
} }
cumemInitialized = true; cnmemInitialized = true;
return 0; return 0;
} }
...@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose) ...@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose)
#endif #endif
void * rval=NULL; void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams? ///@TODO: thejaswi: support for multiple-streams?
if(g_use_cumem) { if(g_use_cnmem) {
cumemStatus_t status = cumemMalloc(&rval, size, NULL); cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
if(status != CUMEM_STATUS_SUCCESS) { if( size != 0 ) {
status = cnmemMalloc(&rval, size, NULL);
}
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, PyErr_Format(PyExc_MemoryError,
"Error allocating %zd bytes of device memory (%s).", "Error allocating %zd bytes of device memory (%s).",
size, cumemGetErrorString(status)); size, cnmemGetErrorString(status));
return NULL; return NULL;
} }
} }
...@@ -271,11 +272,11 @@ int device_free(void *ptr) ...@@ -271,11 +272,11 @@ int device_free(void *ptr)
} }
///@TODO: thejaswi: multi-stream support ///@TODO: thejaswi: multi-stream support
if(g_use_cumem) { if(g_use_cnmem) {
cumemStatus_t status = cumemFree(ptr, NULL); cnmemStatus_t status = cnmemFree(ptr, NULL);
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cumemFree call failed! Reason=%s\n", fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
cumemGetErrorString(status)); cnmemGetErrorString(status));
} }
} }
else { else {
...@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init(); static int cublas_init();
static void cublas_shutdown(); static void cublas_shutdown();
// Initialize the gpu. // Initialize the gpu.
// Takes two optional parameters, the device number and if we should use cumem. // Takes two optional parameters, the device number and if we should use cnmem.
// If the device number is provided, it sets that device to be the active device. // If the device number is provided, it sets that device to be the active device.
// If not provided (usually just to test whether the gpu is available at all), // If not provided (usually just to test whether the gpu is available at all),
// it does not set an active device. // it does not set an active device.
// Raises EnvironmentError or ValueError (as appropriate) if the initialization failed. // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
// cumem is threaded like a bool. If converted to 0, don't use cumem. Otherwise, use it. // cnmem is threaded like a bool. If converted to 0, don't use cnmem. Otherwise, use it.
PyObject * PyObject *
CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{ {
int card_nb = 0; int card_nb = 0;
int card_number_provided = 1; int card_number_provided = 1;
int cumem = 0; // 0 False, 1 True int cnmem = 0; // 0 False, 1 True
// if we're given something wildly invalid, this will throw a TypeError // if we're given something wildly invalid, this will throw a TypeError
PyArg_ParseTuple(args, "|ii", &card_nb, &cumem); PyArg_ParseTuple(args, "|ii", &card_nb, &cnmem);
if(cumem) if(cnmem)
g_use_cumem = true; g_use_cnmem = true;
if(PyTuple_Size(args) == 0) { if(PyTuple_Size(args) == 0) {
card_number_provided = 0; card_number_provided = 0;
...@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
if (cublas_init() == -1) if (cublas_init() == -1)
return NULL; return NULL;
} }
if(card_number_provided && g_use_cumem) { if(card_number_provided && g_use_cnmem) {
if(initCumem(card_number_provided, card_nb) == -1){ if(initCnmem(card_number_provided, card_nb) == -1){
return NULL; return NULL;
} }
} }
...@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { ...@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here // Don't handle errors here
cublas_shutdown(); cublas_shutdown();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
if(g_use_cumem) { if(g_use_cnmem) {
fprintf(stderr, "Shutting down cumem...\n"); fprintf(stderr, "Shutting down cnmem...\n");
cumemStatus_t status = cumemFinalize(); cnmemStatus_t status = cnmemFinalize();
if(status != CUMEM_STATUS_SUCCESS) { if(status != CNMEM_STATUS_SUCCESS && status != CNMEM_STATUS_MEMORY_LEAK) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cumemFinalize failed! Reason=%s\n", fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
cumemGetErrorString(status)); cnmemGetErrorString(status));
if(status == CUMEM_STATUS_CUDA_ERROR) { if(status == CNMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n", fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
} }
......
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cumem.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
// Default allocation granularity (bytes). All requests are rounded up to a
// multiple of this value; cumemInit() requires granularity % 512 == 0.
#define CUMEM_DEFAULT_GRANULARITY 512
// Uncomment to trace every cudaMalloc/cudaMallocManaged/cudaFree call.
//#define CUMEM_DEBUG
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cumem {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Propagate any non-success cumemStatus_t to the caller.
#define CUMEM_CHECK(call) do { \
    cumemStatus_t status = call; \
    if( status != CUMEM_STATUS_SUCCESS ) { \
        return status; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Same as CUMEM_CHECK but releases the given Lock before returning the error,
// so a failing call inside a critical section does not leave the mutex held.
#define CUMEM_CHECK_OR_UNLOCK_AND_RETURN(call, lock) do { \
    cumemStatus_t status = call; \
    if( status != CUMEM_STATUS_SUCCESS ) { \
        lock.unlock(); \
        return status; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Argument validation: return `error` when `cond` does not hold.
#define CHECK(cond, error) do { \
    if( !(cond) ) { \
        return error; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Translate CUDA runtime errors: allocation failures map to OUT_OF_MEMORY,
// anything else to CUMEM_STATUS_CUDA_ERROR (cudaGetLastError holds details).
#define CHECK_CUDA(call) do { \
    cudaError_t cuda_error = call; \
    if( cuda_error == cudaErrorMemoryAllocation ) { \
        return CUMEM_STATUS_OUT_OF_MEMORY; \
    } \
    else if( cuda_error != cudaSuccess ) { \
        return CUMEM_STATUS_CUDA_ERROR; \
    } \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef WIN32
// Win32 critical-section APIs return void; failures are detected via GetLastError.
#define CHECK_WIN32(call, error_code) do { \
    SetLastError(0); /* Clean the flag. */ \
    call; \
    DWORD status = GetLastError(); \
    if( status ) \
        return error_code; \
} while(0)
#else
// pthread calls report failure through a non-zero int return value.
#define CHECK_PTHREAD(call, error_code) do { \
    int status = call; \
    if( status ) { \
        return error_code; \
    } \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Thin portability wrapper over a Win32 CRITICAL_SECTION or a pthread mutex.
/// Not recursive. initialize() must be called before lock()/unlock() and
/// finalize() before destruction.
/// NOTE(review): no caller of Mutex::initialize()/finalize() is visible in this
/// file (Manager default-constructs its m_mutex member) — verify that the OS
/// primitives are usable without explicit initialization on the target platforms.
class Mutex
{
#ifdef WIN32
    CRITICAL_SECTION m_critical_section;
#else
    pthread_mutex_t m_mutex;
#endif
public:
    /// Initialize the mutex.
    cumemStatus_t initialize();
    /// Finalize the mutex.
    cumemStatus_t finalize();
    /// Lock the mutex.
    cumemStatus_t lock();
    /// Unlock the mutex.
    cumemStatus_t unlock();
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Create the underlying OS mutex. Must be called before lock()/unlock().
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::initialize()
{
#ifdef WIN32
    CHECK_WIN32(InitializeCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    // A plain (non-recursive) mutex; a recursive one could be created with
    // pthread_mutexattr_settype(PTHREAD_MUTEX_RECURSIVE) if ever needed.
    CHECK_PTHREAD(pthread_mutex_init(&m_mutex, NULL), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Destroy the underlying OS mutex. The mutex must not be held.
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::finalize()
{
#ifdef WIN32
    CHECK_WIN32(DeleteCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_destroy(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Acquire the mutex (blocking).
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::lock()
{
#ifdef WIN32
    CHECK_WIN32(EnterCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_lock(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Release the mutex. Must be called by the thread that holds it.
/// @return CUMEM_STATUS_SUCCESS, or CUMEM_STATUS_UNKNOWN_ERROR if the OS call fails.
cumemStatus_t Mutex::unlock()
{
#ifdef WIN32
    CHECK_WIN32(LeaveCriticalSection(&m_critical_section), CUMEM_STATUS_UNKNOWN_ERROR);
#else
    CHECK_PTHREAD(pthread_mutex_unlock(&m_mutex), CUMEM_STATUS_UNKNOWN_ERROR);
#endif
    // BUG FIX: the original fell off the end of this non-void function
    // (undefined behaviour); report success explicitly.
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper that pairs a Mutex with explicit lock()/unlock() calls.
/// NOTE: this is NOT an RAII scope guard — the destructor does not unlock, and
/// calling unlock() before lock() dereferences a NULL m_mutex. Callers in this
/// file always call lock() first and unlock() on every exit path.
class Lock
{
    /// The mutex.
    Mutex *m_mutex;
public:
    /// Ctor.
    Lock() : m_mutex(NULL) {}
    /// Lock the mutex.
    inline cumemStatus_t lock(Mutex *mutex) { m_mutex = mutex; return m_mutex->lock(); }
    /// Unlock the mutex.
    inline cumemStatus_t unlock() { return m_mutex->unlock(); }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A node in a singly-linked list describing one contiguous region of device
/// memory. Managers keep two such lists (used and free), sorted by data
/// address. A "head" block is one whose data pointer came directly from
/// cudaMalloc/cudaMallocManaged (or from the parent manager) — coalescing in
/// release_block() never merges across a head boundary, and only head blocks
/// are handed to cudaFree.
class Block
{
    /// The pointer to the memory region on the device.
    char *m_data;
    /// The size of the memory buffer.
    std::size_t m_size;
    /// The prev/next blocks in the linked list of blocks.
    Block *m_next;
    /// Is it a head node (i.e. a node obtained from parent->allocate or cudaMalloc).
    bool m_is_head;
public:
    /// Create a block.
    Block(char *data, std::size_t size, Block *next, bool is_head)
        : m_data(data)
        , m_size(size)
        , m_next(next)
        , m_is_head(is_head)
    {}
    /// The data.
    inline const char* get_data() const { return m_data; }
    /// The data (mutable).
    inline char* get_data() { return m_data; }
    /// The size of the block.
    inline std::size_t get_size() const { return m_size; }
    /// The next block in the linked list.
    inline const Block* get_next() const { return m_next; }
    /// The next block in the linked list (mutable).
    inline Block* get_next() { return m_next; }
    /// Is it a head block.
    inline bool is_head() const { return m_is_head; }
    /// Change the next block.
    inline void set_next(Block *next) { m_next = next; }
    /// Change the size of the block.
    inline void set_size(std::size_t size) { m_size = size; }
    /// Set the head flag.
    inline void set_head_flag(bool is_head) { m_is_head = is_head; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// A memory pool for one (device, stream) pair. Managers form a two-level
/// tree: one root per device (see get_root_managers) plus one child per CUDA
/// stream. Each manager owns a sorted free list and a used list of Block
/// nodes, guarded by its own mutex; children can steal memory from siblings
/// through their common root when they run out.
class Manager
{
    /// The parent manager.
    Manager *m_parent;
    /// The children managers.
    std::vector<Manager> m_children;
    /// The GPU device where the memory is allocated.
    int m_device;
    /// The stream this manager is associated with. It could be NULL.
    cudaStream_t m_stream;
    /// The list of used blocks.
    Block *m_used_blocks;
    /// The list of free blocks.
    Block *m_free_blocks;
    /// The managed memory size.
    std::size_t m_size;
    /// The memory allocation granularity
    std::size_t m_granularity;
    /// The flags.
    unsigned m_flags;
    /// To support multi-threading. Each manager has its own mutex.
    Mutex m_mutex;
public:
    /// Create an unitialized manager.
    Manager();
    /// Dtor.
    ~Manager();

    /// Allocate a block of memory.
    cumemStatus_t allocate(void *&ptr, std::size_t size);
    /// Release a block of memory.
    cumemStatus_t release(void *ptr);
    /// Release memory. It returns true if we have no memory leak.
    cumemStatus_t release_all(bool &memory_leak);
    /// Reserve memory for a manager.
    cumemStatus_t reserve(std::size_t size);
    /// Steal memory from another manager.
    cumemStatus_t steal(void *&ptr, std::size_t size);

    /// Print the list of free blocks.
    inline std::size_t print_free_blocks(FILE *file) const
    {
        return print_list(file, "free", m_free_blocks);
    }
    /// Print the list of used blocks.
    inline std::size_t print_used_blocks(FILE *file) const
    {
        return print_list(file, "used", m_used_blocks);
    }

    /// The root manager for a given device.
    static inline Manager& get_root_manager(int device) { return get_root_managers()[device]; }
    /// The list of all the root managers.
    static std::vector<Manager>& get_root_managers();

    /// The amount of used memory.
    inline std::size_t get_used_memory() const { return get_memory(m_used_blocks); }
    /// The amount of used memory.
    inline std::size_t get_free_memory() const { return get_memory(m_free_blocks); }
    /// The children.
    inline std::vector<Manager>& get_children() { return m_children; }
    /// The children.
    inline const std::vector<Manager>& get_children() const { return m_children; }
    /// Get a specific child based on the stream id.
    cumemStatus_t get_child(Manager *&manager, cudaStream_t stream);
    /// The associated device.
    inline int get_device() const { return m_device; }
    /// The flags.
    inline unsigned get_flags() const { return m_flags; }
    /// Get the mutex.
    inline Mutex* get_mutex() { return &m_mutex; }
    /// The size allocated to that manager.
    inline std::size_t get_size() const { return m_size; }
    /// The CUDA stream.
    inline cudaStream_t get_stream() const { return m_stream; }
    /// The allocation granularity.
    inline size_t get_granularity() const { return m_granularity; }

    /// Define the parent.
    inline void set_parent(Manager *parent) { m_parent = parent; }
    /// Define the device.
    inline void set_device(int device) { m_device = device; }
    /// Define the stream.
    inline void set_stream(cudaStream_t stream) { m_stream = stream; }
    /// Define the flags.
    inline void set_flags(unsigned flags) { m_flags = flags; }
    /// Define the granularity
    inline void set_granularity(unsigned granularity) { m_granularity = granularity; }

private:
    /// Allocate a new block and add it to the free list.
    cumemStatus_t allocate_block(Block *&curr, Block *&prev, std::size_t size);
    /// Release a block from the active list.
    cumemStatus_t release_block(Block *curr, Block *prev);
    /// Find the best free node based on the size.
    cumemStatus_t find_best_block(Block *&curr, Block *&prev, std::size_t size);
    /// Extract a node from the list of free blocks.
    cumemStatus_t extract_block(Block *curr, Block *prev, std::size_t size, bool stolen);
    /// Give a free block from that manager.
    cumemStatus_t give_block(void *&data, std::size_t &data_size, std::size_t size);
    /// Steal a block from another manager.
    cumemStatus_t steal_block(void *&data, std::size_t &data_size, std::size_t size);
    /// The memory consumption of a list.
    std::size_t get_memory(const Block *head) const;
    /// Print an internal linked list.
    std::size_t print_list(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Build an empty, unbound manager: no parent, no blocks, device -1.
/// cumemInit() later assigns the device, stream, flags and granularity.
/// NOTE(review): m_mutex is default-constructed; Mutex::initialize() is not
/// called here (or anywhere visible in this file) — confirm this is intended.
Manager::Manager()
    : m_parent(NULL)
    , m_children()
    , m_device(-1)
    , m_stream(NULL)
    , m_used_blocks(NULL)
    , m_free_blocks(NULL)
    , m_size(0)
    , m_granularity(CUMEM_DEFAULT_GRANULARITY)
    , m_flags(CUMEM_FLAGS_DEFAULT)
    , m_mutex()
{}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Best-effort cleanup: release every block this manager still owns. The
/// status and the leak flag from release_all are deliberately ignored because
/// a destructor has no way to report failure.
Manager::~Manager()
{
    bool memory_leak;
    release_all(memory_leak);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Allocate \p size bytes from this manager's pool and store the pointer in
/// \p ptr. Best-fit search of the free list; if nothing fits and the
/// CUMEM_FLAGS_CANNOT_GROW flag is not set, a fresh head block is obtained
/// from the parent (or directly from CUDA). On failure ptr is set to NULL and
/// CUMEM_STATUS_OUT_OF_MEMORY is returned. Thread-safe via m_mutex.
cumemStatus_t Manager::allocate(void *&ptr, std::size_t size)
{
    // Lock to make sure only one thread execute that fragment of code.
    Lock lock;
    CUMEM_CHECK(lock.lock(&m_mutex));

    // Find the best fit.
    Block *best = NULL, *prev = NULL;
    CUMEM_CHECK_OR_UNLOCK_AND_RETURN(find_best_block(best, prev, size), lock);

    // If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
    if( best == NULL && !(m_flags & CUMEM_FLAGS_CANNOT_GROW) )
    {
        CUMEM_CHECK_OR_UNLOCK_AND_RETURN(allocate_block(best, prev, size), lock);
    }

    // Make sure we do have a block or quit.
    if( !best )
    {
        CUMEM_CHECK(lock.unlock());
        ptr = NULL;
        return CUMEM_STATUS_OUT_OF_MEMORY;
    }

    // Split the free block if needed.
    CUMEM_CHECK_OR_UNLOCK_AND_RETURN(extract_block(best, prev, size, false), lock);

    // Push the node to the list of used nodes.
    best->set_next(m_used_blocks);
    m_used_blocks = best;

    // Return the new pointer into memory.
    CUMEM_CHECK(lock.unlock());
    ptr = m_used_blocks->get_data();
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Grow the pool: obtain a brand new region of \p size bytes (from the parent
/// manager if there is one, otherwise from cudaMalloc/cudaMallocManaged) and
/// insert it into the free list as a head block, keeping the list sorted by
/// address. \p curr receives the new block and \p prev its predecessor.
cumemStatus_t Manager::allocate_block(Block *&curr, Block *&prev, std::size_t size)
{
    // Reset the outputs.
    curr = prev = NULL;

    // Try to allocate data from the parent or the device.
    void *data = NULL;
    if( m_parent )
        CUMEM_CHECK(m_parent->allocate(data, size));
    else
    {
        if( m_flags & CUMEM_FLAGS_USE_UNIFIED_MEM )
        {
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaMallocManaged of size " << size << "B" << std::endl;
#endif
            // Managed (unified) memory; attachment policy is controlled by flags.
            if( m_flags & CUMEM_FLAGS_MEM_ATTACH_HOST )
            {
                CHECK_CUDA(cudaMallocManaged(&data, size, cudaMemAttachHost));
            }
            else
            {
                CHECK_CUDA(cudaMallocManaged(&data, size, cudaMemAttachGlobal));
            }
#ifdef CUMEM_DEBUG
            std::cout << "cudaMallocManaged of size " << size << "B address=" << (void*)data << std::endl;
#endif
        }
        else
        {
            // Plain device memory on this manager's device.
            CHECK_CUDA(cudaSetDevice(m_device));
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaMalloc of size " << size << "B" << std::endl;
#endif
            CHECK_CUDA(cudaMalloc(&data, size));
#ifdef CUMEM_DEBUG
            std::cout << "cudaMalloc of size " << size << "B address=" << (void*)data << std::endl;
#endif
        }
    }

    // If it failed, there's an unexpected issue. CHECK_CUDA/CUMEM_CHECK above
    // already returned on failure, so data must be non-NULL here.
    assert(data);

    // We have data, we now need to add it to the list of free nodes. We keep the list sorted.
    Block *next = m_free_blocks;
    for( ; next && next->get_data() < data ; next = next->get_next() )
        prev = next;
    curr = new Block((char*) data, size, next, true);
    if( prev )
        prev->set_next(curr);
    else
        m_free_blocks = curr;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Unlink free block \p curr (whose predecessor is \p prev, NULL if it is the
/// list head) and trim it to exactly \p size bytes. If curr is larger, the
/// tail remainder is re-inserted into the free list at curr's position as a
/// new block whose head flag is set to \p stolen.
cumemStatus_t Manager::extract_block(Block *curr, Block *prev, std::size_t size, bool stolen)
{
    // We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
    Block *next;
    if( curr->get_size() == size )
        next = curr->get_next();
    else
    {
        // Split: curr keeps the first `size` bytes, the remainder becomes a new
        // free block starting right after it.
        std::size_t remaining = curr->get_size()-size;
        Block *new_block = new Block(curr->get_data() + size, remaining, curr->get_next(), stolen);
        next = new_block;
        curr->set_size(size);
    }

    // Redo the "branching" in the nodes: bypass curr in the free list.
    if( prev )
        prev->set_next(next);
    else
        m_free_blocks = next;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Best-fit scan of the free list: select the smallest free block of at least
/// \p size bytes. \p best receives the chosen block and \p prev its
/// predecessor (so the caller can unlink it); both stay NULL when no block is
/// large enough. Always returns CUMEM_STATUS_SUCCESS.
cumemStatus_t Manager::find_best_block(Block *&best, Block *&prev, std::size_t size)
{
    best = NULL;
    prev = NULL;
    Block *candidate = m_free_blocks;
    Block *before_candidate = NULL;
    while( candidate )
    {
        bool large_enough = candidate->get_size() >= size;
        bool tighter_fit = !best || candidate->get_size() < best->get_size();
        if( large_enough && tighter_fit )
        {
            best = candidate;
            prev = before_candidate;
        }
        before_candidate = candidate;
        candidate = candidate->get_next();
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Find the child manager bound to the given CUDA stream and store it in
/// \p manager. Returns CUMEM_STATUS_INVALID_ARGUMENT (leaving \p manager
/// untouched) when no child is registered for that stream.
cumemStatus_t Manager::get_child(Manager *&manager, cudaStream_t stream)
{
    std::size_t idx = 0;
    while( idx < m_children.size() )
    {
        if( m_children[idx].m_stream == stream )
        {
            manager = &m_children[idx];
            return CUMEM_STATUS_SUCCESS;
        }
        ++idx;
    }
    return CUMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
std::size_t Manager::get_memory(const Block *head) const
{
std::size_t size = 0;
for( Block *curr = (Block*) head ; curr ; curr = curr->get_next() )
size += curr->get_size();
return size;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Global registry of root managers, one slot per CUDA device id (sized by
/// cumemInit, cleared by cumemFinalize). Function-local static so the vector
/// is created on first use. NOTE(review): access to the vector itself is not
/// synchronized — callers rely on init/finalize not racing with other calls.
std::vector<Manager>& Manager::get_root_managers()
{
    static std::vector<Manager> managers;
    return managers;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Donate a free region of at least \p size bytes to a sibling manager:
/// synchronize this manager's stream (so the memory is no longer in flight),
/// pick the best-fit free block, unlink it and hand its data pointer and size
/// to the caller. Returns CUMEM_STATUS_OUT_OF_MEMORY when nothing fits.
/// NOTE: caller must hold this manager's mutex (see cumemMalloc).
cumemStatus_t Manager::give_block(void *&block_data, std::size_t &block_size, std::size_t size)
{
    // Make sure the block is not in use any more. It could be too coarse grain and we may change
    // it in the future.
    CHECK_CUDA(cudaStreamSynchronize(m_stream));

    // Init the returned values to 0.
    block_data = NULL;
    block_size = 0;

    // Find the best node to steal and reserve it.
    Block *best = NULL, *prev = NULL;
    CUMEM_CHECK(find_best_block(best, prev, size));
    if( !best )
        return CUMEM_STATUS_OUT_OF_MEMORY;
    CUMEM_CHECK(extract_block(best, prev, size, true));
    block_data = best->get_data();
    block_size = best->get_size();

    // Release the memory used by that block (the node only — the device memory
    // now belongs to the receiving manager).
    delete best;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Debug helper: dump one linked list (labelled \p name) to \p file and
/// return the total number of bytes it covers.
/// NOTE(review): the "%lu"/"%016lx" specifiers assume size_t == unsigned long,
/// which does not hold on 64-bit Windows — verify if this path matters there.
std::size_t Manager::print_list(FILE *file, const char *name, const Block *head) const
{
    // First pass: total size for the list header line.
    std::size_t size = 0;
    for( Block *curr = (Block*) head; curr; curr = curr->get_next() )
        size += curr->get_size();
    fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
    // Second pass: one line per node.
    for( Block *curr = (Block*) head ; curr ; curr = curr->get_next() )
    {
        fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
            (std::size_t) curr,
            (std::size_t) curr->get_data(),
            (std::size_t) curr->get_size(),
            (std::size_t) curr->get_next(),
            (std::size_t) curr->is_head ());
    }
    fprintf(file, "|\n");
    return size;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Return the block starting at \p ptr to this manager's free list.
/// Releasing NULL is a no-op that succeeds (mirrors free(NULL)); a pointer
/// that is not the start of a used block yields CUMEM_STATUS_INVALID_ARGUMENT.
/// Thread-safe via m_mutex.
cumemStatus_t Manager::release(void *ptr)
{
    // Skip if ptr is NULL.
    if( ptr == NULL )
    {
        printf("release(NULL)\n");
        // BUG FIX: the original had a bare "CUMEM_STATUS_SUCCESS;" statement
        // with no "return", so a NULL release fell through the lookup below
        // and was misreported as CUMEM_STATUS_INVALID_ARGUMENT.
        return CUMEM_STATUS_SUCCESS;
    }

    // Lock to make sure only one thread execute that fragment of code.
    Lock lock;
    CUMEM_CHECK(lock.lock(&m_mutex));

    // Find the node in the list of used blocks.
    Block *curr = m_used_blocks, *prev = NULL;
    for( ; curr && curr->get_data() != ptr ; curr = curr->get_next() )
        prev = curr;

    // Make sure we have found a node.
    if( curr == NULL )
    {
        CUMEM_CHECK(lock.unlock());
        return CUMEM_STATUS_INVALID_ARGUMENT;
    }

    // We have the node so release it (unlink from the used list, coalesce into
    // the free list).
    cumemStatus_t result = release_block(curr, prev);
    CUMEM_CHECK(lock.unlock());
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Tear down this manager and all its children. \p memory_leaks is set to
/// true if any child reported a leak. Root managers also force-release any
/// still-used blocks (panic mode) and cudaFree their head blocks; child
/// managers hand their free blocks back to the parent.
cumemStatus_t Manager::release_all(bool &memory_leaks)
{
    // Destroy the children if any.
    bool ok = true;
    for( std::size_t i = 0; i < m_children.size(); ++i )
    {
        bool tmp;
        CUMEM_CHECK(m_children[i].release_all(tmp));
        ok = ok && !tmp;
    }

    // TODO: thejaswi: HACK! HACK! HACK!
    // We have some issues when integrating into some libraries. This has to fixed in the libs.
    // memory_leaks = !ok || m_used_blocks;
    // (i.e. a non-empty used list at this level is deliberately NOT counted as a leak.)
    memory_leaks = !ok;

    // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
    if( !m_parent )
        while( m_used_blocks )
            CUMEM_CHECK(release_block(m_used_blocks, NULL));

    // We should be having only free blocks that are head blocks. Release those blocks.
    while( m_free_blocks )
    {
        if( m_parent )
            CUMEM_CHECK(m_parent->release(m_free_blocks->get_data()));
        else if( m_free_blocks->is_head() )
        {
#ifdef CUMEM_DEBUG
            std::cout << "attempting cudaFree of size " << m_free_blocks->get_size()
                      << "B address=" << (void*)m_free_blocks->get_data() << std::endl;
#endif
            CHECK_CUDA(cudaFree(m_free_blocks->get_data()));
#ifdef CUMEM_DEBUG
            std::cout << "cudaFree of size " << m_free_blocks->get_size() << "B successful" << std::endl;
#endif
        }
        Block *block = m_free_blocks;
        m_free_blocks = m_free_blocks->get_next();
        delete block;
    }

    // We shouldn't have any used block left. Or, it means the user is causing memory leaks!
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Move \p curr (whose predecessor in the used list is \p prev, NULL if it is
/// the used-list head) from the used list into the address-sorted free list,
/// coalescing it with adjacent free blocks. Merging never crosses a head
/// boundary, so each cudaMalloc'd region can later be freed as one block.
cumemStatus_t Manager::release_block(Block *curr, Block *prev)
{
    // The current node cannot be NULL!
    assert(curr != NULL);

    // Change the connection of the node: unlink curr from the used list.
    if( prev )
        prev->set_next(curr->get_next());
    else
        m_used_blocks = curr->get_next();

    // Find the location where this block should be added to the free list.
    prev = NULL;
    Block *iter = m_free_blocks;
    for( ; iter && iter->get_data() < curr->get_data() ; iter = iter->get_next() )
        prev = iter;

    // Keep track of the successor of pred. We may lose track of it in the following "else".
    Block *next = prev ? prev->get_next() : m_free_blocks;

    // We first check if we can merge the block with its predecessor in the list and curr can be merged.
    if( prev && prev->get_data() + prev->get_size() == curr->get_data() && !curr->is_head() )
    {
        // prev absorbs curr.
        prev->set_size(prev->get_size() + curr->get_size());
        delete curr;
        curr = prev;
    }
    else if( prev )
        prev->set_next(curr);
    else
        m_free_blocks = curr;

    // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
    if( next && curr->get_data() + curr->get_size() == next->get_data() && !next->is_head() )
    {
        // curr absorbs next.
        curr->set_size(curr->get_size() + next->get_size());
        curr->set_next(next->get_next());
        delete next;
    }
    else
        curr->set_next(next);
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Pre-allocate \p size bytes for this manager (one head block placed in the
/// free list) and record the pool size in m_size.
cumemStatus_t Manager::reserve(std::size_t size)
{
    Block *curr, *prev;
    CUMEM_CHECK(allocate_block(curr, prev, size));
    m_size = size;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Acquire \p size bytes by taking memory from another manager in the tree
/// (a sibling via the root, or the parent's children). On success the stolen
/// region is recorded in this manager's used list and returned via \p stolen.
/// Fails immediately with CUMEM_STATUS_INVALID_ARGUMENT when stealing is
/// disabled by CUMEM_FLAGS_CANNOT_STEAL.
/// NOTE: caller must already hold the mutexes of every involved manager
/// (see the locking dance in cumemMalloc).
cumemStatus_t Manager::steal(void *&stolen, std::size_t size)
{
    // If we cannot steal, don't even try.
    if( m_flags & CUMEM_FLAGS_CANNOT_STEAL )
    {
        stolen = NULL;
        return CUMEM_STATUS_INVALID_ARGUMENT;
    }

    // The stolen block.
    void *data = NULL; std::size_t data_size = 0;
    if( !m_children.empty() )
        CUMEM_CHECK(steal_block(data, data_size, size));
    else if( m_parent )
        CUMEM_CHECK(m_parent->steal_block(data, data_size, size));

    // Make sure we do have a block of memory or quit.
    if( !data )
    {
        stolen = NULL;
        return CUMEM_STATUS_OUT_OF_MEMORY;
    }

    // Push the block in the used list.
    m_used_blocks = new Block((char*) data, data_size, m_used_blocks, true);

    // Return the new pointer into memory.
    stolen = data;
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Ask each child in turn for a free region of at least \p size bytes (via
/// give_block), then carve the corresponding range out of THIS manager's used
/// list: the used block that contains the donated range is split into up to
/// three pieces (before / donated / after) so the bookkeeping stays exact.
/// NOTE(review): the bare "::size_t" in the signature is presumably a typo
/// for std::size_t — the types are identical in practice, so it compiles.
cumemStatus_t Manager::steal_block(void *&data, std::size_t &data_size, ::size_t size)
{
    // No block found and no room to grow. Try to steal from a children (if we have any).
    data = NULL;
    for( std::size_t i = 0 ; !data && i < m_children.size() ; ++i )
        if( m_children[i].give_block(data, data_size, size) == CUMEM_STATUS_SUCCESS )
            break;

    // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
    if( !data )
        return CUMEM_STATUS_OUT_OF_MEMORY;

    // We have got a node from a children. We need to update our "used" list before we can do
    // anything with it.
    Block *curr = m_used_blocks, *prev = NULL;
    for( ; curr ; curr = curr->get_next() )
    {
        if( curr->get_data() <= data && data < curr->get_data()+curr->get_size() )
            break;
        prev = curr;
    }

    // Curr points to the node which contains that memory region.
    assert(curr);

    // If it is exactly the same memory region, we are done!!!
    if( curr->get_data() == data && curr->get_size() == data_size )
        return CUMEM_STATUS_SUCCESS;

    // Track the blocks before and after curr.
    Block *next = curr->get_next();

    // We may have up to 3 blocks.
    std::size_t size_before = (std::size_t) ((char*) data - curr->get_data());
    std::size_t size_after  = (curr->get_size() - size_before - data_size);

    // The resulting block.
    Block *result = curr;

    // If we have no space between curr->get_data and block->get_data.
    if( size_before == 0 )
        curr->set_size(data_size);
    else
    {
        // Shrink curr to the leading piece and insert a new block for the
        // donated range right after it.
        curr->set_size(size_before);
        Block *block = new Block((char*) data, data_size, next, false);
        curr->set_next(block);
        curr = block;
        data = (char*) data + data_size;
        data_size = size_after;
        result = block;
    }

    // We have space at the end so we may need to add a new node.
    if( size_after > 0 )
    {
        Block *block = new Block(curr->get_data() + curr->get_size(), size_after, next, false);
        curr->set_next(block);
        curr = block;
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Debug helper: recursively dump a manager's summary line plus its used and
/// free lists, then do the same for every child manager.
static void print_blocks(FILE *file, const Manager &manager)
{
    fprintf(file, "device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
        manager.get_device(),
        (std::size_t) manager.get_stream(),
        manager.get_used_memory(),
        manager.get_free_memory());
    manager.print_used_blocks(file);
    manager.print_free_blocks(file);
    fprintf(file, "\n");
    for( std::size_t i = 0 ; i < manager.get_children().size() ; ++i )
        print_blocks(file, manager.get_children()[i]);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cumem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: initialize the allocator for \p numDevices devices. For each
/// device: a root manager reserves `size` bytes (defaulting to half the
/// device's global memory, rounded up to the granularity, which must be a
/// positive multiple of 512), plus one child manager per declared stream,
/// each reserving size/(numStreams+1) bytes from the root.
/// NOTE(review): the managers' mutexes are never passed through
/// Mutex::initialize() anywhere in this file — verify locking is safe on all
/// supported platforms.
cumemStatus_t cumemInit(int numDevices, const cumemDevice_t *devices, unsigned flags)
{
    // Make sure we have at least one device declared.
    CHECK(numDevices > 0, CUMEM_STATUS_INVALID_ARGUMENT);

    // Find the largest ID of the device.
    int max_device = 0;
    for( int i = 0 ; i < numDevices ; ++i )
        if( devices[i].device > max_device )
            max_device = devices[i].device;

    // Allocate enough managers (slot index == device id, so gaps may exist).
    CHECK(max_device >= 0, CUMEM_STATUS_INVALID_ARGUMENT);
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    managers.resize(max_device+1);

    // Create a root manager for each device and create the children.
    for( int i = 0 ; i < numDevices ; ++i )
    {
        std::size_t size = devices[i].size;
        // size == 0 means "use half the device's global memory".
        if( size == 0 )
        {
            cudaDeviceProp props;
            CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CHECK(size > 0, CUMEM_STATUS_INVALID_ARGUMENT);
        std::size_t granularity = devices[i].granularity;
        if( granularity == 0 )
        {
            granularity = CUMEM_DEFAULT_GRANULARITY;
        }
        CHECK(granularity > 0, CUMEM_STATUS_INVALID_ARGUMENT);
        CHECK(((granularity % 512) == 0), CUMEM_STATUS_INVALID_ARGUMENT);
        cumem::Manager &manager = cumem::Manager::get_root_manager(devices[i].device);
        manager.set_device(devices[i].device);
        manager.set_flags(flags);
        manager.set_granularity(granularity);
        // Round the pool size up to the granularity before reserving it.
        size = ((size + granularity - 1) / granularity) * granularity;
        CUMEM_CHECK(manager.reserve(size));
        std::vector<cumem::Manager> &children = manager.get_children();
        children.resize(devices[i].numStreams);
        for( int j = 0 ; j < devices[i].numStreams ; ++j )
        {
            children[j].set_parent(&manager);
            children[j].set_device(devices[i].device);
            children[j].set_stream(devices[i].streams[j]);
            // Children are pinned to their reserved share: they may steal but
            // not grow on their own.
            children[j].set_flags(flags & ~CUMEM_FLAGS_CANNOT_GROW);
            CUMEM_CHECK(children[j].reserve(size / (devices[i].numStreams + 1)));
        }
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: release everything owned by every root manager and empty the
/// registry. Returns CUMEM_STATUS_NOT_INITIALIZED if cumemInit was never
/// called (or already finalized), CUMEM_STATUS_MEMORY_LEAK if any manager
/// reported leaked allocations, CUMEM_STATUS_SUCCESS otherwise.
cumemStatus_t cumemFinalize()
{
    std::vector<cumem::Manager> &managers = cumem::Manager::get_root_managers();
    if( managers.empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    bool memory_leaks = false;
    for( std::size_t i = 0; i < managers.size(); ++i )
    {
        bool manager_leaked;
        CUMEM_CHECK(managers[i].release_all(manager_leaked));
        if( manager_leaked )
            memory_leaks = true;
    }
    managers.clear();
    return memory_leaks ? CUMEM_STATUS_MEMORY_LEAK : CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: allocate \p size bytes (rounded up to the manager's
/// granularity) on the current device, from the root manager or from the
/// child bound to \p stream. If the owning manager is out of memory, attempt
/// to steal from the other managers on the device — this requires locking
/// every child and then the root, in that fixed order, before stealing.
cumemStatus_t cumemMalloc(void **ptr, std::size_t size, cudaStream_t stream)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    // (NULL, 0) is an accepted no-op; otherwise both arguments must be valid.
    if( !ptr && !size )
        return CUMEM_STATUS_SUCCESS;
    if( !ptr )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    if( !size )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    int device;
    CHECK_CUDA(cudaGetDevice(&device));
    cumem::Manager &root = cumem::Manager::get_root_manager(device);
    cumem::Manager *manager = &root;
    if( stream )
        CUMEM_CHECK(root.get_child(manager, stream));
    assert(manager);
    // Round the request up to the allocation granularity.
    size_t granularity = manager->get_granularity();
    size = ((size + granularity - 1) / granularity) * granularity;
    cumemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CUMEM_STATUS_OUT_OF_MEMORY )
    {
        // We need to acquire all the locks to all the managers to be able to steal memory. It's costly!
        typedef std::vector<cumem::Manager>::iterator Iterator;

        // Try to acquire locks on all the children.
        std::vector<cumem::Manager> &children = root.get_children();
        std::vector<cumem::Lock> locks(children.size() + 1);
        std::size_t num_locked = 0;
        for( Iterator it = children.begin() ; it != children.end() ; ++it, ++num_locked )
        {
            cumem::Mutex *mutex = it->get_mutex();
            if( locks[num_locked].lock(mutex) != CUMEM_STATUS_SUCCESS )
                break;
        }
        // We locked all the children, so we try to lock the root.
        if( num_locked == children.size() )
        {
            cumemStatus_t tmp_status = locks.back().lock(root.get_mutex());
            if( tmp_status == CUMEM_STATUS_SUCCESS )
                num_locked++;
        }
        // We acquired so we try to steal a node from another child.
        if( num_locked == locks.size() )
            result = manager->steal(ptr[0], size);
        // Release every mutex we managed to take, even on a partial failure.
        for( std::size_t i = 0 ; i < num_locked ; ++i )
            CUMEM_CHECK(locks[i].unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: return \p ptr to the manager that allocated it on the current
/// device (the root, or the child bound to \p stream). Freeing NULL succeeds
/// as a no-op, mirroring free().
cumemStatus_t cumemFree(void *ptr, cudaStream_t stream)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    if( !ptr )
        return CUMEM_STATUS_SUCCESS;
    int device;
    CHECK_CUDA(cudaGetDevice(&device));
    cumem::Manager &root = cumem::Manager::get_root_manager(device);
    cumem::Manager *owner = &root;
    if( stream )
    {
        CUMEM_CHECK(root.get_child(owner, stream));
    }
    assert(owner);
    return owner->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: fill the caller-provided arrays with the used/free byte counts
/// of each initialized root manager. Entries are written consecutively
/// (index j), skipping registry slots whose device id is -1 (never
/// initialized), so the arrays must be sized for the number of declared
/// devices. Each manager is locked while it is sampled.
cumemStatus_t cumemGetMemoryUsage(size_t *used_mem, size_t *free_mem)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    if( !used_mem || !free_mem )
        return CUMEM_STATUS_INVALID_ARGUMENT;
    for( std::size_t i = 0, j = 0 ; i < cumem::Manager::get_root_managers().size() ; ++i )
    {
        cumem::Manager &manager = cumem::Manager::get_root_managers()[i];
        // Skip the slots that were never bound to a device by cumemInit.
        if( manager.get_device() == -1 )
            continue;
        cumem::Lock lock;
        CUMEM_CHECK(lock.lock(manager.get_mutex()));
        used_mem[j] = manager.get_used_memory();
        free_mem[j] = manager.get_free_memory();
        j++;
        CUMEM_CHECK(lock.unlock());
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: dump the full block lists of every initialized root manager
/// (and, recursively, their children) to \p file, locking each root while it
/// is printed.
cumemStatus_t cumemPrintMemoryState(FILE *file)
{
    if( cumem::Manager::get_root_managers().empty() )
        return CUMEM_STATUS_NOT_INITIALIZED;
    for( std::size_t i = 0 ; i < cumem::Manager::get_root_managers().size() ; ++i )
    {
        cumem::Manager &manager = cumem::Manager::get_root_managers()[i];
        // Skip the slots that were never bound to a device by cumemInit.
        if( manager.get_device() == -1 )
            continue;
        cumem::Lock lock;
        CUMEM_CHECK(lock.lock(manager.get_mutex()));
        print_blocks(file, manager);
        CUMEM_CHECK(lock.unlock());
    }
    return CUMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Public API: map a status code to its symbolic name. The returned string is
/// a static literal and must not be freed. Unrecognized values map to
/// "CUMEM_STATUS_UNKNOWN_ERROR".
const char* cumemGetErrorString(cumemStatus_t status)
{
    if( status == CUMEM_STATUS_SUCCESS )
        return "CUMEM_STATUS_SUCCESS";
    if( status == CUMEM_STATUS_CUDA_ERROR )
        return "CUMEM_STATUS_CUDA_ERROR";
    if( status == CUMEM_STATUS_INVALID_ARGUMENT )
        return "CUMEM_STATUS_INVALID_ARGUMENT";
    if( status == CUMEM_STATUS_MEMORY_LEAK )
        return "CUMEM_STATUS_MEMORY_LEAK";
    if( status == CUMEM_STATUS_NOT_INITIALIZED )
        return "CUMEM_STATUS_NOT_INITIALIZED";
    if( status == CUMEM_STATUS_OUT_OF_MEMORY )
        return "CUMEM_STATUS_OUT_OF_MEMORY";
    return "CUMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论