Commit ca465be0 authored by abergeron

Merge pull request #3198 from nouiz/cumem3

Add CNMeM to Theano to speed up CUDA allocation.
...@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c
global-include *.cu
global-include *.cuh
global-include *.cpp
global-include *.h
global-include *.sh
global-include *.pkl
...
...@@ -11,7 +11,7 @@ Acknowledgements
* The developers of `NumPy <http://numpy.scipy.org/>`_. Theano is based on its ndarray object and uses much of its implementation.
* The developers of `SciPy <http://scipy.org/>`_. Our sparse matrix support uses their sparse matrix objects. We also reuse other parts.
* All `Theano contributors <https://github.com/Theano/Theano/graphs/contributors>`_.
* All Theano users that have given us feedback.
* The GPU implementation of tensordot is based on code from Tijmen
  Tieleman's `gnumpy <http://www.cs.toronto.edu/~tijmen/gnumpy.html>`_
...@@ -24,3 +24,4 @@ Acknowledgements
  P. L'Ecuyer and R. Touzin, `Fast Combined Multiple Recursive Generators with Multipliers of the form a = +/- 2^d +/- 2^e <http://www.informs-sim.org/wsc00papers/090.PDF>`_, Proceedings of the 2000 Winter Simulation Conference, Dec. 2000, 683--689.
  We were authorized by Pierre L'Ecuyer to copy/modify his Java implementation in the `SSJ <http://www.iro.umontreal.ca/~simardr/ssj/>`_ software and to relicense it under BSD 3-Clauses in Theano.
* A better GPU memory allocator, :attr:`CNMeM <config.lib.cnmem>`, is included in Theano. It has the same license.
...@@ -72,13 +72,18 @@ and use directly the optimized graph from the pickled file.
Faster Theano function
----------------------

You can set the Theano flag :attr:`allow_gc <config.allow_gc>` to ``False`` to
get a speed-up by using more memory. By default, Theano frees intermediate
results as soon as they are no longer needed, which prevents that memory from
being reused. Disabling the garbage collection keeps the memory of all
intermediate results, so it can be reused during the next call to the same
Theano function, provided the results have the same shapes. The shapes can
change if the shapes of the inputs change.

.. note::

    With :attr:`CNMeM <config.lib.cnmem>`, this flag is much less useful on
    the GPU.
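For example, the flag can be set before any function is compiled. This is a minimal sketch (it assumes Theano is installed; the flag can equivalently be set via ``THEANO_FLAGS=allow_gc=False`` or in ``~/.theanorc``):

```python
import theano

# Keep intermediate buffers alive between calls instead of freeing them,
# trading memory for speed.
theano.config.allow_gc = False
```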
.. _unsafe_optimization:

Unsafe optimization
...
...@@ -21,6 +21,9 @@ Montreal).
News
====

* We added support for :attr:`CNMeM <config.lib.cnmem>` to speed up
  GPU memory allocation.
* Theano 0.7 was released 26th March 2015. Everybody is encouraged to update.
* We support `cuDNN <http://deeplearning.net/software/theano/library/sandbox/cuda/dnn.html>`_ if it is installed by the user.
...
...@@ -370,6 +370,34 @@ import theano and print the config variable, as in:
    `amdlibm <http://developer.amd.com/cpu/libraries/libm/>`__
    library, which is faster than the standard libm.
.. attribute:: lib.cnmem

    Float value: >= 0

    Controls whether `CNMeM <https://github.com/NVIDIA/cnmem>`_, a faster
    CUDA memory allocator, is enabled. Only in the Theano development
    version until 0.7.1 is released. The library is bundled with Theano,
    so you do not need to install it yourself.

    The value gives the starting size (in MB, or as a fraction of total
    GPU memory) of the memory pool. If more memory is needed, CNMeM will
    try to allocate more, but this can increase memory fragmentation:

    * 0: not enabled.
    * 0 < N <= 1: fraction of the total GPU memory (clipped to .985 to
      leave room for driver memory).
    * N > 1: use that number of MB of memory.

    Default: 0 (this default may change later).

    .. note::

        CNMeM can cause memory fragmentation. So if you get a memory
        error while using it, try to allocate more memory at the start,
        or disable it. If you try this, please report your results on
        :ref:`theano-dev`.
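The interpretation of the flag value sketched above can be expressed as a small helper. This is an illustration only, not part of Theano's API; the ``0.985`` clip mirrors the driver-memory reservation described above:

```python
def cnmem_pool_start_bytes(cnmem, total_gpu_bytes):
    """Translate the lib.cnmem flag into a starting pool size in bytes.

    0          -> CNMeM disabled (returns 0)
    0 < v <= 1 -> fraction of total GPU memory, clipped to 0.985
    v > 1      -> size in megabytes
    """
    if cnmem < 0:
        raise ValueError("lib.cnmem must be >= 0")
    if cnmem == 0:
        return 0  # allocator disabled
    if cnmem <= 1:
        return int(min(cnmem, 0.985) * total_gpu_bytes)
    return int(cnmem * 1024 * 1024)  # value is in MB

# e.g. on a 4 GiB card, lib.cnmem=0.5 starts the pool at half the memory
pool = cnmem_pool_start_bytes(0.5, 4 * 1024**3)
```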
.. attribute:: linker

    String value: 'c|py', 'py', 'c', 'c|py_nogc'
...
...@@ -164,7 +164,7 @@ def do_setup(): ...@@ -164,7 +164,7 @@ def do_setup():
install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'], install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'],
package_data={ package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl', '': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'*.h', 'ChangeLog'], '*.h', '*.cpp', 'ChangeLog'],
'theano.misc': ['*.sh'] 'theano.misc': ['*.sh']
}, },
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'], scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
...@@ -13,7 +13,8 @@ from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano.configparser import (
    config, AddConfigVar, BoolParam, FloatParam, StrParam)
from . import nvcc_compiler

# ignore_newtrees is to speed the optimization as this is the pattern
...@@ -54,6 +55,21 @@ AddConfigVar('cublas.lib',
             """Name of the cuda blas library for the linker.""",
             StrParam('cublas'))
AddConfigVar('lib.cnmem',
             """Do we enable CNMeM or not (a faster CUDA memory allocator).

             The parameter represents the start size (in MB or % of
             total GPU memory) of the memory pool.

             0: not enabled.
             0 < N <= 1: % of the total GPU memory (clipped to .985 for driver memory)
             > 1: use that number of MB of memory.
             """,
             # We should not mix both allocators, so we can't override
             FloatParam(0, lambda i: i >= 0, allow_override=False),
             in_c_key=False)
# is_nvcc_available called here to initialize global vars in
# nvcc_compiler module
nvcc_compiler.is_nvcc_available()
...@@ -107,6 +123,8 @@ def try_import():
              'cuda_ndarray.cu',
              'cuda_ndarray.cuh',
              'conv_full_kernel.cu',
              'cnmem.h',
              'cnmem.cpp',
              'conv_kernel.cu')
stat_times = [os.stat(os.path.join(cuda_path, cuda_file))[stat.ST_MTIME]
              for cuda_file in cuda_files]
...@@ -178,7 +196,8 @@ if compile_cuda_ndarray and cuda_available:
            location=cuda_ndarray_loc,
            include_dirs=[cuda_path],
            libs=[config.cublas.lib],
            preargs=['-O3'] + compiler.compile_args(),
            )
        from cuda_ndarray.cuda_ndarray import *
    except Exception as e:
        _logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))
...@@ -377,7 +396,7 @@ def use(device,
    try:
        if (device != 'gpu') and not pycuda_init_dev:
            assert isinstance(device, int)
            gpu_init(device, config.lib.cnmem)
            use.device_number = device
            assert active_device_number() == device
        else:
...@@ -387,10 +406,10 @@ def use(device,
            # query the active GPU. If we check the active GPU before
            # the device is initialized we will always receive 0
            # even if another device is selected later.
            cuda_ndarray.cuda_ndarray.select_a_gpu()
            use.device_number = active_device_number()
            # This is needed to initialize the cublas handle.
            gpu_init(use.device_number, config.lib.cnmem)
        if test_driver:
            import theano.sandbox.cuda.tests.test_driver
...@@ -403,8 +422,9 @@ def use(device,
                " this property")
        if config.print_active_device:
            cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
            print("Using gpu device %d: %s (CNMeM is %s)" % (
                active_device_number(), active_device_name(), cnmem_enabled),
                file=sys.stderr)
        if device_properties(use.device_number)['regsPerBlock'] < 16384:
            # We will try to use too many registers per block at many places
            # when there are only 8k registers per multi-processor.
...
...@@ -137,13 +137,9 @@ class BatchedDotOp(GpuOp):
            host_z[i] = host_z[i - 1] + z_stride;
        }

        gpu_x = (float **) device_malloc(ptr_array_size);
        if (gpu_x == NULL){
            %(fail)s;
        }
...@@ -195,7 +191,7 @@ class BatchedDotOp(GpuOp):
        do \
        { \
            if (host_x) free (host_x); \
            if (gpu_x) device_free(gpu_x); \
        } while (0)
        """
...@@ -213,6 +209,9 @@ class BatchedDotOp(GpuOp):
        return rval

    def c_code_cache_version(self):
        return (1,)


batched_dot = BatchedDotOp()


class GpuDot22(GpuOp):
...
...@@ -208,22 +208,28 @@ static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
static int %(n)s_prep(int b, int i, int j, int outsize) {
  int s = b*i*j;
  if (%(n)s_list_len < s) {
    device_free(%(n)s_inp_list);
    device_free(%(n)s_out_list);
    device_free(%(n)s_W_list);
    %(n)s_inp_list = (const float **) device_malloc(s*sizeof(float *));
    if (%(n)s_inp_list == NULL) return -1;
    %(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
    if (%(n)s_out_list == NULL) return -1;
    %(n)s_W_list = (const float **) device_malloc(s*sizeof(float *));
    if (%(n)s_W_list == NULL) return -1;
    %(n)s_list_len = s;
  }
  if (%(n)s_iIdx_len < b*i) {
    device_free(%(n)s_iIdx);
    %(n)s_iIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
    if (%(n)s_iIdx == NULL) return -1;
    %(n)s_iIdx_len = b*i;
  }
  if (%(n)s_oIdx_len < b*j) {
    device_free(%(n)s_oIdx);
    %(n)s_oIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
    if (%(n)s_oIdx == NULL) return -1;
    %(n)s_oIdx_len = b*j;
  }
  return 0;
...@@ -326,7 +332,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
            W=W, fail=sub['fail'], name=nodename)

    def c_code_cache_version(self):
        return (12,)

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
...@@ -509,24 +515,27 @@ static size_t %(n)s_yIdx_len;
static int %(n)s_prep(int b, int i, int j) {
  int s = b*i*j;
  if (%(n)s_list_len < s) {
    device_free(%(n)s_x_list);
    device_free(%(n)s_y_list);
    device_free(%(n)s_out_list);
    %(n)s_x_list = (const float **) device_malloc(s*sizeof(float *));
    if (%(n)s_x_list == NULL) return -1;
    %(n)s_y_list = (const float **) device_malloc(s*sizeof(float *));
    if (%(n)s_y_list == NULL) return -1;
    %(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
    if (%(n)s_out_list == NULL) return -1;
    %(n)s_list_len = s;
  }
  if (%(n)s_xIdx_len < b*i) {
    device_free(%(n)s_xIdx);
    %(n)s_xIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
    if (%(n)s_xIdx == NULL) return -1;
    %(n)s_xIdx_len = b*i;
  }
  if (%(n)s_yIdx_len < b*j) {
    device_free(%(n)s_yIdx);
    %(n)s_yIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
    if (%(n)s_yIdx == NULL) return -1;
    %(n)s_yIdx_len = b*j;
  }
  return 0;
...@@ -626,7 +635,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
                alpha=alpha, fail=sub['fail'])

    def c_code_cache_version(self):
        return (11,)


sparse_block_outer_ss = SparseBlockOuterSS(False)
...
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cnmem.h"
#include <cstddef>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
#define CNMEM_GRANULARITY 512
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
switch(status) {
case CNMEM_STATUS_SUCCESS: return "CNMEM_STATUS_SUCCESS";
case CNMEM_STATUS_CUDA_ERROR: return "CNMEM_STATUS_CUDA_ERROR";
case CNMEM_STATUS_INVALID_ARGUMENT: return "CNMEM_STATUS_INVALID_ARGUMENT";
case CNMEM_STATUS_NOT_INITIALIZED: return "CNMEM_STATUS_NOT_INITIALIZED";
case CNMEM_STATUS_OUT_OF_MEMORY: return "CNMEM_STATUS_OUT_OF_MEMORY";
default: return "CNMEM_STATUS_UNKNOWN_ERROR";
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
#ifdef WIN32
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
} while(0)
#else
#include <execinfo.h>
static inline void printBacktrace() {
void *stackBuffer[64];
int numAddresses = backtrace((void**) &stackBuffer, 64);
char **addresses = backtrace_symbols(stackBuffer, numAddresses);
for( int i = 0 ; i < numAddresses ; ++i ) {
fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
}
free(addresses);
}
#define CNMEM_DEBUG_ERROR(...) do { \
fprintf(stderr, "Error at line: %d\n", __LINE__); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "Backtrace:\n"); \
printBacktrace(); \
} while(0)
#endif
#else
#define CNMEM_DEBUG_ERROR(...)
#endif
#if 0
#define CNMEM_DEBUG_INFO printf
#else
#define CNMEM_DEBUG_INFO(...)
#endif
#if 0 // Enable/disable assertions
#include <cassert>
#define CNMEM_ASSERT assert
#else
#define CNMEM_ASSERT(...)
#endif
#define CNMEM_CHECK_TRUE(cond, error) do { \
if( !(cond) ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \
return error; \
} \
} while(0)
#define CNMEM_CHECK(call) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
return status; \
} \
} while(0)
#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \
cnmemStatus_t status = (call); \
if( status != CNMEM_STATUS_SUCCESS ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \
cnmemGetErrorString(status)); \
(mutex).unlock(); \
return status; \
} \
} while(0)
#define CNMEM_CHECK_CUDA(call) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \
cudaError_t cudaError = (call); \
if( cudaError == cudaErrorMemoryAllocation ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_OUT_OF_MEMORY; \
} \
else if( cudaError != cudaSuccess ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
cudaGetErrorString(cudaError)); \
(mutex).unlock(); \
return CNMEM_STATUS_CUDA_ERROR; \
} \
} while(0)
#ifdef WIN32
#define CNMEM_CHECK_WIN32(call, error_code) do { \
SetLastError(0); /* Clean the flag. */ \
call; \
DWORD status = GetLastError(); \
if( status ) \
return error_code; \
} while(0)
#else
#define CNMEM_CHECK_PTHREAD(call, error_code) do { \
int status = call; \
if( status ) { \
CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \
return error_code; \
} \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cnmem {
static inline std::size_t ceilInt(std::size_t m, std::size_t n) {
CNMEM_ASSERT(n > 0);
return (m + n-1) / n * n;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
class Mutex {
#ifdef WIN32
CRITICAL_SECTION mCriticalSection;
#else
pthread_mutex_t mMutex;
#endif
public:
/// Initialize the mutex.
cnmemStatus_t initialize();
/// Finalize the mutex.
cnmemStatus_t finalize();
/// Lock the mutex.
cnmemStatus_t lock() const;
/// Unlock the mutex.
cnmemStatus_t unlock() const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::initialize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
#if 0
pthread_mutexattr_t attr;
CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR);
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::finalize() {
#ifdef WIN32
CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::lock() const {
#ifdef WIN32
CNMEM_CHECK_WIN32(EnterCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::unlock() const {
#ifdef WIN32
CNMEM_CHECK_WIN32(LeaveCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
class Block {
/// The pointer to the memory region on the device.
char *mData;
/// The size of the memory buffer.
std::size_t mSize;
/// The prev/next blocks in the linked list of blocks.
Block *mNext;
/// Is it a head node (i.e. a node obtained from parent->allocate or cudaMalloc).
bool mIsHead;
public:
/// Create a block.
Block(char *data, std::size_t size, Block *next, bool isHead)
: mData(data)
, mSize(size)
, mNext(next)
, mIsHead(isHead) {
}
/// The data.
inline const char* getData() const { return mData; }
/// The data (mutable).
inline char* getData() { return mData; }
/// The size of the block.
inline std::size_t getSize() const { return mSize; }
/// The next block in the linked list.
inline const Block* getNext() const { return mNext; }
/// The next block in the linked list (mutable).
inline Block* getNext() { return mNext; }
/// Is it a head block.
inline bool isHead() const { return mIsHead; }
/// Change the next block.
inline void setNext(Block *next) { mNext = next; }
/// Change the size of the block.
inline void setSize(std::size_t size) { mSize = size; }
/// Set the head flag.
inline void setHeadFlag(bool isHead) { mIsHead = isHead; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
class Manager {
/// The parent manager.
Manager *mParent;
/// The children managers.
std::vector<Manager*> mChildren;
/// The GPU device where the memory is allocated.
int mDevice;
/// The stream this manager is associated with. It could be NULL.
cudaStream_t mStream;
/// Is the stream blocking?
bool mIsStreamBlocking;
/// The list of used blocks.
Block *mUsedBlocks;
/// The list of free blocks.
Block *mFreeBlocks;
/// The managed memory size.
std::size_t mSize;
/// The flags.
unsigned mFlags;
/// To support multi-threading. Each manager has its own mutex.
Mutex mMutex;
public:
/// Create an unitialized manager.
Manager();
/// Dtor.
~Manager();
/// Allocate a block of memory.
cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
/// Release a block of memory.
cnmemStatus_t release(void *ptr);
/// Release memory. It returns true if we have no memory leak.
cnmemStatus_t releaseAllUnsafe();
/// Reserve memory for a manager.
cnmemStatus_t reserve(std::size_t size);
/// Steal memory from another manager.
cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
/// Print the full memory state.
cnmemStatus_t printMemoryState(FILE *file) const;
/// The amount of used memory.
inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
return getMemoryUnsafe(usedMemory, mUsedBlocks);
}
/// The amount of used memory.
inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
return getMemoryUnsafe(freeMemory, mFreeBlocks);
}
/// Get a specific child based on the stream id.
cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
/// Get a specific child based on the stream id.
cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
/// Add a new child.
cnmemStatus_t addChild(Manager *manager);
/// The number of children.
cnmemStatus_t getNumChildren(std::size_t &numChildren) const;
/// The associated device.
inline int getDevice() const { return mDevice; }
/// The flags.
inline unsigned getFlags() const { return mFlags; }
/// Get the mutex.
inline const Mutex* getMutex() const { return &mMutex; }
/// The size allocated to that manager.
inline std::size_t getSize() const { return mSize; }
/// The CUDA stream.
inline cudaStream_t getStream() const { return mStream; }
/// Define the parent.
inline void setParent(Manager *parent) { mParent = parent; }
/// Define the device.
inline void setDevice(int device) { mDevice = device; }
/// Define the stream.
inline cnmemStatus_t setStream(cudaStream_t stream) {
mStream = stream;
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
mIsStreamBlocking = false;
#else
unsigned flags = 0;
CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
#endif
return CNMEM_STATUS_SUCCESS;
}
/// Define the flags.
inline void setFlags(unsigned flags) { mFlags = flags; }
private:
/// The member functions below which are marked "Unsafe" are not thread-safe when called on a
/// same Manager object. Make sure they are called by a single thread in that case.
/// Allocate a new block and add it to the free list.
cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Release a block from the active list.
cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
/// Find the best free node based on the size.
cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
/// Extract a node from the list of free blocks.
cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen);
/// Give a free block from that manager.
cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// Steal a block from another manager.
cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
/// The memory consumption of a list.
cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const;
/// Print an internal linked list.
cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
Manager::Manager()
: mParent(NULL)
, mChildren()
, mDevice(-1)
, mStream(NULL)
, mIsStreamBlocking(false)
, mUsedBlocks(NULL)
, mFreeBlocks(NULL)
, mSize(0)
, mFlags(CNMEM_FLAGS_DEFAULT)
, mMutex() {
mMutex.initialize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
Manager::~Manager() {
if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it.
return;
}
releaseAllUnsafe();
mMutex.finalize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::addChild(Manager *manager) {
CNMEM_CHECK(mMutex.lock());
mChildren.push_back(manager);
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) {
CNMEM_CHECK(mMutex.lock());
// If the client is not blocking, we have to explicitly synchronize before giving one buffer.
if( !isBlocking ) {
CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex);
}
// Find the best fit.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex);
// If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) {
CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex);
}
// Make sure we do have a block or quit.
if( !best ) {
ptr = NULL;
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Split the free block if needed.
CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex);
// Push the node to the list of used nodes.
best->setNext(mUsedBlocks);
mUsedBlocks = best;
// Return the new pointer into memory.
ptr = mUsedBlocks->getData();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) {
// Reset the outputs.
curr = prev = NULL;
// Try to allocate data from the parent or the device.
void *data = NULL;
if( mParent ) {
CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking));
}
else {
CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size);
CNMEM_CHECK_CUDA(cudaMalloc(&data, size));
CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data);
}
// If it failed, there's an unexpected issue.
CNMEM_ASSERT(data);
// We have data, we now need to add it to the list of free nodes. We keep the list sorted.
Block *next = mFreeBlocks;
for( ; next && next->getData() < data ; next = next->getNext() ) {
prev = next;
}
curr = new Block((char*) data, size, next, true);
if( !curr ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) {
// We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
Block *next;
if( curr->getSize() == size ) {
next = curr->getNext();
}
else {
std::size_t remaining = curr->getSize()-size;
Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen);
if( !newBlock ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
next = newBlock;
curr->setSize(size);
}
// Redo the "branching" in the nodes.
if( prev ) {
prev->setNext(next);
}
else {
mFreeBlocks = next;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) {
best = NULL, prev = NULL;
for( Block *temp = mFreeBlocks, *tempPrev = NULL ; temp ; temp = temp->getNext() ) {
if( temp->getSize() >= size && (!best || temp->getSize() < best->getSize()) ) {
best = temp;
prev = tempPrev;
}
tempPrev = temp;
}
return CNMEM_STATUS_SUCCESS;
}
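The best-fit scan in findBestBlockUnsafe can be illustrated in isolation. The sketch below uses a minimal, hypothetical `FreeBlock` struct rather than cnmem's `Block` class; it mirrors the same loop: keep the smallest block that still satisfies the request, remembering its predecessor so the caller can later unlink it.

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical stand-in for cnmem's Block, reduced to what the scan needs.
struct FreeBlock {
    std::size_t size;
    FreeBlock  *next;
};

// Returns the best-fit block, or nullptr when no block is large enough.
// 'prev' receives the predecessor of the returned block (nullptr if the
// result is the list head), mirroring findBestBlockUnsafe's out-parameters.
FreeBlock* findBestFit(FreeBlock *head, std::size_t size, FreeBlock *&prev) {
    FreeBlock *best = nullptr;
    prev = nullptr;
    for( FreeBlock *temp = head, *tempPrev = nullptr ; temp ; temp = temp->next ) {
        // Smaller qualifying blocks win, so fragmentation of large blocks is avoided.
        if( temp->size >= size && (!best || temp->size < best->size) ) {
            best = temp;
            prev = tempPrev;
        }
        tempPrev = temp;
    }
    return best;
}
```

Note that the scan is O(n) in the length of the free list, which is why the allocator keeps the list merged as much as possible on release.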
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const {
CNMEM_CHECK(mMutex.lock());
std::size_t i = 0, numChildren = mChildren.size();
for( ; i < numChildren ; ++i ) {
if( mChildren[i]->mStream == stream ) {
manager = mChildren[i];
break;
}
}
CNMEM_CHECK(mMutex.unlock());
return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const {
CNMEM_CHECK(mMutex.lock());
if( i >= mChildren.size() ) {
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_INVALID_ARGUMENT;
}
manager = mChildren[i];
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const {
size = 0;
for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) {
size += curr->getSize();
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const {
CNMEM_CHECK(mMutex.lock());
CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head), mMutex);
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const {
CNMEM_CHECK(mMutex.lock());
numChildren = mChildren.size();
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) {
// Make sure the block is no longer in use. This synchronization may be too coarse-grained;
// we may refine it in the future.
CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream));
// Init the returned values to 0.
blockData = NULL;
blockSize = 0;
// Find the best node to steal and reserve it.
Block *best = NULL, *prev = NULL;
CNMEM_CHECK(findBestBlockUnsafe(best, prev, size));
if( !best ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true));
blockData = best->getData();
blockSize = best->getSize();
// Release the memory used by that block.
delete best;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const {
std::size_t size = 0;
for( Block *curr = (Block*) head; curr; curr = curr->getNext() ) {
size += curr->getSize();
}
fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) {
fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
(std::size_t) curr,
(std::size_t) curr->getData(),
(std::size_t) curr->getSize(),
(std::size_t) curr->getNext(),
(std::size_t) curr->isHead ());
}
fprintf(file, "|\n");
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::printMemoryState(FILE *file) const {
CNMEM_CHECK(mMutex.lock());
std::size_t streamCode = (std::size_t) mStream;
std::size_t usedMemory, freeMemory;
CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex);
CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex);
fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
mParent ? "child" : "root",
mDevice,
streamCode,
usedMemory,
freeMemory);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex);
CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex);
fprintf(file, "\n");
CNMEM_CHECK(mMutex.unlock());
if( mParent ) {
CNMEM_CHECK(mParent->printMemoryState(file));
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::release(void *ptr) {
// Skip if ptr is NULL.
if( ptr == NULL ) {
return CNMEM_STATUS_SUCCESS;
}
// Lock to make sure only one thread executes this fragment of code.
CNMEM_CHECK(mMutex.lock());
// Find the node in the list of used blocks.
Block *curr = mUsedBlocks, *prev = NULL;
for( ; curr && curr->getData() != ptr ; curr = curr->getNext() ) {
prev = curr;
}
// Make sure we have found a node.
if( curr == NULL ) {
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// We have the node so release it.
cnmemStatus_t result = releaseBlockUnsafe(curr, prev);
CNMEM_CHECK(mMutex.unlock());
return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::releaseAllUnsafe() {
// Destroy the children if any.
for( std::size_t i = 0; i < mChildren.size(); ++i ) {
Manager *child = mChildren[i];
CNMEM_CHECK(child->releaseAllUnsafe());
delete child;
}
mChildren.clear();
// Destroy used blocks. This is a panic mode to avoid leaks. NOTE: do this only on roots!
if( !mParent ) {
while( mUsedBlocks ) {
CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL));
}
}
// We should now have only free blocks that are head blocks. Release those blocks.
while( mFreeBlocks ) {
if( mParent ) {
CNMEM_CHECK(mParent->release(mFreeBlocks->getData()));
}
else if( mFreeBlocks->isHead() ) {
void *data = mFreeBlocks->getData();
CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data);
CNMEM_CHECK_CUDA(cudaFree(data));
CNMEM_DEBUG_INFO(">> success\n");
}
Block *block = mFreeBlocks;
mFreeBlocks = mFreeBlocks->getNext();
delete block;
}
// No used block should be left at this point; if any remains, the user has leaked memory!
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) {
// The current node cannot be NULL!
CNMEM_ASSERT(curr != NULL);
// Change the connection of the node.
if( prev ) {
prev->setNext(curr->getNext());
}
else {
mUsedBlocks = curr->getNext();
}
// Find the location where this block should be added to the free list.
prev = NULL;
Block *iter = mFreeBlocks;
for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) {
prev = iter;
}
// Keep track of the successor of prev. We may lose track of it in the following "else".
Block *next = prev ? prev->getNext() : mFreeBlocks;
// First check whether the predecessor is contiguous with curr and curr is not a head block.
if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) {
prev->setSize(prev->getSize() + curr->getSize());
delete curr;
curr = prev;
}
else if( prev ) {
prev->setNext(curr);
}
else {
mFreeBlocks = curr;
}
// Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) {
curr->setSize(curr->getSize() + next->getSize());
curr->setNext(next->getNext());
delete next;
}
else {
curr->setNext(next);
}
return CNMEM_STATUS_SUCCESS;
}
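The coalescing rule of releaseBlockUnsafe can be sketched on plain offset/size intervals. This stand-alone toy model (a hypothetical `Interval` type, not cnmem's `Block`) shows the key invariant: a freed interval merges with a neighbor only when the two are contiguous and the later of the two does not start a separate cudaMalloc arena (a "head" block).

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical free-list entry: a half-open byte range [offset, offset+size).
struct Interval {
    std::size_t offset, size;
    bool        head;   // true if this interval starts a cudaMalloc arena
};

// Insert 'blk' into the sorted free list, merging where allowed.
void insertAndCoalesce(std::vector<Interval> &freeList, Interval blk) {
    std::size_t i = 0;
    while( i < freeList.size() && freeList[i].offset < blk.offset ) ++i;
    // Merge with the successor first; the absorbed (later) block must not be a head.
    if( i < freeList.size() && blk.offset + blk.size == freeList[i].offset
            && !freeList[i].head ) {
        blk.size += freeList[i].size;
        freeList.erase(freeList.begin() + i);
    }
    // Merge with the predecessor; here 'blk' is the later block, same restriction.
    if( i > 0 && freeList[i-1].offset + freeList[i-1].size == blk.offset
            && !blk.head ) {
        freeList[i-1].size += blk.size;
    } else {
        freeList.insert(freeList.begin() + i, blk);
    }
}
```

Releasing the middle of three contiguous ranges therefore collapses all three into one, while a head block never merges backward into the previous arena.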
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::reserve(std::size_t size) {
CNMEM_CHECK(mMutex.lock());
Block *curr, *prev;
CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(curr, prev, size), mMutex);
mSize = size;
CNMEM_CHECK(mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) {
// If we cannot steal, don't even try.
if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) {
stolen = NULL;
return CNMEM_STATUS_INVALID_ARGUMENT;
}
// The stolen block.
void *data = NULL; std::size_t dataSize = 0;
if( !mChildren.empty() ) {
CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size));
}
else if( mParent ) {
CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size));
}
// Make sure we do have a block of memory or quit.
if( !data ) {
stolen = NULL;
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Push the block in the used list.
mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true);
if( !mUsedBlocks ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// Return the new pointer into memory.
stolen = data;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size) {
// No block found and no room to grow. Try to steal from a child (if we have any).
data = NULL;
for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
Manager *child = mChildren[i];
if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
break;
}
}
// If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
if( !data ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
// We have got a block from a child. We need to update our "used" list before we can do
// anything with it.
Block *curr = mUsedBlocks, *prev = NULL;
for( ; curr ; curr = curr->getNext() ) {
if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
break;
}
prev = curr;
}
// Curr points to the node which contains that memory region.
CNMEM_ASSERT(curr);
// If it is exactly the same memory region, we are done!!!
if( curr->getData() == data && curr->getSize() == dataSize ) {
return CNMEM_STATUS_SUCCESS;
}
// Track the blocks before and after curr.
Block *next = curr->getNext();
// We may have up to 3 blocks.
std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
// The resulting block.
Block *result = curr;
// If we have no space between curr->getData and block->getData.
if( sizeBefore == 0 ) {
curr->setSize(dataSize);
}
else {
curr->setSize(sizeBefore);
Block *block = new Block((char*) data, dataSize, next, false);
if( !block ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
curr->setNext(block);
curr = block;
data = (char*) data + dataSize;
dataSize = sizeAfter;
result = block;
}
// We have space at the end so we may need to add a new node.
if( sizeAfter > 0 ) {
Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
if( !block ) {
return CNMEM_STATUS_OUT_OF_MEMORY;
}
curr->setNext(block);
curr = block;
}
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
class Context {
/// Use a magic number to specify that the context is valid.
enum { CTX_VALID = 0x1f5632a3 };
/// The reference counting mechanism.
int mRefCount;
/// The mutex to increase/decrease the reference counter. TODO: Use atomics.
Mutex mMutex;
/// The memory managers.
std::vector<Manager> mManagers;
/// The global context.
static Context *sCtx;
/// Use a magic number to specify that the context was created.
static int sCtxCheck;
public:
/// Ctor.
Context() : mRefCount(1) { mMutex.initialize(); }
/// Dtor.
~Context();
/// Get the managers.
inline std::vector<Manager>& getManagers() { return mManagers; }
/// Get a single manager associated with a device.
inline Manager& getManager(int i) { return mManagers[i]; }
/// Create the global context.
static cnmemStatus_t create();
/// Check that the context was created.
static inline bool check() { return sCtxCheck == CTX_VALID && sCtx; }
/// Get the global context.
static Context* get();
/// Retain.
static cnmemStatus_t retain();
/// Release.
static cnmemStatus_t release();
};
Context *Context::sCtx;
int Context::sCtxCheck;
///////////////////////////////////////////////////////////////////////////////////////////////////
Context::~Context() {
int oldDevice;
cudaGetDevice(&oldDevice);
for( std::size_t i = 0 ; i < mManagers.size() ; ++i ) {
if( mManagers[i].getDevice() != -1 ) { // Skip invalid managers.
cudaSetDevice(mManagers[i].getDevice());
mManagers[i].releaseAllUnsafe();
}
}
mManagers.clear();
mMutex.finalize();
cudaSetDevice(oldDevice);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Context::create() {
sCtx = new Context;
sCtxCheck = CTX_VALID;
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
Context* Context::get() {
CNMEM_ASSERT(Context::check());
return Context::sCtx;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Context::retain() {
CNMEM_CHECK(sCtx->mMutex.lock());
sCtx->mRefCount++;
CNMEM_CHECK(sCtx->mMutex.unlock());
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Context::release() {
CNMEM_CHECK(sCtx->mMutex.lock());
int refCount = --sCtx->mRefCount;
CNMEM_CHECK(sCtx->mMutex.unlock());
if( refCount == 0 ) { // Kill the context.
delete sCtx;
Context::sCtx = NULL;
Context::sCtxCheck = 0;
}
return CNMEM_STATUS_SUCCESS;
}
} // namespace cnmem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
// Make sure we have at least one device declared.
CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
// Find the largest ID of the device.
int maxDevice = 0;
for( int i = 0 ; i < numDevices ; ++i ) {
if( devices[i].device > maxDevice ) {
maxDevice = devices[i].device;
}
}
// Create the global context.
cnmem::Context::create();
cnmem::Context *ctx = cnmem::Context::get();
// Allocate enough managers.
CNMEM_CHECK_TRUE(maxDevice >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
std::vector<cnmem::Manager> &managers = ctx->getManagers();
managers.resize(maxDevice+1);
// Create a root manager for each device and create the children.
int oldDevice;
CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice));
for( int i = 0 ; i < numDevices ; ++i ) {
CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device));
std::size_t size = devices[i].size;
if( size == 0 ) {
cudaDeviceProp props;
CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
size = props.totalGlobalMem / 2;
}
CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT);
cnmem::Manager &manager = ctx->getManager(devices[i].device);
manager.setDevice(devices[i].device);
manager.setFlags(flags);
size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
CNMEM_CHECK(manager.reserve(size));
for( int j = 0 ; j < devices[i].numStreams ; ++j ) {
cnmem::Manager *child = new cnmem::Manager;
child->setParent(&manager);
child->setDevice(devices[i].device);
child->setStream(devices[i].streams[j]);
child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW);
if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) {
CNMEM_CHECK(child->reserve(devices[i].streamSizes[j]));
}
CNMEM_CHECK(manager.addChild(child));
}
}
CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice));
return CNMEM_STATUS_SUCCESS;
}
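cnmemInit rounds the reserved size with cnmem::ceilInt before calling Manager::reserve, and cnmemMalloc applies the same rounding to every request. A minimal stand-alone equivalent of that rounding, assuming for illustration a power-of-two granularity of 512 bytes (the actual value comes from CNMEM_GRANULARITY, defined earlier in this file):

```cpp
#include <cassert>
#include <cstddef>

// Illustrative granularity; cnmem uses CNMEM_GRANULARITY instead.
const std::size_t kGranularity = 512;

// Round 'size' up to the next multiple of kGranularity. The bit trick is
// valid only because kGranularity is a power of two; cnmem::ceilInt uses
// the more general (m + n - 1) / n * n form.
std::size_t roundUp(std::size_t size) {
    return (size + kGranularity - 1) & ~(kGranularity - 1);
}
```

Rounding keeps every block size a multiple of the granularity, so splits and merges in the free list never create unusably small fragments.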
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemFinalize() {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
return cnmem::Context::release();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemRetain() {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
return cnmem::Context::retain();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemRelease() {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
return cnmem::Context::release();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT);
int device;
CNMEM_CHECK_CUDA(cudaGetDevice(&device));
cnmem::Manager &root = cnmem::Context::get()->getManager(device);
cnmem::Manager *child = new cnmem::Manager;
child->setParent(&root);
child->setDevice(device);
child->setStream(stream);
child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW);
CNMEM_CHECK(root.addChild(child));
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
if( !ptr && !size ) {
return CNMEM_STATUS_SUCCESS;
}
else if( !size ) {
ptr[0] = NULL;
return CNMEM_STATUS_SUCCESS;
}
CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT);
int device;
CNMEM_CHECK_CUDA(cudaGetDevice(&device));
cnmem::Manager &root = cnmem::Context::get()->getManager(device);
cnmem::Manager *manager = &root;
if( stream ) {
CNMEM_CHECK(root.getChildFromStream(manager, stream));
}
CNMEM_ASSERT(manager);
size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
cnmemStatus_t result = manager->allocate(ptr[0], size);
// We failed to allocate but there might still be a buffer available in another manager. Try to
// steal it.
if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
// Try to acquire locks on all the children.
std::size_t numChildren;
CNMEM_CHECK(root.getNumChildren(numChildren));
std::vector<const cnmem::Mutex*> mutexes(numChildren);
std::size_t numLocked = 0;
for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
cnmem::Manager *child;
CNMEM_CHECK(root.getChild(child, i));
mutexes[numLocked] = child->getMutex();
if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
break;
}
}
// One lock failed, quit. Reduce the damage as much as possible, though.
if( numLocked != numChildren ) {
for( std::size_t i = 0 ; i < numLocked ; ++i ) {
cnmemStatus_t lockStatus = mutexes[i]->unlock();
}
return CNMEM_STATUS_UNKNOWN_ERROR;
}
// Grab the lock on the root, first.
const cnmem::Mutex *rootMutex = root.getMutex();
CNMEM_CHECK(rootMutex->lock());
// We acquired all the lock so we try to steal a node from another child.
if( numLocked == mutexes.size() ) {
result = manager->stealUnsafe(ptr[0], size);
}
for( std::size_t i = 0 ; i < numLocked ; ++i ) {
cnmemStatus_t lockStatus = mutexes[i]->unlock();
if( lockStatus != CNMEM_STATUS_SUCCESS ) {
// From this point on we are in panic mode! One lock failed to be released; we still try
// to release the others. We could also give up, since we are already in a bad state.
// Comments on the best strategy are welcome.
result = lockStatus;
}
}
CNMEM_CHECK(rootMutex->unlock());
}
return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
if( ptr == NULL ) {
return CNMEM_STATUS_SUCCESS;
}
int device;
CNMEM_CHECK_CUDA(cudaGetDevice(&device));
cnmem::Manager &root = cnmem::Context::get()->getManager(device);
cnmem::Manager *manager = &root;
if( stream ) {
CNMEM_CHECK(root.getChildFromStream(manager, stream));
}
CNMEM_ASSERT(manager);
return manager->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT);
int device;
CNMEM_CHECK_CUDA(cudaGetDevice(&device));
cnmem::Manager &root = cnmem::Context::get()->getManager(device);
cnmem::Manager *manager = &root;
if( stream ) {
CNMEM_CHECK(root.getChildFromStream(manager, stream));
}
CNMEM_ASSERT(manager);
const cnmem::Mutex *mutex = manager->getMutex();
CNMEM_CHECK(mutex->lock());
CNMEM_CHECK_OR_UNLOCK(manager->getFreeMemoryUnsafe(*freeMem), *mutex);
size_t usedMem;
CNMEM_CHECK_OR_UNLOCK(manager->getUsedMemoryUnsafe(usedMem), *mutex);
CNMEM_CHECK(mutex->unlock());
totalMem[0] = usedMem + freeMem[0];
return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) {
CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
int device;
CNMEM_CHECK_CUDA(cudaGetDevice(&device));
cnmem::Manager &root = cnmem::Context::get()->getManager(device);
cnmem::Manager *manager = &root;
if( stream ) {
CNMEM_CHECK(root.getChildFromStream(manager, stream));
}
CNMEM_ASSERT(manager);
return manager->printMemoryState(file);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include <cstdio>
#else
#include <stdio.h>
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
typedef enum
{
CNMEM_STATUS_SUCCESS = 0,
CNMEM_STATUS_CUDA_ERROR,
CNMEM_STATUS_INVALID_ARGUMENT,
CNMEM_STATUS_NOT_INITIALIZED,
CNMEM_STATUS_OUT_OF_MEMORY,
CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;
/* ********************************************************************************************* */
typedef enum
{
CNMEM_FLAGS_DEFAULT = 0, ///< Default flags.
CNMEM_FLAGS_CANNOT_GROW = 1, ///< Prevent the manager from growing its memory consumption.
CNMEM_FLAGS_CANNOT_STEAL = 2, ///< Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
typedef struct cnmemDevice_t_
{
/** The device number. */
int device;
/** The size to allocate for that device. If 0, the implementation chooses the size. */
size_t size;
/** The number of named streams associated with the device. The NULL stream is not counted. */
int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams;
/** The size reserved for each stream. It can be 0. */
size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (the size defined in devices[i].size). For each named stream, an additional
* memory manager is created. Currently, this is implemented as a tree of memory managers: a root
* manager for the device and a list of children, one per named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocations, which may be destroyed in a final
* memory collection after the end of main(). This function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocations, which may be destroyed in a final
* memory collection after the end of main(). This function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. The
* mechanism to steal memory from the children also induces GPU synchronizations (the manager
* has to make sure no kernel uses a given buffer before stealing it), and the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
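The lookup order described above can be sketched with a plain host-side free list. This is a hypothetical simplification, not the cnmem implementation: `malloc` stands in for `cudaMalloc`, the stealing step is elided, and all names (`Manager`, `Block`, `manager_malloc`) are illustrative.

```c
#include <stdlib.h>
#include <stddef.h>

typedef struct Block {
    void *ptr;
    size_t size;
    struct Block *next;
} Block;

typedef struct {
    Block *free_list;   /* buffers released earlier, available for reuse */
    int can_grow;       /* 0 simulates CNMEM_FLAGS_CANNOT_GROW */
} Manager;

/* Returns a buffer of at least `size` bytes, or NULL (out of memory). */
static void *manager_malloc(Manager *m, size_t size)
{
    /* First, search the free list for a block of sufficient size. */
    Block **link = &m->free_list;
    for (Block *b = m->free_list; b; link = &b->next, b = b->next) {
        if (b->size >= size) {
            void *p = b->ptr;
            *link = b->next;    /* unlink the node and hand back its buffer */
            free(b);
            return p;
        }
    }
    /* No free block: grow if allowed (cnmem would call cudaMalloc here,
     * then try stealing from child managers before giving up). */
    if (!m->can_grow)
        return NULL;
    return malloc(size);
}

/* Recycles a buffer into the free list instead of returning it to the OS. */
static void manager_free(Manager *m, void *ptr, size_t size)
{
    Block *b = malloc(sizeof(Block));
    b->ptr = ptr;
    b->size = size;
    b->next = m->free_list;
    m->free_list = b;
}
```

The point of the sketch is the ordering: reuse is always attempted before any fresh allocation, which is what makes repeated alloc/free cycles cheap once the pool has warmed up.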
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
 * The pointers totalMem and freeMem must be valid. At the moment, this function has a
 * complexity linear in the number of allocated blocks, so do not call it in performance
 * critical sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
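Why the complexity is linear: assuming (as the note above suggests) the manager keeps no running counters, each query must walk the whole block list and sum sizes. The struct layout and names below are illustrative, not cnmem's.

```c
#include <stddef.h>

typedef struct Node {
    size_t size;
    int is_free;
    struct Node *next;
} Node;

/* Walks every block once: O(number of allocated blocks) per call. */
static void mem_get_info(const Node *blocks, size_t *freeMem, size_t *totalMem)
{
    *freeMem = 0;
    *totalMem = 0;
    for (const Node *n = blocks; n; n = n->next) {
        *totalMem += n->size;
        if (n->is_free)
            *freeMem += n->size;
    }
}
```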
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, file == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char* CNMEM_API cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif
...@@ -9,6 +9,13 @@
#include "cuda_ndarray.cuh"
#ifndef CNMEM_DLLEXPORT
#define CNMEM_DLLEXPORT
#endif
#include "cnmem.h"
#include "cnmem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0
...@@ -67,6 +74,54 @@ void * device_malloc(size_t size)
    return device_malloc(size, VERBOSE_DEVICE_MALLOC);
}
///@TODO: thejaswi: link this option to a theano config variable?
static bool g_use_cnmem = false;
static const int g_max_devices = 8;
int initCnmem(int card_number_provided, int card_nb, size_t mem) {
static bool cnmemInitialized = false;
if(cnmemInitialized) {
return 0;
}
// On stderr to be at the same place as "Using gpu device..."
int numDevices = 0;
cnmemDevice_t devices[g_max_devices];
if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
return -1;
}
if(card_number_provided){
numDevices = 1;
int i = 0;
devices[i].device = card_nb;
devices[i].size = mem;
///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0;
devices[i].streams = NULL;
devices[i].streamSizes = NULL;
}else{
for(int i=0;i<numDevices;++i) {
devices[i].device = i;
devices[i].size = mem;
///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0;
devices[i].streams = NULL;
}
}
///@TODO: thejaswi: passing custom cnmem flags?
cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
cnmemGetErrorString(status), numDevices);
return -1;
}
cnmemInitialized = true;
return 0;
}
void * device_malloc(size_t size, int verbose)
{
#if PRECHECK_ERROR
...@@ -81,6 +136,18 @@ void * device_malloc(size_t size, int verbose)
    }
#endif
    void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams?
if(g_use_cnmem) {
cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
status = cnmemMalloc(&rval, size, NULL);
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"Error allocating %zd bytes of device memory (%s).",
size, cnmemGetErrorString(status));
return NULL;
}
}
else {
        cudaError_t err = cudaMalloc(&rval, size);
        if (cudaSuccess != err)
        {
...@@ -118,6 +185,7 @@ void * device_malloc(size_t size, int verbose)
                         size, cudaGetErrorString(err));
            return NULL;
        }
}
    if (rval != NULL){
        // Can it happen that cudaMalloc returns cudaSuccess, but returns a NULL ptr?
        // Could this be what happens if size is 0?
...@@ -202,6 +270,15 @@ int device_free(void *ptr)
        return 0;
    }
///@TODO: thejaswi: multi-stream support
if(g_use_cnmem) {
cnmemStatus_t status = cnmemFree(ptr, NULL);
if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
cnmemGetErrorString(status));
}
}
else {
        // We need sync as Theano's GC could remove intermediate variables that
        // are still needed while gpu kernels are running or in the queue.
        CNDA_BEGIN_ALLOW_THREADS
...@@ -259,6 +336,7 @@ int device_free(void *ptr)
                        cudaGetErrorString(err));
            return -1;
        }
}
    _outstanding_mallocs[0] -= (ptr != NULL);
#if COMPUTE_GPU_MEM_USED
    int i=0;
...@@ -2863,6 +2941,32 @@ CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy)
    return Py_True;
}
PyObject *
CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
{
void * rval = NULL;
cudaError_t err = cudaMalloc(&rval, 4);
if (cudaSuccess != err){
        printf("ERR!\n");
PyErr_Format(PyExc_RuntimeError,
"Not able to do basic stuff on the GPU (alloc of 4 bytes) (%s).",
cudaGetErrorString(err));
return NULL;
}
err = cudaFree(rval);
if (cudaSuccess != err){
        printf("ERR!\n");
PyErr_Format(PyExc_RuntimeError,
"Not able to do basic stuff on the GPU (cudaFree failed) (%s).",
cudaGetErrorString(err));
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
}
#if COMPUTE_GPU_MEM_USED
/*
 * Return the size in bytes that Theano currently has allocated on the gpu.
...@@ -3030,18 +3134,23 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init();
static void cublas_shutdown();
// Initialize the gpu.
// Takes two optional parameters, the device number and whether we should use cnmem.
// If the device number is provided, it sets that device to be the active device.
// If not provided (usually just to test whether the gpu is available at all),
// it does not set an active device.
// Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
// cnmem is treated like a bool. If converted to 0, don't use cnmem. Otherwise, use it.
PyObject *
CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{
    int card_nb = 0;
    int card_number_provided = 1;
    float cnmem = 0; // Theano flag lib.cnmem
    // if we're given something wildly invalid, this will throw a TypeError
if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem))
return NULL;
if(cnmem)
g_use_cnmem = true;
    if(PyTuple_Size(args) == 0) {
        card_number_provided = 0;
...@@ -3096,6 +3205,34 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
        if (cublas_init() == -1)
            return NULL;
    }
if(card_number_provided && g_use_cnmem) {
size_t mem = 0;
if (cnmem > 1)
mem = cnmem * 1024 * 1024;
else{
// Clip to 98.5% to let memory for the driver.
if (cnmem > .985){
cnmem = .985;
}
size_t free = 0, total = 0;
cudaError_t err = cudaMemGetInfo(&free, &total);
if (err != cudaSuccess){
// Clear the error flag, cudaMemGetInfo doesn't do it.
// Currently this returns the same thing as err, but if in future
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError,
"Error while getting memory info about the gpu: %s",
cudaGetErrorString(err));
return NULL;
}
mem = total * cnmem;
}
if(initCnmem(card_number_provided, card_nb, mem) == -1){
return NULL;
}
}
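The size computation above can be distilled into a small helper (a sketch mirroring the logic, not code from the commit; the function name is illustrative): a flag value greater than 1 is interpreted as a size in megabytes, while a fractional value is a share of total device memory, clipped to 98.5% so the driver keeps some room.

```c
#include <stddef.h>

/* Interpret the Theano lib.cnmem flag value as a pool size in bytes. */
static size_t cnmem_pool_size(float cnmem, size_t total_device_mem)
{
    if (cnmem > 1.0f)
        return (size_t)(cnmem * 1024 * 1024);   /* megabytes -> bytes */
    if (cnmem > 0.985f)
        cnmem = 0.985f;                          /* leave memory for the driver */
    return (size_t)(total_device_mem * cnmem);   /* fraction of total memory */
}
```

In the real code `total_device_mem` comes from `cudaMemGetInfo`, which the function above queries only for fractional values, since an absolute megabyte count needs no knowledge of the device.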
    Py_INCREF(Py_None);
    return Py_None;
...@@ -3126,8 +3263,20 @@ PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
    // Don't handle errors here
    cublas_shutdown();
    g_gpu_context_active = 0; // context has now been closed down
if(g_use_cnmem) {
cnmemStatus_t status = cnmemFinalize();
if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
cnmemGetErrorString(status));
if(status == CNMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
}
}
}
cudaThreadExit();
    Py_INCREF(Py_None);
    return Py_None;
}
...@@ -3392,6 +3541,7 @@ static PyMethodDef module_methods[] = {
    {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."},
    {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
    {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
{"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before gpu_init call and let the driver choose the GPU."},
    {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
    {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
    {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
......