Update the version of CNMeM

This new version of CNMeM corresponds to the following commit: https://github.com/NVIDIA/cnmem/commit/2559e911ca5ad33c8a8aa7e5877345265115d963. It contains two bug fixes: 1/ A critical fix for the case where the first device used by the library is not device 0, which could result in a call to cudaSetDevice(-1). 2/ A minor fix for calls to cnmemMalloc with a non-NULL pointer but a size of 0. Such calls used to return a CNMEM_STATUS_BAD_PARAM error; they now return success. I also changed the code in cuda_ndarray.cu to remove the extra check that was made when the size of the allocation was 0.

Update the version of CNMeM
2d9aff75 · Julien Demouth · Frederic · 130d2ce9 · 2d9aff75 · 2d9aff75
--- a/theano/sandbox/cuda/cnmem.cpp
+++ b/theano/sandbox/cuda/cnmem.cpp
--- a/theano/sandbox/cuda/cnmem.h
+++ b/theano/sandbox/cuda/cnmem.h
@@ -57,7 +57,6 @@ typedef enum
  CNMEM_STATUS_SUCCESS = 0,
  CNMEM_STATUS_CUDA_ERROR,
  CNMEM_STATUS_INVALID_ARGUMENT,
-  CNMEM_STATUS_MEMORY_LEAK,
  CNMEM_STATUS_NOT_INITIALIZED,
  CNMEM_STATUS_OUT_OF_MEMORY,
  CNMEM_STATUS_UNKNOWN_ERROR
@@ -109,29 +108,58 @@ typedef struct cnmemDevice_t_
 cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
 /**
- * \brief Add a new stream to the pool of managed streams on a device.
+ * \brief Release all the allocated memory. 
 * 
- * This function registers a new stream into a device memory manager. It is thread-safe.
+ * This function must be called by a single thread and after all threads that called 
+ * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
 *
 * \return 
 * CNMEM_STATUS_SUCCESS,          if everything goes fine,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
+ * CNMEM_STATUS_NOT_INITIALIZED,  if the ::cnmemInit function has not been called,
+ * CNMEM_STATUS_CUDA_ERROR,       if an error happens in one of the CUDA functions.
 */
-cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
+cnmemStatus_t CNMEM_API cnmemFinalize();
 /**
- * \brief Release all the allocated memory. 
+ * \brief Increase the internal reference counter of the context object.
 * 
- * This function must be called by a single thread and after all threads that called 
+ * This function increases the internal reference counter of the library. The purpose of that
- * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
+ * reference counting mechanism is to give more control to the user over the lifetime of the 
+ * library. It is useful with scoped memory allocation which may be destroyed in a final 
+ * memory collection after the end of main(). That function is thread-safe.
 *
 * \return 
 * CNMEM_STATUS_SUCCESS,          if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED,  if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_MEMORY_LEAK,      if there are unreleased blocks in the memory queues,
- * CNMEM_STATUS_CUDA_ERROR,       if an error happens in one of the CUDA functions.
 */
-cnmemStatus_t CNMEM_API cnmemFinalize();
+cnmemStatus_t CNMEM_API cnmemRetain();
+/**
+ * \brief Decrease the internal reference counter of the context object.
+ * 
+ * This function decreases the internal reference counter of the library. The purpose of that
+ * reference counting mechanism is to give more control to the user over the lifetime of the 
+ * library. It is useful with scoped memory allocation which may be destroyed in a final 
+ * memory collection after the end of main(). That function is thread-safe.
+ *
+ * You can use \c cnmemRelease to explicitly finalize the library.
+ *
+ * \return 
+ * CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ * CNMEM_STATUS_NOT_INITIALIZED,  if the ::cnmemInit function has not been called,
+ */
+cnmemStatus_t CNMEM_API cnmemRelease();
+/**
+ * \brief Add a new stream to the pool of managed streams on a device.
+ *
+ * This function registers a new stream into a device memory manager. It is thread-safe.
+ *
+ * \return 
+ * CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
+ */
+cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
 /**
 * \brief Allocate memory. 

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -138,9 +138,7 @@ void * device_malloc(size_t size, int verbose)
    ///@TODO: thejaswi: support for multiple-streams?
    if(g_use_cnmem) {
        cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
-        if( size != 0 ) {
        status = cnmemMalloc(&rval, size, NULL);
-        }
        if(status != CNMEM_STATUS_SUCCESS) {
            PyErr_Format(PyExc_MemoryError,
                         "Error allocating %zd bytes of device memory (%s).",