Commit ca465be0 authored by abergeron

Merge pull request #3198 from nouiz/cumem3

Add CNMeM in Theano to speed up CUDA allocation.
@@ -2,6 +2,7 @@ global-include *.txt
 global-include *.c
 global-include *.cu
 global-include *.cuh
+global-include *.cpp
 global-include *.h
 global-include *.sh
 global-include *.pkl
......
@@ -11,7 +11,7 @@ Acknowledgements
 * The developers of `NumPy <http://numpy.scipy.org/>`_. Theano is based on its ndarray object and uses much of its implementation.
 * The developers of `SciPy <http://scipy.org/>`_. Our sparse matrix support uses their sparse matrix objects. We also reuse other parts.
-* All Theano authors in the commit log.
+* All `Theano contributors <https://github.com/Theano/Theano/graphs/contributors>`_.
 * All Theano users that have given us feedback.
 * The GPU implementation of tensordot is based on code from Tijmen
   Tieleman's `gnumpy <http://www.cs.toronto.edu/~tijmen/gnumpy.html>`_
@@ -24,3 +24,4 @@ Acknowledgements
   P. L'Ecuyer and R. Touzin, `Fast Combined Multiple Recursive Generators with Multipliers of the form a = +/- 2^d +/- 2^e <http://www.informs-sim.org/wsc00papers/090.PDF>`_, Proceedings of the 2000 Winter Simulation Conference, Dec. 2000, 683--689.
   We were authorized by Pierre L'Ecuyer to copy/modify his Java implementation in the `SSJ <http://www.iro.umontreal.ca/~simardr/ssj/>`_ software and to relicense it under BSD 3-Clauses in Theano.
+* A better GPU memory allocator, :attr:`CNMeM <config.lib.cnmem>`, is included in Theano. It has the same license.
@@ -72,13 +72,18 @@ and use directly the optimized graph from the pickled file.
 Faster Theano function
 ----------------------
-You can set the Theano flag ``allow_gc`` to ``False`` to get a speed-up by using
+You can set the Theano flag :attr:`allow_gc <config.allow_gc>` to ``False`` to get a speed-up by using
 more memory. By default, Theano frees intermediate results when we don't need
 them anymore. Doing so prevents us from reusing this memory. So disabling the
 garbage collection will keep all intermediate results' memory space to allow
 reusing them during the next call to the same Theano function, if they are of the
 correct shape. The shape could change if the shapes of the inputs change.
+
+.. note::
+
+    With :attr:`CNMeM <config.lib.cnmem>`, this isn't very useful on the GPU
+    anymore.
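The reuse-by-shape behaviour described above can be illustrated in plain Python. This is a hypothetical sketch, not Theano's actual implementation: a buffer cache survives between calls (as it does when ``allow_gc=False``) and is reused only when the requested shape matches.

```python
# Hypothetical sketch of shape-keyed buffer reuse (not Theano's real code).

class BufferPool:
    """Keep intermediate buffers; reuse one only if shapes match."""

    def __init__(self):
        self._cache = {}  # shape -> list (stands in for GPU storage)
        self.reuses = 0

    def get(self, shape):
        if shape in self._cache:   # same shape as a previous call:
            self.reuses += 1       # reuse the old storage
            return self._cache[shape]
        buf = [0.0] * (shape[0] * shape[1])
        self._cache[shape] = buf   # with allow_gc=False this survives the call
        return buf

pool = BufferPool()
a = pool.get((2, 3))
b = pool.get((2, 3))   # same shape -> reused
c = pool.get((4, 4))   # new shape -> fresh allocation
print(pool.reuses, a is b, a is c)  # -> 1 True False
```

If input shapes vary between calls, nothing is reused and the retained buffers only cost memory, which is why the flag is a speed/memory trade-off.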
 .. _unsafe_optimization:

 Unsafe optimization
......
@@ -21,6 +21,9 @@ Montreal).
 News
 ====
+* We added support for :attr:`CNMeM <config.lib.cnmem>` to speed up
+  GPU memory allocation.
 * Theano 0.7 was released 26th March 2015. Everybody is encouraged to update.
 * We support `cuDNN <http://deeplearning.net/software/theano/library/sandbox/cuda/dnn.html>`_ if it is installed by the user.
......
@@ -370,6 +370,34 @@ import theano and print the config variable, as in:
     `amdlibm <http://developer.amd.com/cpu/libraries/libm/>`__
     library, which is faster than the standard libm.

+.. attribute:: lib.cnmem
+
+    Float value: >= 0
+
+    Do we enable `CNMeM <https://github.com/NVIDIA/cnmem>`_ or not (a
+    faster CUDA memory allocator). Available in the Theano development
+    version until 0.7.1 is released.
+
+    The library is included in Theano; you do not need to install it.
+
+    The value is the start size (in MB or as a fraction of total GPU
+    memory) of the memory pool. If more memory is needed, the pool will
+    try to grow, but this can cause more memory fragmentation:
+
+        * 0: not enabled.
+        * 0 < N <= 1: fraction of the total GPU memory (clipped to .985 for driver memory).
+        * N > 1: use that number of MB of memory.
+
+    Default: 0 (this may change later).
+
+    .. note::
+
+        This could cause memory fragmentation. So if you get a memory
+        error while using CNMeM, try to allocate more memory at the
+        start or disable it. If you try this, report your result on
+        :ref:`theano-dev`.
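The value semantics documented above can be expressed as a small helper. This is a sketch for illustration only; the function name and return convention are not part of Theano's API.

```python
def cnmem_pool_size_mb(cnmem, total_gpu_mb):
    """Interpret the lib.cnmem flag as documented above (sketch, not Theano API).

    0          -> CNMeM disabled (returns None)
    0 < v <= 1 -> fraction of total GPU memory, clipped to 0.985
    v > 1      -> that many MB
    """
    if cnmem < 0:
        raise ValueError("lib.cnmem must be >= 0")
    if cnmem == 0:
        return None                # allocator disabled
    if cnmem <= 1:
        frac = min(cnmem, 0.985)   # leave room for driver memory
        return total_gpu_mb * frac
    return cnmem                   # absolute size in MB

print(cnmem_pool_size_mb(0, 4096))    # None: disabled
print(cnmem_pool_size_mb(0.5, 4096))  # -> 2048.0 (half the GPU memory)
print(cnmem_pool_size_mb(1.0, 4096))  # clipped to 0.985 of 4096 MB
print(cnmem_pool_size_mb(512, 4096))  # -> 512 MB, taken literally
```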
 .. attribute:: linker

     String value: 'c|py', 'py', 'c', 'c|py_nogc'
......
@@ -164,7 +164,7 @@ def do_setup():
         install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'],
         package_data={
             '': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
-                 '*.h', 'ChangeLog'],
+                 '*.h', '*.cpp', 'ChangeLog'],
             'theano.misc': ['*.sh']
         },
         scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
@@ -13,7 +13,8 @@ from theano.compile import optdb
 from theano.gof import EquilibriumDB, SequenceDB
 from theano.gof.cmodule import get_lib_extension
 from theano.gof.compilelock import get_lock, release_lock
-from theano.configparser import config, AddConfigVar, StrParam, BoolParam
+from theano.configparser import (
+    config, AddConfigVar, BoolParam, FloatParam, StrParam)
 from . import nvcc_compiler

 # ignore_newtrees is to speed the optimization as this is the pattern
@@ -54,6 +55,21 @@ AddConfigVar('cublas.lib',
              """Name of the cuda blas library for the linker.""",
              StrParam('cublas'))

+AddConfigVar('lib.cnmem',
+             """Do we enable CNMeM or not (a faster CUDA memory allocator).
+
+             The parameter represents the start size (in MB or as a
+             fraction of total GPU memory) of the memory pool.
+
+             0: not enabled.
+             0 < N <= 1: fraction of the total GPU memory (clipped to .985 for driver memory).
+             N > 1: use that number of MB of memory.
+             """,
+             # We should not mix both allocators, so we can't override.
+             FloatParam(0, lambda i: i >= 0, allow_override=False),
+             in_c_key=False)
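The `FloatParam(0, lambda i: i >= 0, allow_override=False)` pattern above pairs a default value with a validation predicate and a lock against later changes. A minimal standalone sketch of that idea follows; the class here is hypothetical and only mimics the shape of Theano's configparser parameters.

```python
class FloatParam:
    """Minimal sketch of a validated float config parameter (not Theano's class)."""

    def __init__(self, default, filter=None, allow_override=True):
        self.filter = filter                  # predicate values must satisfy
        self.allow_override = allow_override  # can the value change after init?
        self._value = None
        self.set(default)

    def set(self, value):
        if self._value is not None and not self.allow_override:
            raise RuntimeError("parameter cannot be overridden once set")
        value = float(value)
        if self.filter is not None and not self.filter(value):
            raise ValueError("invalid value: %r" % value)
        self._value = value

    @property
    def value(self):
        return self._value

p = FloatParam(0, lambda i: i >= 0, allow_override=False)
print(p.value)  # -> 0.0
```

Forbidding overrides makes sense here: mixing CNMeM with the plain CUDA allocator mid-run would leave two allocators managing the same device memory.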
 # is_nvcc_available called here to initialize global vars in
 # nvcc_compiler module
 nvcc_compiler.is_nvcc_available()
@@ -107,6 +123,8 @@ def try_import():
               'cuda_ndarray.cu',
               'cuda_ndarray.cuh',
               'conv_full_kernel.cu',
+              'cnmem.h',
+              'cnmem.cpp',
               'conv_kernel.cu')
 stat_times = [os.stat(os.path.join(cuda_path, cuda_file))[stat.ST_MTIME]
               for cuda_file in cuda_files]
@@ -178,7 +196,8 @@ if compile_cuda_ndarray and cuda_available:
                 location=cuda_ndarray_loc,
                 include_dirs=[cuda_path],
                 libs=[config.cublas.lib],
-                preargs=['-O3'] + compiler.compile_args())
+                preargs=['-O3'] + compiler.compile_args(),
+                )
         from cuda_ndarray.cuda_ndarray import *
     except Exception as e:
         _logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))
@@ -377,7 +396,7 @@ def use(device,
     try:
         if (device != 'gpu') and not pycuda_init_dev:
             assert isinstance(device, int)
-            gpu_init(device)
+            gpu_init(device, config.lib.cnmem)
             use.device_number = device
             assert active_device_number() == device
         else:
@@ -387,10 +406,10 @@ def use(device,
             # query the active GPU. If we check the active GPU before
             # the device is initialized we will always receive 0
             # even if another device is selected later.
-            cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((2, 3))
+            cuda_ndarray.cuda_ndarray.select_a_gpu()
             use.device_number = active_device_number()
             # This is needed to initialize the cublas handle.
-            gpu_init(use.device_number)
+            gpu_init(use.device_number, config.lib.cnmem)
         if test_driver:
             import theano.sandbox.cuda.tests.test_driver
@@ -403,8 +422,9 @@ def use(device,
                               " this property")
         if config.print_active_device:
-            print("Using gpu device %d: %s" % (
-                active_device_number(), active_device_name()), file=sys.stderr)
+            cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
+            print("Using gpu device %d: %s (CNMeM is %s)" % (
+                active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr)
         if device_properties(use.device_number)['regsPerBlock'] < 16384:
             # We will try to use too many registers per block at many places
             # when there are only 8k registers per multi-processor.
......
@@ -137,13 +137,9 @@ class BatchedDotOp(GpuOp):
             host_z[i] = host_z[i - 1] + z_stride;
         }

-        err1 = cudaMalloc((void **)&gpu_x, ptr_array_size);
-        if (err1 != cudaSuccess)
-        {
-            CLEANUP();
-            PyErr_Format(PyExc_RuntimeError,
-                         "%%s", "cudaMalloc failure");
-            %(fail)s;
-        }
+        gpu_x = (float **) device_malloc(ptr_array_size);
+        if (gpu_x == NULL){
+            %(fail)s;
+        }
@@ -195,7 +191,7 @@ class BatchedDotOp(GpuOp):
         do \
         { \
             if (host_x) free (host_x); \
-            if (gpu_x) cudaFree(gpu_x); \
+            if (gpu_x) device_free(gpu_x); \
         } while (0)
         """
@@ -213,6 +209,9 @@ class BatchedDotOp(GpuOp):
         return rval

+    def c_code_cache_version(self):
+        return (1,)
+
 batched_dot = BatchedDotOp()

 class GpuDot22(GpuOp):
......
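The `c_code_cache_version` additions and bumps in these hunks (to `(1,)` here, `(11,)` to `(12,)` and `(10,)` to `(11,)` below) force Theano to recompile the op's C code instead of reusing a stale cached module built against the old allocator calls. A hypothetical sketch of version-keyed compilation caching, not Theano's actual cmodule machinery:

```python
# Hypothetical sketch of version-keyed compilation caching (not Theano's cmodule).

compiled_cache = {}

def get_compiled(op_name, version, source):
    """Recompile only when (op_name, version) is not cached yet."""
    key = (op_name, version)
    if key not in compiled_cache:
        # Stand-in for invoking nvcc on the generated C source.
        compiled_cache[key] = "compiled:" + source
    return compiled_cache[key]

m1 = get_compiled("BatchedDotOp", (1,), "device_malloc-based code")
m2 = get_compiled("BatchedDotOp", (1,), "device_malloc-based code")  # cache hit
m3 = get_compiled("SparseBlockGemvSS", (12,), "updated code")        # new key
print(len(compiled_cache))  # -> 2
```

Without the version bump, the old key would still be present and the stale binary, compiled against `cudaMalloc`, would be loaded unchanged.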
@@ -208,22 +208,28 @@ static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
 static int %(n)s_prep(int b, int i, int j, int outsize) {
   int s = b*i*j;
   if (%(n)s_list_len < s) {
-    cudaFree(%(n)s_inp_list);
-    cudaFree(%(n)s_out_list);
-    cudaFree(%(n)s_W_list);
-    if (cudaMalloc(&%(n)s_inp_list, s*sizeof(float *)) != cudaSuccess) return -1;
-    if (cudaMalloc(&%(n)s_out_list, s*sizeof(float *)) != cudaSuccess) return -1;
-    if (cudaMalloc(&%(n)s_W_list, s*sizeof(float *)) != cudaSuccess) return -1;
+    device_free(%(n)s_inp_list);
+    device_free(%(n)s_out_list);
+    device_free(%(n)s_W_list);
+    %(n)s_inp_list = (const float **) device_malloc(s*sizeof(float *));
+    if (%(n)s_inp_list == NULL) return -1;
+    %(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
+    if (%(n)s_out_list == NULL) return -1;
+    %(n)s_W_list = (const float **) device_malloc(s*sizeof(float *));
+    if (%(n)s_W_list == NULL) return -1;
     %(n)s_list_len = s;
   }
   if (%(n)s_iIdx_len < b*i) {
-    cudaFree(%(n)s_iIdx);
-    if (cudaMalloc(&%(n)s_iIdx, b*i*sizeof(npy_intp)) != cudaSuccess) return -1;
+    device_free(%(n)s_iIdx);
+    %(n)s_iIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
+    if (%(n)s_iIdx == NULL) return -1;
     %(n)s_iIdx_len = b*i;
   }
   if (%(n)s_oIdx_len < b*j) {
-    cudaFree(%(n)s_oIdx);
-    if (cudaMalloc(&%(n)s_oIdx, b*j*sizeof(npy_intp)) != cudaSuccess) return -1;
+    device_free(%(n)s_oIdx);
+    %(n)s_oIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
+    if (%(n)s_oIdx == NULL) return -1;
     %(n)s_oIdx_len = b*j;
   }
   return 0;
@@ -326,7 +332,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
                        W=W, fail=sub['fail'], name=nodename)

     def c_code_cache_version(self):
-        return (11,)
+        return (12,)

     def grad(self, inputs, grads):
         o, W, h, inputIdx, outputIdx = inputs
@@ -509,24 +515,27 @@ static size_t %(n)s_yIdx_len;
 static int %(n)s_prep(int b, int i, int j) {
   int s = b*i*j;
   if (%(n)s_list_len < s) {
-    cudaFree(%(n)s_x_list);
-    cudaFree(%(n)s_y_list);
-    cudaFree(%(n)s_out_list);
-    if (cudaMalloc(&%(n)s_x_list, s*sizeof(float *)) != cudaSuccess) return -1;
-    if (cudaMalloc(&%(n)s_y_list, s*sizeof(float *)) != cudaSuccess) return -1;
-    if (cudaMalloc(&%(n)s_out_list, s*sizeof(float *)) != cudaSuccess) return -1;
+    device_free(%(n)s_x_list);
+    device_free(%(n)s_y_list);
+    device_free(%(n)s_out_list);
+    %(n)s_x_list = (const float **) device_malloc(s*sizeof(float *));
+    if (%(n)s_x_list == NULL) return -1;
+    %(n)s_y_list = (const float **) device_malloc(s*sizeof(float *));
+    if (%(n)s_y_list == NULL) return -1;
+    %(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
+    if (%(n)s_out_list == NULL) return -1;
     %(n)s_list_len = s;
   }
   if (%(n)s_xIdx_len < b*i) {
-    cudaFree(%(n)s_xIdx);
-    if (cudaMalloc(&%(n)s_xIdx, b*i*sizeof(npy_intp)) != cudaSuccess)
-      return -1;
+    device_free(%(n)s_xIdx);
+    %(n)s_xIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
+    if (%(n)s_xIdx == NULL) return -1;
     %(n)s_xIdx_len = b*i;
   }
   if (%(n)s_yIdx_len < b*j) {
-    cudaFree(%(n)s_yIdx);
-    if (cudaMalloc(&%(n)s_yIdx, b*j*sizeof(npy_intp)) != cudaSuccess)
-      return -1;
+    device_free(%(n)s_yIdx);
+    %(n)s_yIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
+    if (%(n)s_yIdx == NULL) return -1;
     %(n)s_yIdx_len = b*j;
   }
   return 0;
@@ -626,7 +635,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
                        alpha=alpha, fail=sub['fail'])

     def c_code_cache_version(self):
-        return (10,)
+        return (11,)

 sparse_block_outer_ss = SparseBlockOuterSS(False)
......
Diff is collapsed.