Commit c6979aee authored by Pascal Lamblin

Merge pull request #4207 from nouiz/cnmem_threashold

CNMeM threshold, comment, don't call dnn_version, and optimization registration.
@@ -403,17 +403,18 @@ import theano and print the config variable, as in:
     Float value: >= 0

     Controls the use of `CNMeM <https://github.com/NVIDIA/cnmem>`_ (a
-    faster CUDA memory allocator). In Theano dev version until 0.7.1
+    faster CUDA memory allocator). In Theano dev version until 0.8
     is released.
-    The CNMeM library is included in Theano and does not need to be separately installed.
+    The CNMeM library is included in Theano and does not need to be
+    separately installed.

     The value represents the start size (either in MB or the fraction of total GPU
     memory) of the memory pool. If more memory is needed, Theano will
     try to obtain more, but this can cause memory fragmentation.

     * 0: not enabled.
-    * 0 < N <= 1: use this fraction of the total GPU memory (clipped to .985 for driver memory)
+    * 0 < N <= 1: use this fraction of the total GPU memory (clipped to .95 for driver memory).
     * > 1: use this number in megabytes (MB) of memory.
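The three value forms above map directly onto ``THEANO_FLAGS`` settings. A minimal sketch, assuming the flag is named ``lib.cnmem`` as in Theano's config namespace:

```shell
# Enable CNMeM with 80% of total GPU memory (fraction form, subject to
# the 95% clip described above):
export THEANO_FLAGS="device=gpu,lib.cnmem=0.8"

# Or request an exact pool of 2048 MB, which bypasses the clip:
export THEANO_FLAGS="device=gpu,lib.cnmem=2048"
echo "$THEANO_FLAGS"
```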
@@ -426,6 +427,13 @@ import theano and print the config variable, as in:
     the start or disable it. If you try this, report your result
     on :ref:`theano-dev`.

+    .. note::
+
+        The clipping at 95% can be bypassed by specifying the exact
+        number of megabytes. If more than 95% is needed, Theano will
+        automatically try to get more memory, but this can cause
+        fragmentation; see the note above.
+
 .. attribute:: linker

     String value: 'c|py', 'py', 'c', 'c|py_nogc'
......
@@ -272,7 +272,7 @@ if __name__ == "__main__":
         GTX 660    2.32s    2.32s
         GTX 580    2.42s    2.47s
         GTX 480    2.87s    2.88s
-        TX1                 7.6s
+        TX1                 7.6s (float32 storage and computation)
         GT 610              33.5s
         """)
......
@@ -576,13 +576,14 @@ def use(device,
         cudnn_version = "not available"
         warn = None
         try:
-            (hdr_v, runtime_v) = dnn_version()
-            cudnn_version = runtime_v
-            # 4100 should not print warning with cudnn 4 final.
-            if cudnn_version > 4100:
-                warn = ("Your CuDNN version is more recent then Theano."
-                        " If you see problems, try updating Theano or"
-                        " downgrading CuDNN to version 4.")
+            if dnn_available():
+                (hdr_v, runtime_v) = dnn_version()
+                cudnn_version = runtime_v
+                # 4100 should not print warning with cudnn 4 final.
+                if cudnn_version > 4100:
+                    warn = ("Your CuDNN version is more recent than Theano."
+                            " If you see problems, try updating Theano or"
+                            " downgrading CuDNN to version 4.")
         except Exception:
             pass
         print("Using gpu device %d: %s (CNMeM is %s, CuDNN %s)" % (
......
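The guarded pattern introduced in the hunk above can be sketched on its own: only query the cuDNN version when cuDNN is actually usable, so machines without cuDNN never call into it. This is an illustrative paraphrase, not Theano code; `dnn_available` and `dnn_version` are passed in as stand-ins for the real helpers.

```python
def report_cudnn(dnn_available, dnn_version):
    """Return (version, warning) without touching cuDNN when it is absent."""
    cudnn_version = "not available"
    warn = None
    try:
        if dnn_available():
            # dnn_version() is assumed to return (header_version, runtime_version).
            _hdr_v, runtime_v = dnn_version()
            cudnn_version = runtime_v
            # 4100 is the cuDNN 4 final version number; anything newer than
            # what Theano was tested against earns a warning.
            if cudnn_version > 4100:
                warn = "Your cuDNN version is more recent than Theano."
    except Exception:
        pass
    return cudnn_version, warn
```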
@@ -3244,10 +3244,10 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
     if (cnmem > 1)
         mem = cnmem * 1024 * 1024;
     else{
-        // Clip to 98% to let memory for the driver.
-        // 98.5% didn't worked in some cases.
-        if (cnmem > .98){
-            cnmem = .98;
+        // Clip to 95% to leave memory for the driver.
+        // 98% didn't work in some cases.
+        if (cnmem > .95){
+            cnmem = .95;
         }
         size_t free = 0, total = 0;
         cudaError_t err = cudaMemGetInfo(&free, &total);
......
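The pool-sizing rule in the C hunk above can be summarized in a few lines of Python (a paraphrase for illustration, not Theano code; the function name and MB-based interface are made up here):

```python
def cnmem_pool_size(cnmem, total_mb, clip=0.95):
    """Mirror of the CNMeM start-size rule.

    cnmem == 0      -> pool disabled
    0 < cnmem <= 1  -> fraction of total GPU memory, clipped to 95%
    cnmem > 1       -> exact size in megabytes, no clipping
    """
    if cnmem == 0:
        return 0
    if cnmem > 1:
        return int(cnmem)  # exact MB request bypasses the clip
    return int(min(cnmem, clip) * total_mb)
```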
@@ -736,6 +736,8 @@ class LogSoftmax(gof.Op):
 logsoftmax_op = LogSoftmax()


+# This is not registered in stabilize, as it causes some crossentropy
+# optimizations to not be inserted.
 @opt.register_specialize('stabilize', 'fast_compile')
 @gof.local_optimizer([tensor.Elemwise])
 def local_logsoftmax(node):
@@ -757,6 +759,8 @@ def local_logsoftmax(node):
         return [ret]


+# This is not registered in stabilize, as it causes some crossentropy
+# optimizations to not be inserted.
 @opt.register_specialize('stabilize', 'fast_compile')
 @gof.local_optimizer([SoftmaxGrad])
 def local_logsoftmax_grad(node):
......
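For context on why these optimizations exist at all: rewriting `log(softmax(x))` into a dedicated log-softmax avoids overflow in the intermediate `exp`. A standalone NumPy sketch of the stable form (illustrative only, not the Theano op):

```python
import numpy as np

def logsoftmax(x):
    # Stable form: x - max(x) - log(sum(exp(x - max(x)))).
    # Never exponentiates a large value, unlike naive log(softmax(x)),
    # which turns exp(1000) into inf and the result into nan.
    xm = x - x.max()
    return xm - np.log(np.exp(xm).sum())

print(logsoftmax(np.array([1000.0, 0.0])))  # finite, ~[0., -1000.]
```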