Use the actual amount of available GPU memory when selecting a cuDNN algorithm.

c251a77d · Arnaud Bergeron · Frederic Bastien · 59f671e2 · c251a77d · c251a77d
--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -98,12 +98,35 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
 #endif

  if (!reuse_algo) {
+    size_t free;
+
+    int err2 = gpucontext_property(ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    // Guess 4 MiB if the info is not available
+    if (free == 0) free = 4 * 1024 * 1024;
+
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;
-    err = cudnnFindConvolutionForwardAlgorithm(
+    gpudata *tmpmem;
+
+    tmpmem = gpudata_alloc(ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      return -1;
+    }
+
+    err = cudnnFindConvolutionForwardAlgorithmEx(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
-      desc, APPLY_SPECIFIC(output), 1, &count, &choice);
+      desc, APPLY_SPECIFIC(output), 1, &count, &choice, *(void **)tmpmem,
+      free);
+    gpudata_release(tmpmem);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -114,16 +137,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    }
    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
    err = cudnnGetConvolutionForwardAlgorithm(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),

--- a/theano/gpuarray/dnn_gi.c
+++ b/theano/gpuarray/dnn_gi.c
@@ -140,13 +140,31 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
 #endif

  if (!reuse_algo) {
+    size_t free;
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdDataAlgoPerf_t choice;
+    gpudata *tmpmem;
+
+    tmpmem = gpudata_alloc(ctx, mem_sz, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      return -1;
+    }

-    err = cudnnFindConvolutionBackwardDataAlgorithm(
+    err = cudnnFindConvolutionBackwardDataAlgorithmEx(
      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
-      APPLY_SPECIFIC(input), 1, &count, &choice);
+      APPLY_SPECIFIC(input), 1, &count, &choice, *(void **)tmpmem, free);
+    gpudata_release(tmpmem);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
@@ -157,16 +175,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,

    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
    err = cudnnGetConvolutionBackwardDataAlgorithm(
      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(input),

--- a/theano/gpuarray/dnn_gw.c
+++ b/theano/gpuarray/dnn_gw.c
@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
 #endif

  if (!reuse_algo) {
+    size_t free;
+
+    int err2 = gpucontext_property(ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    // Guess 4 MiB if the info is not available
+    if (free == 0) free = 4 * 1024 * 1024;
+
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;
+    gpudata *tmpmem;

-    err = cudnnFindConvolutionBackwardFilterAlgorithm(
+    tmpmem = gpudata_alloc(ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      return -1;
+    }
+
+    err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
-      APPLY_SPECIFIC(kerns), 1, &count, &choice);
+      APPLY_SPECIFIC(kerns), 1, &count, &choice, *(void **)tmpmem, free);
+    gpudata_release(tmpmem);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -158,16 +179,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,

    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),