Commit 29a43a50 authored by Frederic Bastien

merge

import atexit, gc, os, stat import atexit, os, stat
from theano.compile import optdb from theano.compile import optdb
from theano import config from theano import config
...@@ -96,9 +96,6 @@ if cuda_available: ...@@ -96,9 +96,6 @@ if cuda_available:
cuda_initialization_error_message = "" cuda_initialization_error_message = ""
# actively closing our gpu session presents segfault-on-exit on some systems # actively closing our gpu session presents segfault-on-exit on some systems
atexit.register(gpu_shutdown) atexit.register(gpu_shutdown)
# do garbage collection before releasing the gpu to avoid releasing invalid pointers later
# note that atexit-registered calls are called in LIFO order
atexit.register(gc.collect)
except EnvironmentError, e: except EnvironmentError, e:
cuda_available = False cuda_available = False
cuda_initialization_error_message = e.message cuda_initialization_error_message = e.message
......
...@@ -12,11 +12,43 @@ ...@@ -12,11 +12,43 @@
//If true, we fill with NAN allocated device memory. //If true, we fill with NAN allocated device memory.
#define ALLOC_MEMSET 0 #define ALLOC_MEMSET 0
#define DEBUG_GPU_CONTEXT_REFCOUNT 0
// g_gpu_context_refcount starts at one b/c the gpu context will be implicitly created
// on the first successful cuda call. the matching decref is in CudaNdarray_gpu_shutdown.
static int g_gpu_context_refcount = 1;
///////////////////////////
// cuda context management
///////////////////////////
// Take one reference on the CUDA context so it stays alive while the
// holder (e.g. an outstanding device allocation) still needs it.
// Matched by a later gpu_context_decref().
void gpu_context_incref() {
    ++g_gpu_context_refcount;
#if DEBUG_GPU_CONTEXT_REFCOUNT
    fprintf(stderr, "gpu_context_incref, to %d\n", g_gpu_context_refcount);
#endif
}
// Drop one reference on the CUDA context. When the count reaches zero the
// context is torn down explicitly via cudaThreadExit(): on some systems,
// letting the process exit without closing the context segfaults; see
// http://groups.google.com/group/theano-users/browse_thread/thread/c351846e5cebe35f
void gpu_context_decref() {
    --g_gpu_context_refcount;
#if DEBUG_GPU_CONTEXT_REFCOUNT
    fprintf(stderr, "gpu_context_decref, to %d\n", g_gpu_context_refcount);
#endif
    // Guard clause: anyone still holding a reference keeps the context open.
    if (g_gpu_context_refcount != 0)
        return;
    cudaThreadExit();
#if DEBUG_GPU_CONTEXT_REFCOUNT
    fprintf(stderr, "gpu_context_decref at 0, calling cudaThreadExit\n");
#endif
}
///////////////////////// /////////////////////////
// Alloc and Free // Alloc and Free
///////////////////////// /////////////////////////
/** /**
* *
* In the test program I'm using, the _outstanding_mallocs decreases with every call. * In the test program I'm using, the _outstanding_mallocs decreases with every call.
...@@ -48,6 +80,9 @@ void * device_malloc(size_t size) ...@@ -48,6 +80,9 @@ void * device_malloc(size_t size)
return NULL; return NULL;
} }
_outstanding_mallocs[0] += (rval != NULL); _outstanding_mallocs[0] += (rval != NULL);
if(rval != NULL) {
gpu_context_incref(); // keep the gpu context around until we've free this memory
}
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){ for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){ if(NULL==_alloc_size_table[i].ptr){
...@@ -81,6 +116,9 @@ int device_free(void *ptr) ...@@ -81,6 +116,9 @@ int device_free(void *ptr)
return -1; return -1;
} }
_outstanding_mallocs[0] -= (ptr != NULL); _outstanding_mallocs[0] -= (ptr != NULL);
if(ptr != NULL) {
gpu_context_decref();
}
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
int i=0; int i=0;
for(;i<TABLE_SIZE;i++) for(;i<TABLE_SIZE;i++)
...@@ -1888,7 +1926,7 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -1888,7 +1926,7 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
PyObject * PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
cudaThreadExit(); gpu_context_decref();
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment