Use the async gpu kernel call by default.

Our transfer calls are the synchronous versions, so there is no problem there. The problem we need to work around is that Theano's GC could free an output variable before the GPU has finished with it: cudaFree is instantaneous; it does not get queued in the stream of commands to execute.

Use the async gpu kernel call by default.
9cd61627 · Frederic · 00183e72 · 9cd61627 · 9cd61627
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -88,6 +88,11 @@ int device_free(void *ptr)
    if(!g_gpu_context_active) {
        return 0;
    }
+
+    // Sync first: Theano's GC could free intermediate variables that are
+    // still needed by GPU kernels that are running or still in the queue.
+    cudaThreadSynchronize();
+
    cudaError_t err =  cudaFree(ptr);
    if (cudaSuccess != err)
    {

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -27,7 +27,7 @@ typedef float real;
 #define NUM_VECTOR_OP_THREADS_PER_BLOCK     256  //Should be read from device properties. (#10)
 #endif

-#if 0
+#if 1
 // Do not wait after every kernel & transfer.
 #define CNDA_THREAD_SYNC
 #else