Postpone the creation of the device structure until it is needed.

This is a significant speed-up with the gc: most of the time we don't need the device structure, and allocating on the GPU is slow.

Postpone the creation of the device structure until it is needed.
f441295b · Frederic · c50a2db1 · f441295b · f441295b
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -4385,7 +4385,30 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
 int
 cnda_copy_structure_to_device(const CudaNdarray * self)
 {
-    cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
+    //If the device structure does not exist, create it.
+    //We allocate it here as we do not need it often.
+    //In fact, we need it so infrequently that we expect
+    //that most objects won't need it. Not allocating it
+    //saves significant time when creating objects.
+    //This speeds up a benchmark by 8% with the gc.
+    if (!self->dev_structure)
+    {
+        int struct_size = cnda_structure_size(self->nd);
+        if (struct_size)
+        {
+            self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
+            if (NULL == self->dev_structure)
+            {
+                return -1;
+            }
+        }
+    }
+    cublasSetVector(cnda_structure_size(self->nd),
+                    sizeof(int),
+                    self->host_structure,
+                    1,
+                    self->dev_structure,
+                    1);
    CNDA_THREAD_SYNC;
    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -82,8 +82,9 @@ struct CudaNdarray
    //device pointers (allocated by cudaMalloc)
    mutable int dev_structure_fresh;
-    //dev_structure should be accessed via macros, otherwise may not be synchronized
+    //dev_structure should be accessed via macros, otherwise may not be
-    int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
+    //synchronized. The macro will allocate it when needed.
+    mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
 };
@@ -251,19 +252,8 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
        {
            self->host_structure[i] = 0;
        }
+        //The device structure will be created in cnda_copy_structure_to_device
-        int struct_size = cnda_structure_size(nd);
+        //if needed.
-        if (struct_size)
-        {
-            self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
-            if (NULL == self->dev_structure)
-            {
-                free(self->host_structure);
-                self->host_structure = NULL;
-                self->dev_structure = NULL;
-                return -1;
-            }
-        }
        self->nd = nd;
        self->dev_structure_fresh = 0;
    }