Merge pull request #1012 from goodfeli/cuda_alloc

Ready to merge: Cuda allocation code

Merge pull request #1012 from goodfeli/cuda_alloc
561b084b · nouiz · 9eb9db60 · 4557c310 · 561b084b · 561b084b
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -4666,6 +4666,33 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
    return 0;
 }

+// Ensure *arr is a contiguous CudaNdarray with the given nd/dims (created here if *arr is NULL). Returns 0, or -1 on failure.
+int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
+        const int * dims)
+{
+    bool allocated = false;  // becomes true only if this call creates *arr
+    if (*arr == NULL)
+    {
+        // This allocates the metadata but not the data
+        *arr = (CudaNdarray *) CudaNdarray_new_nd(nd);
+        if (*arr == NULL)
+            return -1;  // creation failed; *arr is still NULL
+        allocated = true;
+    }
+
+    if (CudaNdarray_alloc_contiguous(*arr, nd, dims))  // nonzero return signals failure
+    {
+        if (allocated)  // release only an array created above; a caller-supplied array is not DECREF'd
+        {
+            Py_DECREF(*arr);
+            *arr = NULL;  // do not hand the caller a pointer to the released object
+        }
+        return -1;  // NOTE(review): a caller-supplied *arr may already have had nd/strides rewritten -- confirm
+    }
+    return 0;
+}
+
+
 /*
  Local Variables:
  mode:c++

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -149,11 +149,11 @@ DllExport int
 CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);

 /****
- *  Set the idx'th dimension to value d.
+ *  Set the dimension[idx] to value d.
 *
 *  Updates the log2dim shadow array.
 *
- *  Does not sync structure to host.
+ *  Does not sync structure to device.
 */
 DllExport inline void __attribute__((always_inline))
 CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) 
@@ -229,7 +229,8 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
 /**
 * [Re]allocate a CudaNdarray with access to 'nd' dimensions.
 *
- * Note: This does not allocate storage for data.
+ * Note: This does not allocate storage for data, or free
+ *       pre-existing storage.
 */
 DllExport inline int __attribute__((always_inline))
 CudaNdarray_set_nd(CudaNdarray * self, const int nd)
@@ -276,6 +277,7 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
 * CudaNdarray_alloc_contiguous
 *
 * Allocate storage space for a tensor of rank 'nd' and given dimensions.
+ * (Storage is reused, not reallocated, if self already owns a buffer of the right total size)
 *
 * Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
 */
@@ -286,13 +288,13 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
    // return 0 on success
    int size = 1; //set up the strides for contiguous tensor
    assert (nd >= 0);
+
+    // Here we modify the host structure to have the desired shape and
+    // strides. This does not cause the storage to be freed or reallocated.
    if (CudaNdarray_set_nd(self, nd))
    {
        return -1;
    }
-    //TODO: check if by any chance our current dims are correct,
-    //      and strides already contiguous
-    //      in that case we can return right here.
    for (int i = nd-1; i >= 0; --i)
    {
        CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
@@ -300,7 +302,11 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
        size = size * dim[i];
    }

-    if ((self->data_allocated == size) && CudaNdarray_is_c_contiguous(self))
+    // If the allocated buffer is already of the right size, we don't need to
+    // do anything else.
+    // Note: self->data_allocated is 0 for a view, so views will fail this
+    // check and be turned into independent arrays below.
+    if (self->data_allocated == size)
    {
        return 0;
    }
@@ -468,6 +474,15 @@ PyObject * CudaNdarray_View(const CudaNdarray * self);
 PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);


+
+// Ensures that *arr points to a contiguous ndarray with the nd dimensions
+// given in dims. *arr may initially be NULL, a pointer to an ndarray of the
+// wrong size, or a pointer to an ndarray of the right size; in the last
+// case a buffer it already owns is reused rather than reallocated.
+// Returns 0 on success, -1 on failure (a *arr that was NULL stays NULL).
+int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
+        const int * dims);
+
 #endif
 /*
  Local Variables: