提交 d606816a authored 作者: Frederic Bastien's avatar Frederic Bastien

Small modifications to allow making ops that use pycuda-generated functions.

上级 51c78fb1
...@@ -109,7 +109,7 @@ if cuda_available: ...@@ -109,7 +109,7 @@ if cuda_available:
import basic_ops import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuAlloc, GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuAlloc,
GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4 GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4
, scalar, vector, matrix, row, col, tensor3, tensor4) , scalar, vector, matrix, row, col, tensor3, tensor4)
......
...@@ -1894,6 +1894,43 @@ class GpuAlloc(Op): ...@@ -1894,6 +1894,43 @@ class GpuAlloc(Op):
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class GpuContiguous(Op):
    """
    Op that returns a C-contiguous version of its CudaNdarray input.

    If the input is already C-contiguous, the output aliases it (with an
    extra reference); otherwise the data is copied into a C-contiguous
    output buffer, reusing the previous output storage when its
    dimensions still match.
    """

    def make_node(self, input):
        """Coerce `input` to a CudaNdarray variable and build the Apply node.

        The output has the same type (dtype/broadcastable pattern) as the
        input; only its memory layout is guaranteed to change.
        """
        input = as_cuda_ndarray_variable(input)
        return Apply(self, [input], [input.type()])

    def __str__(self):
        return self.__class__.__name__

    def c_code(self, node, name, inputs, outputs, sub):
        """Return the C implementation for this Apply node.

        Fast path: when the input is already C-contiguous, the output is
        made to alias it. Otherwise the output is (re)allocated via
        CudaNdarray_Copy when missing or mis-shaped, or filled in place
        with CudaNdarray_CopyFromCudaNdarray when reusable.
        """
        # NOTE: `input`, `z`, `fail` and the loop variable `i` must keep
        # these exact names -- the C template below reads them via locals().
        input, = inputs
        z, = outputs
        fail = sub['fail']
        code = """
        {
        if (CudaNdarray_is_c_contiguous(%(input)s)){
            Py_XDECREF(%(z)s);
            %(z)s = %(input)s;
            Py_INCREF(%(z)s);
        } else if ((NULL == %(z)s)""" % locals()
        # Emit one dimension-mismatch test per dimension of the input: a
        # previously-allocated output can only be reused if every
        # dimension still matches.
        for i in range(len(node.inputs[0].type.broadcastable)):
            code += "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])" % locals()
        code += """)
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_Copy(%(input)s);
            if (!%(z)s)
            {
                %(fail)s;
            }
        }else if(CudaNdarray_CopyFromCudaNdarray(%(z)s,%(input)s)){
            %(fail)s;
        }
        }
        """ % locals()
        return code

# Singleton instance: optimizations insert this op to force contiguity.
gpu_contiguous = GpuContiguous()
# Those are predefined CudaNdarrayType as done in tensor.basic # Those are predefined CudaNdarrayType as done in tensor.basic
# Useful mostly for tests, as the gpu ops are inserted automatically... # Useful mostly for tests, as the gpu ops are inserted automatically...
......
...@@ -1584,6 +1584,12 @@ static PyGetSetDef CudaNdarray_getset[] = { ...@@ -1584,6 +1584,12 @@ static PyGetSetDef CudaNdarray_getset[] = {
(setter)CudaNdarray_set_strides, (setter)CudaNdarray_set_strides,
"data pointer strides (in elements)", "data pointer strides (in elements)",
NULL}, NULL},
//gpudata is needed to allow calling pycuda functions with CudaNdarray inputs.
{"gpudata",
(getter)CudaNdarray_get_dev_data,
NULL,//setter)CudaNdarray_set_dev_data,
"device data pointer",
NULL},
{"_dev_data", {"_dev_data",
(getter)CudaNdarray_get_dev_data, (getter)CudaNdarray_get_dev_data,
(setter)CudaNdarray_set_dev_data, (setter)CudaNdarray_set_dev_data,
...@@ -1599,6 +1605,12 @@ static PyGetSetDef CudaNdarray_getset[] = { ...@@ -1599,6 +1605,12 @@ static PyGetSetDef CudaNdarray_getset[] = {
NULL, NULL,
"Return the number of element in this objects.", "Return the number of element in this objects.",
NULL}, NULL},
//mem_size is needed for pycuda.elementwise.ElementwiseKernel. Why do they use size and mem_size with the same value?
{"mem_size",
(getter)CudaNdarray_SIZE_Object,
NULL,
"Return the number of element in this objects.",
NULL},
{NULL, NULL, NULL, NULL} /* Sentinel */ {NULL, NULL, NULL, NULL} /* Sentinel */
}; };
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论