First version of GpuEye for cuda only.

0cd038b6 · Frederic · 85aede29 · 0cd038b6 · 0cd038b6 · 0cd038b6
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -11,6 +11,7 @@ from theano.tensor.basic import Alloc
 from theano.gof.python25 import all, any
 from theano.gof.utils import MethodNotDefined
+from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
 try:
    import pygpu
    from pygpu import gpuarray, elemwise
@@ -561,11 +562,10 @@ class GpuReshape(HideC, tensor.Reshape):
        out[0] = x.reshape(tuple(shp))
-class GpuEye(GpuOp):
+class GpuEye(Op):
    def __init__(self, dtype=None):
        if dtype is None:
            dtype = config.floatX
-        assert dtype == 'float32'
        self.dtype = dtype
    def make_node(self, n, m, k):
@@ -575,10 +575,12 @@ class GpuEye(GpuOp):
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
+        otype = GpuArrayType(dtype=self.dtype,
+                             broadcastable=(False, False))
        # k != 0 isn't implemented on the GPU yet.
        assert tensor.get_scalar_constant_value(k) == 0
-        return Apply(self, [n, m], [matrix(dtype=self.dtype)])
+        return Apply(self, [n, m], [otype()])
    def infer_shape(self, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
@@ -593,55 +595,71 @@ class GpuEye(GpuOp):
    def __hash__(self):
        return hash(self.dtype) ^ hash(type(self))
+    def c_headers(self):
+        return ['cuda.h', '<compyte/extension.h>']
    def c_support_code(self):
+        dtype = self.dtype
        return """
-//Only 1 block is used.
+CUdeviceptr (*cuda_get_ptr)(gpudata *g);
-__global__ void kEye(float* a, int n, int m) {
+//TODO OPT: Only 1 block is used.
+__global__ void kEye_%(dtype)s(npy_%(dtype)s* a, int n, int m) {
    int nb_elem = min(n, m);
    for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) {
        a[i*m + i] = 1;
    }
-}"""
+}""" % locals()
+    def c_init_code(self):
+        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
    def c_code(self, node, name, inp, out, sub):
+        #TODO assert that the back-end is cuda!
        n, m = inp
        z, = out
        fail = sub['fail']
+        dtype = self.dtype
+        typecode = numpy.dtype(self.dtype).num
+        sync = bool(config.gpuarray.sync)
        s = """
-        int dims[] = {0, 0};
+        npy_%(dtype)s* ptr;
+        size_t dims[] = {0, 0};
        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        int total_size = dims[0] * dims[1] * sizeof(float);
        cudaError_t sts;
-        void * orig_z = %(z)s;
+        Py_XDECREF(%(z)s);
+        %(z)s = pygpu_empty(2, dims,
-        if (CudaNdarray_prep_output(&%(z)s, 2, dims))
+                            %(typecode)d, GA_C_ORDER,
-        {
+                            pygpu_default_context(), Py_None);
-            %(fail)s;
+        if (!%(z)s) {
+            %(fail)s
        }
+        ptr = (npy_%(dtype)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
-        sts = cudaMemset(CudaNdarray_DEV_DATA(%(z)s), 0, total_size);
+                               %(z)s->ga.offset);
+        sts = cudaMemset(ptr, 0, total_size);
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_MemoryError,
                         "GpuEye: Error in memset %%d bytes of device memory.",
                         total_size);
-            if(orig_z == NULL)
-                Py_XDECREF(%(z)s);
            %(fail)s;
        }
-        kEye<<<1, 256>>>(CudaNdarray_DEV_DATA(%(z)s), dims[0], dims[1]);
+        kEye_%(dtype)s<<<1, 256>>>(ptr, dims[0], dims[1]);
-        CNDA_THREAD_SYNC;
+        if(%(sync)d)
+            GpuArray_sync(&%(z)s->ga);
        sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
-               PyErr_Format(PyExc_RuntimeError,
+            PyErr_Format(PyExc_RuntimeError,
-                    "Cuda error: kEye: %%s. n=%%d, m=%%d.",
+                    "Cuda error: kEye: %%s. n=%%ld, m=%%ld.",
                    cudaGetErrorString(sts),
-                    dims[0], dims[1]);
+                    (long int)dims[0], (long int)dims[1]);
            %(fail)s;
         }
        """ % locals()
@@ -649,5 +667,7 @@ __global__ void kEye(float* a, int n, int m) {
        return s
    def c_code_cache_version(self):
-        return (3,)
+        return (1,)
-gpu_eye = GpuEye(dtype='float32')
+    def c_compiler(self):
+        return NVCC_compiler
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -10,8 +10,10 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
 from theano.gof.python25 import all, any
 from theano.sandbox.gpuarray.type import GpuArrayType
-from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
+from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
-                                               gpu_alloc, GpuReshape)
+                                               gpu_from_host,
+                                               gpu_alloc, GpuReshape,
+                                               GpuEye)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduce)
 from theano.sandbox.gpuarray.subtensor import GpuSubtensor
@@ -232,3 +234,9 @@ def local_gpua_gemv2(node):
 @op_lifter(tensor.blas.Gemm)
 def local_gpua_gemm(node):
    return GpuGemm(inplace=node.op.inplace)
+@register_opt()
+@op_lifter(tensor.basic.Eye)
+def local_gpua_eye(node):
+    return GpuEye(dtype=node.op.dtype)
--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -35,7 +35,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType,
 from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
                                               gpu_alloc, gpu_from_cuda,
                                               cuda_from_gpu, HostFromGpu,
-                                               GpuFromHost, GpuReshape)
+                                               GpuFromHost, GpuReshape,
+                                               GpuEye)
 from theano.tests import unittest_tools as utt
 utt.seed_rng()
@@ -322,15 +323,15 @@ def test_gpueye():
        k_symb = numpy.asarray(0)
        out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
        f = theano.function([N_symb, M_symb],
-                            B.as_cuda_ndarray_variable(out),
+                            out,
                            mode=mode_with_gpu)
        result = numpy.asarray(f(N, M))
        assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
        assert result.dtype == numpy.dtype(dtype)
-        assert any([isinstance(node.op, B.GpuEye)
+        assert any([isinstance(node.op, GpuEye)
                    for node in f.maker.fgraph.toposort()])
-    for dtype in ['float32']:
+    for dtype in ['float32', 'int32']:
        yield check, dtype, 3
        # M != N, k = 0
        yield check, dtype, 3, 5