Commit 98c61e15, authored by abergeron

Merge pull request #1590 from nouiz/gpua_eye

[MRG]Gpuarray eye
......@@ -347,8 +347,6 @@ class NVCC_compiler(object):
cmd.append(os.path.split(cppfilename)[-1])
cmd.extend(['-L%s' % ldir for ldir in lib_dirs])
cmd.extend(['-l%s' % l for l in libs])
if module_name != 'cuda_ndarray':
cmd.append("-lcuda_ndarray")
if sys.platform == 'darwin':
cmd.extend(darwin_python_lib.split())
......
......@@ -400,7 +400,7 @@ class CudaNdarrayType(Type):
def c_libraries(self):
# returning cublas because the cuda_ndarray.cuh header
# includes calls to SetVector and cublasGetError
return ['cudart', 'cublas']
return ['cudart', 'cublas', 'cuda_ndarray']
def c_support_code(cls):
return ""
......
......@@ -11,6 +11,7 @@ from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any
from theano.gof.utils import MethodNotDefined
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try:
import pygpu
from pygpu import gpuarray, elemwise
......@@ -559,3 +560,115 @@ class GpuReshape(HideC, tensor.Reshape):
else:
raise ValueError("total size of new array must be unchanged")
out[0] = x.reshape(tuple(shp))
class GpuEye(Op):
    """Eye (identity-like) matrix for the gpuarray back-end.

    Produces a 2-D GpuArray of the given dtype with ones on the main
    diagonal and zeros elsewhere.  Only ``k == 0`` (the main diagonal)
    is supported on the GPU for now; this is enforced in ``make_node``.
    """

    def __init__(self, dtype=None):
        # Default to the globally configured float dtype, mirroring
        # tensor.basic.Eye.
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype

    def make_node(self, n, m, k):
        """Build the Apply node for eye(n, m, k).

        ``k`` must be a constant 0; it is validated here and then
        dropped, so the resulting node has only two inputs (n, m).
        """
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False))

        # k != 0 isn't implemented on the GPU yet.
        assert tensor.get_scalar_constant_value(k) == 0
        return Apply(self, [n, m], [otype()])

    def infer_shape(self, node, in_shapes):
        # The output shape is exactly (n, m), the two symbolic inputs.
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The output is not differentiable w.r.t. the integer shape
        # inputs.  NOTE: the node has only 2 inputs (n, m) -- k was
        # validated and dropped in make_node -- so iterate over the
        # actual inputs instead of a hard-coded range(3), which would
        # raise IndexError on inp[2].
        return [grad_undefined(self, i, inp[i])
                for i in xrange(len(inp))]

    def __eq__(self, other):
        # Two GpuEye ops are interchangeable iff they produce the same
        # dtype.
        return type(self) == type(other) and self.dtype == other.dtype

    def __hash__(self):
        # Must stay consistent with __eq__.
        return hash(self.dtype) ^ hash(type(self))

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']

    def c_support_code(self):
        dtype = self.dtype
        return """
        CUdeviceptr (*cuda_get_ptr)(gpudata *g);

        /* Write 1 on the main diagonal of an n x m C-contiguous array
           that has already been zeroed out.
           TODO OPT: Only 1 block is used. */
        __global__ void kEye_%(dtype)s(npy_%(dtype)s* a, int n, int m) {
            int nb_elem = min(n, m);
            for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) {
                a[i*m + i] = 1;
            }
        }""" % locals()

    def c_init_code(self):
        # Fetch the extension that exposes the raw CUDA device pointer
        # of a gpudata buffer.
        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']

    def c_code(self, node, name, inp, out, sub):
        # TODO assert that the back-end is cuda!
        n, m = inp
        z, = out
        fail = sub['fail']
        dtype = self.dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        sync = bool(config.gpuarray.sync)
        s = """
        npy_%(dtype)s* ptr;
        size_t dims[] = {0, 0};
        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        /* Use the element size of the actual dtype, not sizeof(float):
           with 8-byte dtypes a float-sized memset would only clear half
           of the buffer. */
        size_t total_size = dims[0] * dims[1] * sizeof(npy_%(dtype)s);
        cudaError_t sts;

        Py_CLEAR(%(z)s);
        %(z)s = pygpu_empty(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            pygpu_default_context(), Py_None);
        if (!%(z)s) {
            %(fail)s
        }
        ptr = (npy_%(dtype)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
                               %(z)s->ga.offset);

        /* Zero the whole array, then let the kernel set the diagonal. */
        sts = cudaMemset(ptr, 0, total_size);
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_MemoryError,
                         "GpuEye: Error in memset %%lu bytes of device memory.",
                         (unsigned long)total_size);
            %(fail)s;
        }

        kEye_%(dtype)s<<<1, 256>>>(ptr, dims[0], dims[1]);

        if(%(sync)d)
            GpuArray_sync(&%(z)s->ga);
        sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
               PyErr_Format(PyExc_RuntimeError,
                            "Cuda error: kEye: %%s. n=%%ld, m=%%ld.",
                            cudaGetErrorString(sts),
                            (long int)dims[0], (long int)dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bumped: memset size now depends on the dtype's element size.
        return (2,)

    def c_compiler(self):
        # The generated code contains a CUDA kernel, so it must go
        # through nvcc.
        return NVCC_compiler
......@@ -10,8 +10,10 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, GpuReshape)
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host,
gpu_alloc, GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
......@@ -232,3 +234,9 @@ def local_gpua_gemv2(node):
@op_lifter(tensor.blas.Gemm)
def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter(tensor.basic.Eye)
def local_gpua_eye(node):
    # Lift tensor.basic.Eye to the gpuarray back-end, preserving dtype.
    # NOTE(review): GpuEye.make_node asserts k == 0 -- presumably
    # op_lifter/register_opt abort the replacement when that fails;
    # TODO confirm non-zero-k graphs are left on the CPU.
    return GpuEye(dtype=node.op.dtype)
......@@ -35,7 +35,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType,
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape)
GpuFromHost, GpuReshape,
GpuEye)
from theano.tests import unittest_tools as utt
utt.seed_rng()
......@@ -306,3 +307,32 @@ class G_reshape(T_reshape):
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
assert self.op == GpuReshape
def test_gpueye():
    """Compare GpuEye's output against numpy.eye and verify that the
    GPU op was actually used in the compiled graph."""
    def check(dtype, N, M_=None):
        # Theano does not accept None as a tensor input, so a concrete
        # value must be substituted (DebugMode also rejects None even
        # where it would otherwise be allowed).
        M = N if M_ is None else M_

        n_sym = T.iscalar()
        m_sym = T.iscalar()
        # k stays fixed at 0: only the main diagonal is supported on
        # the GPU.
        k_val = numpy.asarray(0)
        graph_out = T.eye(n_sym, m_sym, k_val, dtype=dtype)
        fn = theano.function([n_sym, m_sym],
                             graph_out,
                             mode=mode_with_gpu)
        res = numpy.asarray(fn(N, M))

        assert numpy.allclose(res, numpy.eye(N, M_, dtype=dtype))
        assert res.dtype == numpy.dtype(dtype)
        assert any(isinstance(node.op, GpuEye)
                   for node in fn.maker.fgraph.toposort())

    for dtype in ['float32', 'int32']:
        # square case
        yield check, dtype, 3
        # rectangular cases, still k == 0
        yield check, dtype, 3, 5
        yield check, dtype, 5, 3
......@@ -48,7 +48,7 @@ class GpuArrayType(Type):
else:
up_dtype = scalar.upcast(self.dtype, data.dtype)
if up_dtype == self.dtype:
data = gpuarray.array(data, dtype=self.typecode, copy=False)
data = gpuarray.array(data, dtype=self.dtype, copy=False)
else:
raise TypeError("%s cannot store a value of dtype %s "
"without risking loss of precision." %
......
......@@ -172,7 +172,7 @@ class test_Broadcast(unittest.TestCase):
yv = rand_val(ysh)
zv = xv + yv
self.assertTrue((f(xv, yv) == zv).all())
unittest_tools.assert_allclose(f(xv, yv), zv)
#test Elemwise.infer_shape
#the Shape op don't implement c_code!
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论