提交 cd50d5ef authored 作者: lamblin's avatar lamblin

Merge pull request #1436 from nouiz/gpu_extract_diag

Gpu extract diag
...@@ -16,6 +16,9 @@ ...@@ -16,6 +16,9 @@
present in convolutional neural networks (where filters are 3D and pool present in convolutional neural networks (where filters are 3D and pool
over several input channels). over several input channels).
The project `TheanoConv3d2d <https://github.com/jaberg/TheanoConv3d2d>`_
is probably faster than the Conv3d documented here.
.. module:: conv .. module:: conv
:platform: Unix, Windows :platform: Unix, Windows
:synopsis: ops for signal processing :synopsis: ops for signal processing
......
...@@ -2391,8 +2391,58 @@ CudaNdarray_get_strides(CudaNdarray *self, void *closure) ...@@ -2391,8 +2391,58 @@ CudaNdarray_get_strides(CudaNdarray *self, void *closure)
static int static int
CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure) CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure)
{ {
PyErr_SetString(PyExc_NotImplementedError, ""); //npy_intp newstrides_bytes[PyTuple_Size(value)];
return -1; if (PyTuple_Check(value)){
if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){
PyErr_SetString(PyExc_ValueError,
"The new strides tuple must have the same length"
" as the number of dimensions");
return -1;
}
}else if (PyList_Check(value)){
if (PyList_Size(value) != CudaNdarray_NDIM(self)){
PyErr_SetString(PyExc_ValueError,
"The new strides list must have the same length"
" as the number of dimensions");
return -1;
}
}else{
PyErr_SetString(PyExc_ValueError,
"The new strides need to be encoded in a tuple or list");
return -1;
}
npy_intp newstrides[CudaNdarray_NDIM(self)];
if (PyTuple_Check(value)){
for(int i=0; i < CudaNdarray_NDIM(self); i++){
newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i)));
//newstrides_bytes[i] = newstrides[i] * 4;
}
}else if (PyList_Check(value)){
for(int i=0; i < CudaNdarray_NDIM(self); i++){
newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i)));
//newstrides_bytes[i] = newstrides[i] * 4;
}
}
/*
// Do not do this check, as ExtractDiag needs that, and NumPy does not seem
// to do it.
npy_intp dims[PyTuple_Size(value)];
for(int i=0; i < CudaNdarray_NDIM(self); i++){
dims[i] = CudaNdarray_HOST_DIMS(self)[i];
}
if (!PyArray_CheckStrides(4,
CudaNdarray_NDIM(self),
0, 0,
dims,
newstrides_bytes)){
PyErr_SetString(PyExc_ValueError, "bad new strides");
return -1;
}
*/
for(int i=0; i < CudaNdarray_NDIM(self); i++){
CudaNdarray_set_stride(self, i, newstrides[i]);
}
return 0;
} }
static PyObject * static PyObject *
......
...@@ -289,7 +289,7 @@ def local_gpu_dimshuffle_0(node): ...@@ -289,7 +289,7 @@ def local_gpu_dimshuffle_0(node):
def local_gpu_specifyShape_0(node): def local_gpu_specifyShape_0(node):
""" """
specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape) specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape)
gpu_from_host(specify_shape) -> specifyshape(gpu_from_host) gpu_from_host(specify_shape) -> specify_shape(gpu_from_host)
""" """
if isinstance(node.op, tensor.SpecifyShape): if isinstance(node.op, tensor.SpecifyShape):
input = node.inputs[0] input = node.inputs[0]
...@@ -1438,6 +1438,32 @@ def tensor_to_cuda(x): ...@@ -1438,6 +1438,32 @@ def tensor_to_cuda(x):
return x return x
@register_opt()
@local_optimizer([])
def local_gpu_extract_diagonal(node):
    """
    Move ExtractDiag across host/GPU transfer ops.

    extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
    gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
    """
    # Imported lazily so module import order does not create a cycle.
    from theano.sandbox import linalg

    # Case 1: the diagonal is taken from a value just copied to the
    # host; extract on the GPU instead and copy only the diagonal back.
    if (isinstance(node.op, linalg.ops.ExtractDiag)
            and isinstance(node.inputs[0].type, theano.tensor.TensorType)):
        x = node.inputs[0]
        if x.owner is not None and isinstance(x.owner.op, HostFromGpu):
            gpu_diag = linalg.extract_diag(gpu_from_host(x))
            return [host_from_gpu(gpu_diag)]

    # Case 2: a host-side diagonal is about to be shipped to the GPU;
    # ship the matrix instead and extract the diagonal there.
    if node.op == gpu_from_host:
        host_in = node.inputs[0]
        owner = host_in.owner
        if (owner is not None
                and isinstance(owner.op, linalg.ops.ExtractDiag)
                and isinstance(owner.inputs[0].type,
                               theano.tensor.TensorType)):
            return [linalg.extract_diag(gpu_from_host(owner.inputs[0]))]

    return False
@register_opt('scan') @register_opt('scan')
@local_optimizer([]) @local_optimizer([])
def gpuScanOptimization(node): def gpuScanOptimization(node):
......
...@@ -941,6 +941,33 @@ def test_base(): ...@@ -941,6 +941,33 @@ def test_base():
e = b.reshape((5,2,2,3)) e = b.reshape((5,2,2,3))
assert e.base is a assert e.base is a
def test_set_strides():
    """Assigning .strides from a tuple or list works; bad lengths raise."""
    arr = cuda_ndarray.CudaNdarray.zeros((5, 5))

    # Assigning a tuple must store exactly the strides given.
    swapped = (arr.strides[1], arr.strides[0])
    arr.strides = swapped
    assert arr.strides == swapped

    # Assigning a list must behave the same as a tuple.
    swapped = (arr.strides[1], arr.strides[0])
    arr.strides = [arr.strides[1], arr.strides[0]]
    assert arr.strides == swapped

    # A strides sequence whose length does not match the number of
    # dimensions (too short or too long) must raise ValueError.
    for bad in [(arr.strides[1],), (1, 1, 1)]:
        try:
            arr.strides = bad
        except ValueError:
            pass
        else:
            assert False
def test_is_c_contiguous(): def test_is_c_contiguous():
a = cuda_ndarray.CudaNdarray.zeros((3,4,5)) a = cuda_ndarray.CudaNdarray.zeros((3,4,5))
assert a.is_c_contiguous() assert a.is_c_contiguous()
......
...@@ -4,9 +4,10 @@ import numpy ...@@ -4,9 +4,10 @@ import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import config, tensor from theano import config, tensor
import theano import theano.sandbox.linalg.tests
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -381,6 +382,17 @@ def test_erfinvgpu(): ...@@ -381,6 +382,17 @@ def test_erfinvgpu():
assert numpy.allclose(f(xv),f2(xv)) assert numpy.allclose(f(xv),f2(xv))
class test_diag(theano.sandbox.linalg.tests.test_linalg.test_diag):
    # Re-run the generic diag tests from test_linalg with GPU settings:
    # GPU compile mode, CUDA shared constructor, float32, CudaNdarrayType.
    mode = mode_with_gpu
    shared = staticmethod(cuda.shared_constructor)
    floatX = 'float32'
    type = CudaNdarrayType

    def __init__(self, name):
        # NOTE(review): super() is given the *parent* class on purpose, so
        # MRO lookup starts past it — the parent's __init__ (which would
        # overwrite mode/shared/floatX/type above with CPU defaults) is
        # skipped and TestCase.__init__ runs directly. Confirm this stays
        # in sync with test_linalg.test_diag.__init__'s signature.
        super(theano.sandbox.linalg.tests.test_linalg.test_diag,
              self).__init__(name)
if __name__ == '__main__': if __name__ == '__main__':
test_gpualloc() test_gpualloc()
test_opt_gpujoin_onlyajoin() test_opt_gpujoin_onlyajoin()
......
...@@ -684,7 +684,10 @@ solve = Solve() # general solve ...@@ -684,7 +684,10 @@ solve = Solve() # general solve
class ExtractDiag(Op): class ExtractDiag(Op):
""" Return the diagonal of a matrix. """ """ Return the diagonal of a matrix.
:note: works on the GPU.
"""
def __init__(self, view=False): def __init__(self, view=False):
self.view = view self.view = view
if self.view: if self.view:
...@@ -697,10 +700,15 @@ class ExtractDiag(Op): ...@@ -697,10 +700,15 @@ class ExtractDiag(Op):
return hash(type(self)) ^ hash(self.view) return hash(type(self)) ^ hash(self.view)
def make_node(self, _x): def make_node(self, _x):
x = as_tensor_variable(_x) if not isinstance(_x, theano.Variable):
x = as_tensor_variable(_x)
else:
x = _x
if x.type.ndim != 2: if x.type.ndim != 2:
raise TypeError('ExtractDiag only works on matrices', _x) raise TypeError('ExtractDiag only works on matrices', _x)
return Apply(self, [x], [tensor.vector(dtype=x.type.dtype)]) return Apply(self, [x], [x.type.__class__(broadcastable=(False,),
dtype=x.type.dtype)()])
def perform(self, node, ins, outs): def perform(self, node, ins, outs):
""" For some reason numpy.diag(x) is really slow, so we """ For some reason numpy.diag(x) is really slow, so we
......
import unittest
import numpy import numpy
import numpy.linalg import numpy.linalg
from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_almost_equal
...@@ -266,46 +268,7 @@ def test_det_shape(): ...@@ -266,46 +268,7 @@ def test_det_shape():
assert numpy.all(f(r).shape == f_shape(r)) assert numpy.all(f(r).shape == f_shape(r))
def test_alloc_diag(): class test_diag(unittest.TestCase):
rng = numpy.random.RandomState(utt.fetch_seed())
x = theano.tensor.vector()
g = alloc_diag(x)
f = theano.function([x], g)
# test "normal" scenario (5x5 matrix) and special cases of 0x0 and 1x1
for shp in [5, 0, 1]:
m = rng.rand(shp).astype(config.floatX)
v = numpy.diag(m)
r = f(m)
# The right diagonal is extracted
assert (r == v).all()
# Test we accept only vectors
xx = theano.tensor.matrix()
ok = False
try:
alloc_diag(xx)
except TypeError:
ok = True
assert ok
# Test infer_shape
f = theano.function([x], g.shape)
topo = f.maker.fgraph.toposort()
if config.mode != 'FAST_COMPILE':
assert sum([node.op.__class__ == AllocDiag for node in topo]) == 0
for shp in [5, 0, 1]:
m = rng.rand(shp).astype(config.floatX)
assert (f(m) == m.shape).all()
def test_alloc_diag_grad():
rng = numpy.random.RandomState(utt.fetch_seed())
x = rng.rand(5)
tensor.verify_grad(alloc_diag, [x], rng=rng)
def test_diag():
""" """
Test that linalg.diag has the same behavior as numpy.diag. Test that linalg.diag has the same behavior as numpy.diag.
numpy.diag has two behaviors: numpy.diag has two behaviors:
...@@ -315,72 +278,130 @@ def test_diag(): ...@@ -315,72 +278,130 @@ def test_diag():
matrix. matrix.
(1) and (2) are tested by test_alloc_diag and test_extract_diag (1) and (2) are tested by test_alloc_diag and test_extract_diag
respectively. This test makes sure that linalg.diag instantiates respectively.
test_diag test makes sure that linalg.diag instantiates
the right op based on the dimension of the input. the right op based on the dimension of the input.
""" """
def __init__(self, name, mode=None, shared=tensor.shared,
floatX=None, type=tensor.TensorType):
self.mode = mode
self.shared = shared
if floatX is None:
floatX = config.floatX
self.floatX = floatX
self.type = type
super(test_diag, self).__init__(name)
def test_alloc_diag(self):
rng = numpy.random.RandomState(utt.fetch_seed())
x = theano.tensor.vector()
g = alloc_diag(x)
f = theano.function([x], g)
# test "normal" scenario (5x5 matrix) and special cases of 0x0 and 1x1
for shp in [5, 0, 1]:
m = rng.rand(shp).astype(self.floatX)
v = numpy.diag(m)
r = f(m)
# The right matrix is created
assert (r == v).all()
# Test we accept only vectors
xx = theano.tensor.matrix()
ok = False
try:
alloc_diag(xx)
except TypeError:
ok = True
assert ok
# Test infer_shape
f = theano.function([x], g.shape)
topo = f.maker.fgraph.toposort()
if config.mode != 'FAST_COMPILE':
assert sum([node.op.__class__ == AllocDiag for node in topo]) == 0
for shp in [5, 0, 1]:
m = rng.rand(shp).astype(self.floatX)
assert (f(m) == m.shape).all()
# test that it builds a matrix with given diagonal when using vector inputs def test_alloc_diag_grad(self):
x = theano.tensor.vector() rng = numpy.random.RandomState(utt.fetch_seed())
y = diag(x) x = rng.rand(5)
assert y.owner.op.__class__ == AllocDiag tensor.verify_grad(alloc_diag, [x], rng=rng)
# test that it extracts the diagonal when using matrix input def test_diag(self):
x = theano.tensor.matrix() # test that it builds a matrix with given diagonal when using
y = extract_diag(x) # vector inputs
assert y.owner.op.__class__ == ExtractDiag x = theano.tensor.vector()
y = diag(x)
# other types should raise error assert y.owner.op.__class__ == AllocDiag
x = theano.tensor.tensor3()
ok = False # test that it extracts the diagonal when using matrix input
try: x = theano.tensor.matrix()
y = extract_diag(x) y = extract_diag(x)
except TypeError: assert y.owner.op.__class__ == ExtractDiag
ok = True
assert ok # other types should raise error
x = theano.tensor.tensor3()
ok = False
# not testing the view=True case since it is not used anywhere. try:
def test_extract_diag(): y = extract_diag(x)
rng = numpy.random.RandomState(utt.fetch_seed()) except TypeError:
x = theano.tensor.matrix() ok = True
g = extract_diag(x) assert ok
f = theano.function([x], g)
# not testing the view=True case since it is not used anywhere.
for shp in [(2, 3), (3, 2), (3, 3), (1, 1), (0, 0)]: def test_extract_diag(self):
m = rng.rand(*shp).astype(config.floatX) rng = numpy.random.RandomState(utt.fetch_seed())
v = numpy.diag(m) m = rng.rand(2, 3).astype(self.floatX)
r = f(m) x = self.shared(m)
# The right diagonal is extracted g = extract_diag(x)
assert (r == v).all() f = theano.function([], g)
assert [isinstance(node.inputs[0].type, self.type)
# Test we accept only matrix for node in f.maker.fgraph.toposort()
xx = theano.tensor.vector() if isinstance(node.op, ExtractDiag)] == [True]
ok = False
try: for shp in [(2, 3), (3, 2), (3, 3), (1, 1), (0, 0)]:
extract_diag(xx) m = rng.rand(*shp).astype(self.floatX)
except TypeError: x.set_value(m)
ok = True v = numpy.diag(m)
assert ok r = f()
# The right diagonal is extracted
# Test infer_shape assert (r == v).all()
f = theano.function([x], g.shape)
topo = f.maker.fgraph.toposort() # Test we accept only matrix
if config.mode != 'FAST_COMPILE': xx = theano.tensor.vector()
assert sum([node.op.__class__ == ExtractDiag for node in topo]) == 0 ok = False
for shp in [(2, 3), (3, 2), (3, 3)]: try:
m = rng.rand(*shp).astype(config.floatX) extract_diag(xx)
assert f(m) == min(shp) except TypeError:
ok = True
assert ok
def test_extract_diag_grad():
rng = numpy.random.RandomState(utt.fetch_seed()) # Test infer_shape
x = rng.rand(5, 4) f = theano.function([], g.shape)
tensor.verify_grad(extract_diag, [x], rng=rng) topo = f.maker.fgraph.toposort()
if config.mode != 'FAST_COMPILE':
assert sum([node.op.__class__ == ExtractDiag
for node in topo]) == 0
for shp in [(2, 3), (3, 2), (3, 3)]:
m = rng.rand(*shp).astype(self.floatX)
x.set_value(m)
assert f() == min(shp)
def test_extract_diag_grad(self):
rng = numpy.random.RandomState(utt.fetch_seed())
x = rng.rand(5, 4).astype(self.floatX)
tensor.verify_grad(extract_diag, [x], rng=rng)
def test_extract_diag_empty(self):
c = self.shared(numpy.array([[], []], self.floatX))
f = theano.function([], extract_diag(c), mode=self.mode)
def test_extract_diag_empty(): assert [isinstance(node.inputs[0].type, self.type)
c = theano.tensor.constant(numpy.array([[], []], 'int32')) for node in f.maker.fgraph.toposort()
extract_diag(c).eval() if isinstance(node.op, ExtractDiag)] == [True]
def test_trace(): def test_trace():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论