Merge pull request #582 from nouiz/gpu_stuff

Gpu stuff

Merge pull request #582 from nouiz/gpu_stuff
3a4e6c78 · lamblin · 9aa99867 · 6fa74303 · 3a4e6c78 · 3a4e6c78
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -53,6 +53,10 @@ New Features
   (Frederic B., Simon McGregor)
 * MRG random now raises an error with a clear message when the passed shape
   contains dimensions with bad value like 0. (Frédéric B. reported by Ian G.)
+ * "CudaNdarray[*] = ndarray" works in more cases (Frederic B.)
+ * "CudaNdarray[*] += ndarray" works in more cases (Frederic B.)
+ * We add dimensions to CudaNdarray to automatically broadcast more frequently.
+   (Frederic B.)

 Sparse
 * Implement theano.sparse.mul(sparse1, sparse2) when both inputs don't

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1026,13 +1026,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
        for r in node.outputs:
            if isinstance(r.type, (TensorType, CudaNdarrayType)):
                # Build a C-contiguous buffer
-                new_buf = numpy.zeros(
-                        shape=r_vals[r].shape,
-                        dtype=r_vals[r].dtype,
-                        order='C')
-                new_buf += def_val
-                if isinstance(r.type, CudaNdarrayType):
-                    new_buf = CudaNdarray(new_buf)
+                new_buf = r.type.value_zeros(r_vals[r].shape)
+                # CudaNdarray doesn't have a flags field
+                # assert new_buf.flags["C_CONTIGUOUS"]
+                new_buf += numpy.asarray(def_val).astype(r.type.dtype)
+
                c_cont_outputs[r] = new_buf

        if len(c_cont_outputs):
@@ -1096,21 +1094,12 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
                            shapes.append(slice(None, size, None))

                        r_buf = init_strided[r]
+
                        if r_buf.ndim > 0:
                            r_buf = r_buf[tuple(strides)][tuple(shapes)]
                        assert r_buf.shape == r_vals[r].shape

-                        if isinstance(r.type, CudaNdarrayType):
-                            # It seems stupid, but we need to allocate a
-                            # new ndarray and copy it into the GPU one.
-                            # TODO: When it is possible to simply do
-                            # r_buff[...] = def_val, do so.
-                            new_rbuf = numpy.zeros(r_vals[r].shape,
-                                    dtype=r.dtype)
-                            new_rbuf += def_val
-                            r_buf[...] = CudaNdarray(new_rbuf)
-                        else:
-                            r_buf[...] = def_val
+                        r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)

                        strided[r] = r_buf

@@ -1133,12 +1122,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
                        out_shape = [max((s + sd), 0)
                                for s, sd in zip(r_vals[r].shape,
                                                 r_shape_diff)]
-                        new_buf = numpy.zeros(
-                                shape=out_shape,
-                                dtype=r.dtype)
-                        new_buf += def_val
-                        if isinstance(r.type, CudaNdarrayType):
-                            new_buf = CudaNdarray(new_buf)
+                        new_buf = r.type.value_zeros(r_vals[r].shape)
+                        new_buf += numpy.asarray(def_val).astype(r.type.dtype)
                        wrong_size[r] = new_buf

                yield (name, wrong_size)

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1923,10 +1923,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        ilist_ = tensor.as_tensor_variable(ilist)

        assert x_.type.dtype == y_.type.dtype
-        assert x_.type.ndim == y_.type.ndim
-#        if (x_.type.ndim - 1) > y_.type.ndim:
-#            y_ = tensor.shape_padleft(y_, x_.type.ndim - y_.type.ndim)
-#        assert x_.type.ndim == y_.type.ndim
        assert x_.type.ndim >= y_.type.ndim

        if ilist_.type.dtype[:3] not in ('int', 'uin'):
@@ -1941,9 +1937,40 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):

        return Apply(self, [x_, y_, ilist_], [x_.type()])

-    #def perform(self, node, inp, out_):
    # CudaNdarray_Subscript() don't support Advanced slicing.
-        # so we use the parent version that loop on each indices.
+    # But we can't use the parent version that loops over each index,
+    # as we also need to loop when set_instead_of_inc is True and the
+    # parent doesn't loop in that case.
+    def perform(self, node, inp, out_):
+        # TODO opt to make this inplace
+        x, y, idx = inp
+        out, = out_
+        if not self.inplace:
+            x = x.copy()
+        if self.set_instead_of_inc:
+            # CudaNdarray __setitem__ doesn't broadcast, nor does it
+            # support a list of indices.
+            assert y.ndim <= x.ndim   # Should be guaranteed by `make_node`
+            if y.ndim == x.ndim:
+                assert len(y) == len(idx)
+                for (j, i) in enumerate(idx):
+                    x[i] = y[j]
+            else:
+                for i in idx:
+                    x[i] = y
+        else:
+            # If `y` has as many dimensions as `x`, then we want to iterate
+            # jointly on `x` and `y`. Otherwise, it means `y` should be
+            # broadcasted to fill all relevant rows of `x`.
+            assert y.ndim <= x.ndim   # Should be guaranteed by `make_node`
+            if y.ndim == x.ndim:
+                assert len(y) == len(idx)
+                for (j, i) in enumerate(idx):
+                    x[i] += y[j]
+            else:
+                for i in idx:
+                    x[i] += y
+        out[0] = x


 class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -282,7 +282,7 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
 * Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
 */
 DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
-DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base);
+DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base);

 /**
 * Return an independent copy of self

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -765,8 +765,6 @@ def local_gpu_advanced_incsubtensor1(node):
                    'either set the `warn.gpu_set_subtensor1` config '
                    'option to False, or `warn.ignore_bug_before` to at '
                    'least \'0.6\'.', stacklevel=1)
-            if set_instead_of_inc:
-                return

            gpu_op = GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
@@ -799,8 +797,7 @@ def local_gpu_advanced_incsubtensor1(node):
                    'either set the `warn.gpu_set_subtensor1` config '
                    'option to False, or `warn.ignore_bug_before` to at '
                    'least \'0.6\'.', stacklevel=1)
-            if set_instead_of_inc:
-                return
+
            gpu_op = GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
            return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]

--- a/theano/sandbox/cuda/tests/test_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_cuda_ndarray.py
@@ -630,13 +630,9 @@ def test_setitem_matrixvector1():
    assert numpy.allclose(a,numpy.asarray(_a))

    #test direct transfert from numpy
-    try:
    _a[:,1] =  b*100
    a[:,1] =  b*100
-        raise Exception("CudaNdarray.__setitem__ should have returned an error")
    assert numpy.allclose(a,numpy.asarray(_a))
-    except NotImplementedError, e:
-        pass

    row = theano._asarray([777,888,999], dtype='float32')
    _a[1,:] = row
@@ -659,13 +655,9 @@ def test_setitem_matrix_tensor3():
    assert numpy.allclose(a,numpy.asarray(_a))

    #test direct transfert from numpy
-    try:
    _a[:,1,1] = b*100
    a[:,1,1] = b*100
-        raise Exception("CudaNdarray.__setitem__ should have returned an error")
    assert numpy.allclose(a,numpy.asarray(_a))
-    except NotImplementedError:
-        pass

    row = theano._asarray([777,888,999], dtype='float32')
    _a[1,1,:] = row
@@ -714,7 +706,7 @@ def test_setitem_matrix_bad_ndim():
        # attempt to assign the ndarray b with setitem
        _a[:,:,1] = _b
        assert False
-    except NotImplementedError, e:
+    except ValueError, e:
        #print e
        assert True

@@ -723,7 +715,7 @@ def test_setitem_matrix_bad_ndim():
        # attempt to assign the ndarray b with setitem
        _a[1,:,:] = b
        assert False
-    except NotImplementedError, e:
+    except ValueError, e:
        #print e
        assert True

@@ -806,7 +798,7 @@ def test_setitem_broadcast():
    a[:,:,1] = b.reshape((1,3))
    assert numpy.allclose(numpy.asarray(_a),a)

-#This is not supported for now.
+
 def test_setitem_broadcast_numpy():
    #test scalar to vector without stride
    a = numpy.arange(3)
@@ -814,73 +806,81 @@ def test_setitem_broadcast_numpy():
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray(9, dtype='float32')
-    try:
    _a[:] = b.reshape((1,))
    a[:] = b.reshape((1,))
-        assert False
-        assert numpy.allclose(numpy.asarray(_a),a)
-    except ValueError:
-        pass
+    assert numpy.allclose(numpy.asarray(_a), a)
+
    #test vector to matrice without stride
    a = numpy.arange(9)
-    a.resize((3,3))
+    a.resize((3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

-    try:
-        b = theano._asarray([7,8,9], dtype='float32')
-        _a[:,:] = b.reshape((1,3))
-        a[:,:] = b.reshape((1,3))
-        assert False
-        assert numpy.allclose(numpy.asarray(_a),a)
-    except ValueError:
-        pass
+    b = theano._asarray([7, 8, 9], dtype='float32')
+    _a[:, :] = b.reshape((1, 3))
+    a[:, :] = b.reshape((1, 3))
+    assert numpy.allclose(numpy.asarray(_a), a)

    #test vector to matrice with stride
    a = numpy.arange(27)
-    a.resize((3,3,3))
+    a.resize((3, 3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

-    try:
-        b = theano._asarray([[7,8,9],[10,11,12]], dtype='float32')
+    b = theano._asarray([[7, 8, 9], [10, 11, 12]], dtype='float32')
    b = b[0]
-        _a[1,:,:] = b.reshape((1,3))
-        a[1,:,:] = b.reshape((1,3))
-        assert False
-        assert numpy.allclose(numpy.asarray(_a),a)
-    except ValueError:
-        pass
+    _a[1, :, :] = b.reshape((1, 3))
+    a[1, :, :] = b.reshape((1, 3))
+    assert numpy.allclose(numpy.asarray(_a), a)
+

 # this also fails for the moment
 def test_setitem_rightvalue_ndarray_fails():
    """
    Now we don't automatically add dimensions to broadcast
    """
-    a = numpy.arange(27)
-    a.resize((3,3,3))
+    a = numpy.arange(3 * 4 * 5)
+    a.resize((3, 4, 5))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

-    b = theano._asarray([7,8,9], dtype='float32')
+    b = theano._asarray([7, 8, 9, 10], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)
+    b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
+    _b5 = cuda_ndarray.CudaNdarray(b)

-    try:
    # attempt to assign the ndarray b with setitem
-        _a[:,:,1] = _b
-        assert False
-    except NotImplementedError, e:
-        #print e
-        assert True
+    _a[:, :, 1] = _b
+    a[:, :, 1] = b
+    assert numpy.allclose(numpy.asarray(_a), a)

-    #test direct transfert from numpy
+    # test direct transfer from numpy to contiguous region
+    # attempt to assign the ndarray b with setitem
+    # same number of dim
+    mat = numpy.random.rand(4, 5).astype('float32')
+    _a[2, :, :] = mat
+    a[2, :, :] = mat
+    assert numpy.allclose(numpy.asarray(_a), a)
+
+    # without same number of dim
    try:
+        _a[0, :, :] = mat
+        #a[0, :, :] = mat
+        #assert numpy.allclose(numpy.asarray(_a), a)
+    except ValueError, e:
+        pass
+
+    # test direct transfer from numpy with broadcast
+    _a[0, :, :] = b5
+    a[0, :, :] = b5
+    assert numpy.allclose(numpy.asarray(_a), a)
+
+    # test direct transfer from numpy to not contiguous region
    # attempt to assign the ndarray b with setitem
-        _a[:,:,1] = b
-        assert False
-    except NotImplementedError, e:
-        #print e
-        assert True
+    _a[:, :, 2] = b
+    a[:, :, 2] = b
+    assert numpy.allclose(numpy.asarray(_a), a)
+

 def test_zeros_basic():
    for shp in [(3,4,5), (300,), (), (0,7)]: