提交 3a4e6c78 authored 作者: lamblin's avatar lamblin

Merge pull request #582 from nouiz/gpu_stuff

Gpu stuff
......@@ -53,6 +53,10 @@ New Features
(Frederic B., Simon McGregor)
* MRG random now raises an error with a clear message when the passed shape
contains dimensions with a bad value such as 0. (Frédéric B. reported by Ian G.)
* "CudaNdarray[*] = ndarray" works in more cases (Frederic B.)
* "CudaNdarray[*] += ndarray" works in more cases (Frederic B.)
* We add dimensions to CudaNdarray to automatically broadcast more frequently.
(Frederic B.)
Sparse
* Implement theano.sparse.mul(sparse1, sparse2) when both inputs don't
......
......@@ -1026,13 +1026,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
for r in node.outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Build a C-contiguous buffer
new_buf = numpy.zeros(
shape=r_vals[r].shape,
dtype=r_vals[r].dtype,
order='C')
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
new_buf = r.type.value_zeros(r_vals[r].shape)
# CudaNdarray don't have flags field
# assert new_buf.flags["C_CONTIGUOUS"]
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
c_cont_outputs[r] = new_buf
if len(c_cont_outputs):
......@@ -1096,21 +1094,12 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
shapes.append(slice(None, size, None))
r_buf = init_strided[r]
if r_buf.ndim > 0:
r_buf = r_buf[tuple(strides)][tuple(shapes)]
assert r_buf.shape == r_vals[r].shape
if isinstance(r.type, CudaNdarrayType):
# It seems stupid, but we need to allocate a
# new ndarray and copy it into the GPU one.
# TODO: When it is possible to simply do
# r_buff[...] = def_val, do so.
new_rbuf = numpy.zeros(r_vals[r].shape,
dtype=r.dtype)
new_rbuf += def_val
r_buf[...] = CudaNdarray(new_rbuf)
else:
r_buf[...] = def_val
r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
strided[r] = r_buf
......@@ -1133,12 +1122,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
out_shape = [max((s + sd), 0)
for s, sd in zip(r_vals[r].shape,
r_shape_diff)]
new_buf = numpy.zeros(
shape=out_shape,
dtype=r.dtype)
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
# Allocate with `out_shape` (computed above from the real output shape and
# `r_shape_diff`), not with the real output shape itself; otherwise the
# buffer stored in `wrong_size` would have exactly the right size.
new_buf = r.type.value_zeros(out_shape)
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
wrong_size[r] = new_buf
yield (name, wrong_size)
......
......@@ -1923,10 +1923,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim == y_.type.ndim
# if (x_.type.ndim - 1) > y_.type.ndim:
# y_ = tensor.shape_padleft(y_, x_.type.ndim - y_.type.ndim)
# assert x_.type.ndim == y_.type.ndim
assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype[:3] not in ('int', 'uin'):
......@@ -1941,9 +1937,40 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
return Apply(self, [x_, y_, ilist_], [x_.type()])
#def perform(self, node, inp, out_):
# CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices.
# But we can't use the parent version that loops over each index,
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
if not self.inplace:
x = x.copy()
if self.set_instead_of_inc:
# CudaNdarray __setitem__ don't do broadcast nor support
# list of index.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] = y[j]
else:
for i in idx:
x[i] = y
else:
# If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] += y[j]
else:
for i in idx:
x[i] += y
out[0] = x
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
......
......@@ -282,7 +282,7 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base);
/**
* Return an independent copy of self
......
......@@ -765,8 +765,6 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
......@@ -799,8 +797,7 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
......
......@@ -630,13 +630,9 @@ def test_setitem_matrixvector1():
assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy
try:
_a[:,1] = b*100
a[:,1] = b*100
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError, e:
pass
row = theano._asarray([777,888,999], dtype='float32')
_a[1,:] = row
......@@ -659,13 +655,9 @@ def test_setitem_matrix_tensor3():
assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy
try:
_a[:,1,1] = b*100
a[:,1,1] = b*100
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError:
pass
row = theano._asarray([777,888,999], dtype='float32')
_a[1,1,:] = row
......@@ -714,7 +706,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem
_a[:,:,1] = _b
assert False
except NotImplementedError, e:
except ValueError, e:
#print e
assert True
......@@ -723,7 +715,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem
_a[1,:,:] = b
assert False
except NotImplementedError, e:
except ValueError, e:
#print e
assert True
......@@ -806,7 +798,7 @@ def test_setitem_broadcast():
a[:,:,1] = b.reshape((1,3))
assert numpy.allclose(numpy.asarray(_a),a)
#This is not supported for now.
def test_setitem_broadcast_numpy():
#test scalar to vector without stride
a = numpy.arange(3)
......@@ -814,73 +806,81 @@ def test_setitem_broadcast_numpy():
_a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray(9, dtype='float32')
try:
_a[:] = b.reshape((1,))
a[:] = b.reshape((1,))
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
assert numpy.allclose(numpy.asarray(_a), a)
#test vector to matrice without stride
a = numpy.arange(9)
a.resize((3,3))
a.resize((3, 3))
a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a)
try:
b = theano._asarray([7,8,9], dtype='float32')
_a[:,:] = b.reshape((1,3))
a[:,:] = b.reshape((1,3))
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
b = theano._asarray([7, 8, 9], dtype='float32')
_a[:, :] = b.reshape((1, 3))
a[:, :] = b.reshape((1, 3))
assert numpy.allclose(numpy.asarray(_a), a)
#test vector to matrice with stride
a = numpy.arange(27)
a.resize((3,3,3))
a.resize((3, 3, 3))
a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a)
try:
b = theano._asarray([[7,8,9],[10,11,12]], dtype='float32')
b = theano._asarray([[7, 8, 9], [10, 11, 12]], dtype='float32')
b = b[0]
_a[1,:,:] = b.reshape((1,3))
a[1,:,:] = b.reshape((1,3))
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
_a[1, :, :] = b.reshape((1, 3))
a[1, :, :] = b.reshape((1, 3))
assert numpy.allclose(numpy.asarray(_a), a)
# this also fails for the moment
def test_setitem_rightvalue_ndarray_fails():
"""
Now we don't automatically add dimensions to broadcast
"""
a = numpy.arange(27)
a.resize((3,3,3))
a = numpy.arange(3 * 4 * 5)
a.resize((3, 4, 5))
a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray([7,8,9], dtype='float32')
b = theano._asarray([7, 8, 9, 10], dtype='float32')
_b = cuda_ndarray.CudaNdarray(b)
b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
# GPU copy of the 5-element vector `b5` (it was previously built from `b`).
_b5 = cuda_ndarray.CudaNdarray(b5)
try:
# attempt to assign the ndarray b with setitem
_a[:,:,1] = _b
assert False
except NotImplementedError, e:
#print e
assert True
_a[:, :, 1] = _b
a[:, :, 1] = b
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy
#test direct transfert from numpy to contiguous region
# attempt to assign the ndarray b with setitem
# same number of dim
mat = numpy.random.rand(4, 5).astype('float32')
_a[2, :, :] = mat
a[2, :, :] = mat
assert numpy.allclose(numpy.asarray(_a), a)
# without same number of dim
try:
_a[0, :, :] = mat
#a[0, :, :] = mat
#assert numpy.allclose(numpy.asarray(_a), a)
except ValueError, e:
pass
#test direct transfert from numpy with broadcast
_a[0, :, :] = b5
a[0, :, :] = b5
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy to not contiguous region
# attempt to assign the ndarray b with setitem
_a[:,:,1] = b
assert False
except NotImplementedError, e:
#print e
assert True
_a[:, :, 2] = b
a[:, :, 2] = b
assert numpy.allclose(numpy.asarray(_a), a)
def test_zeros_basic():
for shp in [(3,4,5), (300,), (), (0,7)]:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论