提交 3a4e6c78 authored 作者: lamblin's avatar lamblin

Merge pull request #582 from nouiz/gpu_stuff

Gpu stuff
...@@ -53,6 +53,10 @@ New Features ...@@ -53,6 +53,10 @@ New Features
(Frederic B., Simon McGregor) (Frederic B., Simon McGregor)
* MRG random now raises an error with a clear message when the passed shape * MRG random now raises an error with a clear message when the passed shape
contains dimensions with bad value like 0. (Frédéric B. reported by Ian G.) contains dimensions with bad value like 0. (Frédéric B. reported by Ian G.)
* "CudaNdarray[*] = ndarray" works in more cases (Frederic B.)
* "CudaNdarray[*] += ndarray" works in more cases (Frederic B.)
* We add dimensions to CudaNdarray to automatically broadcast more frequently.
(Frederic B.)
Sparse Sparse
* Implement theano.sparse.mul(sparse1, sparse2) when both inputs don't * Implement theano.sparse.mul(sparse1, sparse2) when both inputs don't
......
...@@ -1026,13 +1026,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1026,13 +1026,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
for r in node.outputs: for r in node.outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Build a C-contiguous buffer # Build a C-contiguous buffer
new_buf = numpy.zeros( new_buf = r.type.value_zeros(r_vals[r].shape)
shape=r_vals[r].shape, # CudaNdarray don't have flags field
dtype=r_vals[r].dtype, # assert new_buf.flags["C_CONTIGUOUS"]
order='C') new_buf += numpy.asarray(def_val).astype(r.type.dtype)
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
c_cont_outputs[r] = new_buf c_cont_outputs[r] = new_buf
if len(c_cont_outputs): if len(c_cont_outputs):
...@@ -1096,21 +1094,12 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1096,21 +1094,12 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
shapes.append(slice(None, size, None)) shapes.append(slice(None, size, None))
r_buf = init_strided[r] r_buf = init_strided[r]
if r_buf.ndim > 0: if r_buf.ndim > 0:
r_buf = r_buf[tuple(strides)][tuple(shapes)] r_buf = r_buf[tuple(strides)][tuple(shapes)]
assert r_buf.shape == r_vals[r].shape assert r_buf.shape == r_vals[r].shape
if isinstance(r.type, CudaNdarrayType): r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
# It seems stupid, but we need to allocate a
# new ndarray and copy it into the GPU one.
# TODO: When it is possible to simply do
# r_buff[...] = def_val, do so.
new_rbuf = numpy.zeros(r_vals[r].shape,
dtype=r.dtype)
new_rbuf += def_val
r_buf[...] = CudaNdarray(new_rbuf)
else:
r_buf[...] = def_val
strided[r] = r_buf strided[r] = r_buf
...@@ -1133,12 +1122,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1133,12 +1122,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
out_shape = [max((s + sd), 0) out_shape = [max((s + sd), 0)
for s, sd in zip(r_vals[r].shape, for s, sd in zip(r_vals[r].shape,
r_shape_diff)] r_shape_diff)]
new_buf = numpy.zeros( new_buf = r.type.value_zeros(r_vals[r].shape)
shape=out_shape, new_buf += numpy.asarray(def_val).astype(r.type.dtype)
dtype=r.dtype)
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
wrong_size[r] = new_buf wrong_size[r] = new_buf
yield (name, wrong_size) yield (name, wrong_size)
......
...@@ -1923,10 +1923,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -1923,10 +1923,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim == y_.type.ndim
# if (x_.type.ndim - 1) > y_.type.ndim:
# y_ = tensor.shape_padleft(y_, x_.type.ndim - y_.type.ndim)
# assert x_.type.ndim == y_.type.ndim
assert x_.type.ndim >= y_.type.ndim assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype[:3] not in ('int', 'uin'): if ilist_.type.dtype[:3] not in ('int', 'uin'):
...@@ -1941,9 +1937,40 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -1941,9 +1937,40 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
return Apply(self, [x_, y_, ilist_], [x_.type()]) return Apply(self, [x_, y_, ilist_], [x_.type()])
#def perform(self, node, inp, out_):
# CudaNdarray_Subscript() don't support Advanced slicing. # CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices. # But we can't use the parent version that loop on each indices
# as we also need to loop when set_instead_of_inc is True, and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
if not self.inplace:
x = x.copy()
if self.set_instead_of_inc:
# CudaNdarray __setitem__ don't do broadcast nor support
# list of index.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] = y[j]
else:
for i in idx:
x[i] = y
else:
# If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] += y[j]
else:
for i in idx:
x[i] += y
out[0] = x
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
......
...@@ -282,7 +282,7 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims) ...@@ -282,7 +282,7 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`. * Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/ */
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base); DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base); DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base);
/** /**
* Return an independent copy of self * Return an independent copy of self
......
...@@ -765,8 +765,6 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -765,8 +765,6 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config ' 'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
...@@ -799,8 +797,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -799,8 +797,7 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config ' 'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))] return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
......
...@@ -630,13 +630,9 @@ def test_setitem_matrixvector1(): ...@@ -630,13 +630,9 @@ def test_setitem_matrixvector1():
assert numpy.allclose(a,numpy.asarray(_a)) assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy #test direct transfert from numpy
try:
_a[:,1] = b*100 _a[:,1] = b*100
a[:,1] = b*100 a[:,1] = b*100
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a)) assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError, e:
pass
row = theano._asarray([777,888,999], dtype='float32') row = theano._asarray([777,888,999], dtype='float32')
_a[1,:] = row _a[1,:] = row
...@@ -659,13 +655,9 @@ def test_setitem_matrix_tensor3(): ...@@ -659,13 +655,9 @@ def test_setitem_matrix_tensor3():
assert numpy.allclose(a,numpy.asarray(_a)) assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy #test direct transfert from numpy
try:
_a[:,1,1] = b*100 _a[:,1,1] = b*100
a[:,1,1] = b*100 a[:,1,1] = b*100
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a)) assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError:
pass
row = theano._asarray([777,888,999], dtype='float32') row = theano._asarray([777,888,999], dtype='float32')
_a[1,1,:] = row _a[1,1,:] = row
...@@ -714,7 +706,7 @@ def test_setitem_matrix_bad_ndim(): ...@@ -714,7 +706,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[:,:,1] = _b _a[:,:,1] = _b
assert False assert False
except NotImplementedError, e: except ValueError, e:
#print e #print e
assert True assert True
...@@ -723,7 +715,7 @@ def test_setitem_matrix_bad_ndim(): ...@@ -723,7 +715,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[1,:,:] = b _a[1,:,:] = b
assert False assert False
except NotImplementedError, e: except ValueError, e:
#print e #print e
assert True assert True
...@@ -806,7 +798,7 @@ def test_setitem_broadcast(): ...@@ -806,7 +798,7 @@ def test_setitem_broadcast():
a[:,:,1] = b.reshape((1,3)) a[:,:,1] = b.reshape((1,3))
assert numpy.allclose(numpy.asarray(_a),a) assert numpy.allclose(numpy.asarray(_a),a)
#This is not supported for now.
def test_setitem_broadcast_numpy(): def test_setitem_broadcast_numpy():
#test scalar to vector without stride #test scalar to vector without stride
a = numpy.arange(3) a = numpy.arange(3)
...@@ -814,73 +806,81 @@ def test_setitem_broadcast_numpy(): ...@@ -814,73 +806,81 @@ def test_setitem_broadcast_numpy():
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray(9, dtype='float32') b = theano._asarray(9, dtype='float32')
try:
_a[:] = b.reshape((1,)) _a[:] = b.reshape((1,))
a[:] = b.reshape((1,)) a[:] = b.reshape((1,))
assert False assert numpy.allclose(numpy.asarray(_a), a)
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
#test vector to matrice without stride #test vector to matrice without stride
a = numpy.arange(9) a = numpy.arange(9)
a.resize((3,3)) a.resize((3, 3))
a = theano._asarray(a, dtype='float32') a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
try: b = theano._asarray([7, 8, 9], dtype='float32')
b = theano._asarray([7,8,9], dtype='float32') _a[:, :] = b.reshape((1, 3))
_a[:,:] = b.reshape((1,3)) a[:, :] = b.reshape((1, 3))
a[:,:] = b.reshape((1,3)) assert numpy.allclose(numpy.asarray(_a), a)
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
#test vector to matrice with stride #test vector to matrice with stride
a = numpy.arange(27) a = numpy.arange(27)
a.resize((3,3,3)) a.resize((3, 3, 3))
a = theano._asarray(a, dtype='float32') a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
try: b = theano._asarray([[7, 8, 9], [10, 11, 12]], dtype='float32')
b = theano._asarray([[7,8,9],[10,11,12]], dtype='float32')
b = b[0] b = b[0]
_a[1,:,:] = b.reshape((1,3)) _a[1, :, :] = b.reshape((1, 3))
a[1,:,:] = b.reshape((1,3)) a[1, :, :] = b.reshape((1, 3))
assert False assert numpy.allclose(numpy.asarray(_a), a)
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
# this also fails for the moment # this also fails for the moment
def test_setitem_rightvalue_ndarray_fails(): def test_setitem_rightvalue_ndarray_fails():
""" """
Now we don't automatically add dimensions to broadcast Now we don't automatically add dimensions to broadcast
""" """
a = numpy.arange(27) a = numpy.arange(3 * 4 * 5)
a.resize((3,3,3)) a.resize((3, 4, 5))
a = theano._asarray(a, dtype='float32') a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray([7,8,9], dtype='float32') b = theano._asarray([7, 8, 9, 10], dtype='float32')
_b = cuda_ndarray.CudaNdarray(b) _b = cuda_ndarray.CudaNdarray(b)
b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
_b5 = cuda_ndarray.CudaNdarray(b)
try:
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[:,:,1] = _b _a[:, :, 1] = _b
assert False a[:, :, 1] = b
except NotImplementedError, e: assert numpy.allclose(numpy.asarray(_a), a)
#print e
assert True
#test direct transfert from numpy #test direct transfert from numpy to contiguous region
# attempt to assign the ndarray b with setitem
# same number of dim
mat = numpy.random.rand(4, 5).astype('float32')
_a[2, :, :] = mat
a[2, :, :] = mat
assert numpy.allclose(numpy.asarray(_a), a)
# without same number of dim
try: try:
_a[0, :, :] = mat
#a[0, :, :] = mat
#assert numpy.allclose(numpy.asarray(_a), a)
except ValueError, e:
pass
#test direct transfer from numpy with broadcast
_a[0, :, :] = b5
a[0, :, :] = b5
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfer from numpy to a non-contiguous region
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[:,:,1] = b _a[:, :, 2] = b
assert False a[:, :, 2] = b
except NotImplementedError, e: assert numpy.allclose(numpy.asarray(_a), a)
#print e
assert True
def test_zeros_basic(): def test_zeros_basic():
for shp in [(3,4,5), (300,), (), (0,7)]: for shp in [(3,4,5), (300,), (), (0,7)]:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论