Commit 75426101, authored by fsavard

Added a few things to support GpuJoin: a ZerosWithPattern method to…

Added a few things to support GpuJoin: a ZerosWithPattern method to cuda_ndarray, and a slight refactoring of Join(Op) to inherit from this class in GpuJoin. And added GpuJoin itself. And a few unit tests for it.
Parent 0e0f4802
...@@ -105,7 +105,8 @@ if cuda_available: ...@@ -105,7 +105,8 @@ if cuda_available:
import basic_ops import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuDimShuffle, GpuSum, GpuReshape,
GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape) GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape,
GpuJoin)
import opt import opt
import cuda_ndarray import cuda_ndarray
......
...@@ -11,6 +11,9 @@ from theano.sandbox.cuda import filter as type_support_filter ...@@ -11,6 +11,9 @@ from theano.sandbox.cuda import filter as type_support_filter
from theano.sandbox.cuda.elemwise import NaiveAlgo from theano.sandbox.cuda.elemwise import NaiveAlgo
import logging, copy import logging, copy
import cuda_ndarray
_logger_name = 'theano.sandbox.cuda.basic_ops' _logger_name = 'theano.sandbox.cuda.basic_ops'
_logger = logging.getLogger(_logger_name) _logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.INFO) _logger.setLevel(logging.INFO)
...@@ -1418,3 +1421,75 @@ class GpuShape(tensor.Shape): ...@@ -1418,3 +1421,75 @@ class GpuShape(tensor.Shape):
return Apply(self, [x], [tensor.lvector()]) return Apply(self, [x], [tensor.lvector()])
gpu_shape = GpuShape() gpu_shape = GpuShape()
class GpuJoin(tensor.Join):
    """GPU version of `tensor.Join`: concatenate CudaNdarrays along an axis.

    Inputs must already be CudaNdarray variables; no host->GPU conversion
    is performed here.
    """

    def make_node(self, *axis_and_tensors):
        """Build the Apply node; every tensor must be of CudaNdarrayType."""
        axis, tensors = axis_and_tensors[0], axis_and_tensors[1:]
        if not tensors:
            raise ValueError('Cannot join an empty list of tensors')
        are_instances = [isinstance(x.type, CudaNdarrayType)
                         for x in tensors]
        assert numpy.all(are_instances)

        # No conversion needed: we just checked everything was a CNDA var.
        as_tensor_variable_args = tensors

        output_maker = \
            lambda bcast: CudaNdarrayType(broadcastable=bcast)()

        return tensor.Join._make_node_internal(self,
                        axis, tensors,
                        as_tensor_variable_args, output_maker)

    def perform(self, node, axis_and_tensors, out_storage):
        # out_storage is the usual single-element output-storage list.
        # Unpacking it here, instead of using a tuple parameter in the
        # signature, keeps the code valid under Python 3 as well.
        out, = out_storage
        axis, cndas = axis_and_tensors[0], axis_and_tensors[1:]

        # Compute the joined width along `axis`, and while we're at it
        # check that all other dimensions of every input agree.
        width_sum = 0
        template_shape = cndas[0].shape
        for cnda in cndas:
            width_sum += cnda.shape[axis]
            tmp_shape = list(cnda.shape)
            # The dimension at `axis` may legitimately differ, so make it
            # equal before comparing.
            tmp_shape[axis] = template_shape[axis]
            if tuple(tmp_shape) != template_shape:
                raise ValueError("Shape of input CudaNdarrays must agree except for the 'axis' dimension")

        if len(template_shape) != node.outputs[0].type.ndim:
            raise ValueError("Number of dimension of input tensors disagree with dimensions passed at graph creation time.")

        # Final shape is the same as all input tensors except for the
        # "axis" dimension, so we can simply copy the shape of the first.
        final_shape = list(cndas[0].shape)
        final_shape[axis] = width_sum

        # Mark broadcastable dimensions with -1 so zeros_with_pattern
        # allocates them as broadcastable (size 1, stride 0).
        for i, val in enumerate(node.outputs[0].type.broadcastable):
            if val:
                final_shape[i] = -1

        rval = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros_with_pattern(final_shape)

        # Copy each input into its slab of the output: a [:] (copy-all)
        # slice on every dimension except `axis`.
        # BUG FIX: the slice tuple must have one entry per *dimension* of
        # the output, not one per input tensor — the original used
        # range(len(cndas)), which only works by coincidence when the
        # number of inputs equals the number of dimensions.
        ndim = len(final_shape)

        def construct_slices(curpos, curlen):
            slices = [slice(None, None, None) for _ in range(ndim)]
            slices[axis] = slice(curpos, curpos + curlen, None)
            return tuple(slices)

        curpos = 0
        for cnda in cndas:
            curlen = cnda.shape[axis]
            rval.__setitem__(construct_slices(curpos, curlen), cnda)
            curpos += curlen

        out[0] = rval

gpu_join = GpuJoin()
...@@ -247,6 +247,135 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self) ...@@ -247,6 +247,135 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
Py_DECREF(contiguous_self); Py_DECREF(contiguous_self);
return rval; return rval;
} }
// Declared as a static method on CudaNdarray.
// Based on _Copy and _dimshuffle.
//
// Allocate a new contiguous CudaNdarray described by `pattern` (a Python
// sequence of ints) and fill it with zeros.  Each element gives the size
// of one dimension; a value of 1 or any negative value marks the
// dimension as broadcastable (size 1, stride 0).  A value of 0 is
// rejected.  Returns a new reference, or NULL with an exception set.
PyObject* CudaNdarray_ZerosWithPattern(PyObject* dummy, PyObject* pattern)
{
    if(!PySequence_Check(pattern))
    {
        PyErr_SetString(PyExc_TypeError, "pattern argument must be a sequence");
        return NULL;
    }

    int patlen = PySequence_Length(pattern);
    if (patlen == 0)
    {
        PyErr_SetString(PyExc_ValueError,
            "CudaNdarray_ZerosWithPattern: empty pattern");
        return NULL;
    }
    //fprintf(stdout, "Pattern length: %d\n", patlen);

    // One allocation holds both the dims and the strides (strides are
    // currently unused; see the note near the end).
    int* newdims = (int *)malloc(sizeof(int) * 2 * patlen);
    if (!newdims)
    {
        PyErr_SetString(PyExc_MemoryError,
            "CudaNdarray_ZerosWithPattern: Failed to allocate temporary space");
        return NULL;
    }
    int* newstrides = newdims + patlen;

    // Strides are counted in number of floats, not bytes.
    int cur_stride = 1;
    // Start from the last dimension to compute C-contiguous strides.
    for (int i = patlen-1; i >= 0; --i)
    {
        PyObject* pat_el_obj = PySequence_GetItem(pattern, i);
        if(pat_el_obj == NULL)
        {
            // Shouldn't happen since we checked the length before...
            PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: Index out of bound in sequence");
            free(newdims);
            return NULL;
        }

        int pat_el = PyInt_AsLong(pat_el_obj);
        // PySequence_GetItem returns a NEW reference; release it as soon
        // as the integer has been extracted (fixes a per-element leak in
        // the original, which never decref'd it on any path).
        Py_DECREF(pat_el_obj);
        // PyInt_AsLong returns -1 on failure, but -1 is also a legal
        // pattern value (broadcastable dim), so disambiguate with
        // PyErr_Occurred() instead of silently accepting a bad element.
        if (pat_el == -1 && PyErr_Occurred())
        {
            free(newdims);
            return NULL;
        }
        if (pat_el == 0)
        {
            PyErr_SetString(PyExc_ValueError, "CudaNdarray_ZerosWithPattern: pattern must not contain 0 for size of a dimension");
            free(newdims);
            return NULL;
        }

        // Apparently, from looking at alloc_contiguous, stride is set to
        // 0 when dim == 1 — that is how broadcastability is encoded.
        if (pat_el < 0 || pat_el == 1)
        {
            // broadcast
            newdims[i] = 1;
            newstrides[i] = 0;
        }
        else
        {
            newdims[i] = pat_el;
            newstrides[i] = cur_stride;
        }
        cur_stride *= newdims[i];
    }

    // cur_stride now contains the size of the array, in reals.
    int total_size = cur_stride * sizeof(real);

    CudaNdarray* rval = (CudaNdarray*)CudaNdarray_new_null();
    if (!rval)
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: call to new_null failed");
        free(newdims);
        return NULL;
    }

    if (CudaNdarray_alloc_contiguous(rval, patlen, newdims))
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: allocation failed.");
        free(newdims);
        Py_DECREF(rval);
        return NULL;
    }

    // Fill the freshly allocated device memory with zeros.
    //fprintf(stdout, "Sizeof: %d\n", total_size);
    if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
    {
        // Report the byte count actually requested (total_size); the
        // original printed the element count (cur_stride) here.
        fprintf(stderr, "Error memsetting %d bytes of device memory.\n", total_size);
        PyErr_Format(PyExc_MemoryError, "Error memsetting %d bytes of device memory.", total_size);
        free(newdims);
        Py_DECREF(rval);
        return NULL;
    }

    // Changing the strides to account for broadcastability is not
    // necessary: alloc_contiguous already sets stride=0 for dim=1.
    //for (int i = 0; i < patlen; ++i)
    //{
    //    CudaNdarray_set_stride(rval, i, newstrides[i]);
    //}

    if (cnda_copy_structure_to_device(rval))
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: syncing structure to device failed");
        free(newdims);
        Py_DECREF(rval);
        return NULL;
    }

    free(newdims);
    return (PyObject*)rval;
}
PyObject * CudaNdarray_Copy(CudaNdarray * self) PyObject * CudaNdarray_Copy(CudaNdarray * self)
{ {
PyObject * rval = CudaNdarray_new_null(); PyObject * rval = CudaNdarray_new_null();
...@@ -578,6 +707,9 @@ static PyMethodDef CudaNdarray_methods[] = ...@@ -578,6 +707,9 @@ static PyMethodDef CudaNdarray_methods[] =
{"__deepcopy__", {"__deepcopy__",
(PyCFunction)CudaNdarray_DeepCopy, METH_O, (PyCFunction)CudaNdarray_DeepCopy, METH_O,
"Create a copy of this object"}, "Create a copy of this object"},
{"zeros_with_pattern",
(PyCFunction)CudaNdarray_ZerosWithPattern, METH_STATIC,
"Create a new CudaNdarray with specified shape and broadcastability, filled with zeros."},
{"copy", {"copy",
(PyCFunction)CudaNdarray_Copy, METH_NOARGS, (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
"Create a copy of this object"}, "Create a copy of this object"},
......
...@@ -493,3 +493,93 @@ def test_hostfromgpu_shape_i(): ...@@ -493,3 +493,93 @@ def test_hostfromgpu_shape_i():
assert isinstance(topo[2].op,T.opt.MakeVector) assert isinstance(topo[2].op,T.opt.MakeVector)
assert tuple(f(cv))==(5,4) assert tuple(f(cv))==(5,4)
# -----------------------------------------------------------------------
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.basic_ops import gpu_join, GpuDimShuffle
def test_gpujoin_twomatrices_joincolumns():
    # Join a 2x2 and a 2x3 matrix along axis 1 and compare against numpy.
    mat_a = numpy.asarray([[1,2],[3,4]], dtype='float32')
    mat_b = numpy.asarray([[5,6,7],[8,9,10]], dtype='float32')
    shared_a = theano.shared(mat_a)
    shared_b = theano.shared(mat_b)
    joined = gpu_join(1, shared_a, shared_b)
    fn = theano.function([], joined)
    expected = numpy.concatenate([mat_a, mat_b], axis=1)
    assert numpy.all(fn() == expected)
def test_gpujoin_twomatrices_badshapes():
    # Joining along axis 0 must fail at runtime: the column counts
    # disagree (2 != 3).
    mat_a = numpy.asarray([[1,2],[3,4]], dtype='float32')
    mat_b = numpy.asarray([[5,6,7],[8,9,10]], dtype='float32')
    shared_a = theano.shared(mat_a)
    shared_b = theano.shared(mat_b)
    joined = gpu_join(0, shared_a, shared_b)
    fn = theano.function([], joined)
    raised = False
    try:
        fn()
    except ValueError:
        raised = True
    assert raised
def test_gpujoin_preserves_broadcasting():
    mat_a = numpy.asarray([[1,2],[3,4]], dtype='float32')
    mat_b = numpy.asarray([[5,6,7],[8,9,10]], dtype='float32')
    shared_a = theano.shared(mat_a)
    shared_b = theano.shared(mat_b)
    # [0,0]: both original dims are non-broadcastable.
    # [1,'x',0]: transpose and insert a new broadcastable middle dim.
    shuffle = GpuDimShuffle([0,0], [1,'x',0])
    joined = gpu_join(0, shuffle(shared_a), shuffle(shared_b))
    assert joined.type.broadcastable == (False, True, False)
    fn = theano.function([], joined)
    result = fn()
    reshaped_a = numpy.asarray([[[1,3]],[[2,4]]], dtype='float32')
    reshaped_b = numpy.asarray([[[5,8]],[[6,9]],[[7,10]]], dtype='float32')
    expected = numpy.concatenate([reshaped_a, reshaped_b], axis=0)
    assert numpy.all(result == expected)
def test_gpujoin_assert_cndas():
    # A float64 shared variable stays on the host as an ndarray, so
    # gpu_join's CudaNdarray-only assertion must fire inside make_node.
    host_mat = numpy.asarray([[1,2],[3,4]], dtype='float64')
    shared_host = theano.shared(host_mat)
    try:
        gpu_join(1, shared_host)
    except AssertionError:
        # This is the assertion error we want from gpu_join itself.
        return
    assert False
# Allow running this test module directly, outside of a test runner.
if __name__ == '__main__':
    test_gpujoin_twomatrices_joincolumns()
    test_gpujoin_assert_cndas()
    test_gpujoin_preserves_broadcasting()
    test_gpujoin_twomatrices_badshapes()
...@@ -2723,9 +2723,17 @@ class Join(Op): ...@@ -2723,9 +2723,17 @@ class Join(Op):
if not tensors: if not tensors:
raise ValueError('Cannot join an empty list of tensors') raise ValueError('Cannot join an empty list of tensors')
as_tensor_variable_args= [as_tensor_variable(x) for x in tensors] as_tensor_variable_args= [as_tensor_variable(x) for x in tensors]
dtypes = [x.type.dtype for x in as_tensor_variable_args] dtypes = [x.type.dtype for x in as_tensor_variable_args]
out_dtype = scal.upcast(*dtypes) out_dtype = scal.upcast(*dtypes)
output_maker = lambda bcastable: tensor(dtype=out_dtype, broadcastable=bcastable)
return self._make_node_internal(axis, tensors,
as_tensor_variable_args, output_maker)
def _make_node_internal(self, axis, tensors,
as_tensor_variable_args, output_maker):
if not all(targs.type.ndim for targs in as_tensor_variable_args): if not all(targs.type.ndim for targs in as_tensor_variable_args):
raise TypeError('Join cannot handle arguments of dimension 0. For joining scalar values, see @stack'); raise TypeError('Join cannot handle arguments of dimension 0. For joining scalar values, see @stack');
...@@ -2757,8 +2765,8 @@ class Join(Op): ...@@ -2757,8 +2765,8 @@ class Join(Op):
if inputs[0].type not in int_types: if inputs[0].type not in int_types:
raise TypeError('Axis could not be cast to an integer type', axis, inputs[0].type, int_types) raise TypeError('Axis could not be cast to an integer type', axis, inputs[0].type, int_types)
outputs = [tensor(dtype = out_dtype, outputs = [output_maker(bcastable)]
broadcastable = bcastable)]
node = Apply(self, inputs, outputs) node = Apply(self, inputs, outputs)
if any(not x.type.broadcastable[0] for x in orig): if any(not x.type.broadcastable[0] for x in orig):
node.tag.shape_zero = None node.tag.shape_zero = None
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论