提交 818bdf4b authored 作者: nouiz's avatar nouiz

Merge pull request #985 from goodfeli/rebase_gpu_incsub

C code for GpuIncsub, and a new CUDA kernel for one special case
......@@ -232,8 +232,9 @@ def rebuild_collect_shared(outputs,
copy_inputs_over)
cloned_outputs.append(Out(cloned_v, borrow=v.borrow))
else:
raise TypeError('outputs must be theano Variable or '
'Out instances', v)
raise TypeError('Outputs must be theano Variable or '
'Out instances. Received ' + str(v)\
+ ' of type '+str(type(v)))
#computed_list.append(cloned_v)
else:
if isinstance(outputs, Variable):
......
......@@ -589,6 +589,10 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.outputs = node_output_storage
rval.lazy = False
return rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del rval
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
......
......@@ -2175,6 +2175,12 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp))
# C code shared by GpuSubtensor and GpuIncSubtensor.
# CudaNdarray_set_device_data expects a float* data pointer; this macro
# inserts the (float *) cast so that helper_c_code (which passes a generic
# pointer) can be reused by both Ops via
# set_data='CudaNdarray_set_device_data2'.
_define_set_data = """
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""
class GpuSubtensor(GpuOp, tensor.Subtensor):
"""
Implement subtensor on the gpu.
......@@ -2240,10 +2246,10 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
%(fail)s;
}
cnda_mark_dev_structure_dirty(xview);
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
""" % locals()
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
""" % locals()
get_xview = _define_set_data + \
self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
c_prefix='CudaNdarray',
set_data='CudaNdarray_set_device_data2',
......@@ -2251,6 +2257,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
set_stride='CudaNdarray_set_stride',
update_flags="", strides_mul=4)
finish_view = """
//Set the base only now
......@@ -2408,13 +2415,128 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    """
    Implement IncSubtensor on the gpu.

    Note: The optimization to make this inplace is in tensor/opt.
        The same optimization handles IncSubtensor and GpuIncSubtensor.

    This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
    The helper methods like do_type_checking, copy_of_x, etc. specialize
    the c_code for this Op.
    """

    def make_node(self, x, y, *inputs):
        # Both the destination x and the increment y must already be
        # GPU variables; the remaining inputs are the (scalar) indices.
        assert isinstance(x.type, CudaNdarrayType)
        assert isinstance(y.type, CudaNdarrayType)
        x = as_cuda_ndarray_variable(x)
        y = as_cuda_ndarray_variable(y)
        # Reuse the host Op's make_node for index checking/canonicalization,
        # then rebuild the Apply with GPU-typed x, y and a GPU-typed output.
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        return Apply(self, [x, y] + rval.inputs[2:], [x.type()])

    def do_type_checking(self, node):
        """ Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, CudaNdarrayType):
            raise NotImplementedError()

    def copy_of_x(self, x):
        """
        :param x: a string giving the name of a C variable pointing to an
            array

        :return: C code expression to make a copy of x

        Base class uses PyArrayObject *, subclasses may override for
        different types of arrays.
        """
        return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()

    def make_view_array(self, x, view_ndim):
        """
        :param x: a string identifying an array to be viewed
        :param view_ndim: a string specifying the number of dimensions
            to have in the view

        This doesn't need to actually set up the view with the
        right indexing; we'll do that manually later.
        """
        return """CudaNdarray* zview = (CudaNdarray*)
                CudaNdarray_New(%(view_ndim)s)""" % locals()

    def get_helper_c_code_args(self):
        """ Return a dictionary of arguments to use with helper_c_code"""
        # NOTE(fix): the original dict literal listed 'update_flags' twice;
        # Python silently keeps only the last occurrence, so the duplicate
        # was dead text.  It is listed once here.
        return {'c_prefix': 'CudaNdarray',
                'set_data': 'CudaNdarray_set_device_data2',
                'set_dim': 'CudaNdarray_set_dim',
                'set_stride': 'CudaNdarray_set_stride',
                'update_flags': "",
                'strides_mul': 4,
                }

    def copy_into(self, view, source):
        """
        :param view: string, C code expression for an array
        :param source: string, C code expression for an array

        :return: a C code expression to copy source into view, and
            return 0 on success
        """
        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()

    def define_set_data(self):
        # C macro (shared with GpuSubtensor) that casts the data pointer
        # to float* before calling CudaNdarray_set_device_data.
        return _define_set_data

    def link_view_array(self, x, fail):
        # Point zview at x's device data without giving it a base object
        # yet; set_view_base() attaches the base afterwards.
        # NOTE(fix): the error messages previously said "GpuSubtensor"
        # (copy-paste from that Op); corrected to name this Op.
        return """
        if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
                                        (PyObject*) NULL))
        {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuIncSubtensor is not able to set the"
                         " devdata field of the view");
            Py_XDECREF(zview);
            %(fail)s;
        }
        cnda_mark_dev_structure_dirty(zview);
        """ % locals()

    def set_view_base(self, x, fail):
        return """
        //Set the base only now
        if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
                                       %(x)s)){
            PyErr_Format(PyExc_RuntimeError,
                         "GpuIncSubtensor is not able to set"
                         " the base of the view array");
            Py_XDECREF(zview);
            %(fail)s;
        }""" % locals()

    def add_to_zview(self, x, fail):
        # In-place add of the increment y (py_x at the C level) into the
        # view of the destination.
        return """
        PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
                                                        (PyObject *) py_%(x)s);

        if (! add_result )
        {
            Py_DECREF(zview);
            %(fail)s;
        }
        else
        {
            Py_DECREF(add_result);
        }
        """ % locals()

    def c_code_cache_version(self):
        # Extend the parent's version tuple so changes to either class
        # invalidate the cache; an empty parent version disables caching.
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if parent_version:
            return parent_version + (0,)
        return ()
class GpuFlatten(tensor.Flatten, GpuOp):
"""
......
......@@ -75,15 +75,16 @@ struct CudaNdarray
/* Type-specific fields go here. */
//GpuTensorType::VoidTensor * vt;
int nd; //the number of dimensions of the tensor
// Client should access host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
// Client should access host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES functions
int * host_structure; //dim0, dim1, ... stride0, stride1, ...
int data_allocated; //the number of bytes allocated for devdata
//device pointers (allocated by cudaMalloc)
mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be
//synchronized. The macro will allocate it when needed.
//dev_structure should be accessed via the functions like
//CudaNdarray_DEV_DIMS, otherwise may not be
//synchronized with host_structure. The accessor functions will allocate it when needed.
mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
};
......@@ -118,6 +119,12 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self);
*/
DllExport int cnda_structure_size(int nd);
/*
* This describes the shape of the ndarray. The array
* of dimensions is itself stored on the host.
* If you need to access the dimensions array from inside
* a kernel, use CudaNdarray_DEVICE_DIMS.
*/
DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self);
......@@ -144,7 +151,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
/****
* Set the idx'th dimension to value d.
*
* Updates the log2dim shaddow array.
* Updates the log2dim shadow array.
*
* Does not sync structure to host.
*/
......@@ -188,6 +195,10 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
*/
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
/* CudaNdarray_DEV_DIMS gives the same information as CudaNdarray_HOST_DIMS,
* but stored on the GPU. Use this pointer when it needs to be accessed
* from inside a CUDA kernel.
*/
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
......@@ -389,8 +400,21 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
* Transfer the contents of CudaNdarray `other` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
* TODO: WRITEME: what does "if necessary" mean?
* TODO: we use this to implement set/inc subtensor, where self is a view of
* the original tensor so that we write only to the subtensor. How
* do we ensure that self is not reallocated in this case?
*
* unbroadcast: if true, this means that other is broadcastable in some
* dimensions, and the result, self, is not.
* ie, if unbroadcast=false, we must do the broadcasting
* operation as part of the copy.
* e.g. suppose self and other are 2D matrices and other
* has only one row. Then we need to copy this row several
* times when copying to self.
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
......@@ -437,7 +461,12 @@ DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
PyObject * CudaNdarray_View(const CudaNdarray * self);
PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
#endif
/*
......
......@@ -904,6 +904,12 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
# This is to avoid duplicating tests.
class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# This prevents nose from printing method docstrings instead of method
# names
def shortDescription(self):
return None
shared = staticmethod(cuda.shared_constructor)
sub = cuda.GpuSubtensor
inc_sub = cuda.GpuIncSubtensor
......@@ -921,6 +927,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
self).__init__(name)
def test_adv_sub1_fast(self):
"""We check that the special cases of advanced indexing that
use CudaNdarrayTakeFrom are handled correctly
......
差异被折叠。
......@@ -1056,7 +1056,8 @@ class test_fusion(unittest.TestCase):
if gpu:
import theano.sandbox.cuda as cuda
topo_ = [x for x in topo if not isinstance(
x.op,cuda.basic_ops.GpuFromHost) and not isinstance(x.op,cuda.basic_ops.HostFromGpu)]
x.op, (cuda.basic_ops.GpuFromHost, cuda.basic_ops.HostFromGpu))]
gpu_ = [x for x in topo if isinstance(x.op,
cuda.basic_ops.GpuFromHost)]
if not len(gpu_) == len(sym_inputs):
......@@ -1067,13 +1068,16 @@ class test_fusion(unittest.TestCase):
if not len(topo_) == nb_elemwise:
fail3.append((id, topo_, nb_elemwise))
if nb_elemwise == 1:
# check that the number of input to the Composite Elemwise is ok
# when there is not variable that appear multiple time the in input
# of g
assert ((numpy.sum([not isinstance(x, theano.gof.Constant)
for x in topo_[0].inputs]) ==
len(sym_inputs)) or
len(set(g.owner.inputs)) != len(g.owner.inputs))
# if no variable appears multiple times in the
# input of g,
# check that the number of input to the Composite
# Elemwise is ok
if len(set(g.owner.inputs)) == len(g.owner.inputs):
expected_len_sym_inputs = numpy.sum(
[not isinstance(x, theano.gof.Constant)
for x in topo_[0].inputs])
assert expected_len_sym_inputs == len(sym_inputs)
if not out_dtype == out.dtype:
fail4.append((id, out_dtype, out.dtype))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论