Commit 818bdf4b, authored by nouiz

Merge pull request #985 from goodfeli/rebase_gpu_incsub

C code for GpuIncsub, and a new CUDA kernel for one special case
......@@ -232,8 +232,9 @@ def rebuild_collect_shared(outputs,
copy_inputs_over)
cloned_outputs.append(Out(cloned_v, borrow=v.borrow))
else:
raise TypeError('outputs must be theano Variable or '
'Out instances', v)
raise TypeError('Outputs must be theano Variable or '
'Out instances. Received ' + str(v)\
+ ' of type '+str(type(v)))
#computed_list.append(cloned_v)
else:
if isinstance(outputs, Variable):
......
......@@ -589,6 +589,10 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.outputs = node_output_storage
rval.lazy = False
return rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del rval
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
......
......@@ -2175,6 +2175,12 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp))
# C Code shared by GpuSubtensor and GpuIncSubtensor
# Wraps CudaNdarray_set_device_data in a macro that casts the data pointer
# to float*, so the shared Subtensor.helper_c_code template (which passes a
# char* "ptr") can be reused unchanged for CudaNdarray objects.
_define_set_data = """
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""
class GpuSubtensor(GpuOp, tensor.Subtensor):
"""
Implement subtensor on the gpu.
......@@ -2240,10 +2246,10 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
%(fail)s;
}
cnda_mark_dev_structure_dirty(xview);
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
""" % locals()
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
""" % locals()
get_xview = _define_set_data + \
self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
c_prefix='CudaNdarray',
set_data='CudaNdarray_set_device_data2',
......@@ -2251,6 +2257,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
set_stride='CudaNdarray_set_stride',
update_flags="", strides_mul=4)
finish_view = """
//Set the base only now
......@@ -2408,13 +2415,128 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    """
    Implement IncSubtensor on the gpu.

    Note: The optimization to make this inplace is in tensor/opt.
    The same optimization handles IncSubtensor and GpuIncSubtensor.

    This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
    The helper methods like do_type_checking, copy_of_x, etc.
    specialize the c_code for this Op.
    """

    def make_node(self, x, y, *inputs):
        """Build the Apply node with x and y typed for the GPU.

        x: the CudaNdarray variable to increment
        y: the CudaNdarray value to increment by
        inputs: the scalar index variables, as in tensor.IncSubtensor
        """
        assert isinstance(x.type, CudaNdarrayType)
        assert isinstance(y.type, CudaNdarrayType)
        x = as_cuda_ndarray_variable(x)
        y = as_cuda_ndarray_variable(y)
        # Reuse the parent's make_node to validate and canonicalize the
        # index inputs, then rebuild the Apply with the GPU-typed x and y.
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        return Apply(self, [x, y] + rval.inputs[2:], [x.type()])

    def do_type_checking(self, node):
        """Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, CudaNdarrayType):
            raise NotImplementedError()

    def copy_of_x(self, x):
        """
        x: a string giving the name of a C variable pointing to an array

        Returns C code expression to make a copy of x.

        Base class uses PyArrayObject *; this subclass uses
        CudaNdarray * instead.
        """
        return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()

    def make_view_array(self, x, view_ndim):
        """
        x: a string identifying an array to be viewed
        view_ndim: a string specifying the number of dimensions
            to have in the view

        This doesn't need to actually set up the view with the
        right indexing; we'll do that manually later.
        """
        return """CudaNdarray* zview = (CudaNdarray*)
                CudaNdarray_New(%(view_ndim)s)""" % locals()

    def get_helper_c_code_args(self):
        """Return a dictionary of arguments to use with helper_c_code."""
        # Bug fix: the original dict literal listed 'update_flags' twice;
        # duplicate keys in a dict display silently overwrite each other,
        # so only a single entry is kept here.
        return {'c_prefix': 'CudaNdarray',
                'set_data': 'CudaNdarray_set_device_data2',
                'set_dim': 'CudaNdarray_set_dim',
                'set_stride': 'CudaNdarray_set_stride',
                'update_flags': "",
                'strides_mul': 4,
                }

    def copy_into(self, view, source):
        """
        view: string, C code expression for an array
        source: string, C code expression for an array

        returns a C code expression to copy source into view, and
        return 0 on success
        """
        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()

    def define_set_data(self):
        """Return the C macro definition required by the 'set_data'
        helper argument (CudaNdarray_set_device_data2)."""
        return _define_set_data

    def link_view_array(self, x, fail):
        """Return C code that makes zview share device data with x."""
        # Bug fix: the error message previously said "GpuSubtensor",
        # which misattributed failures happening in this Op.
        return """
        if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
                                        (PyObject*) NULL))
        {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuIncSubtensor is not able to set the"
                         " devdata field of the view");
            Py_XDECREF(zview);
            %(fail)s;
        }
        cnda_mark_dev_structure_dirty(zview);
        """ % locals()

    def set_view_base(self, x, fail):
        """Return C code installing x as the base of zview, after
        helper_c_code has finished adjusting the view."""
        # Bug fix: the error message previously said "GpuSubtensor".
        return """
        //Set the base only now
        if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
                                       %(x)s)){
            PyErr_Format(PyExc_RuntimeError,
                         "GpuIncSubtensor is not able to set"
                         " the base of the view array");
            Py_XDECREF(zview);
            %(fail)s;
        }""" % locals()

    def add_to_zview(self, x, fail):
        """Return C code to add x to zview in place; DECREFs zview
        if the add fails."""
        return """
        PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
                                                        (PyObject *) py_%(x)s);

        if (! add_result )
        {
            Py_DECREF(zview);
            %(fail)s;
        }
        else
        {
            Py_DECREF(add_result);
        }
        """ % locals()

    def c_code_cache_version(self):
        """Version tuple for the generated C code, derived from the
        parent's version."""
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if parent_version:
            # Bumped from (0,) to (1,): the error messages emitted by
            # link_view_array/set_view_base changed, so previously cached
            # compiled modules must be invalidated.
            return parent_version + (1,)
        return ()
class GpuFlatten(tensor.Flatten, GpuOp):
"""
......
......@@ -14,6 +14,17 @@
//If true, we fill with NAN allocated device memory.
#define ALLOC_MEMSET 0
//If true, we print out when we free a device pointer, uninitialize a
//CudaNdarray, or allocate a device pointer
#define PRINT_FREE_MALLOC 0
//If true, we do error checking at the start of functions, to make sure there
//is not a pre-existing error when the function is called.
//You probably need to set the environment variable
//CUDA_LAUNCH_BLOCKING=1
//if you want this to work.
#define PRECHECK_ERROR 0
/////////////////////////
// Alloc and Free
/////////////////////////
......@@ -53,36 +64,53 @@ void * device_malloc(size_t size)
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
#if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error allocating %li bytes of device memory (%s). new total bytes allocated: %d\n", (long)size, cudaGetErrorString(err),_allocated_size);
#else
fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
#if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error allocating %li bytes of device memory (%s). new total bytes allocated: %d\n", (long)size, cudaGetErrorString(err),_allocated_size);
#else
fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError,
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL;
}
_outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
_alloc_size_table[i].size=size;
break;
#if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
_alloc_size_table[i].size=size;
break;
}
}
}
_allocated_size += size;
#endif
//fprintf(stderr, "allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n", (long)size, cudaGetErrorString(err),_allocated_size,rval);
_allocated_size += size;
#endif
//fprintf(stderr,
//"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval);
if(ALLOC_MEMSET){
//We init them to nan to make sure we catch more debug case.
cudaMemset(rval, 0xFF, size);
//printf("MEMSET\n");
}
#if PRINT_FREE_MALLOC
fprintf(stderr, "device malloc %p\n",rval);
#endif
return rval;
}
int device_free(void *ptr)
{
#if PRINT_FREE_MALLOC
fprintf(stderr, "device_free %p\n",ptr);
#endif
#if PRECHECK_ERROR
cudaError_t prevError = cudaGetLastError();
if (cudaSuccess != prevError)
{
fprintf(stderr, "Error existed before calling device_free.\n");
}
#endif
// if there is no gpu context, the call to cudaFree will fail; skip it entirely
if(!g_gpu_context_active) {
......@@ -101,33 +129,42 @@ int device_free(void *ptr)
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
#if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error freeing device pointer %p (%s).%d byte already allocated\n", ptr, cudaGetErrorString(err), _allocated_size);
#else
fprintf(stderr, "Error freeing device pointer %p (%s).\n", ptr, cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p (%s)", ptr, cudaGetErrorString(err));
#if COMPUTE_GPU_MEM_USED
fprintf(stderr,
"Error freeing device pointer %p (%s).%d byte already allocated\n",
ptr, cudaGetErrorString(err), _allocated_size);
#else
fprintf(stderr,
"Error freeing device pointer %p (%s).\n",
ptr,
cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError,
"error freeing device pointer %p (%s)",
ptr,
cudaGetErrorString(err));
return -1;
}
_outstanding_mallocs[0] -= (ptr != NULL);
#if COMPUTE_GPU_MEM_USED
int i=0;
size_t total_freed = 0;
for(;i<TABLE_SIZE;i++)
if(_alloc_size_table[i].ptr==ptr){
_allocated_size -= _alloc_size_table[i].size;
total_freed += _alloc_size_table[i].size;
_alloc_size_table[i].ptr=0;
_alloc_size_table[i].size=0;
#if COMPUTE_GPU_MEM_USED
int i=0;
size_t total_freed = 0;
for(;i<TABLE_SIZE;i++)
if(_alloc_size_table[i].ptr==ptr){
_allocated_size -= _alloc_size_table[i].size;
total_freed += _alloc_size_table[i].size;
_alloc_size_table[i].ptr=0;
_alloc_size_table[i].size=0;
break;
}
//if(i==TABLE_SIZE)
// printf("Unallocated unknow size!\n");
//fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
#endif
break;
}
//if(i==TABLE_SIZE)
// printf("Unallocated unknow size!\n");
//fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
#endif
return 0;
}
static PyObject *
outstanding_mallocs(PyObject* self, PyObject * args)
{
......@@ -153,14 +190,17 @@ CudaNdarray_null_init(CudaNdarray*self)
static int
CudaNdarray_uninit(CudaNdarray*self)
{
#if PRINT_FREE_MALLOC
fprintf(stderr, "CudaNdarray_uninit %p\n", self);
#endif
int rval = 0;
if (self->data_allocated) {
assert(self->devdata);
if (device_free(self->devdata))
{
fprintf(stderr,
"!!!! error freeing device memory %p (self=%p)\n",
self->devdata, self);
"CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
self, self->devdata);
rval = -1;
}
self->devdata = NULL;
......@@ -171,7 +211,7 @@ CudaNdarray_uninit(CudaNdarray*self)
if (device_free(self->dev_structure))
{
fprintf(stderr,
"!!!! error freeing dev_structure memory %p (self=%p)\n",
"CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
self->dev_structure, self);
rval = -1;
}
......@@ -1164,6 +1204,7 @@ CudaNdarray_exp(CudaNdarray* self)
return (PyObject*)rval;
}
static PyMethodDef CudaNdarray_methods[] =
{
{"__array__",
......@@ -1740,7 +1781,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
* It returns py_self on success with an additional reference. Else NULL.
*/
// Will be called by __iadd__ in Python
static PyObject *
PyObject *
CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
{
if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
......@@ -2127,7 +2168,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
}
PyObject * intobj = NULL;
if(CudaNdarray_Check(o) && PyArray_Check(value)){
if (CudaNdarray_Check(o) && PyArray_Check(value)){
if (verbose)
fprintf(stderr,
"CudaNdarray_setitem dest is a CudaNdarray and"
......@@ -2137,7 +2178,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
{
return -1;
}
if(CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
{
Py_XDECREF(new_value);
Py_XDECREF(rval);
......@@ -2214,7 +2255,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
PyObject *baseSavedForComparison = rval->base;
if(CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
{
Py_DECREF((PyObject*)rval);
Py_XDECREF(new_value);
......@@ -3190,6 +3231,28 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
}
}
// Copy a strided 4-tensor x into y. N1 through N4 are the size of y.
static __global__ void k_copy_4d(const int N1,
        const int N2, const int N3, const int N4,
        const float * x, const int sx1, const int sx2, const int sx3,
        const int sx4, float * y, const int sy1, const int sy2,
        const int sy3, const int sy4)
{
    // These must be made int instead of unsigned int due to a bug in nvcc
    const int block0 = blockIdx.x;
    const int block1 = blockIdx.y;

    // N1 and N2 are currently unused: one block handles one (dim0, dim1)
    // pair. They are kept so a future implementation can loop over the
    // first two dimensions if there are not enough blocks.
    // Hoist the per-block base offsets out of the loops.
    const float * x_plane = x + block0 * sx1 + block1 * sx2;
    float * y_plane = y + block0 * sy1 + block1 * sy2;

    for (int j = threadIdx.y; j < (int) N4; j += (int) blockDim.y)
    {
        for (int i = threadIdx.x; i < N3; i += (int) blockDim.x)
        {
            y_plane[i * sy3 + j * sy4] = x_plane[i * sx3 + j * sx4];
        }
    }
}
//copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other,
......@@ -3304,6 +3367,61 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
return -1;
}
}; break;
case 4: // 4-tensor
{
if (verbose)
{
if (0 != fprint_CudaNdarray(stderr, other))
{
Py_XDECREF(new_other);
return -1;
}
}
// The blocks implement the looping over the first two axes so
// this needs to be (N1, N2)
dim3 n_blocks( (unsigned int) CudaNdarray_HOST_DIMS(self)[0],
(unsigned int) CudaNdarray_HOST_DIMS(self)[1]);
// For the threads, just make as many as possible
dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
k_copy_4d<<<n_blocks, n_threads>>>(
// size of y
(unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
(unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
(unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
(unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
CudaNdarray_DEV_DATA(other), // x
// x strides
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3],
CudaNdarray_DEV_DATA(self), // y
// y strides
CudaNdarray_HOST_STRIDES(self)[0],
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3]
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"k_copy_4d",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}; break;
default:
{
assert (cudaSuccess == cudaGetLastError());
......@@ -4486,8 +4604,17 @@ PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
}
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"fprint_CudaNdarray was called with an uncleared error",
cudaGetErrorString(err));
return -1;
}
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: ");
......@@ -4526,6 +4653,17 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
fprintf(fd, "\n\tdev_structure not allocated\n");
}
err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"fprint_CudaNdarray",
cudaGetErrorString(err));
return -1;
}
return 0;
}
/*
......
......@@ -75,15 +75,16 @@ struct CudaNdarray
/* Type-specific fields go here. */
//GpuTensorType::VoidTensor * vt;
int nd; //the number of dimensions of the tensor
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES functions
int * host_structure; //dim0, dim1, ... stride0, stride1, ...
int data_allocated; //the number of bytes allocated for devdata
//device pointers (allocated by cudaMalloc)
mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be
//synchronized. The macro will allocate it when needed.
//dev_structure should be accessed via the functions like
//CudaNdarray_DEV_DIMS, otherwise may not be
//synchronized with host_structure. The accessor functions will allocate it when needed.
mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
};
......@@ -118,6 +119,12 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self);
*/
DllExport int cnda_structure_size(int nd);
/*
* This describes the shape of the ndarray. The array
* of dimensions is itself stored on the host.
* If you need to access the dimensions array from inside
* a kernel, use CudaNdarray_DEVICE_DIMS.
*/
DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self);
......@@ -144,7 +151,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
/****
* Set the idx'th dimension to value d.
*
* Updates the log2dim shaddow array.
* Updates the log2dim shadow array.
*
* Does not sync structure to host.
*/
......@@ -188,6 +195,10 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
*/
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
/* CudaNdarray_DEV_DIMS gives the same information as CudaNdarray_HOST_DIMS,
* but stored on the GPU. Use this pointer when it needs to be accessed
* from inside a CUDA kernel.
*/
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
......@@ -389,8 +400,21 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
* Transfer the contents of CudaNdarray `other` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
* TODO: WRITEME: what does "if necessary" mean?
* TODO: we use this to implement set/inc subtensor, where self is a view of
* the original tensor so that we write only to the subtensor. How
* do we ensure that self is not reallocated in this case?
*
* unbroadcast: if true, this means that other is broadcastable in some
* dimensions, and the result, self, is not.
* ie, if unbroadcast=false, we must do the broadcasting
* operation as part of the copy.
* e.g. suppose self and other are 2D matrices and other
* has only one row. Then we need to copy this row several
* times when copying to self.
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
......@@ -437,7 +461,12 @@ DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
PyObject * CudaNdarray_View(const CudaNdarray * self);
PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
#endif
/*
......
......@@ -904,6 +904,12 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
# This is to don't duplicate test.
class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# This prevents nose from printing method docstrings instead of method
# names when listing or running the inherited tests.
def shortDescription(self):
    # unittest/nose call this to get a human-readable test description;
    # returning None makes them fall back to the test method name.
    return None
shared = staticmethod(cuda.shared_constructor)
sub = cuda.GpuSubtensor
inc_sub = cuda.GpuIncSubtensor
......@@ -921,6 +927,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
self).__init__(name)
def test_adv_sub1_fast(self):
"""We check that the special cases of advanced indexing that
use CudaNdarrayTakeFrom are handled correctly
......
......@@ -3761,21 +3761,28 @@ class AdvancedIndexingError(TypeError):
class Subtensor(Op):
"""Return a subtensor view
This class uses a relatively complex internal representation of the inputs
to remember how the input tensor x should be sliced. The instance variable
idx_list is a list whose elements are either integers, or slices. The
integers are indexes into the inputs array, and the start/stop/step members
of each slice are also integer indexes into the inputs array (or None).
The inputs array is the tensor x, followed by scalar integer variables.
TODO: WRITEME: how are the scalar integer variables formatted?
This class uses a relatively complex internal representation of the inputs
to remember how the input tensor x should be sliced.
idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
(old docstring gives two conflicting
descriptions)
elements are either integers, theano scalars, or slices.
one element per "explicitly named dimension"
TODO: WRITEME: what is an "explicitly named dimension" ?
if integer:
indexes into the inputs array
if slice:
start/stop/step members of each slice are integer indices
into the inputs array or None
integer indices be actual integers or theano scalars
@todo: add support for advanced tensor indexing (in Subtensor_dx too).
The idx_list is a tuple similar in structure to the sort of key you might
expect in numpy's basic indexing mode. It has one element for each
explicitly named dimension. In numpy, the elements can be either integers
or slices containing integers and None. In Subtensor, each element can
additionally be a Scalar instance, and slice components can also be Scalar
instances too.
"""
e_invalid = ('The index list is longer (size %d) than the number of '
'dimensions of the tensor(namely %d). You are asking for '
......@@ -3873,6 +3880,10 @@ class Subtensor(Op):
return scal.as_scalar(a)
def make_node(self, x, *inputs):
"""
x: the tensor to take a subtensor of
inputs: a list of theano Scalars
"""
x = as_tensor_variable(x)
inputs = tuple(self.my_as_scalar(a) for a in inputs)
......@@ -4015,22 +4026,67 @@ class Subtensor(Op):
indices.append(str(entry))
return "%s{%s}" % (self.__class__.__name__, ", ".join(indices))
@staticmethod
def default_helper_c_code_args():
    """Build the default keyword arguments for helper_c_code.

    These defaults target numpy (PyArrayObject *) code; GPU
    subclasses replace individual entries to generate CudaNdarray
    code instead.
    """
    flags_code = ("PyArray_UpdateFlags(%(view_name)s,"
                  " NPY_ARRAY_C_CONTIGUOUS|"
                  "NPY_ARRAY_F_CONTIGUOUS);")
    defaults = dict()
    defaults["c_prefix"] = "PyArray"
    defaults["update_flags"] = flags_code
    defaults["set_data"] = "PyArray_set_data"
    defaults["set_dim"] = "PyArray_set_dim"
    defaults["set_stride"] = "PyArray_set_stride"
    defaults["strides_mul"] = 1
    defaults["view_name"] = "xview"
    return defaults
@staticmethod
def helper_c_code(node, name, inputs, outputs, sub, idx_list,
c_prefix="PyArray",
update_flags=("PyArray_UpdateFlags(xview,"
" NPY_ARRAY_C_CONTIGUOUS|"
"NPY_ARRAY_F_CONTIGUOUS);"),
set_data='PyArray_set_data',
set_dim='PyArray_set_dim',
set_stride='PyArray_set_stride',
strides_mul=1,
c_prefix=None,
update_flags=None,
set_data=None,
set_dim=None,
set_stride=None,
strides_mul=None,
view_name=None
):
"""The parameters c_prefix, update_flags, set_data, set_dim,
"""
The parameters c_prefix, update_flags, set_data, set_dim,
set_stride and strides_mul are there to allow reusing this
function on PyArray and CudaNdarray object.
"""
default_args = Subtensor.default_helper_c_code_args()
if update_flags is None:
update_flags = default_args['update_flags']
if set_data is None:
set_data = default_args['set_data']
if set_dim is None:
set_dim = default_args['set_dim']
if set_stride is None:
set_stride = default_args['set_stride']
if strides_mul is None:
strides_mul = default_args['strides_mul']
if c_prefix is None:
c_prefix = default_args['c_prefix']
if view_name is None:
view_name = default_args['view_name']
#update_flags may depend on view_name
update_flags = update_flags % locals()
#
# two arrays are created in C code:
# is_slice: len == ndim, 0 means int, 1 means slice
......@@ -4104,6 +4160,8 @@ class Subtensor(Op):
x, = inputs[:1]
z, = outputs
xview = view_name
rval = """
#define PyArray_set_dim(obj, idx, d) PyArray_DIMS(obj)[idx]=d
#define PyArray_set_stride(obj, idx, d) PyArray_STRIDES(obj)[idx]=d
......@@ -4119,30 +4177,30 @@ class Subtensor(Op):
int inner_ii = 0; // the current dimension of zview
int outer_ii = 0; // current dimension of z
char* ptr = (char*) %(c_prefix)s_BYTES(xview);
char* ptr = (char*) %(c_prefix)s_BYTES(%(xview)s);
if ((%(c_prefix)s_DIMS(xview) == %(c_prefix)s_DIMS(%(x)s))
if ((%(c_prefix)s_DIMS(%(xview)s) == %(c_prefix)s_DIMS(%(x)s))
&& (%(c_prefix)s_DIMS(%(x)s) != NULL))
{
PyErr_Format(PyExc_ValueError, "x and xview"
PyErr_Format(PyExc_ValueError, "x and %(xview)s"
"(with %%d dims) have the same dimensions"
" pointers: %%p and %%p",
%(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_DIMS(xview),
%(c_prefix)s_DIMS(%(xview)s),
%(c_prefix)s_DIMS(%(x)s));
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
if (%(c_prefix)s_STRIDES(xview) == %(c_prefix)s_STRIDES(%(x)s)
if (%(c_prefix)s_STRIDES(%(xview)s) == %(c_prefix)s_STRIDES(%(x)s)
&& (%(c_prefix)s_DIMS(%(x)s) != NULL))
{
PyErr_Format(PyExc_ValueError, "x and xview"
PyErr_Format(PyExc_ValueError, "x and %(xview)s"
"(with %%d dims) have the same strides"
" pointers: %%p and %%p",
%(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_STRIDES(xview),
%(c_prefix)s_STRIDES(%(xview)s),
%(c_prefix)s_STRIDES(%(x)s));
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
......@@ -4164,10 +4222,10 @@ class Subtensor(Op):
// PySlice_GetIndicesEx in python source
if (!step)
{
Py_DECREF(xview);
Py_DECREF(%(xview)s);
PyErr_Format(PyExc_ValueError,
"slice step cannot be zero");
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
......@@ -4218,8 +4276,8 @@ class Subtensor(Op):
ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
%(strides_mul)s;
%(set_dim)s(xview, inner_ii, slicelength);
%(set_stride)s(xview, inner_ii,
%(set_dim)s(%(xview)s, inner_ii, slicelength);
%(set_stride)s(%(xview)s, inner_ii,
%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step);
inner_ii += 1;
......@@ -4239,27 +4297,27 @@ class Subtensor(Op):
else
{
PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
}
else
{
PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview);
Py_XDECREF(%(xview)s);
%(fail)s;
}
spec_pos += 1;
}
}
%(set_data)s(xview, ptr, (PyObject*)NULL);
assert (inner_ii <= %(c_prefix)s_NDIM(xview));
while (inner_ii < %(c_prefix)s_NDIM(xview))
%(set_data)s(%(xview)s, ptr, (PyObject*)NULL);
assert (inner_ii <= %(c_prefix)s_NDIM(%(xview)s));
while (inner_ii < %(c_prefix)s_NDIM(%(xview)s))
{
assert (outer_ii < %(c_prefix)s_NDIM(%(x)s));
%(set_dim)s(xview, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
%(set_stride)s(xview, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
%(set_dim)s(%(xview)s, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
%(set_stride)s(%(xview)s, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
inner_ii += 1;
outer_ii += 1;
}
......@@ -4454,7 +4512,6 @@ def inc_subtensor(x, y, inplace=False, set_instead_of_inc=False,
else:
raise TypeError('x must be result of a subtensor operation')
class IncSubtensor(Op):
"""Increment a subtensor.
......@@ -4524,6 +4581,11 @@ class IncSubtensor(Op):
", ".join(indices))
def make_node(self, x, y, *inputs):
"""
x: the tensor to increment
y: the value to increment by
inputs: TODO WRITEME
"""
x, y = map(as_tensor_variable, [x, y])
if y.ndim > x.ndim:
raise ValueError(("Trying to increment a %d-dimensional "
......@@ -4602,9 +4664,16 @@ class IncSubtensor(Op):
x.__setitem__(cdata, y)
out[0] = x
def c_code(self, node, name, inputs, outputs, sub): # DEBUG
if not isinstance(node.inputs[0].type, TensorType):
raise NotImplementedError()
def c_code(self, node, name, inputs, outputs, sub):
# This method delegates much of the work to helper
# methods. This method implements the main logic
# but subclasses may override the helper methods
# to change the particulars, e.g. GpuIncSubtensor
# turns the view/copy operations on numpy arrays
# into the same operations on cuda arrays.
self.do_type_checking(node)
if self.inplace: # convert bool to int
inplace = 1
......@@ -4621,12 +4690,15 @@ class IncSubtensor(Op):
view_ndim = (node.inputs[0].ndim -
numpy.sum([not isinstance(idx, slice)
for idx in self.idx_list]))
copy_of_x = self.copy_of_x(x)
copy_input_if_necessary = """
if (%(inplace)s)
{
if (%(x)s != %(z)s)
{
if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s);
Py_INCREF(%(x)s);
%(z)s = %(x)s;
}
......@@ -4634,69 +4706,75 @@ class IncSubtensor(Op):
else
{
if (%(z)s) Py_DECREF(%(z)s);
%(z)s = (PyArrayObject*)PyArray_FromAny(py_%(x)s, NULL, 0, 0,
NPY_ARRAY_ENSURECOPY, NULL);
%(z)s = %(copy_of_x)s;
}
""" % locals()
alloc_zview = self.make_view_array(z, view_ndim)
# On GPU, it takes two steps to make a view
link_zview = self.link_view_array(z, fail);
#Make a first view on the output, as we will write into it.
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
Py_INCREF(PyArray_DESCR(%(z)s));
PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
&PyArray_Type,
PyArray_DESCR(%(z)s),
%(view_ndim)s,
PyArray_DIMS(%(z)s),
PyArray_STRIDES(%(z)s),
PyArray_DATA(%(z)s),
%(z)s->flags,
NULL);
if (!xview)
%(alloc_zview)s;
if (!zview)
{
%(fail)s;
}
%(link_zview)s;
""" % locals()
# make xview actually a view of %(z)s
get_xview = Subtensor.helper_c_code(node, name,
outputs[:1] + inputs[2:],
outputs, sub, self.idx_list)
# make zview actually a view of %(z)s
helper_args = self.get_helper_c_code_args()
helper_args['view_name'] = 'zview'
get_zview = self.define_set_data() + \
Subtensor.helper_c_code(
node=node,
name=name,
inputs=outputs[:1] + inputs[2:],
outputs=outputs,
sub=sub,
idx_list=self.idx_list,
** helper_args
)
copy_into = self.copy_into("zview", y)
add_to_zview = self.add_to_zview(y, fail)
make_modification = """
if (%(op_is_set)s)
{
if (PyArray_CopyInto(xview, %(y)s)) // does broadcasting
if (%(copy_into)s) // does broadcasting
{
Py_DECREF(xview);
Py_DECREF(zview);
%(fail)s;
}
}
else
{
PyArrayObject * add_rval = (PyArrayObject*)PyNumber_InPlaceAdd(
(PyObject*)xview, py_%(y)s);
if (add_rval)
{
assert (PyArray_Check((PyObject*)add_rval));
assert (PyArray_DATA(add_rval) == PyArray_DATA(xview));
Py_DECREF(add_rval);
}
else
{
Py_DECREF(xview);
%(fail)s;
}
%(add_to_zview)s
}
""" % locals()
return (copy_input_if_necessary
+ build_view
+ "{" + get_xview + "}"
+ "{" + get_zview + "}"
+ make_modification
+ "Py_DECREF(xview);"
+ "Py_DECREF(zview);"
)
def do_type_checking(self, node):
    """Raise NotImplementedError unless c_code supports the node's types.

    The base implementation only accepts a plain TensorType first
    input; subclasses (e.g. the GPU version) override this check.
    """
    input_type = node.inputs[0].type
    if not isinstance(input_type, TensorType):
        raise NotImplementedError()
def c_code_cache_version(self):
hv = Subtensor.helper_c_code_cache_version()
if hv:
......@@ -4704,6 +4782,99 @@ class IncSubtensor(Op):
else:
return ()
def copy_of_x(self, x):
    """Return a C expression making a fresh copy of ``x``.

    x: a string giving the name of a C variable pointing to an array

    Base class uses PyArrayObject *; subclasses may override for
    different types of arrays (e.g. CudaNdarray in GpuIncSubtensor).
    """
    # Parameters of PyArray_FromAny are:
    # array
    # dtype: we pass NULL to say any dtype is acceptable, so the existing
    #        dtype will be copied
    # min_depth: we pass 0 to have this parameter ignored
    # max_depth: we pass 0 to have this parameter ignored
    # requirements: here we pass NPY_ARRAY_ENSURECOPY to force a copy
    # context: this is almost always NULL, I'm not sure what it's used for
    return """(PyArrayObject*)PyArray_FromAny(py_%(x)s, NULL, 0, 0,
        NPY_ARRAY_ENSURECOPY, NULL)""" % locals()
def make_view_array(self, x, view_ndim):
    """Return C code declaring ``zview``, a new array viewing x's data.

    x: a string identifying an array to be viewed
    view_ndim: a string specifying the number of dimensions
        to have in the view

    This doesn't need to actually set up the view with the
    right indexing; we'll do that manually later (helper_c_code
    fills in zview's dims and strides).
    """
    # The new array borrows x's descr and initially shares its dims,
    # strides and data pointer, i.e. it starts as a full view of x.
    return """Py_INCREF(PyArray_DESCR(%(x)s));
        PyArrayObject * zview =
            (PyArrayObject*)PyArray_NewFromDescr(
                &PyArray_Type,
                PyArray_DESCR(%(x)s),
                %(view_ndim)s,
                PyArray_DIMS(%(x)s),
                PyArray_STRIDES(%(x)s),
                PyArray_DATA(%(x)s),
                %(x)s->flags,
                NULL)""" % locals()
def get_helper_c_code_args(self):
    """Return the keyword arguments to pass to helper_c_code."""
    # The CPU implementation simply uses the PyArray defaults.
    return Subtensor.default_helper_c_code_args()
def copy_into(self, view, source):
    """Return a C expression copying ``source`` into ``view``.

    view: string, C code expression for an array
    source: string, C code expression for an array

    returns a C code expression to copy source into view, and
    return 0 on success; PyArray_CopyInto also performs broadcasting.
    """
    return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def define_set_data(self):
    """Return C code defining any macros used by the set_data
    argument of helper_c_code.

    Nothing is needed for numpy arrays; GPU subclasses override this.
    """
    return ''
def link_view_array(self, x, fail):
    """Return C code completing the setup of zview as a view of x.

    On the CPU, make_view_array already produced a full view, so no
    extra code is needed here; GPU subclasses override this.
    """
    return ''
def set_view_base(self, x, fail):
    """Return C code making zview a correct view of x after
    helper_c_code has finished adjusting it.

    Nothing to do on the CPU; GPU subclasses override this.
    """
    return ''
def add_to_zview(self, x, fail):
    """Return C code to add x to zview in place. Should DECREF zview
    if the add fails.
    """
    # The asserts check that the in-place add really wrote into zview's
    # buffer rather than allocating a new array.
    return """
        PyArrayObject * add_rval = (PyArrayObject*)PyNumber_InPlaceAdd(
            (PyObject*)zview, py_%(x)s);
        if (add_rval)
        {
            assert (PyArray_Check((PyObject*)add_rval));
            assert (PyArray_DATA(add_rval) == PyArray_DATA(zview));
            Py_DECREF(add_rval);
        }
        else
        {
            Py_DECREF(zview);
            %(fail)s;
        }""" % locals()
def infer_shape(self, node, shapes):
    # IncSubtensor writes into (a copy of) x, so the output shape is
    # exactly the shape of the first input.
    return [shapes[0]]
......
......@@ -1056,7 +1056,8 @@ class test_fusion(unittest.TestCase):
if gpu:
import theano.sandbox.cuda as cuda
topo_ = [x for x in topo if not isinstance(
x.op,cuda.basic_ops.GpuFromHost) and not isinstance(x.op,cuda.basic_ops.HostFromGpu)]
x.op, (cuda.basic_ops.GpuFromHost, cuda.basic_ops.HostFromGpu))]
gpu_ = [x for x in topo if isinstance(x.op,
cuda.basic_ops.GpuFromHost)]
if not len(gpu_) == len(sym_inputs):
......@@ -1067,13 +1068,16 @@ class test_fusion(unittest.TestCase):
if not len(topo_) == nb_elemwise:
fail3.append((id, topo_, nb_elemwise))
if nb_elemwise == 1:
# check that the number of input to the Composite Elemwise is ok
# when there is not variable that appear multiple time the in input
# of g
assert ((numpy.sum([not isinstance(x, theano.gof.Constant)
for x in topo_[0].inputs]) ==
len(sym_inputs)) or
len(set(g.owner.inputs)) != len(g.owner.inputs))
# if no variable appears multiple times in the
# input of g,
# check that the number of input to the Composite
# Elemwise is ok
if len(set(g.owner.inputs)) == len(g.owner.inputs):
expected_len_sym_inputs = numpy.sum(
[not isinstance(x, theano.gof.Constant)
for x in topo_[0].inputs])
assert expected_len_sym_inputs == len(sym_inputs)
if not out_dtype == out.dtype:
fail4.append((id, out_dtype, out.dtype))
......
Markdown formatting supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment