Commit 818bdf4b authored by nouiz

Merge pull request #985 from goodfeli/rebase_gpu_incsub

C code for GpuIncsub, and a new CUDA kernel for one special case
...@@ -232,8 +232,9 @@ def rebuild_collect_shared(outputs, ...@@ -232,8 +232,9 @@ def rebuild_collect_shared(outputs,
copy_inputs_over) copy_inputs_over)
cloned_outputs.append(Out(cloned_v, borrow=v.borrow)) cloned_outputs.append(Out(cloned_v, borrow=v.borrow))
else: else:
raise TypeError('outputs must be theano Variable or ' raise TypeError('Outputs must be theano Variable or '
'Out instances', v) 'Out instances. Received ' + str(v)\
+ ' of type '+str(type(v)))
#computed_list.append(cloned_v) #computed_list.append(cloned_v)
else: else:
if isinstance(outputs, Variable): if isinstance(outputs, Variable):
......
...@@ -589,6 +589,10 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -589,6 +589,10 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.outputs = node_output_storage rval.outputs = node_output_storage
rval.lazy = False rval.lazy = False
return rval return rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del rval
except (NotImplementedError, utils.MethodNotDefined): except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform') logger.debug('Falling back on perform')
......
...@@ -2175,6 +2175,12 @@ class GpuReshape(tensor.Reshape, GpuOp): ...@@ -2175,6 +2175,12 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
# C Code shared by GpuSubtensor and GpuIncSubtensor.
# Wraps CudaNdarray_set_device_data in a macro that casts the data
# pointer to float*, so the generic Subtensor helper C code (which works
# with char*/byte pointers) can be reused for CudaNdarray objects.
_define_set_data = """
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""
class GpuSubtensor(GpuOp, tensor.Subtensor): class GpuSubtensor(GpuOp, tensor.Subtensor):
""" """
Implement subtensor on the gpu. Implement subtensor on the gpu.
...@@ -2240,10 +2246,10 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2240,10 +2246,10 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
%(fail)s; %(fail)s;
} }
cnda_mark_dev_structure_dirty(xview); cnda_mark_dev_structure_dirty(xview);
#define CudaNdarray_set_device_data2(obj, ptr, base) \ """ % locals()
CudaNdarray_set_device_data(obj, (float *)ptr, base)
""" % locals() get_xview = _define_set_data + \
get_xview = self.helper_c_code(node, name, inputs, outputs, sub, self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list, self.idx_list,
c_prefix='CudaNdarray', c_prefix='CudaNdarray',
set_data='CudaNdarray_set_device_data2', set_data='CudaNdarray_set_device_data2',
...@@ -2251,6 +2257,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2251,6 +2257,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
set_stride='CudaNdarray_set_stride', set_stride='CudaNdarray_set_stride',
update_flags="", strides_mul=4) update_flags="", strides_mul=4)
finish_view = """ finish_view = """
//Set the base only now //Set the base only now
...@@ -2408,13 +2415,128 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2408,13 +2415,128 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
Note: The optimization to make this inplace is in tensor/opt.
The same optimization handles IncSubtensor and GpuIncSubtensor.
This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
""" """
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
assert isinstance(x.type, CudaNdarrayType) x = as_cuda_ndarray_variable(x)
assert isinstance(y.type, CudaNdarrayType) y = as_cuda_ndarray_variable(y)
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs) rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
return Apply(self, [x, y] + rval.inputs[2:], [x.type()]) return Apply(self, [x, y] + rval.inputs[2:], [x.type()])
def do_type_checking(self, node):
    """Raise NotImplementedError if c_code does not support
    the types involved in this node.

    Only nodes whose first input is a CudaNdarrayType are supported.
    """
    if isinstance(node.inputs[0].type, CudaNdarrayType):
        return
    raise NotImplementedError()
def copy_of_x(self, x):
    """
    x: a string giving the name of a C variable pointing to an array

    Returns a C code expression that makes a copy of x.

    The base class version works on PyArrayObject*; this override
    produces the CudaNdarray equivalent.
    """
    return "(CudaNdarray*) CudaNdarray_Copy(%s)" % x
def make_view_array(self, x, view_ndim):
    """
    x: a string identifying an array to be viewed
    view_ndim: a string specifying the number of dimensions
    to have in the view

    This doesn't need to actually set up the view with the
    right indexing; we'll do that manually later.
    """
    # NOTE(review): the emitted statement has no trailing ';' --
    # presumably the caller's C template supplies it; confirm.
    return """CudaNdarray* zview = (CudaNdarray*)
        CudaNdarray_New(%(view_ndim)s)""" % locals()
def get_helper_c_code_args(self):
    """Return a dictionary of arguments to use with helper_c_code.

    These settings make the generic Subtensor C-code helper operate on
    CudaNdarray objects instead of PyArrayObject ones.

    Fix: the original dict listed the 'update_flags' key twice; the
    duplicate has been removed (both occurrences had the same value,
    so behavior is unchanged).
    """
    return {
        # No flag update is needed for CudaNdarray views.
        'update_flags': "",
        'c_prefix': 'CudaNdarray',
        # Wrapper macro defined by the module-level _define_set_data
        # snippet (casts the data pointer to float*).
        'set_data': 'CudaNdarray_set_device_data2',
        'set_dim': 'CudaNdarray_set_dim',
        'set_stride': 'CudaNdarray_set_stride',
        # 4 == sizeof(float); presumably converts element strides to
        # byte strides -- confirm against Subtensor.helper_c_code.
        'strides_mul': 4,
    }
def copy_into(self, view, source):
    """
    view: string, C code expression for an array
    source: string, C code expression for an array

    Returns a C code expression that copies source into view and
    evaluates to 0 on success.
    """
    return "CudaNdarray_CopyFromCudaNdarray(%s, %s)" % (view, source)
def define_set_data(self):
    """Return the shared C snippet (module-level _define_set_data)
    that #defines CudaNdarray_set_device_data2, the 'set_data' hook
    named in get_helper_c_code_args."""
    return _define_set_data
def link_view_array(self, x, fail):
    """
    x: string, the C name of the CudaNdarray to be viewed.
    fail: string, C code to run on failure (Theano's %(fail)s).

    Returns C code that points zview's device data at x's data
    (with no base object yet) and marks zview's device structure
    dirty.  On failure it decrefs zview and runs the fail code.
    """
    # NOTE(review): the error message says "GpuSubtensor" although this
    # is GpuIncSubtensor -- presumably copied from GpuSubtensor's code.
    return """
    if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
                                    (PyObject*) NULL))
    {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuSubtensor is not able to set the"
                     " devdata field of the view");
        Py_XDECREF(zview);
        %(fail)s;
    }
    cnda_mark_dev_structure_dirty(zview);
    """ % locals()
def set_view_base(self, x, fail):
    """
    x: string, the C name of the CudaNdarray the view was taken from.
    fail: string, C code to run on failure (Theano's %(fail)s).

    Returns C code that installs x as the base of zview (so x stays
    alive as the owner of the viewed memory) while keeping zview's
    current data pointer.
    """
    # NOTE(review): the error message says "GpuSubtensor" although this
    # is GpuIncSubtensor -- presumably copied from GpuSubtensor's code.
    return """
    //Set the base only now
    if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
                                   %(x)s)){
        PyErr_Format(PyExc_RuntimeError,
                     "GpuSubtensor is not able to set"
                     " the base of the view array");
        Py_XDECREF(zview);
        %(fail)s;
    }""" % locals()
def add_to_zview(self, x, fail):
    """
    x: string, base name of the Theano input variable to add; the
       generated code reads the PyObject* named py_<x>.
    fail: string, C code to run on failure (Theano's %(fail)s).

    Returns C code that adds py_<x> into zview in place (via
    CudaNdarray_inplace_add) and releases the temporary result.
    """
    # NOTE(review): uses Py_DECREF(zview) on failure while the sibling
    # helpers use Py_XDECREF; zview is presumably known non-NULL here.
    return """
    PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
                                                    (PyObject *) py_%(x)s);
    if (! add_result )
    {
        Py_DECREF(zview);
        %(fail)s;
    }
    else
    {
        Py_DECREF(add_result);
    }
    """ % locals()
def c_code_cache_version(self):
    """Extend the parent Op's C-code cache version with a local
    sub-version, or return () (meaning "do not cache") when the
    parent is unversioned."""
    parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
    return parent_version + (0,) if parent_version else ()
class GpuFlatten(tensor.Flatten, GpuOp): class GpuFlatten(tensor.Flatten, GpuOp):
""" """
......
...@@ -14,6 +14,17 @@ ...@@ -14,6 +14,17 @@
//If true, we fill with NAN allocated device memory. //If true, we fill with NAN allocated device memory.
#define ALLOC_MEMSET 0 #define ALLOC_MEMSET 0
//If true, we print out when we free a device pointer, uninitialize a
//CudaNdarray, or allocate a device pointer
#define PRINT_FREE_MALLOC 0
//If true, we do error checking at the start of functions, to make sure there
//is not a pre-existing error when the function is called.
//You probably need to set the environment variable
//CUDA_LAUNCH_BLOCKING=1
//if you want this to work.
#define PRECHECK_ERROR 0
///////////////////////// /////////////////////////
// Alloc and Free // Alloc and Free
///////////////////////// /////////////////////////
...@@ -53,36 +64,53 @@ void * device_malloc(size_t size) ...@@ -53,36 +64,53 @@ void * device_malloc(size_t size)
// it returns something else I still don't see why we should ignore // it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag. // it. All we want to do here is reset the flag.
cudaGetLastError(); cudaGetLastError();
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error allocating %li bytes of device memory (%s). new total bytes allocated: %d\n", (long)size, cudaGetErrorString(err),_allocated_size); fprintf(stderr, "Error allocating %li bytes of device memory (%s). new total bytes allocated: %d\n", (long)size, cudaGetErrorString(err),_allocated_size);
#else #else
fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err)); fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err));
#endif #endif
PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err)); PyErr_Format(PyExc_MemoryError,
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL; return NULL;
} }
_outstanding_mallocs[0] += (rval != NULL); _outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){ for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){ if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval; _alloc_size_table[i].ptr=rval;
_alloc_size_table[i].size=size; _alloc_size_table[i].size=size;
break; break;
}
} }
} _allocated_size += size;
_allocated_size += size; #endif
#endif //fprintf(stderr,
//fprintf(stderr, "allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n", (long)size, cudaGetErrorString(err),_allocated_size,rval); //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval);
if(ALLOC_MEMSET){ if(ALLOC_MEMSET){
//We init them to nan to make sure we catch more debug case. //We init them to nan to make sure we catch more debug case.
cudaMemset(rval, 0xFF, size); cudaMemset(rval, 0xFF, size);
//printf("MEMSET\n"); //printf("MEMSET\n");
} }
#if PRINT_FREE_MALLOC
fprintf(stderr, "device malloc %p\n",rval);
#endif
return rval; return rval;
} }
int device_free(void *ptr) int device_free(void *ptr)
{ {
#if PRINT_FREE_MALLOC
fprintf(stderr, "device_free %p\n",ptr);
#endif
#if PRECHECK_ERROR
cudaError_t prevError = cudaGetLastError();
if (cudaSuccess != prevError)
{
fprintf(stderr, "Error existed before calling device_free.\n");
}
#endif
// if there is no gpu context, the call to cudaFree will fail; skip it entirely // if there is no gpu context, the call to cudaFree will fail; skip it entirely
if(!g_gpu_context_active) { if(!g_gpu_context_active) {
...@@ -101,33 +129,42 @@ int device_free(void *ptr) ...@@ -101,33 +129,42 @@ int device_free(void *ptr)
// it returns something else I still don't see why we should ignore // it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag. // it. All we want to do here is reset the flag.
cudaGetLastError(); cudaGetLastError();
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
fprintf(stderr, "Error freeing device pointer %p (%s).%d byte already allocated\n", ptr, cudaGetErrorString(err), _allocated_size); fprintf(stderr,
#else "Error freeing device pointer %p (%s).%d byte already allocated\n",
fprintf(stderr, "Error freeing device pointer %p (%s).\n", ptr, cudaGetErrorString(err)); ptr, cudaGetErrorString(err), _allocated_size);
#endif #else
PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p (%s)", ptr, cudaGetErrorString(err)); fprintf(stderr,
"Error freeing device pointer %p (%s).\n",
ptr,
cudaGetErrorString(err));
#endif
PyErr_Format(PyExc_MemoryError,
"error freeing device pointer %p (%s)",
ptr,
cudaGetErrorString(err));
return -1; return -1;
} }
_outstanding_mallocs[0] -= (ptr != NULL); _outstanding_mallocs[0] -= (ptr != NULL);
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
int i=0; int i=0;
size_t total_freed = 0; size_t total_freed = 0;
for(;i<TABLE_SIZE;i++) for(;i<TABLE_SIZE;i++)
if(_alloc_size_table[i].ptr==ptr){ if(_alloc_size_table[i].ptr==ptr){
_allocated_size -= _alloc_size_table[i].size; _allocated_size -= _alloc_size_table[i].size;
total_freed += _alloc_size_table[i].size; total_freed += _alloc_size_table[i].size;
_alloc_size_table[i].ptr=0; _alloc_size_table[i].ptr=0;
_alloc_size_table[i].size=0; _alloc_size_table[i].size=0;
break; break;
} }
//if(i==TABLE_SIZE) //if(i==TABLE_SIZE)
// printf("Unallocated unknow size!\n"); // printf("Unallocated unknow size!\n");
//fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr); //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
#endif #endif
return 0; return 0;
} }
static PyObject * static PyObject *
outstanding_mallocs(PyObject* self, PyObject * args) outstanding_mallocs(PyObject* self, PyObject * args)
{ {
...@@ -153,14 +190,17 @@ CudaNdarray_null_init(CudaNdarray*self) ...@@ -153,14 +190,17 @@ CudaNdarray_null_init(CudaNdarray*self)
static int static int
CudaNdarray_uninit(CudaNdarray*self) CudaNdarray_uninit(CudaNdarray*self)
{ {
#if PRINT_FREE_MALLOC
fprintf(stderr, "CudaNdarray_uninit %p\n", self);
#endif
int rval = 0; int rval = 0;
if (self->data_allocated) { if (self->data_allocated) {
assert(self->devdata); assert(self->devdata);
if (device_free(self->devdata)) if (device_free(self->devdata))
{ {
fprintf(stderr, fprintf(stderr,
"!!!! error freeing device memory %p (self=%p)\n", "CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
self->devdata, self); self, self->devdata);
rval = -1; rval = -1;
} }
self->devdata = NULL; self->devdata = NULL;
...@@ -171,7 +211,7 @@ CudaNdarray_uninit(CudaNdarray*self) ...@@ -171,7 +211,7 @@ CudaNdarray_uninit(CudaNdarray*self)
if (device_free(self->dev_structure)) if (device_free(self->dev_structure))
{ {
fprintf(stderr, fprintf(stderr,
"!!!! error freeing dev_structure memory %p (self=%p)\n", "CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
self->dev_structure, self); self->dev_structure, self);
rval = -1; rval = -1;
} }
...@@ -1164,6 +1204,7 @@ CudaNdarray_exp(CudaNdarray* self) ...@@ -1164,6 +1204,7 @@ CudaNdarray_exp(CudaNdarray* self)
return (PyObject*)rval; return (PyObject*)rval;
} }
static PyMethodDef CudaNdarray_methods[] = static PyMethodDef CudaNdarray_methods[] =
{ {
{"__array__", {"__array__",
...@@ -1740,7 +1781,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1740,7 +1781,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
* It returns py_self on success with an additional reference. Else NULL. * It returns py_self on success with an additional reference. Else NULL.
*/ */
// Will be called by __iadd__ in Python // Will be called by __iadd__ in Python
static PyObject * PyObject *
CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
{ {
if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD)) if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
...@@ -2127,7 +2168,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -2127,7 +2168,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
} }
PyObject * intobj = NULL; PyObject * intobj = NULL;
if(CudaNdarray_Check(o) && PyArray_Check(value)){ if (CudaNdarray_Check(o) && PyArray_Check(value)){
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"CudaNdarray_setitem dest is a CudaNdarray and" "CudaNdarray_setitem dest is a CudaNdarray and"
...@@ -2137,7 +2178,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -2137,7 +2178,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
{ {
return -1; return -1;
} }
if(CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value)) if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
{ {
Py_XDECREF(new_value); Py_XDECREF(new_value);
Py_XDECREF(rval); Py_XDECREF(rval);
...@@ -2214,7 +2255,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -2214,7 +2255,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
PyObject *baseSavedForComparison = rval->base; PyObject *baseSavedForComparison = rval->base;
if(CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true)) if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
{ {
Py_DECREF((PyObject*)rval); Py_DECREF((PyObject*)rval);
Py_XDECREF(new_value); Py_XDECREF(new_value);
...@@ -3190,6 +3231,28 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo ...@@ -3190,6 +3231,28 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
} }
} }
// N1 through N4 are the size of y.
// Copies a strided 4-D float array x into y.  The launch grid assigns
// one block per (dim0, dim1) index pair; within a block, the threads
// cooperatively cover the last two dimensions.
// Strides are applied directly to float* indices, so they are element
// counts, not byte offsets.
static __global__ void k_copy_4d(const int N1,
        const int N2, const int N3, const int N4,
        const float * x, const int sx1, const int sx2, const int sx3,
        const int sx4, float * y, const int sy1, const int sy2,
        const int sy3, const int sy4)
{
    // These must be made int instead of unsigned int due to a bug in nvcc
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // N1 and N2 are kept in case a future implementation needs to
    // loop on the first two dimensions if there are not enough blocks
    for (int j = threadIdx.y; j < (int) N4; j += (int) blockDim.y)
    {
        for (int i = threadIdx.x; i < N3; i += (int) blockDim.x)
        {
            y[bx * sy1 + by * sy2 + i * sy3 + j * sy4] =
                x[bx * sx1 + by * sx2 + i * sx3 + j * sx4];
        }
    }
}
//copy from other into self //copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other, const CudaNdarray * other,
...@@ -3304,6 +3367,61 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, ...@@ -3304,6 +3367,61 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
return -1; return -1;
} }
}; break; }; break;
case 4: // 4-tensor
{
if (verbose)
{
if (0 != fprint_CudaNdarray(stderr, other))
{
Py_XDECREF(new_other);
return -1;
}
}
// The blocks implement the looping over the first two axes so
// this needs to be (N1, N2)
dim3 n_blocks( (unsigned int) CudaNdarray_HOST_DIMS(self)[0],
(unsigned int) CudaNdarray_HOST_DIMS(self)[1]);
// For the threads, just make as many as possible
dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
k_copy_4d<<<n_blocks, n_threads>>>(
// size of y
(unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
(unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
(unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
(unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
CudaNdarray_DEV_DATA(other), // x
// x strides
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3],
CudaNdarray_DEV_DATA(self), // y
// y strides
CudaNdarray_HOST_STRIDES(self)[0],
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3]
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"k_copy_4d",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}; break;
default: default:
{ {
assert (cudaSuccess == cudaGetLastError()); assert (cudaSuccess == cudaGetLastError());
...@@ -4486,8 +4604,17 @@ PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self) ...@@ -4486,8 +4604,17 @@ PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
return PyBool_FromLong(CudaNdarray_is_c_contiguous(self)); return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
} }
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{ {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"fprint_CudaNdarray was called with an uncleared error",
cudaGetErrorString(err));
return -1;
}
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n", fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated); self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: "); fprintf(fd, "\tHOST_DIMS: ");
...@@ -4526,6 +4653,17 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -4526,6 +4653,17 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{ {
fprintf(fd, "\n\tdev_structure not allocated\n"); fprintf(fd, "\n\tdev_structure not allocated\n");
} }
err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s.",
"fprint_CudaNdarray",
cudaGetErrorString(err));
return -1;
}
return 0;
} }
/* /*
......
...@@ -75,15 +75,16 @@ struct CudaNdarray ...@@ -75,15 +75,16 @@ struct CudaNdarray
/* Type-specific fields go here. */ /* Type-specific fields go here. */
//GpuTensorType::VoidTensor * vt; //GpuTensorType::VoidTensor * vt;
int nd; //the number of dimensions of the tensor int nd; //the number of dimensions of the tensor
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros // Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES functions
int * host_structure; //dim0, dim1, ... stride0, stride1, ... int * host_structure; //dim0, dim1, ... stride0, stride1, ...
int data_allocated; //the number of bytes allocated for devdata int data_allocated; //the number of bytes allocated for devdata
//device pointers (allocated by cudaMalloc) //device pointers (allocated by cudaMalloc)
mutable int dev_structure_fresh; mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be //dev_structure should be accessed via the functions like
//synchronized. The macro will allocate it when needed. //CudaNdarray_DEV_DIMS, otherwise may not be
//synchronized with host_structure. The accessor functions will allocate it when needed.
mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ... mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
}; };
...@@ -118,6 +119,12 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self); ...@@ -118,6 +119,12 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self);
*/ */
DllExport int cnda_structure_size(int nd); DllExport int cnda_structure_size(int nd);
/*
* This describes the shape of the ndarray. The array
* of dimensions is itself stored on the host.
* If you need to access the dimensions array from inside
 * a kernel, use CudaNdarray_DEV_DIMS.
*/
DllExport const int * DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self); CudaNdarray_HOST_DIMS(const CudaNdarray * self);
...@@ -144,7 +151,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2); ...@@ -144,7 +151,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
/**** /****
* Set the idx'th dimension to value d. * Set the idx'th dimension to value d.
* *
* Updates the log2dim shaddow array. * Updates the log2dim shadow array.
* *
* Does not sync structure to host. * Does not sync structure to host.
*/ */
...@@ -188,6 +195,10 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s) ...@@ -188,6 +195,10 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
*/ */
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self); DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
/* CudaNdarray_DEV_DIMS gives the same information as CudaNdarray_HOST_DIMS,
* but stored on the GPU. Use this pointer when it needs to be accessed
* from inside a CUDA kernel.
*/
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self); DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self); DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self); DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
...@@ -389,8 +400,21 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj); ...@@ -389,8 +400,21 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
* Transfer the contents of CudaNdarray `other` to `self`. * Transfer the contents of CudaNdarray `other` to `self`.
* *
* self is reallocated to have the correct dimensions if necessary. * self is reallocated to have the correct dimensions if necessary.
* TODO: WRITEME: what does "if necessary" mean?
* TODO: we use this to implement set/inc subtensor, where self is a view of
* the original tensor so that we write only to the subtensor. How
* do we ensure that self is not reallocated in this case?
*
* unbroadcast: if true, this means that other is broadcastable in some
* dimensions, and the result, self, is not.
 * i.e., if unbroadcast=false, we must do the broadcasting
* operation as part of the copy.
* e.g. suppose self and other are 2D matrices and other
* has only one row. Then we need to copy this row several
* times when copying to self.
*/ */
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false); DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other, bool unbroadcast = false);
/** /**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray. * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
...@@ -437,7 +461,12 @@ DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const ...@@ -437,7 +461,12 @@ DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const
DllExport PyObject* DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args); CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
PyObject * CudaNdarray_View(const CudaNdarray * self);
PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
#endif #endif
/* /*
......
...@@ -904,6 +904,12 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split): ...@@ -904,6 +904,12 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
# This is to don't duplicate test. # This is to don't duplicate test.
class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def shortDescription(self):
    """Always return None so that nose prints method names instead
    of the (inherited) method docstrings when running the tests."""
    return None
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
sub = cuda.GpuSubtensor sub = cuda.GpuSubtensor
inc_sub = cuda.GpuIncSubtensor inc_sub = cuda.GpuIncSubtensor
...@@ -921,6 +927,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -921,6 +927,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
self).__init__(name) self).__init__(name)
def test_adv_sub1_fast(self): def test_adv_sub1_fast(self):
"""We check that the special cases of advanced indexing that """We check that the special cases of advanced indexing that
use CudaNdarrayTakeFrom are handled correctly use CudaNdarrayTakeFrom are handled correctly
......
...@@ -3761,21 +3761,28 @@ class AdvancedIndexingError(TypeError): ...@@ -3761,21 +3761,28 @@ class AdvancedIndexingError(TypeError):
class Subtensor(Op): class Subtensor(Op):
"""Return a subtensor view """Return a subtensor view
This class uses a relatively complex internal representation of the inputs
to remember how the input tensor x should be sliced. The instance variable
idx_list is a list whose elements are either integers, or slices. The
integers are indexes into the inputs array, and the start/stop/step members
of each slice are also integer indexes into the inputs array (or None).
The inputs array is the tensor x, followed by scalar integer variables. The inputs array is the tensor x, followed by scalar integer variables.
TODO: WRITEME: how are the scalar integer variables formatted?
This class uses a relatively complex internal representation of the inputs
to remember how the input tensor x should be sliced.
idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
(old docstring gives two conflicting
descriptions)
elements are either integers, theano scalars, or slices.
one element per "explicitly named dimension"
TODO: WRITEME: what is an "explicitly named dimension" ?
if integer:
indexes into the inputs array
if slice:
start/stop/step members of each slice are integer indices
into the inputs array or None
                integer indices may be actual integers or theano scalars
@todo: add support for advanced tensor indexing (in Subtensor_dx too). @todo: add support for advanced tensor indexing (in Subtensor_dx too).
The idx_list is a tuple similar in structure to the sort of key you might
expect in numpy's basic indexing mode. It has one element for each
explicitly named dimension. In numpy, the elements can be either integers
or slices containing integers and None. In Subtensor, each element can
additionally be a Scalar instance, and slice components can also be Scalar
instances too.
""" """
e_invalid = ('The index list is longer (size %d) than the number of ' e_invalid = ('The index list is longer (size %d) than the number of '
'dimensions of the tensor(namely %d). You are asking for ' 'dimensions of the tensor(namely %d). You are asking for '
...@@ -3873,6 +3880,10 @@ class Subtensor(Op): ...@@ -3873,6 +3880,10 @@ class Subtensor(Op):
return scal.as_scalar(a) return scal.as_scalar(a)
def make_node(self, x, *inputs): def make_node(self, x, *inputs):
"""
x: the tensor to take a subtensor of
inputs: a list of theano Scalars
"""
x = as_tensor_variable(x) x = as_tensor_variable(x)
inputs = tuple(self.my_as_scalar(a) for a in inputs) inputs = tuple(self.my_as_scalar(a) for a in inputs)
...@@ -4015,22 +4026,67 @@ class Subtensor(Op): ...@@ -4015,22 +4026,67 @@ class Subtensor(Op):
indices.append(str(entry)) indices.append(str(entry))
return "%s{%s}" % (self.__class__.__name__, ", ".join(indices)) return "%s{%s}" % (self.__class__.__name__, ", ".join(indices))
@staticmethod
def default_helper_c_code_args():
"""
Returns a dictionary of default arguments to
helper_c_code
"""
return {
"c_prefix" : "PyArray",
"update_flags": ("PyArray_UpdateFlags(%(view_name)s,"
" NPY_ARRAY_C_CONTIGUOUS|"
"NPY_ARRAY_F_CONTIGUOUS);"),
"set_data" : "PyArray_set_data",
"set_dim" : "PyArray_set_dim",
"set_stride" : "PyArray_set_stride",
"strides_mul" : 1,
"view_name" : "xview" }
@staticmethod @staticmethod
def helper_c_code(node, name, inputs, outputs, sub, idx_list, def helper_c_code(node, name, inputs, outputs, sub, idx_list,
c_prefix="PyArray", c_prefix=None,
update_flags=("PyArray_UpdateFlags(xview," update_flags=None,
" NPY_ARRAY_C_CONTIGUOUS|" set_data=None,
"NPY_ARRAY_F_CONTIGUOUS);"), set_dim=None,
set_data='PyArray_set_data', set_stride=None,
set_dim='PyArray_set_dim', strides_mul=None,
set_stride='PyArray_set_stride', view_name=None
strides_mul=1,
): ):
"""The parameters c_prefix, update_flags, set_data, set_dim, """
The parameters c_prefix, update_flags, set_data, set_dim,
set_stride and strides_mul are there to allow reusing this set_stride and strides_mul are there to allow reusing this
function on PyArray and CudaNdarray object. function on PyArray and CudaNdarray object.
""" """
default_args = Subtensor.default_helper_c_code_args()
if update_flags is None:
update_flags = default_args['update_flags']
if set_data is None:
set_data = default_args['set_data']
if set_dim is None:
set_dim = default_args['set_dim']
if set_stride is None:
set_stride = default_args['set_stride']
if strides_mul is None:
strides_mul = default_args['strides_mul']
if c_prefix is None:
c_prefix = default_args['c_prefix']
if view_name is None:
view_name = default_args['view_name']
#update_flags may depend on view_name
update_flags = update_flags % locals()
# #
# two arrays are created in C code: # two arrays are created in C code:
# is_slice: len == ndim, 0 means int, 1 means slice # is_slice: len == ndim, 0 means int, 1 means slice
...@@ -4104,6 +4160,8 @@ class Subtensor(Op): ...@@ -4104,6 +4160,8 @@ class Subtensor(Op):
x, = inputs[:1] x, = inputs[:1]
z, = outputs z, = outputs
xview = view_name
rval = """ rval = """
#define PyArray_set_dim(obj, idx, d) PyArray_DIMS(obj)[idx]=d #define PyArray_set_dim(obj, idx, d) PyArray_DIMS(obj)[idx]=d
#define PyArray_set_stride(obj, idx, d) PyArray_STRIDES(obj)[idx]=d #define PyArray_set_stride(obj, idx, d) PyArray_STRIDES(obj)[idx]=d
...@@ -4119,30 +4177,30 @@ class Subtensor(Op): ...@@ -4119,30 +4177,30 @@ class Subtensor(Op):
int inner_ii = 0; // the current dimension of zview int inner_ii = 0; // the current dimension of zview
int outer_ii = 0; // current dimension of z int outer_ii = 0; // current dimension of z
char* ptr = (char*) %(c_prefix)s_BYTES(xview); char* ptr = (char*) %(c_prefix)s_BYTES(%(xview)s);
if ((%(c_prefix)s_DIMS(xview) == %(c_prefix)s_DIMS(%(x)s)) if ((%(c_prefix)s_DIMS(%(xview)s) == %(c_prefix)s_DIMS(%(x)s))
&& (%(c_prefix)s_DIMS(%(x)s) != NULL)) && (%(c_prefix)s_DIMS(%(x)s) != NULL))
{ {
PyErr_Format(PyExc_ValueError, "x and xview" PyErr_Format(PyExc_ValueError, "x and %(xview)s"
"(with %%d dims) have the same dimensions" "(with %%d dims) have the same dimensions"
" pointers: %%p and %%p", " pointers: %%p and %%p",
%(c_prefix)s_NDIM(%(x)s), %(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_DIMS(xview), %(c_prefix)s_DIMS(%(xview)s),
%(c_prefix)s_DIMS(%(x)s)); %(c_prefix)s_DIMS(%(x)s));
Py_XDECREF(xview); Py_XDECREF(%(xview)s);
%(fail)s; %(fail)s;
} }
if (%(c_prefix)s_STRIDES(xview) == %(c_prefix)s_STRIDES(%(x)s) if (%(c_prefix)s_STRIDES(%(xview)s) == %(c_prefix)s_STRIDES(%(x)s)
&& (%(c_prefix)s_DIMS(%(x)s) != NULL)) && (%(c_prefix)s_DIMS(%(x)s) != NULL))
{ {
PyErr_Format(PyExc_ValueError, "x and xview" PyErr_Format(PyExc_ValueError, "x and %(xview)s"
"(with %%d dims) have the same strides" "(with %%d dims) have the same strides"
" pointers: %%p and %%p", " pointers: %%p and %%p",
%(c_prefix)s_NDIM(%(x)s), %(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_STRIDES(xview), %(c_prefix)s_STRIDES(%(xview)s),
%(c_prefix)s_STRIDES(%(x)s)); %(c_prefix)s_STRIDES(%(x)s));
Py_XDECREF(xview); Py_XDECREF(%(xview)s);
%(fail)s; %(fail)s;
} }
...@@ -4164,10 +4222,10 @@ class Subtensor(Op): ...@@ -4164,10 +4222,10 @@ class Subtensor(Op):
// PySlice_GetIndicesEx in python source // PySlice_GetIndicesEx in python source
if (!step) if (!step)
{ {
Py_DECREF(xview); Py_DECREF(%(xview)s);
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"slice step cannot be zero"); "slice step cannot be zero");
Py_XDECREF(xview); Py_XDECREF(%(xview)s);
%(fail)s; %(fail)s;
} }
...@@ -4218,8 +4276,8 @@ class Subtensor(Op): ...@@ -4218,8 +4276,8 @@ class Subtensor(Op):
ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start * ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
%(strides_mul)s; %(strides_mul)s;
%(set_dim)s(xview, inner_ii, slicelength); %(set_dim)s(%(xview)s, inner_ii, slicelength);
%(set_stride)s(xview, inner_ii, %(set_stride)s(%(xview)s, inner_ii,
%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step); %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step);
inner_ii += 1; inner_ii += 1;
...@@ -4239,27 +4297,27 @@ class Subtensor(Op): ...@@ -4239,27 +4297,27 @@ class Subtensor(Op):
else else
{ {
PyErr_Format(PyExc_IndexError,"index out of bounds"); PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview); Py_XDECREF(%(xview)s);
%(fail)s; %(fail)s;
} }
} }
else else
{ {
PyErr_Format(PyExc_IndexError,"index out of bounds"); PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview); Py_XDECREF(%(xview)s);
%(fail)s; %(fail)s;
} }
spec_pos += 1; spec_pos += 1;
} }
} }
%(set_data)s(xview, ptr, (PyObject*)NULL); %(set_data)s(%(xview)s, ptr, (PyObject*)NULL);
assert (inner_ii <= %(c_prefix)s_NDIM(xview)); assert (inner_ii <= %(c_prefix)s_NDIM(%(xview)s));
while (inner_ii < %(c_prefix)s_NDIM(xview)) while (inner_ii < %(c_prefix)s_NDIM(%(xview)s))
{ {
assert (outer_ii < %(c_prefix)s_NDIM(%(x)s)); assert (outer_ii < %(c_prefix)s_NDIM(%(x)s));
%(set_dim)s(xview, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]); %(set_dim)s(%(xview)s, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
%(set_stride)s(xview, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]); %(set_stride)s(%(xview)s, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
inner_ii += 1; inner_ii += 1;
outer_ii += 1; outer_ii += 1;
} }
...@@ -4454,7 +4512,6 @@ def inc_subtensor(x, y, inplace=False, set_instead_of_inc=False, ...@@ -4454,7 +4512,6 @@ def inc_subtensor(x, y, inplace=False, set_instead_of_inc=False,
else: else:
raise TypeError('x must be result of a subtensor operation') raise TypeError('x must be result of a subtensor operation')
class IncSubtensor(Op): class IncSubtensor(Op):
"""Increment a subtensor. """Increment a subtensor.
...@@ -4524,6 +4581,11 @@ class IncSubtensor(Op): ...@@ -4524,6 +4581,11 @@ class IncSubtensor(Op):
", ".join(indices)) ", ".join(indices))
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
"""
x: the tensor to increment
y: the value to increment by
inputs: TODO WRITEME
"""
x, y = map(as_tensor_variable, [x, y]) x, y = map(as_tensor_variable, [x, y])
if y.ndim > x.ndim: if y.ndim > x.ndim:
raise ValueError(("Trying to increment a %d-dimensional " raise ValueError(("Trying to increment a %d-dimensional "
...@@ -4602,9 +4664,16 @@ class IncSubtensor(Op): ...@@ -4602,9 +4664,16 @@ class IncSubtensor(Op):
x.__setitem__(cdata, y) x.__setitem__(cdata, y)
out[0] = x out[0] = x
def c_code(self, node, name, inputs, outputs, sub): # DEBUG def c_code(self, node, name, inputs, outputs, sub):
if not isinstance(node.inputs[0].type, TensorType):
raise NotImplementedError() # This method delegates much of the work to helper
# methods. This method implements the main logic
# but subclasses may override the helper methods
# to change the particulars, e.g. GpuIncSubtensor
# turns the view/copy operations on numpy arrays
# into the same operations on cuda arrays.
self.do_type_checking(node)
if self.inplace: # convert bool to int if self.inplace: # convert bool to int
inplace = 1 inplace = 1
...@@ -4621,12 +4690,15 @@ class IncSubtensor(Op): ...@@ -4621,12 +4690,15 @@ class IncSubtensor(Op):
view_ndim = (node.inputs[0].ndim - view_ndim = (node.inputs[0].ndim -
numpy.sum([not isinstance(idx, slice) numpy.sum([not isinstance(idx, slice)
for idx in self.idx_list])) for idx in self.idx_list]))
copy_of_x = self.copy_of_x(x)
copy_input_if_necessary = """ copy_input_if_necessary = """
if (%(inplace)s) if (%(inplace)s)
{ {
if (%(x)s != %(z)s) if (%(x)s != %(z)s)
{ {
if (%(z)s) Py_DECREF(%(z)s); Py_XDECREF(%(z)s);
Py_INCREF(%(x)s); Py_INCREF(%(x)s);
%(z)s = %(x)s; %(z)s = %(x)s;
} }
...@@ -4634,69 +4706,75 @@ class IncSubtensor(Op): ...@@ -4634,69 +4706,75 @@ class IncSubtensor(Op):
else else
{ {
if (%(z)s) Py_DECREF(%(z)s); if (%(z)s) Py_DECREF(%(z)s);
%(z)s = (PyArrayObject*)PyArray_FromAny(py_%(x)s, NULL, 0, 0, %(z)s = %(copy_of_x)s;
NPY_ARRAY_ENSURECOPY, NULL);
} }
""" % locals() """ % locals()
alloc_zview = self.make_view_array(z, view_ndim)
# On GPU, it takes two steps to make a view
link_zview = self.link_view_array(z, fail);
#Make a first view on the output, as we will write into it. #Make a first view on the output, as we will write into it.
build_view = """ build_view = """
//TODO: give this Op a second output so that this view can be cached //TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure //TODO: alternatively, fix the memory leak on failure
Py_INCREF(PyArray_DESCR(%(z)s)); %(alloc_zview)s;
PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr( if (!zview)
&PyArray_Type,
PyArray_DESCR(%(z)s),
%(view_ndim)s,
PyArray_DIMS(%(z)s),
PyArray_STRIDES(%(z)s),
PyArray_DATA(%(z)s),
%(z)s->flags,
NULL);
if (!xview)
{ {
%(fail)s; %(fail)s;
} }
%(link_zview)s;
""" % locals() """ % locals()
# make xview actually a view of %(z)s # make zview actually a view of %(z)s
get_xview = Subtensor.helper_c_code(node, name, helper_args = self.get_helper_c_code_args()
outputs[:1] + inputs[2:], helper_args['view_name'] = 'zview'
outputs, sub, self.idx_list) get_zview = self.define_set_data() + \
Subtensor.helper_c_code(
node=node,
name=name,
inputs=outputs[:1] + inputs[2:],
outputs=outputs,
sub=sub,
idx_list=self.idx_list,
** helper_args
)
copy_into = self.copy_into("zview", y)
add_to_zview = self.add_to_zview(y, fail)
make_modification = """ make_modification = """
if (%(op_is_set)s) if (%(op_is_set)s)
{ {
if (PyArray_CopyInto(xview, %(y)s)) // does broadcasting if (%(copy_into)s) // does broadcasting
{ {
Py_DECREF(xview); Py_DECREF(zview);
%(fail)s; %(fail)s;
} }
} }
else else
{ {
PyArrayObject * add_rval = (PyArrayObject*)PyNumber_InPlaceAdd( %(add_to_zview)s
(PyObject*)xview, py_%(y)s);
if (add_rval)
{
assert (PyArray_Check((PyObject*)add_rval));
assert (PyArray_DATA(add_rval) == PyArray_DATA(xview));
Py_DECREF(add_rval);
}
else
{
Py_DECREF(xview);
%(fail)s;
}
} }
""" % locals() """ % locals()
return (copy_input_if_necessary return (copy_input_if_necessary
+ build_view + build_view
+ "{" + get_xview + "}" + "{" + get_zview + "}"
+ make_modification + make_modification
+ "Py_DECREF(xview);" + "Py_DECREF(zview);"
) )
def do_type_checking(self, node):
""" Should raise NotImplementedError if c_code does not support
the types involved in this node.
"""
if not isinstance(node.inputs[0].type, TensorType):
raise NotImplementedError()
def c_code_cache_version(self): def c_code_cache_version(self):
hv = Subtensor.helper_c_code_cache_version() hv = Subtensor.helper_c_code_cache_version()
if hv: if hv:
...@@ -4704,6 +4782,99 @@ class IncSubtensor(Op): ...@@ -4704,6 +4782,99 @@ class IncSubtensor(Op):
else: else:
return () return ()
def copy_of_x(self, x):
"""
x: a string giving the name of a C variable pointing to an array
Returns C code expression to make a copy of x.
Base class uses PyArrayObject *, subclasses may override for
different types of arrays.
"""
# Parameters of PyArrary_FromAny are:
# array
# dtype: we pass NULL to say any dtype is acceptable, so the existing
# dtype will be copied
# min_depth: we pass 0 to have this parameter ignored
# max_depth: we pass 0 to have this parameter ignored
# requirements: here we pass NPY_ARRAY_ENSURECOPY to force a copy
# context: this is almost always NULL, I'm not sure what it's used for
return """(PyArrayObject*)PyArray_FromAny(py_%(x)s, NULL, 0, 0,
NPY_ARRAY_ENSURECOPY, NULL)""" % locals()
def make_view_array(self, x, view_ndim):
"""
x: a string identifying an array to be viewed
view_ndim: a string specifying the number of dimensions
to have in the view
This doesn't need to actually set up the view with the
right indexing; we'll do that manually later.
"""
return """Py_INCREF(PyArray_DESCR(%(x)s));
PyArrayObject * zview =
(PyArrayObject*)PyArray_NewFromDescr(
&PyArray_Type,
PyArray_DESCR(%(x)s),
%(view_ndim)s,
PyArray_DIMS(%(x)s),
PyArray_STRIDES(%(x)s),
PyArray_DATA(%(x)s),
%(x)s->flags,
NULL)""" % locals()
def get_helper_c_code_args(self):
""" Return a dictionary of arguments to pass to helper_c_code."""
return Subtensor.default_helper_c_code_args()
def copy_into(self, view, source):
"""
view: string, C code expression for an array
source: string, C code expression for an array
returns a C code expression to copy source into view, and
return 0 on success
"""
return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def define_set_data(self):
""" Returns C code used to define any macros used in the
set data argument to the helper C code. """
return ""
def link_view_array(self, x, fail):
""" Returns code to complete making zview a view of x"""
# On CPU there is nothing to do, make_view_array already did this
return ""
def set_view_base(self, x, fail):
""" Returns code to make zview be a correct view of x,
after helper_c_code is done messing with x"""
# On CPU there is nothing to do
return ""
def add_to_zview(self, x, fail):
""" Return C code to add x to zview. Should DECREF zview if the
add fails."""
return """
PyArrayObject * add_rval = (PyArrayObject*)PyNumber_InPlaceAdd(
(PyObject*)zview, py_%(x)s);
if (add_rval)
{
assert (PyArray_Check((PyObject*)add_rval));
assert (PyArray_DATA(add_rval) == PyArray_DATA(zview));
Py_DECREF(add_rval);
}
else
{
Py_DECREF(zview);
%(fail)s;
}""" % locals()
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
return [shapes[0]] return [shapes[0]]
......
...@@ -1056,7 +1056,8 @@ class test_fusion(unittest.TestCase): ...@@ -1056,7 +1056,8 @@ class test_fusion(unittest.TestCase):
if gpu: if gpu:
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
topo_ = [x for x in topo if not isinstance( topo_ = [x for x in topo if not isinstance(
x.op,cuda.basic_ops.GpuFromHost) and not isinstance(x.op,cuda.basic_ops.HostFromGpu)] x.op, (cuda.basic_ops.GpuFromHost, cuda.basic_ops.HostFromGpu))]
gpu_ = [x for x in topo if isinstance(x.op, gpu_ = [x for x in topo if isinstance(x.op,
cuda.basic_ops.GpuFromHost)] cuda.basic_ops.GpuFromHost)]
if not len(gpu_) == len(sym_inputs): if not len(gpu_) == len(sym_inputs):
...@@ -1067,13 +1068,16 @@ class test_fusion(unittest.TestCase): ...@@ -1067,13 +1068,16 @@ class test_fusion(unittest.TestCase):
if not len(topo_) == nb_elemwise: if not len(topo_) == nb_elemwise:
fail3.append((id, topo_, nb_elemwise)) fail3.append((id, topo_, nb_elemwise))
if nb_elemwise == 1: if nb_elemwise == 1:
# check that the number of input to the Composite Elemwise is ok # if no variable appears multiple times in the
# when there is not variable that appear multiple time the in input # input of g,
# of g # check that the number of input to the Composite
assert ((numpy.sum([not isinstance(x, theano.gof.Constant) # Elemwise is ok
for x in topo_[0].inputs]) == if len(set(g.owner.inputs)) == len(g.owner.inputs):
len(sym_inputs)) or expected_len_sym_inputs = numpy.sum(
len(set(g.owner.inputs)) != len(g.owner.inputs)) [not isinstance(x, theano.gof.Constant)
for x in topo_[0].inputs])
assert expected_len_sym_inputs == len(sym_inputs)
if not out_dtype == out.dtype: if not out_dtype == out.dtype:
fail4.append((id, out_dtype, out.dtype)) fail4.append((id, out_dtype, out.dtype))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论