提交 f517e1a0 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged

......@@ -2188,7 +2188,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args)
}
static PyObject *
filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict)
filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage)
{
/*
* TODO: DOC what this function should do in the various cases of
......@@ -2282,10 +2282,10 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
Py_DECREF(rval);
rval = NULL;
}
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
}
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
return (PyObject*)rval;
}
}
......@@ -2490,6 +2490,11 @@ CudaNdarray_new_nd(int nd)
return (PyObject *) rval;
}
/**
* Initialize 'self' as a view of 'base', with memory storage 'data'
*/
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
{
if (self->data_allocated)
......
......@@ -26,7 +26,7 @@ typedef float real;
#endif
#ifndef SHARED_SIZE
#ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024)
#endif
......@@ -48,10 +48,10 @@ static T ceil_intdiv(T a, T b)
/**
* struct CudaNdarray
*
* This is a Python type.
* This is a Python type.
*
*/
struct CudaNdarray
struct CudaNdarray
{
PyObject_HEAD
......@@ -65,40 +65,46 @@ struct CudaNdarray
/* Type-specific fields go here. */
//GpuTensorType::VoidTensor * vt;
int nd; //the number of dimensions of the tensor
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
int * host_structure; //dim0, dim1, ... stride0, stride1, ...
int data_allocated; //the number of bytes allocated for devdata
//device pointers (allocated by cudaMalloc)
int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
//dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
};
/*
* Return a CudaNdarray whose 'nd' dimensions are all 0.
*/
PyObject *
PyObject *
CudaNdarray_New(int nd=-1);
/**
* Return 1 for a CudaNdarray otw 0
*/
int
int
CudaNdarray_Check(const PyObject * ob);
/**
* Return 1 for a CudaNdarray otw 0
*/
int
int
CudaNdarray_CheckExact(const PyObject * ob);
/**
* Return true for a C-contiguous CudaNdarray, else false
*/
bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/
int
int
cnda_structure_size(int nd)
{
// dim0, dim1, ...
......@@ -107,23 +113,23 @@ cnda_structure_size(int nd)
return nd + nd + nd;
}
const int *
const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self)
{
return self->host_structure;
}
const int *
const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{
return self->host_structure + self->nd;
}
const int *
const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{
return self->host_structure + 2*self->nd;
}
void
void
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
self->dev_structure_fresh = 0;
......@@ -190,7 +196,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
*
* Does not sync structure to host.
*/
void
void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
if ((idx >= self->nd) || (idx < 0) || (d < 0))
......@@ -206,7 +212,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
cnda_mark_dev_structure_dirty(self);
}
}
void
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
if ((idx >= self->nd) || (idx < 0))
......@@ -225,7 +231,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
*
* This means: recalculate the log2dims and transfer structure to the card
*/
int
int
cnda_copy_structure_to_device(CudaNdarray * self)
{
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
......@@ -239,7 +245,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
return 0;
}
const int *
const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
......@@ -249,7 +255,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
}
return self->dev_structure;
}
const int *
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
......@@ -259,7 +265,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
}
return self->dev_structure + self->nd;
}
const int *
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
......@@ -269,7 +275,7 @@ CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
}
return self->dev_structure + 2*self->nd;
}
float *
float *
CudaNdarray_DEV_DATA(const CudaNdarray * self)
{
return self->devdata;
......@@ -278,7 +284,7 @@ CudaNdarray_DEV_DATA(const CudaNdarray * self)
/**
* Return the number of elements in the ndarray (product of the dimensions)
*/
int
int
CudaNdarray_SIZE(const CudaNdarray *self)
{
if (self->nd == -1) return 0;
......@@ -289,7 +295,7 @@ CudaNdarray_SIZE(const CudaNdarray *self)
}
return size;
}
static PyObject *
static PyObject *
CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
{
return PyInt_FromLong(CudaNdarray_SIZE(self));
......@@ -320,7 +326,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
}
self->dev_structure = NULL;
}
if (self->host_structure)
if (self->host_structure)
{
free(self->host_structure);
self->host_structure = NULL;
......@@ -386,29 +392,41 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
size = size * dim[i];
}
if (self->data_allocated != size)
if (CudaNdarray_is_c_contiguous(self) && (self->data_allocated == size))
{
if (device_free(self->devdata))
{
// Does this ever happen?? Do we need to set data_allocated or devdata to 0?
return -1;
}
assert(size>0);
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
{
CudaNdarray_set_nd(self,-1);
self->data_allocated = 0;
self->devdata = 0;
return -1;
}
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size;
return 0;
}
// The structure of self will be reused with newly allocated memory.
// If self was a view, we should remove the reference to its base.
// (If base was already NULL, the following has no effect.)
Py_XDECREF(self->base);
self->base = NULL;
// If self is a view, do not try to free its memory
if (self->data_allocated && device_free(self->devdata))
{
self->devdata = NULL;
self->data_allocated = 0;
return -1;
}
assert(size>0);
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
{
CudaNdarray_set_nd(self,-1);
self->data_allocated = 0;
self->devdata = 0;
return -1;
}
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size;
return 0;
}
......@@ -416,7 +434,7 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
* Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
*/
template<typename inttype>
PyObject *
PyObject *
CudaNdarray_NewDims(int nd, const inttype * dims)
{
CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New();
......@@ -440,7 +458,7 @@ CudaNdarray_NewDims(int nd, const inttype * dims)
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
}
/**
......@@ -475,10 +493,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
*/
PyObject *
PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self);
PyObject *
PyObject *
CudaNdarray_ZEROS(int n, int * dims);
/**
......@@ -499,7 +517,7 @@ int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pat
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
......@@ -510,23 +528,23 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
}
int data=0;
fprintf(fd, "\n\tDEV_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure+i, 1,
&data, 1);
fprintf(fd, "%i\t", data);
self->dev_structure+i, 1,
&data, 1);
fprintf(fd, "%i\t", data);
}
fprintf(fd, "\n\tDEV_STRIDES: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure + self->nd+i, 1,
&data, 1);
fprintf(fd, "%i \t", data);
self->dev_structure + self->nd+i, 1,
&data, 1);
fprintf(fd, "%i \t", data);
}
fprintf(fd, "\n");
}
......
......@@ -6,7 +6,6 @@
import logging
_logger = logging.getLogger('theano.tensor.opt')
import copy
import operator
import itertools
import sys
......@@ -574,14 +573,6 @@ class ShapeFeature(object):
if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
return self.lscalar_one
# NOTE: This may cause problems bacause the shape is not asserted
# there is an equivalent mechanism to do this, namely
# specify_shape that one should use
# If user provided size
#elif ( hasattr(r.tag,'shape') and
# r.tag.shape is not None and
# r.tag.shape[i] is not None):
# return T.constant(copy.copy(r.tag.shape[i]),dtype='int64')
else:
return Shape_i(i).make_node(r).outputs[0]
......@@ -1101,7 +1092,6 @@ def local_alloc_elemwise(node):
return [node.op(*new)]
#TODO, global optimizer that lift the assert to the beginning of the graph.
#TODO, var.tag.shape to propagate the shape and lower the overhead of this op
#TODO, when all inputs can be optimized do all except one
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
......@@ -2749,14 +2739,8 @@ register_specialize(local_mul_specialize)
@gof.local_optimizer([T.add])
def local_add_specialize(node):
def fill_chain(v):
# Not sure why this happens .. but I did not had the time to look
# into it, it probably has something to do with the dtype I'm
# providing the tag.shape of my variable
out = _fill_chain(v, node.inputs)
if out[0].dtype != node.outputs[0].dtype:
return [T.cast(out[0], dtype = node.outputs[0].dtype)]
else:
return out
return out
#here, we are past the point of canonicalization, so we don't want to put in un-necessary fills.
if node.op == T.add:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论