提交 f517e1a0 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged

...@@ -2188,7 +2188,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args) ...@@ -2188,7 +2188,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args)
} }
static PyObject * static PyObject *
filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict) filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage)
{ {
/* /*
* TODO: DOC what this function should do in the various cases of * TODO: DOC what this function should do in the various cases of
...@@ -2282,10 +2282,10 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s ...@@ -2282,10 +2282,10 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
Py_DECREF(rval); Py_DECREF(rval);
rval = NULL; rval = NULL;
} }
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
} }
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
return (PyObject*)rval; return (PyObject*)rval;
} }
} }
...@@ -2490,6 +2490,11 @@ CudaNdarray_new_nd(int nd) ...@@ -2490,6 +2490,11 @@ CudaNdarray_new_nd(int nd)
return (PyObject *) rval; return (PyObject *) rval;
} }
/**
* Initialize 'self' as a view of 'base', with memory storage 'data'
*/
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base) int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
{ {
if (self->data_allocated) if (self->data_allocated)
......
...@@ -26,7 +26,7 @@ typedef float real; ...@@ -26,7 +26,7 @@ typedef float real;
#endif #endif
#ifndef SHARED_SIZE #ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024) #define SHARED_SIZE (16*1024)
#endif #endif
...@@ -48,10 +48,10 @@ static T ceil_intdiv(T a, T b) ...@@ -48,10 +48,10 @@ static T ceil_intdiv(T a, T b)
/** /**
* struct CudaNdarray * struct CudaNdarray
* *
* This is a Python type. * This is a Python type.
* *
*/ */
struct CudaNdarray struct CudaNdarray
{ {
PyObject_HEAD PyObject_HEAD
...@@ -65,40 +65,46 @@ struct CudaNdarray ...@@ -65,40 +65,46 @@ struct CudaNdarray
/* Type-specific fields go here. */ /* Type-specific fields go here. */
//GpuTensorType::VoidTensor * vt; //GpuTensorType::VoidTensor * vt;
int nd; //the number of dimensions of the tensor int nd; //the number of dimensions of the tensor
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros // Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
int * host_structure; //dim0, dim1, ... stride0, stride1, ... int * host_structure; //dim0, dim1, ... stride0, stride1, ...
int data_allocated; //the number of bytes allocated for devdata int data_allocated; //the number of bytes allocated for devdata
//device pointers (allocated by cudaMalloc) //device pointers (allocated by cudaMalloc)
int dev_structure_fresh; int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized //dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ... int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
}; };
/* /*
* Return a CudaNdarray whose 'nd' dimensions are all 0. * Return a CudaNdarray whose 'nd' dimensions are all 0.
*/ */
PyObject * PyObject *
CudaNdarray_New(int nd=-1); CudaNdarray_New(int nd=-1);
/** /**
* Return 1 for a CudaNdarray otw 0 * Return 1 for a CudaNdarray otw 0
*/ */
int int
CudaNdarray_Check(const PyObject * ob); CudaNdarray_Check(const PyObject * ob);
/** /**
* Return 1 for a CudaNdarray otw 0 * Return 1 for a CudaNdarray otw 0
*/ */
int int
CudaNdarray_CheckExact(const PyObject * ob); CudaNdarray_CheckExact(const PyObject * ob);
/**
* Return true for a C-contiguous CudaNdarray, else false
*/
bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**** /****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions. * Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/ */
int int
cnda_structure_size(int nd) cnda_structure_size(int nd)
{ {
// dim0, dim1, ... // dim0, dim1, ...
...@@ -107,23 +113,23 @@ cnda_structure_size(int nd) ...@@ -107,23 +113,23 @@ cnda_structure_size(int nd)
return nd + nd + nd; return nd + nd + nd;
} }
const int * const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self) CudaNdarray_HOST_DIMS(const CudaNdarray * self)
{ {
return self->host_structure; return self->host_structure;
} }
const int * const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self) CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{ {
return self->host_structure + self->nd; return self->host_structure + self->nd;
} }
const int * const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self) CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{ {
return self->host_structure + 2*self->nd; return self->host_structure + 2*self->nd;
} }
void void
cnda_mark_dev_structure_dirty(CudaNdarray * self) cnda_mark_dev_structure_dirty(CudaNdarray * self)
{ {
self->dev_structure_fresh = 0; self->dev_structure_fresh = 0;
...@@ -190,7 +196,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2) ...@@ -190,7 +196,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
* *
* Does not sync structure to host. * Does not sync structure to host.
*/ */
void void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{ {
if ((idx >= self->nd) || (idx < 0) || (d < 0)) if ((idx >= self->nd) || (idx < 0) || (d < 0))
...@@ -206,7 +212,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) ...@@ -206,7 +212,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
cnda_mark_dev_structure_dirty(self); cnda_mark_dev_structure_dirty(self);
} }
} }
void void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s) CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{ {
if ((idx >= self->nd) || (idx < 0)) if ((idx >= self->nd) || (idx < 0))
...@@ -225,7 +231,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s) ...@@ -225,7 +231,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
* *
* This means: recalculate the log2dims and transfer structure to the card * This means: recalculate the log2dims and transfer structure to the card
*/ */
int int
cnda_copy_structure_to_device(CudaNdarray * self) cnda_copy_structure_to_device(CudaNdarray * self)
{ {
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1); cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
...@@ -239,7 +245,7 @@ cnda_copy_structure_to_device(CudaNdarray * self) ...@@ -239,7 +245,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
return 0; return 0;
} }
const int * const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self) CudaNdarray_DEV_DIMS(CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
...@@ -249,7 +255,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self) ...@@ -249,7 +255,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
} }
return self->dev_structure; return self->dev_structure;
} }
const int * const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self) CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
...@@ -259,7 +265,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self) ...@@ -259,7 +265,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
} }
return self->dev_structure + self->nd; return self->dev_structure + self->nd;
} }
const int * const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self) CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
...@@ -269,7 +275,7 @@ CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self) ...@@ -269,7 +275,7 @@ CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
} }
return self->dev_structure + 2*self->nd; return self->dev_structure + 2*self->nd;
} }
float * float *
CudaNdarray_DEV_DATA(const CudaNdarray * self) CudaNdarray_DEV_DATA(const CudaNdarray * self)
{ {
return self->devdata; return self->devdata;
...@@ -278,7 +284,7 @@ CudaNdarray_DEV_DATA(const CudaNdarray * self) ...@@ -278,7 +284,7 @@ CudaNdarray_DEV_DATA(const CudaNdarray * self)
/** /**
* Return the number of elements in the ndarray (product of the dimensions) * Return the number of elements in the ndarray (product of the dimensions)
*/ */
int int
CudaNdarray_SIZE(const CudaNdarray *self) CudaNdarray_SIZE(const CudaNdarray *self)
{ {
if (self->nd == -1) return 0; if (self->nd == -1) return 0;
...@@ -289,7 +295,7 @@ CudaNdarray_SIZE(const CudaNdarray *self) ...@@ -289,7 +295,7 @@ CudaNdarray_SIZE(const CudaNdarray *self)
} }
return size; return size;
} }
static PyObject * static PyObject *
CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure) CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
{ {
return PyInt_FromLong(CudaNdarray_SIZE(self)); return PyInt_FromLong(CudaNdarray_SIZE(self));
...@@ -320,7 +326,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -320,7 +326,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
} }
self->dev_structure = NULL; self->dev_structure = NULL;
} }
if (self->host_structure) if (self->host_structure)
{ {
free(self->host_structure); free(self->host_structure);
self->host_structure = NULL; self->host_structure = NULL;
...@@ -386,29 +392,41 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -386,29 +392,41 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
size = size * dim[i]; size = size * dim[i];
} }
if (self->data_allocated != size) if (CudaNdarray_is_c_contiguous(self) && (self->data_allocated == size))
{ {
if (device_free(self->devdata)) return 0;
{ }
// Does this ever happen?? Do we need to set data_allocated or devdata to 0?
return -1; // The structure of self will be reused with newly allocated memory.
} // If self was a view, we should remove the reference to its base.
assert(size>0); // (If base was already NULL, the following has no effect.)
self->devdata = (float*)device_malloc(size*sizeof(real)); Py_XDECREF(self->base);
if (!self->devdata) self->base = NULL;
{
CudaNdarray_set_nd(self,-1); // If self is a view, do not try to free its memory
self->data_allocated = 0; if (self->data_allocated && device_free(self->devdata))
self->devdata = 0; {
return -1; self->devdata = NULL;
} self->data_allocated = 0;
if (0) return -1;
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size;
} }
assert(size>0);
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
{
CudaNdarray_set_nd(self,-1);
self->data_allocated = 0;
self->devdata = 0;
return -1;
}
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size;
return 0; return 0;
} }
...@@ -416,7 +434,7 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -416,7 +434,7 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
* Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated. * Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
*/ */
template<typename inttype> template<typename inttype>
PyObject * PyObject *
CudaNdarray_NewDims(int nd, const inttype * dims) CudaNdarray_NewDims(int nd, const inttype * dims)
{ {
CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(); CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New();
...@@ -440,7 +458,7 @@ CudaNdarray_NewDims(int nd, const inttype * dims) ...@@ -440,7 +458,7 @@ CudaNdarray_NewDims(int nd, const inttype * dims)
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base); int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base) int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{ {
return CudaNdarray_set_device_data(self, data, (PyObject *) base); return CudaNdarray_set_device_data(self, data, (PyObject *) base);
} }
/** /**
...@@ -475,10 +493,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo ...@@ -475,10 +493,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
/** /**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray. * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
*/ */
PyObject * PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self); CudaNdarray_CreateArrayObj(CudaNdarray * self);
PyObject * PyObject *
CudaNdarray_ZEROS(int n, int * dims); CudaNdarray_ZEROS(int n, int * dims);
/** /**
...@@ -499,7 +517,7 @@ int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pat ...@@ -499,7 +517,7 @@ int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pat
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{ {
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n", fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated); self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: "); fprintf(fd, "\tHOST_DIMS: ");
for (int i = 0; i < self->nd; ++i) for (int i = 0; i < self->nd; ++i)
{ {
...@@ -510,23 +528,23 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -510,23 +528,23 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{ {
fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]); fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
} }
int data=0; int data=0;
fprintf(fd, "\n\tDEV_DIMS: "); fprintf(fd, "\n\tDEV_DIMS: ");
for (int i = 0; i < self->nd; ++i) for (int i = 0; i < self->nd; ++i)
{ {
cublasGetVector(1, sizeof(int), cublasGetVector(1, sizeof(int),
self->dev_structure+i, 1, self->dev_structure+i, 1,
&data, 1); &data, 1);
fprintf(fd, "%i\t", data); fprintf(fd, "%i\t", data);
} }
fprintf(fd, "\n\tDEV_STRIDES: "); fprintf(fd, "\n\tDEV_STRIDES: ");
for (int i = 0; i < self->nd; ++i) for (int i = 0; i < self->nd; ++i)
{ {
cublasGetVector(1, sizeof(int), cublasGetVector(1, sizeof(int),
self->dev_structure + self->nd+i, 1, self->dev_structure + self->nd+i, 1,
&data, 1); &data, 1);
fprintf(fd, "%i \t", data); fprintf(fd, "%i \t", data);
} }
fprintf(fd, "\n"); fprintf(fd, "\n");
} }
......
...@@ -6,7 +6,6 @@ ...@@ -6,7 +6,6 @@
import logging import logging
_logger = logging.getLogger('theano.tensor.opt') _logger = logging.getLogger('theano.tensor.opt')
import copy
import operator import operator
import itertools import itertools
import sys import sys
...@@ -574,14 +573,6 @@ class ShapeFeature(object): ...@@ -574,14 +573,6 @@ class ShapeFeature(object):
if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]: if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
return self.lscalar_one return self.lscalar_one
# NOTE: This may cause problems bacause the shape is not asserted
# there is an equivalent mechanism to do this, namely
# specify_shape that one should use
# If user provided size
#elif ( hasattr(r.tag,'shape') and
# r.tag.shape is not None and
# r.tag.shape[i] is not None):
# return T.constant(copy.copy(r.tag.shape[i]),dtype='int64')
else: else:
return Shape_i(i).make_node(r).outputs[0] return Shape_i(i).make_node(r).outputs[0]
...@@ -1101,7 +1092,6 @@ def local_alloc_elemwise(node): ...@@ -1101,7 +1092,6 @@ def local_alloc_elemwise(node):
return [node.op(*new)] return [node.op(*new)]
#TODO, global optimizer that lift the assert to the beginning of the graph. #TODO, global optimizer that lift the assert to the beginning of the graph.
#TODO, var.tag.shape to propagate the shape and lower the overhead of this op
#TODO, when all inputs can be optimized do all except one #TODO, when all inputs can be optimized do all except one
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise', theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
...@@ -2749,14 +2739,8 @@ register_specialize(local_mul_specialize) ...@@ -2749,14 +2739,8 @@ register_specialize(local_mul_specialize)
@gof.local_optimizer([T.add]) @gof.local_optimizer([T.add])
def local_add_specialize(node): def local_add_specialize(node):
def fill_chain(v): def fill_chain(v):
# Not sure why this happens .. but I did not had the time to look
# into it, it probably has something to do with the dtype I'm
# providing the tag.shape of my variable
out = _fill_chain(v, node.inputs) out = _fill_chain(v, node.inputs)
if out[0].dtype != node.outputs[0].dtype: return out
return [T.cast(out[0], dtype = node.outputs[0].dtype)]
else:
return out
#here, we are past the point of canonicalization, so we don't want to put in un-necessary fills. #here, we are past the point of canonicalization, so we don't want to put in un-necessary fills.
if node.op == T.add: if node.op == T.add:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论