提交 460e9561 authored 作者: Frederic's avatar Frederic

move fct and force them to be inlined.

上级 d0e0274f
......@@ -3133,24 +3133,6 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
return 0;
}
bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{
bool c_contiguous = true;
int size = 1;
for (int i = self->nd-1; (i >= 0) && c_contiguous; --i)
{
if (CudaNdarray_HOST_DIMS(self)[i] == 1)
continue;
if (CudaNdarray_HOST_STRIDES(self)[i] != size)
{
c_contiguous = false;
}
size = size * CudaNdarray_HOST_DIMS(self)[i];
}
return c_contiguous;
}
PyObject *
CudaNdarray_new_nd(int nd)
{
......@@ -4346,12 +4328,6 @@ CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
return self->host_structure + 2*self->nd;
}
void
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
self->dev_structure_fresh = 0;
}
int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
{
......@@ -4406,39 +4382,6 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
}
void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
if ((idx >= self->nd) || (idx < 0) || (d < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
}
if (d != self->host_structure[idx])
{
self->host_structure[idx] = d;
int log2d = (int)log2((double)d);
self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
cnda_mark_dev_structure_dirty(self);
}
}
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
if ((idx >= self->nd) || (idx < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
}
if (s != CudaNdarray_HOST_STRIDES(self)[idx])
{
self->host_structure[idx+self->nd] = s;
cnda_mark_dev_structure_dirty(self);
}
}
int
cnda_copy_structure_to_device(const CudaNdarray * self)
{
......@@ -4510,56 +4453,6 @@ CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
return PyInt_FromLong(CudaNdarray_SIZE(self));
}
int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
if (nd != self->nd)
{
if (self->dev_structure)
{
if (device_free(self->dev_structure))
{
return -1;
}
self->dev_structure = NULL;
}
if (self->host_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->nd = -1;
}
if (nd == -1) return 0;
self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
if (NULL == self->host_structure)
{
PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
return -1;
}
//initialize all dimensions and strides to 0
for (int i = 0; i < cnda_structure_size(nd); ++i)
{
self->host_structure[i] = 0;
}
int struct_size = cnda_structure_size(nd);
if (struct_size)
{
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->dev_structure = NULL;
return -1;
}
}
self->nd = nd;
self->dev_structure_fresh = 0;
}
return 0;
}
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
......
......@@ -126,8 +126,12 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
DllExport const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
DllExport void
cnda_mark_dev_structure_dirty(CudaNdarray * self);
DllExport inline void __attribute__((always_inline))
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
self->dev_structure_fresh = 0;
}
DllExport int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase);
......@@ -143,11 +147,38 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
*
* Does not sync structure to host.
*/
DllExport void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d);
DllExport inline void __attribute__((always_inline))
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
if ((idx >= self->nd) || (idx < 0) || (d < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
}
if (d != self->host_structure[idx])
{
self->host_structure[idx] = d;
int log2d = (int)log2((double)d);
self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
cnda_mark_dev_structure_dirty(self);
}
}
DllExport void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
DllExport inline void __attribute__((always_inline))
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
if ((idx >= self->nd) || (idx < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
}
if (s != CudaNdarray_HOST_STRIDES(self)[idx])
{
self->host_structure[idx+self->nd] = s;
cnda_mark_dev_structure_dirty(self);
}
}
/***
* Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
......@@ -188,7 +219,57 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
*
* Note: This does not allocate storage for data.
*/
DllExport int CudaNdarray_set_nd(CudaNdarray * self, const int nd);
DllExport inline int __attribute__((always_inline))
CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
if (nd != self->nd)
{
if (self->dev_structure)
{
if (device_free(self->dev_structure))
{
return -1;
}
self->dev_structure = NULL;
}
if (self->host_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->nd = -1;
}
if (nd == -1) return 0;
self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
if (NULL == self->host_structure)
{
PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
return -1;
}
//initialize all dimensions and strides to 0
for (int i = 0; i < cnda_structure_size(nd); ++i)
{
self->host_structure[i] = 0;
}
int struct_size = cnda_structure_size(nd);
if (struct_size)
{
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->dev_structure = NULL;
return -1;
}
}
self->nd = nd;
self->dev_structure_fresh = 0;
}
return 0;
}
/**
* CudaNdarray_alloc_contiguous
......@@ -333,7 +414,24 @@ CudaNdarray_ZEROS(int n, int * dims);
/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/
DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
DllExport inline bool __attribute__((always_inline))
CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{
bool c_contiguous = true;
int size = 1;
for (int i = self->nd-1; (i >= 0) && c_contiguous; --i)
{
if (CudaNdarray_HOST_DIMS(self)[i] == 1)
continue;
if (CudaNdarray_HOST_STRIDES(self)[i] != size)
{
c_contiguous = false;
}
size = size * CudaNdarray_HOST_DIMS(self)[i];
}
return c_contiguous;
}
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论