提交 7194cb2b authored 作者: abergeron's avatar abergeron

Merge pull request #1788 from nouiz/cuda_fortran

Cuda fortran order
...@@ -5093,7 +5093,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -5093,7 +5093,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
int CudaNdarray_prep_output(CudaNdarray ** arr, int nd, int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims) const int * dims, int fortran)
{ {
bool allocated = false; bool allocated = false;
if (*arr == NULL) if (*arr == NULL)
...@@ -5105,7 +5105,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd, ...@@ -5105,7 +5105,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
allocated = true; allocated = true;
} }
if (CudaNdarray_alloc_contiguous(*arr, nd, dims)) if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
{ {
if (allocated) if (allocated)
{ {
......
...@@ -160,6 +160,12 @@ CudaNdarray_CheckExact(const PyObject * ob); ...@@ -160,6 +160,12 @@ CudaNdarray_CheckExact(const PyObject * ob);
DllExport bool DllExport bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self); CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**
* Return true for an F-contiguous CudaNdarray, else false
*/
DllExport bool
CudaNdarray_is_f_contiguous(const CudaNdarray * self);
/**** /****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions. * Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/ */
...@@ -326,10 +332,13 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -326,10 +332,13 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
* Allocate storage space for a tensor of rank 'nd' and given dimensions. * Allocate storage space for a tensor of rank 'nd' and given dimensions.
* (No-op if self already has a contiguous tensor of the right dimensions) * (No-op if self already has a contiguous tensor of the right dimensions)
* *
* If fortran is non-zero, the strides are laid out in Fortran (column-major) order; otherwise they are laid out in C (row-major) order.
*
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions * Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/ */
template<typename inttype> template<typename inttype>
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim) static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
const inttype * dim, int fortran=0)
{ {
// allocate an empty ndarray with c_contiguous access // allocate an empty ndarray with c_contiguous access
// return 0 on success // return 0 on success
...@@ -342,12 +351,24 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i ...@@ -342,12 +351,24 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
{ {
return -1; return -1;
} }
if (fortran)
{
for (int i = 0; i < nd; i++)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
}
}
else
{
for (int i = nd-1; i >= 0; --i) for (int i = nd-1; i >= 0; --i)
{ {
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size); CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]); CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i]; size = size * dim[i];
} }
}
// If the allocated buffer is already of the right size, we don't need to // If the allocated buffer is already of the right size, we don't need to
// do anything else. // do anything else.
...@@ -497,6 +518,27 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self) ...@@ -497,6 +518,27 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self)
return c_contiguous; return c_contiguous;
} }
/**
 * Check whether self has a Fortran-style (first axis varies fastest) layout.
 *
 * Walks the axes from first to last and compares each host stride against
 * the running product of the dimensions seen so far, i.e. the strides are
 * expected to look like [1, dim[0], dim[0]*dim[1], ...].
 *
 * Axes of length 1 are skipped entirely: their stride is not inspected and
 * they do not change the expected stride of later axes.
 *
 * Returns true when every inspected stride matches (including the case
 * where nd is 0 and no axis is inspected), false otherwise.
 */
DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_f_contiguous(const CudaNdarray * self)
{
    // Stride a compact Fortran layout would have for the current axis.
    int expected_stride = 1;
    for (int axis = 0; axis < self->nd; ++axis)
    {
        // A length-1 axis places no constraint on its own stride.
        if (CudaNdarray_HOST_DIMS(self)[axis] == 1)
            continue;
        // First mismatch settles the answer; nothing further to examine.
        if (CudaNdarray_HOST_STRIDES(self)[axis] != expected_stride)
            return false;
        expected_stride *= CudaNdarray_HOST_DIMS(self)[axis];
    }
    return true;
}
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self); DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
...@@ -525,8 +567,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe ...@@ -525,8 +567,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
// *arr may initially be NULL, a pointer to an ndarray of the wrong size, // *arr may initially be NULL, a pointer to an ndarray of the wrong size,
// or a pointer to an ndarray of the right size. In the last case it will // or a pointer to an ndarray of the right size. In the last case it will
// not change. // not change.
// If fortran is non-zero, a fortran order is expected/created
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd, DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims); const int * dims, int fortran = 0);
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){ DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
if(CUBLAS_STATUS_SUCCESS == err) if(CUBLAS_STATUS_SUCCESS == err)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论