提交 7194cb2b authored 作者: abergeron's avatar abergeron

Merge pull request #1788 from nouiz/cuda_fortran

Cuda fortran order
......@@ -5093,7 +5093,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims)
const int * dims, int fortran)
{
bool allocated = false;
if (*arr == NULL)
......@@ -5105,7 +5105,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
allocated = true;
}
if (CudaNdarray_alloc_contiguous(*arr, nd, dims))
if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
{
if (allocated)
{
......
......@@ -160,6 +160,12 @@ CudaNdarray_CheckExact(const PyObject * ob);
DllExport bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**
* Return true for an F-contiguous (Fortran-order) CudaNdarray, else false
*/
DllExport bool
CudaNdarray_is_f_contiguous(const CudaNdarray * self);
/****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/
......@@ -326,10 +332,13 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
* Allocate storage space for a tensor of rank 'nd' and given dimensions.
* (No-op if self already has a contiguous tensor of the right dimensions)
*
* If fortran is non-zero, the strides are laid out in Fortran order; otherwise they are laid out in C order.
*
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/
template<typename inttype>
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim)
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
const inttype * dim, int fortran=0)
{
// allocate an empty ndarray with contiguous access (Fortran order if fortran is non-zero, C order otherwise)
// return 0 on success
......@@ -342,12 +351,24 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
{
return -1;
}
if (fortran)
{
for (int i = 0; i < nd; i++)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
}
}
else
{
for (int i = nd-1; i >= 0; --i)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
}
}
// If the allocated buffer is already of the right size, we don't need to
// do anything else.
......@@ -497,6 +518,27 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self)
return c_contiguous;
}
/**
 * Check whether a CudaNdarray is laid out in Fortran (F-contiguous) order.
 *
 * Walks the dimensions from first to last, tracking the stride that a
 * Fortran-ordered array would have at each one: 1, dim[0], dim[0]*dim[1], ...
 * Dimensions of length 1 are skipped: their stride is not checked and they
 * do not change the expected stride of later dimensions.
 *
 * Returns true if every checked host stride matches the expected value
 * (including when there are no dimensions to check), false otherwise.
 */
DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_f_contiguous(const CudaNdarray * self)
{
    // Stride the next non-unit dimension must have for an F-ordered layout.
    int expected_stride = 1;
    for (int axis = 0; axis < self->nd; ++axis)
    {
        // Length-1 dimensions place no constraint on the layout.
        if (CudaNdarray_HOST_DIMS(self)[axis] == 1)
            continue;
        // The first mismatch settles the answer; no need to look further.
        if (CudaNdarray_HOST_STRIDES(self)[axis] != expected_stride)
            return false;
        expected_stride = expected_stride * CudaNdarray_HOST_DIMS(self)[axis];
    }
    return true;
}
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
......@@ -525,8 +567,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
// *arr may initially be NULL, a pointer to an ndarray of the wrong size,
// or a pointer to an ndarray of the right size. In the last case it will
// not change.
// If fortran is non-zero, a fortran order is expected/created
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims);
const int * dims, int fortran = 0);
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
if(CUBLAS_STATUS_SUCCESS == err)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论