提交 c2888726 authored 作者: Frederic's avatar Frederic

Allow to create fortran order memory region the gpu.

上级 19c0dbcb
......@@ -5093,7 +5093,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims)
const int * dims, int fortran)
{
bool allocated = false;
if (*arr == NULL)
......@@ -5105,7 +5105,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
allocated = true;
}
if (CudaNdarray_alloc_contiguous(*arr, nd, dims))
if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
{
if (allocated)
{
......
......@@ -326,10 +326,13 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
* Allocate storage space for a tensor of rank 'nd' and given dimensions.
* (No-op if self already has a contiguous tensor of the right dimensions)
*
* If fortran is non-zeros, a fortran order is made, otherwise it is a c order.
*
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/
template<typename inttype>
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim)
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
const inttype * dim, int fortran=0)
{
// allocate an empty ndarray with c_contiguous access
// return 0 on success
......@@ -342,11 +345,23 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
{
return -1;
}
for (int i = nd-1; i >= 0; --i)
if (fortran)
{
for (int i = 0; i < nd; i++)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
}
}
else
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
for (int i = nd-1; i >= 0; --i)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
}
}
// If the allocated buffer is already of the right size, we don't need to
......@@ -525,8 +540,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
// *arr may initially be NULL, a pointer to an ndarray of the wrong size,
// or a pointer to an ndarray of the right size. In the last case it will
// not change.
// If fortran is non-zero, a fortran order is expected/created
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims);
const int * dims, int fortran = 0);
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
if(CUBLAS_STATUS_SUCCESS == err)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论