提交 f441295b authored 作者: Frederic's avatar Frederic

Postpone creating the device structure until it is actually needed.

This is a significant speedup with the gc because, most of the time, we don't need the device structure, and allocating on the GPU is slow.
上级 c50a2db1
...@@ -4385,7 +4385,30 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2) ...@@ -4385,7 +4385,30 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
int int
cnda_copy_structure_to_device(const CudaNdarray * self) cnda_copy_structure_to_device(const CudaNdarray * self)
{ {
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1); //If the device structure do not exists, create it.
//We allocate it here as we do not need it often.
//In fact, we need it so infrequently that we expect
//that most objects won't need it. Not allocating it
//saves significant time when creating objects.
//This speeds up a benchmark by 8% with the gc.
if (!self->dev_structure)
{
int struct_size = cnda_structure_size(self->nd);
if (struct_size)
{
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
return -1;
}
}
}
cublasSetVector(cnda_structure_size(self->nd),
sizeof(int),
self->host_structure,
1,
self->dev_structure,
1);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
......
...@@ -82,8 +82,9 @@ struct CudaNdarray ...@@ -82,8 +82,9 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc) //device pointers (allocated by cudaMalloc)
mutable int dev_structure_fresh; mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized //dev_structure should be accessed via macros, otherwise may not be
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ... //synchronized. The macro will allocate it when needed.
mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
}; };
...@@ -251,19 +252,8 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -251,19 +252,8 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{ {
self->host_structure[i] = 0; self->host_structure[i] = 0;
} }
//The device structure will be created in cnda_copy_structure_to_device
int struct_size = cnda_structure_size(nd); //if needed.
if (struct_size)
{
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->dev_structure = NULL;
return -1;
}
}
self->nd = nd; self->nd = nd;
self->dev_structure_fresh = 0; self->dev_structure_fresh = 0;
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论