提交 163deb4b authored 作者: James Bergstra's avatar James Bergstra

cuda - added new device_malloc and device_free wrappers that count device allocations

上级 93c8954c
...@@ -6,6 +6,45 @@ ...@@ -6,6 +6,45 @@
#include "cuda_ndarray.cuh" #include "cuda_ndarray.cuh"
/////////////////////////
// Alloc and Free
/////////////////////////
/**
*
* In the test program I'm using, the _outstanding_mallocs decreases with every call.
* This suggests there are more free() calls being made than alloc(), but I can't figure out why.
*
*/
int _outstanding_mallocs = 0;
void * device_malloc(size_t size)
{
void * rval=NULL;
if (cudaSuccess != cudaMalloc(&rval, size))
{
fprintf(stderr, "Error allocating %i bytes of device memory.\n", size);
PyErr_Format(PyExc_MemoryError, "error allocating %i bytes of device memory", size);
return NULL;
}
_outstanding_mallocs += (rval != NULL);
return rval;
}
int device_free(void *ptr)
{
if (cudaSuccess != cudaFree(ptr))
{
PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p", ptr);
return -1;
}
_outstanding_mallocs -= (ptr != NULL);
return 0;
}
static PyObject *
outstanding_mallocs(PyObject* self, PyObject * args)
{
return PyInt_FromLong(_outstanding_mallocs);
}
///////////////////////// /////////////////////////
// Static helper methods // Static helper methods
///////////////////////// /////////////////////////
...@@ -28,8 +67,7 @@ CudaNdarray_uninit(CudaNdarray*self) ...@@ -28,8 +67,7 @@ CudaNdarray_uninit(CudaNdarray*self)
int rval = 0; int rval = 0;
if (self->data_allocated) { if (self->data_allocated) {
assert(self->devdata); assert(self->devdata);
cublasFree(self->devdata); if (device_free(self->devdata))
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
std::cerr << "!!!! error freeing device memory\n"; std::cerr << "!!!! error freeing device memory\n";
rval = -1; rval = -1;
...@@ -39,8 +77,7 @@ CudaNdarray_uninit(CudaNdarray*self) ...@@ -39,8 +77,7 @@ CudaNdarray_uninit(CudaNdarray*self)
} }
if (self->dev_structure) if (self->dev_structure)
{ {
cublasFree(self->dev_structure); if (device_free(self->dev_structure))
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
std::cerr << "!!!! error freeing device memory\n"; std::cerr << "!!!! error freeing device memory\n";
rval = -1; rval = -1;
...@@ -269,6 +306,8 @@ PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask) ...@@ -269,6 +306,8 @@ PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
{ {
return NULL; return NULL;
} }
//TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
// and use it if it is big enough.
int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int)); int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
int * sum_dims = dimshuffle_pattern + len; int * sum_dims = dimshuffle_pattern + len;
int n_remaining_dims = 0; int n_remaining_dims = 0;
...@@ -1453,7 +1492,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s ...@@ -1453,7 +1492,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
return NULL; return NULL;
} }
} }
if (CudaNdarray_Check(storage)) if (storage && CudaNdarray_Check(storage))
{ {
rval = (CudaNdarray*) storage; rval = (CudaNdarray*) storage;
Py_INCREF(rval); Py_INCREF(rval);
...@@ -1462,14 +1501,17 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s ...@@ -1462,14 +1501,17 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
{ {
rval = (CudaNdarray*) CudaNdarray_new_null(); rval = (CudaNdarray*) CudaNdarray_new_null();
} }
if (CudaNdarray_CopyFromArray(rval, data)) if (rval)
{ {
Py_DECREF(rval); if (CudaNdarray_CopyFromArray(rval, data))
rval = NULL; {
Py_DECREF(rval);
rval = NULL;
}
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
} }
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
return (PyObject*)rval; return (PyObject*)rval;
} }
} }
...@@ -1478,6 +1520,7 @@ static PyMethodDef module_methods[] = { ...@@ -1478,6 +1520,7 @@ static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."}, {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."}, {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."}, {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
{NULL, NULL, NULL, NULL} /* Sentinel */ {NULL, NULL, NULL, NULL} /* Sentinel */
}; };
...@@ -1670,10 +1713,8 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * ...@@ -1670,10 +1713,8 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray *
if (self->data_allocated) if (self->data_allocated)
{ {
assert(self->devdata); assert(self->devdata);
cublasFree(self->devdata); if (device_free(self->devdata))
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
self->devdata = NULL; self->devdata = NULL;
self->data_allocated = 0; self->data_allocated = 0;
return -1; return -1;
......
...@@ -30,6 +30,15 @@ typedef float real; ...@@ -30,6 +30,15 @@ typedef float real;
#define SHARED_SIZE (16*1024) #define SHARED_SIZE (16*1024)
#endif #endif
/**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
*
* device_malloc will set the Python error message before returning None.
* device_free will return nonzero on failure (after setting the python error message)
*/
void * device_malloc(size_t size);
int device_free(void * ptr);
template <typename T> template <typename T>
static T ceil_intdiv(T a, T b) static T ceil_intdiv(T a, T b)
{ {
...@@ -248,10 +257,8 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -248,10 +257,8 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{ {
if (self->dev_structure) if (self->dev_structure)
{ {
cublasFree(self->dev_structure); if (device_free(self->dev_structure))
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
return -1; return -1;
} }
self->dev_structure = NULL; self->dev_structure = NULL;
...@@ -272,14 +279,17 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -272,14 +279,17 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str"); PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
return -1; return -1;
} }
cublasAlloc(cnda_structure_size(nd), sizeof(int), (void**)&self->dev_structure); int struct_size = cnda_structure_size(nd);
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (struct_size)
{ {
PyErr_SetString(PyExc_MemoryError, "error allocating device memory"); self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
free(self->host_structure); if (NULL == self->dev_structure)
self->host_structure = NULL; {
self->dev_structure = NULL; free(self->host_structure);
return -1; self->host_structure = NULL;
self->dev_structure = NULL;
return -1;
}
} }
self->nd = nd; self->nd = nd;
self->dev_structure_fresh = 0; self->dev_structure_fresh = 0;
...@@ -317,20 +327,15 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -317,20 +327,15 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
if (self->data_allocated != size) if (self->data_allocated != size)
{ {
//std::cerr << "resizing from " << self->data_allocated << " to " << size << '\n'; if (device_free(self->devdata))
cublasFree(self->devdata); {
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) // Does this ever happen?? Do we need to set data_allocated or devdata to 0?
{// Does this ever happen?? Do we need to set data_allocated or devdata to 0?
PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
return -1; return -1;
} }
assert(size>0); assert(size>0);
cublasAlloc(size, sizeof(real), (void**)&(self->devdata)); self->devdata = (float*)device_malloc(size*sizeof(real));
//std::cerr << "cublasAlloc returned " << self->devdata << "\n"; if (!self->devdata)
//We must do both checks as the first one is not enough in some cases!
if (CUBLAS_STATUS_SUCCESS != cublasGetError() || !self->devdata)
{ {
PyErr_Format(PyExc_MemoryError, "error allocating %i bytes device memory",size);
CudaNdarray_set_nd(self,-1); CudaNdarray_set_nd(self,-1);
self->data_allocated = 0; self->data_allocated = 0;
self->devdata = 0; self->devdata = 0;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论