提交 b9cda46e authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Indentation

上级 1bdb8a65
...@@ -33,8 +33,8 @@ int _outstanding_mallocs[] = {0,0}; ...@@ -33,8 +33,8 @@ int _outstanding_mallocs[] = {0,0};
int _allocated_size = 0; int _allocated_size = 0;
const int TABLE_SIZE = 10000; const int TABLE_SIZE = 10000;
struct table_struct{ struct table_struct{
void* ptr; void* ptr;
int size; int size;
}; };
table_struct _alloc_size_table[TABLE_SIZE]; table_struct _alloc_size_table[TABLE_SIZE];
#endif #endif
...@@ -55,20 +55,20 @@ void * device_malloc(size_t size) ...@@ -55,20 +55,20 @@ void * device_malloc(size_t size)
_outstanding_mallocs[0] += (rval != NULL); _outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){ for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){ if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval; _alloc_size_table[i].ptr=rval;
_alloc_size_table[i].size=size; _alloc_size_table[i].size=size;
break; break;
} }
} }
_allocated_size += size; _allocated_size += size;
#endif #endif
//fprintf(stderr, "allocated %li bytes of device memory (%s). %d already allocated, ptr: %p\n", (long)size, cudaGetErrorString(err),_allocated_size,rval); //fprintf(stderr, "allocated %li bytes of device memory (%s). %d already allocated, ptr: %p\n", (long)size, cudaGetErrorString(err),_allocated_size,rval);
if(ALLOC_MEMSET){ if(ALLOC_MEMSET){
//We init them to nan to make sure we catch more debug case. //We init them to nan to make sure we catch more debug case.
cudaMemset(rval, 0xFF, size); cudaMemset(rval, 0xFF, size);
//printf("MEMSET\n"); //printf("MEMSET\n");
} }
return rval; return rval;
} }
...@@ -94,16 +94,16 @@ int device_free(void *ptr) ...@@ -94,16 +94,16 @@ int device_free(void *ptr)
int i=0; int i=0;
size_t total_freed = 0; size_t total_freed = 0;
for(;i<TABLE_SIZE;i++) for(;i<TABLE_SIZE;i++)
if(_alloc_size_table[i].ptr==ptr){ if(_alloc_size_table[i].ptr==ptr){
_allocated_size -= _alloc_size_table[i].size; _allocated_size -= _alloc_size_table[i].size;
total_freed += _alloc_size_table[i].size; total_freed += _alloc_size_table[i].size;
_alloc_size_table[i].ptr=0; _alloc_size_table[i].ptr=0;
_alloc_size_table[i].size=0; _alloc_size_table[i].size=0;
break; break;
} }
if(i==TABLE_SIZE) if(i==TABLE_SIZE)
printf("Unallocated unknow size!\n"); printf("Unallocated unknow size!\n");
//fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr); //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
#endif #endif
return 0; return 0;
...@@ -274,16 +274,16 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self) ...@@ -274,16 +274,16 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
{ {
int verbose = 0; int verbose = 0;
if(self->nd>=0 && CudaNdarray_SIZE(self)==0){ if(self->nd>=0 && CudaNdarray_SIZE(self)==0){
npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp)); npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
assert (npydims); assert (npydims);
for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]); for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM); PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
free(npydims); free(npydims);
if (!rval){ if (!rval){
return NULL; return NULL;
} }
assert (PyArray_ITEMSIZE(rval) == sizeof(real)); assert (PyArray_ITEMSIZE(rval) == sizeof(real));
return rval; return rval;
} }
if ((self->nd < 0) || (self->devdata == 0)) if ((self->nd < 0) || (self->devdata == 0))
{ {
...@@ -309,7 +309,8 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self) ...@@ -309,7 +309,8 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp)); npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
assert (npydims); assert (npydims);
for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]); for (int i = 0; i < self->nd; ++i)
npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM); PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
free(npydims); free(npydims);
if (!rval) if (!rval)
...@@ -321,8 +322,8 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self) ...@@ -321,8 +322,8 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
assert (PyArray_ITEMSIZE(rval) == sizeof(real)); assert (PyArray_ITEMSIZE(rval) == sizeof(real));
cublasGetVector(PyArray_SIZE(rval), sizeof(real), cublasGetVector(PyArray_SIZE(rval), sizeof(real),
contiguous_self->devdata, 1, contiguous_self->devdata, 1,
PyArray_DATA(rval), 1); PyArray_DATA(rval), 1);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
...@@ -345,7 +346,7 @@ PyObject* CudaNdarray_ZEROS(int n, int * dims) ...@@ -345,7 +346,7 @@ PyObject* CudaNdarray_ZEROS(int n, int * dims)
int total_elements = 1; int total_elements = 1;
for(int i=0;i<n;i++) for(int i=0;i<n;i++)
total_elements*=dims[i]; total_elements*=dims[i];
// total_elements now contains the size of the array, in reals // total_elements now contains the size of the array, in reals
int total_size = total_elements * sizeof(real); int total_size = total_elements * sizeof(real);
...@@ -597,10 +598,10 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) ...@@ -597,10 +598,10 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
unsigned int rval_size = 1; unsigned int rval_size = 1;
if (PyTuple_Check(shape)){ if (PyTuple_Check(shape)){
// copy shape to integer array // copy shape to integer array
rval_nd = PyTuple_Size(shape); rval_nd = PyTuple_Size(shape);
}else if (PyInt_Check(shape)){ }else if (PyInt_Check(shape)){
rval_nd = 1; rval_nd = 1;
}else{ }else{
PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer"); PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer");
return NULL; return NULL;
...@@ -608,24 +609,24 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) ...@@ -608,24 +609,24 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int)); rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int));
if(PyTuple_Check(shape)){ if(PyTuple_Check(shape)){
for (int i = 0; i < rval_nd; ++i) for (int i = 0; i < rval_nd; ++i)
{
rval_dims[i] = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
if (PyErr_Occurred()) //error in AsLong
{ {
free(rval_dims); rval_dims[i] = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
return NULL; if (PyErr_Occurred()) //error in AsLong
{
free(rval_dims);
return NULL;
}
if(rval_dims[i]<=0){
PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %i (must be >0)",rval_dims[i]);
free(rval_dims);
return NULL;
}
rval_size = rval_size * rval_dims[i];
} }
if(rval_dims[i]<=0){
PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %i (must be >0)",rval_dims[i]);
free(rval_dims);
return NULL;
}
rval_size = rval_size * rval_dims[i];
}
}else{ }else{
rval_size = PyInt_AsLong(shape); rval_size = PyInt_AsLong(shape);
rval_dims[0] = rval_size; rval_dims[0] = rval_size;
} }
// calculate new size, assert same as old size // calculate new size, assert same as old size
if (rval_size != CudaNdarray_SIZE(self)) if (rval_size != CudaNdarray_SIZE(self))
...@@ -637,8 +638,8 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) ...@@ -637,8 +638,8 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
if (rval_size==0) if (rval_size==0)
{ {
PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims); PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
free(rval_dims); free(rval_dims);
return rval; return rval;
} }
if(CudaNdarray_is_c_contiguous(self)) if(CudaNdarray_is_c_contiguous(self))
...@@ -648,19 +649,19 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) ...@@ -648,19 +649,19 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
if (!rval || 0 != rval->data_allocated if (!rval || 0 != rval->data_allocated
||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self)) ||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
{ {
Py_XDECREF(rval); Py_XDECREF(rval);
free(rval_dims); free(rval_dims);
return NULL; return NULL;
} }
//set dim and stride //set dim and stride
int size = 1; int size = 1;
for (int i = rval_nd-1; i >= 0; --i) for (int i = rval_nd-1; i >= 0; --i)
{ {
CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size); CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
CudaNdarray_set_dim(rval, i, rval_dims[i]); CudaNdarray_set_dim(rval, i, rval_dims[i]);
size = size * rval_dims[i]; size = size * rval_dims[i];
} }
free(rval_dims); free(rval_dims);
return (PyObject*)rval; return (PyObject*)rval;
} }
...@@ -1005,15 +1006,15 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb) ...@@ -1005,15 +1006,15 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
return NULL; return NULL;
} }
if (fct_nb<0 || fct_nb>1){ if (fct_nb<0 || fct_nb>1){
PyErr_SetString(PyExc_TypeError, "CudaNdarray_inplace_add_div fct_nb param supported are only 0 and 1."); PyErr_SetString(PyExc_TypeError, "CudaNdarray_inplace_add_div fct_nb param supported are only 0 and 1.");
return NULL; return NULL;
} }
CudaNdarray * self = (CudaNdarray *)py_self; CudaNdarray * self = (CudaNdarray *)py_self;
CudaNdarray * other = (CudaNdarray *)py_other; CudaNdarray * other = (CudaNdarray *)py_other;
if (verbose) fprintf(stderr, "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n", if (verbose) fprintf(stderr, "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n",
self->nd, other->nd); self->nd, other->nd);
//standard elemwise size checks //standard elemwise size checks
if (self->nd != other->nd) if (self->nd != other->nd)
...@@ -1035,8 +1036,8 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb) ...@@ -1035,8 +1036,8 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
} }
if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){ if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
Py_INCREF(py_self); Py_INCREF(py_self);
return py_self; return py_self;
} }
void (*k_iop_3)(const int, const int, const int, void (*k_iop_3)(const int, const int, const int,
float*, const int, const int, const int, float*, const int, const int, const int,
...@@ -1047,11 +1048,11 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb) ...@@ -1047,11 +1048,11 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
const float*, const int, const int, const float*, const int, const int,
const int, const int); const int, const int);
if(fct_nb == 0){ if(fct_nb == 0){
k_iop_3 = k_iAdd_3; k_iop_3 = k_iAdd_3;
k_iop_4 = k_iAdd_4; k_iop_4 = k_iAdd_4;
}else if(fct_nb == 1){ }else if(fct_nb == 1){
k_iop_3 = k_iDiv_3; k_iop_3 = k_iDiv_3;
k_iop_4 = k_iDiv_4; k_iop_4 = k_iDiv_4;
} }
switch(self->nd) switch(self->nd)
...@@ -1062,7 +1063,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb) ...@@ -1062,7 +1063,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
dim3 n_threads(1); dim3 n_threads(1);
k_iop_3<<<n_blocks, n_threads>>>(1, k_iop_3<<<n_blocks, n_threads>>>(1,
1, //CudaNdarray_HOST_DIMS(self)[0], 1, //CudaNdarray_HOST_DIMS(self)[0],
1, //CudaNdarray_HOST_DIMS(self)[0], 1, //CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_DATA(self),
1, 1,
1, //CudaNdarray_HOST_STRIDES(self)[0], 1, //CudaNdarray_HOST_STRIDES(self)[0],
...@@ -1254,10 +1255,10 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb) ...@@ -1254,10 +1255,10 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
// Will be called by __iadd__ in Python // Will be called by __iadd__ in Python
static PyObject * static PyObject *
CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other){ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other){
PyObject * rval = CudaNdarray_inplace_add_div(py_self, py_other, 0); PyObject * rval = CudaNdarray_inplace_add_div(py_self, py_other, 0);
//We should not increment the refcount as we are doing inplace operation //We should not increment the refcount as we are doing inplace operation
//And in this syntax, their is no additional reference created! //And in this syntax, their is no additional reference created!
return rval; return rval;
} }
/* /*
...@@ -1266,10 +1267,10 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other){ ...@@ -1266,10 +1267,10 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other){
// Will be called by __idiv__ in Python // Will be called by __idiv__ in Python
static PyObject * static PyObject *
CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other){ CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other){
PyObject * rval = CudaNdarray_inplace_add_div(py_self, py_other, 1); PyObject * rval = CudaNdarray_inplace_add_div(py_self, py_other, 1);
//We should not increment the refcount as we are doing inplace operation //We should not increment the refcount as we are doing inplace operation
//And in this syntax, their is no additional reference created! //And in this syntax, their is no additional reference created!
return rval; return rval;
} }
static PyNumberMethods CudaNdarrayNumberMethods = static PyNumberMethods CudaNdarrayNumberMethods =
...@@ -1902,36 +1903,36 @@ static PyTypeObject CudaNdarrayType = ...@@ -1902,36 +1903,36 @@ static PyTypeObject CudaNdarrayType =
static __global__ void get_gpu_ptr_size(int* dst) static __global__ void get_gpu_ptr_size(int* dst)
{ {
dst[0] = sizeof(float*); dst[0] = sizeof(float*);
dst[1] = sizeof(int); dst[1] = sizeof(int);
} }
PyObject * PyObject *
CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
{ {
int *gpu_data = (int*)device_malloc(sizeof(int)*2); int *gpu_data = (int*)device_malloc(sizeof(int)*2);
if(gpu_data == NULL){ if(gpu_data == NULL){
return PyErr_Format(PyExc_MemoryError, return PyErr_Format(PyExc_MemoryError,
"CudaNdarray_ptr_int_size: Can't allocate memory on the gpu."); "CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
} }
get_gpu_ptr_size<<<1,1>>>(gpu_data); get_gpu_ptr_size<<<1,1>>>(gpu_data);
if (cudaSuccess != cublasGetError()){ if (cudaSuccess != cublasGetError()){
device_free(gpu_data); device_free(gpu_data);
return PyErr_Format(PyExc_RuntimeError, return PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_ptr_int_size: error when calling the gpu code."); "CudaNdarray_ptr_int_size: error when calling the gpu code.");
} }
// Transfer the result to cpu // Transfer the result to cpu
int gpu_sizes[] = {-1,-1}; int gpu_sizes[] = {-1,-1};
cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1); cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
device_free(gpu_data); device_free(gpu_data);
if (CUBLAS_STATUS_SUCCESS != cublasGetError()){ if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
PyErr_SetString(PyExc_RuntimeError, "error copying data to from memory"); PyErr_SetString(PyExc_RuntimeError, "error copying data to from memory");
return NULL; return NULL;
} }
return Py_BuildValue("iiii", gpu_sizes[0], sizeof(float*), sizeof(int), gpu_sizes[1]); return Py_BuildValue("iiii", gpu_sizes[0], sizeof(float*), sizeof(int), gpu_sizes[1]);
} }
// Initialize the gpu. // Initialize the gpu.
...@@ -1943,65 +1944,65 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -1943,65 +1944,65 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
PyObject * PyObject *
CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{ {
int card_nb = 0; int card_nb = 0;
int card_number_provided = 1; int card_number_provided = 1;
PyArg_ParseTuple(args, "|i", &card_nb); // if we're given something wildly invalid, this will throw a TypeError PyArg_ParseTuple(args, "|i", &card_nb); // if we're given something wildly invalid, this will throw a TypeError
if(PyTuple_Size(args) == 0) { if(PyTuple_Size(args) == 0) {
card_number_provided = 0; card_number_provided = 0;
card_nb = 0; card_nb = 0;
} }
int deviceCount; int deviceCount;
cudaError err = cudaGetDeviceCount(&deviceCount); cudaError err = cudaGetDeviceCount(&deviceCount);
if(cudaSuccess != err) { if(cudaSuccess != err) {
return PyErr_Format(PyExc_EnvironmentError, return PyErr_Format(PyExc_EnvironmentError,
"Unable to get the number of gpus available: %s", "Unable to get the number of gpus available: %s",
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
} }
// as soon as the first successful call to a cuda* function is made, a // as soon as the first successful call to a cuda* function is made, a
// gpu context has been created // gpu context has been created
g_gpu_context_active = 1; g_gpu_context_active = 1;
if(deviceCount <= 0) { if(deviceCount <= 0) {
return PyErr_Format(PyExc_EnvironmentError, return PyErr_Format(PyExc_EnvironmentError,
"Can't use the GPU, no devices support CUDA"); "Can't use the GPU, no devices support CUDA");
} }
if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) { if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) {
return PyErr_Format(PyExc_ValueError, return PyErr_Format(PyExc_ValueError,
"Bad device number %d. Only %d devices available.", "Bad device number %d. Only %d devices available.",
card_nb, card_nb,
deviceCount); deviceCount);
} }
cudaDeviceProp deviceProp; cudaDeviceProp deviceProp;
err = cudaGetDeviceProperties(&deviceProp, card_nb); err = cudaGetDeviceProperties(&deviceProp, card_nb);
if(cudaSuccess != err) {
return PyErr_Format(PyExc_EnvironmentError,
"Unable to get properties of gpu %i: %s",
card_nb,
cudaGetErrorString(cudaGetLastError()));
}
if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
return PyErr_Format(PyExc_EnvironmentError,
"There is no device that supports CUDA");
}
if(card_number_provided) {
err = cudaSetDevice(card_nb);
if(cudaSuccess != err) { if(cudaSuccess != err) {
return PyErr_Format(PyExc_EnvironmentError, return PyErr_Format(PyExc_EnvironmentError,
"Unable to set device %i: %s", "Unable to get properties of gpu %i: %s",
card_nb, card_nb,
cudaGetErrorString(cudaGetLastError())); cudaGetErrorString(cudaGetLastError()));
}
if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
return PyErr_Format(PyExc_EnvironmentError,
"There is no device that supports CUDA");
}
if(card_number_provided) {
err = cudaSetDevice(card_nb);
if(cudaSuccess != err) {
return PyErr_Format(PyExc_EnvironmentError,
"Unable to set device %i: %s",
card_nb,
cudaGetErrorString(cudaGetLastError()));
}
} }
}
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
PyObject * PyObject *
...@@ -2056,7 +2057,7 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args) ...@@ -2056,7 +2057,7 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
if (!PyLong_Check(gpu_ptr)) if (!PyLong_Check(gpu_ptr))
{ {
PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointor is not an long"); PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointor is not an long");
return NULL; return NULL;
} }
Py_ssize_t nd = PyObject_Length(shapes); Py_ssize_t nd = PyObject_Length(shapes);
...@@ -2071,7 +2072,7 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args) ...@@ -2071,7 +2072,7 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument"); PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
return NULL; return NULL;
} }
if (nd != nd_stride) if (nd != nd_stride)
{ {
PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides"); PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
...@@ -2094,7 +2095,7 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args) ...@@ -2094,7 +2095,7 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
} }
// Set dims and strides // Set dims and strides
for (int i = nd-1; i >= 0; --i) for (int i = nd-1; i >= 0; --i)
{ {
PyObject * idx = PyLong_FromLong(i); PyObject * idx = PyLong_FromLong(i);
...@@ -2105,23 +2106,23 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args) ...@@ -2105,23 +2106,23 @@ CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
} }
PyObject* dim_ = PyObject_GetItem(shapes, idx); PyObject* dim_ = PyObject_GetItem(shapes, idx);
PyObject* strd_ = PyObject_GetItem(strides, idx); PyObject* strd_ = PyObject_GetItem(strides, idx);
if (!PyInt_Check(dim_)) if (!PyInt_Check(dim_))
{ {
PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i); PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
return NULL; return NULL;
} }
if (!PyInt_Check(strd_)) if (!PyInt_Check(strd_))
{ {
PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i); PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
return NULL; return NULL;
} }
int dim = PyInt_AsLong(dim_); int dim = PyInt_AsLong(dim_);
int strd = PyInt_AsLong(strd_); int strd = PyInt_AsLong(strd_);
CudaNdarray_set_stride((CudaNdarray *)rval, i, strd); CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
CudaNdarray_set_dim((CudaNdarray *)rval, i, dim); CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
Py_DECREF(idx); Py_DECREF(idx);
Py_DECREF(dim_); Py_DECREF(dim_);
Py_DECREF(strd_); Py_DECREF(strd_);
} }
printf("CudaNdarray_from_gpu_pointer normal return\n"); printf("CudaNdarray_from_gpu_pointer normal return\n");
return rval; return rval;
...@@ -2319,8 +2320,8 @@ initcuda_ndarray(void) ...@@ -2319,8 +2320,8 @@ initcuda_ndarray(void)
PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType); PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){ for(int i=0;i<TABLE_SIZE;i++){
_alloc_size_table[i].ptr=NULL; _alloc_size_table[i].ptr=NULL;
_alloc_size_table[i].size=0; _alloc_size_table[i].size=0;
} }
#endif #endif
// cublasInit(); // cublasInit();
...@@ -2744,45 +2745,45 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -2744,45 +2745,45 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
} }
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) { int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; } if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; } if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; } if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0]) if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
|| (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) { || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"dimension mismatch in args to sger (%i)x(%i)->(%i,%i)", "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(x)[0],
CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(y)[0],
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1]); CudaNdarray_HOST_DIMS(A)[1]);
return -1; return -1;
} }
// Maybe this could work, but be safe for now // Maybe this could work, but be safe for now
if (!CudaNdarray_is_c_contiguous(A)) { if (!CudaNdarray_is_c_contiguous(A)) {
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger"); PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1; return -1;
} }
// Same for this, be safe // Same for this, be safe
assert (CudaNdarray_HOST_STRIDES(x)[0] >= 0); assert (CudaNdarray_HOST_STRIDES(x)[0] >= 0);
assert (CudaNdarray_HOST_STRIDES(y)[0] >= 0); assert (CudaNdarray_HOST_STRIDES(y)[0] >= 0);
// Since Sger expects A in col-major, we invert x and y to fake this. // Since Sger expects A in col-major, we invert x and y to fake this.
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha, cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0], CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0],
CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0], CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0],
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]); CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err) if (CUBLAS_STATUS_SUCCESS != err)
{ {
PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)",cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)",cudaGetErrorString(err));
return -1; return -1;
} }
return 0; return 0;
} }
/** /**
...@@ -3143,22 +3144,22 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern ...@@ -3143,22 +3144,22 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
newdims[i] = 1; newdims[i] = 1;
newstrides[i] = 0; newstrides[i] = 0;
} }
else if(dims_taken[pattern[i]]) else if(dims_taken[pattern[i]])
{ {
PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You used the dimensions %d multiple time", PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You used the dimensions %d multiple time",
pattern[i]); pattern[i]);
free(newdims); free(newdims);
return -1; return -1;
} }
else if (pattern[i]>= self->nd) else if (pattern[i]>= self->nd)
{ {
PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You asked for a dimensions that don't exist %d for a %d dims CudaNdarray", PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You asked for a dimensions that don't exist %d for a %d dims CudaNdarray",
pattern[i], self->nd); pattern[i], self->nd);
free(newdims); free(newdims);
return -1; return -1;
} }
else else
{ {
newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]]; newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]]; newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
dims_taken[pattern[i]] = 1; dims_taken[pattern[i]] = 1;
...@@ -3169,9 +3170,9 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern ...@@ -3169,9 +3170,9 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
{ {
if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1) if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1)
{ {
PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: You cannot drop a non-broadcastable dimension."); PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: You cannot drop a non-broadcastable dimension.");
free(newdims); free(newdims);
return -1; return -1;
} }
} }
//swap this structure in for the one in self, and sync to the card //swap this structure in for the one in self, and sync to the card
...@@ -3225,7 +3226,7 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args) ...@@ -3225,7 +3226,7 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
//parse pattern_object into int * pattern //parse pattern_object into int * pattern
Py_ssize_t pattern_dim = PyObject_Length(pattern_object); Py_ssize_t pattern_dim = PyObject_Length(pattern_object);
if (pattern_dim < 0) if (pattern_dim < 0)
{ {
PyErr_SetString(PyExc_TypeError, "Couldn't get length of third argument to cuda_ndarray.dimshuffle"); PyErr_SetString(PyExc_TypeError, "Couldn't get length of third argument to cuda_ndarray.dimshuffle");
...@@ -3233,7 +3234,7 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args) ...@@ -3233,7 +3234,7 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
} }
pattern = (int *) malloc( pattern_dim * sizeof(int)); pattern = (int *) malloc( pattern_dim * sizeof(int));
for (Py_ssize_t i = 0; i < pattern_dim; i++) for (Py_ssize_t i = 0; i < pattern_dim; i++)
{ {
PyObject * idx = PyLong_FromLong(i); PyObject * idx = PyLong_FromLong(i);
...@@ -3242,10 +3243,10 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args) ...@@ -3242,10 +3243,10 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
{ {
PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple"); PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple");
goto CudaNdarray_dimshuffle_fail; goto CudaNdarray_dimshuffle_fail;
} }
long elem_value = 0; long elem_value = 0;
PyObject * elem = PyObject_GetItem(pattern_object, idx); PyObject * elem = PyObject_GetItem(pattern_object, idx);
if (elem == NULL) if (elem == NULL)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论