提交 4d96960b authored 作者: Frederic Bastien's avatar Frederic Bastien

Implement CudaNdarray.__idiv__ and test it. This is needed for later test change.

上级 3ac57b00
...@@ -876,61 +876,123 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other) ...@@ -876,61 +876,123 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other)
} }
return (PyObject *) rval; return (PyObject *) rval;
} }
__global__ void k_iAdd_3(const int d0, const int d1, const int d2,
float* a, const int sA0, const int sA1, const int sA2, /*
const float* b, const int sB0, const int sB1, const int sB2) #define decl_k_elemwise_binary_inplace_rowmajor_3(name, F) \
{ __global__ void name(const int d0, const int d1, const int d2,\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) float* a, const int sA0, const int sA1, const int sA2,\
{ const float* b, const int sB0, const int sB1, const int sB2){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
{ for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
{ F(a[i0*sA0 + i1*sA1 + i2*sA2], b[i0*sB0 + i1*sB1 + i2*sB2]); \
a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2]; }\
} }\
} }\
}
} }
__global__ void k_iAdd_4(const int d0, const int d1, const int d2, const int d3,
float* a, const int sA0, const int sA1, #define decl_k_elemwise_binary_inplace_rowmajor_4(name, F) \
const int sA2, const int sA3, __global__ void name(const int d0, const int d1, const int d2, const int d3,\
const float* b, const int sB0, const int sB1, float* a, const int sA0, const int sA1,\
const int sB2, const int sB3) const int sA2, const int sA3,\
{ const float* b, const int sB0, const int sB1,\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) const int sB2, const int sB3){\
{ for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
{ for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){\
{ F(a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3], b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3]); \
for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y) }\
{ }\
a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3] += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3]; }\
} }\
}
}
}
} }
/*
* We need this inplace Add to support IncSubTensor template<typename T> __device__ T binary_iadd(T a, T b) { a = a+b; }
*/ template<typename T> __device__ T binary_idiv(T a, T b) { a = a/b; }
// Will be called by __iadd__ in Python
decl_k_elemwise_binary_inplace_rowmajor_3(k_iAdd_3, binary_iadd<float>)
decl_k_elemwise_binary_inplace_rowmajor_4(k_iAdd_4, binary_iadd<float>)
decl_k_elemwise_binary_inplace_rowmajor_3(k_iDiv_3, binary_idiv<float>)
decl_k_elemwise_binary_inplace_rowmajor_4(k_iDiv_4, binary_idiv<float>)
*/
// Element-wise in-place addition (a += b) over a 3d row-major view.
// d0..d2 are the dimensions; sA*/sB* are per-dimension ELEMENT strides, so
// arbitrarily strided (e.g. broadcasted) operands are supported.
// Launch layout: grid.x/grid.y cover the two outer dimensions, the thread
// block covers the innermost one; every loop is grid-/block-strided, so the
// kernel is correct for any launch configuration.
// NOTE: removed the stray trailing '\' line continuations left over from the
// abandoned macro form above — they splice source lines and could silently
// swallow a following comment or preprocessor directive.
__global__ void k_iAdd_3(const int d0, const int d1, const int d2,
                         float* a, const int sA0, const int sA1, const int sA2,
                         const float* b, const int sB0, const int sB1, const int sB2)
{
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
    {
        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
        {
            for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
            {
                a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2];
            }
        }
    }
}
// Element-wise in-place addition (a += b) over a 4d row-major view.
// d0..d3 are the dimensions; sA*/sB* are per-dimension ELEMENT strides.
// Launch layout: grid.x/grid.y cover the two outer dimensions and the 2d
// thread block (x, y) covers the two inner ones; all loops are grid-/block-
// strided so any launch configuration is valid.
// NOTE: removed the stray trailing '\' line continuations left over from the
// abandoned macro form above — they splice source lines and could silently
// swallow a following comment or preprocessor directive.
__global__ void k_iAdd_4(const int d0, const int d1, const int d2, const int d3,
                         float* a, const int sA0, const int sA1,
                         const int sA2, const int sA3,
                         const float* b, const int sB0, const int sB1,
                         const int sB2, const int sB3)
{
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
    {
        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
        {
            for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
            {
                for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y)
                {
                    a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3] += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
                }
            }
        }
    }
}
// Element-wise in-place division (a /= b) over a 3d row-major view.
// Mirrors k_iAdd_3 exactly except for the operator; see that kernel for the
// launch-layout contract.  No zero-divisor check is performed: IEEE float
// division by zero yields inf/nan, matching numpy's float32 behaviour.
// NOTE: removed the stray trailing '\' line continuations left over from the
// abandoned macro form above — they splice source lines and could silently
// swallow a following comment or preprocessor directive.
__global__ void k_iDiv_3(const int d0, const int d1, const int d2,
                         float* a, const int sA0, const int sA1, const int sA2,
                         const float* b, const int sB0, const int sB1, const int sB2)
{
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
    {
        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
        {
            for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
            {
                a[i0*sA0 + i1*sA1 + i2*sA2] /= b[i0*sB0 + i1*sB1 + i2*sB2];
            }
        }
    }
}
// Element-wise in-place division (a /= b) over a 4d row-major view.
// Mirrors k_iAdd_4 exactly except for the operator; see that kernel for the
// launch-layout contract.  No zero-divisor check is performed: IEEE float
// division by zero yields inf/nan, matching numpy's float32 behaviour.
// NOTE: removed the stray trailing '\' line continuations left over from the
// abandoned macro form above — they splice source lines and could silently
// swallow a following comment or preprocessor directive.
__global__ void k_iDiv_4(const int d0, const int d1, const int d2, const int d3,
                         float* a, const int sA0, const int sA1,
                         const int sA2, const int sA3,
                         const float* b, const int sB0, const int sB1,
                         const int sB2, const int sB3)
{
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
    {
        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
        {
            for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
            {
                for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y)
                {
                    a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3] /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
                }
            }
        }
    }
}
static PyObject * static PyObject *
CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
{ {
int verbose = 0; int verbose = 0;
if (! CudaNdarray_Check(py_self)) { if (! CudaNdarray_Check(py_self)) {
PyErr_SetString(PyExc_TypeError, "CudaNdarray_inplace_add need a CudaNdarray on left"); PyErr_SetString(PyExc_TypeError, "CudaNdarray_inplace_add_div need a CudaNdarray on left");
return NULL; return NULL;
} }
if (! CudaNdarray_Check(py_other)) { if (! CudaNdarray_Check(py_other)) {
PyErr_SetString(PyExc_TypeError, "CudaNdarray_inplace_add need a CudaNdarray on right"); PyErr_SetString(PyExc_TypeError, "CudaNdarray_inplace_add_div need a CudaNdarray on right");
return NULL; return NULL;
} }
if (fct_nb<0 || fct_nb>1){
PyErr_SetString(PyExc_TypeError, "CudaNdarray_inplace_add_div fct_nb param supported are only 0 and 1.");
return NULL;
}
CudaNdarray * self = (CudaNdarray *)py_self; CudaNdarray * self = (CudaNdarray *)py_self;
CudaNdarray * other = (CudaNdarray *)py_other; CudaNdarray * other = (CudaNdarray *)py_other;
if (verbose) fprintf(stderr, "INPLACE ADD for nd=%d\n",self->nd); if (verbose) fprintf(stderr, "INPLACE ADD/DIV for nd=%d\n",self->nd);
//standard elemwise size checks //standard elemwise size checks
if (self->nd != other->nd) if (self->nd != other->nd)
...@@ -955,6 +1017,21 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -955,6 +1017,21 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
Py_INCREF(py_self); Py_INCREF(py_self);
return py_self; return py_self;
} }
void (*k_iop_3)(const int, const int, const int,
float*, const int, const int, const int,
const float*, const int, const int, const int);
void (*k_iop_4)(const int, const int, const int, const int,
float*, const int, const int,
const int, const int,
const float*, const int, const int,
const int, const int);
if(fct_nb == 0){
k_iop_3 = k_iAdd_3;
k_iop_4 = k_iAdd_4;
}else if(fct_nb == 1){
k_iop_3 = k_iDiv_3;
k_iop_4 = k_iDiv_4;
}
switch(self->nd) switch(self->nd)
{ {
...@@ -964,7 +1041,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -964,7 +1041,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK) std::min(CudaNdarray_HOST_DIMS(self)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK)
); );
k_iAdd_3<<<n_blocks, n_threads>>>(1, k_iop_3<<<n_blocks, n_threads>>>(1,
1, //CudaNdarray_HOST_DIMS(self)[0], 1, //CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[0], CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_DATA(self),
...@@ -979,7 +1056,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -979,7 +1056,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iAdd", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_3", cudaGetErrorString(err));
return NULL; return NULL;
} }
Py_INCREF(py_self); Py_INCREF(py_self);
...@@ -993,7 +1070,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -993,7 +1070,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[1], NUM_VECTOR_OP_THREADS_PER_BLOCK) std::min(CudaNdarray_HOST_DIMS(self)[1], NUM_VECTOR_OP_THREADS_PER_BLOCK)
); );
k_iAdd_3<<<n_blocks, n_threads>>>(1, k_iop_3<<<n_blocks, n_threads>>>(1,
CudaNdarray_HOST_DIMS(self)[0], CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[1], CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_DATA(self),
...@@ -1008,7 +1085,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1008,7 +1085,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iAdd", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_3", cudaGetErrorString(err));
return NULL; return NULL;
} }
Py_INCREF(py_self); Py_INCREF(py_self);
...@@ -1024,7 +1101,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1024,7 +1101,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK) std::min(CudaNdarray_HOST_DIMS(self)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)
); );
k_iAdd_3<<<n_blocks, n_threads>>>( k_iop_3<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0], CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[1], CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2], CudaNdarray_HOST_DIMS(self)[2],
...@@ -1040,7 +1117,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1040,7 +1117,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iAdd", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_3", cudaGetErrorString(err));
return NULL; return NULL;
} }
Py_INCREF(py_self); Py_INCREF(py_self);
...@@ -1056,7 +1133,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1056,7 +1133,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK) std::min(CudaNdarray_HOST_DIMS(self)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)
); );
k_iAdd_4<<<n_blocks, n_threads>>>( k_iop_4<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0], CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[1], CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2], CudaNdarray_HOST_DIMS(self)[2],
...@@ -1075,7 +1152,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1075,7 +1152,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iAdd", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_4", cudaGetErrorString(err));
return NULL; return NULL;
} }
Py_INCREF(py_self); Py_INCREF(py_self);
...@@ -1093,7 +1170,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1093,7 +1170,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
); );
for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i) for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
{ {
k_iAdd_4<<<n_blocks, n_threads>>>( k_iop_4<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[1], CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2], CudaNdarray_HOST_DIMS(self)[2],
CudaNdarray_HOST_DIMS(self)[3], CudaNdarray_HOST_DIMS(self)[3],
...@@ -1112,7 +1189,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1112,7 +1189,7 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iAdd", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_4", cudaGetErrorString(err));
return NULL; return NULL;
} }
} }
...@@ -1125,6 +1202,24 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other) ...@@ -1125,6 +1202,24 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
return NULL; return NULL;
} }
/*
 * We need this inplace Add to support IncSubTensor.
 *
 * Thin wrapper over CudaNdarray_inplace_add_div with fct_nb=0 (add).
 * Returns py_self with an incremented refcount on success, or NULL with a
 * Python exception set on failure (as produced by the shared implementation).
 */
// Will be called by __iadd__ in Python
static PyObject *
CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other){
    // BUG FIX: the result must be returned.  Falling off the end of a
    // PyObject*-returning function is undefined behaviour and would hand the
    // interpreter garbage instead of py_self/NULL, corrupting refcounting
    // and error reporting.
    return CudaNdarray_inplace_add_div(py_self, py_other, 0);
}
/*
 * We need this inplace div for cuda/tests/test_basic_ops.py:test_shared_options.
 *
 * Thin wrapper over CudaNdarray_inplace_add_div with fct_nb=1 (divide).
 * Returns py_self with an incremented refcount on success, or NULL with a
 * Python exception set on failure (as produced by the shared implementation).
 */
// Will be called by __idiv__ in Python
static PyObject *
CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other){
    // BUG FIX: the result must be returned.  Falling off the end of a
    // PyObject*-returning function is undefined behaviour and would hand the
    // interpreter garbage instead of py_self/NULL, corrupting refcounting
    // and error reporting.
    return CudaNdarray_inplace_add_div(py_self, py_other, 1);
}
static PyNumberMethods CudaNdarrayNumberMethods = static PyNumberMethods CudaNdarrayNumberMethods =
{ {
(binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__ (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
...@@ -1155,7 +1250,7 @@ static PyNumberMethods CudaNdarrayNumberMethods = ...@@ -1155,7 +1250,7 @@ static PyNumberMethods CudaNdarrayNumberMethods =
(binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__ (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
0, //binaryfunc nb_inplace_subtract; __isub__ 0, //binaryfunc nb_inplace_subtract; __isub__
0, //binaryfunc nb_inplace_multiply; __imul__ 0, //binaryfunc nb_inplace_multiply; __imul__
0, //binaryfunc nb_inplace_divide; __idiv__ (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_divide; __idiv__
0, //binaryfunc nb_inplace_remainder; __imod__ 0, //binaryfunc nb_inplace_remainder; __imod__
0, //ternaryfunc nb_inplace_power; __ipow__ 0, //ternaryfunc nb_inplace_power; __ipow__
0, //binaryfunc nb_inplace_lshift; __ilshift__ 0, //binaryfunc nb_inplace_lshift; __ilshift__
......
...@@ -15,7 +15,7 @@ def test_host_to_device(): ...@@ -15,7 +15,7 @@ def test_host_to_device():
c = numpy.asarray(b) c = numpy.asarray(b)
assert numpy.all(a == c) assert numpy.all(a == c)
def test_add(): def test_add_iadd_idiv():
for shape in ((), (0,), (3,), (2,3), (1,10000000),(10,1000000), (100,100000), for shape in ((), (0,), (3,), (2,3), (1,10000000),(10,1000000), (100,100000),
(1000,10000),(10000,1000), (1000,10000),(10000,1000),
(4100,33,34),(33,4100,34),(33,34,4100), (4100,33,34),(33,4100,34),(33,34,4100),
...@@ -51,6 +51,11 @@ def test_add(): ...@@ -51,6 +51,11 @@ def test_add():
assert numpy.allclose(a0, numpy.asarray(b0)) assert numpy.allclose(a0, numpy.asarray(b0))
assert numpy.allclose(a0,a1*2) assert numpy.allclose(a0,a1*2)
b0 /= b1
a0 /= a1
assert numpy.allclose(a0, numpy.asarray(b0))
assert numpy.allclose(a0,numpy.ones(a0.shape)*2)
if len(shape)==2: if len(shape)==2:
#test not contiguous version. #test not contiguous version.
#should raise not implemented. #should raise not implemented.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论