提交 2faeb62c authored 作者: lamblin's avatar lamblin

Merge pull request #1437 from nouiz/gpu_iadd

Gpu iadd for 6d tensor
......@@ -1389,6 +1389,45 @@ __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3
}
}
template <int operator_num>
__global__ void k_ielem_6(const int d0, const int d1,
const int d2, const int d3,
const int d4, const int d5,
float* a, const int sA0, const int sA1,
const int sA2, const int sA3,
const int sA4, const int sA5,
const float* b, const int sB0, const int sB1,
const int sB2, const int sB3,
const int sB4, const int sB5
){
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
switch (operator_num) {
case IADD:
a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
+= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
break;
case IDIV:
a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
/= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
break;
case CPY:
a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
break;
}
}
}
}
}
}
}
}
/*
CudaNdarray_inplace_elemwise
Compute elemwise, working inplace on A.
......@@ -1415,19 +1454,31 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
const int, const int,
const float*, const int, const int,
const int, const int);
void (*k6)(const int, const int,
const int, const int,
const int, const int,
float*, const int, const int,
const int, const int,
const int, const int,
const float*, const int, const int,
const int, const int,
const int, const int);
switch (fct_nb)
{
case IADD:
k3 = k_ielem_3<IADD>;
k4 = k_ielem_4<IADD>;
k6 = k_ielem_6<IADD>;
break;
case IDIV:
k3 = k_ielem_3<IDIV>;
k4 = k_ielem_4<IDIV>;
k6 = k_ielem_6<IDIV>;
break;
case CPY:
k3 = k_ielem_3<CPY>;
k4 = k_ielem_4<CPY>;
k6 = k_ielem_6<CPY>;
break;
default:
assert (0);
......@@ -1769,6 +1820,61 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
}
}
break;
case 6:
{
dim3 n_blocks(
std::min(
CudaNdarray_HOST_DIMS(self)[0],
NUM_VECTOR_OP_BLOCKS),
CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2]
);
while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
n_blocks.y /= 2;
while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
n_blocks.z /= 2;
dim3 n_threads(
std::min(
CudaNdarray_HOST_DIMS(self)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK)
//TODO: DON"T YOU NEED OT PUT DIMS[4] in here???
//TODO: DON"T YOU NEED OT PUT DIMS[5] in here???
);
k6<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2],
CudaNdarray_HOST_DIMS(self)[3],
CudaNdarray_HOST_DIMS(self)[4],
CudaNdarray_HOST_DIMS(self)[5],
CudaNdarray_DEV_DATA(self),
CudaNdarray_HOST_STRIDES(self)[0],
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3],
CudaNdarray_HOST_STRIDES(self)[4],
CudaNdarray_HOST_STRIDES(self)[5],
CudaNdarray_DEV_DATA(other),
other_strides[0],
other_strides[1],
other_strides[2],
other_strides[3],
other_strides[4],
other_strides[5]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s: %s.\n",
"k4",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}
break;
default:
{
PyErr_Format(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论