Commit 10d9a034 authored by Frederic

Better indentation

Parent 09f4c338
@@ -3048,71 +3048,71 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
                                CudaNdarray* py_other,
                                PyArrayObject *indices_arr)
{
-    if(init_err_var()!= 0) return -1;
-    const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
-    const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
-    const int *strX = CudaNdarray_HOST_STRIDES(py_self);
-    const int *strY = CudaNdarray_HOST_STRIDES(py_other);
-    unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
-    if(size == 0){
-        return 0;
-    }
-    unsigned int numcolsX = shapeX[1];
-    unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
-    unsigned int num_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
-    dim3 n_blocks(num_blocks);
-    dim3 n_threads(num_threads_per_block);
-    long *d_indices_arr = NULL;
-    PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
-    d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
-    if(!d_indices_arr)
-        return -1;
-    cudaError_t err = cudaMemcpy(d_indices_arr,
-                                 PyArray_DATA(cpu_indices_arr),
-                                 PyArray_NBYTES(cpu_indices_arr),
-                                 cudaMemcpyHostToDevice);
-    if(err != cudaSuccess){
-        PyErr_Format(
-            PyExc_RuntimeError,
-            "GpuAdvancedIncSubtensor1_dev20: cudaMemcpy returned an error: %%s",
-            cudaGetErrorString(err));
-        return -1;
-    }
-    k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
-                                               shapeX[1],
-                                               strX[0],
-                                               strX[1],
-                                               CudaNdarray_DEV_DATA(py_self),
-                                               shapeY[0],
-                                               shapeY[1],
-                                               strY[0],
-                                               strY[1],
-                                               CudaNdarray_DEV_DATA(py_other),
-                                               d_indices_arr,
-                                               PyArray_SIZE(indices_arr),
-                                               err_var);
-    int index_err = check_err_var();
-    device_free(d_indices_arr);
-    Py_XDECREF(cpu_indices_arr);
-    if(index_err != 0) return -1;
-    err = cudaGetLastError();
-    if(err != cudaSuccess){
-        PyErr_Format(
-            PyExc_RuntimeError,
-            "GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
-            cudaGetErrorString(err));
-        return -1;
-    }
+    if(init_err_var()!= 0) return -1;
+    const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
+    const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
+    const int *strX = CudaNdarray_HOST_STRIDES(py_self);
+    const int *strY = CudaNdarray_HOST_STRIDES(py_other);
+    unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
+    if(size == 0){
+        return 0;
+    }
+    unsigned int numcolsX = shapeX[1];
+    unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
+    unsigned int num_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
+    dim3 n_blocks(num_blocks);
+    dim3 n_threads(num_threads_per_block);
+    long *d_indices_arr = NULL;
+    PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
+    d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
+    if(!d_indices_arr)
+        return -1;
+    cudaError_t err = cudaMemcpy(d_indices_arr,
+                                 PyArray_DATA(cpu_indices_arr),
+                                 PyArray_NBYTES(cpu_indices_arr),
+                                 cudaMemcpyHostToDevice);
+    if(err != cudaSuccess){
+        PyErr_Format(
+            PyExc_RuntimeError,
+            "GpuAdvancedIncSubtensor1_dev20: cudaMemcpy returned an error: %%s",
+            cudaGetErrorString(err));
+        return -1;
+    }
+    k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
+                                               shapeX[1],
+                                               strX[0],
+                                               strX[1],
+                                               CudaNdarray_DEV_DATA(py_self),
+                                               shapeY[0],
+                                               shapeY[1],
+                                               strY[0],
+                                               strY[1],
+                                               CudaNdarray_DEV_DATA(py_other),
+                                               d_indices_arr,
+                                               PyArray_SIZE(indices_arr),
+                                               err_var);
+    int index_err = check_err_var();
+    device_free(d_indices_arr);
+    Py_XDECREF(cpu_indices_arr);
+    if(index_err != 0) return -1;
+    err = cudaGetLastError();
+    if(err != cudaSuccess){
+        PyErr_Format(
+            PyExc_RuntimeError,
+            "GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
+            cudaGetErrorString(err));
+        return -1;
+    }
    return 0;
}
""" % locals()