提交 61ba61d5 authored 作者: Vivek Kulkarni's avatar Vivek Kulkarni

Check in works

上级 346f651f
...@@ -2458,18 +2458,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2458,18 +2458,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
inplace = int(self.inplace) inplace = int(self.inplace)
return """ return """
PyObject *x_obj, *y_obj, *row_x, *row_y;
PyObject *x_rowind_obj, *y_rowind_obj;
dtype_%(ind)s *p_index;
int num_indices, j;
int ret;
num_indices = PyArray_SIZE(%(ind)s);
if ((num_indices - 1) > LONG_MAX) {
PyErr_Format(PyExc_AssertionError, "num_indices %%d exceeds LONG_MAX + 1", num_indices);
%(fail)s;
}
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
if (!%(inplace)s) { if (!%(inplace)s) {
%(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s); %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
...@@ -2478,59 +2466,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2478,59 +2466,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
Py_XINCREF(%(out)s); Py_XINCREF(%(out)s);
} }
x_obj = (PyObject*)CudaNdarray_View(%(out)s); CudaNdarray_vector_add_fast(%(x)s, %(y)s, %(ind)s);
y_obj = (PyObject*)CudaNdarray_View(%(y)s);
for (j = 0;j < num_indices; j++) {
p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
x_rowind_obj = PyInt_FromLong(*p_index);
if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
PyErr_Format(PyExc_AssertionError, "Error in converting row index to integer from long");
// Dec Ref what ever we have increfed or allocated so far
// We deallocate objects exactly in the reverse order they were allocated.
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
y_rowind_obj = PyInt_FromLong(j);
row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
if ((row_x == NULL) || (row_y == NULL)) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
if (ret != 0) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
}
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
if (!%(out)s) { if (!%(out)s) {
%(fail)s %(fail)s
......
...@@ -1326,6 +1326,28 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other) ...@@ -1326,6 +1326,28 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other)
return (PyObject *) rval; return (PyObject *) rval;
} }
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
int stridesX1,
float *X,
int numRowsY,
int numColsY,
int stridesY0,
int stridesY1,
float *Y ,
long *d_indices_arr,
int num)
{
int i = (blockIdx.x);
int j = (threadIdx.x);
int x_row = d_indices_arr[i];
int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
return;
}
template <int operator_num> template <int operator_num>
__global__ void k_ielem_3(const int d0, const int d1, const int d2, __global__ void k_ielem_3(const int d0, const int d1, const int d2,
float* a, const int sA0, const int sA1, const int sA2, float* a, const int sA0, const int sA1, const int sA2,
...@@ -1776,6 +1798,46 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1776,6 +1798,46 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
return 0; return 0;
} }
void CudaNdarray_vector_add_fast(CudaNdarray* py_self, CudaNdarray* py_other, PyArrayObject *indices_arr)
{
const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self);
const int *strY = CudaNdarray_HOST_STRIDES(py_other);
unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
unsigned int num_threads_per_block = shapeY[1];
unsigned int num_blocks = size;
dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block);
static long *d_indices_arr = NULL;
if (!d_indices_arr)
{
d_indices_arr = (long *)device_malloc(sizeof(long) * PyArray_SIZE(indices_arr));
}
assert(d_indices_arr);
cudaError_t err = cudaMemcpy(d_indices_arr,
PyArray_DATA(indices_arr) ,
sizeof(long) * PyArray_SIZE(indices_arr),
cudaMemcpyHostToDevice);
k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
shapeX[1],
strX[0],
strX[1],
CudaNdarray_DEV_DATA(py_self),
shapeY[0],
shapeY[1],
strY[0],
strY[1],
CudaNdarray_DEV_DATA(py_other),
d_indices_arr,
PyArray_SIZE(indices_arr)
);
return;
}
/* /*
* We need this inplace Add to support IncSubTensor * We need this inplace Add to support IncSubTensor
* It returns py_self on success with an additional reference. Else NULL. * It returns py_self on success with an additional reference. Else NULL.
......
...@@ -490,7 +490,7 @@ DllExport PyObject * CudaNdarray_View(const CudaNdarray * self); ...@@ -490,7 +490,7 @@ DllExport PyObject * CudaNdarray_View(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other); DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key); DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb); DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);
DllExport void CudaNdarray_vector_add_fast(CudaNdarray* py_self, CudaNdarray *py_other, PyArrayObject *indices_arr);
// Ensures that *arr is a pointer to a contiguous ndarray of the specified // Ensures that *arr is a pointer to a contiguous ndarray of the specified
// dimensions. // dimensions.
......
...@@ -1353,6 +1353,19 @@ def local_gpualloc_memset_0(node): ...@@ -1353,6 +1353,19 @@ def local_gpualloc_memset_0(node):
new_out = GpuAlloc(memset_0=True)(*node.inputs) new_out = GpuAlloc(memset_0=True)(*node.inputs)
return [new_out] return [new_out]
@register_opt()
@local_optimizer([None])
def my_optimizer(node):
if isinstance(node.op, tensor.basic.AdvancedIncSubtensor1) and not node.op.inplace and str(node.op) == "GpuAdvancedIncSubtensor1{no_inplace,inc}":
x, y, z = node.inputs
if (str(x) == "GpuAlloc{memset_0=True}.0"):
new_x = GpuAlloc(memset_0=True)(*x.owner.inputs)
new_op = node.op.__class__(inplace=True, set_instead_of_inc = node.op.set_instead_of_inc)
new_node = new_op(new_x, y, z)
return [new_node]
return False
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论