提交 346f651f authored 作者: nouiz's avatar nouiz

Merge pull request #1309 from viveksck/RebasedBranch

REBASED: C code for GpuAdvancedIncSubtensor1. Handles one case to make it faster
...@@ -2442,6 +2442,100 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2442,6 +2442,100 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
x[i] += y x[i] += y
out[0] = x out[0] = x
def c_code_cache_version(self):
    """Return the cache version of the C code generated by :meth:`c_code`.

    Theano keys its compiled-module cache on this tuple; it MUST be bumped
    whenever the C text emitted by ``c_code`` changes, otherwise stale
    compiled modules are silently reused.
    """
    # (2,): NULL checks on the output copy and on the CudaNdarray_View
    # results were added/reordered in the generated C.
    return (2,)
def c_code(self, node, name, inputs, outputs, sub):
    """Return C code computing ``out = x; out[ind] += y`` row by row.

    Only the increment case with matching ``x``/``y`` ndim is implemented;
    ``set_instead_of_inc`` and broadcasted-``y`` cases raise
    NotImplementedError so Theano falls back to the Python implementation.

    :param inputs: C variable names ``(x, y, ind)``.
    :param outputs: C variable name ``(out,)``.
    :param sub: dict providing the ``fail`` snippet.
    :raises NotImplementedError: for the unsupported cases above.
    """
    if (self.set_instead_of_inc) or \
            (node.inputs[0].ndim != node.inputs[1].ndim):
        raise NotImplementedError("This case does not have C code yet.")
    x = inputs[0]
    y = inputs[1]
    ind = inputs[2]
    out = outputs[0]
    fail = sub['fail']
    inplace = int(self.inplace)
    # NOTE(review): any edit to the C text below must be paired with a bump
    # of c_code_cache_version(), or stale compiled modules will be reused.
    return """
    PyObject *x_obj, *y_obj, *row_x, *row_y;
    PyObject *x_rowind_obj, *y_rowind_obj;
    dtype_%(ind)s *p_index;
    int num_indices, j;
    int ret;
    num_indices = PyArray_SIZE(%(ind)s);
    if ((num_indices - 1) > LONG_MAX) {
        PyErr_Format(PyExc_AssertionError, "num_indices %%d exceeds LONG_MAX + 1", num_indices);
        %(fail)s;
    }
    Py_XDECREF(%(out)s);
    if (!%(inplace)s) {
        %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
    } else {
        %(out)s = %(x)s;
        Py_XINCREF(%(out)s);
    }
    // BUG FIX: test for a failed copy *before* dereferencing %(out)s.
    // The original code only checked it at the very end, after it had
    // already been passed to CudaNdarray_View below.
    if (!%(out)s) {
        %(fail)s;
    }
    x_obj = (PyObject*)CudaNdarray_View(%(out)s);
    y_obj = (PyObject*)CudaNdarray_View(%(y)s);
    // BUG FIX: the views can fail; do not enter the loop holding NULLs.
    if ((x_obj == NULL) || (y_obj == NULL)) {
        Py_XDECREF(y_obj);
        Py_XDECREF(x_obj);
        %(fail)s;
    }
    for (j = 0;j < num_indices; j++) {
        p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
        x_rowind_obj = PyInt_FromLong(*p_index);
        if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
            PyErr_Format(PyExc_AssertionError, "Error in converting row index to integer from long");
            // Dec Ref what ever we have increfed or allocated so far
            // We deallocate objects exactly in the reverse order they were allocated.
            Py_XDECREF(x_rowind_obj);
            Py_XDECREF(y_obj);
            Py_XDECREF(x_obj);
            %(fail)s;
        }
        y_rowind_obj = PyInt_FromLong(j);
        row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
        row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
        if ((row_x == NULL) || (row_y == NULL)) {
            Py_XDECREF(row_y);
            Py_XDECREF(row_x);
            Py_XDECREF(y_rowind_obj);
            Py_XDECREF(x_rowind_obj);
            Py_XDECREF(y_obj);
            Py_XDECREF(x_obj);
            %(fail)s;
        }
        ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
        if (ret != 0) {
            Py_XDECREF(row_y);
            Py_XDECREF(row_x);
            Py_XDECREF(y_rowind_obj);
            Py_XDECREF(x_rowind_obj);
            Py_XDECREF(y_obj);
            Py_XDECREF(x_obj);
            %(fail)s;
        }
        Py_XDECREF(row_y);
        Py_XDECREF(row_x);
        Py_XDECREF(y_rowind_obj);
        Py_XDECREF(x_rowind_obj);
    }
    Py_XDECREF(y_obj);
    Py_XDECREF(x_obj);
    """ % locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
......
...@@ -746,15 +746,6 @@ PyObject * CudaNdarray_View(const CudaNdarray * self) ...@@ -746,15 +746,6 @@ PyObject * CudaNdarray_View(const CudaNdarray * self)
return (PyObject*)rval; return (PyObject*)rval;
} }
// Selector for the element-wise in-place helpers (e.g.
// CudaNdarray_inplace_elemwise takes one of these as its last argument).
enum operator_t
{
IADD=0, // in-place addition: lhs += rhs
IDIV, // in-place division: lhs /= rhs
CPY, // copy: lhs = rhs
N_ELEMWISE_OPS // This is to know the number of operation
};
/* /*
* d0,... are the output dims * d0,... are the output dims
* indices are a list of index to operate on * indices are a list of index to operate on
...@@ -1889,7 +1880,7 @@ CudaNdarray_len(PyObject * py_self) ...@@ -1889,7 +1880,7 @@ CudaNdarray_len(PyObject * py_self)
} }
// Will be called by __getitem__ in Python
static PyObject * PyObject *
CudaNdarray_Subscript(PyObject * py_self, PyObject * key) CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
{ {
int verbose = 0; int verbose = 0;
......
...@@ -95,6 +95,15 @@ struct CudaNdarray ...@@ -95,6 +95,15 @@ struct CudaNdarray
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
}; };
// Selector for the element-wise in-place helpers exported by this header
// (passed as the fct_nb argument of CudaNdarray_inplace_elemwise).
enum operator_t
{
IADD=0, // in-place addition: lhs += rhs
IDIV, // in-place division: lhs /= rhs
CPY, // copy: lhs = rhs
N_ELEMWISE_OPS // This is to know the number of operation
};
/* /*
* Return a CudaNdarray whose 'nd' dimensions are all 0. * Return a CudaNdarray whose 'nd' dimensions are all 0.
* if nd==-1, it is not initialized. * if nd==-1, it is not initialized.
...@@ -479,7 +488,8 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); ...@@ -479,7 +488,8 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
DllExport PyObject * CudaNdarray_View(const CudaNdarray * self); DllExport PyObject * CudaNdarray_View(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other); DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);
// Ensures that *arr is a pointer to a contiguous ndarray of the specified // Ensures that *arr is a pointer to a contiguous ndarray of the specified
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论