Commit f293a0f6 authored by Frédéric Bastien

Merge pull request #1469 from shackenberg/adding_GpuJoin.c_code

[WIP] Adding GpuJoin.c_code to cuda/basic_ops.py
@@ -2666,10 +2666,10 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
            int x_row = d_indices_arr[i];
            int y_row = i;
            atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
        }
    }
    return;
}

void CudaNdarray_vector_add_fast(CudaNdarray* py_self, CudaNdarray* py_other, PyArrayObject *indices_arr)
{
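For context, the kernel fragment above performs a scatter-add: each row of Y is accumulated into the row of X selected by an index vector, with atomicAdd resolving races when the same index appears more than once. A minimal NumPy sketch of the same semantics (CPU-side analogy only, not the CUDA implementation):

```python
import numpy as np

# Sketch of what the atomicAdd kernel computes:
# X[indices[i], :] += Y[i, :] for every row i, with repeated
# indices accumulating rather than overwriting.
X = np.zeros((5, 3), dtype=np.float32)
Y = np.ones((4, 3), dtype=np.float32)
indices = np.array([0, 2, 2, 4])

# np.add.at applies the addition unbuffered, so duplicate indices
# (like the two 2s here) each contribute, mirroring atomicAdd.
np.add.at(X, indices, Y)
assert (X[2] == 2).all()
```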
@@ -2945,6 +2945,135 @@ class GpuJoin(tensor.Join, GpuOp):
        out[0] = rval
    def c_code(self, node, name, inputs, out_, sub):
        nd = node.inputs[1].ndim
        if not all(i.ndim == nd for i in node.inputs[2:]):
            # all input ndarrays need to have the same number of dimensions
            raise NotImplementedError()
        axis = inputs[0]
        n_cndas = len(inputs[1:])
        input_1 = inputs[1]
        fail = sub['fail']
        out = out_[0]
        # getting the shapes of all the involved tensors (inputs[1] + out)
str = """
int axis = PyInt_AsLong((PyObject*)%(axis)s);
int nd = %(nd)s;
int shape_%(input_1)s[nd];
int shape_out[nd];
int width_sum = 0;
int errorcode;
int sum;
for(int i = 0; i<nd; i+=1)
{
shape_%(input_1)s[i] = CudaNdarray_HOST_DIMS(%(input_1)s)[i];
shape_out[i] = shape_%(input_1)s[i];
}
""" % locals()
        # getting the shapes of the remaining tensors (inputs[2:])
        # + check: all input tensors have the same shape as the final out,
        # except for the "axis" dimension
        # shape_%(cdna)s[nd] is declared beforehand, to prevent the following
        # error: jump to label __label_9 crosses initialization of
        # shape_%(cdna)s[nd]
        for i, cdna in enumerate(inputs[2:]):
            str += """
        int shape_%(cdna)s[nd];
        """ % locals()
        for i, cdna in enumerate(inputs[2:]):
            str += """
        for(int i = 0; i<nd; i+=1)
        {
            shape_%(cdna)s[i] = CudaNdarray_HOST_DIMS(%(cdna)s)[i];
            if((i!=axis) && (shape_%(cdna)s[i]!=shape_out[i]))
            {
                PyErr_Format(
                    PyExc_ValueError,
                    "GpuJoin: non-join dimension %%d of an input"
                    " does not match input 0.",
                    i);
                %(fail)s;
            }
        }
        """ % locals()
        # computing the new shape for the out tensor
        for i, cdna in enumerate(inputs[1:]):
            str += "\t\twidth_sum += CudaNdarray_HOST_DIMS(%(cdna)s)[axis];\n" % locals()
        str += "\t\tshape_out[axis] = width_sum;\n"

        # preparing the output array + init of the necessary variables
        # for the data transfer
        str += """
        if (CudaNdarray_prep_output(&%(out)s, nd, shape_out))
        {
            %(fail)s;
        }
        PyObject *slice_tuple;
        PyObject *section_slice;
        PyObject *full_slice;
        full_slice = PySlice_New(NULL, NULL, NULL);
        PyObject *out_sub;
        PyObject *start, *stop, *step;
        start = NULL;
        stop = NULL;
        step = NULL;
        sum = 0;
        """ % locals()
        # start copying the data into the new out tensor
        for i, cdna in enumerate(inputs[1:]):
            str += """
        sum += shape_%(cdna)s[axis];
        Py_XDECREF(stop);
        stop = PyInt_FromLong(sum);
        slice_tuple = PyTuple_New(nd);
        section_slice = PySlice_New(start, stop, step);
        for(int i=0; i<nd; i++)
        {
            if(i!=axis)
            {
                Py_INCREF(full_slice);
                PyTuple_SetItem(slice_tuple, i, full_slice);
            }
            else
            {
                Py_INCREF(section_slice);
                PyTuple_SetItem(slice_tuple, i, section_slice);
            }
        }
        out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice_tuple);
        errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(cdna)s);
        if((full_slice == NULL) || (section_slice == NULL) || (out_sub == NULL) || (errorcode != 0))
        {
            Py_XDECREF(start);
            Py_XDECREF(stop);
            Py_XDECREF(step);
            Py_XDECREF(slice_tuple);
            Py_XDECREF(out_sub);
            Py_XDECREF(%(out)s);
            %(fail)s;
        }
        Py_XDECREF(out_sub);
        Py_XDECREF(slice_tuple);
        Py_XDECREF(start);
        start = stop;
        """ % locals()
str+="""
Py_XDECREF(start);
Py_XDECREF(stop);
Py_XDECREF(step);"""
return str
def c_code_cache_version(self):
return (1,)
gpu_join = GpuJoin()
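To see the strategy the generated C implements, here is a rough NumPy sketch (an illustration, not the actual implementation; `join_sketch` is a hypothetical helper): allocate the output with the summed width along the join axis, then copy each input into the matching slice, which is what the PySlice_New / CudaNdarray_Subscript loop above does on the GPU.

```python
import numpy as np

def join_sketch(axis, *tensors):
    """NumPy sketch of the generated C code's slice-and-copy join."""
    nd = tensors[0].ndim
    shape_out = list(tensors[0].shape)
    # Check: every input matches the first one on all non-axis dims,
    # mirroring the shape_%(cdna)s / shape_out comparison above.
    for t in tensors[1:]:
        for i in range(nd):
            if i != axis and t.shape[i] != shape_out[i]:
                raise ValueError("wrong shape on dimension %d" % i)
    # width_sum: total extent along the join axis.
    shape_out[axis] = sum(t.shape[axis] for t in tensors)
    out = np.empty(shape_out, dtype=tensors[0].dtype)
    # Copy each input into out[..., start:stop, ...] along `axis`.
    start = 0
    for t in tensors:
        stop = start + t.shape[axis]
        index = [slice(None)] * nd
        index[axis] = slice(start, stop)
        out[tuple(index)] = t
        start = stop
    return out

a = np.ones((2, 3))
b = np.zeros((2, 2))
assert join_sketch(1, a, b).shape == (2, 5)
```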
...
@@ -3200,7 +3200,7 @@ class T_Join_and_Split(unittest.TestCase):
    def test_broadcastable_flags_many_dims_and_inputs(self):
        """
        Test that the right broadcastable flags get set for a join
        with many inputs and many input dimensions.
        """
        a = TensorType(dtype=self.floatX, broadcastable=[1, 0, 1, 0, 0, 0])()
...
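For reference, a hedged sketch of the flag rule this test exercises, assuming Theano's documented behavior for Join (the join axis is never broadcastable in the result; any other dimension is broadcastable only if it is broadcastable in every input). `expected_join_broadcastable` is a hypothetical helper, not part of the test suite:

```python
# Assumed rule (sketch, not the actual Join.make_node code):
# the join axis is never broadcastable; other dims are broadcastable
# only when all inputs are broadcastable there.
def expected_join_broadcastable(axis, *bcast_patterns):
    nd = len(bcast_patterns[0])
    return tuple(False if i == axis
                 else all(p[i] for p in bcast_patterns)
                 for i in range(nd))

# With patterns like those in the test above ([1, 0, 1, 0, 0, 0] etc.),
# joining on axis 1 keeps dims 0 and 2 broadcastable only if every
# input has them broadcastable.
assert expected_join_broadcastable(
    1,
    (True, False, True, False, False, False),
    (True, True, True, False, False, False),
) == (True, False, True, False, False, False)
```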