Now also works for vector, tensor3 and tensor4 inputs and for more than 2 inputs; also corrects memory bugs

上级 403ab069
...@@ -2942,181 +2942,116 @@ class GpuJoin(tensor.Join, GpuOp): ...@@ -2942,181 +2942,116 @@ class GpuJoin(tensor.Join, GpuOp):
out[0] = rval out[0] = rval
def c_code(self, node, name, inputs, out_, sub): def c_code(self, node, name, inputs, out_, sub):
if node.inputs[0].data not in [0, 1]:
raise NotImplementedError()
# only works for the first two axis
if len(inputs) != 3:
# only works for two arrays
raise NotImplementedError()
if any([i.ndim != 2 for i in node.inputs[1:]]):
# only works for type T.matrix
raise NotImplementedError()
axis = inputs[0] axis = inputs[0]
n_cndas = len(inputs[1:])
input_1 = inputs[1] input_1 = inputs[1]
input_2 = inputs[2]
axis = inputs[0] axis = inputs[0]
fail = sub['fail'] fail = sub['fail']
out = out_[0] out = out_[0]
# getting the shapes of all the involved tensors (input[0]+out)
str = """ str = """
int axis = PyInt_AsLong((PyObject*)%(axis)s); int axis = PyInt_AsLong((PyObject*)%(axis)s);
int nd = CudaNdarray_NDIM(%(input_1)s); int nd = CudaNdarray_NDIM(%(input_1)s);
int shape_%(input_1)s[nd];
int shape_out[nd];
int dims_array1[nd]; for(int i = 0; i<nd; i+=1)
int errorcode; {
for(int i = 0; i<nd; i+=1){ shape_%(input_1)s[i] = CudaNdarray_HOST_DIMS(%(input_1)s)[i];
dims_array1[i] = CudaNdarray_HOST_DIMS(%(input_1)s)[i]; shape_out[i] = shape_%(input_1)s[i];
}
nd = CudaNdarray_NDIM(%(input_2)s);
int dims_array2[nd];
for(int i = 0; i<nd; i+=1){
dims_array2[i] = CudaNdarray_HOST_DIMS(%(input_2)s)[i];
} }
""" % locals()
int dims_out[nd]; # getting the shapes of all the involved tensors (input[1:])
if(axis==0) # + check: all input tensors have same shape as final out
# except for the "axis" dimension
for i, cdna in enumerate(inputs[2:]):
str += """
nd = CudaNdarray_NDIM(%(cdna)s);
int shape_%(cdna)s[nd];
for(int i = 0; i<nd; i+=1)
{ {
dims_out[0] = dims_array1[0] + dims_array2[0]; shape_%(cdna)s[i] = CudaNdarray_HOST_DIMS(%(cdna)s)[i];
dims_out[1] = dims_array1[1]; if((i!=axis) && (shape_%(cdna)s[i]!=shape_out[i]))
}
if(axis==1)
{ {
dims_out[0] = dims_array1[0]; //(fail)s; //deactivated, because this causes segfault
dims_out[1] = dims_array1[1] + dims_array2[1];
} }
if (CudaNdarray_prep_output(& %(out)s, 2, dims_out))
{
%(fail)s;
} }
""" % locals()
PyObject *slice; # computing the new shape for the out tensors
PyObject *out_sub; str += """
PyObject *start, *stop, *step; int width_sum = 0;\n""" % locals()
step = NULL;
if(axis==0)
{
start = PyInt_FromLong(0);
stop = PyInt_FromLong(dims_array1[0]);
slice = PySlice_New(start, stop, step);
out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice);
errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_1)s);
if((slice == NULL) || (out_sub == NULL) || (errorcode != 0))
{
Py_XDECREF(slice);
Py_XDECREF(out_sub);
Py_XDECREF(start);
Py_XDECREF(stop);
Py_XDECREF(step);
Py_XDECREF(%(out)s);
%(fail)s;
}
Py_XDECREF(start);
Py_XDECREF(slice);
Py_XDECREF(out_sub);
start = stop; for i, cdna in enumerate(inputs[1:]):
stop = PyInt_FromLong(PyInt_AsLong(start) + dims_array2[0]); str += "\t\twidth_sum += CudaNdarray_HOST_DIMS(%(cdna)s)[axis];\n" % locals()
slice = PySlice_New(start, stop, step); str += "\t\tshape_out[axis] = width_sum;\n"
out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice);
errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_2)s); str += """
if((slice == NULL) || (out_sub == NULL) || (errorcode != 0)) if (CudaNdarray_prep_output(&%(out)s, nd, shape_out))
{ {
Py_XDECREF(slice);
Py_XDECREF(out_sub);
Py_XDECREF(start);
Py_XDECREF(stop);
Py_XDECREF(step);
Py_XDECREF(%(out)s);
%(fail)s; %(fail)s;
} }
Py_XDECREF(slice); PyObject *out_sub;
Py_XDECREF(out_sub); PyObject *start, *stop, *step;
Py_XDECREF(start); step = NULL;
Py_XDECREF(stop); int errorcode;
Py_XDECREF(step); int sum;
} sum =0;
if(axis==1)
{
PyObject *slice_tuple; PyObject *slice_tuple;
PyObject *full_slice; PyObject *full_slice;
PyObject *section_slice; PyObject *section_slice;
PyObject *start_axis2, *stop_axis2;
start = PyInt_FromLong(0); """ % locals()
stop = PyInt_FromLong(dims_out[0]);
stop_axis2 = PyInt_FromLong(dims_array1[1]);
slice_tuple = PyTuple_New(2);
full_slice = PySlice_New(start, stop, step);
section_slice = PySlice_New(start, stop_axis2, step);
PyTuple_SetItem(slice_tuple, 0, full_slice);
PyTuple_SetItem(slice_tuple, 1, section_slice);
out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice_tuple); # start copying the data into the new out tensors
errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_1)s); for i, cdna in enumerate(inputs[1:]):
if((full_slice == NULL) || (section_slice == NULL) || (out_sub == NULL) || (errorcode != 0)) str += """
sum += shape_%(cdna)s[axis];
stop = PyInt_FromLong(sum);
slice_tuple = PyTuple_New(nd);
full_slice = PySlice_New(NULL, NULL, NULL);
section_slice = PySlice_New(start, stop, step);
for(int i=0; i<nd; i++)
{ {
Py_XDECREF(full_slice); if(i!=axis)
Py_XDECREF(section_slice); {
Py_XDECREF(slice_tuple); Py_INCREF(full_slice);
Py_XDECREF(out_sub); PyTuple_SetItem(slice_tuple, i, full_slice);
Py_XDECREF(start); }
Py_XDECREF(stop); else if(i==axis)
Py_XDECREF(step); {
Py_XDECREF(start_axis2); Py_INCREF(section_slice);
Py_XDECREF(stop_axis2); PyTuple_SetItem(slice_tuple, i, section_slice);
Py_XDECREF(%(out)s); }
%(fail)s;
} }
Py_XDECREF(stop);
Py_XDECREF(full_slice);
Py_XDECREF(section_slice);
Py_XDECREF(out_sub);
Py_XDECREF(slice_tuple);
start_axis2 = stop_axis2;
stop = PyInt_FromLong(dims_out[0]);
stop_axis2 = PyInt_FromLong(dims_array2[1] + dims_array1[1]);
slice_tuple = PyTuple_New(2);
full_slice = PySlice_New(start, stop, step);
section_slice = PySlice_New(start_axis2, stop_axis2, step);
PyTuple_SetItem(slice_tuple, 0, full_slice);
PyTuple_SetItem(slice_tuple, 1, section_slice);
out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice_tuple); out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice_tuple);
errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_2)s); errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(cdna)s);
if((full_slice == NULL) || (section_slice == NULL) || (out_sub == NULL) || (errorcode != 0)) if((full_slice == NULL) || (section_slice == NULL) || (out_sub == NULL) || (errorcode != 0))
{ {
Py_XDECREF(full_slice); Py_XDECREF(full_slice);
Py_XDECREF(section_slice); Py_XDECREF(section_slice);
Py_XDECREF(slice_tuple); Py_XDECREF(slice_tuple);
Py_XDECREF(out_sub); Py_XDECREF(out_sub);
Py_XDECREF(start);
Py_XDECREF(stop);
Py_XDECREF(step);
Py_XDECREF(start_axis2);
Py_XDECREF(stop_axis2);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(fail)s; %(fail)s;
} }
Py_XDECREF(full_slice); Py_XDECREF(full_slice);
Py_XDECREF(section_slice); Py_XDECREF(section_slice);
Py_XDECREF(slice_tuple);
Py_XDECREF(out_sub); Py_XDECREF(out_sub);
Py_XDECREF(slice_tuple);
start = stop;
""" % locals()
str+="""
Py_XDECREF(start); Py_XDECREF(start);
Py_XDECREF(stop); Py_XDECREF(stop);
Py_XDECREF(step); Py_XDECREF(step);"""
Py_XDECREF(start_axis2);
Py_XDECREF(stop_axis2);
}
"""% locals()
return str return str
gpu_join = GpuJoin() gpu_join = GpuJoin()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论