提交 b9b760e8 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Make GpuGemm work on all kinds of memory layouts

It should now support matrices with negative strides, and non-unit strides in general, by making a copy in the appropriate cases.
上级 6c6b9e7a
......@@ -191,7 +191,7 @@ class GpuGemm(Op):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return (3,)
return (4,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -199,6 +199,7 @@ class GpuGemm(Op):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
......@@ -214,39 +215,50 @@ class GpuGemm(Op):
: (REAL)(((double*)%(b)s->data)[0]);
#undef REAL
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
}
else if (%(z_out)s
&& (%(z_out)s->nd == 2)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
== CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
== CudaNdarray_HOST_DIMS(%(z_in)s)[1])
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{
%(fail)s;
......
......@@ -2882,9 +2882,21 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (B->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (C->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (A->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg A_ to gemm");
return -1;
}
if (B->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg B_ to gemm");
return -1;
}
if (C->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
return -1;
}
// We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
......@@ -2901,14 +2913,72 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
if (((CudaNdarray_HOST_DIMS(B)[0] > 1) && (CudaNdarray_HOST_STRIDES(B)[0] != 1)) && ((CudaNdarray_HOST_DIMS(B)[1] > 1) && (CudaNdarray_HOST_STRIDES(B)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
if (((CudaNdarray_HOST_DIMS(C)[0] > 1) && (CudaNdarray_HOST_STRIDES(C)[0] != 1)) && ((CudaNdarray_HOST_DIMS(C)[1] > 1) && (CudaNdarray_HOST_STRIDES(C)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
// If matrix A_ or B_ has non-unit size and non-unit stride in both
// dimensions, we can make a copy.
if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] != 1)
&& (CudaNdarray_HOST_DIMS(A)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
if (!A_new)
return -1;
A = A_new;
}
else
{
// In the case above, we will need to decref A_new at the end.
// To make things simpler, we incref A here, so we can always
// decref A.
Py_INCREF(A);
}
if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[0] != 1)
&& (CudaNdarray_HOST_DIMS(B)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0))
{
const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
if (!B_new)
{
Py_XDECREF(A);
return -1;
}
B = B_new;
}
else
{
// In the case above, we will need to decref B_new at the end.
// To make things simpler, we incref B here, so we can always
// decref B.
Py_INCREF(B);
}
// If matrix C has non-unit size and non-unit stride in both
// dimensions, or negative strides, we can't operate. We cannot copy
// C either, because the calling code will expect the result to be
// in the original C container.
if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[0] != 1)
&& (CudaNdarray_HOST_DIMS(C)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_AssertionError,
"non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1],
CudaNdarray_HOST_DIMS(C)[0],
CudaNdarray_HOST_DIMS(C)[1]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
// the unit integer is divided logically into three fields of 4 bits
// the lowermost 4 bits encode the stride pattern of the output
......@@ -2943,24 +3013,6 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
unit |= (0x2 << 0);
}
// I don't know if cudablas handles negative strides
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(B)[1],
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1]);
return -1;
}
/* create appropriate strides for malformed matrices that are row or column
* vectors
*/
......@@ -2987,6 +3039,8 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
} else { \
PyErr_SetString(PyExc_NotImplementedError, "negative stride to sGemm");\
Py_XDECREF(A);\
Py_XDECREF(B);\
return -1; \
}
......@@ -3004,6 +3058,8 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
};
CNDA_THREAD_SYNC;
Py_XDECREF(A);
Py_XDECREF(B);
cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err)
{
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论