提交 1f6b0c73 authored 作者: nouiz's avatar nouiz

Merge pull request #437 from lamblin/blas_double_strides

Make blas functions work with all stride patterns
......@@ -192,7 +192,7 @@ class GpuGemm(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return (3,)
return (4,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -200,6 +200,7 @@ class GpuGemm(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
......@@ -215,39 +216,50 @@ class GpuGemm(GpuOp):
: (REAL)(((double*)%(b)s->data)[0]);
#undef REAL
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
}
else if (%(z_out)s
&& (%(z_out)s->nd == 2)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
== CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
== CudaNdarray_HOST_DIMS(%(z_in)s)[1])
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{
%(fail)s;
......@@ -294,7 +306,7 @@ class GpuGemv(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return (1,)
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -302,44 +314,46 @@ class GpuGemv(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
{
// Work inplace on the input
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
)
}
else if (%(z_out)s
&& ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
{
// Work on the output
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
{
%(fail)s;
......@@ -385,7 +399,7 @@ class GpuGer(GpuOp):
return Apply(self, [z, a, x, y], [z.type()])
def c_code_cache_version(self):
return (1,)
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -393,44 +407,57 @@ class GpuGer(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
}
else if (%(z_out)s
&& (%(z_out)s->nd == 2)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
== CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
== CudaNdarray_HOST_DIMS(%(z_in)s)[1])
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
{
%(fail)s;
......
......@@ -453,7 +453,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
PyObject * CudaNdarray_Copy(CudaNdarray * self)
PyObject * CudaNdarray_Copy(const CudaNdarray * self)
{
PyObject * rval = CudaNdarray_New();
if ((!rval) || (-1 == self->nd))
......@@ -1095,7 +1095,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
k3<<<n_blocks, n_threads>>>(
1, //d0
1, //d1
1, //d2
1, //d2
CudaNdarray_DEV_DATA(self),
1, //strides
1,
......@@ -2777,7 +2777,7 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
}
//copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast)
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast)
{
int verbose = 0;
if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
......@@ -2856,7 +2856,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
// call worker routine
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int threads_per_block = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
CudaNdarray * cuda_dims = other;
const CudaNdarray * cuda_dims = other;
if(unbroadcast)
cuda_dims = self;
//copy from other into self
......@@ -2882,9 +2882,21 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (B->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (C->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (A->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
return -1;
}
if (B->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
return -1;
}
if (C->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
return -1;
}
// We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
......@@ -2901,14 +2913,72 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
if (((CudaNdarray_HOST_DIMS(B)[0] > 1) && (CudaNdarray_HOST_STRIDES(B)[0] != 1)) && ((CudaNdarray_HOST_DIMS(B)[1] > 1) && (CudaNdarray_HOST_STRIDES(B)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
if (((CudaNdarray_HOST_DIMS(C)[0] > 1) && (CudaNdarray_HOST_STRIDES(C)[0] != 1)) && ((CudaNdarray_HOST_DIMS(C)[1] > 1) && (CudaNdarray_HOST_STRIDES(C)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
// If matrix A or B has non-unit size and non-unit stride in both
// dimensions, we can make a copy.
if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] != 1)
&& (CudaNdarray_HOST_DIMS(A)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
if (!A_new)
return -1;
A = A_new;
}
else
{
// In the case above, we will need to decref A_new at the end.
// To make things simpler, we incref A here, so we can always
// decref A.
Py_INCREF(A);
}
if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[0] != 1)
&& (CudaNdarray_HOST_DIMS(B)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0))
{
const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
if (!B_new)
{
Py_XDECREF(A);
return -1;
}
B = B_new;
}
else
{
// In the case above, we will need to decref B_new at the end.
// To make things simpler, we incref B here, so we can always
// decref B.
Py_INCREF(B);
}
// If matrix C has non-unit size and non-unit stride in both
// dimensions, or negative strides, we can't operate. We cannot copy
// C either, because the calling code will expect the result to be
// in the original C container.
if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[0] != 1)
&& (CudaNdarray_HOST_DIMS(C)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_AssertionError,
"non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1],
CudaNdarray_HOST_DIMS(C)[0],
CudaNdarray_HOST_DIMS(C)[1]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
// the unit integer is divided logically into three fields of 4 bits
// the lowermost 4 bits encode the stride pattern of the output
......@@ -2920,47 +2990,31 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
// a stride of 0 implies a dimension of 1 - so we can actually define
// a stride of 0 as a 'unit' stride because gemm will never use it.
// If a dimension is 0, its stride will not be used either, so we can
// consider it a 'unit' stride too.
int unit = 0;
if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_STRIDES(A)[1] == 0) {
if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
unit |= (0x0 << 8);
} else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_STRIDES(A)[0] == 0) {
} else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
unit |= (0x1 << 8);
} else {
unit |= (0x2 << 8);
}
if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_STRIDES(B)[1] == 0) {
if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
unit |= (0x0 << 4);
} else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_STRIDES(B)[0] == 0) {
} else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
unit |= (0x1 << 4);
} else {
unit |= (0x2 << 4);
}
if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_STRIDES(C)[1] == 0) {
if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
unit |= (0x0 << 0);
} else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_STRIDES(C)[0] == 0) {
} else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
unit |= (0x1 << 0);
} else {
unit |= (0x2 << 0);
}
// I don't know if cudablas handles negative strides
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(B)[1],
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1]);
return -1;
}
/* create appropriate strides for malformed matrices that are row or column
* vectors
*/
......@@ -2977,8 +3031,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
char N = 'N';
char T = 'T';
//std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
//TODO: recognize the negative stride and make a copy of the offending argument,
//rather than aborting
// There should be no negative stride at that point
#define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
if (sx == 0){sx = 1;}\
if (sy == 0){sy = 1;}\
......@@ -2986,7 +3039,9 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if ((sx > 0) && (sy > 0) && (sz > 0)) { \
cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
} else { \
PyErr_SetString(PyExc_NotImplementedError, "negative stride to sGemm");\
PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
Py_XDECREF(A);\
Py_XDECREF(B);\
return -1; \
}
......@@ -3000,14 +3055,19 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%i)", unit);
default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
return -1;
};
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
Py_XDECREF(A);
Py_XDECREF(B);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublassGemm failed (%s)",cudaGetErrorString(err));
PyErr_Format(PyExc_RuntimeError,
"cublasSgemm failed (%i)",
err);
return -1;
}
return 0;
......@@ -3037,22 +3097,53 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg"); return -1; }
// If matrix A has non-unit size and non-unit stride in both
// dimensions, or negative strides, we cannot operate, but we can
// make a copy.
if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] != 1)
&& (CudaNdarray_HOST_DIMS(A)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
if (!A_new)
return -1;
A = A_new;
}
else
{
// Incref A, so we can decref it at the end in all cases
Py_INCREF(A);
}
// If vector B as a negative stride, we also have to make a copy.
if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
{
const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
if (!B_new)
{
Py_XDECREF(A);
return -1;
}
B = B_new;
}
else
{
// Incref B, so we can decref it at the end in all cases
Py_INCREF(B);
}
// I don't know if cudablas handles negative strides
// cudablas does not handle negative strides as expected
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0))
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)",
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(C)[0]);
CudaNdarray_HOST_STRIDES(A)[1]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
......@@ -3064,42 +3155,71 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
if (sa_0 == 1)
{
cublasSgemv('N',
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha,
CudaNdarray_DEV_DATA(A), sa_1,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else if (sa_1 == 1)
{
cublasSgemv('T',
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
alpha,
CudaNdarray_DEV_DATA(A), sa_0,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else
{
PyErr_SetString(PyExc_NotImplementedError, "too many strides strides in sgemv");
return -1;
if (sa_0 == 0)
sa_0 = 1;
if (sa_1 == 0)
sa_1 = 1;
if (CudaNdarray_SIZE(C)) {
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{
cublasSgemv('N',
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha,
CudaNdarray_DEV_DATA(A), sa_1,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{
cublasSgemv('T',
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
alpha,
CudaNdarray_DEV_DATA(A), sa_0,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else
{
PyErr_Format(PyExc_AssertionError,
"Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
"Shapes are: (%i, %i) x %i -> %i\n",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1],
CudaNdarray_HOST_DIMS(B)[0],
CudaNdarray_HOST_DIMS(C)[0]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
}
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
Py_XDECREF(A);
Py_XDECREF(B);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublassGemv failed (%s)",cudaGetErrorString(err));
PyErr_Format(PyExc_RuntimeError,
"cublasSgemv failed (%i)",
err);
return -1;
}
return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
......@@ -3115,14 +3235,8 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
return -1;
}
// Maybe this could work, but be safe for now
if (!CudaNdarray_is_c_contiguous(A)) {
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1;
}
// Since Sger expects A in col-major, we invert x and y to fake this.
int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
CudaNdarray * x_ = x;
const CudaNdarray * x_ = x;
if(x_strides == 0){
if(CudaNdarray_HOST_DIMS(x)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
......@@ -3131,14 +3245,14 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!");
return -1;
}
x_strides = 4;
x_strides = 1;
} else if(x_strides < 0){
x_ = (CudaNdarray*)CudaNdarray_Copy(x);
x_strides = CudaNdarray_HOST_STRIDES(x_)[0];
}
int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
CudaNdarray * y_ = y;
const CudaNdarray * y_ = y;
if(y_strides == 0){
if(CudaNdarray_HOST_DIMS(y)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
......@@ -3147,17 +3261,46 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!");
return -1;
}
y_strides = 4;
y_strides = 1;
} else if(y_strides < 0){
y_ = (CudaNdarray*)CudaNdarray_Copy(y);
y_strides = CudaNdarray_HOST_STRIDES(y_)[0];
}
// Create appropriate strides if A is a row or column vector
int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
: CudaNdarray_HOST_DIMS(A)[1];
int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
: CudaNdarray_HOST_DIMS(A)[0];
if(CudaNdarray_SIZE(A)){
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
// If A is in col-major
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], alpha,
CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(A), sa_1);
}
// Since Sger expects A in col-major, we invert x and y to fake this.
else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(A), sa_0);
}
// A has to be either c- or f-contiguous, with no negative strides
else
{
PyErr_SetString(PyExc_NotImplementedError,
"non-contiguous A, or negative strides, in sger");
return -1;
}
}
CNDA_THREAD_SYNC;
if(x_ != x)
......@@ -3165,12 +3308,15 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
if(y_ != y)
Py_DECREF(y_);
cudaError_t err = cudaGetLastError();
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)",cudaGetErrorString(err));
PyErr_Format(PyExc_RuntimeError,
"cublasSger failed (%i)",
err);
return -1;
}
return 0;
}
......@@ -3816,7 +3962,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
int
cnda_copy_structure_to_device(CudaNdarray * self)
cnda_copy_structure_to_device(const CudaNdarray * self)
{
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
CNDA_THREAD_SYNC;
......@@ -3830,7 +3976,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
}
const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self)
CudaNdarray_DEV_DIMS(const CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
......@@ -3840,7 +3986,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
return self->dev_structure;
}
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
......@@ -3850,7 +3996,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
return self->dev_structure + self->nd;
}
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
......@@ -3991,4 +4137,4 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
......@@ -81,7 +81,7 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc)
int dev_structure_fresh;
mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
......@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
*
* This means: recalculate the log2dims and transfer structure to the card
*/
DllExport int cnda_copy_structure_to_device(CudaNdarray * self);
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
/**
......@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
return -1;
}
assert(size>0);
if (size < 0)
{
PyErr_Format(PyExc_AssertionError,
"size (%i) < 0",
size);
return -1;
}
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
if (size && !self->devdata)
{
CudaNdarray_set_nd(self,-1);
CudaNdarray_set_nd(self, -1);
self->data_allocated = 0;
self->devdata = 0;
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate memory on device");
return -1;
}
if (0)
......@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/**
* Return an independent copy of self
*/
DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self);
DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);
/**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
......@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
*
* self is reallocated to have the correct dimensions if necessary.
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false);
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
......@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
......@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
import itertools
from unittest import TestCase
from theano.compile.pfunc import pfunc
......@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestGer
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
......@@ -32,20 +33,31 @@ else:
def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def transpose(cuda_mat):
# The easiest way to transpose a cuda matrix for now
return tcn.dimshuffle(cuda_mat, [1, 0])
def test_dot22():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp)
f(bval)
assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -90,7 +102,8 @@ def test_dot22scalar():
def test_gemm():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix('b')
c = tensor.fmatrix('c')
......@@ -98,12 +111,19 @@ def test_gemm():
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp)
cval = my_rand(a_shp[0],b_shp[1])
f(bval,cval)
assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, cval)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -114,7 +134,8 @@ def test_gemm():
def test_gemm_no_inplace():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
cval = my_rand(a_shp[0], b_shp[1])
c = tcn.shared_constructor(cval.copy(), 'c')
......@@ -123,7 +144,6 @@ def test_gemm_no_inplace():
f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
bval = my_rand(*b_shp)
bval2 = my_rand(*b_shp)
......@@ -132,6 +152,13 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, bval2)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -139,6 +166,13 @@ def test_gemm_no_inplace():
cmp((0,4),(4,0))
cmp((0,0),(0,0))
class TestBlasStridesGpu(TestBlasStrides):
    """Run the stride-pattern BLAS tests on the GPU backend.

    Reuses every test from TestBlasStrides, only swapping in the GPU
    shared constructor, the GPU compilation mode, and float32 (the only
    dtype supported on the GPU).
    """
    mode = mode_with_gpu
    shared = staticmethod(tcn.shared_constructor)
    dtype = 'float32'
def test_outer():
x = tcn.shared_constructor(my_rand(8,), 'x')
y = tcn.shared_constructor(my_rand(6,), 'y')
......@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv = gpu_gemv_inplace
gemv_inplace = gpu_gemv_inplace
class TestGpuGemvNoTransfer(TestCase, BaseGemv,
                            unittest_tools.TestOptimizationMixin):
    """Gemv tests where the inputs are not necessarily moved to the GPU."""
    mode = mode_with_gpu
    dtype = 'float32'

    @staticmethod
    def shared(val):
        # Mimic shared constructors registry: prefer a GPU shared
        # variable, fall back to a regular one when the value is not
        # supported on the GPU.
        try:
            shared_var = tcn.shared_constructor(val)
        except TypeError:
            shared_var = theano.shared(val)
        return shared_var

    # Since inputs are not always transferred to the GPU in this test,
    # the non-inplace gemv may be selected.
    gemv = gpu_gemv_no_inplace
    gemv_inplace = gpu_gemv_inplace
class TestVectorMatrixDot(TestCase):
### Tolerance factor used in this tests
......@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase):
assert sum([node.op is gpu_gemv_inplace for node in
gpu_f2.maker.env.toposort() ]) == 1
# Check double-strided m
m.set_value(
m.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol)
def test_dot_mv(self):
''' Test matrix dot vector '''
v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32'))
......@@ -365,6 +424,26 @@ class TestGpuGer(TestGer):
self.ger = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGerNoTransfer(TestGer):
    """Ger tests where the inputs are not necessarily moved to the GPU."""

    @staticmethod
    def shared(val):
        # Prefer a GPU shared variable; fall back to a regular shared
        # variable when the value is not supported on the GPU.
        try:
            return tcn.shared_constructor(val)
        except TypeError:
            return theano.shared(val)

    def setUp(self):
        self.mode = mode_with_gpu
        # The optimization isn't dtype-dependent, so float32 is enough.
        self.dtype = 'float32'
        float_dtype = self.dtype
        self.A = tensor.tensor(dtype=float_dtype,
                               broadcastable=(False, False))
        self.a = tensor.tensor(dtype=float_dtype, broadcastable=())
        self.x = tensor.tensor(dtype=float_dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=float_dtype, broadcastable=(False,))
        # Data on the GPU makes the op always inplace.
        self.ger = gpu_ger_inplace
        self.ger_destructive = gpu_ger_inplace
        self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
def setUp(self):
......
......@@ -496,7 +496,9 @@ class GemmRelated(Op):
if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size)
|| ((Sx[0] != type_size) && (Sx[1] != type_size)))
{
PyArrayObject * _x_copy = PyArray_GETCONTIGUOUS(%(_x)s);
PyArrayObject * _x_copy = (PyArrayObject *) PyArray_Copy(%(_x)s);
if (!_x_copy)
%(fail)s
Py_XDECREF(%(_x)s);
%(_x)s = _x_copy;
Sx = %(_x)s->strides;
......@@ -505,7 +507,9 @@ class GemmRelated(Op):
if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size)
|| ((Sy[0] != type_size) && (Sy[1] != type_size)))
{
PyArrayObject * _y_copy = PyArray_GETCONTIGUOUS(%(_y)s);
PyArrayObject * _y_copy = (PyArrayObject *) PyArray_Copy(%(_y)s);
if (!_y_copy)
%(fail)s
Py_XDECREF(%(_y)s);
%(_y)s = _y_copy;
Sy = %(_y)s->strides;
......@@ -514,7 +518,9 @@ class GemmRelated(Op):
if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size)
|| ((Sz[0] != type_size) && (Sz[1] != type_size)))
{
PyArrayObject * _z_copy = PyArray_GETCONTIGUOUS(%(_zout)s);
PyArrayObject * _z_copy = (PyArrayObject *) PyArray_Copy(%(_zout)s);
if (!_z_copy)
%(fail)s
Py_XDECREF(%(_zout)s);
%(_zout)s = _z_copy;
Sz = %(_zout)s->strides;
......@@ -649,7 +655,7 @@ class GemmRelated(Op):
self.end_switch_typenum), '')
def build_gemm_version(self):
return (10,)
return (12,)
class Gemm(GemmRelated):
"""In-place version of matrix-matrix multiplication (with accumulation):
......
......@@ -58,9 +58,10 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
// copy A if !self.destructive or A is fully strided
if (!%(destructive)s
|| (%(A)s->strides[0] < 0)
|| (%(A)s->strides[1] < 0)
|| ((%(A)s->strides[0] != elemsize)
&&
(%(A)s->strides[1] != elemsize)))
&& (%(A)s->strides[1] != elemsize)))
{
npy_intp dims[2];
dims[0] = %(A)s->dimensions[0];
......@@ -68,7 +69,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
if ((NULL == %(Z)s)
|| (%(Z)s->dimensions[0] != %(A)s->dimensions[0])
|| (%(Z)s->dimensions[1] != %(A)s->dimensions[1]))
|| (%(Z)s->dimensions[1] != %(A)s->dimensions[1])
|| (%(Z)s->strides[0] < 0)
|| (%(Z)s->strides[1] < 0)
|| ((%(Z)s->strides[0] != elemsize)
&& (%(Z)s->strides[1] != elemsize)))
{
if (%(Z)s) Py_XDECREF(%(Z)s);
%(Z)s = (PyArrayObject*)PyArray_SimpleNew(2, dims, PyArray_TYPE(%(A)s));
......@@ -77,7 +82,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
%(fail)s
}
}
assert (%(Z)s != %(A)s);
if (%(Z)s == %(A)s)
{
PyErr_SetString(PyExc_AssertionError, "%(Z)s != %(A)s");
%(fail)s
}
if (%(Z)s->descr->type_num == PyArray_FLOAT)
{
float * zoutdata = (float*)%(Z)s->data;
......@@ -142,8 +151,16 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
int Sz0 = (Nz0 > 1) ? (%(Z)s->strides[0] / elemsize) : (Nz1 + 1);
int Sz1 = (Nz1 > 1) ? (%(Z)s->strides[1] / elemsize) : (Nz0 + 1);
if (1)
{
dtype_%(x)s* x_data = (dtype_%(x)s*) %(x)s->data;
dtype_%(y)s* y_data = (dtype_%(y)s*) %(y)s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sx < 0)
x_data += (Nz0 - 1) * Sx;
if (Sy < 0)
y_data += (Nz1 - 1) * Sy;
if (%(Z)s->strides[0] == elemsize)
{
if (%(Z)s->descr->type_num == PyArray_FLOAT)
......@@ -151,19 +168,22 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "A\\n");
float alpha = ((dtype_%(a)s*)%(a)s->data)[0];
sger_(&Nz0, &Nz1, &alpha,
(float*)(%(x)s->data), &Sx,
(float*)(%(y)s->data), &Sy,
(float*)x_data, &Sx,
(float*)y_data, &Sy,
(float*)(%(Z)s->data), &Sz1);
}
else if (%(Z)s->descr->type_num == PyArray_DOUBLE)
{
double alpha = ((dtype_%(a)s*)%(a)s->data)[0];
dger_(&Nz0, &Nz1, &alpha,
(double*)(%(x)s->data), &Sx,
(double*)(%(y)s->data), &Sy,
(double*)x_data, &Sx,
(double*)y_data, &Sy,
(double*)(%(Z)s->data), &Sz1);
}
else { assert(0); }
else {
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)s
}
}
else if (%(Z)s->strides[1] == elemsize)
{
......@@ -174,21 +194,30 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "alpha=%%f\\n", alpha);
//fprintf(stderr, "sx sy %%i %%i\\n", Sx, Sy);
sger_(&Nz1, &Nz0, &alpha,
(float*)(%(y)s->data), &Sy,
(float*)(%(x)s->data), &Sx,
(float*)y_data, &Sy,
(float*)x_data, &Sx,
(float*)(%(Z)s->data), &Sz0);
}
else if (%(Z)s->descr->type_num == PyArray_DOUBLE)
{
double alpha = ((dtype_%(a)s*)%(a)s->data)[0];
dger_(&Nz1, &Nz0, &alpha,
(double*)(%(y)s->data), &Sy,
(double*)(%(x)s->data), &Sx,
(double*)y_data, &Sy,
(double*)x_data, &Sx,
(double*)(%(Z)s->data), &Sz0);
}
else { assert(0); }
else
{
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)s
}
}
else { assert(0); }
else
{
PyErr_SetString(PyExc_AssertionError,
"A is a double-strided matrix, and should have been copied "
"into a memory-contiguous one.");
%(fail)s
}
}
......@@ -204,7 +233,7 @@ class CGer(BaseBLAS, Ger):
return code
def c_code_cache_version(self):
return (4,)
return (8,)
@local_optimizer([ger, ger_destructive])
......@@ -282,7 +311,11 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
%(fail)s
}
}
assert (%(zz)s != %(aa)s);
if (%(zz)s == %(aa)s)
{
PyErr_SetString(PyExc_AssertionError, "%(zz)s != %(aa)s");
%(fail)s
}
if (dbeta != 0)
{
if (%(zz)s->descr->type_num == PyArray_FLOAT)
......@@ -330,13 +363,52 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
char NOTRANS = 'N';
int Nx0 = %(xx)s->dimensions[0];
int Nx1 = %(xx)s->dimensions[1];
int Sx0 = %(xx)s->strides[0] / elemsize;
int Sx1 = %(xx)s->strides[1] / elemsize;
/* This formula is needed in the case where xx is actually a row or
* column matrix, because BLAS sometimes insists that the strides:
* - are not smaller than the number of elements in the array
* - are not 0.
*/
int Sx0 = (Nx0 > 1) ? (%(xx)s->strides[0] / elemsize) : (Nx1 + 1);
int Sx1 = (Nx1 > 1) ? (%(xx)s->strides[1] / elemsize) : (Nx0 + 1);
int Sz = %(zz)s->strides[0] / elemsize;
int Sy = %(yy)s->strides[0] / elemsize;
dtype_%(yy)s* yy_data = (dtype_%(yy)s*) %(yy)s->data;
dtype_%(zz)s* zz_data = (dtype_%(zz)s*) %(zz)s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sy < 0)
yy_data += (Nx1 - 1) * Sy;
if (Sz < 0)
zz_data += (Nx0 - 1) * Sz;
if (Nx0 * Nx1)
{
// If xx is neither C- nor F-contiguous, we make a copy.
// TODO:
// - if one stride is equal to "- elemsize", we can still call
// gemv on reversed matrix and vectors
// - if the copy is too long, maybe call vector/vector dot on
// each row instead
if ((%(xx)s->strides[0] < 0)
|| (%(xx)s->strides[1] < 0)
|| ((%(xx)s->strides[0] != elemsize)
&& (%(xx)s->strides[1] != elemsize)))
{
npy_intp dims[2];
dims[0] = Nx0;
dims[1] = Nx1;
PyArrayObject * xx_copy = (PyArrayObject *) PyArray_Copy(%(xx)s);
if (!xx_copy)
%(fail)s
Py_XDECREF(%(xx)s);
%(xx)s = xx_copy;
Sx0 = (Nx0 > 1) ? (%(xx)s->strides[0] / elemsize) : (Nx1 + 1);
Sx1 = (Nx1 > 1) ? (%(xx)s->strides[1] / elemsize) : (Nx0 + 1);
}
if (%(xx)s->strides[0] == elemsize)
{
if (%(xx)s->descr->type_num == PyArray_FLOAT)
......@@ -346,9 +418,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha,
(float*)(%(xx)s->data), &Sx1,
(float*)(%(yy)s->data), &Sy,
(float*)yy_data, &Sy,
&fbeta,
(float*)(%(zz)s->data), &Sz);
(float*)zz_data, &Sz);
}
else if (%(xx)s->descr->type_num == PyArray_DOUBLE)
{
......@@ -356,13 +428,14 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha,
(double*)(%(xx)s->data), &Sx1,
(double*)(%(yy)s->data), &Sy,
(double*)yy_data, &Sy,
&dbeta,
(double*)(%(zz)s->data), &Sz);
(double*)zz_data, &Sz);
}
else
{
assert(0);
PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)s
}
}
else if (%(xx)s->strides[1] == elemsize)
......@@ -376,9 +449,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&TRANS, &Nx1, &Nx0,
&alpha,
(float*)(%(xx)s->data), &Sx0,
(float*)(%(yy)s->data), &Sy,
(float*)yy_data, &Sy,
&fbeta,
(float*)(%(zz)s->data), &Sz);
(float*)zz_data, &Sz);
}
else if (%(xx)s->descr->type_num == PyArray_DOUBLE)
{
......@@ -386,20 +459,22 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&TRANS, &Nx1, &Nx0,
&alpha,
(double*)(%(xx)s->data), &Sx0,
(double*)(%(yy)s->data), &Sy,
(double*)yy_data, &Sy,
&dbeta,
(double*)(%(zz)s->data), &Sz);
(double*)zz_data, &Sz);
}
else
{
assert(0);
PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)s
}
}
else
{
// if xx is strided in both directions, then just do the gemv with a
// pair of for loops.
assert (0);
PyErr_SetString(PyExc_AssertionError,
"xx is a double-strided matrix, and should have been copied "
"into a memory-contiguous one.");
%(fail)s
}
}
else if (dbeta != 1.0)
......@@ -429,7 +504,7 @@ class CGemv(BaseBLAS, Gemv):
return code
def c_code_cache_version(self):
return (2,)
return (9,)
@local_optimizer([gemv_inplace, gemv_no_inplace])
......
......@@ -44,7 +44,12 @@ class ScipyGer(Ger):
# N.B. some versions of scipy (e.g. mine) don't actually work
# in-place on a, even when I tell it to.
A = cA[0]
if A.flags['C_CONTIGUOUS']:
if A.size == 0:
# We don't have to do anything, A is empty.
# We need this special case because Numpy considers it
# C-contiguous, which is confusing.
pass
elif A.flags['C_CONTIGUOUS']:
A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
overwrite_a=int(self.destructive)).T
else:
......
#from nose.plugins.skip import SkipTest
#import traceback
import sys
import itertools, sys
import theano.tensor as T
from theano import tensor
#from theano.gof import Env
from theano.printing import pp
......@@ -878,6 +879,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(v.get_value(), m.get_value()))
# Assert it works when m has no contiguous dimension
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(f(), numpy.dot(v.get_value(), m.get_value()))
def test_dot_mv(self):
......@@ -894,6 +900,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(m.get_value(), v.get_value()))
# Assert it works when m has no contiguous dimension
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(f(), numpy.dot(m.get_value(), v.get_value()))
@staticmethod
def t_gemv1(m_shp):
......@@ -915,19 +926,30 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert topo[0].op.inplace==False
#test the inplace version
f = theano.function([], [], updates={v2:v2+theano.dot(m,v1)}
g = theano.function([], [], updates={v2:v2+theano.dot(m,v1)}
, mode = mode_blas_opt)
# Assert they produce the same output
f()
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = f.maker.env.toposort()
topo = g.maker.env.toposort()
assert len(topo)==1
assert isinstance(topo[0].op, Gemv)
if config.mode != 'FAST_COMPILE':
assert topo[0].op.inplace==True
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self):
self.t_gemv1((3,2))
self.t_gemv1((0,2))
......@@ -952,18 +974,29 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert topo[-1].op.inplace==False
#test the inplace version
f = theano.function([], [], updates={v2:v2+theano.dot(v1,m)}
g = theano.function([], [], updates={v2:v2+theano.dot(v1,m)}
, mode = mode_blas_opt)
# Assert they produce the same output
f()
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(v1.get_value(), m.get_value()) + v2_orig)
topo = f.maker.env.toposort()
topo = g.maker.env.toposort()
assert sum(isinstance(node.op, Gemv) for node in topo)==1
if config.mode != 'FAST_COMPILE':
assert topo[-1].op.inplace==True
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(v1.get_value(), m.get_value()) + v2.get_value())
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(v1.get_value(), m.get_value()) + v2_orig)
def test_gemv_dimensions(self):
A = T.matrix('A')
x, y = T.vectors('x', 'y')
......@@ -984,6 +1017,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=config.floatX)
f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
......@@ -1017,6 +1051,7 @@ def matrixmultiply(a, b):
class BaseGemv(object):
mode = mode_blas_opt # can be overridden with self.mode
shared = staticmethod(theano.shared)
def get_data(self,x_stride=1,y_stride=1):
rng = numpy.random.RandomState(unittest_tools.fetch_seed())
......@@ -1031,7 +1066,7 @@ class BaseGemv(object):
return alpha,beta,a,x,y
def test_simple(self):
alpha, beta, a, x, y = [ shared(value) for value in self.get_data() ]
alpha, beta, a, x, y = [ self.shared(value) for value in self.get_data() ]
desired_oy = alpha.get_value() * matrixmultiply(a.get_value(),x.get_value()) + beta.get_value() * y.get_value()
oy = alpha * T.dot(a,x) + beta * y
......@@ -1049,8 +1084,8 @@ class BaseGemv(object):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
a = shared(a_v)
x = shared(x_v)
a = self.shared(a_v)
x = self.shared(x_v)
desired_oy = matrixmultiply(a_v, x_v)
......@@ -1067,7 +1102,7 @@ class BaseGemv(object):
def test_simple_transpose(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v
......@@ -1083,7 +1118,7 @@ class BaseGemv(object):
def test_x_stride(self):
vs = self.get_data(x_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(a_v,x_v[::2])+beta_v*y_v
......@@ -1099,7 +1134,7 @@ class BaseGemv(object):
def test_x_stride_transpose(self):
vs = self.get_data(x_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v[::2])+beta_v*y_v
......@@ -1115,7 +1150,7 @@ class BaseGemv(object):
def test_y_stride(self):
vs = self.get_data(y_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(a_v,x_v)+beta_v*y_v[::2]
......@@ -1131,7 +1166,7 @@ class BaseGemv(object):
def test_y_stride_transpose(self):
vs = self.get_data(y_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v[::2]
......@@ -1144,6 +1179,46 @@ class BaseGemv(object):
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_a_strides(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
a_v = a_v[::-1, ::-1]
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
desired_oy = alpha_v * matrixmultiply(a_v,x_v)+beta_v*y_v
oy = alpha * T.dot(a,x)+beta*y
oy_func = theano.function([], oy, mode=self.mode)
self.assertFunctionContains1(oy_func, self.gemv)
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_a_strides_transpose(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
a_v = a_v[::-1, ::-1]
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v
oy = alpha * T.dot(a.T,x)+beta*y
oy_func = theano.function([], oy, mode=self.mode)
self.assertFunctionContains1(oy_func, self.gemv)
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_upcasting_scalar_nogemv(self):
# Test that the optimization does not crash when the scale has
# an incorrect dtype, and forces upcasting of the result
......@@ -1281,6 +1356,7 @@ class TestGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
shared = staticmethod(theano.shared)
def setUp(self):
self.mode = theano.compile.get_default_mode().including('fast_run')
......@@ -1332,6 +1408,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def test_A_plus_scaled_outer(self):
f = self.function([self.A, self.x, self.y],
......@@ -1340,6 +1419,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def test_scaled_A_plus_scaled_outer(self):
f = self.function([self.A, self.x, self.y],
......@@ -1352,6 +1434,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def given_dtype(self, dtype, M, N):
""" test corner case shape and dtype"""
......@@ -1362,6 +1447,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(M, N).astype(self.dtype),
numpy.random.rand(M).astype(self.dtype),
numpy.random.rand(N).astype(self.dtype))
f(numpy.random.rand(M, N).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(M).astype(self.dtype),
numpy.random.rand(N).astype(self.dtype))
def test_f32_0_0(self):
return self.given_dtype('float32', 0, 0)
......@@ -1394,10 +1482,393 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
return self.given_dtype('complex128', 1, 9)
def test_inplace(self):
A = theano.shared(numpy.random.rand(4, 5).astype(self.dtype))
A = self.shared(numpy.random.rand(4, 5).astype(self.dtype))
f = self.function([self.x, self.y], [],
updates={A: A + T.constant(0.1, dtype=self.dtype) *
T.outer(self.x, self.y)})
self.assertFunctionContains(f, self.ger_destructive)
f(numpy.random.rand(4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype))
A.set_value(
A.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(numpy.random.rand(4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype))
class TestBlasStrides(TestCase):
dtype = 'float64'
shared = staticmethod(tensor._shared)
mode = theano.compile.get_default_mode()
mode = mode.including('fast_run').excluding('gpu', 'c_blas', 'scipy_blas')
rng = numpy.random.RandomState(seed=unittest_tools.fetch_seed())
def rand(self, *shape):
return theano._asarray(self.rng.rand(*shape), dtype=self.dtype)
def cmp_dot22(self, b_shp, c_shp):
av = numpy.zeros((0, 0), dtype=self.dtype)
bv = self.rand(*b_shp)
cv = self.rand(*c_shp)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
b_t = self.shared(bv.T, 'b.T')
c_t = self.shared(cv.T, 'c.T')
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
f_nn = theano.function([], [], updates={a: tensor.dot(b, c)},
mode=self.mode)
print 'class name:', self.__class__.__name__
theano.printing.debugprint(f_nn)
f_nt = theano.function([], [], updates={a: tensor.dot(b, c_t.T)},
mode=self.mode)
f_tn = theano.function([], [], updates={a: tensor.dot(b_t.T, c)},
mode=self.mode)
f_tt = theano.function([], [], updates={a: tensor.dot(b_t.T, c_t.T)},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((-1, 1), repeat=4):
for step in (1, 2):
b_step1, b_step2, c_step1, c_step2 = (s * step
for s in step_signs)
b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1], borrow=True)
c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1], borrow=True)
# Numpy result
a_n = numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2])
f_nn()
assert numpy.allclose(a.get_value(), a_n)
f_nt()
assert numpy.allclose(a.get_value(), a_n)
f_tn()
assert numpy.allclose(a.get_value(), a_n)
f_tt()
assert numpy.allclose(a.get_value(), a_n)
def test_dot22(self):
self.cmp_dot22((3, 4), (4, 5))
self.cmp_dot22((1, 4), (4, 5))
self.cmp_dot22((3, 4), (4, 1))
self.cmp_dot22((3, 1), (1, 1))
self.cmp_dot22((1, 4), (4, 1))
self.cmp_dot22((3, 1), (1, 5))
self.cmp_dot22((0, 4), (4, 5))
self.cmp_dot22((0, 4), (4, 1))
self.cmp_dot22((0, 1), (1, 5))
self.cmp_dot22((3, 4), (4, 0))
self.cmp_dot22((3, 0), (0, 5))
self.cmp_dot22((0, 4), (4, 0))
self.cmp_dot22((0, 0), (0, 0))
def cmp_dot22scalar(self, b_shp, c_shp):
av = numpy.zeros((0, 0), dtype=self.dtype)
bv = self.rand(*b_shp)
cv = self.rand(*c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
b_t = self.shared(bv.T, 'b.T')
c_t = self.shared(cv.T, 'c.T')
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
f_nn = theano.function([], [], updates={a: l * tensor.dot(b, c)},
mode=self.mode)
f_nt = theano.function([], [], updates={a: l * tensor.dot(b, c_t.T)},
mode=self.mode)
f_tn = theano.function([], [], updates={a: l * tensor.dot(b_t.T, c)},
mode=self.mode)
f_tt = theano.function([], [],
updates={a: l * tensor.dot(b_t.T, c_t.T)},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((-1, 1), repeat=4):
for step in (1, 2):
b_step1, b_step2, c_step1, c_step2 = (s * step
for s in step_signs)
b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1], borrow=True)
c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1], borrow=True)
# Numpy result
a_n = l * numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2])
f_nn()
assert numpy.allclose(a.get_value(), a_n)
f_nt()
assert numpy.allclose(a.get_value(), a_n)
f_tn()
assert numpy.allclose(a.get_value(), a_n)
f_tt()
assert numpy.allclose(a.get_value(), a_n)
def test_dot22scalar(self):
self.cmp_dot22scalar((3, 4), (4, 5))
self.cmp_dot22scalar((1, 4), (4, 5))
self.cmp_dot22scalar((3, 4), (4, 1))
self.cmp_dot22scalar((3, 1), (1, 1))
self.cmp_dot22scalar((1, 4), (4, 1))
self.cmp_dot22scalar((3, 1), (1, 5))
self.cmp_dot22scalar((0, 4), (4, 5))
self.cmp_dot22scalar((0, 4), (4, 1))
self.cmp_dot22scalar((0, 1), (1, 5))
self.cmp_dot22scalar((3, 4), (4, 0))
self.cmp_dot22scalar((3, 0), (0, 5))
self.cmp_dot22scalar((0, 4), (4, 0))
self.cmp_dot22scalar((0, 0), (0, 0))
def cmp_gemm(self, a_shp, b_shp, c_shp):
av = self.rand(*a_shp)
bv = self.rand(*b_shp)
cv = self.rand(*c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
a_t = self.shared(av.T, 'a.T')
b_t = self.shared(bv.T, 'b.T')
c_t = self.shared(cv.T, 'c.T')
a_dev = a.get_value(borrow=False, return_internal_type=True)
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
f_nnn = theano.function([], [],
updates={a: (l * a + tensor.dot(b, c))},
mode=self.mode)
f_nnt = theano.function([], [],
updates={a: (l * a + tensor.dot(b, c_t.T))},
mode=self.mode)
f_ntn = theano.function([], [],
updates={a: (l * a + tensor.dot(b_t.T, c))},
mode=self.mode)
f_ntt = theano.function([], [],
updates={a: (l * a + tensor.dot(b_t.T, c_t.T))},
mode=self.mode)
f_tnn = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b, c).T)},
mode=self.mode)
f_tnt = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b, c_t.T).T)},
mode=self.mode)
f_ttn = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b_t.T, c).T)},
mode=self.mode)
f_ttt = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b_t.T, c_t.T).T)},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((-1, 1), repeat=6):
for step in (1, 2):
a_step1, a_step2, b_step1, b_step2, c_step1, c_step2 = \
(s * step for s in step_signs)
b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1], borrow=True)
c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1], borrow=True)
# Numpy results
a_n = (l * av[::a_step1, ::a_step2]
+ numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2]))
at_n = (l * av[::a_step1, ::a_step2].T
+ numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2]).T)
# a's value is updated, so we need to reinitialize it each time
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_nnn()
assert numpy.allclose(a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_nnt()
assert numpy.allclose(a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_ntn()
assert numpy.allclose(a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_ntt()
assert numpy.allclose(a.get_value(), a_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_tnn()
assert numpy.allclose(a_t.get_value(), at_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_tnt()
assert numpy.allclose(a_t.get_value(), at_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_ttn()
assert numpy.allclose(a_t.get_value(), at_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_ttt()
assert numpy.allclose(a_t.get_value(), at_n)
def test_gemm(self):
self.cmp_gemm((3, 5), (3, 4), (4, 5))
self.cmp_gemm((1, 5), (1, 4), (4, 5))
self.cmp_gemm((3, 1), (3, 4), (4, 1))
self.cmp_gemm((3, 1), (3, 1), (1, 1))
self.cmp_gemm((1, 1), (1, 4), (4, 1))
self.cmp_gemm((3, 5), (3, 1), (1, 5))
self.cmp_gemm((0, 5), (0, 4), (4, 5))
self.cmp_gemm((0, 1), (0, 4), (4, 1))
self.cmp_gemm((0, 5), (0, 1), (1, 5))
self.cmp_gemm((3, 0), (3, 4), (4, 0))
self.cmp_gemm((3, 5), (3, 0), (0, 5))
self.cmp_gemm((0, 0), (0, 4), (4, 0))
self.cmp_gemm((0, 0), (0, 0), (0, 0))
def cmp_gemv(self, a_shp, b_shp, c_shp):
av = self.rand(a_shp)
bv = self.rand(*b_shp)
cv = self.rand(c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
b_t = self.shared(bv.T, 'b.T')
a_dev = a.get_value(borrow=False, return_internal_type=True)
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
f_n = theano.function([], [], updates={a: (a + l * tensor.dot(b, c))},
mode=self.mode)
f_t = theano.function([], [],
updates={a: (a + l * tensor.dot(b_t.T, c))},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((1, -1), repeat=4):
for step in (1, 2):
a_step, b_step1, b_step2, c_step = (s * step
for s in step_signs)
a.set_value(a_dev.copy()[::a_step], borrow=True)
b.set_value(b_dev.copy()[::b_step1, ::b_step2],
borrow=True)
b_t.set_value(transpose(b_dev.copy())[::b_step2, ::b_step1],
borrow=True)
c.set_value(c_dev.copy()[::c_step], borrow=True)
a_n = (av[::a_step]
+ l * numpy.dot(bv[::b_step1, ::b_step2], cv[::c_step]))
f_n()
assert numpy.allclose(a.get_value(), a_n), (a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step], borrow=True)
f_t()
assert numpy.allclose(a.get_value(), a_n), (a.get_value(), a_n)
def test_gemv(self):
self.cmp_gemv(3, (3, 5), 5)
self.cmp_gemv(1, (1, 5), 5)
self.cmp_gemv(3, (3, 1), 1)
self.cmp_gemv(0, (0, 5), 5)
self.cmp_gemv(3, (3, 0), 0)
self.cmp_gemv(0, (0, 1), 1)
self.cmp_gemv(1, (1, 0), 0)
self.cmp_gemv(0, (0, 0), 0)
def cmp_ger(self, a_shp, b_shp, c_shp):
av = self.rand(*a_shp)
bv = self.rand(b_shp)
cv = self.rand(c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
a_t = self.shared(av.T, 'a.T')
a_dev = a.get_value(borrow=False, return_internal_type=True)
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
f_n = theano.function([], [],
updates={a: (a + l * tensor.outer(b, c))},
mode=self.mode)
f_t = theano.function([], [],
updates={a_t: (a_t + l * tensor.outer(b, c).T)},
mode=self.mode)
# Try with all stride patterns, and all transposed patterns
for step_signs in itertools.product((1, -1), repeat=4):
for step in (1, 2):
a_step1, a_step2, b_step, c_step = (s * step
for s in step_signs)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
a_t.set_value(transpose(a_dev.copy())[::a_step1, ::a_step2],
borrow=True)
b.set_value(b_dev.copy()[::b_step], borrow=True)
c.set_value(c_dev.copy()[::c_step], borrow=True)
f_n()
n_n = (av[::a_step1, ::a_step2]
+ l * numpy.outer(bv[::b_step], cv[::c_step]))
assert numpy.allclose(a.get_value(), n_n), (a.get_value(), n_n)
f_t()
n_t = (av.T[::a_step1, ::a_step2]
+ l * numpy.outer(bv[::b_step], cv[::c_step]).T)
assert numpy.allclose(a_t.get_value(), n_t),\
(a_t.get_value(), n_t)
def test_ger_strides(self):
    """Run cmp_ger over a grid of matrix/vector shapes, including
    size-1 and size-0 dimensions."""
    # Each entry is (matrix shape, len(b), len(c)); the vector lengths
    # always match the matrix dimensions.
    for rows, cols in [(3, 5), (1, 5), (3, 1), (0, 5),
                       (3, 0), (0, 1), (1, 0), (0, 0)]:
        self.cmp_ger((rows, cols), rows, cols)
import sys
import numpy
from unittest import TestCase
import theano
import theano.tensor as tensor
......@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv
from theano.tests import unittest_tools
from theano.tests.unittest_tools import TestOptimizationMixin
from test_blas import TestCase
from test_blas import BaseGemv
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides
mode_blas_opt = theano.compile.get_default_mode().including(
'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
......@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin):
)
def run_f(self, f):
    """Invoke *f* on the stored test operands twice: once with the
    contiguous A matrix, and once with A reversed along both axes so
    that it has negative strides in both dimensions.

    Returns None; callers use this only for its side effects on *f*.
    """
    # NOTE(review): the text previously had `return f(...)` before
    # these calls, which made the negative-stride call unreachable
    # dead code; the return value was not meaningful, so the return
    # is dropped and both layouts are exercised.
    f(self.Aval, self.xval, self.yval)
    f(self.Aval[::-1, ::-1], self.xval, self.yval)
def b(self, bval):
    """Wrap *bval* in a symbolic constant of this test case's dtype."""
    host_value = numpy.asarray(bval, dtype=self.dtype)
    return tensor.as_tensor_variable(host_value)
......@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert numpy.allclose(f(self.xval, self.Aval),
numpy.dot(self.xval, self.Aval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.xval, self.Aval[::-1, ::-1]),
numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self):
''' Test matrix dot vector '''
f = theano.function([self.A, self.y],
......@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
# Assert they produce the same output
assert numpy.allclose(f(self.Aval, self.yval),
numpy.dot(self.Aval, self.yval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
numpy.dot(self.Aval[::-1, ::-1], self.yval))
def t_gemv1(self, m_shp):
''' test vector2 + dot(matrix, vector1) '''
......@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert topo == [CGemv(inplace=False)], topo
#test the inplace version
f = theano.function([], [],
g = theano.function([], [],
updates={v2:v2+theano.dot(m,v1)},
mode=self.mode)
# Assert they produce the same output
f()
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = [n.op for n in f.maker.env.toposort()]
topo = [n.op for n in g.maker.env.toposort()]
assert topo == [CGemv(inplace=True)]
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self):
self.t_gemv1((3,2))
self.t_gemv1((0,2))
......@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=dtype)
f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
......@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
dtype = 'float64'
gemv = CGemv(inplace=False)
gemv_inplace = CGemv(inplace=True)
class TestBlasStridesC(TestBlasStrides):
    # Re-run the shared stride-pattern test suite with the C-blas
    # optimization mode, so the c_blas code paths are exercised.
    mode = mode_blas_opt
......@@ -4,7 +4,7 @@ import theano
import theano.tensor as tensor
from theano.tensor.blas_scipy import ScipyGer
from test_blas import TestCase, gemm_no_inplace
from test_blas import TestCase, gemm_no_inplace, TestBlasStrides
from theano.tests.unittest_tools import TestOptimizationMixin
class TestScipyGer(TestCase, TestOptimizationMixin):
......@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
def run_f(self, f):
    """Invoke *f* twice: first on the contiguous stored operands,
    then with all three operands reversed (negative strides)."""
    argument_sets = (
        (self.Aval, self.xval, self.yval),
        (self.Aval[::-1, ::-1], self.xval[::-1], self.yval[::-1]),
    )
    for A, x, y in argument_sets:
        f(A, x, y)
def b(self, bval):
    """Return *bval* as a symbolic constant cast to this test's dtype."""
    cast_value = numpy.asarray(bval, dtype=self.dtype)
    return tensor.as_tensor_variable(cast_value)
......@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
0.2 * self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, gemm_no_inplace)
self.run_f(f) #DebugMode tests correctness
class TestBlasStridesScipy(TestBlasStrides):
    # Re-run the shared stride-pattern tests on CPU: fast_run enabled,
    # with gpu and c_blas excluded so the scipy implementations run.
    mode = theano.compile.get_default_mode().including(
        'fast_run').excluding('gpu', 'c_blas')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论