提交 1f6b0c73 authored 作者: nouiz's avatar nouiz

Merge pull request #437 from lamblin/blas_double_strides

Make blas functions work with all stride patterns
...@@ -192,7 +192,7 @@ class GpuGemm(GpuOp): ...@@ -192,7 +192,7 @@ class GpuGemm(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in #z_out = alpha * dot(x,y) + beta * z_in
...@@ -200,6 +200,7 @@ class GpuGemm(GpuOp): ...@@ -200,6 +200,7 @@ class GpuGemm(GpuOp):
#not inplace version, we copy z_in to z_out. #not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
z_out, = outputs z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
...@@ -215,39 +216,50 @@ class GpuGemm(GpuOp): ...@@ -215,39 +216,50 @@ class GpuGemm(GpuOp):
: (REAL)(((double*)%(b)s->data)[0]); : (REAL)(((double*)%(b)s->data)[0]);
#undef REAL #undef REAL
""" if (%(inplace)s
if self.inplace: && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
print >> sio, """ && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s); Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
""" }
else: else if (%(z_out)s
print >> sio, """ && (%(z_out)s->nd == 2)
if (!%(z_out)s && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
|| (%(z_out)s->nd != 2) == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]) == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{ {
Py_XDECREF(%(z_out)s); // The existing output has an appropriate layout,
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s); // copy the input data into it, then work inplace
if (!%(z_out)s) if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{ {
%(fail)s; %(fail)s;
} }
} }
else else
{ {
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) // Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{ {
%(fail)s; %(fail)s;
} }
} }
"""
print >> sio, """
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s)) if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{ {
%(fail)s; %(fail)s;
...@@ -294,7 +306,7 @@ class GpuGemv(GpuOp): ...@@ -294,7 +306,7 @@ class GpuGemv(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in #z_out = alpha * dot(x,y) + beta * z_in
...@@ -302,44 +314,46 @@ class GpuGemv(GpuOp): ...@@ -302,44 +314,46 @@ class GpuGemv(GpuOp):
#not inplace version, we copy z_in to z_out. #not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
z_out, = outputs z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
print >> sio, """ print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0]; float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0]; float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
"""
if self.inplace: if (%(inplace)s
print >> sio, """ && ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
{
// Work inplace on the input
Py_XDECREF(%(z_out)s); Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
""" }
else: else if (%(z_out)s
print >> sio, """ && ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
if (!%(z_out)s || ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
|| (%(z_out)s->nd != 1) && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
)
{ {
Py_XDECREF(%(z_out)s); // Work on the output
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s); if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
if (!%(z_out)s)
{ {
%(fail)s; %(fail)s;
} }
} }
else else
{ {
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) // Copy
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{ {
%(fail)s; %(fail)s;
} }
} }
"""
print >> sio, """
if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s)) if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
{ {
%(fail)s; %(fail)s;
...@@ -385,7 +399,7 @@ class GpuGer(GpuOp): ...@@ -385,7 +399,7 @@ class GpuGer(GpuOp):
return Apply(self, [z, a, x, y], [z.type()]) return Apply(self, [z, a, x, y], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in #z_out = alpha * dot(x,y) + beta * z_in
...@@ -393,44 +407,57 @@ class GpuGer(GpuOp): ...@@ -393,44 +407,57 @@ class GpuGer(GpuOp):
#not inplace version, we copy z_in to z_out. #not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs z_in, a, x, y = inputs
z_out, = outputs z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
print >> sio, """ print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0]; float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
"""
if self.inplace: if (%(inplace)s
print >> sio, """ && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s); Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
""" }
else: else if (%(z_out)s
print >> sio, """ && (%(z_out)s->nd == 2)
if (!%(z_out)s && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
|| (%(z_out)s->nd != 2) == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]) == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{ {
Py_XDECREF(%(z_out)s); // The existing output has an appropriate layout,
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s); // copy the input data into it, then work inplace
if (!%(z_out)s) if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{ {
%(fail)s; %(fail)s;
} }
} }
else else
{ {
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) // Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{ {
%(fail)s; %(fail)s;
} }
} }
"""
print >> sio, """
if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s)) if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
{ {
%(fail)s; %(fail)s;
......
...@@ -453,7 +453,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape) ...@@ -453,7 +453,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
PyObject * CudaNdarray_Copy(CudaNdarray * self) PyObject * CudaNdarray_Copy(const CudaNdarray * self)
{ {
PyObject * rval = CudaNdarray_New(); PyObject * rval = CudaNdarray_New();
if ((!rval) || (-1 == self->nd)) if ((!rval) || (-1 == self->nd))
...@@ -2777,7 +2777,7 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo ...@@ -2777,7 +2777,7 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
} }
//copy from other into self //copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast) int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast)
{ {
int verbose = 0; int verbose = 0;
if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n"); if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
...@@ -2856,7 +2856,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo ...@@ -2856,7 +2856,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
// call worker routine // call worker routine
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int threads_per_block = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); unsigned int threads_per_block = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
CudaNdarray * cuda_dims = other; const CudaNdarray * cuda_dims = other;
if(unbroadcast) if(unbroadcast)
cuda_dims = self; cuda_dims = self;
//copy from other into self //copy from other into self
...@@ -2882,9 +2882,21 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo ...@@ -2882,9 +2882,21 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C) int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{ {
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; } if (A->nd != 2)
if (B->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; } {
if (C->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; } PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
return -1;
}
if (B->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
return -1;
}
if (C->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
return -1;
}
// We must allow dimensions to be zeros. // We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0]) if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
...@@ -2901,14 +2913,72 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -2901,14 +2913,72 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1; return -1;
} }
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place // If matrix A or B has non-unit size and non-unit stride in both
// TODO: make a copy instead of returning in error // dimensions, we can make a copy.
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1))) if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; } && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
if (((CudaNdarray_HOST_DIMS(B)[0] > 1) && (CudaNdarray_HOST_STRIDES(B)[0] != 1)) && ((CudaNdarray_HOST_DIMS(B)[1] > 1) && (CudaNdarray_HOST_STRIDES(B)[1] != 1))) && (CudaNdarray_HOST_DIMS(A)[1] > 1)
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; } && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
if (((CudaNdarray_HOST_DIMS(C)[0] > 1) && (CudaNdarray_HOST_STRIDES(C)[0] != 1)) && ((CudaNdarray_HOST_DIMS(C)[1] > 1) && (CudaNdarray_HOST_STRIDES(C)[1] != 1))) || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; } || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
if (!A_new)
return -1;
A = A_new;
}
else
{
// In the case above, we will need to decref A_new at the end.
// To make things simpler, we incref A here, so we can always
// decref A.
Py_INCREF(A);
}
if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[0] != 1)
&& (CudaNdarray_HOST_DIMS(B)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0))
{
const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
if (!B_new)
{
Py_XDECREF(A);
return -1;
}
B = B_new;
}
else
{
// In the case above, we will need to decref B_new at the end.
// To make things simpler, we incref B here, so we can always
// decref B.
Py_INCREF(B);
}
// If matrix C has non-unit size and non-unit stride in both
// dimensions, or negative strides, we can't operate. We cannot copy
// C either, because the calling code will expect the result to be
// in the original C container.
if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[0] != 1)
&& (CudaNdarray_HOST_DIMS(C)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_AssertionError,
"non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1],
CudaNdarray_HOST_DIMS(C)[0],
CudaNdarray_HOST_DIMS(C)[1]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
// the unit integer is divided logically into three fields of 4 bits // the unit integer is divided logically into three fields of 4 bits
// the lowermost 4 bits encode the stride pattern of the output // the lowermost 4 bits encode the stride pattern of the output
...@@ -2920,47 +2990,31 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -2920,47 +2990,31 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
// a stride of 0 implies a dimension of 1 - so we can actually define // a stride of 0 implies a dimension of 1 - so we can actually define
// a stride of 0 as a 'unit' stride because gemm will never use it. // a stride of 0 as a 'unit' stride because gemm will never use it.
// If a dimension is 0, its stride will not be used either, so we can
// consider it a 'unit' stride too.
int unit = 0; int unit = 0;
if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_STRIDES(A)[1] == 0) { if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
unit |= (0x0 << 8); unit |= (0x0 << 8);
} else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_STRIDES(A)[0] == 0) { } else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
unit |= (0x1 << 8); unit |= (0x1 << 8);
} else { } else {
unit |= (0x2 << 8); unit |= (0x2 << 8);
} }
if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_STRIDES(B)[1] == 0) { if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
unit |= (0x0 << 4); unit |= (0x0 << 4);
} else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_STRIDES(B)[0] == 0) { } else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
unit |= (0x1 << 4); unit |= (0x1 << 4);
} else { } else {
unit |= (0x2 << 4); unit |= (0x2 << 4);
} }
if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_STRIDES(C)[1] == 0) { if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
unit |= (0x0 << 0); unit |= (0x0 << 0);
} else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_STRIDES(C)[0] == 0) { } else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
unit |= (0x1 << 0); unit |= (0x1 << 0);
} else { } else {
unit |= (0x2 << 0); unit |= (0x2 << 0);
} }
// I don't know if cudablas handles negative strides
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(B)[1],
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1]);
return -1;
}
/* create appropriate strides for malformed matrices that are row or column /* create appropriate strides for malformed matrices that are row or column
* vectors * vectors
*/ */
...@@ -2977,8 +3031,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -2977,8 +3031,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
char N = 'N'; char N = 'N';
char T = 'T'; char T = 'T';
//std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n'; //std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
//TODO: recognize the negative stride and make a copy of the offending argument, // There should be no negative stride at that point
//rather than aborting
#define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \ #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
if (sx == 0){sx = 1;}\ if (sx == 0){sx = 1;}\
if (sy == 0){sy = 1;}\ if (sy == 0){sy = 1;}\
...@@ -2986,7 +3039,9 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -2986,7 +3039,9 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if ((sx > 0) && (sy > 0) && (sz > 0)) { \ if ((sx > 0) && (sy > 0) && (sz > 0)) { \
cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \ cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
} else { \ } else { \
PyErr_SetString(PyExc_NotImplementedError, "negative stride to sGemm");\ PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
Py_XDECREF(A);\
Py_XDECREF(B);\
return -1; \ return -1; \
} }
...@@ -3000,14 +3055,19 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -3000,14 +3055,19 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break; case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break; case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break; case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%i)", unit); default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
return -1; return -1;
}; };
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); Py_XDECREF(A);
Py_XDECREF(B);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err) if (CUBLAS_STATUS_SUCCESS != err)
{ {
PyErr_Format(PyExc_RuntimeError, "cublassGemm failed (%s)",cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
"cublasSgemm failed (%i)",
err);
return -1; return -1;
} }
return 0; return 0;
...@@ -3037,22 +3097,53 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -3037,22 +3097,53 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1; return -1;
} }
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place // If matrix A has non-unit size and non-unit stride in both
// TODO: make a copy instead of returning in error // dimensions, or negative strides, we cannot operate, but we can
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1))) // make a copy.
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg"); return -1; } if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] != 1)
&& (CudaNdarray_HOST_DIMS(A)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
if (!A_new)
return -1;
A = A_new;
}
else
{
// Incref A, so we can decref it at the end in all cases
Py_INCREF(A);
}
// If vector B as a negative stride, we also have to make a copy.
if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
{
const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
if (!B_new)
{
Py_XDECREF(A);
return -1;
}
B = B_new;
}
else
{
// Incref B, so we can decref it at the end in all cases
Py_INCREF(B);
}
// I don't know if cudablas handles negative strides // cudablas does not handle negative strides as expected
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0) if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0) || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0))
{ {
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)", PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
CudaNdarray_HOST_STRIDES(A)[0], CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1], CudaNdarray_HOST_STRIDES(A)[1]);
CudaNdarray_HOST_STRIDES(B)[0], Py_XDECREF(A);
CudaNdarray_HOST_STRIDES(C)[0]); Py_XDECREF(B);
return -1; return -1;
} }
...@@ -3064,7 +3155,15 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -3064,7 +3155,15 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1; int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1; int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
if (sa_0 == 1) if (sa_0 == 0)
sa_0 = 1;
if (sa_1 == 0)
sa_1 = 1;
if (CudaNdarray_SIZE(C)) {
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{ {
cublasSgemv('N', cublasSgemv('N',
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
...@@ -3074,7 +3173,9 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -3074,7 +3173,9 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
beta, beta,
CudaNdarray_DEV_DATA(C), sc_0); CudaNdarray_DEV_DATA(C), sc_0);
} }
else if (sa_1 == 1) else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{ {
cublasSgemv('T', cublasSgemv('T',
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
...@@ -3086,20 +3187,39 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -3086,20 +3187,39 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
} }
else else
{ {
PyErr_SetString(PyExc_NotImplementedError, "too many strides strides in sgemv"); PyErr_Format(PyExc_AssertionError,
"Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
"Shapes are: (%i, %i) x %i -> %i\n",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1],
CudaNdarray_HOST_DIMS(B)[0],
CudaNdarray_HOST_DIMS(C)[0]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1; return -1;
} }
}
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); Py_XDECREF(A);
Py_XDECREF(B);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err) if (CUBLAS_STATUS_SUCCESS != err)
{ {
PyErr_Format(PyExc_RuntimeError, "cublassGemv failed (%s)",cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
"cublasSgemv failed (%i)",
err);
return -1; return -1;
} }
return 0; return 0;
} }
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) { int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; } if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; } if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; } if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
...@@ -3115,14 +3235,8 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray ...@@ -3115,14 +3235,8 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
return -1; return -1;
} }
// Maybe this could work, but be safe for now
if (!CudaNdarray_is_c_contiguous(A)) {
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1;
}
// Since Sger expects A in col-major, we invert x and y to fake this.
int x_strides = CudaNdarray_HOST_STRIDES(x)[0]; int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
CudaNdarray * x_ = x; const CudaNdarray * x_ = x;
if(x_strides == 0){ if(x_strides == 0){
if(CudaNdarray_HOST_DIMS(x)[0] != 1){ if(CudaNdarray_HOST_DIMS(x)[0] != 1){
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -3131,14 +3245,14 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray ...@@ -3131,14 +3245,14 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!"); " that have more then 1 elements!");
return -1; return -1;
} }
x_strides = 4; x_strides = 1;
} else if(x_strides < 0){ } else if(x_strides < 0){
x_ = (CudaNdarray*)CudaNdarray_Copy(x); x_ = (CudaNdarray*)CudaNdarray_Copy(x);
x_strides = CudaNdarray_HOST_STRIDES(x_)[0]; x_strides = CudaNdarray_HOST_STRIDES(x_)[0];
} }
int y_strides = CudaNdarray_HOST_STRIDES(y)[0]; int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
CudaNdarray * y_ = y; const CudaNdarray * y_ = y;
if(y_strides == 0){ if(y_strides == 0){
if(CudaNdarray_HOST_DIMS(y)[0] != 1){ if(CudaNdarray_HOST_DIMS(y)[0] != 1){
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -3147,17 +3261,46 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray ...@@ -3147,17 +3261,46 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!"); " that have more then 1 elements!");
return -1; return -1;
} }
y_strides = 4; y_strides = 1;
} else if(y_strides < 0){ } else if(y_strides < 0){
y_ = (CudaNdarray*)CudaNdarray_Copy(y); y_ = (CudaNdarray*)CudaNdarray_Copy(y);
y_strides = CudaNdarray_HOST_STRIDES(y_)[0]; y_strides = CudaNdarray_HOST_STRIDES(y_)[0];
} }
// Create appropriate strides if A is a row or column vector
int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
: CudaNdarray_HOST_DIMS(A)[1];
int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
: CudaNdarray_HOST_DIMS(A)[0];
if(CudaNdarray_SIZE(A)){ if(CudaNdarray_SIZE(A)){
// If A is in col-major
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], alpha,
CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(A), sa_1);
}
// Since Sger expects A in col-major, we invert x and y to fake this.
else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha, cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y_), y_strides, CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(x_), x_strides, CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]); CudaNdarray_DEV_DATA(A), sa_0);
}
// A has to be either c- or f-contiguous, with no negative strides
else
{
PyErr_SetString(PyExc_NotImplementedError,
"non-contiguous A, or negative strides, in sger");
return -1;
}
} }
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
if(x_ != x) if(x_ != x)
...@@ -3165,12 +3308,15 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray ...@@ -3165,12 +3308,15 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
if(y_ != y) if(y_ != y)
Py_DECREF(y_); Py_DECREF(y_);
cudaError_t err = cudaGetLastError(); cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err) if (CUBLAS_STATUS_SUCCESS != err)
{ {
PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)",cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
"cublasSger failed (%i)",
err);
return -1; return -1;
} }
return 0; return 0;
} }
...@@ -3816,7 +3962,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s) ...@@ -3816,7 +3962,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
int int
cnda_copy_structure_to_device(CudaNdarray * self) cnda_copy_structure_to_device(const CudaNdarray * self)
{ {
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1); cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -3830,7 +3976,7 @@ cnda_copy_structure_to_device(CudaNdarray * self) ...@@ -3830,7 +3976,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
} }
const int * const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self) CudaNdarray_DEV_DIMS(const CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
{ {
...@@ -3840,7 +3986,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self) ...@@ -3840,7 +3986,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
return self->dev_structure; return self->dev_structure;
} }
const int * const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self) CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
{ {
...@@ -3850,7 +3996,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self) ...@@ -3850,7 +3996,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
return self->dev_structure + self->nd; return self->dev_structure + self->nd;
} }
const int * const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self) CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
{ {
...@@ -3991,4 +4137,4 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -3991,4 +4137,4 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
fill-column:79 fill-column:79
End: End:
*/ */
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 : // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -81,7 +81,7 @@ struct CudaNdarray ...@@ -81,7 +81,7 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc) //device pointers (allocated by cudaMalloc)
int dev_structure_fresh; mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized //dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ... int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
...@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s); ...@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
* *
* This means: recalculate the log2dims and transfer structure to the card * This means: recalculate the log2dims and transfer structure to the card
*/ */
DllExport int cnda_copy_structure_to_device(CudaNdarray * self); DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(CudaNdarray * self); DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(CudaNdarray * self); DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self); DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self); DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
/** /**
...@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i ...@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
return -1; return -1;
} }
assert(size>0); if (size < 0)
{
PyErr_Format(PyExc_AssertionError,
"size (%i) < 0",
size);
return -1;
}
self->devdata = (float*)device_malloc(size*sizeof(real)); self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata) if (size && !self->devdata)
{ {
CudaNdarray_set_nd(self,-1); CudaNdarray_set_nd(self, -1);
self->data_allocated = 0; self->data_allocated = 0;
self->devdata = 0; self->devdata = 0;
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate memory on device");
return -1; return -1;
} }
if (0) if (0)
...@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo); ...@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/** /**
* Return an independent copy of self * Return an independent copy of self
*/ */
DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self); DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);
/** /**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask. * Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
...@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj); ...@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
* *
* self is reallocated to have the correct dimensions if necessary. * self is reallocated to have the correct dimensions if necessary.
*/ */
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false); DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
/** /**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray. * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
...@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self); ...@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A); DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
...@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); ...@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
fill-column:79 fill-column:79
End: End:
*/ */
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 : // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
import itertools
from unittest import TestCase from unittest import TestCase
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
...@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn ...@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
import theano.compile.mode import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestGer from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
...@@ -32,20 +33,31 @@ else: ...@@ -32,20 +33,31 @@ else:
def my_rand(*shape): def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32') return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def transpose(cuda_mat):
# The easiest way to transpose a cuda matrix for now
return tcn.dimshuffle(cuda_mat, [1, 0])
def test_dot22(): def test_dot22():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a') a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix() b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp) bval = my_rand(*b_shp)
f(bval) f(bval)
assert numpy.allclose(numpy.dot(a0, bval), a.get_value()) assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval)
cmp((3,4),(4,5)) cmp((3,4),(4,5))
cmp((0,4),(4,5)) cmp((0,4),(4,5))
cmp((3,4),(4,0)) cmp((3,4),(4,0))
...@@ -90,7 +102,8 @@ def test_dot22scalar(): ...@@ -90,7 +102,8 @@ def test_dot22scalar():
def test_gemm(): def test_gemm():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a') a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix('b') b = tensor.fmatrix('b')
c = tensor.fmatrix('c') c = tensor.fmatrix('c')
...@@ -98,12 +111,19 @@ def test_gemm(): ...@@ -98,12 +111,19 @@ def test_gemm():
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu) f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()]) assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp) bval = my_rand(*b_shp)
cval = my_rand(a_shp[0],b_shp[1]) cval = my_rand(a_shp[0],b_shp[1])
f(bval,cval) f(bval,cval)
assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value()) assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, cval)
cmp((3,4),(4,5)) cmp((3,4),(4,5))
cmp((0,4),(4,5)) cmp((0,4),(4,5))
cmp((3,4),(4,0)) cmp((3,4),(4,0))
...@@ -114,7 +134,8 @@ def test_gemm(): ...@@ -114,7 +134,8 @@ def test_gemm():
def test_gemm_no_inplace(): def test_gemm_no_inplace():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a') a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
cval = my_rand(a_shp[0], b_shp[1]) cval = my_rand(a_shp[0], b_shp[1])
c = tcn.shared_constructor(cval.copy(), 'c') c = tcn.shared_constructor(cval.copy(), 'c')
...@@ -123,7 +144,6 @@ def test_gemm_no_inplace(): ...@@ -123,7 +144,6 @@ def test_gemm_no_inplace():
f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu) f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()]) assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
bval = my_rand(*b_shp) bval = my_rand(*b_shp)
bval2 = my_rand(*b_shp) bval2 = my_rand(*b_shp)
...@@ -132,6 +152,13 @@ def test_gemm_no_inplace(): ...@@ -132,6 +152,13 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value()) assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval) assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, bval2)
cmp((3,4),(4,5)) cmp((3,4),(4,5))
cmp((0,4),(4,5)) cmp((0,4),(4,5))
cmp((3,4),(4,0)) cmp((3,4),(4,0))
...@@ -139,6 +166,13 @@ def test_gemm_no_inplace(): ...@@ -139,6 +166,13 @@ def test_gemm_no_inplace():
cmp((0,4),(4,0)) cmp((0,4),(4,0))
cmp((0,0),(0,0)) cmp((0,0),(0,0))
class TestBlasStridesGpu(TestBlasStrides):
dtype = 'float32'
shared = staticmethod(tcn.shared_constructor)
mode = mode_with_gpu
def test_outer(): def test_outer():
x = tcn.shared_constructor(my_rand(8,), 'x') x = tcn.shared_constructor(my_rand(8,), 'x')
y = tcn.shared_constructor(my_rand(6,), 'y') y = tcn.shared_constructor(my_rand(6,), 'y')
...@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv, ...@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv = gpu_gemv_inplace gemv = gpu_gemv_inplace
gemv_inplace = gpu_gemv_inplace gemv_inplace = gpu_gemv_inplace
class TestGpuGemvNoTransfer(TestCase, BaseGemv,
unittest_tools.TestOptimizationMixin):
mode = mode_with_gpu
dtype = 'float32'
# Mimic shared constructors registry
@staticmethod
def shared(val):
try:
return tcn.shared_constructor(val)
except TypeError:
return theano.shared(val)
# In this test, inputs are not always transfered to GPU
gemv = gpu_gemv_no_inplace
gemv_inplace = gpu_gemv_inplace
class TestVectorMatrixDot(TestCase): class TestVectorMatrixDot(TestCase):
### Tolerance factor used in this tests ### Tolerance factor used in this tests
...@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase): ...@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase):
assert sum([node.op is gpu_gemv_inplace for node in assert sum([node.op is gpu_gemv_inplace for node in
gpu_f2.maker.env.toposort() ]) == 1 gpu_f2.maker.env.toposort() ]) == 1
# Check double-strided m
m.set_value(
m.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol)
def test_dot_mv(self): def test_dot_mv(self):
''' Test matrix dot vector ''' ''' Test matrix dot vector '''
v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32')) v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32'))
...@@ -365,6 +424,26 @@ class TestGpuGer(TestGer): ...@@ -365,6 +424,26 @@ class TestGpuGer(TestGer):
self.ger = gpu_ger_inplace self.ger = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGerNoTransfer(TestGer):
@staticmethod
def shared(val):
try:
return tcn.shared_constructor(val)
except TypeError:
return theano.shared(val)
def setUp(self):
self.mode = mode_with_gpu
dtype = self.dtype = 'float32' # optimization isn't dtype-dependent
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
self.a = tensor.tensor(dtype=dtype, broadcastable=())
self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
# data on the gpu make the op always inplace
self.ger = gpu_ger_inplace
self.ger_destructive = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin): class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
def setUp(self): def setUp(self):
......
...@@ -496,7 +496,9 @@ class GemmRelated(Op): ...@@ -496,7 +496,9 @@ class GemmRelated(Op):
if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size) if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size)
|| ((Sx[0] != type_size) && (Sx[1] != type_size))) || ((Sx[0] != type_size) && (Sx[1] != type_size)))
{ {
PyArrayObject * _x_copy = PyArray_GETCONTIGUOUS(%(_x)s); PyArrayObject * _x_copy = (PyArrayObject *) PyArray_Copy(%(_x)s);
if (!_x_copy)
%(fail)s
Py_XDECREF(%(_x)s); Py_XDECREF(%(_x)s);
%(_x)s = _x_copy; %(_x)s = _x_copy;
Sx = %(_x)s->strides; Sx = %(_x)s->strides;
...@@ -505,7 +507,9 @@ class GemmRelated(Op): ...@@ -505,7 +507,9 @@ class GemmRelated(Op):
if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size) if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size)
|| ((Sy[0] != type_size) && (Sy[1] != type_size))) || ((Sy[0] != type_size) && (Sy[1] != type_size)))
{ {
PyArrayObject * _y_copy = PyArray_GETCONTIGUOUS(%(_y)s); PyArrayObject * _y_copy = (PyArrayObject *) PyArray_Copy(%(_y)s);
if (!_y_copy)
%(fail)s
Py_XDECREF(%(_y)s); Py_XDECREF(%(_y)s);
%(_y)s = _y_copy; %(_y)s = _y_copy;
Sy = %(_y)s->strides; Sy = %(_y)s->strides;
...@@ -514,7 +518,9 @@ class GemmRelated(Op): ...@@ -514,7 +518,9 @@ class GemmRelated(Op):
if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size) if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size)
|| ((Sz[0] != type_size) && (Sz[1] != type_size))) || ((Sz[0] != type_size) && (Sz[1] != type_size)))
{ {
PyArrayObject * _z_copy = PyArray_GETCONTIGUOUS(%(_zout)s); PyArrayObject * _z_copy = (PyArrayObject *) PyArray_Copy(%(_zout)s);
if (!_z_copy)
%(fail)s
Py_XDECREF(%(_zout)s); Py_XDECREF(%(_zout)s);
%(_zout)s = _z_copy; %(_zout)s = _z_copy;
Sz = %(_zout)s->strides; Sz = %(_zout)s->strides;
...@@ -649,7 +655,7 @@ class GemmRelated(Op): ...@@ -649,7 +655,7 @@ class GemmRelated(Op):
self.end_switch_typenum), '') self.end_switch_typenum), '')
def build_gemm_version(self): def build_gemm_version(self):
return (10,) return (12,)
class Gemm(GemmRelated): class Gemm(GemmRelated):
"""In-place version of matrix-matrix multiplication (with accumulation): """In-place version of matrix-matrix multiplication (with accumulation):
......
...@@ -58,9 +58,10 @@ def ger_c_code(A, a, x, y, Z, destructive, fail): ...@@ -58,9 +58,10 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
// copy A if !self.destructive or A is fully strided // copy A if !self.destructive or A is fully strided
if (!%(destructive)s if (!%(destructive)s
|| (%(A)s->strides[0] < 0)
|| (%(A)s->strides[1] < 0)
|| ((%(A)s->strides[0] != elemsize) || ((%(A)s->strides[0] != elemsize)
&& && (%(A)s->strides[1] != elemsize)))
(%(A)s->strides[1] != elemsize)))
{ {
npy_intp dims[2]; npy_intp dims[2];
dims[0] = %(A)s->dimensions[0]; dims[0] = %(A)s->dimensions[0];
...@@ -68,7 +69,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail): ...@@ -68,7 +69,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
if ((NULL == %(Z)s) if ((NULL == %(Z)s)
|| (%(Z)s->dimensions[0] != %(A)s->dimensions[0]) || (%(Z)s->dimensions[0] != %(A)s->dimensions[0])
|| (%(Z)s->dimensions[1] != %(A)s->dimensions[1])) || (%(Z)s->dimensions[1] != %(A)s->dimensions[1])
|| (%(Z)s->strides[0] < 0)
|| (%(Z)s->strides[1] < 0)
|| ((%(Z)s->strides[0] != elemsize)
&& (%(Z)s->strides[1] != elemsize)))
{ {
if (%(Z)s) Py_XDECREF(%(Z)s); if (%(Z)s) Py_XDECREF(%(Z)s);
%(Z)s = (PyArrayObject*)PyArray_SimpleNew(2, dims, PyArray_TYPE(%(A)s)); %(Z)s = (PyArrayObject*)PyArray_SimpleNew(2, dims, PyArray_TYPE(%(A)s));
...@@ -77,7 +82,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail): ...@@ -77,7 +82,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
%(fail)s %(fail)s
} }
} }
assert (%(Z)s != %(A)s); if (%(Z)s == %(A)s)
{
PyErr_SetString(PyExc_AssertionError, "%(Z)s != %(A)s");
%(fail)s
}
if (%(Z)s->descr->type_num == PyArray_FLOAT) if (%(Z)s->descr->type_num == PyArray_FLOAT)
{ {
float * zoutdata = (float*)%(Z)s->data; float * zoutdata = (float*)%(Z)s->data;
...@@ -142,8 +151,16 @@ def ger_c_code(A, a, x, y, Z, destructive, fail): ...@@ -142,8 +151,16 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
int Sz0 = (Nz0 > 1) ? (%(Z)s->strides[0] / elemsize) : (Nz1 + 1); int Sz0 = (Nz0 > 1) ? (%(Z)s->strides[0] / elemsize) : (Nz1 + 1);
int Sz1 = (Nz1 > 1) ? (%(Z)s->strides[1] / elemsize) : (Nz0 + 1); int Sz1 = (Nz1 > 1) ? (%(Z)s->strides[1] / elemsize) : (Nz0 + 1);
if (1) dtype_%(x)s* x_data = (dtype_%(x)s*) %(x)s->data;
{ dtype_%(y)s* y_data = (dtype_%(y)s*) %(y)s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sx < 0)
x_data += (Nz0 - 1) * Sx;
if (Sy < 0)
y_data += (Nz1 - 1) * Sy;
if (%(Z)s->strides[0] == elemsize) if (%(Z)s->strides[0] == elemsize)
{ {
if (%(Z)s->descr->type_num == PyArray_FLOAT) if (%(Z)s->descr->type_num == PyArray_FLOAT)
...@@ -151,19 +168,22 @@ def ger_c_code(A, a, x, y, Z, destructive, fail): ...@@ -151,19 +168,22 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "A\\n"); //fprintf(stderr, "A\\n");
float alpha = ((dtype_%(a)s*)%(a)s->data)[0]; float alpha = ((dtype_%(a)s*)%(a)s->data)[0];
sger_(&Nz0, &Nz1, &alpha, sger_(&Nz0, &Nz1, &alpha,
(float*)(%(x)s->data), &Sx, (float*)x_data, &Sx,
(float*)(%(y)s->data), &Sy, (float*)y_data, &Sy,
(float*)(%(Z)s->data), &Sz1); (float*)(%(Z)s->data), &Sz1);
} }
else if (%(Z)s->descr->type_num == PyArray_DOUBLE) else if (%(Z)s->descr->type_num == PyArray_DOUBLE)
{ {
double alpha = ((dtype_%(a)s*)%(a)s->data)[0]; double alpha = ((dtype_%(a)s*)%(a)s->data)[0];
dger_(&Nz0, &Nz1, &alpha, dger_(&Nz0, &Nz1, &alpha,
(double*)(%(x)s->data), &Sx, (double*)x_data, &Sx,
(double*)(%(y)s->data), &Sy, (double*)y_data, &Sy,
(double*)(%(Z)s->data), &Sz1); (double*)(%(Z)s->data), &Sz1);
} }
else { assert(0); } else {
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)s
}
} }
else if (%(Z)s->strides[1] == elemsize) else if (%(Z)s->strides[1] == elemsize)
{ {
...@@ -174,21 +194,30 @@ def ger_c_code(A, a, x, y, Z, destructive, fail): ...@@ -174,21 +194,30 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "alpha=%%f\\n", alpha); //fprintf(stderr, "alpha=%%f\\n", alpha);
//fprintf(stderr, "sx sy %%i %%i\\n", Sx, Sy); //fprintf(stderr, "sx sy %%i %%i\\n", Sx, Sy);
sger_(&Nz1, &Nz0, &alpha, sger_(&Nz1, &Nz0, &alpha,
(float*)(%(y)s->data), &Sy, (float*)y_data, &Sy,
(float*)(%(x)s->data), &Sx, (float*)x_data, &Sx,
(float*)(%(Z)s->data), &Sz0); (float*)(%(Z)s->data), &Sz0);
} }
else if (%(Z)s->descr->type_num == PyArray_DOUBLE) else if (%(Z)s->descr->type_num == PyArray_DOUBLE)
{ {
double alpha = ((dtype_%(a)s*)%(a)s->data)[0]; double alpha = ((dtype_%(a)s*)%(a)s->data)[0];
dger_(&Nz1, &Nz0, &alpha, dger_(&Nz1, &Nz0, &alpha,
(double*)(%(y)s->data), &Sy, (double*)y_data, &Sy,
(double*)(%(x)s->data), &Sx, (double*)x_data, &Sx,
(double*)(%(Z)s->data), &Sz0); (double*)(%(Z)s->data), &Sz0);
} }
else { assert(0); } else
{
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)s
} }
else { assert(0); } }
else
{
PyErr_SetString(PyExc_AssertionError,
"A is a double-strided matrix, and should have been copied "
"into a memory-contiguous one.");
%(fail)s
} }
} }
...@@ -204,7 +233,7 @@ class CGer(BaseBLAS, Ger): ...@@ -204,7 +233,7 @@ class CGer(BaseBLAS, Ger):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (8,)
@local_optimizer([ger, ger_destructive]) @local_optimizer([ger, ger_destructive])
...@@ -282,7 +311,11 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail): ...@@ -282,7 +311,11 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
%(fail)s %(fail)s
} }
} }
assert (%(zz)s != %(aa)s); if (%(zz)s == %(aa)s)
{
PyErr_SetString(PyExc_AssertionError, "%(zz)s != %(aa)s");
%(fail)s
}
if (dbeta != 0) if (dbeta != 0)
{ {
if (%(zz)s->descr->type_num == PyArray_FLOAT) if (%(zz)s->descr->type_num == PyArray_FLOAT)
...@@ -330,13 +363,52 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail): ...@@ -330,13 +363,52 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
char NOTRANS = 'N'; char NOTRANS = 'N';
int Nx0 = %(xx)s->dimensions[0]; int Nx0 = %(xx)s->dimensions[0];
int Nx1 = %(xx)s->dimensions[1]; int Nx1 = %(xx)s->dimensions[1];
int Sx0 = %(xx)s->strides[0] / elemsize; /* This formula is needed in the case where xx is actually a row or
int Sx1 = %(xx)s->strides[1] / elemsize; * column matrix, because BLAS sometimes insists that the strides:
* - are not smaller than the number of elements in the array
* - are not 0.
*/
int Sx0 = (Nx0 > 1) ? (%(xx)s->strides[0] / elemsize) : (Nx1 + 1);
int Sx1 = (Nx1 > 1) ? (%(xx)s->strides[1] / elemsize) : (Nx0 + 1);
int Sz = %(zz)s->strides[0] / elemsize; int Sz = %(zz)s->strides[0] / elemsize;
int Sy = %(yy)s->strides[0] / elemsize; int Sy = %(yy)s->strides[0] / elemsize;
dtype_%(yy)s* yy_data = (dtype_%(yy)s*) %(yy)s->data;
dtype_%(zz)s* zz_data = (dtype_%(zz)s*) %(zz)s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sy < 0)
yy_data += (Nx1 - 1) * Sy;
if (Sz < 0)
zz_data += (Nx0 - 1) * Sz;
if (Nx0 * Nx1) if (Nx0 * Nx1)
{ {
// If xx is neither C- nor F-contiguous, we make a copy.
// TODO:
// - if one stride is equal to "- elemsize", we can still call
// gemv on reversed matrix and vectors
// - if the copy is too long, maybe call vector/vector dot on
// each row instead
if ((%(xx)s->strides[0] < 0)
|| (%(xx)s->strides[1] < 0)
|| ((%(xx)s->strides[0] != elemsize)
&& (%(xx)s->strides[1] != elemsize)))
{
npy_intp dims[2];
dims[0] = Nx0;
dims[1] = Nx1;
PyArrayObject * xx_copy = (PyArrayObject *) PyArray_Copy(%(xx)s);
if (!xx_copy)
%(fail)s
Py_XDECREF(%(xx)s);
%(xx)s = xx_copy;
Sx0 = (Nx0 > 1) ? (%(xx)s->strides[0] / elemsize) : (Nx1 + 1);
Sx1 = (Nx1 > 1) ? (%(xx)s->strides[1] / elemsize) : (Nx0 + 1);
}
if (%(xx)s->strides[0] == elemsize) if (%(xx)s->strides[0] == elemsize)
{ {
if (%(xx)s->descr->type_num == PyArray_FLOAT) if (%(xx)s->descr->type_num == PyArray_FLOAT)
...@@ -346,9 +418,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail): ...@@ -346,9 +418,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&NOTRANS, &Nx0, &Nx1, sgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha, &alpha,
(float*)(%(xx)s->data), &Sx1, (float*)(%(xx)s->data), &Sx1,
(float*)(%(yy)s->data), &Sy, (float*)yy_data, &Sy,
&fbeta, &fbeta,
(float*)(%(zz)s->data), &Sz); (float*)zz_data, &Sz);
} }
else if (%(xx)s->descr->type_num == PyArray_DOUBLE) else if (%(xx)s->descr->type_num == PyArray_DOUBLE)
{ {
...@@ -356,13 +428,14 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail): ...@@ -356,13 +428,14 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&NOTRANS, &Nx0, &Nx1, dgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha, &alpha,
(double*)(%(xx)s->data), &Sx1, (double*)(%(xx)s->data), &Sx1,
(double*)(%(yy)s->data), &Sy, (double*)yy_data, &Sy,
&dbeta, &dbeta,
(double*)(%(zz)s->data), &Sz); (double*)zz_data, &Sz);
} }
else else
{ {
assert(0); PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)s
} }
} }
else if (%(xx)s->strides[1] == elemsize) else if (%(xx)s->strides[1] == elemsize)
...@@ -376,9 +449,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail): ...@@ -376,9 +449,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&TRANS, &Nx1, &Nx0, sgemv_(&TRANS, &Nx1, &Nx0,
&alpha, &alpha,
(float*)(%(xx)s->data), &Sx0, (float*)(%(xx)s->data), &Sx0,
(float*)(%(yy)s->data), &Sy, (float*)yy_data, &Sy,
&fbeta, &fbeta,
(float*)(%(zz)s->data), &Sz); (float*)zz_data, &Sz);
} }
else if (%(xx)s->descr->type_num == PyArray_DOUBLE) else if (%(xx)s->descr->type_num == PyArray_DOUBLE)
{ {
...@@ -386,20 +459,22 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail): ...@@ -386,20 +459,22 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&TRANS, &Nx1, &Nx0, dgemv_(&TRANS, &Nx1, &Nx0,
&alpha, &alpha,
(double*)(%(xx)s->data), &Sx0, (double*)(%(xx)s->data), &Sx0,
(double*)(%(yy)s->data), &Sy, (double*)yy_data, &Sy,
&dbeta, &dbeta,
(double*)(%(zz)s->data), &Sz); (double*)zz_data, &Sz);
} }
else else
{ {
assert(0); PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)s
} }
} }
else else
{ {
// if xx is strided in both directions, then just do the gemv with a PyErr_SetString(PyExc_AssertionError,
// pair of for loops. "xx is a double-strided matrix, and should have been copied "
assert (0); "into a memory-contiguous one.");
%(fail)s
} }
} }
else if (dbeta != 1.0) else if (dbeta != 1.0)
...@@ -429,7 +504,7 @@ class CGemv(BaseBLAS, Gemv): ...@@ -429,7 +504,7 @@ class CGemv(BaseBLAS, Gemv):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (9,)
@local_optimizer([gemv_inplace, gemv_no_inplace]) @local_optimizer([gemv_inplace, gemv_no_inplace])
......
...@@ -44,7 +44,12 @@ class ScipyGer(Ger): ...@@ -44,7 +44,12 @@ class ScipyGer(Ger):
# N.B. some versions of scipy (e.g. mine) don't actually work # N.B. some versions of scipy (e.g. mine) don't actually work
# in-place on a, even when I tell it to. # in-place on a, even when I tell it to.
A = cA[0] A = cA[0]
if A.flags['C_CONTIGUOUS']: if A.size == 0:
# We don't have to do anything, A is empty.
# We need this special case because Numpy considers it
# C-contiguous, wich is confusing.
pass
elif A.flags['C_CONTIGUOUS']:
A = local_ger(calpha[0], cy[0], cx[0], a=A.T, A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
overwrite_a=int(self.destructive)).T overwrite_a=int(self.destructive)).T
else: else:
......
#from nose.plugins.skip import SkipTest #from nose.plugins.skip import SkipTest
#import traceback #import traceback
import sys import itertools, sys
import theano.tensor as T import theano.tensor as T
from theano import tensor
#from theano.gof import Env #from theano.gof import Env
from theano.printing import pp from theano.printing import pp
...@@ -878,6 +879,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -878,6 +879,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(v.get_value(), m.get_value())) assert numpy.allclose(f(), numpy.dot(v.get_value(), m.get_value()))
# Assert it works when m has no contiguous dimension
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(f(), numpy.dot(v.get_value(), m.get_value()))
def test_dot_mv(self): def test_dot_mv(self):
...@@ -894,6 +900,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -894,6 +900,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(m.get_value(), v.get_value())) assert numpy.allclose(f(), numpy.dot(m.get_value(), v.get_value()))
# Assert it works when m has no contiguous dimension
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(f(), numpy.dot(m.get_value(), v.get_value()))
@staticmethod @staticmethod
def t_gemv1(m_shp): def t_gemv1(m_shp):
...@@ -915,19 +926,30 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -915,19 +926,30 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert topo[0].op.inplace==False assert topo[0].op.inplace==False
#test the inplace version #test the inplace version
f = theano.function([], [], updates={v2:v2+theano.dot(m,v1)} g = theano.function([], [], updates={v2:v2+theano.dot(m,v1)}
, mode = mode_blas_opt) , mode = mode_blas_opt)
# Assert they produce the same output # Assert they produce the same output
f() g()
assert numpy.allclose(v2.get_value(), assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig) numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = f.maker.env.toposort() topo = g.maker.env.toposort()
assert len(topo)==1 assert len(topo)==1
assert isinstance(topo[0].op, Gemv) assert isinstance(topo[0].op, Gemv)
if config.mode != 'FAST_COMPILE': if config.mode != 'FAST_COMPILE':
assert topo[0].op.inplace==True assert topo[0].op.inplace==True
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self): def test_gemv1(self):
self.t_gemv1((3,2)) self.t_gemv1((3,2))
self.t_gemv1((0,2)) self.t_gemv1((0,2))
...@@ -952,18 +974,29 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -952,18 +974,29 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert topo[-1].op.inplace==False assert topo[-1].op.inplace==False
#test the inplace version #test the inplace version
f = theano.function([], [], updates={v2:v2+theano.dot(v1,m)} g = theano.function([], [], updates={v2:v2+theano.dot(v1,m)}
, mode = mode_blas_opt) , mode = mode_blas_opt)
# Assert they produce the same output # Assert they produce the same output
f() g()
assert numpy.allclose(v2.get_value(), assert numpy.allclose(v2.get_value(),
numpy.dot(v1.get_value(), m.get_value()) + v2_orig) numpy.dot(v1.get_value(), m.get_value()) + v2_orig)
topo = f.maker.env.toposort() topo = g.maker.env.toposort()
assert sum(isinstance(node.op, Gemv) for node in topo)==1 assert sum(isinstance(node.op, Gemv) for node in topo)==1
if config.mode != 'FAST_COMPILE': if config.mode != 'FAST_COMPILE':
assert topo[-1].op.inplace==True assert topo[-1].op.inplace==True
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(v1.get_value(), m.get_value()) + v2.get_value())
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(v1.get_value(), m.get_value()) + v2_orig)
def test_gemv_dimensions(self): def test_gemv_dimensions(self):
A = T.matrix('A') A = T.matrix('A')
x, y = T.vectors('x', 'y') x, y = T.vectors('x', 'y')
...@@ -984,6 +1017,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -984,6 +1017,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=config.floatX) ones_6 = numpy.ones(6, dtype=config.floatX)
f(A_val, ones_3, ones_5) f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5) self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6) self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6) self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
...@@ -1017,6 +1051,7 @@ def matrixmultiply(a, b): ...@@ -1017,6 +1051,7 @@ def matrixmultiply(a, b):
class BaseGemv(object): class BaseGemv(object):
mode = mode_blas_opt # can be overridden with self.mode mode = mode_blas_opt # can be overridden with self.mode
shared = staticmethod(theano.shared)
def get_data(self,x_stride=1,y_stride=1): def get_data(self,x_stride=1,y_stride=1):
rng = numpy.random.RandomState(unittest_tools.fetch_seed()) rng = numpy.random.RandomState(unittest_tools.fetch_seed())
...@@ -1031,7 +1066,7 @@ class BaseGemv(object): ...@@ -1031,7 +1066,7 @@ class BaseGemv(object):
return alpha,beta,a,x,y return alpha,beta,a,x,y
def test_simple(self): def test_simple(self):
alpha, beta, a, x, y = [ shared(value) for value in self.get_data() ] alpha, beta, a, x, y = [ self.shared(value) for value in self.get_data() ]
desired_oy = alpha.get_value() * matrixmultiply(a.get_value(),x.get_value()) + beta.get_value() * y.get_value() desired_oy = alpha.get_value() * matrixmultiply(a.get_value(),x.get_value()) + beta.get_value() * y.get_value()
oy = alpha * T.dot(a,x) + beta * y oy = alpha * T.dot(a,x) + beta * y
...@@ -1049,8 +1084,8 @@ class BaseGemv(object): ...@@ -1049,8 +1084,8 @@ class BaseGemv(object):
vs = self.get_data() vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs alpha_v, beta_v, a_v, x_v, y_v = vs
a = shared(a_v) a = self.shared(a_v)
x = shared(x_v) x = self.shared(x_v)
desired_oy = matrixmultiply(a_v, x_v) desired_oy = matrixmultiply(a_v, x_v)
...@@ -1067,7 +1102,7 @@ class BaseGemv(object): ...@@ -1067,7 +1102,7 @@ class BaseGemv(object):
def test_simple_transpose(self): def test_simple_transpose(self):
vs = self.get_data() vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ] alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v
...@@ -1083,7 +1118,7 @@ class BaseGemv(object): ...@@ -1083,7 +1118,7 @@ class BaseGemv(object):
def test_x_stride(self): def test_x_stride(self):
vs = self.get_data(x_stride = 2) vs = self.get_data(x_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ] alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(a_v,x_v[::2])+beta_v*y_v desired_oy = alpha_v * matrixmultiply(a_v,x_v[::2])+beta_v*y_v
...@@ -1099,7 +1134,7 @@ class BaseGemv(object): ...@@ -1099,7 +1134,7 @@ class BaseGemv(object):
def test_x_stride_transpose(self): def test_x_stride_transpose(self):
vs = self.get_data(x_stride = 2) vs = self.get_data(x_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ] alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v[::2])+beta_v*y_v desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v[::2])+beta_v*y_v
...@@ -1115,7 +1150,7 @@ class BaseGemv(object): ...@@ -1115,7 +1150,7 @@ class BaseGemv(object):
def test_y_stride(self): def test_y_stride(self):
vs = self.get_data(y_stride = 2) vs = self.get_data(y_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ] alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(a_v,x_v)+beta_v*y_v[::2] desired_oy = alpha_v * matrixmultiply(a_v,x_v)+beta_v*y_v[::2]
...@@ -1131,7 +1166,7 @@ class BaseGemv(object): ...@@ -1131,7 +1166,7 @@ class BaseGemv(object):
def test_y_stride_transpose(self): def test_y_stride_transpose(self):
vs = self.get_data(y_stride = 2) vs = self.get_data(y_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ] alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v[::2] desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v[::2]
...@@ -1144,6 +1179,46 @@ class BaseGemv(object): ...@@ -1144,6 +1179,46 @@ class BaseGemv(object):
oy_v = oy_func() oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v) assert_array_almost_equal(desired_oy, oy_v)
def test_a_strides(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
a_v = a_v[::-1, ::-1]
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
desired_oy = alpha_v * matrixmultiply(a_v,x_v)+beta_v*y_v
oy = alpha * T.dot(a,x)+beta*y
oy_func = theano.function([], oy, mode=self.mode)
self.assertFunctionContains1(oy_func, self.gemv)
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_a_strides_transpose(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
a_v = a_v[::-1, ::-1]
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v
oy = alpha * T.dot(a.T,x)+beta*y
oy_func = theano.function([], oy, mode=self.mode)
self.assertFunctionContains1(oy_func, self.gemv)
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_upcasting_scalar_nogemv(self): def test_upcasting_scalar_nogemv(self):
# Test that the optimization does not crash when the scale has # Test that the optimization does not crash when the scale has
# an incorrect dtype, and forces upcasting of the result # an incorrect dtype, and forces upcasting of the result
...@@ -1281,6 +1356,7 @@ class TestGer_OpContract(TestCase, unittest_tools.T_OpContractMixin): ...@@ -1281,6 +1356,7 @@ class TestGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
class TestGer(TestCase, unittest_tools.TestOptimizationMixin): class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
shared = staticmethod(theano.shared)
def setUp(self): def setUp(self):
self.mode = theano.compile.get_default_mode().including('fast_run') self.mode = theano.compile.get_default_mode().including('fast_run')
...@@ -1332,6 +1408,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -1332,6 +1408,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype), f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype), numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype)) numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def test_A_plus_scaled_outer(self): def test_A_plus_scaled_outer(self):
f = self.function([self.A, self.x, self.y], f = self.function([self.A, self.x, self.y],
...@@ -1340,6 +1419,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -1340,6 +1419,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype), f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype), numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype)) numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def test_scaled_A_plus_scaled_outer(self): def test_scaled_A_plus_scaled_outer(self):
f = self.function([self.A, self.x, self.y], f = self.function([self.A, self.x, self.y],
...@@ -1352,6 +1434,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -1352,6 +1434,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype), f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype), numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype)) numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def given_dtype(self, dtype, M, N): def given_dtype(self, dtype, M, N):
""" test corner case shape and dtype""" """ test corner case shape and dtype"""
...@@ -1362,6 +1447,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -1362,6 +1447,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(M, N).astype(self.dtype), f(numpy.random.rand(M, N).astype(self.dtype),
numpy.random.rand(M).astype(self.dtype), numpy.random.rand(M).astype(self.dtype),
numpy.random.rand(N).astype(self.dtype)) numpy.random.rand(N).astype(self.dtype))
f(numpy.random.rand(M, N).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(M).astype(self.dtype),
numpy.random.rand(N).astype(self.dtype))
def test_f32_0_0(self): def test_f32_0_0(self):
return self.given_dtype('float32', 0, 0) return self.given_dtype('float32', 0, 0)
...@@ -1394,10 +1482,393 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -1394,10 +1482,393 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
return self.given_dtype('complex128', 1, 9) return self.given_dtype('complex128', 1, 9)
def test_inplace(self): def test_inplace(self):
A = theano.shared(numpy.random.rand(4, 5).astype(self.dtype)) A = self.shared(numpy.random.rand(4, 5).astype(self.dtype))
f = self.function([self.x, self.y], [], f = self.function([self.x, self.y], [],
updates={A: A + T.constant(0.1, dtype=self.dtype) * updates={A: A + T.constant(0.1, dtype=self.dtype) *
T.outer(self.x, self.y)}) T.outer(self.x, self.y)})
self.assertFunctionContains(f, self.ger_destructive) self.assertFunctionContains(f, self.ger_destructive)
f(numpy.random.rand(4).astype(self.dtype), f(numpy.random.rand(4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype)) numpy.random.rand(5).astype(self.dtype))
A.set_value(
A.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(numpy.random.rand(4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype))
class TestBlasStrides(TestCase):
    """Check BLAS-based Ops against numpy for every stride pattern.

    Each ``cmp_*`` helper compiles theano functions for all transposed /
    non-transposed combinations of its operands, then re-runs them after
    resetting the shared inputs to views with every combination of
    positive/negative and unit/non-unit steps, comparing the results with
    the corresponding numpy computation.

    Subclasses override ``mode`` (and possibly ``shared``) to exercise a
    specific BLAS back-end (C, scipy, gpu).
    """
    # dtype used for all generated test data.
    dtype = 'float64'
    # Constructor for shared variables; subclasses may substitute another
    # one (e.g. a GPU shared constructor).
    shared = staticmethod(tensor._shared)
    mode = theano.compile.get_default_mode()
    # Exclude the specialized BLAS back-ends by default; subclasses
    # re-enable the one whose stride handling they test.
    mode = mode.including('fast_run').excluding('gpu', 'c_blas', 'scipy_blas')
    rng = numpy.random.RandomState(seed=unittest_tools.fetch_seed())

    def rand(self, *shape):
        """Return a random ndarray of ``shape`` with this class's dtype."""
        return theano._asarray(self.rng.rand(*shape), dtype=self.dtype)

    def cmp_dot22(self, b_shp, c_shp):
        """Check ``a <- dot(b, c)`` for all stride/transpose patterns."""
        av = numpy.zeros((0, 0), dtype=self.dtype)
        bv = self.rand(*b_shp)
        cv = self.rand(*c_shp)

        a = self.shared(av, 'a')
        b = self.shared(bv, 'b')
        c = self.shared(cv, 'c')
        b_t = self.shared(bv.T, 'b.T')
        c_t = self.shared(cv.T, 'c.T')

        # Keep pristine copies of the internal storage, so each iteration
        # can reset the shared variables to a fresh strided view.
        b_dev = b.get_value(borrow=False, return_internal_type=True)
        c_dev = c.get_value(borrow=False, return_internal_type=True)
        bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
        ct_dev = c_t.get_value(borrow=False, return_internal_type=True)

        f_nn = theano.function([], [], updates={a: tensor.dot(b, c)},
                               mode=self.mode)
        f_nt = theano.function([], [], updates={a: tensor.dot(b, c_t.T)},
                               mode=self.mode)
        f_tn = theano.function([], [], updates={a: tensor.dot(b_t.T, c)},
                               mode=self.mode)
        f_tt = theano.function([], [], updates={a: tensor.dot(b_t.T, c_t.T)},
                               mode=self.mode)

        # Try with all stride patterns, and all transposed patterns
        for step_signs in itertools.product((-1, 1), repeat=4):
            for step in (1, 2):
                b_step1, b_step2, c_step1, c_step2 = (s * step
                                                      for s in step_signs)

                b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
                c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
                b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1],
                              borrow=True)
                c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1],
                              borrow=True)

                # Numpy reference result
                a_n = numpy.dot(bv[::b_step1, ::b_step2],
                                cv[::c_step1, ::c_step2])

                f_nn()
                assert numpy.allclose(a.get_value(), a_n)

                f_nt()
                assert numpy.allclose(a.get_value(), a_n)

                f_tn()
                assert numpy.allclose(a.get_value(), a_n)

                f_tt()
                assert numpy.allclose(a.get_value(), a_n)

    def test_dot22(self):
        # Shapes cover regular, broadcastable (1), and empty (0) dimensions.
        self.cmp_dot22((3, 4), (4, 5))
        self.cmp_dot22((1, 4), (4, 5))
        self.cmp_dot22((3, 4), (4, 1))
        self.cmp_dot22((3, 1), (1, 1))
        self.cmp_dot22((1, 4), (4, 1))
        self.cmp_dot22((3, 1), (1, 5))
        self.cmp_dot22((0, 4), (4, 5))
        self.cmp_dot22((0, 4), (4, 1))
        self.cmp_dot22((0, 1), (1, 5))
        self.cmp_dot22((3, 4), (4, 0))
        self.cmp_dot22((3, 0), (0, 5))
        self.cmp_dot22((0, 4), (4, 0))
        self.cmp_dot22((0, 0), (0, 0))

    def cmp_dot22scalar(self, b_shp, c_shp):
        """Check ``a <- l * dot(b, c)`` for all stride/transpose patterns."""
        av = numpy.zeros((0, 0), dtype=self.dtype)
        bv = self.rand(*b_shp)
        cv = self.rand(*c_shp)
        l = numpy.float32(0.2)

        a = self.shared(av, 'a')
        b = self.shared(bv, 'b')
        c = self.shared(cv, 'c')
        b_t = self.shared(bv.T, 'b.T')
        c_t = self.shared(cv.T, 'c.T')

        b_dev = b.get_value(borrow=False, return_internal_type=True)
        c_dev = c.get_value(borrow=False, return_internal_type=True)
        bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
        ct_dev = c_t.get_value(borrow=False, return_internal_type=True)

        f_nn = theano.function([], [], updates={a: l * tensor.dot(b, c)},
                               mode=self.mode)
        f_nt = theano.function([], [], updates={a: l * tensor.dot(b, c_t.T)},
                               mode=self.mode)
        f_tn = theano.function([], [], updates={a: l * tensor.dot(b_t.T, c)},
                               mode=self.mode)
        f_tt = theano.function([], [],
                               updates={a: l * tensor.dot(b_t.T, c_t.T)},
                               mode=self.mode)

        # Try with all stride patterns, and all transposed patterns
        for step_signs in itertools.product((-1, 1), repeat=4):
            for step in (1, 2):
                b_step1, b_step2, c_step1, c_step2 = (s * step
                                                      for s in step_signs)

                b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
                c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
                b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1],
                              borrow=True)
                c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1],
                              borrow=True)

                # Numpy reference result
                a_n = l * numpy.dot(bv[::b_step1, ::b_step2],
                                    cv[::c_step1, ::c_step2])

                f_nn()
                assert numpy.allclose(a.get_value(), a_n)

                f_nt()
                assert numpy.allclose(a.get_value(), a_n)

                f_tn()
                assert numpy.allclose(a.get_value(), a_n)

                f_tt()
                assert numpy.allclose(a.get_value(), a_n)

    def test_dot22scalar(self):
        # Shapes cover regular, broadcastable (1), and empty (0) dimensions.
        self.cmp_dot22scalar((3, 4), (4, 5))
        self.cmp_dot22scalar((1, 4), (4, 5))
        self.cmp_dot22scalar((3, 4), (4, 1))
        self.cmp_dot22scalar((3, 1), (1, 1))
        self.cmp_dot22scalar((1, 4), (4, 1))
        self.cmp_dot22scalar((3, 1), (1, 5))
        self.cmp_dot22scalar((0, 4), (4, 5))
        self.cmp_dot22scalar((0, 4), (4, 1))
        self.cmp_dot22scalar((0, 1), (1, 5))
        self.cmp_dot22scalar((3, 4), (4, 0))
        self.cmp_dot22scalar((3, 0), (0, 5))
        self.cmp_dot22scalar((0, 4), (4, 0))
        self.cmp_dot22scalar((0, 0), (0, 0))

    def cmp_gemm(self, a_shp, b_shp, c_shp):
        """Check ``a <- l * a + dot(b, c)`` for all stride/transpose patterns."""
        av = self.rand(*a_shp)
        bv = self.rand(*b_shp)
        cv = self.rand(*c_shp)
        l = numpy.float32(0.2)

        a = self.shared(av, 'a')
        b = self.shared(bv, 'b')
        c = self.shared(cv, 'c')
        a_t = self.shared(av.T, 'a.T')
        b_t = self.shared(bv.T, 'b.T')
        c_t = self.shared(cv.T, 'c.T')

        a_dev = a.get_value(borrow=False, return_internal_type=True)
        b_dev = b.get_value(borrow=False, return_internal_type=True)
        c_dev = c.get_value(borrow=False, return_internal_type=True)
        bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
        ct_dev = c_t.get_value(borrow=False, return_internal_type=True)

        f_nnn = theano.function([], [],
                                updates={a: (l * a + tensor.dot(b, c))},
                                mode=self.mode)
        f_nnt = theano.function([], [],
                                updates={a: (l * a + tensor.dot(b, c_t.T))},
                                mode=self.mode)
        f_ntn = theano.function([], [],
                                updates={a: (l * a + tensor.dot(b_t.T, c))},
                                mode=self.mode)
        f_ntt = theano.function([], [],
                                updates={a: (l * a + tensor.dot(b_t.T, c_t.T))},
                                mode=self.mode)
        f_tnn = theano.function([], [],
                                updates={a_t: (l * a_t + tensor.dot(b, c).T)},
                                mode=self.mode)
        f_tnt = theano.function([], [],
                                updates={a_t: (l * a_t + tensor.dot(b, c_t.T).T)},
                                mode=self.mode)
        f_ttn = theano.function([], [],
                                updates={a_t: (l * a_t + tensor.dot(b_t.T, c).T)},
                                mode=self.mode)
        f_ttt = theano.function([], [],
                                updates={a_t: (l * a_t + tensor.dot(b_t.T, c_t.T).T)},
                                mode=self.mode)

        # Try with all stride patterns, and all transposed patterns
        for step_signs in itertools.product((-1, 1), repeat=6):
            for step in (1, 2):
                a_step1, a_step2, b_step1, b_step2, c_step1, c_step2 = \
                        (s * step for s in step_signs)

                b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
                c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
                b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1],
                              borrow=True)
                c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1],
                              borrow=True)

                # Numpy reference results
                a_n = (l * av[::a_step1, ::a_step2]
                       + numpy.dot(bv[::b_step1, ::b_step2],
                                   cv[::c_step1, ::c_step2]))
                at_n = (l * av[::a_step1, ::a_step2].T
                        + numpy.dot(bv[::b_step1, ::b_step2],
                                    cv[::c_step1, ::c_step2]).T)

                # a's value is updated, so we need to reinitialize it
                # before each function call.
                a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
                f_nnn()
                assert numpy.allclose(a.get_value(), a_n)

                a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
                f_nnt()
                assert numpy.allclose(a.get_value(), a_n)

                a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
                f_ntn()
                assert numpy.allclose(a.get_value(), a_n)

                a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
                f_ntt()
                assert numpy.allclose(a.get_value(), a_n)

                a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
                              borrow=True)
                f_tnn()
                assert numpy.allclose(a_t.get_value(), at_n)

                a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
                              borrow=True)
                f_tnt()
                assert numpy.allclose(a_t.get_value(), at_n)

                a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
                              borrow=True)
                f_ttn()
                assert numpy.allclose(a_t.get_value(), at_n)

                a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
                              borrow=True)
                f_ttt()
                assert numpy.allclose(a_t.get_value(), at_n)

    def test_gemm(self):
        # Shapes cover regular, broadcastable (1), and empty (0) dimensions.
        self.cmp_gemm((3, 5), (3, 4), (4, 5))
        self.cmp_gemm((1, 5), (1, 4), (4, 5))
        self.cmp_gemm((3, 1), (3, 4), (4, 1))
        self.cmp_gemm((3, 1), (3, 1), (1, 1))
        self.cmp_gemm((1, 1), (1, 4), (4, 1))
        self.cmp_gemm((3, 5), (3, 1), (1, 5))
        self.cmp_gemm((0, 5), (0, 4), (4, 5))
        self.cmp_gemm((0, 1), (0, 4), (4, 1))
        self.cmp_gemm((0, 5), (0, 1), (1, 5))
        self.cmp_gemm((3, 0), (3, 4), (4, 0))
        self.cmp_gemm((3, 5), (3, 0), (0, 5))
        self.cmp_gemm((0, 0), (0, 4), (4, 0))
        self.cmp_gemm((0, 0), (0, 0), (0, 0))

    def cmp_gemv(self, a_shp, b_shp, c_shp):
        """Check ``a <- a + l * dot(b, c)`` (gemv) for all stride patterns."""
        av = self.rand(a_shp)
        bv = self.rand(*b_shp)
        cv = self.rand(c_shp)
        l = numpy.float32(0.2)

        a = self.shared(av, 'a')
        b = self.shared(bv, 'b')
        c = self.shared(cv, 'c')
        b_t = self.shared(bv.T, 'b.T')

        a_dev = a.get_value(borrow=False, return_internal_type=True)
        b_dev = b.get_value(borrow=False, return_internal_type=True)
        c_dev = c.get_value(borrow=False, return_internal_type=True)

        f_n = theano.function([], [], updates={a: (a + l * tensor.dot(b, c))},
                              mode=self.mode)
        f_t = theano.function([], [],
                              updates={a: (a + l * tensor.dot(b_t.T, c))},
                              mode=self.mode)

        # Try with all stride patterns, and all transposed patterns
        for step_signs in itertools.product((1, -1), repeat=4):
            for step in (1, 2):
                a_step, b_step1, b_step2, c_step = (s * step
                                                    for s in step_signs)

                a.set_value(a_dev.copy()[::a_step], borrow=True)
                b.set_value(b_dev.copy()[::b_step1, ::b_step2],
                            borrow=True)
                b_t.set_value(transpose(b_dev.copy())[::b_step2, ::b_step1],
                              borrow=True)
                c.set_value(c_dev.copy()[::c_step], borrow=True)

                # Numpy reference result
                a_n = (av[::a_step]
                       + l * numpy.dot(bv[::b_step1, ::b_step2],
                                       cv[::c_step]))

                f_n()
                assert numpy.allclose(a.get_value(), a_n), (a.get_value(), a_n)

                # a's value is updated, so reinitialize before the next call.
                a.set_value(a_dev.copy()[::a_step], borrow=True)
                f_t()
                assert numpy.allclose(a.get_value(), a_n), (a.get_value(), a_n)

    def test_gemv(self):
        # Shapes cover regular, broadcastable (1), and empty (0) dimensions.
        self.cmp_gemv(3, (3, 5), 5)
        self.cmp_gemv(1, (1, 5), 5)
        self.cmp_gemv(3, (3, 1), 1)
        self.cmp_gemv(0, (0, 5), 5)
        self.cmp_gemv(3, (3, 0), 0)
        self.cmp_gemv(0, (0, 1), 1)
        self.cmp_gemv(1, (1, 0), 0)
        self.cmp_gemv(0, (0, 0), 0)

    def cmp_ger(self, a_shp, b_shp, c_shp):
        """Check ``a <- a + l * outer(b, c)`` (ger) for all stride patterns."""
        av = self.rand(*a_shp)
        bv = self.rand(b_shp)
        cv = self.rand(c_shp)
        l = numpy.float32(0.2)

        a = self.shared(av, 'a')
        b = self.shared(bv, 'b')
        c = self.shared(cv, 'c')
        a_t = self.shared(av.T, 'a.T')

        a_dev = a.get_value(borrow=False, return_internal_type=True)
        b_dev = b.get_value(borrow=False, return_internal_type=True)
        c_dev = c.get_value(borrow=False, return_internal_type=True)

        f_n = theano.function([], [],
                              updates={a: (a + l * tensor.outer(b, c))},
                              mode=self.mode)
        f_t = theano.function([], [],
                              updates={a_t: (a_t + l * tensor.outer(b, c).T)},
                              mode=self.mode)

        # Try with all stride patterns, and all transposed patterns
        for step_signs in itertools.product((1, -1), repeat=4):
            for step in (1, 2):
                a_step1, a_step2, b_step, c_step = (s * step
                                                    for s in step_signs)

                a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
                a_t.set_value(transpose(a_dev.copy())[::a_step1, ::a_step2],
                              borrow=True)
                b.set_value(b_dev.copy()[::b_step], borrow=True)
                c.set_value(c_dev.copy()[::c_step], borrow=True)

                f_n()
                n_n = (av[::a_step1, ::a_step2]
                       + l * numpy.outer(bv[::b_step], cv[::c_step]))
                assert numpy.allclose(a.get_value(), n_n), (a.get_value(), n_n)

                f_t()
                n_t = (av.T[::a_step1, ::a_step2]
                       + l * numpy.outer(bv[::b_step], cv[::c_step]).T)
                assert numpy.allclose(a_t.get_value(), n_t),\
                        (a_t.get_value(), n_t)

    def test_ger_strides(self):
        # Shapes cover regular, broadcastable (1), and empty (0) dimensions.
        self.cmp_ger((3, 5), 3, 5)
        self.cmp_ger((1, 5), 1, 5)
        self.cmp_ger((3, 1), 3, 1)
        self.cmp_ger((0, 5), 0, 5)
        self.cmp_ger((3, 0), 3, 0)
        self.cmp_ger((0, 1), 0, 1)
        self.cmp_ger((1, 0), 1, 0)
        self.cmp_ger((0, 0), 0, 0)
import sys import sys
import numpy import numpy
from unittest import TestCase
import theano import theano
import theano.tensor as tensor import theano.tensor as tensor
...@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv ...@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv
from theano.tests import unittest_tools from theano.tests import unittest_tools
from theano.tests.unittest_tools import TestOptimizationMixin from theano.tests.unittest_tools import TestOptimizationMixin
from test_blas import TestCase from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides
from test_blas import BaseGemv
mode_blas_opt = theano.compile.get_default_mode().including( mode_blas_opt = theano.compile.get_default_mode().including(
'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas') 'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
...@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin): ...@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin):
) )
def run_f(self, f): def run_f(self, f):
return f(self.Aval, self.xval, self.yval) f(self.Aval, self.xval, self.yval)
f(self.Aval[::-1, ::-1], self.xval, self.yval)
def b(self, bval): def b(self, bval):
return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype)) return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
...@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert numpy.allclose(f(self.xval, self.Aval), assert numpy.allclose(f(self.xval, self.Aval),
numpy.dot(self.xval, self.Aval)) numpy.dot(self.xval, self.Aval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.xval, self.Aval[::-1, ::-1]),
numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self): def test_optimizations_mv(self):
''' Test matrix dot vector ''' ''' Test matrix dot vector '''
f = theano.function([self.A, self.y], f = theano.function([self.A, self.y],
...@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(f(self.Aval, self.yval), assert numpy.allclose(f(self.Aval, self.yval),
numpy.dot(self.Aval, self.yval)) numpy.dot(self.Aval, self.yval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
numpy.dot(self.Aval[::-1, ::-1], self.yval))
def t_gemv1(self, m_shp): def t_gemv1(self, m_shp):
''' test vector2 + dot(matrix, vector1) ''' ''' test vector2 + dot(matrix, vector1) '''
...@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert topo == [CGemv(inplace=False)], topo assert topo == [CGemv(inplace=False)], topo
#test the inplace version #test the inplace version
f = theano.function([], [], g = theano.function([], [],
updates={v2:v2+theano.dot(m,v1)}, updates={v2:v2+theano.dot(m,v1)},
mode=self.mode) mode=self.mode)
# Assert they produce the same output # Assert they produce the same output
f() g()
assert numpy.allclose(v2.get_value(), assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig) numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = [n.op for n in f.maker.env.toposort()] topo = [n.op for n in g.maker.env.toposort()]
assert topo == [CGemv(inplace=True)] assert topo == [CGemv(inplace=True)]
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self): def test_gemv1(self):
self.t_gemv1((3,2)) self.t_gemv1((3,2))
self.t_gemv1((0,2)) self.t_gemv1((0,2))
...@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=dtype) ones_6 = numpy.ones(6, dtype=dtype)
f(A_val, ones_3, ones_5) f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5) self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6) self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6) self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
...@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin): ...@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
dtype = 'float64' dtype = 'float64'
gemv = CGemv(inplace=False) gemv = CGemv(inplace=False)
gemv_inplace = CGemv(inplace=True) gemv_inplace = CGemv(inplace=True)
class TestBlasStridesC(TestBlasStrides):
    """Re-run the stride tests with the C BLAS optimizations enabled."""
    # mode_blas_opt includes 'c_blas', so the C implementations are tested.
    mode = mode_blas_opt
...@@ -4,7 +4,7 @@ import theano ...@@ -4,7 +4,7 @@ import theano
import theano.tensor as tensor import theano.tensor as tensor
from theano.tensor.blas_scipy import ScipyGer from theano.tensor.blas_scipy import ScipyGer
from test_blas import TestCase, gemm_no_inplace from test_blas import TestCase, gemm_no_inplace, TestBlasStrides
from theano.tests.unittest_tools import TestOptimizationMixin from theano.tests.unittest_tools import TestOptimizationMixin
class TestScipyGer(TestCase, TestOptimizationMixin): class TestScipyGer(TestCase, TestOptimizationMixin):
...@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin): ...@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
def run_f(self, f): def run_f(self, f):
f(self.Aval, self.xval, self.yval) f(self.Aval, self.xval, self.yval)
f(self.Aval[::-1, ::-1], self.xval[::-1], self.yval[::-1])
def b(self, bval): def b(self, bval):
return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype)) return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
...@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin): ...@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
0.2 * self.A + 0.1 * tensor.outer(self.x, self.y)) 0.2 * self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, gemm_no_inplace) self.assertFunctionContains(f, gemm_no_inplace)
self.run_f(f) #DebugMode tests correctness self.run_f(f) #DebugMode tests correctness
class TestBlasStridesScipy(TestBlasStrides):
    """Re-run the stride tests with the scipy BLAS implementations enabled."""
    mode = theano.compile.get_default_mode()
    # Exclude gpu and c_blas so the scipy-based Ops are the ones selected.
    mode = mode.including('fast_run').excluding('gpu', 'c_blas')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论