提交 1f6b0c73 authored 作者: nouiz's avatar nouiz

Merge pull request #437 from lamblin/blas_double_strides

Make blas functions work with all stride patterns
......@@ -192,7 +192,7 @@ class GpuGemm(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return (3,)
return (4,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -200,6 +200,7 @@ class GpuGemm(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
......@@ -215,39 +216,50 @@ class GpuGemm(GpuOp):
: (REAL)(((double*)%(b)s->data)[0]);
#undef REAL
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
}
else if (%(z_out)s
&& (%(z_out)s->nd == 2)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
== CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
== CudaNdarray_HOST_DIMS(%(z_in)s)[1])
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{
%(fail)s;
......@@ -294,7 +306,7 @@ class GpuGemv(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return (1,)
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -302,44 +314,46 @@ class GpuGemv(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
{
// Work inplace on the input
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
)
}
else if (%(z_out)s
&& ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
{
// Work on the output
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
{
%(fail)s;
......@@ -385,7 +399,7 @@ class GpuGer(GpuOp):
return Apply(self, [z, a, x, y], [z.type()])
def c_code_cache_version(self):
return (1,)
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -393,44 +407,57 @@ class GpuGer(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
}
else if (%(z_out)s
&& (%(z_out)s->nd == 2)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
== CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
== CudaNdarray_HOST_DIMS(%(z_in)s)[1])
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
{
%(fail)s;
......
......@@ -81,7 +81,7 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc)
int dev_structure_fresh;
mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
......@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
*
* This means: recalculate the log2dims and transfer structure to the card
*/
DllExport int cnda_copy_structure_to_device(CudaNdarray * self);
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
/**
......@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
return -1;
}
assert(size>0);
if (size < 0)
{
PyErr_Format(PyExc_AssertionError,
"size (%i) < 0",
size);
return -1;
}
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
if (size && !self->devdata)
{
CudaNdarray_set_nd(self,-1);
CudaNdarray_set_nd(self, -1);
self->data_allocated = 0;
self->devdata = 0;
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate memory on device");
return -1;
}
if (0)
......@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/**
* Return an independent copy of self
*/
DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self);
DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);
/**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
......@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
*
* self is reallocated to have the correct dimensions if necessary.
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false);
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
......@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
......@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
import itertools
from unittest import TestCase
from theano.compile.pfunc import pfunc
......@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestGer
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
......@@ -32,20 +33,31 @@ else:
def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def transpose(cuda_mat):
# The easiest way to transpose a cuda matrix for now
return tcn.dimshuffle(cuda_mat, [1, 0])
def test_dot22():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp)
f(bval)
assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -90,7 +102,8 @@ def test_dot22scalar():
def test_gemm():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix('b')
c = tensor.fmatrix('c')
......@@ -98,12 +111,19 @@ def test_gemm():
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp)
cval = my_rand(a_shp[0],b_shp[1])
f(bval,cval)
assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, cval)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -114,7 +134,8 @@ def test_gemm():
def test_gemm_no_inplace():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
cval = my_rand(a_shp[0], b_shp[1])
c = tcn.shared_constructor(cval.copy(), 'c')
......@@ -123,7 +144,6 @@ def test_gemm_no_inplace():
f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
bval = my_rand(*b_shp)
bval2 = my_rand(*b_shp)
......@@ -132,6 +152,13 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, bval2)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -139,6 +166,13 @@ def test_gemm_no_inplace():
cmp((0,4),(4,0))
cmp((0,0),(0,0))
class TestBlasStridesGpu(TestBlasStrides):
dtype = 'float32'
shared = staticmethod(tcn.shared_constructor)
mode = mode_with_gpu
def test_outer():
x = tcn.shared_constructor(my_rand(8,), 'x')
y = tcn.shared_constructor(my_rand(6,), 'y')
......@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv = gpu_gemv_inplace
gemv_inplace = gpu_gemv_inplace
class TestGpuGemvNoTransfer(TestCase, BaseGemv,
unittest_tools.TestOptimizationMixin):
mode = mode_with_gpu
dtype = 'float32'
# Mimic shared constructors registry
@staticmethod
def shared(val):
try:
return tcn.shared_constructor(val)
except TypeError:
return theano.shared(val)
# In this test, inputs are not always transferred to GPU
gemv = gpu_gemv_no_inplace
gemv_inplace = gpu_gemv_inplace
class TestVectorMatrixDot(TestCase):
### Tolerance factor used in these tests
......@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase):
assert sum([node.op is gpu_gemv_inplace for node in
gpu_f2.maker.env.toposort() ]) == 1
# Check double-strided m
m.set_value(
m.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol)
def test_dot_mv(self):
''' Test matrix dot vector '''
v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32'))
......@@ -365,6 +424,26 @@ class TestGpuGer(TestGer):
self.ger = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGerNoTransfer(TestGer):
@staticmethod
def shared(val):
try:
return tcn.shared_constructor(val)
except TypeError:
return theano.shared(val)
def setUp(self):
self.mode = mode_with_gpu
dtype = self.dtype = 'float32' # optimization isn't dtype-dependent
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
self.a = tensor.tensor(dtype=dtype, broadcastable=())
self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
# data on the gpu makes the op always inplace
self.ger = gpu_ger_inplace
self.ger_destructive = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
def setUp(self):
......
......@@ -496,7 +496,9 @@ class GemmRelated(Op):
if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size)
|| ((Sx[0] != type_size) && (Sx[1] != type_size)))
{
PyArrayObject * _x_copy = PyArray_GETCONTIGUOUS(%(_x)s);
PyArrayObject * _x_copy = (PyArrayObject *) PyArray_Copy(%(_x)s);
if (!_x_copy)
%(fail)s
Py_XDECREF(%(_x)s);
%(_x)s = _x_copy;
Sx = %(_x)s->strides;
......@@ -505,7 +507,9 @@ class GemmRelated(Op):
if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size)
|| ((Sy[0] != type_size) && (Sy[1] != type_size)))
{
PyArrayObject * _y_copy = PyArray_GETCONTIGUOUS(%(_y)s);
PyArrayObject * _y_copy = (PyArrayObject *) PyArray_Copy(%(_y)s);
if (!_y_copy)
%(fail)s
Py_XDECREF(%(_y)s);
%(_y)s = _y_copy;
Sy = %(_y)s->strides;
......@@ -514,7 +518,9 @@ class GemmRelated(Op):
if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size)
|| ((Sz[0] != type_size) && (Sz[1] != type_size)))
{
PyArrayObject * _z_copy = PyArray_GETCONTIGUOUS(%(_zout)s);
PyArrayObject * _z_copy = (PyArrayObject *) PyArray_Copy(%(_zout)s);
if (!_z_copy)
%(fail)s
Py_XDECREF(%(_zout)s);
%(_zout)s = _z_copy;
Sz = %(_zout)s->strides;
......@@ -649,7 +655,7 @@ class GemmRelated(Op):
self.end_switch_typenum), '')
def build_gemm_version(self):
return (10,)
return (12,)
class Gemm(GemmRelated):
"""In-place version of matrix-matrix multiplication (with accumulation):
......
差异被折叠。
......@@ -44,7 +44,12 @@ class ScipyGer(Ger):
# N.B. some versions of scipy (e.g. mine) don't actually work
# in-place on a, even when I tell it to.
A = cA[0]
if A.flags['C_CONTIGUOUS']:
if A.size == 0:
# We don't have to do anything, A is empty.
# We need this special case because Numpy considers it
# C-contiguous, which is confusing.
pass
elif A.flags['C_CONTIGUOUS']:
A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
overwrite_a=int(self.destructive)).T
else:
......
import sys
import numpy
from unittest import TestCase
import theano
import theano.tensor as tensor
......@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv
from theano.tests import unittest_tools
from theano.tests.unittest_tools import TestOptimizationMixin
from test_blas import TestCase
from test_blas import BaseGemv
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides
mode_blas_opt = theano.compile.get_default_mode().including(
'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
......@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin):
)
def run_f(self, f):
return f(self.Aval, self.xval, self.yval)
f(self.Aval, self.xval, self.yval)
f(self.Aval[::-1, ::-1], self.xval, self.yval)
def b(self, bval):
return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
......@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert numpy.allclose(f(self.xval, self.Aval),
numpy.dot(self.xval, self.Aval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.xval, self.Aval[::-1, ::-1]),
numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self):
''' Test matrix dot vector '''
f = theano.function([self.A, self.y],
......@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
# Assert they produce the same output
assert numpy.allclose(f(self.Aval, self.yval),
numpy.dot(self.Aval, self.yval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
numpy.dot(self.Aval[::-1, ::-1], self.yval))
def t_gemv1(self, m_shp):
''' test vector2 + dot(matrix, vector1) '''
......@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert topo == [CGemv(inplace=False)], topo
#test the inplace version
f = theano.function([], [],
g = theano.function([], [],
updates={v2:v2+theano.dot(m,v1)},
mode=self.mode)
# Assert they produce the same output
f()
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = [n.op for n in f.maker.env.toposort()]
topo = [n.op for n in g.maker.env.toposort()]
assert topo == [CGemv(inplace=True)]
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self):
self.t_gemv1((3,2))
self.t_gemv1((0,2))
......@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=dtype)
f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
......@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
dtype = 'float64'
gemv = CGemv(inplace=False)
gemv_inplace = CGemv(inplace=True)
class TestBlasStridesC(TestBlasStrides):
mode = mode_blas_opt
......@@ -4,7 +4,7 @@ import theano
import theano.tensor as tensor
from theano.tensor.blas_scipy import ScipyGer
from test_blas import TestCase, gemm_no_inplace
from test_blas import TestCase, gemm_no_inplace, TestBlasStrides
from theano.tests.unittest_tools import TestOptimizationMixin
class TestScipyGer(TestCase, TestOptimizationMixin):
......@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
def run_f(self, f):
f(self.Aval, self.xval, self.yval)
f(self.Aval[::-1, ::-1], self.xval[::-1], self.yval[::-1])
def b(self, bval):
return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
......@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
0.2 * self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, gemm_no_inplace)
self.run_f(f) #DebugMode tests correctness
class TestBlasStridesScipy(TestBlasStrides):
mode = theano.compile.get_default_mode()
mode = mode.including('fast_run').excluding('gpu', 'c_blas')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论