提交 1f6b0c73 authored 作者: nouiz's avatar nouiz

Merge pull request #437 from lamblin/blas_double_strides

Make blas functions work with all stride patterns
...@@ -192,7 +192,7 @@ class GpuGemm(GpuOp): ...@@ -192,7 +192,7 @@ class GpuGemm(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in #z_out = alpha * dot(x,y) + beta * z_in
...@@ -200,6 +200,7 @@ class GpuGemm(GpuOp): ...@@ -200,6 +200,7 @@ class GpuGemm(GpuOp):
#not inplace version, we copy z_in to z_out. #not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
z_out, = outputs z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
...@@ -215,39 +216,50 @@ class GpuGemm(GpuOp): ...@@ -215,39 +216,50 @@ class GpuGemm(GpuOp):
: (REAL)(((double*)%(b)s->data)[0]); : (REAL)(((double*)%(b)s->data)[0]);
#undef REAL #undef REAL
""" if (%(inplace)s
if self.inplace: && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
print >> sio, """ && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s); Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
""" }
else: else if (%(z_out)s
print >> sio, """ && (%(z_out)s->nd == 2)
if (!%(z_out)s && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
|| (%(z_out)s->nd != 2) == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]) == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{ {
Py_XDECREF(%(z_out)s); %(fail)s;
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
} }
else }
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{ {
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) %(fail)s;
{
%(fail)s;
}
} }
""" }
print >> sio, """
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s)) if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{ {
%(fail)s; %(fail)s;
...@@ -294,7 +306,7 @@ class GpuGemv(GpuOp): ...@@ -294,7 +306,7 @@ class GpuGemv(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in #z_out = alpha * dot(x,y) + beta * z_in
...@@ -302,44 +314,46 @@ class GpuGemv(GpuOp): ...@@ -302,44 +314,46 @@ class GpuGemv(GpuOp):
#not inplace version, we copy z_in to z_out. #not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
z_out, = outputs z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
print >> sio, """ print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0]; float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0]; float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
"""
if self.inplace: if (%(inplace)s
print >> sio, """ && ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
{
// Work inplace on the input
Py_XDECREF(%(z_out)s); Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
""" }
else: else if (%(z_out)s
print >> sio, """ && ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
if (!%(z_out)s || ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
|| (%(z_out)s->nd != 1) && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0]) {
) // Work on the output
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{ {
Py_XDECREF(%(z_out)s); %(fail)s;
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
} }
else }
else
{
// Copy
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{ {
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) %(fail)s;
{
%(fail)s;
}
} }
""" }
print >> sio, """
if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s)) if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
{ {
%(fail)s; %(fail)s;
...@@ -385,7 +399,7 @@ class GpuGer(GpuOp): ...@@ -385,7 +399,7 @@ class GpuGer(GpuOp):
return Apply(self, [z, a, x, y], [z.type()]) return Apply(self, [z, a, x, y], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in #z_out = alpha * dot(x,y) + beta * z_in
...@@ -393,44 +407,57 @@ class GpuGer(GpuOp): ...@@ -393,44 +407,57 @@ class GpuGer(GpuOp):
#not inplace version, we copy z_in to z_out. #not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs z_in, a, x, y = inputs
z_out, = outputs z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
print >> sio, """ print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0]; float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
"""
if self.inplace: if (%(inplace)s
print >> sio, """ && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s); Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
""" }
else: else if (%(z_out)s
print >> sio, """ && (%(z_out)s->nd == 2)
if (!%(z_out)s && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
|| (%(z_out)s->nd != 2) == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]) == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{ {
Py_XDECREF(%(z_out)s); %(fail)s;
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
} }
else }
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{ {
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) %(fail)s;
{
%(fail)s;
}
} }
""" }
print >> sio, """
if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s)) if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
{ {
%(fail)s; %(fail)s;
......
...@@ -81,7 +81,7 @@ struct CudaNdarray ...@@ -81,7 +81,7 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc) //device pointers (allocated by cudaMalloc)
int dev_structure_fresh; mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized //dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ... int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
...@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s); ...@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
* *
* This means: recalculate the log2dims and transfer structure to the card * This means: recalculate the log2dims and transfer structure to the card
*/ */
DllExport int cnda_copy_structure_to_device(CudaNdarray * self); DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(CudaNdarray * self); DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(CudaNdarray * self); DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self); DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self); DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
/** /**
...@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i ...@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
return -1; return -1;
} }
assert(size>0); if (size < 0)
{
PyErr_Format(PyExc_AssertionError,
"size (%i) < 0",
size);
return -1;
}
self->devdata = (float*)device_malloc(size*sizeof(real)); self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata) if (size && !self->devdata)
{ {
CudaNdarray_set_nd(self,-1); CudaNdarray_set_nd(self, -1);
self->data_allocated = 0; self->data_allocated = 0;
self->devdata = 0; self->devdata = 0;
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate memory on device");
return -1; return -1;
} }
if (0) if (0)
...@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo); ...@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/** /**
* Return an independent copy of self * Return an independent copy of self
*/ */
DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self); DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);
/** /**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask. * Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
...@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj); ...@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
* *
* self is reallocated to have the correct dimensions if necessary. * self is reallocated to have the correct dimensions if necessary.
*/ */
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false); DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
/** /**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray. * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
...@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self); ...@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A); DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
...@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); ...@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
fill-column:79 fill-column:79
End: End:
*/ */
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 : // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
import itertools
from unittest import TestCase from unittest import TestCase
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
...@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn ...@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
import theano.compile.mode import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestGer from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
...@@ -32,20 +33,31 @@ else: ...@@ -32,20 +33,31 @@ else:
def my_rand(*shape): def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32') return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def transpose(cuda_mat):
# The easiest way to transpose a cuda matrix for now
return tcn.dimshuffle(cuda_mat, [1, 0])
def test_dot22(): def test_dot22():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a') a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix() b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp) bval = my_rand(*b_shp)
f(bval) f(bval)
assert numpy.allclose(numpy.dot(a0, bval), a.get_value()) assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval)
cmp((3,4),(4,5)) cmp((3,4),(4,5))
cmp((0,4),(4,5)) cmp((0,4),(4,5))
cmp((3,4),(4,0)) cmp((3,4),(4,0))
...@@ -90,7 +102,8 @@ def test_dot22scalar(): ...@@ -90,7 +102,8 @@ def test_dot22scalar():
def test_gemm(): def test_gemm():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a') a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix('b') b = tensor.fmatrix('b')
c = tensor.fmatrix('c') c = tensor.fmatrix('c')
...@@ -98,12 +111,19 @@ def test_gemm(): ...@@ -98,12 +111,19 @@ def test_gemm():
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu) f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()]) assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp) bval = my_rand(*b_shp)
cval = my_rand(a_shp[0],b_shp[1]) cval = my_rand(a_shp[0],b_shp[1])
f(bval,cval) f(bval,cval)
assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value()) assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, cval)
cmp((3,4),(4,5)) cmp((3,4),(4,5))
cmp((0,4),(4,5)) cmp((0,4),(4,5))
cmp((3,4),(4,0)) cmp((3,4),(4,0))
...@@ -114,7 +134,8 @@ def test_gemm(): ...@@ -114,7 +134,8 @@ def test_gemm():
def test_gemm_no_inplace(): def test_gemm_no_inplace():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a') a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
cval = my_rand(a_shp[0], b_shp[1]) cval = my_rand(a_shp[0], b_shp[1])
c = tcn.shared_constructor(cval.copy(), 'c') c = tcn.shared_constructor(cval.copy(), 'c')
...@@ -123,7 +144,6 @@ def test_gemm_no_inplace(): ...@@ -123,7 +144,6 @@ def test_gemm_no_inplace():
f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu) f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()]) assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
bval = my_rand(*b_shp) bval = my_rand(*b_shp)
bval2 = my_rand(*b_shp) bval2 = my_rand(*b_shp)
...@@ -132,6 +152,13 @@ def test_gemm_no_inplace(): ...@@ -132,6 +152,13 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value()) assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval) assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, bval2)
cmp((3,4),(4,5)) cmp((3,4),(4,5))
cmp((0,4),(4,5)) cmp((0,4),(4,5))
cmp((3,4),(4,0)) cmp((3,4),(4,0))
...@@ -139,6 +166,13 @@ def test_gemm_no_inplace(): ...@@ -139,6 +166,13 @@ def test_gemm_no_inplace():
cmp((0,4),(4,0)) cmp((0,4),(4,0))
cmp((0,0),(0,0)) cmp((0,0),(0,0))
class TestBlasStridesGpu(TestBlasStrides):
dtype = 'float32'
shared = staticmethod(tcn.shared_constructor)
mode = mode_with_gpu
def test_outer(): def test_outer():
x = tcn.shared_constructor(my_rand(8,), 'x') x = tcn.shared_constructor(my_rand(8,), 'x')
y = tcn.shared_constructor(my_rand(6,), 'y') y = tcn.shared_constructor(my_rand(6,), 'y')
...@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv, ...@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv = gpu_gemv_inplace gemv = gpu_gemv_inplace
gemv_inplace = gpu_gemv_inplace gemv_inplace = gpu_gemv_inplace
class TestGpuGemvNoTransfer(TestCase, BaseGemv,
unittest_tools.TestOptimizationMixin):
mode = mode_with_gpu
dtype = 'float32'
# Mimic shared constructors registry
@staticmethod
def shared(val):
try:
return tcn.shared_constructor(val)
except TypeError:
return theano.shared(val)
# In this test, inputs are not always transferred to GPU
gemv = gpu_gemv_no_inplace
gemv_inplace = gpu_gemv_inplace
class TestVectorMatrixDot(TestCase): class TestVectorMatrixDot(TestCase):
### Tolerance factor used in these tests ### Tolerance factor used in these tests
...@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase): ...@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase):
assert sum([node.op is gpu_gemv_inplace for node in assert sum([node.op is gpu_gemv_inplace for node in
gpu_f2.maker.env.toposort() ]) == 1 gpu_f2.maker.env.toposort() ]) == 1
# Check double-strided m
m.set_value(
m.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol)
def test_dot_mv(self): def test_dot_mv(self):
''' Test matrix dot vector ''' ''' Test matrix dot vector '''
v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32')) v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32'))
...@@ -365,6 +424,26 @@ class TestGpuGer(TestGer): ...@@ -365,6 +424,26 @@ class TestGpuGer(TestGer):
self.ger = gpu_ger_inplace self.ger = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGerNoTransfer(TestGer):
@staticmethod
def shared(val):
try:
return tcn.shared_constructor(val)
except TypeError:
return theano.shared(val)
def setUp(self):
self.mode = mode_with_gpu
dtype = self.dtype = 'float32' # optimization isn't dtype-dependent
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
self.a = tensor.tensor(dtype=dtype, broadcastable=())
self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
# data on the gpu make the op always inplace
self.ger = gpu_ger_inplace
self.ger_destructive = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin): class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
def setUp(self): def setUp(self):
......
...@@ -496,7 +496,9 @@ class GemmRelated(Op): ...@@ -496,7 +496,9 @@ class GemmRelated(Op):
if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size) if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size)
|| ((Sx[0] != type_size) && (Sx[1] != type_size))) || ((Sx[0] != type_size) && (Sx[1] != type_size)))
{ {
PyArrayObject * _x_copy = PyArray_GETCONTIGUOUS(%(_x)s); PyArrayObject * _x_copy = (PyArrayObject *) PyArray_Copy(%(_x)s);
if (!_x_copy)
%(fail)s
Py_XDECREF(%(_x)s); Py_XDECREF(%(_x)s);
%(_x)s = _x_copy; %(_x)s = _x_copy;
Sx = %(_x)s->strides; Sx = %(_x)s->strides;
...@@ -505,7 +507,9 @@ class GemmRelated(Op): ...@@ -505,7 +507,9 @@ class GemmRelated(Op):
if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size) if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size)
|| ((Sy[0] != type_size) && (Sy[1] != type_size))) || ((Sy[0] != type_size) && (Sy[1] != type_size)))
{ {
PyArrayObject * _y_copy = PyArray_GETCONTIGUOUS(%(_y)s); PyArrayObject * _y_copy = (PyArrayObject *) PyArray_Copy(%(_y)s);
if (!_y_copy)
%(fail)s
Py_XDECREF(%(_y)s); Py_XDECREF(%(_y)s);
%(_y)s = _y_copy; %(_y)s = _y_copy;
Sy = %(_y)s->strides; Sy = %(_y)s->strides;
...@@ -514,7 +518,9 @@ class GemmRelated(Op): ...@@ -514,7 +518,9 @@ class GemmRelated(Op):
if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size) if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size)
|| ((Sz[0] != type_size) && (Sz[1] != type_size))) || ((Sz[0] != type_size) && (Sz[1] != type_size)))
{ {
PyArrayObject * _z_copy = PyArray_GETCONTIGUOUS(%(_zout)s); PyArrayObject * _z_copy = (PyArrayObject *) PyArray_Copy(%(_zout)s);
if (!_z_copy)
%(fail)s
Py_XDECREF(%(_zout)s); Py_XDECREF(%(_zout)s);
%(_zout)s = _z_copy; %(_zout)s = _z_copy;
Sz = %(_zout)s->strides; Sz = %(_zout)s->strides;
...@@ -649,7 +655,7 @@ class GemmRelated(Op): ...@@ -649,7 +655,7 @@ class GemmRelated(Op):
self.end_switch_typenum), '') self.end_switch_typenum), '')
def build_gemm_version(self): def build_gemm_version(self):
return (10,) return (12,)
class Gemm(GemmRelated): class Gemm(GemmRelated):
"""In-place version of matrix-matrix multiplication (with accumulation): """In-place version of matrix-matrix multiplication (with accumulation):
......
差异被折叠。
...@@ -44,7 +44,12 @@ class ScipyGer(Ger): ...@@ -44,7 +44,12 @@ class ScipyGer(Ger):
# N.B. some versions of scipy (e.g. mine) don't actually work # N.B. some versions of scipy (e.g. mine) don't actually work
# in-place on a, even when I tell it to. # in-place on a, even when I tell it to.
A = cA[0] A = cA[0]
if A.flags['C_CONTIGUOUS']: if A.size == 0:
# We don't have to do anything, A is empty.
# We need this special case because Numpy considers it
# C-contiguous, which is confusing.
pass
elif A.flags['C_CONTIGUOUS']:
A = local_ger(calpha[0], cy[0], cx[0], a=A.T, A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
overwrite_a=int(self.destructive)).T overwrite_a=int(self.destructive)).T
else: else:
......
import sys import sys
import numpy import numpy
from unittest import TestCase
import theano import theano
import theano.tensor as tensor import theano.tensor as tensor
...@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv ...@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv
from theano.tests import unittest_tools from theano.tests import unittest_tools
from theano.tests.unittest_tools import TestOptimizationMixin from theano.tests.unittest_tools import TestOptimizationMixin
from test_blas import TestCase from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides
from test_blas import BaseGemv
mode_blas_opt = theano.compile.get_default_mode().including( mode_blas_opt = theano.compile.get_default_mode().including(
'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas') 'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
...@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin): ...@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin):
) )
def run_f(self, f): def run_f(self, f):
return f(self.Aval, self.xval, self.yval) f(self.Aval, self.xval, self.yval)
f(self.Aval[::-1, ::-1], self.xval, self.yval)
def b(self, bval): def b(self, bval):
return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype)) return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
...@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert numpy.allclose(f(self.xval, self.Aval), assert numpy.allclose(f(self.xval, self.Aval),
numpy.dot(self.xval, self.Aval)) numpy.dot(self.xval, self.Aval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.xval, self.Aval[::-1, ::-1]),
numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self): def test_optimizations_mv(self):
''' Test matrix dot vector ''' ''' Test matrix dot vector '''
f = theano.function([self.A, self.y], f = theano.function([self.A, self.y],
...@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(f(self.Aval, self.yval), assert numpy.allclose(f(self.Aval, self.yval),
numpy.dot(self.Aval, self.yval)) numpy.dot(self.Aval, self.yval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
numpy.dot(self.Aval[::-1, ::-1], self.yval))
def t_gemv1(self, m_shp): def t_gemv1(self, m_shp):
''' test vector2 + dot(matrix, vector1) ''' ''' test vector2 + dot(matrix, vector1) '''
...@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert topo == [CGemv(inplace=False)], topo assert topo == [CGemv(inplace=False)], topo
#test the inplace version #test the inplace version
f = theano.function([], [], g = theano.function([], [],
updates={v2:v2+theano.dot(m,v1)}, updates={v2:v2+theano.dot(m,v1)},
mode=self.mode) mode=self.mode)
# Assert they produce the same output # Assert they produce the same output
f() g()
assert numpy.allclose(v2.get_value(), assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig) numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = [n.op for n in f.maker.env.toposort()] topo = [n.op for n in g.maker.env.toposort()]
assert topo == [CGemv(inplace=True)] assert topo == [CGemv(inplace=True)]
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self): def test_gemv1(self):
self.t_gemv1((3,2)) self.t_gemv1((3,2))
self.t_gemv1((0,2)) self.t_gemv1((0,2))
...@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=dtype) ones_6 = numpy.ones(6, dtype=dtype)
f(A_val, ones_3, ones_5) f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5) self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6) self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6) self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
...@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin): ...@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
dtype = 'float64' dtype = 'float64'
gemv = CGemv(inplace=False) gemv = CGemv(inplace=False)
gemv_inplace = CGemv(inplace=True) gemv_inplace = CGemv(inplace=True)
class TestBlasStridesC(TestBlasStrides):
mode = mode_blas_opt
...@@ -4,7 +4,7 @@ import theano ...@@ -4,7 +4,7 @@ import theano
import theano.tensor as tensor import theano.tensor as tensor
from theano.tensor.blas_scipy import ScipyGer from theano.tensor.blas_scipy import ScipyGer
from test_blas import TestCase, gemm_no_inplace from test_blas import TestCase, gemm_no_inplace, TestBlasStrides
from theano.tests.unittest_tools import TestOptimizationMixin from theano.tests.unittest_tools import TestOptimizationMixin
class TestScipyGer(TestCase, TestOptimizationMixin): class TestScipyGer(TestCase, TestOptimizationMixin):
...@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin): ...@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
def run_f(self, f): def run_f(self, f):
f(self.Aval, self.xval, self.yval) f(self.Aval, self.xval, self.yval)
f(self.Aval[::-1, ::-1], self.xval[::-1], self.yval[::-1])
def b(self, bval): def b(self, bval):
return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype)) return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
...@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin): ...@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
0.2 * self.A + 0.1 * tensor.outer(self.x, self.y)) 0.2 * self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, gemm_no_inplace) self.assertFunctionContains(f, gemm_no_inplace)
self.run_f(f) #DebugMode tests correctness self.run_f(f) #DebugMode tests correctness
class TestBlasStridesScipy(TestBlasStrides):
mode = theano.compile.get_default_mode()
mode = mode.including('fast_run').excluding('gpu', 'c_blas')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论