提交 cf80a8ef authored 作者: Yann N. Dauphin's avatar Yann N. Dauphin

added sparse dot

上级 11ee6cee
......@@ -18,6 +18,8 @@ from theano import scalar
from theano import config
from theano.gof.python25 import all, any
sparse_formats = ['csc', 'csr']
#TODO: move this decorator to the compile submodule
def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register((kwargs and kwargs.pop('name')) or lopt.__name__, lopt, 'fast_run', *tags)
......@@ -1449,3 +1451,385 @@ class StructuredDotGradCSR(gof.Op):
"""% dict(locals(), **sub)
sdg_csr = StructuredDotGradCSR()
class Dot(gof.op.Op):
"""
Operation for efficiently calculating the dot product when
one or all operands is sparse. Supported format are CSC and CSR.
The output of the operation is dense.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __ne__(self, other):
return not (self == other)
def make_node(self, x, y):
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype)
if not _is_sparse_variable(x) and not _is_sparse_variable(y):
raise TypeError(x)
return gof.Apply(self, [x, y], [tensor.tensor(dtype=dtype_out, broadcastable=(False, False))])
def perform(self, node, (x, y), (out, )):
x_is_sparse = _is_sparse(x)
y_is_sparse = _is_sparse(y)
if not x_is_sparse and not y_is_sparse:
raise TypeError(x)
rval = x * y
if x_is_sparse and y_is_sparse:
rval = rval.todense()
out[0] = rval
def grad(self, (x, y), (gz,)):
assert _is_sparse_variable(x) or _is_sparse_variable(y)
rval = [
tensor.dot(gz, y.T) if _is_dense_variable(y) else dot(gz, y.T),
tensor.dot(x.T, gz) if _is_dense_variable(x) else dot(x.T, gz)
]
return rval
_dot = Dot()
def dot(x, y):
"""
Operation for efficiently calculating the dot product when
one or all operands is sparse. Supported format are CSC and CSR.
The output of the operation is dense.
"""
if hasattr(x, 'getnnz'): x = as_sparse_variable(x)
if hasattr(y, 'getnnz'): y = as_sparse_variable(y)
x_is_sparse_variable = _is_sparse_variable(x)
y_is_sparse_variable = _is_sparse_variable(y)
if not x_is_sparse_variable and not y_is_sparse_variable:
raise TypeError()
return _dot(x, y)
class Usmm(gof.op.Op):
"""
Performs the expression is alpha * x y + z
x or y are sparse matrix(the other can be sparse or dense)
z is a dense matrix
alpha is a scalar
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __ne__(self, other):
return not (self == other)
def __str__(self):
return 'Usmm{no_inplace}'
def make_node(self, alpha, x, y, z):
if not _is_sparse_variable(x) and not _is_sparse_variable(y):
# If x and y are tensor, we don't want to use this class
# We should use Dot22 and Gemm in that case.
raise TypeError(x)
dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype, y.type.dtype, z.type.dtype)
alpha = tensor.as_tensor_variable(alpha)
z = tensor.as_tensor_variable(z)
assert z.ndim == 2
assert alpha.type.broadcastable == (True,)* alpha.ndim
if not _is_sparse_variable(x):
x = tensor.as_tensor_variable(x)
assert x.ndim == 2
if not _is_sparse_variable(y):
y = tensor.as_tensor_variable(y)
assert y.ndim == 2
return gof.Apply(self, [alpha, x, y, z], [tensor.tensor(dtype=dtype_out, broadcastable=(False, False))])
def perform(self, node, (alpha, x, y, z), (out, )):
x_is_sparse = _is_sparse(x)
y_is_sparse = _is_sparse(y)
if not x_is_sparse and not y_is_sparse:
raise TypeError(x)
rval = x * y
if isinstance(rval, scipy.sparse.spmatrix):
rval = rval.toarray()
if rval.dtype == alpha.dtype:
rval *= alpha # Faster because operation is inplace
else:
rval = rval * alpha
if rval.dtype == z.dtype:
rval += z # Faster because operation is inplace
else:
rval = rval + z
out[0] = rval
usmm = Usmm()
class UsmmCscDense(gof.Op):
"""
Performs the expression is alpha * x y + z
This is an optimized operation for the case when x is in CSC format.
x are sparse matrix
y, z is a dense matrix
alpha is a scalar
"""
def __init__(self, inplace):
self.inplace = inplace
if inplace:
self.destroy_map={ 0 : [6] }
def __str__(self):
if self.inplace:
return 'UsmmCscDense{inplace}'
else:
return 'UsmmCscDense{no_inplace}'
def __eq__(self, other):
return (type(self) == type(other)) and self.inplace == other.inplace
def __hash__(self):
return hash(type(self))
def make_node(self, alpha, x_val, x_ind, x_ptr, x_nrows, y, z):
x_val = tensor.as_tensor_variable(x_val)
x_ind = tensor.as_tensor_variable(x_ind)
x_ptr = tensor.as_tensor_variable(x_ptr)
y = tensor.as_tensor_variable(y)
z = tensor.as_tensor_variable(z)
assert x_ind.dtype == 'int32'
assert x_ptr.dtype == 'int32'
assert x_nrows.dtype == 'int32'
assert alpha.ndim == 2 and alpha.type.broadcastable == (True, True)
assert x_val.ndim == 1
assert y.ndim == 2
assert z.ndim == 2
dtype_out = scalar.upcast(alpha.type.dtype, x_val.type.dtype, y.type.dtype, z.type.dtype)
alpha = tensor.as_tensor_variable(alpha)
if self.inplace:
assert z.type.dtype == dtype_out
# axpy work only with the same dtype, so we should upcast the input
if dtype_out != alpha.type.dtype:
alpha = tensor.cast(alpha, dtype_out)
if dtype_out != x_val.type.dtype:
x_val = tensor.cast(x_val, dtype_out)
if dtype_out != y.type.dtype:
raise NotImplementedError("We need sparse cast to be implemented!")
if dtype_out != z.type.dtype:
z = tensor.cast(z, dtype_out)
r = gof.Apply(self, [alpha, x_val, x_ind, x_ptr, x_nrows, y, z],
[tensor.tensor(dtype_out, (False, y.type.broadcastable[1]))])
return r
#def perform(self, node, (alpha, x_val, x_ind, x_ptr, x_nrows, y, z), (out,)):
# raise NotImplemented()
def c_support_code(self):
return blas.blas_header_text()
def c_libraries(self):
return blas.ldflags()
def c_compile_args(self):
return blas.ldflags(libs=False, flags=True)
def c_lib_dirs(self):
return blas.ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
return blas.ldflags(libs=False, include_dir=True)
def c_code(self, node, name, (alpha, x_val, x_ind, x_ptr, x_nrows, y, z), (zn,), sub):
if node.inputs[1].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for x_val')
if node.inputs[5].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for y')
if node.inputs[6].type.dtype != node.outputs[0].type.dtype:
raise NotImplementedError('z and output must have same type')
if node.inputs[1].type.dtype == "float32":
conv_type = "float"
axpy = "saxpy_"
else:
conv_type = "double"
axpy = "daxpy_"
typenum_alpha = node.inputs[0].type.dtype_specs()[-1] # retrieve dtype number
typenum_x_val = node.inputs[1].type.dtype_specs()[-1] # retrieve dtype number
typenum_y = node.inputs[5].type.dtype_specs()[-1] # retrieve dtype number
typenum_z = node.inputs[6].type.dtype_specs()[-1] # retrieve dtype number
typenum_zn = node.outputs[0].type.dtype_specs()[-1] # retrieve dtype number
inplace = int(self.inplace)
rval = """
if (%(x_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_val) != 1"); %(fail)s;}
if (%(x_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ind) != 1"); %(fail)s;}
if (%(x_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ptr) != 1"); %(fail)s;}
if (%(x_nrows)s->nd != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_nrows) != 0"); %(fail)s;}
if (%(y)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
if (%(x_val)s->descr->type_num != %(typenum_x_val)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for x_val"); %(fail)s;}
if (%(y)s->descr->type_num != %(typenum_y)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for y"); %(fail)s;}
if (%(z)s->descr->type_num != %(typenum_z)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for z"); %(fail)s;}
if (%(alpha)s->descr->type_num != %(typenum_alpha)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for alpha"); %(fail)s;}
if (%(x_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "x_ind dtype not INT32"); %(fail)s;}
if (%(x_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "x_ptr dtype not INT32"); %(fail)s;}
if (%(x_nrows)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "x_nrows dtype not INT32"); %(fail)s;}
if (%(x_val)s->dimensions[0] != %(x_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "x_val and x_ind have different lengths"); %(fail)s;}
if (%(x_ptr)s->dimensions[0] != %(y)s->dimensions[0]+1)
{PyErr_SetString(PyExc_NotImplementedError, "x's number of columns doesn't match y's rows"); %(fail)s;}
if (%(z)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0] || %(z)s->dimensions[1] != %(y)s->dimensions[1])
{PyErr_SetString(PyExc_NotImplementedError, "The dimension of z and the output must match"); %(fail)s;}
if (PyArray_SIZE(%(alpha)s) != 1)
{PyErr_SetString(PyExc_NotImplementedError, "The number of element in alpha must be 1"); %(fail)s;}
if (%(alpha)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of alpha must be 2"); %(fail)s;}
if (%(x_val)s->nd != 1)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of x_val must be 1"); %(fail)s;}
if (%(y)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of y must be 2"); %(fail)s;}
if (%(z)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of z must be 2"); %(fail)s;}
if (%(inplace)s)
{
if (%(typenum_zn)s != %(typenum_z)s) {
PyErr_SetString(PyExc_NotImplementedError, "When inplace the output dtype must be the same as the input"); %(fail)s;}
Py_XDECREF(%(zn)s);
%(zn)s = %(z)s;
Py_INCREF(%(zn)s);
}
else if (!%(zn)s
|| (%(zn)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0])
|| (%(zn)s->dimensions[1] != %(y)s->dimensions[1])
)
{
{Py_XDECREF(%(zn)s);}
npy_intp dims[] = {0,0};
dims[0] = ((npy_int32 *)%(x_nrows)s->data)[0];
dims[1] = %(y)s->dimensions[1];
%(zn)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_zn)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(zn)s->dimensions[0];
npy_intp N = %(zn)s->dimensions[1];
npy_intp K = %(y)s->dimensions[0];
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
dtype_%(zn)s* __restrict__ Dzn = (dtype_%(zn)s*)%(zn)s->data;
const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
const dtype_%(alpha)s alpha = ((dtype_%(alpha)s*)%(alpha)s->data)[0];
npy_intp Sz = %(z)s->strides[1] / %(z)s->descr->elsize;
npy_intp Szn = %(zn)s->strides[1] / %(zn)s->descr->elsize;
npy_intp Sval = %(x_val)s->strides[0] / %(x_val)s->descr->elsize;
npy_intp Sind = %(x_ind)s->strides[0] / %(x_ind)s->descr->elsize;
npy_intp Sptr = %(x_ptr)s->strides[0] / %(x_ptr)s->descr->elsize;
npy_intp Sy = %(y)s->strides[1] / %(y)s->descr->elsize;
if (!(%(inplace)s))
{
memcpy(Dzn, Dz, M*N*sizeof(dtype_%(zn)s));
}
for (npy_int32 k = 0; k < K; ++k)
{
for (npy_int32 m_idx = Dptr[k * Sptr]; m_idx < Dptr[(k+1)*Sptr]; ++m_idx)
{
const npy_int32 m = Dind[m_idx * Sind]; // row index of non-null value for column K
const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
const dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
const dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
%(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
}
}
}
"""% dict(locals(), **sub)
return rval
usmm_csc_dense = UsmmCscDense(inplace=False)
usmm_csc_dense_inplace = UsmmCscDense(inplace=True)
local_usmm = gof.opt.PatternSub((tensor.sub, 'z', (tensor.mul, {'pattern' : 'alpha', 'constraint' : lambda expr: numpy.all(expr.type.broadcastable) },
(_dot, 'x', 'y'))),
(usmm, (tensor.neg, 'alpha'), 'x', 'y', 'z'))
register_specialize(local_usmm, name="local_usmm")
@gof.local_optimizer([usmm])
def local_usmm_csx(node):
if node.op == usmm:
alpha, x, y, z = node.inputs
x_is_sparse_variable = _is_sparse_variable(x)
y_is_sparse_variable = _is_sparse_variable(y)
if x_is_sparse_variable and not y_is_sparse_variable:
if x.type.format == 'csc':
x_val, x_ind, x_ptr, x_shape = csm_properties(x)
x_nsparse = x_shape[0]
dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype, y.type.dtype, z.type.dtype)
# Sparse cast is not implemented.
if y.type.dtype != dtype_out:
return False
return [usmm_csc_dense(alpha, x_val, x_ind, x_ptr, x_nsparse, y, z)]
return False
register_specialize(local_usmm_csx)
@gof.local_optimizer([usmm_csc_dense])
def local_usmm_csc_dense_inplace(node):
if node.op == usmm_csc_dense:
return [usmm_csc_dense_inplace(*node.inputs)]
register_specialize(local_usmm_csc_dense_inplace, 'inplace')
......@@ -26,6 +26,15 @@ from theano import tensor
from theano.tensor.basic import _allclose
def as_sparse_format(data, format):
if format == 'csc':
return scipy.sparse.csc_matrix(data)
elif format == 'csr':
return scipy.sparse.csr_matrix(data)
else:
raise NotImplementedError()
def eval_outputs(outputs):
return compile.function([], outputs)()[0]
......@@ -513,6 +522,140 @@ class test_structureddot(unittest.TestCase):
if not theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
self.assertFalse(theano_time > overhead_rtol*scipy_time + overhead_tol)
class DotTests(unittest.TestCase):
def setUp(self):
x_size = (10, 1000)
y_size = (1000, 10000)
self.x_csr = scipy.sparse.csr_matrix(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
self.x_csc = scipy.sparse.csc_matrix(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
self.y = numpy.asarray(numpy.random.uniform(-1, 1, y_size), dtype=theano.config.floatX)
self.y_csr = scipy.sparse.csr_matrix(numpy.random.binomial(1, 0.5, y_size), dtype=theano.config.floatX)
self.y_csc = scipy.sparse.csc_matrix(numpy.random.binomial(1, 0.5, y_size), dtype=theano.config.floatX)
def test_csr_dense(self):
x = theano.sparse.csr_matrix('x')
y = theano.tensor.matrix('y')
f_a = theano.function([x, y], theano.sparse.dot(x, y))
f_b = lambda x, y: x * y
assert abs(f_a(self.x_csr, self.y) - f_b(self.x_csr, self.y)).max() < 10**-4
def test_csc_dense(self):
x = theano.sparse.csc_matrix('x')
y = theano.tensor.matrix('y')
f_a = theano.function([x, y], theano.sparse.dot(x, y))
f_b = lambda x, y: x * y
assert abs(f_a(self.x_csc, self.y) - f_b(self.x_csc, self.y)).max() < 10**-4
def test_sparse_sparse(self):
for d1, d2 in [('float32', 'float32'),
('float32', 'float64'),
('float64', 'float32'),
('float64', 'float64'),
]:
for x_f, y_f in [('csc','csc'),
('csc','csr'),
('csr','csc'),
('csr','csr'),
]:
x = theano.sparse.SparseType(format=x_f, dtype=d1)('x')
y = theano.sparse.SparseType(format=x_f, dtype=d2)('x')
f_a = theano.function([x, y], theano.sparse.dot(x, y))
f_b = lambda x, y: x * y
vx = getattr(self,'x_'+x_f).astype(d1)
vy = getattr(self,'y_'+y_f).astype(d2)
assert abs(f_a(vx, vy) - f_b(vx, vy)).max() < 10**-4
class UsmmTests(unittest.TestCase):
def setUp(self):
x_size = (10, 200)
y_size = (200, 2000)
z_size = (x_size[0], y_size[1])
self.x = numpy.asarray(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
self.y = numpy.asarray(numpy.random.uniform(-1, 1, y_size), dtype=theano.config.floatX)
self.z = numpy.asarray(numpy.random.uniform(-1, 1, z_size), dtype=theano.config.floatX)
def test(self):
def mat(format, name, dtype):
if format == 'dense':
return theano.tensor.matrix(name, dtype=dtype)
else:
return theano.sparse.matrix(format, name, dtype=dtype)
for dtype1 in ['float32', 'float64']:
for dtype2 in ['float32', 'float64']:
for dtype3 in ['float32', 'float64']:
for dtype4 in ['float32', 'float64']:
for format1 in ['dense', 'csc','csr']:
for format2 in ['dense', 'csc','csr']:
if format1 == 'dense' and format2 == 'dense':
# Usmm won't be used!
continue
x = mat(format1, 'x', dtype1)
y = mat(format2, 'y', dtype2)
a = theano.tensor.scalar('a', dtype=dtype3)
z = theano.tensor.shared(numpy.asarray(self.z,dtype=dtype4).copy())
f_b = lambda z, a, x, y: z - a * (x * y)
x_data = numpy.asarray(self.x, dtype = dtype1)
if format1 != 'dense':
x_data = as_sparse_format(x_data, format1)
y_data = numpy.asarray(self.y, dtype = dtype2)
if format2 != 'dense':
y_data = as_sparse_format(y_data, format2)
z_data = numpy.asarray(self.z, dtype = dtype3)
f_b_out = f_b(z_data, 1, x_data, y_data)
# Can it work inplace?
inplace = dtype4 == theano.scalar.upcast(dtype1, dtype2, dtype3)
# To make it easier to check the toposort
mode = theano.compile.mode.get_default_mode().excluding('fusion')
if inplace:
f_a = theano.function([a, x, y], [],
updates={ z : z - a * theano.sparse.dot(x, y)},
mode = mode)
f_a(1, x_data, y_data)
assert abs(z.get_value(borrow=True) - f_b_out).max() < 10**-4
else:
f_a = theano.function([a, x, y], z - a * theano.sparse.dot(x, y),
mode = mode)
f_a_out = f_a(1, x_data, y_data)
assert abs(f_a_out - f_b_out).max() < 10**-4
topo = f_a.maker.env.toposort()
if (y.type.dtype == theano.scalar.upcast(dtype1, dtype2, dtype3, dtype4)
and format1=='csc' and format2=='dense'):
assert sum([isinstance(node.op, tensor.Elemwise) and isinstance(node.op.scalar_op, theano.scalar.basic.Cast) for node in topo])==len(topo)-5
topo = [node for node in topo if not(isinstance(node.op, tensor.Elemwise) and isinstance(node.op.scalar_op, theano.scalar.basic.Cast))]
assert len(topo)==5, topo
# Usmm is tested at the same time in debugmode
# Check if the optimization local_usmm and local_usmm_csx is applied
assert isinstance(topo[0].op, theano.sparse.basic.CSMProperties)
assert isinstance(topo[1].op, theano.tensor.DimShuffle)
assert isinstance(topo[2].op, theano.tensor.Subtensor)
assert topo[3].op == theano.tensor.neg
assert isinstance(topo[4].op, theano.sparse.UsmmCscDense)
if inplace:
assert topo[4].op.inplace
else:
assert len(topo)==3, topo
assert isinstance(topo[0].op, theano.tensor.DimShuffle)
assert topo[1].op == theano.tensor.neg
assert isinstance(topo[2].op, theano.sparse.Usmm)
def test_shape_i():
sparse_dtype = 'float32'
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论