提交 2f9368a2 authored 作者: nouiz's avatar nouiz

Merge pull request #859 from bouchnic/opt

Move optimization to opt.py.
...@@ -853,135 +853,9 @@ class CSMGrad(gof.op.Op): ...@@ -853,135 +853,9 @@ class CSMGrad(gof.op.Op):
return [shapes[1]] return [shapes[1]]
else: else:
return [shapes[0]] return [shapes[0]]
csm_grad = CSMGrad csm_grad = CSMGrad
class CSMGradC(gof.Op):
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, a_val, a_ind, a_ptr, a_dim,
b_val, b_ind, b_ptr, b_dim):
return gof.Apply(self, [a_val, a_ind, a_ptr, a_dim,
b_val, b_ind, b_ptr, b_dim], [b_val.type()])
def c_code(self, node, name, (a_val, a_ind, a_ptr, a_dim,
b_val, b_ind, b_ptr, b_dim), (z,), sub):
# retrieve dtype number
typenum_z = node.outputs[0].type.dtype_specs()[-1]
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a_val')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b_val')
return """
if (%(a_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_val) != 1"); %(fail)s;}
if (%(a_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ind) != 1"); %(fail)s;}
if (%(a_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ptr) != 1"); %(fail)s;}
if (%(b_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_val) != 1"); %(fail)s;}
if (%(b_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_ind) != 1"); %(fail)s;}
if (%(b_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_ptr) != 1"); %(fail)s;}
if (%(a_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
if (%(a_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
if (%(b_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "b_ind dtype not INT32"); %(fail)s;}
if (%(b_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "b_ptr dtype not INT32"); %(fail)s;}
if (%(a_val)s->dimensions[0] != %(a_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_val and a_ind have different lengths"); %(fail)s;}
if (%(b_val)s->dimensions[0] != %(b_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "b_val and b_ind have different lengths"); %(fail)s;}
if (%(a_ptr)s->dimensions[0] != %(b_ptr)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr and b_ptr have different lengths"); %(fail)s;}
if ((!%(z)s) || (%(z)s->dimensions[0] != %(a_val)s->dimensions[0]))
{
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0};
dims[0] = %(a_val)s->dimensions[0];
%(z)s = (PyArrayObject*) PyArray_SimpleNew(1, dims, %(typenum_z)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(a_ptr)s->dimensions[0] - 1;
npy_intp a_dim_0 = ((npy_int32 *)%(a_dim)s->data)[0];
npy_intp a_dim_1 = ((npy_int32 *)%(a_dim)s->data)[1];
npy_intp sp_dim = (M == a_dim_0)?a_dim_1:a_dim_0;
// strides tell you how many bytes to skip to go to next column/row entry
npy_intp Sz = %(z)s->strides[0] / %(z)s->descr->elsize;
npy_intp Sa_val = %(a_val)s->strides[0] / %(a_val)s->descr->elsize;
npy_intp Sa_ind = %(a_ind)s->strides[0] / %(a_ind)s->descr->elsize;
npy_intp Sa_ptr = %(a_ptr)s->strides[0] / %(a_ptr)s->descr->elsize;
npy_intp Sb_val = %(b_val)s->strides[0] / %(b_val)s->descr->elsize;
npy_intp Sb_ind = %(b_ind)s->strides[0] / %(b_ind)s->descr->elsize;
npy_intp Sb_ptr = %(b_ptr)s->strides[0] / %(b_ptr)s->descr->elsize;
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Da_val = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Da_ind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Da_ptr = (npy_int32*)%(a_ptr)s->data;
const dtype_%(b_val)s* __restrict__ Db_val = (dtype_%(b_val)s*)%(b_val)s->data;
const npy_int32 * __restrict__ Db_ind = (npy_int32*)%(b_ind)s->data;
const npy_int32 * __restrict__ Db_ptr = (npy_int32*)%(b_ptr)s->data;
npy_intp nnz = %(a_ind)s->dimensions[0];
dtype_%(b_val)s b_row[sp_dim];
//clear the output array
for (npy_int64 i = 0; i < nnz; ++i)
{
Dz[i*Sz] = 0;
}
memset(b_row, 0, sp_dim*sizeof(dtype_%(b_val)s));
// loop over inner dimension
for (npy_int64 m = 0; m < M; ++m)
{
for (npy_int32 j_ptr = Db_ptr[m * Sb_ptr];
j_ptr < Db_ptr[(m + 1) * Sb_ptr]; j_ptr++) {
b_row[Db_ind[j_ptr * Sb_ind]] += Db_val[j_ptr*Sb_val];
}
for (npy_int32 j_ptr = Da_ptr[m * Sa_ptr];
j_ptr < Da_ptr[(m + 1) * Sa_ptr]; j_ptr++) {
Dz[j_ptr*Sz] = b_row[Da_ind[j_ptr * Sa_ind]];
}
for (npy_int32 j_ptr = Db_ptr[m * Sb_ptr];
j_ptr < Db_ptr[(m + 1) * Sb_ptr]; j_ptr++) {
b_row[Db_ind[j_ptr * Sb_ind]] = 0;
}
}
}
""" % dict(locals(), **sub)
def c_code_cache_version(self):
return (3,)
csm_grad_c = CSMGradC()
class Cast(gof.op.Op): class Cast(gof.op.Op):
"""Cast sparse variable to the desired dtype. """Cast sparse variable to the desired dtype.
...@@ -2020,127 +1894,6 @@ class StructuredAddSV(gof.op.Op): ...@@ -2020,127 +1894,6 @@ class StructuredAddSV(gof.op.Op):
structured_add_s_v = StructuredAddSV() structured_add_s_v = StructuredAddSV()
class StructuredAddSVCSR(gof.Op):
# Structured addition of a sparse matrix and a dense vector.
# The elements of the vector are are only added to the corresponding
# non-zero elements. Therefore, this operation outputs another sparse
# matrix.
# :param a_data: Sparse matrix data.
# :param a_indices: Sparse matrix indices.
# :param a_indptr: Sparse matrix indptr.
# :param b: Tensor type vector.
# :return: A sparse matrix containing the addition of the vector to
# the data of the sparse matrix.
# :note: The a_* are the properties of a sparse matrix in csr
# format.
# :note: This op is used as an optimization for StructuredAddSV.
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
b = tensor.as_tensor_variable(b)
a_data = tensor.as_tensor_variable(a_data)
a_indices = tensor.as_tensor_variable(a_indices)
a_indptr = tensor.as_tensor_variable(a_indptr)
assert a_data.type.ndim == 1
assert a_indices.type.ndim == 1
assert a_indptr.type.ndim == 1
assert b.type.ndim == 1
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self):
return (1,)
def c_code(self, node, name, inputs, outputs, sub):
_data, _indices, _indptr, _b, = inputs
_zout, = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(_b)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
%(fail)s;
}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;
}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;
}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;
}
if( %(_indices)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s)
{
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError,
"somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;
// loop over columns
for (npy_int32 j = 0; j < N; ++j)
{
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
// write resulting gradient to sparse output
zout[i_idx] = data[i_idx] + Db[i * Sb];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
structured_add_s_v_csr = StructuredAddSVCSR()
def add(x, y): def add(x, y):
"""Add two matrices, at least one of which is sparse. """Add two matrices, at least one of which is sparse.
...@@ -2329,232 +2082,6 @@ class MulSD(gof.op.Op): ...@@ -2329,232 +2082,6 @@ class MulSD(gof.op.Op):
mul_s_d = MulSD() mul_s_d = MulSD()
class MulSDCSC(gof.Op):
# Multiplication of sparse matrix by a broadcasted dense vector
# element wise.
# :param a_data: Sparse matrix data.
# :param a_indices: Sparse matrix indices.
# :param a_indptr: Sparse matrix indptr.
# :param b: Tensor type matrix.
# :return: The multiplication of the two matrix element wise.
# :note: `a_data`, `a_indices` and `a_indptr` must be the properties
# of a sparse matrix in csc format.
# :note: The dtype of `a_data`, i.e. the dtype of the sparse matrix,
# cannot be a complex type.
# :note: This op is used as an optimization of mul_s_d.
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
assert b.type.ndim == 2
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self):
return (1,)
#def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplementedError()
def c_code(self, node, name, (_data, _indices, _indptr, _b,),
(_zout, ), sub):
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(_b)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
%(fail)s;}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;}
if( %(_indices)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s)
{
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError,
"somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0];
// loop over columns
for (npy_int32 j = 0; j < N; ++j)
{
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
// extract i-th row of dense matrix
const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * i);
// write resulting gradient to sparse output
zout[i_idx] = data[i_idx] * b_row[j];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
mul_s_d_csc = MulSDCSC()
class MulSDCSR(gof.Op):
# Multiplication of sparse matrix by a broadcasted dense vector
# element wise.
# :param a_data: Sparse matrix data.
# :param a_indices: Sparse matrix indices.
# :param a_indptr: Sparse matrix indptr.
# :param b: Tensor type matrix.
# :return: The multiplication of the two matrix element wise.
# :note: `a_data`, `a_indices` and `a_indptr` must be the properties
# of a sparse matrix in csr format.
# :note: The dtype of `a_data`, i.e. the dtype of the sparse matrix,
# cannot be a complex type.
# :note: This op is used as an optimization of mul_s_d.
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
assert b.type.ndim == 2
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self):
return (1,)
#def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplemented()
def c_code(self, node, name, (_data, _indices, _indptr, _b,),
(_zout, ), sub):
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(_b)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
%(fail)s;}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;}
if( %(_indices)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s)
{
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError,
"somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0];
// loop over columns
for (npy_int32 j = 0; j < N; ++j)
{
// extract i-th row of dense matrix
const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * j);
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
// write resulting gradient to sparse output
zout[i_idx] = data[i_idx] * b_row[i];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
mul_s_d_csr = MulSDCSR()
class MulSV(gof.op.Op): class MulSV(gof.op.Op):
"""Multiplication of sparse matrix by a broadcasted dense vector """Multiplication of sparse matrix by a broadcasted dense vector
element wise. element wise.
...@@ -2604,149 +2131,41 @@ class MulSV(gof.op.Op): ...@@ -2604,149 +2131,41 @@ class MulSV(gof.op.Op):
mul_s_v = MulSV() mul_s_v = MulSV()
class MulSVCSR(gof.Op): def mul(x, y):
# Multiplication of sparse matrix by a broadcasted dense vector """Multiply elementwise two matrices, at least one
# element wise. of which is sparse.
# :param a_data: Sparse matrix data.
# :param a_indices: Sparse matrix indices.
# :param a_indptr: Sparse matrix indptr.
# :param b: Tensor type matrix.
# :return: The multiplication of the two matrix element wise. This method will provide the right op according
to the inputs.
# :note: `a_data`, `a_indices` and `a_indptr` must be the properties :param x: A matrix variable.
# of a sparse matrix in csr format. :param y: A matrix variable.
# :note: The dtype of `a_data`, i.e. the dtype of the sparse matrix,
# cannot be a complex type.
# :note: This op is used as an optimization of MulSV.
def __eq__(self, other): :return: `x` + `y`
return (type(self) == type(other))
def __hash__(self): :note: At least one of `x` and `y` must be a sparse matrix.
return hash(type(self)) :note: The grad is regular, i.e. not structured.
"""
def make_node(self, a_data, a_indices, a_indptr, b): x = as_sparse_or_tensor_variable(x)
assert b.type.ndim == 1 y = as_sparse_or_tensor_variable(y)
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self): x_is_sparse_variable = _is_sparse_variable(x)
return (1,) y_is_sparse_variable = _is_sparse_variable(y)
def c_code(self, node, name, inputs, outputs, sub): assert x_is_sparse_variable or y_is_sparse_variable
_data, _indices, _indptr, _b, = inputs if x_is_sparse_variable and y_is_sparse_variable:
_zout, = outputs return mul_s_s(x, y)
if node.inputs[0].type.dtype in ('complex64', 'complex128'): elif x_is_sparse_variable and not y_is_sparse_variable:
raise NotImplementedError('Complex types are not supported for a') return mul_s_d(x, y)
if node.inputs[3].type.dtype in ('complex64', 'complex128'): elif y_is_sparse_variable and not x_is_sparse_variable:
raise NotImplementedError('Complex types are not supported for b') return mul_s_d(y, x)
else:
raise NotImplementedError()
return """
if (%(_b)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
%(fail)s;
}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;
}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;
}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;
}
if( %(_indices)s->descr->type_num != PyArray_INT32) { class HStack(gof.op.Op):
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;} """Stack sparse matrices horizontally (column wise).
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s
|| %(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]
|| !PyArray_ISCONTIGUOUS(%(_zout)s))
{
Py_XDECREF(%(_zout)s);
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;
// loop over rows
for (npy_int32 j = 0; j < N; ++j)
{
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
zout[i_idx] = data[i_idx] * Db[i * Sb];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
mul_s_v_csr = MulSVCSR()
def mul(x, y):
"""Multiply elementwise two matrices, at least one
of which is sparse.
This method will provide the right op according
to the inputs.
:param x: A matrix variable.
:param y: A matrix variable.
:return: `x` + `y`
:note: At least one of `x` and `y` must be a sparse matrix.
:note: The grad is regular, i.e. not structured.
"""
x = as_sparse_or_tensor_variable(x)
y = as_sparse_or_tensor_variable(y)
x_is_sparse_variable = _is_sparse_variable(x)
y_is_sparse_variable = _is_sparse_variable(y)
assert x_is_sparse_variable or y_is_sparse_variable
if x_is_sparse_variable and y_is_sparse_variable:
return mul_s_s(x, y)
elif x_is_sparse_variable and not y_is_sparse_variable:
return mul_s_d(x, y)
elif y_is_sparse_variable and not x_is_sparse_variable:
return mul_s_d(y, x)
else:
raise NotImplementedError()
class HStack(gof.op.Op):
"""Stack sparse matrices horizontally (column wise).
:param blocks: Sequence of sparse array of compatible shape. :param blocks: Sequence of sparse array of compatible shape.
:param format: String representing the output format. Default :param format: String representing the output format. Default
...@@ -3463,690 +2882,6 @@ def structured_dot(x, y): ...@@ -3463,690 +2882,6 @@ def structured_dot(x, y):
return _structured_dot(y.T, x.T).T return _structured_dot(y.T, x.T).T
class StructuredDotCSC(gof.Op):
# Structured Dot CSC is like dot, except that only the
# gradient wrt non-zero elements of the sparse matrix
# `a` are calculated and propagated.
# The output is presumed to be a dense matrix, and is represented by a
# TensorType instance.
# :param a: A sparse matrix in csc format.
# :param b: A sparse or dense matrix.
# :return: The dot product of `a` and `b`.
# :note: The grad implemented is structured.
# :note: This op is used as an optimization for StructuredDot.
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, a_val, a_ind, a_ptr, a_nrows, b):
dtype_out = scalar.upcast(a_val.type.dtype, b.type.dtype)
r = gof.Apply(self, [a_val, a_ind, a_ptr, a_nrows, b],
[tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])
return r
def perform(self, node, (a_val, a_ind, a_ptr, a_nrows, b), (out,)):
a = scipy.sparse.csc_matrix((a_val, a_ind, a_ptr),
(a_nrows, b.shape[0]),
copy=False)
#out[0] = a.dot(b)
out[0] = theano._asarray(a * b, dtype=node.outputs[0].type.dtype)
assert _is_dense(out[0]) # scipy 0.7 automatically converts to dense
def c_code(self, node, name, (a_val, a_ind, a_ptr, a_nrows, b), (z,), sub):
# C-implementation of the dot product of the sparse matrix A and matrix
# B.
# @param a_val: non-zero values of the sparse matrix
# @param a_ind: column indices of the non-null values (.indices of a
# scipy.csc_matrix)
# @param a_ptr: a_ptr indicates col indices for col. i are in the range
# a_ptr[i]:a_ptr[i+1]
# @param n_rows: number of rows of sparse matrix
# @param b: dense matrix to perform dot product with, as in dot(a, b)
# @param z: return value
# @param sub: TODO, not too sure, something to do with weave probably
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a_val')
if node.inputs[4].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
typenum_z = node.outputs[0].type.dtype_specs()[-1] # retrieve dtype number
typenum_a_val = node.inputs[0].type.dtype_specs()[-1] # retrieve dtype number
typenum_b = node.inputs[4].type.dtype_specs()[-1] # retrieve dtype number
rval = """
if (%(a_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_val) != 1"); %(fail)s;}
if (%(a_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ind) != 1"); %(fail)s;}
if (%(a_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ptr) != 1"); %(fail)s;}
if (%(a_nrows)s->nd != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(nrows) != 0"); %(fail)s;}
if (%(b)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2"); %(fail)s;}
if (%(a_val)s->descr->type_num != %(typenum_a_val)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for a_val"); %(fail)s;}
if (%(b)s->descr->type_num != %(typenum_b)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for b"); %(fail)s;}
if (%(a_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
if (%(a_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
if (%(a_nrows)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_nrows dtype not INT32"); %(fail)s;}
if (%(a_val)s->dimensions[0] != %(a_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_val and a_ind have different lengths"); %(fail)s;}
if (%(a_ptr)s->dimensions[0] != %(b)s->dimensions[0]+1)
{PyErr_SetString(PyExc_NotImplementedError, "a's number of columns doesn't match b's rows"); %(fail)s;}
if ((!%(z)s)
|| (%(z)s->dimensions[0] != ((npy_int32 *)%(a_nrows)s->data)[0])
|| (%(z)s->dimensions[1] != %(b)s->dimensions[1])
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0, 0};
dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
dims[1] = %(b)s->dimensions[1];
%(z)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_z)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(z)s->dimensions[0];
npy_intp N = %(z)s->dimensions[1];
npy_intp K = %(b)s->dimensions[0];
// strides tell you how many bytes to skip to go to next column/row entry
npy_intp Szm = %(z)s->strides[0] / %(z)s->descr->elsize;
npy_intp Szn = %(z)s->strides[1] / %(z)s->descr->elsize;
//npy_intp Sbm = %(b)s->strides[0] / %(b)s->descr->elsize;
npy_intp Sbn = %(b)s->strides[1] / %(b)s->descr->elsize;
npy_intp Sval = %(a_val)s->strides[0] / %(a_val)s->descr->elsize;
npy_intp Sind = %(a_ind)s->strides[0] / %(a_ind)s->descr->elsize;
npy_intp Sptr = %(a_ptr)s->strides[0] / %(a_ptr)s->descr->elsize;
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
//npy_intp nnz = %(a_ind)s->dimensions[0];
//clear the output array
memset(Dz, 0, M*N*sizeof(dtype_%(z)s));
//iterate over the sparse array, making the most of an entry wherever we find it.
//
// Normal matrix matrix multiply: A MxK, B KxN => Z = AB
// for m
// for n
// for k
// z[m, n] += a[m, k] * b[k, n]
// Here instead: Z =
// for k
// for m (sparse)
// for n
// z[m, n] += a[m, k] * b[k, n]
// loop over inner dimension
for (npy_int32 k = 0; k < K; ++k)
{
// get pointer to k-th row of dense matrix
const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(%(b)s->data + %(b)s->strides[0] * k);
// loop over sparse column indices through index pointer array
// (amounts to looping over rows M of sparse matrix)
for (npy_int32 m_idx = Dptr[k * Sptr]; m_idx < Dptr[(k+1) * Sptr]; ++m_idx)
{
npy_int32 m = Dind[m_idx * Sind]; // row index of non-null value for column K
const dtype_%(a_val)s Amk = Dval[m_idx * Sval]; // actual value at that location
// pointer to m-th row of the output matrix Z
dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(%(z)s->data + %(z)s->strides[0] * m);
//RESOLVE: a.shape[0] equals z.shape[0], why is this not an equality constraint?
if (m >= %(z)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "illegal row index in a"); %(fail)s;}
// loop over final dimension (cols of dense matrix) and perform dot product
if ((Szn == 1) && (Sbn == 1)) {
for(npy_int32 n = 0; n < N; ++n)
{
zm[n] += Amk * bk[n];
}
}
else
{
for(npy_int32 n = 0; n < N; ++n)
{
zm[n*Szn] += Amk * bk[n*Sbn];
}
}
}
}
}
""" % dict(locals(), **sub)
return rval
def c_code_cache_version(self):
return (2,)
sd_csc = StructuredDotCSC()
class StructuredDotCSR(gof.Op):
# Structured Dot CSR is like dot, except that only the
# gradient wrt non-zero elements of the sparse matrix
# `a` are calculated and propagated.
# The output is presumed to be a dense matrix, and is represented by a
# TensorType instance.
# :param a: A sparse matrix in csr format.
# :param b: A sparse or dense matrix.
# :return: The dot product of `a` and `b`.
# :note: The grad implemented is structured.
# :note: This op is used as an optimization for StructuredDot.
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, a_val, a_ind, a_ptr, b):
self.dtype_out = scalar.upcast(a_val.type.dtype, b.type.dtype)
r = gof.Apply(self, [a_val, a_ind, a_ptr, b],
[tensor.tensor(self.dtype_out, (False,
b.type.broadcastable[1]))])
return r
def perform(self, node, (a_val, a_ind, a_ptr, b), (out,)):
a = scipy.sparse.csr_matrix((a_val, a_ind, a_ptr),
(len(a_ptr) - 1, b.shape[0]),
copy=True) # use view_map before setting this to False
#out[0] = a.dot(b)
out[0] = a * b
# scipy 0.7 automatically converts to dense, but not .6 sometimes
assert _is_dense(out[0])
def c_code(self, node, name, (a_val, a_ind, a_ptr, b), (z,), sub):
"""
C-implementation of the dot product of the sparse matrix A and matrix
B.
@param a_val: non-zero values of the sparse matrix
@param a_ind: column indices of the non-null values (.indices of a
scipy.csc_matrix)
@param a_ptr: a_ptr indicates col indices for col. i are in the range
a_ptr[i]:a_ptr[i+1]
@param n_cols: number of columns of sparse matrix
@param b: dense matrix to perform dot product with, as in dot(a, b)
@param z: return value
@param sub: TODO, not too sure, something to do with weave probably
"""
# retrieve dtype number
typenum_z = tensor.TensorType(self.dtype_out, []).dtype_specs()[-1]
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a_val')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(a_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_val) != 1"); %(fail)s;}
if (%(a_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ind) != 1"); %(fail)s;}
if (%(a_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ptr) != 1"); %(fail)s;}
if (%(b)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2"); %(fail)s;}
if (%(a_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
if (%(a_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
if (%(a_val)s->dimensions[0] != %(a_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_val and a_ind have different lengths"); %(fail)s;}
if ((!%(z)s)
|| (%(z)s->dimensions[0] != %(a_ptr)s->dimensions[0]-1) //a's rows
|| (%(z)s->dimensions[1] != %(b)s->dimensions[1]) //b's columns
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0, 0};
dims[0] = %(a_ptr)s->dimensions[0]-1;
dims[1] = %(b)s->dimensions[1];
%(z)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_z)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(z)s->dimensions[0];
npy_intp N = %(z)s->dimensions[1];
npy_intp K = %(b)s->dimensions[0];
// strides tell you how many bytes to skip to go to next column/row entry
npy_intp Szm = %(z)s->strides[0] / %(z)s->descr->elsize;
npy_intp Szn = %(z)s->strides[1] / %(z)s->descr->elsize;
npy_intp Sbm = %(b)s->strides[0] / %(b)s->descr->elsize;
npy_intp Sbn = %(b)s->strides[1] / %(b)s->descr->elsize;
npy_intp Sval = %(a_val)s->strides[0] / %(a_val)s->descr->elsize;
npy_intp Sind = %(a_ind)s->strides[0] / %(a_ind)s->descr->elsize;
npy_intp Sptr = %(a_ptr)s->strides[0] / %(a_ptr)s->descr->elsize;
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
//npy_intp nnz = %(a_ind)s->dimensions[0];
//clear the output array
memset(Dz, 0, M*N*sizeof(dtype_%(z)s));
//iterate over the sparse array, making the most of an entry wherever we find it.
// Normal matrix matrix multiply:
// for m
// for n
// for k
// z[m, n] += a[m, k] * b[k, n]
// Here instead:
// for m
// for k (sparse)
// for n
// z[m, n] += a[m, k] * b[k, n]
// loop over inner dimension
for (npy_int64 m = 0; m < M; ++m)
{
// pointer to m-th row of the output matrix Z
dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(%(z)s->data + %(z)s->strides[0] * m);
// loop over sparse rows indices through index pointer array
// (amounts to looping over cols k of sparse matrix)
for (npy_int32 k_idx = Dptr[m * Sptr]; k_idx < Dptr[(m+1) * Sptr]; ++k_idx)
{
npy_int32 k = Dind[k_idx * Sind]; // col index of non-null value for row m
const dtype_%(a_val)s Amk = Dval[k_idx * Sval]; // actual value at that location
// get pointer to k-th row of dense matrix
const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(%(b)s->data + %(b)s->strides[0] * k);
// loop over final dimension (cols of dense matrix) and perform dot product
for(npy_int32 n = 0; n < N; ++n)
{
zm[n*Szn] += Amk * bk[n*Sbn];
}
}
}
}
""" % dict(locals(), **sub)
def c_code_cache_version(self):
return (1,)
sd_csr = StructuredDotCSR()
class SamplingDot(gof.op.Op):
"""Operand for calculating the dot product dot(`x`, `y`.T) = `z` when you
only want to calculate a subset of `z`.
It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
product, `x` and `y` operands of the dot product and `p` is a matrix that
contains 1 when the corresponding element of `z` should be calculated
and 0 when it shouldn't. Note that SamplingDot has a different interface
than `dot` because SamplingDot requires `x` to be a `m`x`k` matrix while
`y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.
.. note::
It will work if the pattern is not binary value, but if the
pattern doesn't have a high sparsity proportion it will be slower
then a more optimized dot followed by a normal elemwise
multiplication.
:param x: Tensor matrix.
:param y: Tensor matrix.
:param p: Sparse matrix in csr format.
:return: A dense matrix containing the dot product of `x` by `y`.T only
where `p` is 1.
:note: The grad implemented is regular, i.e. not structured.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, x, y, p):
x = tensor.as_tensor_variable(x)
y = tensor.as_tensor_variable(y)
p = as_sparse_variable(p)
if not _is_sparse_variable(p):
raise TypeError(p)
#TODO: use it.
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype, p.type.dtype)
return gof.Apply(self, [x, y, p], [p.type()])
def perform(self, node, (x, y, p), (out,)):
if _is_sparse(x):
raise TypeError(x)
if _is_sparse(y):
raise TypeError(y)
if not _is_sparse(p):
raise TypeError(p)
out[0] = p.__class__(p.multiply(numpy.dot(x, y.T)))
def grad(self, (x, y, p), (gz,)):
rval = [
dot(p * gz, y),
dot((p * gz).T, x),
None
]
return rval
def infer_shape(self, node, ins_shapes):
return [ins_shapes[2]]
def __str__(self):
return self.__class__.__name__
sampling_dot = SamplingDot()
class SamplingDotCSR(gof.Op):
# Operand optimized for calculating the dot product dot(`x`, `y`.T) = `z`
# when you only want to calculate a subset of `z`.
# It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
# product, `x` and `y` operands of the dot product and `p` is a matrix
# that contains 1 when the corresponding element of `z` should be
# calculated and 0 when it shouldn't. Note that SamplingDot has a different
# interface than `dot` because SamplingDot requires `x` to be a `m`x`k`
# matrix while `y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.
# .. note::
# It will work if the pattern is not binary value, but if the
# pattern doesn't have a high sparsity proportion it will be slower
# then a more optimized dot followed by a normal elemwise
# multiplication.
# :param x: Tensor matrix.
# :param y: Tensor matrix.
# :param p_data: Sparse matrix data.
# :param p_ind: Sparse matrix indices.
# :param p_ptr: Sparse matric indptr.
# :param p_ncols: Sparse matrix number of columns.
# :return: A dense matrix containing the dot product of `x` by `y`.T only
# where `p` is 1.
# :note: If we have the input of mixed dtype, we insert cast elemwise
# in the graph to be able to call blas function as they don't
# allow mixed dtype.
# :note: This op is used as an optimization for SamplingDot.
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return 'SamplingDot{Csr}'
def make_node(self, x, y, p_data, p_ind, p_ptr, p_ncols):
x = tensor.as_tensor_variable(x)
y = tensor.as_tensor_variable(y)
p_data = tensor.as_tensor_variable(p_data)
p_ind = tensor.as_tensor_variable(p_ind)
p_ptr = tensor.as_tensor_variable(p_ptr)
p_ncols = tensor.as_tensor_variable(p_ncols)
assert p_ncols.dtype == 'int32'
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype,
p_data.type.dtype)
dot_out = scalar.upcast(x.type.dtype, y.type.dtype)
# We call blas ?dot function that take only param of the same type
x = tensor.cast(x, dot_out)
y = tensor.cast(y, dot_out)
return gof.Apply(self, [x, y, p_data, p_ind, p_ptr, p_ncols], [
tensor.tensor(dtype=dtype_out, broadcastable=(False,)),
tensor.tensor(dtype=p_ind.type.dtype, broadcastable=(False,)),
tensor.tensor(dtype=p_ptr.type.dtype, broadcastable=(False,))
])
def c_support_code(self):
return blas.blas_header_text()
def c_libraries(self):
return blas.ldflags()
def c_compile_args(self):
return blas.ldflags(libs=False, flags=True)
def c_lib_dirs(self):
return blas.ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
return blas.ldflags(libs=False, include_dir=True)
def c_code(self, node, name, inputs, outputs, sub):
x, y, p_data, p_ind, p_ptr, p_ncols = inputs
z_data, z_ind, z_ptr = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for x')
if node.inputs[1].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for y')
if node.inputs[2].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError(
'Complex types are not supported for pattern')
dot_out = scalar.upcast(node.inputs[0].type.dtype,
node.inputs[1].type.dtype)
if dot_out == "float32":
conv_type = "float"
cdot = "sdot_"
else:
conv_type = "double"
cdot = "ddot_"
# retrieve dtype number
typenum_x = node.inputs[0].type.dtype_specs()[-1]
typenum_y = node.inputs[1].type.dtype_specs()[-1]
typenum_p = node.inputs[2].type.dtype_specs()[-1]
typenum_zd = tensor.TensorType(node.outputs[0].dtype,
[]).dtype_specs()[-1]
typenum_zi = tensor.TensorType(node.outputs[1].dtype,
[]).dtype_specs()[-1]
typenum_zp = tensor.TensorType(node.outputs[2].dtype,
[]).dtype_specs()[-1]
rval = """
if (%(x)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;}
if (%(y)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
if (%(x)s->descr->type_num != %(typenum_x)s) {
PyErr_SetString(PyExc_NotImplementedError,
"Invalid type for x");
%(fail)s;}
if (%(y)s->descr->type_num != %(typenum_y)s) {
PyErr_SetString(PyExc_NotImplementedError,
"Invalid type for y");
%(fail)s;}
if (%(p_data)s->descr->type_num != %(typenum_p)s) {
PyErr_SetString(PyExc_NotImplementedError,
"Invalid type for pattern");
%(fail)s;}
if (%(x)s->dimensions[1] != %(y)s->dimensions[1]) {
PyErr_SetString(PyExc_NotImplementedError,
"x's number of columns doesn't match y's rows! Note: sampling_dot is different from dot because y is assumed to be transposed.");
%(fail)s;}
if (%(y)s->dimensions[0] != ((npy_int32 *)%(p_ncols)s->data)[0] ||
%(x)s->dimensions[0] != (%(p_ptr)s->dimensions[0] - 1))
{PyErr_SetString(PyExc_NotImplementedError,
"The dimension of the pattern and the output must match"); %(fail)s;}
// Allocate output
if (!%(z_data)s
|| (%(z_data)s->dimensions[0] != %(p_data)s->dimensions[0])
|| (%(z_data)s->descr->type_num != %(typenum_zd)s)) {
{Py_XDECREF(%(z_data)s);}
npy_intp dims[] = {0};
dims[0] = %(p_data)s->dimensions[0];
%(z_data)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
%(typenum_zd)s);
}
if (!%(z_ind)s
|| (%(z_ind)s->dimensions[0] != %(p_ind)s->dimensions[0])
|| (%(z_ind)s->descr->type_num != %(typenum_zi)s)) {
{Py_XDECREF(%(z_ind)s);}
npy_intp dims[] = {0};
dims[0] = %(p_ind)s->dimensions[0];
%(z_ind)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
%(typenum_zi)s);
}
if (!%(z_ptr)s
|| (%(z_ptr)s->dimensions[0] != %(p_ptr)s->dimensions[0])
|| (%(z_ptr)s->descr->type_num != %(typenum_zp)s)) {
{Py_XDECREF(%(z_ptr)s);}
npy_intp dims[] = {0};
dims[0] = %(p_ptr)s->dimensions[0];
%(z_ptr)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
%(typenum_zp)s);
}
{
// Product of MxK and NxK, output MxN
npy_intp M = %(x)s->dimensions[0];
npy_intp N = %(y)s->dimensions[0];
npy_intp K = %(y)s->dimensions[1];
// pointers to access actual data in the arrays passed as params.
const dtype_%(x)s* __restrict__ Dx = (dtype_%(x)s*)%(x)s->data;
const dtype_%(y)s* __restrict__ Dy = (dtype_%(y)s*)%(y)s->data;
const dtype_%(p_data)s* __restrict__ Dpd = (dtype_%(p_data)s*)%(p_data)s->data;
const dtype_%(p_ind)s* __restrict__ Dpi = (dtype_%(p_ind)s*)%(p_ind)s->data;
const dtype_%(p_ptr)s* __restrict__ Dpp = (dtype_%(p_ptr)s*)%(p_ptr)s->data;
dtype_%(z_data)s* __restrict__ Dzd = (dtype_%(z_data)s*)%(z_data)s->data;
dtype_%(z_ind)s* __restrict__ Dzi = (dtype_%(z_ind)s*)%(z_ind)s->data;
dtype_%(z_ptr)s* __restrict__ Dzp = (dtype_%(z_ptr)s*)%(z_ptr)s->data;
const npy_intp Sdx = %(x)s->strides[1]/%(x)s->descr->elsize;
const npy_intp Sdy = %(y)s->strides[1]/%(y)s->descr->elsize;
const npy_intp Sdpd = %(p_data)s->strides[0] / %(p_data)s->descr->elsize;
const npy_intp Sdpi = %(p_ind)s->strides[0] / %(p_ind)s->descr->elsize;
const npy_intp Sdpp = %(p_ptr)s->strides[0] / %(p_ptr)s->descr->elsize;
const npy_intp Sdzd = %(z_data)s->strides[0] / %(z_data)s->descr->elsize;
const npy_intp Sdzi = %(z_ind)s->strides[0] / %(z_ind)s->descr->elsize;
const npy_intp Sdzp = %(z_ptr)s->strides[0] / %(z_ptr)s->descr->elsize;
memcpy(Dzi, Dpi, %(p_ind)s->dimensions[0]*sizeof(dtype_%(p_ind)s));
memcpy(Dzp, Dpp, %(p_ptr)s->dimensions[0]*sizeof(dtype_%(p_ptr)s));
for (npy_int32 m = 0; m < M; ++m) {
for (npy_int32 n_idx = Dpp[m * Sdpp]; n_idx < Dpp[(m+1)*Sdpp]; ++n_idx) {
const npy_int32 n = Dpi[n_idx * Sdpi]; // row index of non-null value for column K
const dtype_%(x)s* x_row = (dtype_%(x)s*)(%(x)s->data + %(x)s->strides[0] * m);
const dtype_%(y)s* y_col = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * n);
Dzd[n_idx * Sdzd] = Dpd[n_idx * Sdpd] * %(cdot)s((int*)&K, (const %(conv_type)s*)x_row, (int*)&Sdx, (const %(conv_type)s*)y_col, (int*)&Sdy);
}
}
}
""" % dict(locals(), **sub)
return rval
sampling_dot_csr = SamplingDotCSR()
# register a specialization to replace StructuredDot -> StructuredDotCSx
@gof.local_optimizer([_structured_dot])
def local_structured_dot(node):
if node.op == _structured_dot:
a, b = node.inputs
if a.type.format == 'csc':
a_val, a_ind, a_ptr, a_shape = csm_properties(a)
a_nsparse = a_shape[0]
return [sd_csc(a_val, a_ind, a_ptr, a_nsparse, b)]
if a.type.format == 'csr':
a_val, a_ind, a_ptr, a_shape = csm_properties(a)
return [sd_csr(a_val, a_ind, a_ptr, b)]
return False
# Commented out because
# a) it is only slightly faster than scipy these days, and sometimes a little
# slower, and
# b) the resulting graphs make it very difficult for an op to do size checking
# on the matrices involved. dimension mismatches are hard to detect sensibly.
#register_specialize(local_structured_dot)
def structured_dot_grad(sparse_A, dense_B, ga):
if sparse_A.type.format in ('csc', 'csr'):
if sparse_A.type.format == 'csc':
sdgcsx = sdg_csc
CSx = CSC
else:
sdgcsx = sdg_csr
CSx = CSR
g_A_data = sdgcsx(csm_indices(sparse_A), \
csm_indptr(sparse_A), dense_B, ga)
return CSx(g_A_data, csm_indices(sparse_A), \
csm_indptr(sparse_A), \
csm_shape(sparse_A))
else:
raise NotImplementedError()
class StructuredDotGradCSC(gof.Op): class StructuredDotGradCSC(gof.Op):
# Op that produces the grad of StructuredDot. # Op that produces the grad of StructuredDot.
...@@ -4278,7 +3013,6 @@ class StructuredDotGradCSC(gof.Op): ...@@ -4278,7 +3013,6 @@ class StructuredDotGradCSC(gof.Op):
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
return [shapes[0]] return [shapes[0]]
sdg_csc = StructuredDotGradCSC() sdg_csc = StructuredDotGradCSC()
...@@ -4416,10 +3150,104 @@ class StructuredDotGradCSR(gof.Op): ...@@ -4416,10 +3150,104 @@ class StructuredDotGradCSR(gof.Op):
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
return [shapes[0]] return [shapes[0]]
sdg_csr = StructuredDotGradCSR() sdg_csr = StructuredDotGradCSR()
def structured_dot_grad(sparse_A, dense_B, ga):
if sparse_A.type.format in ('csc', 'csr'):
if sparse_A.type.format == 'csc':
sdgcsx = sdg_csc
CSx = CSC
else:
sdgcsx = sdg_csr
CSx = CSR
g_A_data = sdgcsx(csm_indices(sparse_A), \
csm_indptr(sparse_A), dense_B, ga)
return CSx(g_A_data, csm_indices(sparse_A), \
csm_indptr(sparse_A), \
csm_shape(sparse_A))
else:
raise NotImplementedError()
class SamplingDot(gof.op.Op):
"""Operand for calculating the dot product dot(`x`, `y`.T) = `z` when you
only want to calculate a subset of `z`.
It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
product, `x` and `y` operands of the dot product and `p` is a matrix that
contains 1 when the corresponding element of `z` should be calculated
and 0 when it shouldn't. Note that SamplingDot has a different interface
than `dot` because SamplingDot requires `x` to be a `m`x`k` matrix while
`y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.
.. note::
It will work if the pattern is not binary value, but if the
pattern doesn't have a high sparsity proportion it will be slower
then a more optimized dot followed by a normal elemwise
multiplication.
:param x: Tensor matrix.
:param y: Tensor matrix.
:param p: Sparse matrix in csr format.
:return: A dense matrix containing the dot product of `x` by `y`.T only
where `p` is 1.
:note: The grad implemented is regular, i.e. not structured.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, x, y, p):
x = tensor.as_tensor_variable(x)
y = tensor.as_tensor_variable(y)
p = as_sparse_variable(p)
if not _is_sparse_variable(p):
raise TypeError(p)
#TODO: use it.
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype, p.type.dtype)
return gof.Apply(self, [x, y, p], [p.type()])
def perform(self, node, (x, y, p), (out,)):
if _is_sparse(x):
raise TypeError(x)
if _is_sparse(y):
raise TypeError(y)
if not _is_sparse(p):
raise TypeError(p)
out[0] = p.__class__(p.multiply(numpy.dot(x, y.T)))
def grad(self, (x, y, p), (gz,)):
rval = [
dot(p * gz, y),
dot((p * gz).T, x),
None
]
return rval
def infer_shape(self, node, ins_shapes):
return [ins_shapes[2]]
def __str__(self):
return self.__class__.__name__
sampling_dot = SamplingDot()
class Dot(gof.op.Op): class Dot(gof.op.Op):
"""Operation for efficiently calculating the dot product when """Operation for efficiently calculating the dot product when
one or all operands is sparse. Supported format are CSC and CSR. one or all operands is sparse. Supported format are CSC and CSR.
...@@ -4600,252 +3428,3 @@ class Usmm(gof.op.Op): ...@@ -4600,252 +3428,3 @@ class Usmm(gof.op.Op):
out[0] = rval out[0] = rval
usmm = Usmm() usmm = Usmm()
class UsmmCscDense(gof.Op):
# Performs the expression is `alpha` * `x` `y` + `z`.
# :param x: Matrix variable.
# :param y: Matrix variable.
# :param z: Dense matrix.
# :param alpha: A tensor scalar.
# :return: The dense matrix resulting from `alpha` * `x` `y` + `z`.
# :note: The grad is not implemented for this op.
# :note: Optimized version os Usmm when `x` is in csc format and
# `y` is dense.
def __init__(self, inplace):
self.inplace = inplace
if inplace:
self.destroy_map = {0: [6]}
def __str__(self):
if self.inplace:
return 'UsmmCscDense{inplace}'
else:
return 'UsmmCscDense{no_inplace}'
def __eq__(self, other):
return (type(self) == type(other)) and self.inplace == other.inplace
def __hash__(self):
return hash(type(self)) ^ self.inplace
def make_node(self, alpha, x_val, x_ind, x_ptr, x_nrows, y, z):
alpha = tensor.as_tensor_variable(alpha)
x_val = tensor.as_tensor_variable(x_val)
x_ind = tensor.as_tensor_variable(x_ind)
x_ptr = tensor.as_tensor_variable(x_ptr)
x_nrows = tensor.as_tensor_variable(x_nrows)
y = tensor.as_tensor_variable(y)
z = tensor.as_tensor_variable(z)
assert x_ind.dtype == 'int32'
assert x_ptr.dtype == 'int32'
assert x_nrows.dtype == 'int32'
assert alpha.ndim == 2 and alpha.type.broadcastable == (True, True)
assert x_val.ndim == 1
assert y.ndim == 2
assert z.ndim == 2
dtype_out = scalar.upcast(alpha.type.dtype, x_val.type.dtype,
y.type.dtype, z.type.dtype)
if dtype_out not in ('float32', 'float64'):
raise NotImplementedError('only float types are supported in '
'operands')
if self.inplace:
assert z.type.dtype == dtype_out
# axpy work only with the same dtype, so we should upcast the input
if dtype_out != alpha.type.dtype:
alpha = tensor.cast(alpha, dtype_out)
if dtype_out != x_val.type.dtype:
x_val = tensor.cast(x_val, dtype_out)
if dtype_out != y.type.dtype:
y = tensor.cast(y, dtype_out)
if dtype_out != z.type.dtype:
z = tensor.cast(z, dtype_out)
r = gof.Apply(self, [alpha, x_val, x_ind, x_ptr, x_nrows, y, z],
[tensor.tensor(dtype_out, (False, y.type.broadcastable[1]))])
return r
def c_support_code(self):
return blas.blas_header_text()
def c_libraries(self):
return blas.ldflags()
def c_compile_args(self):
return blas.ldflags(libs=False, flags=True)
def c_lib_dirs(self):
return blas.ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
return blas.ldflags(libs=False, include_dir=True)
def c_code(self, node, name, inputs, outputs, sub):
alpha, x_val, x_ind, x_ptr, x_nrows, y, z = inputs
zn = outputs[0]
if node.inputs[1].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for '
'x_val')
if node.inputs[5].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for y')
if node.inputs[6].type.dtype != node.outputs[0].type.dtype:
raise NotImplementedError('z and output must have same type')
if node.inputs[1].type.dtype == "float32":
conv_type = "float"
axpy = "saxpy_"
else:
conv_type = "double"
axpy = "daxpy_"
# retrieve dtype numbers
typenum_alpha = node.inputs[0].type.dtype_specs()[-1]
typenum_x_val = node.inputs[1].type.dtype_specs()[-1]
typenum_y = node.inputs[5].type.dtype_specs()[-1]
typenum_z = node.inputs[6].type.dtype_specs()[-1]
typenum_zn = node.outputs[0].type.dtype_specs()[-1]
inplace = int(self.inplace)
rval = """
if (%(x_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_val) != 1"); %(fail)s;}
if (%(x_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ind) != 1"); %(fail)s;}
if (%(x_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ptr) != 1"); %(fail)s;}
if (%(x_nrows)s->nd != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_nrows) != 0"); %(fail)s;}
if (%(y)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
if (%(x_val)s->descr->type_num != %(typenum_x_val)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for x_val"); %(fail)s;}
if (%(y)s->descr->type_num != %(typenum_y)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for y"); %(fail)s;}
if (%(z)s->descr->type_num != %(typenum_z)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for z"); %(fail)s;}
if (%(alpha)s->descr->type_num != %(typenum_alpha)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for alpha"); %(fail)s;}
if (%(x_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "x_ind dtype not INT32"); %(fail)s;}
if (%(x_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "x_ptr dtype not INT32"); %(fail)s;}
if (%(x_nrows)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "x_nrows dtype not INT32"); %(fail)s;}
if (%(x_val)s->dimensions[0] != %(x_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "x_val and x_ind have different lengths"); %(fail)s;}
if (%(x_ptr)s->dimensions[0] != %(y)s->dimensions[0]+1)
{PyErr_SetString(PyExc_NotImplementedError, "x's number of columns doesn't match y's rows"); %(fail)s;}
if (%(z)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0] || %(z)s->dimensions[1] != %(y)s->dimensions[1])
{PyErr_SetString(PyExc_NotImplementedError, "The dimension of the allocated output doesn't match the correct output size."); %(fail)s;}
if (PyArray_SIZE(%(alpha)s) != 1)
{PyErr_SetString(PyExc_NotImplementedError, "The number of element in alpha must be 1"); %(fail)s;}
if (%(alpha)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of alpha must be 2"); %(fail)s;}
if (%(x_val)s->nd != 1)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of x_val must be 1"); %(fail)s;}
if (%(y)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of y must be 2"); %(fail)s;}
if (%(z)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of z must be 2"); %(fail)s;}
if (%(inplace)s)
{
if (%(typenum_zn)s != %(typenum_z)s) {
PyErr_SetString(PyExc_NotImplementedError, "When inplace the output dtype must be the same as the input"); %(fail)s;}
Py_XDECREF(%(zn)s);
%(zn)s = %(z)s;
Py_INCREF(%(zn)s);
}
else if (!%(zn)s
|| (%(zn)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0])
|| (%(zn)s->dimensions[1] != %(y)s->dimensions[1])
)
{
{Py_XDECREF(%(zn)s);}
npy_intp dims[] = {0, 0};
dims[0] = ((npy_int32 *)%(x_nrows)s->data)[0];
dims[1] = %(y)s->dimensions[1];
%(zn)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_zn)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(zn)s->dimensions[0];
npy_intp N = %(zn)s->dimensions[1];
npy_intp K = %(y)s->dimensions[0];
// pointers to access actual data in the arrays passed as params.
const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
const dtype_%(alpha)s alpha = ((dtype_%(alpha)s*)%(alpha)s->data)[0];
npy_intp Sz = %(z)s->strides[1] / %(z)s->descr->elsize;
npy_intp Szn = %(zn)s->strides[1] / %(zn)s->descr->elsize;
npy_intp Sval = %(x_val)s->strides[0] / %(x_val)s->descr->elsize;
npy_intp Sind = %(x_ind)s->strides[0] / %(x_ind)s->descr->elsize;
npy_intp Sptr = %(x_ptr)s->strides[0] / %(x_ptr)s->descr->elsize;
npy_intp Sy = %(y)s->strides[1] / %(y)s->descr->elsize;
if (!(%(inplace)s))
{
if (PyArray_CopyInto(%(zn)s, %(z)s))
{
Py_XDECREF(%(zn)s);
%(fail)s;
}
}
for (npy_int32 k = 0; k < K; ++k)
{
for (npy_int32 m_idx = Dptr[k * Sptr]; m_idx < Dptr[(k+1)*Sptr]; ++m_idx)
{
const npy_int32 m = Dind[m_idx * Sind]; // row index of non-null value for column K
const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
// axpy expects pointer to the beginning of memory arrays,
// so when the stride is negative, we need to get the
// last element
if (Sy < 0)
y_row += (K - 1) * Sy;
dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
if (Szn < 0)
z_row += (N - 1) * Szn;
%(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
}
}
}
""" % dict(locals(), **sub)
return rval
def c_code_cache_version(self):
return (1,)
usmm_csc_dense = UsmmCscDense(inplace=False)
usmm_csc_dense_inplace = UsmmCscDense(inplace=True)
...@@ -2,37 +2,16 @@ from itertools import izip ...@@ -2,37 +2,16 @@ from itertools import izip
import theano import theano
import numpy import numpy
from theano import gof, scalar from theano import gof, scalar, tensor
from theano.tensor import blas
from theano.sparse import (CSC, CSR, csm_properties, Remove0, from theano.sparse import (CSC, CSR, csm_properties, Remove0,
register_specialize, register_specialize,
csm_grad, csm_grad_c, csm_grad, usmm)
usmm_csc_dense, usmm)
from theano.sparse import basic as sparse from theano.sparse import basic as sparse
from basic import _is_sparse_variable from basic import _is_sparse_variable
# This is tested in tests/test_basic.py:UsmmTests
local_usmm = gof.opt.PatternSub(
(theano.tensor.sub, 'z',
(theano.tensor.mul,
{'pattern': 'alpha',
'constraint': lambda expr: numpy.all(expr.type.broadcastable)},
(sparse._dot, 'x', 'y'))),
(usmm, (theano.tensor.neg, 'alpha'), 'x', 'y', 'z'))
register_specialize(local_usmm, name="local_usmm")
# This is tested in tests/test_opt.py:test_local_csm_grad_c
@gof.local_optimizer([csm_grad(None)])
def local_csm_grad_c(node):
""" csm_grad(None) -> csm_grad_c """
if node.op == csm_grad(None):
return [csm_grad_c(*node.inputs)]
return False
register_specialize(local_csm_grad_c)
# This is tested in tests/test_opt.py:test_local_csm_properties_csm # This is tested in tests/test_opt.py:test_local_csm_properties_csm
@gof.local_optimizer([csm_properties]) @gof.local_optimizer([csm_properties])
def local_csm_properties_csm(node): def local_csm_properties_csm(node):
...@@ -68,6 +47,356 @@ theano.compile.optdb.register('local_inplace_remove0', ...@@ -68,6 +47,356 @@ theano.compile.optdb.register('local_inplace_remove0',
60, 'fast_run', 'inplace') 60, 'fast_run', 'inplace')
class StructuredDotCSC(gof.Op):
"""Structured Dot CSC is like dot, except that only the
gradient wrt non-zero elements of the sparse matrix
`a` are calculated and propagated.
The output is presumed to be a dense matrix, and is represented by a
TensorType instance.
:param a: A sparse matrix in csc format.
:param b: A sparse or dense matrix.
:return: The dot product of `a` and `b`.
:note: The grad implemented is structured.
:note: This op is used as an optimization for StructuredDot.
"""
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, a_val, a_ind, a_ptr, a_nrows, b):
dtype_out = scalar.upcast(a_val.type.dtype, b.type.dtype)
r = gof.Apply(self, [a_val, a_ind, a_ptr, a_nrows, b],
[tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])
return r
def perform(self, node, (a_val, a_ind, a_ptr, a_nrows, b), (out,)):
a = scipy.sparse.csc_matrix((a_val, a_ind, a_ptr),
(a_nrows, b.shape[0]),
copy=False)
#out[0] = a.dot(b)
out[0] = theano._asarray(a * b, dtype=node.outputs[0].type.dtype)
assert _is_dense(out[0]) # scipy 0.7 automatically converts to dense
def c_code(self, node, name, (a_val, a_ind, a_ptr, a_nrows, b), (z,), sub):
# C-implementation of the dot product of the sparse matrix A and matrix
# B.
# @param a_val: non-zero values of the sparse matrix
# @param a_ind: column indices of the non-null values (.indices of a
# scipy.csc_matrix)
# @param a_ptr: a_ptr indicates col indices for col. i are in the range
# a_ptr[i]:a_ptr[i+1]
# @param n_rows: number of rows of sparse matrix
# @param b: dense matrix to perform dot product with, as in dot(a, b)
# @param z: return value
# @param sub: TODO, not too sure, something to do with weave probably
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a_val')
if node.inputs[4].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
typenum_z = node.outputs[0].type.dtype_specs()[-1] # retrieve dtype number
typenum_a_val = node.inputs[0].type.dtype_specs()[-1] # retrieve dtype number
typenum_b = node.inputs[4].type.dtype_specs()[-1] # retrieve dtype number
rval = """
if (%(a_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_val) != 1"); %(fail)s;}
if (%(a_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ind) != 1"); %(fail)s;}
if (%(a_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ptr) != 1"); %(fail)s;}
if (%(a_nrows)s->nd != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(nrows) != 0"); %(fail)s;}
if (%(b)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2"); %(fail)s;}
if (%(a_val)s->descr->type_num != %(typenum_a_val)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for a_val"); %(fail)s;}
if (%(b)s->descr->type_num != %(typenum_b)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for b"); %(fail)s;}
if (%(a_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
if (%(a_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
if (%(a_nrows)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_nrows dtype not INT32"); %(fail)s;}
if (%(a_val)s->dimensions[0] != %(a_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_val and a_ind have different lengths"); %(fail)s;}
if (%(a_ptr)s->dimensions[0] != %(b)s->dimensions[0]+1)
{PyErr_SetString(PyExc_NotImplementedError, "a's number of columns doesn't match b's rows"); %(fail)s;}
if ((!%(z)s)
|| (%(z)s->dimensions[0] != ((npy_int32 *)%(a_nrows)s->data)[0])
|| (%(z)s->dimensions[1] != %(b)s->dimensions[1])
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0, 0};
dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
dims[1] = %(b)s->dimensions[1];
%(z)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_z)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(z)s->dimensions[0];
npy_intp N = %(z)s->dimensions[1];
npy_intp K = %(b)s->dimensions[0];
// strides tell you how many bytes to skip to go to next column/row entry
npy_intp Szm = %(z)s->strides[0] / %(z)s->descr->elsize;
npy_intp Szn = %(z)s->strides[1] / %(z)s->descr->elsize;
//npy_intp Sbm = %(b)s->strides[0] / %(b)s->descr->elsize;
npy_intp Sbn = %(b)s->strides[1] / %(b)s->descr->elsize;
npy_intp Sval = %(a_val)s->strides[0] / %(a_val)s->descr->elsize;
npy_intp Sind = %(a_ind)s->strides[0] / %(a_ind)s->descr->elsize;
npy_intp Sptr = %(a_ptr)s->strides[0] / %(a_ptr)s->descr->elsize;
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
//npy_intp nnz = %(a_ind)s->dimensions[0];
//clear the output array
memset(Dz, 0, M*N*sizeof(dtype_%(z)s));
//iterate over the sparse array, making the most of an entry wherever we find it.
//
// Normal matrix matrix multiply: A MxK, B KxN => Z = AB
// for m
// for n
// for k
// z[m, n] += a[m, k] * b[k, n]
// Here instead: Z =
// for k
// for m (sparse)
// for n
// z[m, n] += a[m, k] * b[k, n]
// loop over inner dimension
for (npy_int32 k = 0; k < K; ++k)
{
// get pointer to k-th row of dense matrix
const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(%(b)s->data + %(b)s->strides[0] * k);
// loop over sparse column indices through index pointer array
// (amounts to looping over rows M of sparse matrix)
for (npy_int32 m_idx = Dptr[k * Sptr]; m_idx < Dptr[(k+1) * Sptr]; ++m_idx)
{
npy_int32 m = Dind[m_idx * Sind]; // row index of non-null value for column K
const dtype_%(a_val)s Amk = Dval[m_idx * Sval]; // actual value at that location
// pointer to m-th row of the output matrix Z
dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(%(z)s->data + %(z)s->strides[0] * m);
//RESOLVE: a.shape[0] equals z.shape[0], why is this not an equality constraint?
if (m >= %(z)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "illegal row index in a"); %(fail)s;}
// loop over final dimension (cols of dense matrix) and perform dot product
if ((Szn == 1) && (Sbn == 1)) {
for(npy_int32 n = 0; n < N; ++n)
{
zm[n] += Amk * bk[n];
}
}
else
{
for(npy_int32 n = 0; n < N; ++n)
{
zm[n*Szn] += Amk * bk[n*Sbn];
}
}
}
}
}
""" % dict(locals(), **sub)
return rval
def c_code_cache_version(self):
return (2,)
sd_csc = StructuredDotCSC()
class StructuredDotCSR(gof.Op):
"""Structured Dot CSR is like dot, except that only the
gradient wrt non-zero elements of the sparse matrix
`a` are calculated and propagated.
The output is presumed to be a dense matrix, and is represented by a
TensorType instance.
:param a: A sparse matrix in csr format.
:param b: A sparse or dense matrix.
:return: The dot product of `a` and `b`.
:note: The grad implemented is structured.
:note: This op is used as an optimization for StructuredDot.
"""
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, a_val, a_ind, a_ptr, b):
self.dtype_out = scalar.upcast(a_val.type.dtype, b.type.dtype)
r = gof.Apply(self, [a_val, a_ind, a_ptr, b],
[tensor.tensor(self.dtype_out, (False,
b.type.broadcastable[1]))])
return r
def perform(self, node, (a_val, a_ind, a_ptr, b), (out,)):
a = scipy.sparse.csr_matrix((a_val, a_ind, a_ptr),
(len(a_ptr) - 1, b.shape[0]),
copy=True) # use view_map before setting this to False
#out[0] = a.dot(b)
out[0] = a * b
# scipy 0.7 automatically converts to dense, but not .6 sometimes
assert _is_dense(out[0])
def c_code(self, node, name, (a_val, a_ind, a_ptr, b), (z,), sub):
"""
C-implementation of the dot product of the sparse matrix A and matrix
B.
@param a_val: non-zero values of the sparse matrix
@param a_ind: column indices of the non-null values (.indices of a
scipy.csc_matrix)
@param a_ptr: a_ptr indicates col indices for col. i are in the range
a_ptr[i]:a_ptr[i+1]
@param n_cols: number of columns of sparse matrix
@param b: dense matrix to perform dot product with, as in dot(a, b)
@param z: return value
@param sub: TODO, not too sure, something to do with weave probably
"""
# retrieve dtype number
typenum_z = tensor.TensorType(self.dtype_out, []).dtype_specs()[-1]
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a_val')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(a_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_val) != 1"); %(fail)s;}
if (%(a_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ind) != 1"); %(fail)s;}
if (%(a_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ptr) != 1"); %(fail)s;}
if (%(b)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2"); %(fail)s;}
if (%(a_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
if (%(a_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
if (%(a_val)s->dimensions[0] != %(a_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_val and a_ind have different lengths"); %(fail)s;}
if ((!%(z)s)
|| (%(z)s->dimensions[0] != %(a_ptr)s->dimensions[0]-1) //a's rows
|| (%(z)s->dimensions[1] != %(b)s->dimensions[1]) //b's columns
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0, 0};
dims[0] = %(a_ptr)s->dimensions[0]-1;
dims[1] = %(b)s->dimensions[1];
%(z)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_z)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(z)s->dimensions[0];
npy_intp N = %(z)s->dimensions[1];
npy_intp K = %(b)s->dimensions[0];
// strides tell you how many bytes to skip to go to next column/row entry
npy_intp Szm = %(z)s->strides[0] / %(z)s->descr->elsize;
npy_intp Szn = %(z)s->strides[1] / %(z)s->descr->elsize;
npy_intp Sbm = %(b)s->strides[0] / %(b)s->descr->elsize;
npy_intp Sbn = %(b)s->strides[1] / %(b)s->descr->elsize;
npy_intp Sval = %(a_val)s->strides[0] / %(a_val)s->descr->elsize;
npy_intp Sind = %(a_ind)s->strides[0] / %(a_ind)s->descr->elsize;
npy_intp Sptr = %(a_ptr)s->strides[0] / %(a_ptr)s->descr->elsize;
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
//npy_intp nnz = %(a_ind)s->dimensions[0];
//clear the output array
memset(Dz, 0, M*N*sizeof(dtype_%(z)s));
//iterate over the sparse array, making the most of an entry wherever we find it.
// Normal matrix matrix multiply:
// for m
// for n
// for k
// z[m, n] += a[m, k] * b[k, n]
// Here instead:
// for m
// for k (sparse)
// for n
// z[m, n] += a[m, k] * b[k, n]
// loop over inner dimension
for (npy_int64 m = 0; m < M; ++m)
{
// pointer to m-th row of the output matrix Z
dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(%(z)s->data + %(z)s->strides[0] * m);
// loop over sparse rows indices through index pointer array
// (amounts to looping over cols k of sparse matrix)
for (npy_int32 k_idx = Dptr[m * Sptr]; k_idx < Dptr[(m+1) * Sptr]; ++k_idx)
{
npy_int32 k = Dind[k_idx * Sind]; // col index of non-null value for row m
const dtype_%(a_val)s Amk = Dval[k_idx * Sval]; // actual value at that location
// get pointer to k-th row of dense matrix
const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(%(b)s->data + %(b)s->strides[0] * k);
// loop over final dimension (cols of dense matrix) and perform dot product
for(npy_int32 n = 0; n < N; ++n)
{
zm[n*Szn] += Amk * bk[n*Sbn];
}
}
}
}
""" % dict(locals(), **sub)
def c_code_cache_version(self):
return (1,)
sd_csr = StructuredDotCSR()
# register a specialization to replace StructuredDot -> StructuredDotCSx # register a specialization to replace StructuredDot -> StructuredDotCSx
# This is tested in tests/test_basic.py:792 # This is tested in tests/test_basic.py:792
@gof.local_optimizer([sparse._structured_dot]) @gof.local_optimizer([sparse._structured_dot])
...@@ -83,6 +412,7 @@ def local_structured_dot(node): ...@@ -83,6 +412,7 @@ def local_structured_dot(node):
return [sd_csr(a_val, a_ind, a_ptr, b)] return [sd_csr(a_val, a_ind, a_ptr, b)]
return False return False
# Commented out because # Commented out because
# a) it is only slightly faster than scipy these days, and sometimes a little # a) it is only slightly faster than scipy these days, and sometimes a little
# slower, and # slower, and
...@@ -91,29 +421,274 @@ def local_structured_dot(node): ...@@ -91,29 +421,274 @@ def local_structured_dot(node):
#register_specialize(local_structured_dot) #register_specialize(local_structured_dot)
class UsmmCscDense(gof.Op):
"""Performs the expression is `alpha` * `x` `y` + `z`.
:param x: Matrix variable.
:param y: Matrix variable.
:param z: Dense matrix.
:param alpha: A tensor scalar.
:return: The dense matrix resulting from `alpha` * `x` `y` + `z`.
:note: The grad is not implemented for this op.
:note: Optimized version os Usmm when `x` is in csc format and
`y` is dense.
"""
def __init__(self, inplace):
self.inplace = inplace
if inplace:
self.destroy_map = {0: [6]}
def __str__(self):
if self.inplace:
return 'UsmmCscDense{inplace}'
else:
return 'UsmmCscDense{no_inplace}'
def __eq__(self, other):
return (type(self) == type(other)) and self.inplace == other.inplace
def __hash__(self):
return hash(type(self)) ^ self.inplace
def make_node(self, alpha, x_val, x_ind, x_ptr, x_nrows, y, z):
alpha = tensor.as_tensor_variable(alpha)
x_val = tensor.as_tensor_variable(x_val)
x_ind = tensor.as_tensor_variable(x_ind)
x_ptr = tensor.as_tensor_variable(x_ptr)
x_nrows = tensor.as_tensor_variable(x_nrows)
y = tensor.as_tensor_variable(y)
z = tensor.as_tensor_variable(z)
assert x_ind.dtype == 'int32'
assert x_ptr.dtype == 'int32'
assert x_nrows.dtype == 'int32'
assert alpha.ndim == 2 and alpha.type.broadcastable == (True, True)
assert x_val.ndim == 1
assert y.ndim == 2
assert z.ndim == 2
dtype_out = scalar.upcast(alpha.type.dtype, x_val.type.dtype,
y.type.dtype, z.type.dtype)
if dtype_out not in ('float32', 'float64'):
raise NotImplementedError('only float types are supported in '
'operands')
if self.inplace:
assert z.type.dtype == dtype_out
# axpy work only with the same dtype, so we should upcast the input
if dtype_out != alpha.type.dtype:
alpha = tensor.cast(alpha, dtype_out)
if dtype_out != x_val.type.dtype:
x_val = tensor.cast(x_val, dtype_out)
if dtype_out != y.type.dtype:
y = tensor.cast(y, dtype_out)
if dtype_out != z.type.dtype:
z = tensor.cast(z, dtype_out)
r = gof.Apply(self, [alpha, x_val, x_ind, x_ptr, x_nrows, y, z],
[tensor.tensor(dtype_out, (False, y.type.broadcastable[1]))])
return r
def c_support_code(self):
return blas.blas_header_text()
def c_libraries(self):
return blas.ldflags()
def c_compile_args(self):
return blas.ldflags(libs=False, flags=True)
def c_lib_dirs(self):
return blas.ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
return blas.ldflags(libs=False, include_dir=True)
def c_code(self, node, name, inputs, outputs, sub):
alpha, x_val, x_ind, x_ptr, x_nrows, y, z = inputs
zn = outputs[0]
if node.inputs[1].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for '
'x_val')
if node.inputs[5].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for y')
if node.inputs[6].type.dtype != node.outputs[0].type.dtype:
raise NotImplementedError('z and output must have same type')
if node.inputs[1].type.dtype == "float32":
conv_type = "float"
axpy = "saxpy_"
else:
conv_type = "double"
axpy = "daxpy_"
# retrieve dtype numbers
typenum_alpha = node.inputs[0].type.dtype_specs()[-1]
typenum_x_val = node.inputs[1].type.dtype_specs()[-1]
typenum_y = node.inputs[5].type.dtype_specs()[-1]
typenum_z = node.inputs[6].type.dtype_specs()[-1]
typenum_zn = node.outputs[0].type.dtype_specs()[-1]
inplace = int(self.inplace)
rval = """
if (%(x_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_val) != 1"); %(fail)s;}
if (%(x_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ind) != 1"); %(fail)s;}
if (%(x_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ptr) != 1"); %(fail)s;}
if (%(x_nrows)s->nd != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_nrows) != 0"); %(fail)s;}
if (%(y)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
if (%(x_val)s->descr->type_num != %(typenum_x_val)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for x_val"); %(fail)s;}
if (%(y)s->descr->type_num != %(typenum_y)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for y"); %(fail)s;}
if (%(z)s->descr->type_num != %(typenum_z)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for z"); %(fail)s;}
if (%(alpha)s->descr->type_num != %(typenum_alpha)s) {
PyErr_SetString(PyExc_NotImplementedError, "Invalid type for alpha"); %(fail)s;}
if (%(x_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "x_ind dtype not INT32"); %(fail)s;}
if (%(x_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "x_ptr dtype not INT32"); %(fail)s;}
if (%(x_nrows)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "x_nrows dtype not INT32"); %(fail)s;}
if (%(x_val)s->dimensions[0] != %(x_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "x_val and x_ind have different lengths"); %(fail)s;}
if (%(x_ptr)s->dimensions[0] != %(y)s->dimensions[0]+1)
{PyErr_SetString(PyExc_NotImplementedError, "x's number of columns doesn't match y's rows"); %(fail)s;}
if (%(z)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0] || %(z)s->dimensions[1] != %(y)s->dimensions[1])
{PyErr_SetString(PyExc_NotImplementedError, "The dimension of the allocated output doesn't match the correct output size."); %(fail)s;}
if (PyArray_SIZE(%(alpha)s) != 1)
{PyErr_SetString(PyExc_NotImplementedError, "The number of element in alpha must be 1"); %(fail)s;}
if (%(alpha)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of alpha must be 2"); %(fail)s;}
if (%(x_val)s->nd != 1)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of x_val must be 1"); %(fail)s;}
if (%(y)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of y must be 2"); %(fail)s;}
if (%(z)s->nd != 2)
{PyErr_SetString(PyExc_NotImplementedError, "The number dimension of z must be 2"); %(fail)s;}
if (%(inplace)s)
{
if (%(typenum_zn)s != %(typenum_z)s) {
PyErr_SetString(PyExc_NotImplementedError, "When inplace the output dtype must be the same as the input"); %(fail)s;}
Py_XDECREF(%(zn)s);
%(zn)s = %(z)s;
Py_INCREF(%(zn)s);
}
else if (!%(zn)s
|| (%(zn)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0])
|| (%(zn)s->dimensions[1] != %(y)s->dimensions[1])
)
{
{Py_XDECREF(%(zn)s);}
npy_intp dims[] = {0, 0};
dims[0] = ((npy_int32 *)%(x_nrows)s->data)[0];
dims[1] = %(y)s->dimensions[1];
%(zn)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_zn)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(zn)s->dimensions[0];
npy_intp N = %(zn)s->dimensions[1];
npy_intp K = %(y)s->dimensions[0];
// pointers to access actual data in the arrays passed as params.
const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
const dtype_%(alpha)s alpha = ((dtype_%(alpha)s*)%(alpha)s->data)[0];
npy_intp Sz = %(z)s->strides[1] / %(z)s->descr->elsize;
npy_intp Szn = %(zn)s->strides[1] / %(zn)s->descr->elsize;
npy_intp Sval = %(x_val)s->strides[0] / %(x_val)s->descr->elsize;
npy_intp Sind = %(x_ind)s->strides[0] / %(x_ind)s->descr->elsize;
npy_intp Sptr = %(x_ptr)s->strides[0] / %(x_ptr)s->descr->elsize;
npy_intp Sy = %(y)s->strides[1] / %(y)s->descr->elsize;
if (!(%(inplace)s))
{
if (PyArray_CopyInto(%(zn)s, %(z)s))
{
Py_XDECREF(%(zn)s);
%(fail)s;
}
}
for (npy_int32 k = 0; k < K; ++k)
{
for (npy_int32 m_idx = Dptr[k * Sptr]; m_idx < Dptr[(k+1)*Sptr]; ++m_idx)
{
const npy_int32 m = Dind[m_idx * Sind]; // row index of non-null value for column K
const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
// axpy expects pointer to the beginning of memory arrays,
// so when the stride is negative, we need to get the
// last element
if (Sy < 0)
y_row += (K - 1) * Sy;
dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
if (Szn < 0)
z_row += (N - 1) * Szn;
%(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
}
}
}
""" % dict(locals(), **sub)
return rval
def c_code_cache_version(self):
return (1,)
usmm_csc_dense = UsmmCscDense(inplace=False)
usmm_csc_dense_inplace = UsmmCscDense(inplace=True)
# This is tested in tests/test_basic.py:UsmmTests
local_usmm = gof.opt.PatternSub(
(theano.tensor.sub, 'z',
(theano.tensor.mul,
{'pattern': 'alpha',
'constraint': lambda expr: numpy.all(expr.type.broadcastable)},
(sparse._dot, 'x', 'y'))),
(usmm, (theano.tensor.neg, 'alpha'), 'x', 'y', 'z'))
register_specialize(local_usmm, name="local_usmm")
# register a specialization to replace usmm_csc_dense -> usmm_csc_dense_inplace
# This is tested in tests/test_basic.py:UsmmTests # This is tested in tests/test_basic.py:UsmmTests
@gof.local_optimizer([usmm_csc_dense]) @gof.local_optimizer([usmm_csc_dense])
def local_usmm_csc_dense_inplace(node): def local_usmm_csc_dense_inplace(node):
if node.op == usmm_csc_dense: if node.op == usmm_csc_dense:
return [sparse.usmm_csc_dense_inplace(*node.inputs)] return [usmm_csc_dense_inplace(*node.inputs)]
register_specialize(local_usmm_csc_dense_inplace, 'inplace') register_specialize(local_usmm_csc_dense_inplace, 'inplace')
@gof.local_optimizer([sparse.csm_properties])
def local_csm_properties_csm(node):
"""if we find csm_properties(CSM(*args)), then we can replace that with the
*args directly"""
if node.op == sparse.csm_properties:
csm, = node.inputs
if csm.owner and (csm.owner.op == sparse.CSC or
csm.owner.op == sparse.CSR):
# csm.owner.inputs could be broadcastable. In that case, we have
# to adjust the broadcasting flag here.
ret_var = [theano.tensor.patternbroadcast(i, o.broadcastable)
for i, o in izip(csm.owner.inputs, node.outputs)]
return ret_var
# This is tested in tests/test_basic.py:UsmmTests # This is tested in tests/test_basic.py:UsmmTests
@gof.local_optimizer([usmm]) @gof.local_optimizer([usmm])
def local_usmm_csx(node): def local_usmm_csx(node):
...@@ -142,6 +717,370 @@ def local_usmm_csx(node): ...@@ -142,6 +717,370 @@ def local_usmm_csx(node):
sparse.register_specialize(local_usmm_csx) sparse.register_specialize(local_usmm_csx)
class CSMGradC(gof.Op):
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, a_val, a_ind, a_ptr, a_dim,
b_val, b_ind, b_ptr, b_dim):
return gof.Apply(self, [a_val, a_ind, a_ptr, a_dim,
b_val, b_ind, b_ptr, b_dim], [b_val.type()])
def c_code(self, node, name, (a_val, a_ind, a_ptr, a_dim,
b_val, b_ind, b_ptr, b_dim), (z,), sub):
# retrieve dtype number
typenum_z = node.outputs[0].type.dtype_specs()[-1]
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a_val')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b_val')
return """
if (%(a_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_val) != 1"); %(fail)s;}
if (%(a_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ind) != 1"); %(fail)s;}
if (%(a_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ptr) != 1"); %(fail)s;}
if (%(b_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_val) != 1"); %(fail)s;}
if (%(b_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_ind) != 1"); %(fail)s;}
if (%(b_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_ptr) != 1"); %(fail)s;}
if (%(a_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
if (%(a_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
if (%(b_ind)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "b_ind dtype not INT32"); %(fail)s;}
if (%(b_ptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "b_ptr dtype not INT32"); %(fail)s;}
if (%(a_val)s->dimensions[0] != %(a_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_val and a_ind have different lengths"); %(fail)s;}
if (%(b_val)s->dimensions[0] != %(b_ind)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "b_val and b_ind have different lengths"); %(fail)s;}
if (%(a_ptr)s->dimensions[0] != %(b_ptr)s->dimensions[0])
{PyErr_SetString(PyExc_NotImplementedError, "a_ptr and b_ptr have different lengths"); %(fail)s;}
if ((!%(z)s) || (%(z)s->dimensions[0] != %(a_val)s->dimensions[0]))
{
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0};
dims[0] = %(a_val)s->dimensions[0];
%(z)s = (PyArrayObject*) PyArray_SimpleNew(1, dims, %(typenum_z)s);
}
{
// sparse array has size MxK, dense KxN, output MxN
npy_intp M = %(a_ptr)s->dimensions[0] - 1;
npy_intp a_dim_0 = ((npy_int32 *)%(a_dim)s->data)[0];
npy_intp a_dim_1 = ((npy_int32 *)%(a_dim)s->data)[1];
npy_intp sp_dim = (M == a_dim_0)?a_dim_1:a_dim_0;
// strides tell you how many bytes to skip to go to next column/row entry
npy_intp Sz = %(z)s->strides[0] / %(z)s->descr->elsize;
npy_intp Sa_val = %(a_val)s->strides[0] / %(a_val)s->descr->elsize;
npy_intp Sa_ind = %(a_ind)s->strides[0] / %(a_ind)s->descr->elsize;
npy_intp Sa_ptr = %(a_ptr)s->strides[0] / %(a_ptr)s->descr->elsize;
npy_intp Sb_val = %(b_val)s->strides[0] / %(b_val)s->descr->elsize;
npy_intp Sb_ind = %(b_ind)s->strides[0] / %(b_ind)s->descr->elsize;
npy_intp Sb_ptr = %(b_ptr)s->strides[0] / %(b_ptr)s->descr->elsize;
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Da_val = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Da_ind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Da_ptr = (npy_int32*)%(a_ptr)s->data;
const dtype_%(b_val)s* __restrict__ Db_val = (dtype_%(b_val)s*)%(b_val)s->data;
const npy_int32 * __restrict__ Db_ind = (npy_int32*)%(b_ind)s->data;
const npy_int32 * __restrict__ Db_ptr = (npy_int32*)%(b_ptr)s->data;
npy_intp nnz = %(a_ind)s->dimensions[0];
dtype_%(b_val)s b_row[sp_dim];
//clear the output array
for (npy_int64 i = 0; i < nnz; ++i)
{
Dz[i*Sz] = 0;
}
memset(b_row, 0, sp_dim*sizeof(dtype_%(b_val)s));
// loop over inner dimension
for (npy_int64 m = 0; m < M; ++m)
{
for (npy_int32 j_ptr = Db_ptr[m * Sb_ptr];
j_ptr < Db_ptr[(m + 1) * Sb_ptr]; j_ptr++) {
b_row[Db_ind[j_ptr * Sb_ind]] += Db_val[j_ptr*Sb_val];
}
for (npy_int32 j_ptr = Da_ptr[m * Sa_ptr];
j_ptr < Da_ptr[(m + 1) * Sa_ptr]; j_ptr++) {
Dz[j_ptr*Sz] = b_row[Da_ind[j_ptr * Sa_ind]];
}
for (npy_int32 j_ptr = Db_ptr[m * Sb_ptr];
j_ptr < Db_ptr[(m + 1) * Sb_ptr]; j_ptr++) {
b_row[Db_ind[j_ptr * Sb_ind]] = 0;
}
}
}
""" % dict(locals(), **sub)
def c_code_cache_version(self):
return (3,)
csm_grad_c = CSMGradC()
# register a specialization to replace csm_grad -> csm_grad_c
# This is tested in tests/test_opt.py:test_local_csm_grad_c
@gof.local_optimizer([csm_grad(None)])
def local_csm_grad_c(node):
""" csm_grad(None) -> csm_grad_c """
if node.op == csm_grad(None):
return [csm_grad_c(*node.inputs)]
return False
register_specialize(local_csm_grad_c)
class MulSDCSC(gof.Op):
"""Multiplication of sparse matrix by a broadcasted dense vector
element wise.
:param a_data: Sparse matrix data.
:param a_indices: Sparse matrix indices.
:param a_indptr: Sparse matrix indptr.
:param b: Tensor type matrix.
:return: The multiplication of the two matrix element wise.
:note: `a_data`, `a_indices` and `a_indptr` must be the properties
of a sparse matrix in csc format.
:note: The dtype of `a_data`, i.e. the dtype of the sparse matrix,
cannot be a complex type.
:note: This op is used as an optimization of mul_s_d.
"""
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
assert b.type.ndim == 2
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self):
return (1,)
#def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplementedError()
def c_code(self, node, name, (_data, _indices, _indptr, _b,),
(_zout, ), sub):
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(_b)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
%(fail)s;}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;}
if( %(_indices)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s)
{
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError,
"somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0];
// loop over columns
for (npy_int32 j = 0; j < N; ++j)
{
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
// extract i-th row of dense matrix
const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * i);
// write resulting gradient to sparse output
zout[i_idx] = data[i_idx] * b_row[j];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
mul_s_d_csc = MulSDCSC()
class MulSDCSR(gof.Op):
"""Multiplication of sparse matrix by a broadcasted dense vector
element wise.
:param a_data: Sparse matrix data.
:param a_indices: Sparse matrix indices.
:param a_indptr: Sparse matrix indptr.
:param b: Tensor type matrix.
:return: The multiplication of the two matrix element wise.
:note: `a_data`, `a_indices` and `a_indptr` must be the properties
of a sparse matrix in csr format.
:note: The dtype of `a_data`, i.e. the dtype of the sparse matrix,
cannot be a complex type.
:note: This op is used as an optimization of mul_s_d.
"""
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
assert b.type.ndim == 2
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self):
return (1,)
#def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplemented()
def c_code(self, node, name, (_data, _indices, _indptr, _b,),
(_zout, ), sub):
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(_b)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
%(fail)s;}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;}
if( %(_indices)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s)
{
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError,
"somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0];
// loop over columns
for (npy_int32 j = 0; j < N; ++j)
{
// extract i-th row of dense matrix
const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * j);
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
// write resulting gradient to sparse output
zout[i_idx] = data[i_idx] * b_row[i];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
mul_s_d_csr = MulSDCSR()
# register a specialization to replace MulSD -> MulSDCSX # register a specialization to replace MulSD -> MulSDCSX
@gof.local_optimizer([sparse.mul_s_d]) @gof.local_optimizer([sparse.mul_s_d])
def local_mul_s_d(node): def local_mul_s_d(node):
...@@ -161,10 +1100,10 @@ def local_mul_s_d(node): ...@@ -161,10 +1100,10 @@ def local_mul_s_d(node):
return False return False
if svar.type.format == 'csc': if svar.type.format == 'csc':
CSx = sparse.CSC CSx = sparse.CSC
mul_s_d_csx = sparse.mul_s_d_csc mul_s_d_csx = mul_s_d_csc
elif svar.type.format == 'csr': elif svar.type.format == 'csr':
CSx = sparse.CSR CSx = sparse.CSR
mul_s_d_csx = sparse.mul_s_d_csr mul_s_d_csx = mul_s_d_csr
else: else:
raise NotImplemented() raise NotImplemented()
...@@ -181,6 +1120,116 @@ def local_mul_s_d(node): ...@@ -181,6 +1120,116 @@ def local_mul_s_d(node):
sparse.register_specialize(local_mul_s_d) sparse.register_specialize(local_mul_s_d)
class MulSVCSR(gof.Op):
"""Multiplication of sparse matrix by a broadcasted dense vector
element wise.
:param a_data: Sparse matrix data.
:param a_indices: Sparse matrix indices.
:param a_indptr: Sparse matrix indptr.
:param b: Tensor type matrix.
:return: The multiplication of the two matrix element wise.
:note: `a_data`, `a_indices` and `a_indptr` must be the properties
of a sparse matrix in csr format.
:note: The dtype of `a_data`, i.e. the dtype of the sparse matrix,
cannot be a complex type.
:note: This op is used as an optimization of MulSV.
"""
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
assert b.type.ndim == 1
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self):
return (1,)
def c_code(self, node, name, inputs, outputs, sub):
_data, _indices, _indptr, _b, = inputs
_zout, = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(_b)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
%(fail)s;
}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;
}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;
}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;
}
if( %(_indices)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s
|| %(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]
|| !PyArray_ISCONTIGUOUS(%(_zout)s))
{
Py_XDECREF(%(_zout)s);
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;
// loop over rows
for (npy_int32 j = 0; j < N; ++j)
{
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
zout[i_idx] = data[i_idx] * Db[i * Sb];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
mul_s_v_csr = MulSVCSR()
# register a specialization to replace MulSV -> MulSVCSR
@gof.local_optimizer([sparse.mul_s_v]) @gof.local_optimizer([sparse.mul_s_v])
def local_mul_s_v(node): def local_mul_s_v(node):
if node.op == sparse.mul_s_v: if node.op == sparse.mul_s_v:
...@@ -199,7 +1248,7 @@ def local_mul_s_v(node): ...@@ -199,7 +1248,7 @@ def local_mul_s_v(node):
return False return False
elif svar.type.format == 'csr': elif svar.type.format == 'csr':
CSx = sparse.CSR CSx = sparse.CSR
mul_s_v_csx = sparse.mul_s_v_csr mul_s_v_csx = mul_s_v_csr
else: else:
return False return False
...@@ -213,6 +1262,130 @@ def local_mul_s_v(node): ...@@ -213,6 +1262,130 @@ def local_mul_s_v(node):
sparse.register_specialize(local_mul_s_v) sparse.register_specialize(local_mul_s_v)
class StructuredAddSVCSR(gof.Op):
"""Structured addition of a sparse matrix and a dense vector.
The elements of the vector are are only added to the corresponding
non-zero elements. Therefore, this operation outputs another sparse
matrix.
:param a_data: Sparse matrix data.
:param a_indices: Sparse matrix indices.
:param a_indptr: Sparse matrix indptr.
:param b: Tensor type vector.
:return: A sparse matrix containing the addition of the vector to
the data of the sparse matrix.
:note: The a_* are the properties of a sparse matrix in csr
format.
:note: This op is used as an optimization for StructuredAddSV.
"""
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
b = tensor.as_tensor_variable(b)
a_data = tensor.as_tensor_variable(a_data)
a_indices = tensor.as_tensor_variable(a_indices)
a_indptr = tensor.as_tensor_variable(a_indptr)
assert a_data.type.ndim == 1
assert a_indices.type.ndim == 1
assert a_indptr.type.ndim == 1
assert b.type.ndim == 1
return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))])
def c_code_cache_version(self):
return (1,)
def c_code(self, node, name, inputs, outputs, sub):
_data, _indices, _indptr, _b, = inputs
_zout, = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for b')
return """
if (%(_b)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
%(fail)s;
}
if (%(_data)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
%(fail)s;
}
if (%(_indices)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
%(fail)s;
}
if (%(_indptr)s->nd != 1) {
PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
%(fail)s;
}
if( %(_indices)s->descr->type_num != PyArray_INT32) {
PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
if( %(_indptr)s->descr->type_num != PyArray_INT32)
{PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
if (!%(_zout)s)
{
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
%(_indices)s->dimensions, %(_b)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError,
"somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
const npy_intp nnz = %(_indices)s->dimensions[0];
//TODO: error checking with this
const npy_intp N = %(_indptr)s->dimensions[0]-1;
const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;
// loop over columns
for (npy_int32 j = 0; j < N; ++j)
{
// for each non-null value in the sparse column
for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
{
// extract row index of non-null value
npy_int32 i = indices[i_idx];
// write resulting gradient to sparse output
zout[i_idx] = data[i_idx] + Db[i * Sb];
}
}
}
""" % dict(locals(), **sub)
def __str__(self):
return self.__class__.__name__
structured_add_s_v_csr = StructuredAddSVCSR()
# register a specialization to replace
# structured_add_s_v -> structured_add_s_v_csr
@gof.local_optimizer([sparse.structured_add_s_v]) @gof.local_optimizer([sparse.structured_add_s_v])
def local_structured_add_s_v(node): def local_structured_add_s_v(node):
if node.op == sparse.structured_add_s_v: if node.op == sparse.structured_add_s_v:
...@@ -232,7 +1405,7 @@ def local_structured_add_s_v(node): ...@@ -232,7 +1405,7 @@ def local_structured_add_s_v(node):
return False return False
elif svar.type.format == 'csr': elif svar.type.format == 'csr':
CSx = sparse.CSR CSx = sparse.CSR
structured_add_s_v_csx = sparse.structured_add_s_v_csr structured_add_s_v_csx = structured_add_s_v_csr
else: else:
return False return False
...@@ -246,6 +1419,229 @@ def local_structured_add_s_v(node): ...@@ -246,6 +1419,229 @@ def local_structured_add_s_v(node):
sparse.register_specialize(local_structured_add_s_v) sparse.register_specialize(local_structured_add_s_v)
class SamplingDotCSR(gof.Op):
"""Operand optimized for calculating the dot product dot(`x`, `y`.T) = `z`
when you only want to calculate a subset of `z`.
It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
product, `x` and `y` operands of the dot product and `p` is a matrix
that contains 1 when the corresponding element of `z` should be
calculated and 0 when it shouldn't. Note that SamplingDot has a different
interface than `dot` because SamplingDot requires `x` to be a `m`x`k`
matrix while `y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.
.. note::
It will work if the pattern is not binary value, but if the
pattern doesn't have a high sparsity proportion it will be slower
then a more optimized dot followed by a normal elemwise
multiplication.
:param x: Tensor matrix.
:param y: Tensor matrix.
:param p_data: Sparse matrix data.
:param p_ind: Sparse matrix indices.
:param p_ptr: Sparse matric indptr.
:param p_ncols: Sparse matrix number of columns.
:return: A dense matrix containing the dot product of `x` by `y`.T only
where `p` is 1.
:note: If we have the input of mixed dtype, we insert cast elemwise
in the graph to be able to call blas function as they don't
allow mixed dtype.
:note: This op is used as an optimization for SamplingDot.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return 'SamplingDot{Csr}'
def make_node(self, x, y, p_data, p_ind, p_ptr, p_ncols):
x = tensor.as_tensor_variable(x)
y = tensor.as_tensor_variable(y)
p_data = tensor.as_tensor_variable(p_data)
p_ind = tensor.as_tensor_variable(p_ind)
p_ptr = tensor.as_tensor_variable(p_ptr)
p_ncols = tensor.as_tensor_variable(p_ncols)
assert p_ncols.dtype == 'int32'
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype,
p_data.type.dtype)
dot_out = scalar.upcast(x.type.dtype, y.type.dtype)
# We call blas ?dot function that take only param of the same type
x = tensor.cast(x, dot_out)
y = tensor.cast(y, dot_out)
return gof.Apply(self, [x, y, p_data, p_ind, p_ptr, p_ncols], [
tensor.tensor(dtype=dtype_out, broadcastable=(False,)),
tensor.tensor(dtype=p_ind.type.dtype, broadcastable=(False,)),
tensor.tensor(dtype=p_ptr.type.dtype, broadcastable=(False,))
])
def c_code_cache_version(self):
return (1, )
def c_support_code(self):
return blas.blas_header_text()
def c_libraries(self):
return blas.ldflags()
def c_compile_args(self):
return blas.ldflags(libs=False, flags=True)
def c_lib_dirs(self):
return blas.ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
return blas.ldflags(libs=False, include_dir=True)
def c_code(self, node, name, inputs, outputs, sub):
x, y, p_data, p_ind, p_ptr, p_ncols = inputs
z_data, z_ind, z_ptr = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for x')
if node.inputs[1].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for y')
if node.inputs[2].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError(
'Complex types are not supported for pattern')
dot_out = scalar.upcast(node.inputs[0].type.dtype,
node.inputs[1].type.dtype)
if dot_out == "float32":
conv_type = "float"
cdot = "sdot_"
else:
conv_type = "double"
cdot = "ddot_"
# retrieve dtype number
typenum_x = node.inputs[0].type.dtype_specs()[-1]
typenum_y = node.inputs[1].type.dtype_specs()[-1]
typenum_p = node.inputs[2].type.dtype_specs()[-1]
typenum_zd = tensor.TensorType(node.outputs[0].dtype,
[]).dtype_specs()[-1]
typenum_zi = tensor.TensorType(node.outputs[1].dtype,
[]).dtype_specs()[-1]
typenum_zp = tensor.TensorType(node.outputs[2].dtype,
[]).dtype_specs()[-1]
rval = """
if (%(x)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;}
if (%(y)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
if (%(x)s->descr->type_num != %(typenum_x)s) {
PyErr_SetString(PyExc_NotImplementedError,
"Invalid type for x");
%(fail)s;}
if (%(y)s->descr->type_num != %(typenum_y)s) {
PyErr_SetString(PyExc_NotImplementedError,
"Invalid type for y");
%(fail)s;}
if (%(p_data)s->descr->type_num != %(typenum_p)s) {
PyErr_SetString(PyExc_NotImplementedError,
"Invalid type for pattern");
%(fail)s;}
if (%(x)s->dimensions[1] != %(y)s->dimensions[1]) {
PyErr_SetString(PyExc_NotImplementedError,
"x's number of columns doesn't match y's rows! Note: sampling_dot is different from dot because y is assumed to be transposed.");
%(fail)s;}
if (%(y)s->dimensions[0] != ((npy_int32 *)%(p_ncols)s->data)[0] ||
%(x)s->dimensions[0] != (%(p_ptr)s->dimensions[0] - 1))
{PyErr_SetString(PyExc_NotImplementedError,
"The dimension of the pattern and the output must match"); %(fail)s;}
// Allocate output
if (!%(z_data)s
|| (%(z_data)s->dimensions[0] != %(p_data)s->dimensions[0])
|| (%(z_data)s->descr->type_num != %(typenum_zd)s)) {
{Py_XDECREF(%(z_data)s);}
npy_intp dims[] = {0};
dims[0] = %(p_data)s->dimensions[0];
%(z_data)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
%(typenum_zd)s);
}
if (!%(z_ind)s
|| (%(z_ind)s->dimensions[0] != %(p_ind)s->dimensions[0])
|| (%(z_ind)s->descr->type_num != %(typenum_zi)s)) {
{Py_XDECREF(%(z_ind)s);}
npy_intp dims[] = {0};
dims[0] = %(p_ind)s->dimensions[0];
%(z_ind)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
%(typenum_zi)s);
}
if (!%(z_ptr)s
|| (%(z_ptr)s->dimensions[0] != %(p_ptr)s->dimensions[0])
|| (%(z_ptr)s->descr->type_num != %(typenum_zp)s)) {
{Py_XDECREF(%(z_ptr)s);}
npy_intp dims[] = {0};
dims[0] = %(p_ptr)s->dimensions[0];
%(z_ptr)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
%(typenum_zp)s);
}
{
// Product of MxK and NxK, output MxN
npy_intp M = %(x)s->dimensions[0];
npy_intp N = %(y)s->dimensions[0];
npy_intp K = %(y)s->dimensions[1];
// pointers to access actual data in the arrays passed as params.
const dtype_%(x)s* __restrict__ Dx = (dtype_%(x)s*)%(x)s->data;
const dtype_%(y)s* __restrict__ Dy = (dtype_%(y)s*)%(y)s->data;
const dtype_%(p_data)s* __restrict__ Dpd = (dtype_%(p_data)s*)%(p_data)s->data;
const dtype_%(p_ind)s* __restrict__ Dpi = (dtype_%(p_ind)s*)%(p_ind)s->data;
const dtype_%(p_ptr)s* __restrict__ Dpp = (dtype_%(p_ptr)s*)%(p_ptr)s->data;
dtype_%(z_data)s* __restrict__ Dzd = (dtype_%(z_data)s*)%(z_data)s->data;
dtype_%(z_ind)s* __restrict__ Dzi = (dtype_%(z_ind)s*)%(z_ind)s->data;
dtype_%(z_ptr)s* __restrict__ Dzp = (dtype_%(z_ptr)s*)%(z_ptr)s->data;
const npy_intp Sdx = %(x)s->strides[1]/%(x)s->descr->elsize;
const npy_intp Sdy = %(y)s->strides[1]/%(y)s->descr->elsize;
const npy_intp Sdpd = %(p_data)s->strides[0] / %(p_data)s->descr->elsize;
const npy_intp Sdpi = %(p_ind)s->strides[0] / %(p_ind)s->descr->elsize;
const npy_intp Sdpp = %(p_ptr)s->strides[0] / %(p_ptr)s->descr->elsize;
const npy_intp Sdzd = %(z_data)s->strides[0] / %(z_data)s->descr->elsize;
const npy_intp Sdzi = %(z_ind)s->strides[0] / %(z_ind)s->descr->elsize;
const npy_intp Sdzp = %(z_ptr)s->strides[0] / %(z_ptr)s->descr->elsize;
memcpy(Dzi, Dpi, %(p_ind)s->dimensions[0]*sizeof(dtype_%(p_ind)s));
memcpy(Dzp, Dpp, %(p_ptr)s->dimensions[0]*sizeof(dtype_%(p_ptr)s));
for (npy_int32 m = 0; m < M; ++m) {
for (npy_int32 n_idx = Dpp[m * Sdpp]; n_idx < Dpp[(m+1)*Sdpp]; ++n_idx) {
const npy_int32 n = Dpi[n_idx * Sdpi]; // row index of non-null value for column K
const dtype_%(x)s* x_row = (dtype_%(x)s*)(%(x)s->data + %(x)s->strides[0] * m);
const dtype_%(y)s* y_col = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * n);
Dzd[n_idx * Sdzd] = Dpd[n_idx * Sdpd] * %(cdot)s((int*)&K, (const %(conv_type)s*)x_row, (int*)&Sdx, (const %(conv_type)s*)y_col, (int*)&Sdy);
}
}
}
""" % dict(locals(), **sub)
return rval
sampling_dot_csr = SamplingDotCSR()
# register a specialization to replace SamplingDot -> SamplingDotCsr # register a specialization to replace SamplingDot -> SamplingDotCsr
@gof.local_optimizer([sparse.sampling_dot]) @gof.local_optimizer([sparse.sampling_dot])
def local_sampling_dot_csr(node): def local_sampling_dot_csr(node):
...@@ -254,7 +1650,7 @@ def local_sampling_dot_csr(node): ...@@ -254,7 +1650,7 @@ def local_sampling_dot_csr(node):
if p.type.format == 'csr': if p.type.format == 'csr':
p_data, p_ind, p_ptr, p_shape = sparse.csm_properties(p) p_data, p_ind, p_ptr, p_shape = sparse.csm_properties(p)
z_data, z_ind, z_ptr = sparse.sampling_dot_csr(x, y, p_data, z_data, z_ind, z_ptr = sampling_dot_csr(x, y, p_data,
p_ind, p_ptr, p_shape[1]) p_ind, p_ptr, p_shape[1])
return [sparse.CSR(z_data, z_ind, z_ptr, p_shape)] return [sparse.CSR(z_data, z_ind, z_ptr, p_shape)]
......
...@@ -17,19 +17,21 @@ from theano.sparse.basic import ( ...@@ -17,19 +17,21 @@ from theano.sparse.basic import (
Cast, bcast, wcast, icast, lcast, fcast, dcast, ccast, zcast, Cast, bcast, wcast, icast, lcast, fcast, dcast, ccast, zcast,
HStack, hstack, VStack, vstack, HStack, hstack, VStack, vstack,
AddSSData, add_s_s_data, AddSSData, add_s_s_data,
MulSDCSC, mul_s_d_csc, MulSDCSR, mul_s_d_csr, MulSV, mul_s_v,
Multinomial, multinomial, Poisson, poisson, Multinomial, multinomial, Poisson, poisson,
Binomial, csr_fbinomial, csc_fbinomial, csr_dbinomial, csc_dbinomial, Binomial, csr_fbinomial, csc_fbinomial, csr_dbinomial, csc_dbinomial,
structured_monoid, structured_monoid,
structured_sigmoid, structured_exp, structured_log, structured_pow, structured_sigmoid, structured_exp, structured_log, structured_pow,
structured_minimum, structured_maximum, structured_add, structured_minimum, structured_maximum, structured_add,
MulSV, mul_s_v, MulSVCSR, mul_s_v_csr,
StructuredAddSV, structured_add_s_v, StructuredAddSV, structured_add_s_v,
StructuredAddSVCSR, structured_add_s_v_csr, SamplingDot, sampling_dot)
SamplingDot, sampling_dot, SamplingDotCSR, sampling_dot_csr)
# Also for compatibility # Also for compatibility
from theano.sparse.opt import ( from theano.sparse.opt import (
MulSDCSC, mul_s_d_csc, MulSDCSR, mul_s_d_csr,
MulSVCSR, mul_s_v_csr,
StructuredAddSVCSR, structured_add_s_v_csr,
SamplingDotCSR, sampling_dot_csr,
local_mul_s_d, local_mul_s_v, local_mul_s_d, local_mul_s_v,
local_structured_add_s_v, local_sampling_dot_csr) local_structured_add_s_v, local_sampling_dot_csr)
......
...@@ -25,13 +25,13 @@ from theano.sparse.basic import _is_dense_variable, _is_sparse_variable ...@@ -25,13 +25,13 @@ from theano.sparse.basic import _is_dense_variable, _is_sparse_variable
from theano.sparse import ( from theano.sparse import (
verify_grad_sparse, as_sparse_variable, verify_grad_sparse, as_sparse_variable,
CSC, CSR, CSM, CSMProperties, csm_properties, CSC, CSR, CSM, CSMProperties, csm_properties,
SparseType, CSMGrad, CSMGradC, SparseType, CSMGrad,
StructuredDot, StructuredDotCSC, StructuredDot,
StructuredDotGradCSC, StructuredDotGradCSR, StructuredDotGradCSC, StructuredDotGradCSR,
AddSS, AddSD, MulSS, MulSD, Transpose, Neg, Remove0, AddSS, AddSD, MulSS, MulSD, Transpose, Neg, Remove0,
add, mul, structured_dot, transpose, add, mul, structured_dot, transpose,
csc_from_dense, csr_from_dense, dense_from_sparse, csc_from_dense, csr_from_dense, dense_from_sparse,
Dot, Usmm, UsmmCscDense, sp_ones_like, GetItemScalar, Dot, Usmm, sp_ones_like, GetItemScalar,
SparseFromDense, SparseFromDense,
Cast, cast, HStack, VStack, AddSSData, add_s_s_data, Cast, cast, HStack, VStack, AddSSData, add_s_s_data,
Poisson, poisson, Binomial, Multinomial, multinomial, Poisson, poisson, Binomial, Multinomial, multinomial,
...@@ -42,6 +42,8 @@ from theano.sparse import ( ...@@ -42,6 +42,8 @@ from theano.sparse import (
Diag, diag, SquareDiagonal, square_diagonal, Diag, diag, SquareDiagonal, square_diagonal,
EnsureSortedIndices, ensure_sorted_indices, clean) EnsureSortedIndices, ensure_sorted_indices, clean)
from theano.sparse.opt import (StructuredDotCSC, UsmmCscDense, CSMGradC)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.basic import _allclose from theano.tensor.basic import _allclose
...@@ -1218,7 +1220,7 @@ class UsmmTests(unittest.TestCase): ...@@ -1218,7 +1220,7 @@ class UsmmTests(unittest.TestCase):
assert isinstance(topo[1].op, theano.tensor.DimShuffle) assert isinstance(topo[1].op, theano.tensor.DimShuffle)
assert isinstance(topo[2].op, theano.tensor.Subtensor) assert isinstance(topo[2].op, theano.tensor.Subtensor)
assert topo[3].op == theano.tensor.neg assert topo[3].op == theano.tensor.neg
assert isinstance(topo[4].op, theano.sparse.UsmmCscDense) assert isinstance(topo[4].op, UsmmCscDense)
if inplace: if inplace:
assert topo[4].op.inplace assert topo[4].op.inplace
elif not fast_compile: elif not fast_compile:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论