提交 4791f726 authored 作者: Nicolas Bouchard's avatar Nicolas Bouchard 提交者: Frederic

Move op and tests out of sandbox.

上级 512cb95c
...@@ -21,6 +21,7 @@ import theano.tests.unittest_tools as utt ...@@ -21,6 +21,7 @@ import theano.tests.unittest_tools as utt
sparse_formats = ['csc', 'csr'] sparse_formats = ['csc', 'csr']
#TODO: move this decorator to the compile submodule #TODO: move this decorator to the compile submodule
def register_specialize(lopt, *tags, **kwargs): def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register((kwargs and kwargs.pop('name')) or compile.optdb['specialize'].register((kwargs and kwargs.pop('name')) or
...@@ -716,7 +717,8 @@ class CSMGrad(gof.op.Op): ...@@ -716,7 +717,8 @@ class CSMGrad(gof.op.Op):
""" """
This Op computes the gradient of the CSM Op. CSM creates a matrix from This Op computes the gradient of the CSM Op. CSM creates a matrix from
data, indices, and ind_ptr vectors; it's gradient is the gradient of data, indices, and ind_ptr vectors; it's gradient is the gradient of
the data vector only. There are two complexities to calculate this gradient: the data vector only. There are two complexities to calculate this
gradient:
1. The gradient may be sparser than the input matrix defined by (data, 1. The gradient may be sparser than the input matrix defined by (data,
indices, ind_ptr). In this case, the data vector of the gradient will have indices, ind_ptr). In this case, the data vector of the gradient will have
less elements than the data vector of the input because sparse formats less elements than the data vector of the input because sparse formats
...@@ -908,13 +910,58 @@ class CSMGradC(gof.Op): ...@@ -908,13 +910,58 @@ class CSMGradC(gof.Op):
csm_grad_c = CSMGradC() csm_grad_c = CSMGradC()
class Cast(gof.op.Op):
    """Cast sparse variable to the desired dtype.

    :param x: Sparse matrix.

    :return: Same as `x` but having `out_type` as dtype.

    :note:
    - The grad implemented is regular, i.e. not structured.
    """
    def __init__(self, out_type):
        # Target dtype of the cast, e.g. 'float32'.
        self.out_type = out_type

    def __eq__(self, other):
        return (type(self) == type(other)) and self.out_type == other.out_type

    def __hash__(self):
        return hash(type(self)) ^ hash(self.out_type)

    def make_node(self, x):
        x = as_sparse_variable(x)
        # Output keeps the input's sparse format, only the dtype changes.
        return gof.Apply(
            self, [x],
            [SparseType(dtype=self.out_type, format=x.format).make_variable()])

    def perform(self, node, inputs, outputs):
        # Explicit unpacking: tuple parameters in the signature are
        # Python-2-only syntax, removed by PEP 3113.
        (x,) = inputs
        (out,) = outputs
        assert _is_sparse(x)
        out[0] = x.astype(self.out_type)

    def grad(self, inputs, outputs_gradients):
        # Gradient only flows through continuous (float/complex) dtypes;
        # cast it back to the input's dtype.
        if inputs[0].dtype in tensor.continuous_dtypes:
            gz = outputs_gradients[0]
            return [Cast(inputs[0].dtype)(gz)]
        else:
            return [None]

    def infer_shape(self, node, ins_shapes):
        # Casting does not change the shape.
        return ins_shapes

    def __str__(self):
        return "%s(%s)" % (self.__class__.__name__, self.out_type)

# Pre-built cast ops, one per supported dtype
# (one-letter prefixes follow the numpy type-code convention).
bcast = Cast('int8')
wcast = Cast('int16')
icast = Cast('int32')
lcast = Cast('int64')
fcast = Cast('float32')
dcast = Cast('float64')
ccast = Cast('complex64')
zcast = Cast('complex128')
# #
# Conversion # Conversion
...@@ -1278,6 +1325,58 @@ class AddSS(gof.op.Op): ...@@ -1278,6 +1325,58 @@ class AddSS(gof.op.Op):
add_s_s = AddSS() add_s_s = AddSS()
class AddSSData(gof.op.Op):
    """Add two sparse matrices assuming they have the same sparsity
    pattern.

    :param x: Sparse matrix.
    :param y: Sparse matrix.

    :return: The sum of the two sparse matrices element wise.

    :note:
    - `x` and `y` are assumed to have the same sparsity pattern.
    - The grad implemented is structured.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y):
        # Both inputs must be sparse with matching dtype and format.
        x, y = map(as_sparse_variable, [x, y])
        if x.type.dtype != y.type.dtype:
            raise NotImplementedError()
        if x.type.format != y.type.format:
            raise NotImplementedError()
        return gof.Apply(self,
                         [x, y],
                         [SparseType(dtype=x.type.dtype,
                                     format=x.type.format).make_variable()])

    def perform(self, node, inputs, outputs):
        # Explicit unpacking replaces Python-2-only tuple parameters
        # (PEP 3113).
        (x, y) = inputs
        (out,) = outputs
        assert _is_sparse(x) and _is_sparse(y)
        assert x.shape == y.shape
        # Same sparsity pattern => the data vectors line up one-to-one,
        # so the sum reduces to adding the data vectors.
        assert x.data.shape == y.data.shape
        out[0] = x.copy()
        out[0].data += y.data

    def grad(self, inputs, grads):
        (gz,) = grads
        # Use tensor.continuous_dtypes for consistency with Cast.grad and
        # HStack.grad; a bare `continuous_dtypes` is not defined here.
        is_continuous = [(i.dtype in tensor.continuous_dtypes)
                         for i in inputs]
        # Structured grad: gz flows unchanged to each continuous input.
        derivative = {True: gz, False: None}
        return [derivative[b] for b in is_continuous]

    def infer_shape(self, node, ins_shapes):
        # Output has the shape of either input (shapes are asserted equal).
        return [ins_shapes[0]]

    def __str__(self):
        return self.__class__.__name__

add_s_s_data = AddSSData()
class AddSD(gof.op.Op): class AddSD(gof.op.Op):
''' Add a sparse and a dense matrix ''' ''' Add a sparse and a dense matrix '''
def __eq__(self, other): def __eq__(self, other):
...@@ -1319,6 +1418,181 @@ class AddSD(gof.op.Op): ...@@ -1319,6 +1418,181 @@ class AddSD(gof.op.Op):
add_s_d = AddSD() add_s_d = AddSD()
class StructuredAddSV(gof.op.Op):
    """Structured addition of a sparse matrix and a dense vector.

    The elements of the vector are only added to the corresponding
    non-zero elements. Therefore, this operation outputs another sparse
    matrix.

    :param x: Sparse matrix.
    :param y: Tensor type vector.

    :return: A sparse matrix containing the addition of the vector to
             the data of the sparse matrix.

    :note: The grad implemented is structured since the op is structured.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y):
        x = as_sparse_variable(x)
        y = tensor.as_tensor_variable(y)
        assert y.type.ndim == 1
        if x.type.dtype != y.type.dtype:
            raise NotImplementedError()
        return gof.Apply(self,
                         [x, y],
                         [SparseType(dtype=x.type.dtype,
                                     format=x.type.format).make_variable()])

    def perform(self, node, inputs, outputs):
        # Explicit unpacking replaces Python-2-only tuple parameters
        # (PEP 3113).
        (x, y) = inputs
        (out,) = outputs
        assert _is_sparse(x) and not _is_sparse(y)
        # y is broadcast along the rows of x, so it must match x's width.
        assert x.shape[1] == y.shape[0]
        # (x.toarray() != 0) masks y so only non-zero positions receive it.
        out[0] = x.__class__(x + (x.toarray() != 0) * y)

    def grad(self, inputs, grads):
        (x, y) = inputs
        (gz,) = grads
        assert _is_sparse_variable(x) and not _is_sparse_variable(y)
        assert _is_sparse_variable(gz)
        # Structured grad wrt x is gz itself; wrt y, sum gz over rows.
        return gz, theano.sparse.sandbox.sp.sp_sum(gz, axis=0, sparse_grad=True)

    def infer_shape(self, node, ins_shapes):
        # Output has the sparse input's shape.
        return [ins_shapes[0]]

    def __str__(self):
        return self.__class__.__name__

structured_add_s_v = StructuredAddSV()
class StructuredAddSVCSR(gof.Op):
    """Structured addition of a sparse matrix and a dense vector.

    The elements of the vector are only added to the corresponding
    non-zero elements of the sparse matrix. Therefore, this operation
    outputs another sparse matrix.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type vector.

    :return: A sparse matrix containing the addition of the vector to
             the data of the sparse matrix.

    :note:
    - The a_* are the properties of a sparse matrix in csr
      format.
    - This op is used as an optimization for StructuredAddSV.
    """
    def __eq__(self, other):
        return (type(self) == type(other))
    def __hash__(self):
        return hash(type(self))
    def make_node(self, a_data, a_indices, a_indptr, b):
        # All four inputs are dense vectors: the decomposed csr fields
        # of the sparse matrix, plus the dense vector b.
        b = tensor.as_tensor_variable(b)
        a_data = tensor.as_tensor_variable(a_data)
        a_indices = tensor.as_tensor_variable(a_indices)
        a_indptr = tensor.as_tensor_variable(a_indptr)
        assert a_data.type.ndim == 1
        assert a_indices.type.ndim == 1
        assert a_indptr.type.ndim == 1
        assert b.type.ndim == 1
        # Output is the new data vector (1-d, same nnz as a_indices).
        # NOTE(review): output dtype is taken from b; no upcast with
        # a_data's dtype is performed here — confirm callers pass
        # matching dtypes.
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                         [tensor.tensor(b.dtype, (False,))])
    def c_code_cache_version(self):
        return (1,)
    def c_code(self, node, name, inputs, outputs, sub):
        _data, _indices, _indptr, _b, = inputs
        _zout, = outputs
        # The generated C kernel below does not handle complex dtypes.
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')
        # C kernel: for every stored element (i, j) of the csr matrix,
        # zout[k] = data[k] + b[indices[k]].
        return """
        if (%(_b)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
            %(fail)s;
        }
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;
        }
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;
        }
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;
        }
        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s)
        {
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                  %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }
        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
        {
            PyErr_SetString(PyExc_NotImplementedError,
     "somehow _zout got the wrong size.. and I don't know how to resize it.");
            %(fail)s;
        }
        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N =  %(_indptr)s->dimensions[0]-1;
            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
            const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;
            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];
                    // write resulting gradient to sparse output
                    zout[i_idx] = data[i_idx] + Db[i * Sb];
                }
            }
        }
        """ % dict(locals(), **sub)
    def __str__(self):
        return self.__class__.__name__
structured_add_s_v_csr = StructuredAddSVCSR()
def add(x, y): def add(x, y):
""" """
Add two matrices, at least one of which is sparse. Add two matrices, at least one of which is sparse.
...@@ -1462,94 +1736,834 @@ class MulSD(gof.op.Op): ...@@ -1462,94 +1736,834 @@ class MulSD(gof.op.Op):
mul_s_d = MulSD() mul_s_d = MulSD()
class MulSDCSC(gof.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type matrix.

    :return: The multiplication of the two matrices element wise.

    :note:
    - `a_data`, `a_indices` and `a_indptr` must be the properties
      of a sparse matrix in csc format.
    - The dtype of `a_data`, i.e. the dtype of the sparse matrix,
      cannot be a complex type.
    - This op is used as an optimization of mul_s_d.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, a_data, a_indices, a_indptr, b):
        assert b.type.ndim == 2
        # Output is the new data vector of the product (1-d, nnz long).
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                         [tensor.tensor(b.dtype, (False,))])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        # Explicit unpacking replaces Python-2-only tuple parameters
        # (PEP 3113); matches the style of MulSVCSR.c_code.
        (_data, _indices, _indptr, _b) = inputs
        (_zout,) = outputs
        # The generated C kernel below does not handle complex dtypes.
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')
        # C kernel: for every stored element (i, j) of the csc matrix,
        # zout[k] = data[k] * b[i, j].
        return """
        if (%(_b)s->nd != 2) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
            %(fail)s;}
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;}
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;}
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;}
        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s)
        {
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                  %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }
        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
        {
            PyErr_SetString(PyExc_NotImplementedError,
     "somehow _zout got the wrong size.. and I don't know how to resize it.");
            %(fail)s;
        }
        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N =  %(_indptr)s->dimensions[0]-1;
            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
            const npy_intp Sb = %(_b)s->strides[0];
            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];
                    // extract i-th row of dense matrix
                    const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * i);
                    // write resulting gradient to sparse output
                    zout[i_idx] = data[i_idx] * b_row[j];
                }
            }
        }
        """ % dict(locals(), **sub)

    def __str__(self):
        return self.__class__.__name__

mul_s_d_csc = MulSDCSC()
class MulSDCSR(gof.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type matrix.

    :return: The multiplication of the two matrices element wise.

    :note:
    - `a_data`, `a_indices` and `a_indptr` must be the properties
      of a sparse matrix in csr format.
    - The dtype of `a_data`, i.e. the dtype of the sparse matrix,
      cannot be a complex type.
    - This op is used as an optimization of mul_s_d.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, a_data, a_indices, a_indptr, b):
        assert b.type.ndim == 2
        # Output is the new data vector of the product (1-d, nnz long).
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                         [tensor.tensor(b.dtype, (False,))])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        # Explicit unpacking replaces Python-2-only tuple parameters
        # (PEP 3113); matches the style of MulSVCSR.c_code.
        (_data, _indices, _indptr, _b) = inputs
        (_zout,) = outputs
        # The generated C kernel below does not handle complex dtypes.
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')
        # C kernel: for every stored element (j, i) of the csr matrix,
        # zout[k] = data[k] * b[j, i].
        return """
        if (%(_b)s->nd != 2) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
            %(fail)s;}
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;}
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;}
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;}
        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s)
        {
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                  %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }
        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
        {
            PyErr_SetString(PyExc_NotImplementedError,
     "somehow _zout got the wrong size.. and I don't know how to resize it.");
            %(fail)s;
        }
        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N =  %(_indptr)s->dimensions[0]-1;
            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
            const npy_intp Sb = %(_b)s->strides[0];
            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // extract i-th row of dense matrix
                const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * j);
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];
                    // write resulting gradient to sparse output
                    zout[i_idx] = data[i_idx] * b_row[i];
                }
            }
        }
        """ % dict(locals(), **sub)

    def __str__(self):
        return self.__class__.__name__

mul_s_d_csr = MulSDCSR()
class MulSV(gof.op.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise.

    :param x: Sparse matrix to multiply.
    :param y: Tensor broadcastable vector.

    :return: The product x * y element wise.

    :note:
    - The grad implemented is regular, i.e. not structured.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y):
        x = as_sparse_variable(x)
        y = tensor.as_tensor_variable(y)
        assert y.type.ndim == 1
        if x.type.dtype != y.type.dtype:
            raise NotImplementedError()
        return gof.Apply(self,
                         [x, y],
                         [SparseType(dtype=x.type.dtype,
                                     format=x.type.format).make_variable()])

    def perform(self, node, inputs, outputs):
        # Explicit unpacking replaces Python-2-only tuple parameters
        # (PEP 3113).
        (x, y) = inputs
        (out,) = outputs
        assert _is_sparse(x) and not _is_sparse(y)
        # y is broadcast along the rows of x, so it must match x's width.
        assert x.shape[1] == y.shape[0]
        out[0] = x.__class__(x.toarray() * y)

    def grad(self, inputs, grads):
        (x, y) = inputs
        (gz,) = grads
        assert _is_sparse_variable(x) and _is_dense_variable(y)
        assert _is_sparse_variable(gz)
        # d/dx = gz * y (broadcast); d/dy = sum over rows of x * gz.
        return mul_s_v(gz, y), theano.sparse.sandbox.sp.sp_sum(x * gz, axis=0, sparse_grad=True)

    def infer_shape(self, node, ins_shapes):
        # Output has the sparse input's shape.
        return [ins_shapes[0]]

    def __str__(self):
        return self.__class__.__name__

mul_s_v = MulSV()
class MulSVCSR(gof.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type matrix.

    :return: The multiplication of the two matrices element wise.

    :note:
    - `a_data`, `a_indices` and `a_indptr` must be the properties
      of a sparse matrix in csr format.
    - The dtype of `a_data`, i.e. the dtype of the sparse matrix,
      cannot be a complex type.
    - This op is used as an optimization of MulSV.
    """
    def __eq__(self, other):
        return (type(self) == type(other))
    def __hash__(self):
        return hash(type(self))
    def make_node(self, a_data, a_indices, a_indptr, b):
        assert b.type.ndim == 1
        # Output is the new data vector of the product (1-d, nnz long).
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                         [tensor.tensor(b.dtype, (False,))])
    def c_code_cache_version(self):
        return (1,)
    def c_code(self, node, name, inputs, outputs, sub):
        _data, _indices, _indptr, _b, = inputs
        _zout, = outputs
        # The generated C kernel below does not handle complex dtypes.
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')
        # C kernel: for every stored element (j, i) of the csr matrix,
        # zout[k] = data[k] * b[indices[k]].
        return """
        if (%(_b)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
            %(fail)s;
        }
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;
        }
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;
        }
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;
        }
        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s
            || %(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]
            || !PyArray_ISCONTIGUOUS(%(_zout)s))
        {
            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                    %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }
        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N = %(_indptr)s->dimensions[0]-1;
            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
            const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;
            // loop over rows
            for (npy_int32 j = 0; j < N; ++j)
            {
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];
                    zout[i_idx] = data[i_idx] * Db[i * Sb];
                }
            }
        }
        """ % dict(locals(), **sub)
    def __str__(self):
        return self.__class__.__name__
mul_s_v_csr = MulSVCSR()
def mul(x, y):
    """Elementwise multiplication of two matrices, at least one of
    which must be sparse.

    Dispatches to mul_s_s when both operands are sparse, and to
    mul_s_d (sparse operand first) when only one is.
    """
    x = as_sparse_or_tensor_variable(x)
    y = as_sparse_or_tensor_variable(y)
    x_sparse = _is_sparse_variable(x)
    y_sparse = _is_sparse_variable(y)
    assert x_sparse or y_sparse
    if x_sparse and y_sparse:
        return mul_s_s(x, y)
    if x_sparse:
        return mul_s_d(x, y)
    if y_sparse:
        return mul_s_d(y, x)
    raise NotImplementedError()
class HStack(gof.op.Op):
    """Stack sparse matrices horizontally (column wise).

    :param blocks: Sequence of sparse arrays of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. Must be specified.

    :return: The concatenation of the sparse arrays column wise.

    :note:
    - The number of lines of the sparse matrices must agree.
    - The grad implemented is regular, i.e. not structured.
    """
    def __init__(self, format=None, dtype=None):
        if format is None:
            self.format = 'csc'
        else:
            self.format = format
        if dtype is None:
            raise ValueError('The output dtype must be specified.')
        self.dtype = dtype
    def __eq__(self, other):
        return (type(self) == type(other) and
                self.format == other.format and
                self.dtype == other.dtype)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.format) ^ hash(self.dtype)
    def make_node(self, *mat):
        if not mat:
            raise ValueError('Cannot join an empty list of sparses.')
        var = [as_sparse_variable(x) for x in mat]
        return gof.Apply(
            self, var,
            [SparseType(dtype=self.dtype, format=self.format).make_variable()])
    # NOTE: tuple parameters in the two signatures below are
    # Python-2-only syntax (removed by PEP 3113).
    def perform(self, node, block, (out, )):
        for b in block:
            assert _is_sparse(b)
        # Delegate the actual stacking to scipy.
        out[0] = scipy.sparse.hstack(block, format=self.format,
                                     dtype=self.dtype)
    def grad(self, inputs, (gz, )):
        # Gradient flows only through continuous-dtype inputs.
        is_continuous = [(inputs[i].dtype in tensor.continuous_dtypes)
                         for i in range(len(inputs))]
        if _is_sparse_variable(gz):
            gz = DenseFromSparse()(gz)
        # Split the dense gradient along axis 1 back into one piece
        # per input, sized by each input's number of columns.
        split = tensor.Split(len(inputs))(gz, 1,
                                          tensor.stack(
                                              *[x.shape[1]
                                                for x in inputs]))
        if not isinstance(split, list):
            split = [split]
        derivative = [SparseFromDense(self.format)(s) for s in split]
        def choose(continuous, derivative):
            if continuous:
                return derivative
            else:
                return None
        return [choose(c, d) for c, d in zip(is_continuous, derivative)]
    def infer_shape(self, node, ins_shapes):
        # Rows come from the first input; columns are summed over inputs.
        def _get(l):
            return l[1]
        d = sum(map(_get, ins_shapes))
        return [(ins_shapes[0][0], d)]
    def __str__(self):
        return "%s(%s,%s)" % (self.__class__.__name__, self.format, self.dtype)
def hstack(blocks, format=None, dtype=None):
    """Stack sparse matrices horizontally (column wise).

    This wraps the hstack method from scipy.

    :param blocks: List of sparse arrays of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. Defaults to the upcast of the
                  blocks' dtypes.

    :return: The concatenation of the sparse arrays column wise.

    :note:
    - The number of lines of the sparse matrices must agree.
    - The grad implemented is regular, i.e. not structured.
    """
    blocks = [as_sparse_variable(i) for i in blocks]
    if dtype is None:
        # upcast takes dtypes as separate positional arguments, not a
        # single list, so the list must be unpacked.
        dtype = theano.scalar.upcast(*[i.dtype for i in blocks])
    return HStack(format=format, dtype=dtype)(*blocks)
class VStack(HStack):
    """Stack sparse matrices vertically (row wise).

    :param blocks: Sequence of sparse arrays of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. Must be specified.

    :return: The concatenation of the sparse arrays row wise.

    :note:
    - The number of columns of the sparse matrices must agree.
    - The grad implemented is regular, i.e. not structured.
    """
    # NOTE: tuple parameters in the two signatures below are
    # Python-2-only syntax (removed by PEP 3113).
    def perform(self, node, block, (out, )):
        for b in block:
            assert _is_sparse(b)
        # Delegate the actual stacking to scipy.
        out[0] = scipy.sparse.vstack(block, format=self.format,
                                     dtype=self.dtype)
    def grad(self, inputs, (gz, )):
        # Same scheme as HStack.grad, but splitting along axis 0
        # sized by each input's number of rows.
        is_continuous = [(inputs[i].dtype in tensor.continuous_dtypes)
                         for i in range(len(inputs))]
        if _is_sparse_variable(gz):
            gz = DenseFromSparse()(gz)
        split = tensor.Split(len(inputs))(gz, 0,
                                          tensor.stack(
                                              *[x.shape[0]
                                                for x in inputs]))
        if not isinstance(split, list):
            split = [split]
        derivative = [SparseFromDense(self.format)(s) for s in split]
        def choose(continuous, derivative):
            if continuous:
                return derivative
            else:
                return None
        return [choose(c, d) for c, d in zip(is_continuous, derivative)]
    def infer_shape(self, node, ins_shapes):
        # Columns come from the first input; rows are summed over inputs.
        def _get(l):
            return l[0]
        d = sum(map(_get, ins_shapes))
        return [(d, ins_shapes[0][1])]
def vstack(blocks, format=None, dtype=None):
    """Stack sparse matrices vertically (row wise).

    This wraps the vstack method from scipy.

    :param blocks: List of sparse arrays of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. Defaults to the upcast of the
                  blocks' dtypes.

    :return: The concatenation of the sparse arrays row wise.

    :note:
    - The number of columns of the sparse matrices must agree.
    - The grad implemented is regular, i.e. not structured.
    """
    blocks = [as_sparse_variable(i) for i in blocks]
    if dtype is None:
        # upcast takes dtypes as separate positional arguments, not a
        # single list, so the list must be unpacked.
        dtype = theano.scalar.upcast(*[i.dtype for i in blocks])
    return VStack(format=format, dtype=dtype)(*blocks)
class Remove0(gof.Op):
    """
    Remove explicit zeros from a sparse matrix, and resort indices.

    :param x: Sparse matrix.

    :return: `x` with its explicitly-stored zeros removed
             (same sparse type as the input).
    """
    def __init__(self, inplace=False, *args, **kwargs):
        gof.Op.__init__(self, *args, **kwargs)
        # When inplace, the input matrix itself is modified, so the op
        # declares that it destroys input 0.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}
    def __eq__(self, other):
        return type(self) == type(other) and self.inplace == other.inplace
    def __hash__(self):
        return 64153 ^ hash(type(self)) ^ hash(self.inplace)
    def __str__(self):
        l = []
        if self.inplace:
            l.append('inplace')
        return self.__class__.__name__ + '{%s}' % ', '.join(l)
    def make_node(self, x):
        return gof.Apply(self, [x], [x.type()])
    # NOTE: tuple parameters in the two signatures below are
    # Python-2-only syntax (removed by PEP 3113).
    def perform(self, node, (x,), (z,)):
        if self.inplace:
            c = x
        else:
            c = x.copy()
        # scipy's eliminate_zeros drops stored zeros in place.
        c.eliminate_zeros()
        z[0] = c
    def grad(self, (x,), (gz,)):
        # Removing stored zeros does not change values, so the
        # gradient passes through unchanged.
        return [gz]
    def infer_shape(self, node, i0_shapes):
        return i0_shapes
remove0 = Remove0()
# Probability
class Poisson(gof.op.Op):
    """Return a sparse matrix having random values from a Poisson density
    with mean from the input.

    :param x: Sparse matrix.

    :return: A sparse matrix of random integers of a Poisson density
             with mean of `x` element wise.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x):
        x = as_sparse_variable(x)
        return gof.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, outputs):
        # Explicit unpacking replaces Python-2-only tuple parameters
        # (PEP 3113).
        (x,) = inputs
        (out,) = outputs
        assert _is_sparse(x)
        out[0] = x.copy()
        # Sample each stored value; cast back since numpy.random.poisson
        # returns integers regardless of x's dtype.
        out[0].data = numpy.asarray(numpy.random.poisson(out[0].data),
                                    dtype=x.dtype)
        # Sampling can produce zeros: drop them from storage.
        out[0].eliminate_zeros()

    def grad(self, inputs, outputs_gradients):
        # Random sampling is not differentiable.
        return [None]

    def infer_shape(self, node, ins_shapes):
        return ins_shapes

    def __str__(self):
        return self.__class__.__name__

poisson = Poisson()
class Multinomial(gof.op.Op):
    """Return a sparse matrix having random values from a multinomial
    density having number of experiment `n` and probability of success
    `p`.

    :param n: Tensor type vector or scalar representing the number of
              experiment for each row. If `n` is a scalar, it will be
              used for each row.
    :param p: Sparse matrix of probability where each row is a probability
              vector representing the probability of success. N.B. Each row
              must sum to one.

    :return: A sparse matrix of random integers from a multinomial density
             for each row.

    :note: It will work only if `p` has csr format.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, n, p):
        n = tensor.as_tensor_variable(n)
        p = as_sparse_variable(p)
        return gof.Apply(self, [n, p], [p.type()])

    def perform(self, node, inputs, outputs):
        # Explicit unpacking replaces Python-2-only tuple parameters
        # (PEP 3113).
        (n, p) = inputs
        (out,) = outputs
        assert _is_sparse(p)
        if p.format != 'csr':
            # Was `raise NotImplemented()`: NotImplemented is a constant,
            # not an exception type, so that line raised a TypeError
            # instead of the intended error.
            raise NotImplementedError('Multinomial is only implemented '
                                      'for csr matrices.')
        out[0] = p.copy()
        if n.ndim == 0:
            # Scalar n: same number of experiments for every row.
            for i in xrange(p.shape[0]):
                k, l = p.indptr[i], p.indptr[i + 1]
                out[0].data[k:l] = numpy.random.multinomial(n, p.data[k:l])
        elif n.ndim == 1:
            if n.shape[0] != p.shape[0]:
                raise ValueError('The number of element of n must be '
                                 'the same as the number of row of p.')
            for i in xrange(p.shape[0]):
                k, l = p.indptr[i], p.indptr[i + 1]
                out[0].data[k:l] = numpy.random.multinomial(n[i], p.data[k:l])
        else:
            # Previously this case silently returned p unchanged.
            raise ValueError('n must be a scalar or a vector.')

    def grad(self, inputs, outputs_gradients):
        # Random sampling is not differentiable.
        return [None, None]

    def infer_shape(self, node, ins_shapes):
        # Output has p's shape.
        return [ins_shapes[1]]

    def __str__(self):
        return self.__class__.__name__

multinomial = Multinomial()
# Structured monoid
def structured_monoid(tensor_op):
"""Generic operation to perform many kinds of monoid element-wise
operations on the non-zeros of a sparse matrix.
The first parameter must always be a sparse matrix. The other parameters
must be scalars which will be passed as argument to the tensor_op.
"""
def decorator(f):
def wrapper(*args):
x = as_sparse_variable(args[0])
xs = [scalar.as_scalar(arg) for arg in args[1:]]
data, ind, ptr, shape = csm_properties(x)
data = tensor_op(data, *xs)
return CSM(x.format)(data, ind, ptr, shape)
return wrapper
return decorator
@structured_monoid(tensor.nnet.sigmoid)
def structured_sigmoid(x):
    """Structured elemwise sigmoid: applied only to the non-zero
    entries of sparse matrix `x`; the sparsity pattern is preserved.
    """
    # see decorator for function body
@structured_monoid(tensor.exp)
def structured_exp(x):
    """Structured elemwise exponential: applied only to the non-zero
    entries of sparse matrix `x`; the sparsity pattern is preserved.
    """
    # see decorator for function body
@structured_monoid(tensor.log)
def structured_log(x):
    """Structured elemwise logarithm: applied only to the non-zero
    entries of sparse matrix `x`; the sparsity pattern is preserved.
    """
    # see decorator for function body
@structured_monoid(tensor.pow)
def structured_pow(x, y):
    """Structured elemwise power of sparse matrix
    x by scalar y: only the non-zeros of `x` are raised to `y`.
    """
    # see decorator for function body
@structured_monoid(tensor.minimum)
def structured_minimum(x, y):
    """Structured elemwise minimum of sparse matrix
    x by scalar y, computed on the non-zeros of `x` only.
    """
    # see decorator for function body
@structured_monoid(tensor.maximum)
def structured_maximum(x, y):
    """Structured elemwise maximum of sparse matrix
    x by scalar y, computed on the non-zeros of `x` only.
    """
    # see decorator for function body
@structured_monoid(tensor.add)
def structured_add(x):
    """Structured addition of sparse matrix
    x and scalar y, applied to the non-zeros of `x` only.

    NOTE(review): the docstring mentions a scalar `y` but the signature
    only lists `x`. The decorator ignores this signature (its wrapper
    accepts *args), so callers may still pass (x, y) — confirm the
    intended arity.
    """
    # see decorator for function body
# Dot
class StructuredDot(gof.Op):
"""Structured Dot is like dot, except that only the gradient wrt non-zero
elements of the sparse matrix A are calculated and propagated.
The output is presumed to be a dense matrix, and is represented by a
TensorType instance.
"""
    def __eq__(self, other):
        # Parameterless op: all instances compare equal by type.
        return (type(self) == type(other))
    def __hash__(self):
        # Consistent with __eq__ above: hash on the type only.
        return hash(type(self))
    def __str__(self):
        # Display the op by its class name.
        return self.__class__.__name__
def make_node(self, a, b):
if not _is_sparse_variable(a):
raise TypeError('First argument must be of type SparseVariable '
'or SparseConstant')
dtype_out = scalar.upcast(a.type.dtype, b.type.dtype)
if b.type.ndim != 2: if b.type.ndim != 2:
raise NotImplementedError('non-matrix b') raise NotImplementedError('non-matrix b')
...@@ -1958,6 +2972,326 @@ class StructuredDotCSR(gof.Op): ...@@ -1958,6 +2972,326 @@ class StructuredDotCSR(gof.Op):
sd_csr = StructuredDotCSR() sd_csr = StructuredDotCSR()
class SamplingDot(gof.op.Op):
    """Operand for calculating the dot product dot(`x`, `y`.T) = `z` when you
    only want to calculate a subset of `z`.

    It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
    product, `x` and `y` operands of the dot product and `p` is a matrix that
    contains 1 when the corresponding element of `z` should be calculated
    and 0 when it shouldn't. Note that SamplingDot has a different interface
    than `dot` because SamplingDot requires `x` to be a `m`x`k` matrix while
    `y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.

    .. note::

        It will work if the pattern is not binary value, but if the
        pattern doesn't have a high sparsity proportion it will be slower
        then a more optimized dot followed by a normal elemwise
        multiplication.

    :param x: Tensor matrix.
    :param y: Tensor matrix.
    :param p: Sparse matrix in csr format.

    :return: A dense matrix containing the dot product of `x` by `y`.T only
             where `p` is 1.

    :note:
        - The grad implemented is regular, i.e. not structured.
    """
    def __eq__(self, other):
        # Parameterless op: all instances compare equal by type.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y, p):
        x = tensor.as_tensor_variable(x)
        y = tensor.as_tensor_variable(y)
        p = as_sparse_variable(p)

        if not _is_sparse_variable(p):
            raise TypeError(p)

        #TODO: use it.
        dtype_out = scalar.upcast(x.type.dtype, y.type.dtype, p.type.dtype)

        # The output has the same sparse type as the pattern `p`.
        return gof.Apply(self, [x, y, p], [p.type()])

    def perform(self, node, (x, y, p), (out,)):
        # `x` and `y` must be dense, `p` sparse.
        if _is_sparse(x):
            raise TypeError(x)

        if _is_sparse(y):
            raise TypeError(y)

        if not _is_sparse(p):
            raise TypeError(p)

        # p.multiply keeps only the entries of dot(x, y.T) where the
        # pattern is non-zero; rewrap the result in p's sparse class.
        out[0] = p.__class__(p.multiply(numpy.dot(x, y.T)))

    def grad(self, (x, y, p), (gz,)):
        # Regular (non-structured) gradient; the pattern `p` gets none.
        rval = [
            dot(p * gz, y),
            dot((p * gz).T, x),
            None
        ]

        return rval

    def infer_shape(self, node, ins_shapes):
        # The output has the shape of the pattern `p`.
        return [ins_shapes[2]]

    def __str__(self):
        return self.__class__.__name__

sampling_dot = SamplingDot()  # module-level singleton instance of the op
class SamplingDotCSR(gof.Op):
    """Operand optimized for calculating the dot product dot(`x`, `y`.T) = `z`
    when you only want to calculate a subset of `z`.

    It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
    product, `x` and `y` operands of the dot product and `p` is a matrix
    that contains 1 when the corresponding element of `z` should be calculated
    and 0 when it shouldn't. Note that SamplingDot has a different interface
    than `dot` because SamplingDot requires `x` to be a `m`x`k` matrix while
    `y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.

    .. note::

        It will work if the pattern is not binary value, but if the
        pattern doesn't have a high sparsity proportion it will be slower
        then a more optimized dot followed by a normal elemwise
        multiplication.

    :param x: Tensor matrix.
    :param y: Tensor matrix.
    :param p_data: Sparse matrix data.
    :param p_ind: Sparse matrix indices.
    :param p_ptr: Sparse matric indptr.
    :param p_ncols: Sparse matrix number of columns.

    :return: A dense matrix containing the dot product of `x` by `y`.T only
             where `p` is 1.

    :note:
        - If we have the input of mixed dtype, we insert cast elemwise
          in the graph to be able to call blas function as they don't
          allow mixed dtype.
        - This op is used as an optimization for SamplingDot.
    """
    def __eq__(self, other):
        # Parameterless op: all instances compare equal by type.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return 'SamplingDot{Csr}'

    def make_node(self, x, y, p_data, p_ind, p_ptr, p_ncols):
        x = tensor.as_tensor_variable(x)
        y = tensor.as_tensor_variable(y)
        p_data = tensor.as_tensor_variable(p_data)
        p_ind = tensor.as_tensor_variable(p_ind)
        p_ptr = tensor.as_tensor_variable(p_ptr)
        p_ncols = tensor.as_tensor_variable(p_ncols)

        # The generated C code reads p_ncols as npy_int32.
        assert p_ncols.dtype == 'int32'

        dtype_out = scalar.upcast(x.type.dtype, y.type.dtype,
                                  p_data.type.dtype)
        dot_out = scalar.upcast(x.type.dtype, y.type.dtype)

        # We call blas ?dot function that take only param of the same type
        x = tensor.cast(x, dot_out)
        y = tensor.cast(y, dot_out)

        # Outputs are the three CSR components (data, indices, indptr)
        # of the sampled result.
        return gof.Apply(self, [x, y, p_data, p_ind, p_ptr, p_ncols], [
            tensor.tensor(dtype=dtype_out, broadcastable=(False,)),
            tensor.tensor(dtype=p_ind.type.dtype, broadcastable=(False,)),
            tensor.tensor(dtype=p_ptr.type.dtype, broadcastable=(False,))
        ])

    def c_support_code(self):
        return blas.blas_header_text()

    def c_libraries(self):
        return blas.ldflags()

    def c_compile_args(self):
        return blas.ldflags(libs=False, flags=True)

    def c_lib_dirs(self):
        return blas.ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return blas.ldflags(libs=False, include_dir=True)

    def c_code(self, node, name, inputs, outputs, sub):
        x, y, p_data, p_ind, p_ptr, p_ncols = inputs
        z_data, z_ind, z_ptr = outputs
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for x')
        if node.inputs[1].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for y')
        if node.inputs[2].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError(
                'Complex types are not supported for pattern')

        dot_out = scalar.upcast(node.inputs[0].type.dtype,
                                node.inputs[1].type.dtype)

        # Pick the BLAS dot routine matching the upcast float type.
        if dot_out == "float32":
            conv_type = "float"
            cdot = "sdot_"
        else:
            conv_type = "double"
            cdot = "ddot_"

        # retrieve dtype number
        typenum_x = node.inputs[0].type.dtype_specs()[-1]
        typenum_y = node.inputs[1].type.dtype_specs()[-1]
        typenum_p = node.inputs[2].type.dtype_specs()[-1]
        typenum_zd = tensor.TensorType(node.outputs[0].dtype,
                                       []).dtype_specs()[-1]
        typenum_zi = tensor.TensorType(node.outputs[1].dtype,
                                       []).dtype_specs()[-1]
        typenum_zp = tensor.TensorType(node.outputs[2].dtype,
                                       []).dtype_specs()[-1]

        rval = """
        if (%(x)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;}
        if (%(y)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}

        if (%(x)s->descr->type_num != %(typenum_x)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for x");
            %(fail)s;}

        if (%(y)s->descr->type_num != %(typenum_y)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for y");
            %(fail)s;}

        if (%(p_data)s->descr->type_num != %(typenum_p)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for pattern");
            %(fail)s;}

        if (%(x)s->dimensions[1] != %(y)s->dimensions[1]) {
            PyErr_SetString(PyExc_NotImplementedError,
"x's number of columns doesn't match y's rows! Note: sampling_dot is different from dot because y is assumed to be transposed.");
            %(fail)s;}

        if (%(y)s->dimensions[0] != ((npy_int32 *)%(p_ncols)s->data)[0] ||
            %(x)s->dimensions[0] != (%(p_ptr)s->dimensions[0] - 1))
        {PyErr_SetString(PyExc_NotImplementedError,
"The dimension of the pattern and the output must match"); %(fail)s;}

        // Allocate output
        if (!%(z_data)s
            || (%(z_data)s->dimensions[0] != %(p_data)s->dimensions[0])
            || (%(z_data)s->descr->type_num != %(typenum_zd)s)) {
            {Py_XDECREF(%(z_data)s);}
            npy_intp dims[] = {0};
            dims[0] = %(p_data)s->dimensions[0];
            %(z_data)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
                                                            %(typenum_zd)s);
        }
        if (!%(z_ind)s
            || (%(z_ind)s->dimensions[0] != %(p_ind)s->dimensions[0])
            || (%(z_ind)s->descr->type_num != %(typenum_zi)s)) {
            {Py_XDECREF(%(z_ind)s);}
            npy_intp dims[] = {0};
            dims[0] = %(p_ind)s->dimensions[0];
            %(z_ind)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
                                                           %(typenum_zi)s);
        }
        if (!%(z_ptr)s
            || (%(z_ptr)s->dimensions[0] != %(p_ptr)s->dimensions[0])
            || (%(z_ptr)s->descr->type_num != %(typenum_zp)s)) {
            {Py_XDECREF(%(z_ptr)s);}
            npy_intp dims[] = {0};
            dims[0] = %(p_ptr)s->dimensions[0];
            %(z_ptr)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
                                                           %(typenum_zp)s);
        }

        {
            // Product of MxK and NxK, output MxN
            npy_intp M = %(x)s->dimensions[0];
            npy_intp N = %(y)s->dimensions[0];
            npy_intp K = %(y)s->dimensions[1];

            // pointers to access actual data in the arrays passed as params.
            const dtype_%(x)s* __restrict__ Dx = (dtype_%(x)s*)%(x)s->data;
            const dtype_%(y)s* __restrict__ Dy = (dtype_%(y)s*)%(y)s->data;
            const dtype_%(p_data)s* __restrict__ Dpd = (dtype_%(p_data)s*)%(p_data)s->data;
            const dtype_%(p_ind)s* __restrict__ Dpi = (dtype_%(p_ind)s*)%(p_ind)s->data;
            const dtype_%(p_ptr)s* __restrict__ Dpp = (dtype_%(p_ptr)s*)%(p_ptr)s->data;
            dtype_%(z_data)s* __restrict__ Dzd = (dtype_%(z_data)s*)%(z_data)s->data;
            dtype_%(z_ind)s* __restrict__ Dzi = (dtype_%(z_ind)s*)%(z_ind)s->data;
            dtype_%(z_ptr)s* __restrict__ Dzp = (dtype_%(z_ptr)s*)%(z_ptr)s->data;

            const npy_intp Sdx = %(x)s->strides[1]/%(x)s->descr->elsize;
            const npy_intp Sdy = %(y)s->strides[1]/%(y)s->descr->elsize;
            const npy_intp Sdpd = %(p_data)s->strides[0] / %(p_data)s->descr->elsize;
            const npy_intp Sdpi = %(p_ind)s->strides[0] / %(p_ind)s->descr->elsize;
            const npy_intp Sdpp = %(p_ptr)s->strides[0] / %(p_ptr)s->descr->elsize;
            const npy_intp Sdzd = %(z_data)s->strides[0] / %(z_data)s->descr->elsize;
            const npy_intp Sdzi = %(z_ind)s->strides[0] / %(z_ind)s->descr->elsize;
            const npy_intp Sdzp = %(z_ptr)s->strides[0] / %(z_ptr)s->descr->elsize;

            memcpy(Dzi, Dpi, %(p_ind)s->dimensions[0]*sizeof(dtype_%(p_ind)s));
            memcpy(Dzp, Dpp, %(p_ptr)s->dimensions[0]*sizeof(dtype_%(p_ptr)s));

            for (npy_int32 m = 0; m < M; ++m) {
                for (npy_int32 n_idx = Dpp[m * Sdpp]; n_idx < Dpp[(m+1)*Sdpp]; ++n_idx) {
                    const npy_int32 n = Dpi[n_idx * Sdpi]; // row index of non-null value for column K

                    const dtype_%(x)s* x_row = (dtype_%(x)s*)(%(x)s->data + %(x)s->strides[0] * m);

                    const dtype_%(y)s* y_col = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * n);

                    Dzd[n_idx * Sdzd] = Dpd[n_idx * Sdpd] * %(cdot)s((int*)&K, (const %(conv_type)s*)x_row, (int*)&Sdx, (const %(conv_type)s*)y_col, (int*)&Sdy);
                }
            }
        }
        """ % dict(locals(), **sub)

        return rval

sampling_dot_csr = SamplingDotCSR()  # module-level singleton instance
# register a specialization to replace StructuredDot -> StructuredDotCSx
@gof.local_optimizer([_structured_dot])
def local_structured_dot(node):
    """Rewrite a StructuredDot node into the format-specialized
    StructuredDotCSC/CSR op applied to the matrix's CSM properties."""
    if node.op == _structured_dot:
        a, b = node.inputs
        if a.type.format == 'csc':
            a_val, a_ind, a_ptr, a_shape = csm_properties(a)
            # CSC stores columns; sd_csc also needs the row count.
            a_nsparse = a_shape[0]
            return [sd_csc(a_val, a_ind, a_ptr, a_nsparse, b)]
        if a.type.format == 'csr':
            a_val, a_ind, a_ptr, a_shape = csm_properties(a)
            return [sd_csr(a_val, a_ind, a_ptr, b)]
    return False
# Commented out because
# a) it is only slightly faster than scipy these days, and sometimes a little
# slower, and
# b) the resulting graphs make it very difficult for an op to do size checking
# on the matrices involved. dimension mismatches are hard to detect sensibly.
#register_specialize(local_structured_dot)
def structured_dot_grad(sparse_A, dense_B, ga): def structured_dot_grad(sparse_A, dense_B, ga):
if sparse_A.type.format in ('csc', 'csr'): if sparse_A.type.format in ('csc', 'csr'):
......
...@@ -11,296 +11,27 @@ from theano.sparse.basic import ( ...@@ -11,296 +11,27 @@ from theano.sparse.basic import (
CSMProperties, CSM, register_specialize, CSMProperties, CSM, register_specialize,
_is_sparse_variable, _is_dense_variable, CSC, CSR, _is_sparse_variable, _is_dense_variable, CSC, CSR,
csm_properties, csm_data, csm_indices, csm_indptr, csm_shape, csm_properties, csm_data, csm_indices, csm_indptr, csm_shape,
_is_sparse, Remove0, remove0) _is_sparse,
from theano.sparse.sandbox.sp import sp_sum Remove0, remove0,
# To maintain compatibility
Cast, bcast, wcast, icast, lcast, fcast, dcast, ccast, zcast,
HStack, hstack, VStack, vstack,
AddSSData, add_s_s_data,
MulSDCSC, mul_s_d_csc, MulSDCSR, mul_s_d_csr,
Multinomial, multinomial, Poisson, poisson,
structured_monoid,
structured_sigmoid, structured_exp, structured_log, structured_pow,
structured_minimum, structured_maximum, structured_add,
MulSV, mul_s_v, MulSVCSR, mul_s_v_csr,
StructuredAddSV, structured_add_s_v,
StructuredAddSVCSR, structured_add_s_v_csr,
SamplingDot, sampling_dot, SamplingDotCSR, sampling_dot_csr)
# Alias to maintain compatibility
EliminateZeros = Remove0 EliminateZeros = Remove0
eliminate_zeros = remove0 eliminate_zeros = remove0
class Cast(gof.op.Op):
    """Cast sparse variable to the desired dtype.

    :param x: Sparse matrix.

    :return: Same as `x` but having `out_type` as dtype.

    :note:
        - The grad implemented is regular, i.e. not structured.
    """
    def __init__(self, out_type):
        # Target dtype (e.g. 'float64'); part of the op's identity.
        self.out_type = out_type

    def __eq__(self, other):
        return (type(self) == type(other)) and self.out_type == other.out_type

    def __hash__(self):
        return hash(type(self)) ^ hash(self.out_type)

    def make_node(self, x):
        x = as_sparse_variable(x)
        # Output keeps the input's sparse format; only the dtype changes.
        return gof.Apply(
            self, [x],
            [SparseType(dtype=self.out_type, format=x.format).make_variable()])

    def perform(self, node, (x, ), (out, )):
        assert _is_sparse(x)
        out[0] = x.astype(self.out_type)

    def grad(self, inputs, outputs_gradients):
        if inputs[0].dtype in tensor.continuous_dtypes:
            gz = outputs_gradients[0]
            # Cast the gradient back to the input's dtype.
            return [Cast(inputs[0].dtype)(gz)]
        else:
            # Discrete input: no gradient.
            return [None]

    def infer_shape(self, node, ins_shapes):
        # Casting is element-wise: the shape is unchanged.
        return ins_shapes

    def __str__(self):
        return "%s(%s)" % (self.__class__.__name__, self.out_type)
# Predefined Cast ops, named after NumPy's one-letter typecodes:
# b=int8, w=int16, i=int32, l=int64, f=float32, d=float64,
# c=complex64, z=complex128.
bcast = Cast('int8')
wcast = Cast('int16')
icast = Cast('int32')
lcast = Cast('int64')
fcast = Cast('float32')
dcast = Cast('float64')
ccast = Cast('complex64')
zcast = Cast('complex128')
class HStack(gof.op.Op):
    """Stack sparse matrices horizontally (column wise).

    :param blocks: Sequence of sparse array of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. Must be specified.

    :return: The concatenation of the sparse arrays column wise.

    :note:
        - The number of line of the sparse matrix must agree.
        - The grad implemented is regular, i.e. not structured.
    """
    def __init__(self, format=None, dtype=None):
        if format is None:
            self.format = 'csc'
        else:
            self.format = format

        if dtype is None:
            raise ValueError('The output dtype must be specified.')
        self.dtype = dtype

    def __eq__(self, other):
        # Identity is (type, format, dtype).
        return (type(self) == type(other) and
                self.format == other.format and
                self.dtype == other.dtype)

    def __hash__(self):
        return hash(type(self)) ^ hash(self.format) ^ hash(self.dtype)

    def make_node(self, *mat):
        if not mat:
            raise ValueError('Cannot join an empty list of sparses.')
        var = [as_sparse_variable(x) for x in mat]
        return gof.Apply(
            self, var,
            [SparseType(dtype=self.dtype, format=self.format).make_variable()])

    def perform(self, node, block, (out, )):
        for b in block:
            assert _is_sparse(b)
        # Delegate the actual stacking to scipy.
        out[0] = scipy.sparse.hstack(block, format=self.format,
                                     dtype=self.dtype)

    def grad(self, inputs, (gz, )):
        is_continuous = [(inputs[i].dtype in tensor.continuous_dtypes)
                         for i in range(len(inputs))]

        if _is_sparse_variable(gz):
            gz = sparse.DenseFromSparse()(gz)

        # Split the dense gradient along axis 1 back into per-input pieces.
        split = tensor.Split(len(inputs))(gz, 1,
                                          tensor.stack(
                                              *[x.shape[1]
                                                for x in inputs]))
        if not isinstance(split, list):
            split = [split]

        derivative = [sparse.SparseFromDense(self.format)(s) for s in split]

        def choose(continuous, derivative):
            # Discrete inputs get no gradient.
            if continuous:
                return derivative
            else:
                return None
        return [choose(c, d) for c, d in zip(is_continuous, derivative)]

    def infer_shape(self, node, ins_shapes):
        def _get(l):
            return l[1]
        # Columns add up; the row count is taken from the first input.
        d = sum(map(_get, ins_shapes))
        return [(ins_shapes[0][0], d)]

    def __str__(self):
        return "%s(%s,%s)" % (self.__class__.__name__, self.format, self.dtype)
def hstack(blocks, format=None, dtype=None):
    """Stack sparse matrices horizontally (column wise).

    This wrap the method hstack from scipy.

    :param blocks: List of sparse array of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. If None, the upcast of the blocks'
                  dtypes is used.

    :return: The concatenation of the sparse array column wise.

    :note:
        - The number of line of the sparse matrix must agree.
        - The grad implemented is regular, i.e. not structured.
    """
    blocks = [as_sparse_variable(i) for i in blocks]
    if dtype is None:
        # Bug fix: upcast takes the dtypes as separate arguments
        # (upcast(dtype, *dtypes)), not as a single list, so the list
        # comprehension must be unpacked.
        dtype = theano.scalar.upcast(*[i.dtype for i in blocks])
    return HStack(format=format, dtype=dtype)(*blocks)
class VStack(HStack):
    """Stack sparse matrices vertically (row wise).

    :param blocks: Sequence of sparse array of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. Must be specified.

    :return: The concatenation of the sparse arrays row wise.

    :note:
        - The number of column of the sparse matrix must agree.
        - The grad implemented is regular, i.e. not structured.
    """
    def perform(self, node, block, (out, )):
        for b in block:
            assert _is_sparse(b)
        # Delegate the actual stacking to scipy.
        out[0] = scipy.sparse.vstack(block, format=self.format,
                                     dtype=self.dtype)

    def grad(self, inputs, (gz, )):
        is_continuous = [(inputs[i].dtype in tensor.continuous_dtypes)
                         for i in range(len(inputs))]

        if _is_sparse_variable(gz):
            gz = sparse.DenseFromSparse()(gz)

        # Split the dense gradient along axis 0 back into per-input pieces.
        split = tensor.Split(len(inputs))(gz, 0,
                                          tensor.stack(
                                              *[x.shape[0]
                                                for x in inputs]))
        if not isinstance(split, list):
            split = [split]

        derivative = [sparse.SparseFromDense(self.format)(s) for s in split]

        def choose(continuous, derivative):
            # Discrete inputs get no gradient.
            if continuous:
                return derivative
            else:
                return None
        return [choose(c, d) for c, d in zip(is_continuous, derivative)]

    def infer_shape(self, node, ins_shapes):
        def _get(l):
            return l[0]
        # Rows add up; the column count is taken from the first input.
        d = sum(map(_get, ins_shapes))
        return [(d, ins_shapes[0][1])]
def vstack(blocks, format=None, dtype=None):
    """Stack sparse matrices vertically (row wise).

    This wrap the method vstack from scipy.

    :param blocks: List of sparse array of compatible shape.
    :param format: String representing the output format. Default
                   is csc.
    :param dtype: Output dtype. If None, the upcast of the blocks'
                  dtypes is used.

    :return: The concatenation of the sparse array row wise.

    :note:
        - The number of column of the sparse matrix must agree.
        - The grad implemented is regular, i.e. not structured.
    """
    blocks = [as_sparse_variable(i) for i in blocks]
    if dtype is None:
        # Bug fix: upcast takes the dtypes as separate arguments
        # (upcast(dtype, *dtypes)), not as a single list, so the list
        # comprehension must be unpacked.
        dtype = theano.scalar.upcast(*[i.dtype for i in blocks])
    return VStack(format=format, dtype=dtype)(*blocks)
class AddSSData(gof.op.Op):
    """Add two sparse matrices assuming they have the same sparsity
    pattern.

    :param x: Sparse matrix.
    :param y: Sparse matrix.

    :return: The sum of the two sparse matrix element wise.

    :note:
        - `x` and `y` are assumed to have the same sparsity pattern.
        - The grad implemented is structured.
    """
    def __eq__(self, other):
        # Parameterless op: all instances compare equal by type.
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y):
        x, y = map(as_sparse_variable, [x, y])
        if x.type.dtype != y.type.dtype:
            raise NotImplementedError()
        if x.type.format != y.type.format:
            raise NotImplementedError()
        return gof.Apply(self,
                         [x, y],
                         [SparseType(dtype=x.type.dtype,
                                     format=x.type.format).make_variable()])

    def perform(self, node, (x, y), (out, )):
        assert _is_sparse(x) and _is_sparse(y)
        assert x.shape == y.shape
        # Identical sparsity patterns means the data vectors line up,
        # so the sum can be done directly on them.
        assert x.data.shape == y.data.shape
        out[0] = x.copy()
        out[0].data += y.data

    def grad(self, inputs, (gz, )):
        # Structured gradient: each continuous input receives gz directly.
        is_continuous = [(i.dtype in sparse.continuous_dtypes)
                         for i in inputs]
        derivative = {True: gz, False: None}
        return [derivative[b] for b in is_continuous]

    def infer_shape(self, node, ins_shapes):
        # Both inputs share a shape; return the first one's.
        return [ins_shapes[0]]

    def __str__(self):
        return self.__class__.__name__

add_s_s_data = AddSSData()  # module-level singleton instance of the op
# register a specialization to replace MulSD -> MulSDCSX # register a specialization to replace MulSD -> MulSDCSX
@gof.local_optimizer([mul_s_d]) @gof.local_optimizer([mul_s_d])
def local_mul_s_d(node): def local_mul_s_d(node):
...@@ -338,335 +69,6 @@ def local_mul_s_d(node): ...@@ -338,335 +69,6 @@ def local_mul_s_d(node):
register_specialize(local_mul_s_d) register_specialize(local_mul_s_d)
class MulSDCSC(gof.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type matrix.

    :return: The multiplication of the two matrix element wise.

    :note:
        - `a_data`, `a_indices` and `a_indptr` must be the properties
          of a sparse matrix in csc format.
        - The dtype of `a_data`, i.e. the dtype of the sparse matrix,
          cannot be a complex type.
        - This op is used as an optimization of mul_s_d.
    """
    def __eq__(self, other):
        # Parameterless op: all instances compare equal by type.
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, a_data, a_indices, a_indptr, b):
        assert b.type.ndim == 2
        # Output is the new data vector of the product (1-d tensor).
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                               [tensor.tensor(b.dtype, (False,))])

    def c_code_cache_version(self):
        return (1,)

    #def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
    #    return NotImplementedError()

    def c_code(self, node, name, (_data, _indices, _indptr, _b,),
               (_zout, ), sub):
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')

        return """
        if (%(_b)s->nd != 2) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
            %(fail)s;}
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;}
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;}
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;}

        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}

        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}

        if (!%(_zout)s)
        {
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                  %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }

        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
        {
            PyErr_SetString(PyExc_NotImplementedError,
    "somehow _zout got the wrong size.. and I don't know how to resize it.");
            %(fail)s;
        }

        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N = %(_indptr)s->dimensions[0]-1;
            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;

            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;

            const npy_intp Sb = %(_b)s->strides[0];

            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];

                    // extract i-th row of dense matrix
                    const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * i);

                    // write resulting gradient to sparse output
                    zout[i_idx] = data[i_idx] * b_row[j];
                }
            }
        }
        """ % dict(locals(), **sub)

    def __str__(self):
        return self.__class__.__name__

mul_s_d_csc = MulSDCSC()  # module-level singleton instance of the op
class MulSDCSR(gof.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type matrix.

    :return: The multiplication of the two matrix element wise.

    :note:
        - `a_data`, `a_indices` and `a_indptr` must be the properties
          of a sparse matrix in csr format.
        - The dtype of `a_data`, i.e. the dtype of the sparse matrix,
          cannot be a complex type.
        - This op is used as an optimization of mul_s_d.
    """
    def __eq__(self, other):
        # Parameterless op: all instances compare equal by type.
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, a_data, a_indices, a_indptr, b):
        assert b.type.ndim == 2
        # Output is the new data vector of the product (1-d tensor).
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                               [tensor.tensor(b.dtype, (False,))])

    def c_code_cache_version(self):
        return (1,)

    #def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
    #    return NotImplemented()

    def c_code(self, node, name, (_data, _indices, _indptr, _b,),
               (_zout, ), sub):
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')

        return """
        if (%(_b)s->nd != 2) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2");
            %(fail)s;}
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;}
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;}
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;}

        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}

        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}

        if (!%(_zout)s)
        {
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                  %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }

        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
        {
            PyErr_SetString(PyExc_NotImplementedError,
    "somehow _zout got the wrong size.. and I don't know how to resize it.");
            %(fail)s;
        }

        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N = %(_indptr)s->dimensions[0]-1;
            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;

            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;

            const npy_intp Sb = %(_b)s->strides[0];

            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // extract i-th row of dense matrix
                const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * j);

                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];

                    // write resulting gradient to sparse output
                    zout[i_idx] = data[i_idx] * b_row[i];
                }
            }
        }
        """ % dict(locals(), **sub)

    def __str__(self):
        return self.__class__.__name__

mul_s_d_csr = MulSDCSR()  # module-level singleton instance of the op
class Poisson(gof.op.Op):
    """Return a sparse having random values from a Poisson density
    with mean from the input.

    :param x: Sparse matrix.

    :return: A sparse matrix of random integers of a Poisson density
             with mean of `x` element wise.
    """
    def __eq__(self, other):
        # Parameterless op: all instances compare equal by type.
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x):
        x = as_sparse_variable(x)
        # Output has the same sparse type as the input.
        return gof.Apply(self, [x], [x.type()])

    def perform(self, node, (x, ), (out, )):
        assert _is_sparse(x)
        out[0] = x.copy()
        out[0].data = numpy.asarray(numpy.random.poisson(out[0].data),
                                    dtype=x.dtype)
        # Sampling can produce zeros; drop them from the sparse structure.
        out[0].eliminate_zeros()

    def grad(self, inputs, outputs_gradients):
        # Random sampling is not differentiable.
        return [None]

    def infer_shape(self, node, ins_shapes):
        # Sampling is element-wise: output shape == input shape.
        return ins_shapes

    def __str__(self):
        return self.__class__.__name__

poisson = Poisson()  # module-level singleton instance of the op
class Multinomial(gof.op.Op):
"""Return a sparse matrix having random values from a multinomial
density having number of experiment `n` and probability of succes
`p`.
:param n: Tensor type vector or scalar representing the number of
experiment for each row. If `n` is a scalar, it will be
used for each row.
:param p: Sparse matrix of probability where each row is a probability
vector representing the probability of succes. N.B. Each row
must sum to one.
:return: A sparse matrix of random integers from a multinomial density
for each row.
:note: It will works only if `p` have csr format.
"""
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, n, p):
n = tensor.as_tensor_variable(n)
p = as_sparse_variable(p)
return gof.Apply(self, [n, p], [p.type()])
def perform(self, node, (n, p), (out, )):
assert _is_sparse(p)
if p.format != 'csr':
raise NotImplemented()
out[0] = p.copy()
if n.ndim == 0:
for i in xrange(p.shape[0]):
k, l = p.indptr[i], p.indptr[i + 1]
out[0].data[k:l] = numpy.random.multinomial(n, p.data[k:l])
elif n.ndim == 1:
if n.shape[0] != p.shape[0]:
raise ValueError('The number of element of n must be '
'the same as the number of row of p.')
for i in xrange(p.shape[0]):
k, l = p.indptr[i], p.indptr[i + 1]
out[0].data[k:l] = numpy.random.multinomial(n[i], p.data[k:l])
def grad(self, inputs, outputs_gradients):
return [None, None]
def infer_shape(self, node, ins_shapes):
return [ins_shapes[1]]
def __str__(self):
return self.__class__.__name__
multinomial = Multinomial()
class Binomial(gof.op.Op): class Binomial(gof.op.Op):
# TODO This op is not an equivalent of numpy.random.binomial. In # TODO This op is not an equivalent of numpy.random.binomial. In
# facts, this does not follow a binomial distribution at all. # facts, this does not follow a binomial distribution at all.
...@@ -728,241 +130,6 @@ csr_dbinomial = Binomial('csr', 'float64') ...@@ -728,241 +130,6 @@ csr_dbinomial = Binomial('csr', 'float64')
csc_dbinomial = Binomial('csc', 'float64') csc_dbinomial = Binomial('csc', 'float64')
def structured_monoid(tensor_op):
    """Generic operation to perform many kinds of monoid element-wise
    operations on the non-zeros of a sparse matrix.

    The first parameter must always be a sparse matrix. The other parameters
    must be scalars which will be passed as argument to the tensor_op.
    """
    def decorator(f):
        def wrapper(*args):
            x = as_sparse_variable(args[0])

            xs = [scalar.as_scalar(arg) for arg in args[1:]]

            data, ind, ptr, shape = csm_properties(x)

            # Apply the scalar op only to the non-zero values; the sparsity
            # structure (indices, indptr, shape) is left untouched.
            data = tensor_op(data, *xs)

            return CSM(x.format)(data, ind, ptr, shape)
        # Fix: preserve the decorated function's metadata so the wrapped
        # functions keep their name and docstring (same effect as
        # functools.wraps, without requiring an extra import).
        wrapper.__name__ = f.__name__
        if f.__doc__:
            wrapper.__doc__ = f.__doc__
        return wrapper
    return decorator
@structured_monoid(tensor.nnet.sigmoid)
def structured_sigmoid(x):
    """Structured elemwise sigmoid: applied only to the non-zero
    entries of sparse matrix `x`; the sparsity pattern is preserved.
    """
    # see decorator for function body
@structured_monoid(tensor.exp)
def structured_exp(x):
    """Structured elemwise exponential: applied only to the non-zero
    entries of sparse matrix `x`; the sparsity pattern is preserved.
    """
    # see decorator for function body
@structured_monoid(tensor.log)
def structured_log(x):
    """Structured elemwise logarithm: applied only to the non-zero
    entries of sparse matrix `x`; the sparsity pattern is preserved.
    """
    # see decorator for function body
@structured_monoid(tensor.pow)
def structured_pow(x, y):
    """Structured elemwise power of sparse matrix
    x by scalar y: only the non-zeros of `x` are raised to `y`.
    """
    # see decorator for function body
@structured_monoid(tensor.minimum)
def structured_minimum(x, y):
    """Structured elemwise minimum of the non-zero elements of the
    sparse matrix `x` and the scalar `y`.
    """
    # The body is supplied by the structured_monoid decorator.
@structured_monoid(tensor.maximum)
def structured_maximum(x, y):
    """Structured elemwise maximum of the non-zero elements of the
    sparse matrix `x` and the scalar `y`.
    """
    # The body is supplied by the structured_monoid decorator.
@structured_monoid(tensor.add)
def structured_add(x):
    """Structured addition: the scalar is added only to the non-zero
    elements of the sparse matrix `x`.

    NOTE(review): the declared signature lists only `x`, but the
    decorator's wrapper accepts extra arguments, so callers pass the
    scalar as a second argument -- confirm intended signature.
    """
    # The body is supplied by the structured_monoid decorator.
class MulSV(gof.op.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise.

    :param x: Sparse matrix to multiply.
    :param y: Tensor broadcastable vector.

    :return: The product x * y element wise, as a sparse matrix of the
             same format as `x`.

    :note:
    - The grad implemented is regular, i.e. not structured.
    """
    # Parameter-less op: all instances are interchangeable, so equality
    # and hash are based on type only.
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y):
        x = as_sparse_variable(x)
        y = tensor.as_tensor_variable(y)

        assert y.type.ndim == 1

        # Mixed dtypes are not supported.
        if x.type.dtype != y.type.dtype:
            raise NotImplementedError()
        return gof.Apply(self,
                         [x, y],
                         [SparseType(dtype=x.type.dtype,
                                     format=x.type.format).make_variable()])

    def perform(self, node, (x, y), (out, )):
        assert _is_sparse(x) and not _is_sparse(y)
        # y must broadcast along the rows of x.
        assert x.shape[1] == y.shape[0]
        # Densify, multiply with numpy broadcasting, then convert back
        # to the same scipy.sparse class as the input.
        out[0] = x.__class__(x.toarray() * y)

    def grad(self, (x, y), (gz,)):
        assert _is_sparse_variable(x) and _is_dense_variable(y)
        assert _is_sparse_variable(gz)
        # d/dx = gz * y (elemwise); d/dy = column-sum of x * gz.
        return mul_s_v(gz, y), sp_sum(x * gz, axis=0, sparse_grad=True)

    def infer_shape(self, node, ins_shapes):
        # Output shape equals the sparse input's shape.
        return [ins_shapes[0]]

    def __str__(self):
        return self.__class__.__name__
mul_s_v = MulSV()
class MulSVCSR(gof.Op):
    """Multiplication of sparse matrix by a broadcasted dense vector
    element wise, operating directly on the CSR components.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type vector (make_node asserts ndim == 1).

    :return: The data vector of the elemwise product.

    :note:
    - `a_data`, `a_indices` and `a_indptr` must be the properties
      of a sparse matrix in csr format.
    - The dtype of `a_data`, i.e. the dtype of the sparse matrix,
      cannot be a complex type.
    - This op is used as an optimization of MulSV.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, a_data, a_indices, a_indptr, b):
        # NOTE(review): unlike StrucutedAddSVCSR below, the inputs are
        # not passed through tensor.as_tensor_variable here -- confirm
        # callers always provide variables.
        assert b.type.ndim == 1
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                               [tensor.tensor(b.dtype, (False,))])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        _data, _indices, _indptr, _b, = inputs
        _zout, = outputs
        # The C kernel below does not handle complex dtypes.
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')

        return """
        if (%(_b)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
            %(fail)s;
        }
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;
        }
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;
        }
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;
        }

        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}

        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}

        if (!%(_zout)s
            || %(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]
            || !PyArray_ISCONTIGUOUS(%(_zout)s))
        {
            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                    %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }

        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N = %(_indptr)s->dimensions[0]-1;

            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;

            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;

            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;

            const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;

            // loop over rows
            for (npy_int32 j = 0; j < N; ++j)
            {
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];

                    zout[i_idx] = data[i_idx] * Db[i * Sb];
                }
            }
        }
        """ % dict(locals(), **sub)

    def __str__(self):
        return self.__class__.__name__
mul_s_v_csr = MulSVCSR()
@gof.local_optimizer([mul_s_v]) @gof.local_optimizer([mul_s_v])
def local_mul_s_v(node): def local_mul_s_v(node):
if node.op == mul_s_v: if node.op == mul_s_v:
...@@ -995,181 +162,6 @@ def local_mul_s_v(node): ...@@ -995,181 +162,6 @@ def local_mul_s_v(node):
register_specialize(local_mul_s_v) register_specialize(local_mul_s_v)
class StructuredAddSV(gof.op.Op):
    """Structured addition of a sparse matrix and a dense vector.
    The elements of the vector are only added to the corresponding
    non-zero elements of the sparse matrix. Therefore, this operation
    outputs another sparse matrix.

    :param x: Sparse matrix.
    :param y: Tensor type vector.

    :return: A sparse matrix containing the addition of the vector to
             the data of the sparse matrix.

    :note: The grad implemented is structured since the op is structured.
    """
    # Parameter-less op: equality and hash by type only.
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y):
        x = as_sparse_variable(x)
        y = tensor.as_tensor_variable(y)

        assert y.type.ndim == 1

        # Mixed dtypes are not supported.
        if x.type.dtype != y.type.dtype:
            raise NotImplementedError()
        return gof.Apply(self,
                         [x, y],
                         [SparseType(dtype=x.type.dtype,
                                     format=x.type.format).make_variable()])

    def perform(self, node, (x, y), (out, )):
        assert _is_sparse(x) and not _is_sparse(y)
        assert x.shape[1] == y.shape[0]
        # (x.toarray() != 0) masks y so it is added only at positions
        # where x has a stored non-zero.
        out[0] = x.__class__(x + (x.toarray() != 0) * y)

    def grad(self, (x, y), (gz,)):
        assert _is_sparse_variable(x) and not _is_sparse_variable(y)
        assert _is_sparse_variable(gz)
        # Structured grad: gz flows straight through to x; y gets the
        # column-sum of gz.
        return gz, sp_sum(gz, axis=0, sparse_grad=True)

    def infer_shape(self, node, ins_shapes):
        return [ins_shapes[0]]

    def __str__(self):
        return self.__class__.__name__
structured_add_s_v = StructuredAddSV()
# NOTE(review): class name has a typo ("Strucuted"); kept as-is because
# it is part of the public interface.
class StrucutedAddSVCSR(gof.Op):
    """Structured addition of a sparse matrix and a dense vector,
    operating directly on the CSR components.
    The elements of the vector are only added to the corresponding
    non-zero elements of the sparse matrix. Therefore, this operation
    outputs another sparse matrix.

    :param a_data: Sparse matrix data.
    :param a_indices: Sparse matrix indices.
    :param a_indptr: Sparse matrix indptr.
    :param b: Tensor type vector.

    :return: A sparse matrix containing the addition of the vector to
             the data of the sparse matrix.

    :note:
    - The a_* are the properties of a sparse matrix in csr
      format.
    - This op is used as an optimization for StructuredAddSV.
    """
    def __eq__(self, other):
        return (type(self) == type(other))

    def __hash__(self):
        return hash(type(self))

    def make_node(self, a_data, a_indices, a_indptr, b):
        b = tensor.as_tensor_variable(b)
        a_data = tensor.as_tensor_variable(a_data)
        a_indices = tensor.as_tensor_variable(a_indices)
        a_indptr = tensor.as_tensor_variable(a_indptr)

        # All CSR components and the vector must be 1-d.
        assert a_data.type.ndim == 1
        assert a_indices.type.ndim == 1
        assert a_indptr.type.ndim == 1
        assert b.type.ndim == 1
        return gof.Apply(self, [a_data, a_indices, a_indptr, b],
                               [tensor.tensor(b.dtype, (False,))])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        _data, _indices, _indptr, _b, = inputs
        _zout, = outputs
        # The C kernel below does not handle complex dtypes.
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for a')
        if node.inputs[3].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for b')

        return """
        if (%(_b)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 1");
            %(fail)s;
        }
        if (%(_data)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(data) != 1");
            %(fail)s;
        }
        if (%(_indices)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1");
            %(fail)s;
        }
        if (%(_indptr)s->nd != 1) {
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;
        }

        if( %(_indices)s->descr->type_num != PyArray_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}

        if( %(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}

        if (!%(_zout)s)
        {
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
                    %(_indices)s->dimensions, %(_b)s->descr->type_num);
        }
        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
        {
            PyErr_SetString(PyExc_NotImplementedError,
     "somehow _zout got the wrong size.. and I don't know how to resize it.");
            %(fail)s;
        }

        { //makes it compile even though labels jump over variable definitions.
            const npy_intp nnz = %(_indices)s->dimensions[0];
            //TODO: error checking with this
            const npy_intp N = %(_indptr)s->dimensions[0]-1;

            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;

            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;

            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;

            const npy_intp Sb = %(_b)s->strides[0] / %(_b)s->descr->elsize;

            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
                {
                    // extract row index of non-null value
                    npy_int32 i = indices[i_idx];

                    // write resulting gradient to sparse output
                    zout[i_idx] = data[i_idx] + Db[i * Sb];
                }
            }
        }
        """ % dict(locals(), **sub)

    def __str__(self):
        return self.__class__.__name__
structured_add_s_v_csr = StrucutedAddSVCSR()
@gof.local_optimizer([structured_add_s_v]) @gof.local_optimizer([structured_add_s_v])
def local_structured_add_s_v(node): def local_structured_add_s_v(node):
if node.op == structured_add_s_v: if node.op == structured_add_s_v:
...@@ -1203,304 +195,6 @@ def local_structured_add_s_v(node): ...@@ -1203,304 +195,6 @@ def local_structured_add_s_v(node):
register_specialize(local_structured_add_s_v) register_specialize(local_structured_add_s_v)
class SamplingDot(gof.op.Op):
    """Operand for calculating the dot product dot(`x`, `y`.T) = `z` when you
    only want to calculate a subset of `z`.

    It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
    product, `x` and `y` operands of the dot product and `p` is a matrix that
    contains 1 when the corresponding element of `z` should be calculated
    and 0 when it shouldn't. Note that SamplingDot has a different interface
    than `dot` because SamplingDot requires `x` to be a `m`x`k` matrix while
    `y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.

    .. note::

        It will work if the pattern is not binary value, but if the
        pattern doesn't have a high sparsity proportion it will be slower
        then a more optimized dot followed by a normal elemwise
        multiplication.

    :param x: Tensor matrix.
    :param y: Tensor matrix.
    :param p: Sparse matrix in csr format.

    :return: A sparse matrix (same class as `p`) containing the dot
             product of `x` by `y`.T only where `p` is non-zero.

    :note:
    - The grad implemented is regular, i.e. not structured.
    """
    # Parameter-less op: equality and hash by type only.
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, x, y, p):
        x = tensor.as_tensor_variable(x)
        y = tensor.as_tensor_variable(y)
        p = sparse.as_sparse_variable(p)

        if not _is_sparse_variable(p):
            raise TypeError(p)

        #TODO: use it.
        dtype_out = scalar.upcast(x.type.dtype, y.type.dtype, p.type.dtype)

        # Output has the same sparse type as the pattern p.
        return gof.Apply(self, [x, y, p], [p.type()])

    def perform(self, node, (x, y, p), (out,)):
        # x and y must be dense, p must be sparse.
        if _is_sparse(x):
            raise TypeError(x)

        if _is_sparse(y):
            raise TypeError(y)

        if not _is_sparse(p):
            raise TypeError(p)

        # Full dense dot, then masked by the sparse pattern p.
        out[0] = p.__class__(p.multiply(numpy.dot(x, y.T)))

    def grad(self, (x, y, p), (gz,)):
        rval = [
            dot(p * gz, y),
            dot((p * gz).T, x),
            None  # p is a selection pattern: no gradient through it.
        ]

        return rval

    def infer_shape(self, node, ins_shapes):
        # Output shape equals the pattern p's shape.
        return [ins_shapes[2]]

    def __str__(self):
        return self.__class__.__name__
sampling_dot = SamplingDot()
class SamplingDotCsr(gof.Op):
    """Operand optimized for calculating the dot product dot(`x`, `y`.T) = `z`
    when you only want to calculate a subset of `z`.

    It is equivalent to `p` o (`x` . `y`.T) where o is the element-wise
    product, `x` and `y` operands of the dot product and `p` is a matrix
    that contains 1 when the corresponding element of `z` should be calculated
    and 0 when it shouldn't. Note that SamplingDot has a different interface
    than `dot` because SamplingDot requires `x` to be a `m`x`k` matrix while
    `y` is a `n`x`k` matrix instead of the usual `k`x`n` matrix.

    .. note::

        It will work if the pattern is not binary value, but if the
        pattern doesn't have a high sparsity proportion it will be slower
        then a more optimized dot followed by a normal elemwise
        multiplication.

    :param x: Tensor matrix.
    :param y: Tensor matrix.
    :param p_data: Sparse matrix data.
    :param p_ind: Sparse matrix indices.
    :param p_ptr: Sparse matric indptr.
    :param p_ncols: Sparse matrix number of columns.

    :return: The (data, indices, indptr) vectors of the CSR result, i.e.
             the dot product of `x` by `y`.T only where `p` is non-zero.

    :note:
    - If we have the input of mixed dtype, we insert cast elemwise
      in the graph to be able to call blas function as they don't
      allow mixed dtype.
    - This op is used as an optimization for SamplingDot.
    """
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return 'SamplingDot{Csr}'

    def make_node(self, x, y, p_data, p_ind, p_ptr, p_ncols):
        x = tensor.as_tensor_variable(x)
        y = tensor.as_tensor_variable(y)
        p_data = tensor.as_tensor_variable(p_data)
        p_ind = tensor.as_tensor_variable(p_ind)
        p_ptr = tensor.as_tensor_variable(p_ptr)
        p_ncols = tensor.as_tensor_variable(p_ncols)

        assert p_ncols.dtype == 'int32'

        dtype_out = scalar.upcast(x.type.dtype, y.type.dtype,
                                  p_data.type.dtype)
        dot_out = scalar.upcast(x.type.dtype, y.type.dtype)

        # We call blas ?dot function that take only param of the same type
        x = tensor.cast(x, dot_out)
        y = tensor.cast(y, dot_out)

        return gof.Apply(self, [x, y, p_data, p_ind, p_ptr, p_ncols], [
            tensor.tensor(dtype=dtype_out, broadcastable=(False,)),
            tensor.tensor(dtype=p_ind.type.dtype, broadcastable=(False,)),
            tensor.tensor(dtype=p_ptr.type.dtype, broadcastable=(False,))
        ])

    # BLAS build configuration: headers, libraries and flags needed by
    # the generated C code's sdot_/ddot_ calls.
    def c_support_code(self):
        return blas.blas_header_text()

    def c_libraries(self):
        return blas.ldflags()

    def c_compile_args(self):
        return blas.ldflags(libs=False, flags=True)

    def c_lib_dirs(self):
        return blas.ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return blas.ldflags(libs=False, include_dir=True)

    def c_code(self, node, name, inputs, outputs, sub):
        x, y, p_data, p_ind, p_ptr, p_ncols = inputs
        z_data, z_ind, z_ptr = outputs
        # The BLAS kernel below does not handle complex dtypes.
        if node.inputs[0].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for x')
        if node.inputs[1].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for y')
        if node.inputs[2].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError(
                'Complex types are not supported for pattern')

        # Select the BLAS dot routine matching the upcast dtype.
        dot_out = scalar.upcast(node.inputs[0].type.dtype,
                                node.inputs[1].type.dtype)

        if dot_out == "float32":
            conv_type = "float"
            cdot = "sdot_"
        else:
            conv_type = "double"
            cdot = "ddot_"

        # retrieve dtype number
        typenum_x = node.inputs[0].type.dtype_specs()[-1]
        typenum_y = node.inputs[1].type.dtype_specs()[-1]
        typenum_p = node.inputs[2].type.dtype_specs()[-1]
        typenum_zd = tensor.TensorType(node.outputs[0].dtype,
                                       []).dtype_specs()[-1]
        typenum_zi = tensor.TensorType(node.outputs[1].dtype,
                                       []).dtype_specs()[-1]
        typenum_zp = tensor.TensorType(node.outputs[2].dtype,
                                       []).dtype_specs()[-1]

        rval = """
        if (%(x)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;}
        if (%(y)s->nd != 2) {
PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}

        if (%(x)s->descr->type_num != %(typenum_x)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for x");
            %(fail)s;}

        if (%(y)s->descr->type_num != %(typenum_y)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for y");
            %(fail)s;}

        if (%(p_data)s->descr->type_num != %(typenum_p)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for pattern");
            %(fail)s;}

        if (%(x)s->dimensions[1] != %(y)s->dimensions[1]) {
            PyErr_SetString(PyExc_NotImplementedError,
              "x's number of columns doesn't match y's rows! Note: sampling_dot is different from dot because y is assumed to be transposed.");
            %(fail)s;}

        if (%(y)s->dimensions[0] != ((npy_int32 *)%(p_ncols)s->data)[0] ||
            %(x)s->dimensions[0] != (%(p_ptr)s->dimensions[0] - 1))
        {PyErr_SetString(PyExc_NotImplementedError,
        "The dimension of the pattern and the output must match"); %(fail)s;}

        // Allocate output
        if (!%(z_data)s
            || (%(z_data)s->dimensions[0] != %(p_data)s->dimensions[0])
            || (%(z_data)s->descr->type_num != %(typenum_zd)s)) {
            {Py_XDECREF(%(z_data)s);}
            npy_intp dims[] = {0};
            dims[0] = %(p_data)s->dimensions[0];
            %(z_data)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
                                                            %(typenum_zd)s);
        }
        if (!%(z_ind)s
            || (%(z_ind)s->dimensions[0] != %(p_ind)s->dimensions[0])
            || (%(z_ind)s->descr->type_num != %(typenum_zi)s)) {
            {Py_XDECREF(%(z_ind)s);}
            npy_intp dims[] = {0};
            dims[0] = %(p_ind)s->dimensions[0];
            %(z_ind)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
                                                           %(typenum_zi)s);
        }
        if (!%(z_ptr)s
            || (%(z_ptr)s->dimensions[0] != %(p_ptr)s->dimensions[0])
            || (%(z_ptr)s->descr->type_num != %(typenum_zp)s)) {
            {Py_XDECREF(%(z_ptr)s);}
            npy_intp dims[] = {0};
            dims[0] = %(p_ptr)s->dimensions[0];
            %(z_ptr)s = (PyArrayObject*) PyArray_SimpleNew(1, dims,
                                                           %(typenum_zp)s);
        }

        {
            // Product of MxK and NxK, output MxN
            npy_intp M = %(x)s->dimensions[0];
            npy_intp N = %(y)s->dimensions[0];
            npy_intp K = %(y)s->dimensions[1];

            // pointers to access actual data in the arrays passed as params.
            const dtype_%(x)s* __restrict__ Dx = (dtype_%(x)s*)%(x)s->data;
            const dtype_%(y)s* __restrict__ Dy = (dtype_%(y)s*)%(y)s->data;
            const dtype_%(p_data)s* __restrict__ Dpd = (dtype_%(p_data)s*)%(p_data)s->data;
            const dtype_%(p_ind)s* __restrict__ Dpi = (dtype_%(p_ind)s*)%(p_ind)s->data;
            const dtype_%(p_ptr)s* __restrict__ Dpp = (dtype_%(p_ptr)s*)%(p_ptr)s->data;
            dtype_%(z_data)s* __restrict__ Dzd = (dtype_%(z_data)s*)%(z_data)s->data;
            dtype_%(z_ind)s* __restrict__ Dzi = (dtype_%(z_ind)s*)%(z_ind)s->data;
            dtype_%(z_ptr)s* __restrict__ Dzp = (dtype_%(z_ptr)s*)%(z_ptr)s->data;

            const npy_intp Sdx = %(x)s->strides[1]/%(x)s->descr->elsize;
            const npy_intp Sdy = %(y)s->strides[1]/%(y)s->descr->elsize;
            const npy_intp Sdpd = %(p_data)s->strides[0] / %(p_data)s->descr->elsize;
            const npy_intp Sdpi = %(p_ind)s->strides[0] / %(p_ind)s->descr->elsize;
            const npy_intp Sdpp = %(p_ptr)s->strides[0] / %(p_ptr)s->descr->elsize;
            const npy_intp Sdzd = %(z_data)s->strides[0] / %(z_data)s->descr->elsize;
            const npy_intp Sdzi = %(z_ind)s->strides[0] / %(z_ind)s->descr->elsize;
            const npy_intp Sdzp = %(z_ptr)s->strides[0] / %(z_ptr)s->descr->elsize;

            memcpy(Dzi, Dpi, %(p_ind)s->dimensions[0]*sizeof(dtype_%(p_ind)s));
            memcpy(Dzp, Dpp, %(p_ptr)s->dimensions[0]*sizeof(dtype_%(p_ptr)s));

            for (npy_int32 m = 0; m < M; ++m) {
                for (npy_int32 n_idx = Dpp[m * Sdpp]; n_idx < Dpp[(m+1)*Sdpp]; ++n_idx) {
                    const npy_int32 n = Dpi[n_idx * Sdpi]; // row index of non-null value for column K

                    const dtype_%(x)s* x_row = (dtype_%(x)s*)(%(x)s->data + %(x)s->strides[0] * m);

                    const dtype_%(y)s* y_col = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * n);

                    Dzd[n_idx * Sdzd] = Dpd[n_idx * Sdpd] * %(cdot)s((int*)&K, (const %(conv_type)s*)x_row, (int*)&Sdx, (const %(conv_type)s*)y_col, (int*)&Sdy);
                }
            }
        }
        """ % dict(locals(), **sub)

        return rval
sampling_dot_csr = SamplingDotCsr()
# register a specialization to replace SamplingDot -> SamplingDotCsr # register a specialization to replace SamplingDot -> SamplingDotCsr
@gof.local_optimizer([sampling_dot]) @gof.local_optimizer([sampling_dot])
def local_sampling_dot_csr(node): def local_sampling_dot_csr(node):
......
...@@ -10,6 +10,8 @@ except ImportError: ...@@ -10,6 +10,8 @@ except ImportError:
pass # The variable enable_sparse will be used to disable the test file. pass # The variable enable_sparse will be used to disable the test file.
import theano import theano
from theano import tensor
from theano import sparse
from theano import compile, config, gof from theano import compile, config, gof
from theano.sparse import enable_sparse from theano.sparse import enable_sparse
from theano.gof.python25 import all, any, product from theano.gof.python25 import all, any, product
...@@ -19,21 +21,25 @@ if enable_sparse == False: ...@@ -19,21 +21,25 @@ if enable_sparse == False:
from theano.sparse.basic import _is_dense, _is_sparse, _mtypes from theano.sparse.basic import _is_dense, _is_sparse, _mtypes
from theano.sparse.basic import _is_dense_variable, _is_sparse_variable from theano.sparse.basic import _is_dense_variable, _is_sparse_variable
from theano.sparse.basic import verify_grad_sparse from theano.sparse import (
from theano.sparse import (as_sparse_variable, CSC, CSR, CSM, CSMProperties, verify_grad_sparse, as_sparse_variable,
csm_properties) CSC, CSR, CSM, CSMProperties, csm_properties,
from theano.sparse import SparseType, CSMGrad, CSMGradC SparseType, CSMGrad, CSMGradC,
from theano.sparse import StructuredDot, StructuredDotCSC StructuredDot, StructuredDotCSC,
from theano.sparse import StructuredDotGradCSC, StructuredDotGradCSR StructuredDotGradCSC, StructuredDotGradCSR,
from theano.sparse import AddSS, AddSD, MulSS, MulSD, Transpose, Neg, Remove0 AddSS, AddSD, MulSS, MulSD, Transpose, Neg, Remove0,
from theano.sparse import add, mul, structured_dot, transpose add, mul, structured_dot, transpose,
from theano.sparse import (csc_from_dense, csr_from_dense, dense_from_sparse, csc_from_dense, csr_from_dense, dense_from_sparse,
SparseFromDense) Dot, Usmm, UsmmCscDense, sp_ones_like, GetItemScalar,
from theano.sparse import Dot, Usmm, UsmmCscDense, sp_ones_like, GetItemScalar SparseFromDense,
#from theano.sparse import get_item_2d, get_item_scalar Cast, HStack, VStack, AddSSData, add_s_s_data,
Poisson, poisson, Multinomial, multinomial,
structured_sigmoid, structured_exp, structured_log,
structured_pow, structured_minimum, structured_maximum, structured_add,
MulSV, mul_s_v, StructuredAddSV, structured_add_s_v,
SamplingDot, sampling_dot)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano import tensor
from theano.tensor.basic import _allclose from theano.tensor.basic import _allclose
...@@ -580,36 +586,36 @@ class test_csm_properties(unittest.TestCase): ...@@ -580,36 +586,36 @@ class test_csm_properties(unittest.TestCase):
def test_csm_properties_grad(self): def test_csm_properties_grad(self):
sp_types = {'csc': sp.csc_matrix, sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix} 'csr': sp.csr_matrix}
for format in ['csc', 'csr']: for format in ['csc', 'csr']:
for dtype in ['float32', 'float64']: for dtype in ['float32', 'float64']:
spmat = sp_types[format](random_lil((4, 3), dtype, 3)) spmat = sp_types[format](random_lil((4, 3), dtype, 3))
verify_grad_sparse(lambda *x: CSMProperties()(*x)[0], [spmat], verify_grad_sparse(lambda *x: CSMProperties()(*x)[0], [spmat],
structured=True) structured=True)
verify_grad_sparse(lambda *x: CSMProperties()(*x)[1], [spmat], verify_grad_sparse(lambda *x: CSMProperties()(*x)[1], [spmat],
structured=True) structured=True)
verify_grad_sparse(lambda *x: CSMProperties()(*x)[2], [spmat], verify_grad_sparse(lambda *x: CSMProperties()(*x)[2], [spmat],
structured=True) structured=True)
verify_grad_sparse(lambda *x: CSMProperties()(*x)[2], [spmat], verify_grad_sparse(lambda *x: CSMProperties()(*x)[2], [spmat],
structured=True) structured=True)
def test_csm_properties(self): def test_csm_properties(self):
sp_types = {'csc': sp.csc_matrix, sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix} 'csr': sp.csr_matrix}
for format in ['csc', 'csr']: for format in ['csc', 'csr']:
for dtype in ['float32', 'float64']: for dtype in ['float32', 'float64']:
x = SparseType(format, dtype=dtype)() x = SparseType(format, dtype=dtype)()
f = theano.function([x], csm_properties(x)) f = theano.function([x], csm_properties(x))
spmat = sp_types[format](random_lil((4, 3), dtype, 3)) spmat = sp_types[format](random_lil((4, 3), dtype, 3))
data, indices, indptr, shape = f(spmat) data, indices, indptr, shape = f(spmat)
assert numpy.all(data == spmat.data) assert numpy.all(data == spmat.data)
assert numpy.all(indices == spmat.indices) assert numpy.all(indices == spmat.indices)
assert numpy.all(indptr == spmat.indptr) assert numpy.all(indptr == spmat.indptr)
...@@ -623,49 +629,49 @@ class test_csm(unittest.TestCase): ...@@ -623,49 +629,49 @@ class test_csm(unittest.TestCase):
def test_csm_grad(self): def test_csm_grad(self):
sp_types = {'csc': sp.csc_matrix, sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix} 'csr': sp.csr_matrix}
for format in ['csc', 'csr']: for format in ['csc', 'csr']:
for dtype in ['float32', 'float64']: for dtype in ['float32', 'float64']:
spmat = sp_types[format](random_lil((4, 3), dtype, 3)) spmat = sp_types[format](random_lil((4, 3), dtype, 3))
verify_grad_sparse(lambda x: CSM(format)(x, spmat.indices, verify_grad_sparse(lambda x: CSM(format)(x, spmat.indices,
spmat.indptr, numpy.asarray(spmat.shape, 'int32')), spmat.indptr, numpy.asarray(spmat.shape, 'int32')),
[spmat.data], structured=True) [spmat.data], structured=True)
def test_csm_sparser(self): def test_csm_sparser(self):
""" """
Test support for gradients sparser than the input. Test support for gradients sparser than the input.
""" """
sp_types = {'csc': sp.csc_matrix, sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix} 'csr': sp.csr_matrix}
for format in ['csc', 'csr']: for format in ['csc', 'csr']:
for dtype in ['float32', 'float64']: for dtype in ['float32', 'float64']:
x = tensor.tensor(dtype=dtype, broadcastable=(False,)) x = tensor.tensor(dtype=dtype, broadcastable=(False,))
y = tensor.ivector() y = tensor.ivector()
z = tensor.ivector() z = tensor.ivector()
s = tensor.ivector() s = tensor.ivector()
a = as_sparse_variable(sp_types[format](random_lil((4, 3), a = as_sparse_variable(sp_types[format](random_lil((4, 3),
dtype, 1))) dtype, 1)))
f = theano.function([x, y, z, s], tensor.grad(dense_from_sparse( f = theano.function([x, y, z, s], tensor.grad(dense_from_sparse(
a * CSM(format)(x, y, z, s)).sum(), x)) a * CSM(format)(x, y, z, s)).sum(), x))
spmat = sp_types[format](random_lil((4, 3), dtype, 3)) spmat = sp_types[format](random_lil((4, 3), dtype, 3))
res = f(spmat.data, spmat.indices, spmat.indptr, res = f(spmat.data, spmat.indices, spmat.indptr,
numpy.asarray(spmat.shape, 'int32')) numpy.asarray(spmat.shape, 'int32'))
assert len(spmat.data) == len(res) assert len(spmat.data) == len(res)
def test_csm_unsorted(self): def test_csm_unsorted(self):
""" """
Test support for gradients of unsorted inputs. Test support for gradients of unsorted inputs.
""" """
sp_types = {'csc': sp.csc_matrix, sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix} 'csr': sp.csr_matrix}
for format in ['csr', 'csc', ]: for format in ['csr', 'csc', ]:
for dtype in ['float32', 'float64']: for dtype in ['float32', 'float64']:
x = tensor.tensor(dtype=dtype, broadcastable=(False,)) x = tensor.tensor(dtype=dtype, broadcastable=(False,))
...@@ -673,31 +679,34 @@ class test_csm(unittest.TestCase): ...@@ -673,31 +679,34 @@ class test_csm(unittest.TestCase):
z = tensor.ivector() z = tensor.ivector()
s = tensor.ivector() s = tensor.ivector()
# Sparse advanced indexing produces unsorted sparse matrices # Sparse advanced indexing produces unsorted sparse matrices
a = sp_types[format]([[1,2,1], [1,2,1], [1,2,1], [1,2,1]], a = sp_types[format]([[1, 2, 1],
[1, 2, 1],
[1, 2, 1],
[1, 2, 1]],
dtype=dtype)[range(4)] dtype=dtype)[range(4)]
# Make sure it's unsorted # Make sure it's unsorted
assert not a.has_sorted_indices assert not a.has_sorted_indices
a = as_sparse_variable(a) a = as_sparse_variable(a)
f = theano.function([x, y, z, s], tensor.grad(tensor.sum( f = theano.function([x, y, z, s], tensor.grad(tensor.sum(
dense_from_sparse(a * CSM(format)(x, y, z, s))), x)) dense_from_sparse(a * CSM(format)(x, y, z, s))), x))
spmat = sp_types[format](random_lil((4, 3), dtype, spmat = sp_types[format](random_lil((4, 3), dtype,
12))[range(4)] 12))[range(4)]
assert not spmat.has_sorted_indices assert not spmat.has_sorted_indices
res = f(spmat.data, spmat.indices, spmat.indptr, res = f(spmat.data, spmat.indices, spmat.indptr,
numpy.asarray(spmat.shape, 'int32')) numpy.asarray(spmat.shape, 'int32'))
col1 = sp_types[format]((res, spmat.indices, spmat.indptr), col1 = sp_types[format]((res, spmat.indices, spmat.indptr),
shape=numpy.asarray(spmat.shape, 'int32'))[:, 1].data shape=numpy.asarray(spmat.shape, 'int32'))[:, 1].data
assert numpy.all(col1 == 2) assert numpy.all(col1 == 2)
def test_csm(self): def test_csm(self):
sp_types = {'csc': sp.csc_matrix, sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix} 'csr': sp.csr_matrix}
for format in ['csc', 'csr']: for format in ['csc', 'csr']:
for dtype in ['float32', 'float64']: for dtype in ['float32', 'float64']:
x = tensor.tensor(dtype=dtype, broadcastable=(False,)) x = tensor.tensor(dtype=dtype, broadcastable=(False,))
...@@ -705,12 +714,12 @@ class test_csm(unittest.TestCase): ...@@ -705,12 +714,12 @@ class test_csm(unittest.TestCase):
z = tensor.ivector() z = tensor.ivector()
s = tensor.ivector() s = tensor.ivector()
f = theano.function([x, y, z, s], CSM(format)(x, y, z, s)) f = theano.function([x, y, z, s], CSM(format)(x, y, z, s))
spmat = sp_types[format](random_lil((4, 3), dtype, 3)) spmat = sp_types[format](random_lil((4, 3), dtype, 3))
res = f(spmat.data, spmat.indices, spmat.indptr, res = f(spmat.data, spmat.indices, spmat.indptr,
numpy.asarray(spmat.shape, 'int32')) numpy.asarray(spmat.shape, 'int32'))
assert numpy.all(res.data == spmat.data) assert numpy.all(res.data == spmat.data)
assert numpy.all(res.indices == spmat.indices) assert numpy.all(res.indices == spmat.indices)
assert numpy.all(res.indptr == spmat.indptr) assert numpy.all(res.indptr == spmat.indptr)
...@@ -1578,6 +1587,452 @@ class Test_getitem(unittest.TestCase): ...@@ -1578,6 +1587,452 @@ class Test_getitem(unittest.TestCase):
assert numpy.all(t1 == r1) assert numpy.all(t1 == r1)
class TestCast(utt.InferShapeTester):
    """Tests for the sparse Cast op: perform, infer_shape and grad."""

    # All dtypes Cast is expected to convert between.
    compatible_types = (tensor.int_dtypes +
                        tensor.continuous_dtypes)

    # Symbolic sparse inputs, one per compatible dtype, in both formats.
    x_csc = [theano.sparse.csc_matrix(dtype=t) for t in compatible_types]
    x_csr = [theano.sparse.csr_matrix(dtype=t) for t in compatible_types]

    # A small fixed matrix in (data, indices, indptr) form, used to build
    # concrete scipy matrices of any dtype.
    indptr = numpy.array([0, 2, 3, 6])
    indices = numpy.array([0, 2, 2, 0, 1, 2])
    data = numpy.array([1, 2, 3, 4, 5, 6])
    properties = (data, indices, indptr)

    def setUp(self):
        super(TestCast, self).setUp()
        self.op_class = Cast

    def test_cast(self):
        # Compile one function per (input dtype, target dtype) pair.
        cast_csc = dict([
            (x, [theano.function([x], Cast(t)(x))
                 for t in self.compatible_types])
            for x in self.x_csc])

        cast_csr = dict([
            (x, [theano.function([x], Cast(t)(x))
                 for t in self.compatible_types])
            for x in self.x_csr])

        for x in self.x_csc:
            for f, t in zip(cast_csc[x], self.compatible_types):
                # NOTE(review): .copy() is used for csc but not for csr
                # below -- confirm whether both branches need it.
                a = sp.csc_matrix(self.properties, dtype=x.dtype).copy()
                assert f(a).dtype == t

        for x in self.x_csr:
            for f, t in zip(cast_csr[x], self.compatible_types):
                a = sp.csr_matrix(self.properties, dtype=x.dtype)
                assert f(a).dtype == t

    def test_infer_shape(self):
        for x in self.x_csc:
            for t in self.compatible_types:
                a = sp.csc_matrix(self.properties, dtype=x.dtype)
                self._compile_and_check([x],
                                        [Cast(t)(x)],
                                        [a],
                                        self.op_class)
        for x in self.x_csr:
            for t in self.compatible_types:
                a = sp.csr_matrix(self.properties, dtype=x.dtype)
                self._compile_and_check([x],
                                        [Cast(t)(x)],
                                        [a],
                                        self.op_class)

    def test_grad(self):
        # Gradients only make sense between float dtypes; float32 targets
        # need a looser numeric-gradient step.
        for dtype in tensor.float_dtypes:
            for t in tensor.float_dtypes:
                eps = None
                if t == 'float32':
                    eps = 7e-4
                a = sp.csc_matrix(self.properties, dtype=dtype)
                verify_grad_sparse(Cast(t), [a], eps=eps)

        for dtype in tensor.float_dtypes:
            for t in tensor.float_dtypes:
                eps = None
                if t == 'float32':
                    eps = 7e-4
                a = sp.csr_matrix(self.properties, dtype=dtype)
                verify_grad_sparse(Cast(t), [a], eps=eps)
class _HVStackTester(utt.InferShapeTester):
    """Test for both HStack and VStack.

    Subclasses (built by ``_hv_switch``) must provide ``op_class`` and
    ``expected_f``.
    """
    nb = 3  # Number of sparse matrix to stack
    x = {}
    mat = {}
    # Build, per sparse format, `nb` symbolic inputs and matching random
    # integer-valued test matrices (values in [-1, 4]).
    for format in sparse.sparse_formats:
        variable = getattr(theano.sparse, format + '_matrix')
        spa = getattr(sp, format + '_matrix')
        x[format] = [variable() for t in range(nb)]
        mat[format] = [spa(numpy.random.random_integers(5, size=(3, 4)) - 1,
                           dtype=theano.config.floatX)
                       for t in range(nb)]

    def test_op(self):
        """Compare the op against scipy for every (in, out, dtype) combo."""
        for format in sparse.sparse_formats:
            for out_f in sparse.sparse_formats:
                for dtype in sparse.all_dtypes:
                    blocks = self.mat[format]
                    # allow_input_downcast: blocks are floatX but the op
                    # is instantiated for every dtype, including narrower ones.
                    f = theano.function(
                        self.x[format],
                        self.op_class(
                            format=out_f, dtype=dtype)(*self.x[format]),
                        allow_input_downcast=True)
                    tested = f(*blocks)
                    expected = self.expected_f(blocks,
                                               format=out_f,
                                               dtype=dtype)
                    assert numpy.allclose(tested.toarray(), expected.toarray())
                    assert tested.format == expected.format
                    assert tested.dtype == expected.dtype

    def test_infer_shape(self):
        for format in sparse.sparse_formats:
            self._compile_and_check(self.x[format],
                                    [self.op_class(dtype='float64')
                                     (*self.x[format])],
                                    self.mat[format],
                                    self.op_class)

    def test_grad(self):
        # Gradient check restricted to float dtypes; eps loosened for
        # float32 numeric differentiation.
        for format in sparse.sparse_formats:
            for out_f in sparse.sparse_formats:
                for dtype in sparse.float_dtypes:
                    verify_grad_sparse(
                        self.op_class(format=out_f, dtype=dtype),
                        self.mat[format],
                        structured=False,
                        eps=7e-4)
def _hv_switch(op, expected_function):
    """Return the right test class for HStack or VStack.

    :Parameters:
    - `op`: HStack or VStack class.
    - `expected_function`: function from scipy for comparison.
    """
    class XStackTester(_HVStackTester):
        op_class = op

        def expected_f(self, a, format=None, dtype=None):
            # Delegate to the scipy reference implementation.
            return expected_function(a, format, dtype)
    return XStackTester


# Concrete test classes, paired with their scipy reference functions.
HStackTester = _hv_switch(HStack, sp.hstack)
VStackTester = _hv_switch(VStack, sp.vstack)
class AddSSDataTester(utt.InferShapeTester):
    """Tests for ``add_s_s_data``: structured addition of two sparse
    matrices sharing the same sparsity pattern."""
    x = {}
    a = {}

    def setUp(self):
        super(AddSSDataTester, self).setUp()
        self.op_class = AddSSData
        # Two symbolic inputs per format, both fed the *same* constant
        # matrix so the sparsity patterns trivially match.
        for format in sparse.sparse_formats:
            variable = getattr(theano.sparse, format + '_matrix')
            rand = numpy.array(
                numpy.random.random_integers(3, size=(3, 4)) - 1,
                dtype=theano.config.floatX)
            constant = as_sparse_format(rand, format)
            self.x[format] = [variable() for t in range(2)]
            self.a[format] = [constant for t in range(2)]

    def test_op(self):
        for format in sparse.sparse_formats:
            f = theano.function(
                self.x[format],
                add_s_s_data(*self.x[format]))
            tested = f(*self.a[format])
            # x + x == 2 * x since both operands are the same matrix.
            expected = 2 * self.a[format][0]
            assert numpy.allclose(tested.toarray(), expected.toarray())
            assert tested.format == expected.format
            assert tested.dtype == expected.dtype

    def test_infer_shape(self):
        for format in sparse.sparse_formats:
            self._compile_and_check(self.x[format],
                                    [add_s_s_data(*self.x[format])],
                                    self.a[format],
                                    self.op_class)

    def test_grad(self):
        # structured=True: gradient is only defined on the shared
        # sparsity pattern.
        for format in sparse.sparse_formats:
            verify_grad_sparse(self.op_class(),
                               self.a[format],
                               structured=True)
class PoissonTester(utt.InferShapeTester):
    """Tests for the ``Poisson`` sampling op on sparse matrices."""
    x = {}
    a = {}
    # One symbolic input and one random non-negative test matrix per format.
    for format in sparse.sparse_formats:
        variable = getattr(theano.sparse, format + '_matrix')
        rand = numpy.array(numpy.random.random_integers(3, size=(3, 4)) - 1,
                           dtype=theano.config.floatX)
        x[format] = variable()
        a[format] = as_sparse_format(rand, format)

    def setUp(self):
        super(PoissonTester, self).setUp()
        self.op_class = Poisson

    def test_op(self):
        for format in sparse.sparse_formats:
            f = theano.function(
                [self.x[format]],
                poisson(self.x[format]))
            tested = f(self.a[format])
            # Output keeps format, dtype and shape; sampled values must
            # be integral (Poisson draws are whole numbers).
            assert tested.format == format
            assert tested.dtype == self.a[format].dtype
            assert numpy.allclose(numpy.floor(tested.data), tested.data)
            assert tested.shape == self.a[format].shape

    def test_infer_shape(self):
        for format in sparse.sparse_formats:
            self._compile_and_check([self.x[format]],
                                    [poisson(self.x[format])],
                                    [self.a[format]],
                                    self.op_class)
class MultinomialTester(utt.InferShapeTester):
    """Tests for the ``Multinomial`` sampling op with a sparse
    probability matrix (one distribution per row)."""
    p = sparse.csr_matrix()
    # Each row sums to 1; row 2 puts all its mass on column 1, which
    # makes the sample at (2, 1) deterministic.
    _p = sp.csr_matrix(numpy.asarray([[0.0, 0.5, 0.0, 0.5],
                                      [0.1, 0.2, 0.3, 0.4],
                                      [0.0, 1.0, 0.0, 0.0],
                                      [0.3, 0.3, 0.0, 0.4]]))

    def setUp(self):
        super(MultinomialTester, self).setUp()
        self.op_class = Multinomial

    def test_op(self):
        # Scalar n: same number of trials for every row.
        n = tensor.lscalar()
        f = theano.function([self.p, n], multinomial(n, self.p))
        _n = 5
        tested = f(self._p, _n)
        assert tested.shape == self._p.shape
        # Counts must be integral.
        assert numpy.allclose(numpy.floor(tested.todense()), tested.todense())
        # Row 2 has probability 1.0 at column 1, so all n trials land there.
        assert tested[2, 1] == _n

        # Vector n: a per-row number of trials.
        n = tensor.lvector()
        f = theano.function([self.p, n], multinomial(n, self.p))
        _n = numpy.asarray([1, 2, 3, 4], dtype='int64')
        tested = f(self._p, _n)
        assert tested.shape == self._p.shape
        assert numpy.allclose(numpy.floor(tested.todense()), tested.todense())
        assert tested[2, 1] == _n[2]

    def test_infer_shape(self):
        self._compile_and_check([self.p],
                                [multinomial(5, self.p)],
                                [self._p],
                                self.op_class)
class _StructuredMonoidUnaryTester(unittest.TestCase):
    """Base class for testing structured element-wise ops.

    Subclasses must set ``op`` (the theano op under test) and
    ``expected_op`` (the numpy reference function).
    """

    def test_op(self):
        for format in sparse.sparse_formats:
            x = getattr(theano.sparse, format + '_matrix')()
            spa = getattr(sp, format + '_matrix')
            a = spa(numpy.random.random_integers(5, size=(3, 4)) - 1,
                    dtype=theano.config.floatX)
            f = theano.function([x], self.op(x))
            tested = f(a)
            expected = self.expected_op(a.todense())
            # Structured ops only apply to stored non-zeros: zero entries
            # of the input must stay zero in the expected output.
            expected[a.todense() == 0] = 0
            assert tested.shape == expected.shape
            assert tested.dtype == expected.dtype
            assert numpy.allclose(tested.todense(), expected)
class StructuredSigmoidTester(_StructuredMonoidUnaryTester):
    """Structured sigmoid, checked against the logistic function."""

    def setUp(self):
        super(StructuredSigmoidTester, self).setUp()
        self.op = structured_sigmoid
        self.expected_op = lambda x: 1.0 / (1.0 + numpy.exp(-x))
class StructuredExpTester(_StructuredMonoidUnaryTester):
    """Structured exponential, checked against ``numpy.exp``."""

    def setUp(self):
        super(StructuredExpTester, self).setUp()
        self.op = structured_exp
        self.expected_op = numpy.exp
class StructuredLogTester(_StructuredMonoidUnaryTester):
    """Structured logarithm, checked against ``numpy.log``."""

    def setUp(self):
        super(StructuredLogTester, self).setUp()
        self.op = structured_log
        self.expected_op = numpy.log
class StructuredPowTester(_StructuredMonoidUnaryTester):
    """Structured power (exponent 2), checked against ``numpy.power``."""

    def setUp(self):
        super(StructuredPowTester, self).setUp()
        self.op = lambda x: structured_pow(x, 2)
        self.expected_op = lambda x: numpy.power(x, 2)
class StructuredMinimumTester(_StructuredMonoidUnaryTester):
    """Structured minimum with 2, checked against ``numpy.minimum``."""

    def setUp(self):
        super(StructuredMinimumTester, self).setUp()
        self.op = lambda x: structured_minimum(x, 2)
        self.expected_op = lambda x: numpy.minimum(x, 2)
class StructuredMaximumTester(_StructuredMonoidUnaryTester):
    """Structured maximum with 2, checked against ``numpy.maximum``."""

    def setUp(self):
        super(StructuredMaximumTester, self).setUp()
        self.op = lambda x: structured_maximum(x, 2)
        self.expected_op = lambda x: numpy.maximum(x, 2)
class StructuredAddTester(_StructuredMonoidUnaryTester):
    """Structured addition of 2, checked against ``numpy.add``."""

    def setUp(self):
        super(StructuredAddTester, self).setUp()
        self.op = lambda x: structured_add(x, 2)
        self.expected_op = lambda x: numpy.add(x, 2)
class MulSVTester(unittest.TestCase):
    """Tests for ``mul_s_v``: element-wise product of a sparse matrix
    with a broadcasted dense vector."""

    def setUp(self):
        utt.seed_rng()

    def test_mul_s_v_grad(self):
        sp_types = {'csc': sp.csc_matrix,
                    'csr': sp.csr_matrix}
        for format in ['csr', 'csc']:
            for dtype in ['float32', 'float64']:
                spmat = sp_types[format](random_lil((4, 3), dtype, 3))
                mat = numpy.asarray(numpy.random.rand(3), dtype=dtype)
                # structured=True: gradient only on the sparsity pattern.
                theano.sparse.verify_grad_sparse(mul_s_v,
                                                 [spmat, mat],
                                                 structured=True)

    def test_mul_s_v(self):
        sp_types = {'csc': sp.csc_matrix,
                    'csr': sp.csr_matrix}
        for format in ['csr', 'csc']:
            for dtype in ['float32', 'float64']:
                x = theano.sparse.SparseType(format, dtype=dtype)()
                y = tensor.vector(dtype=dtype)
                f = theano.function([x, y], mul_s_v(x, y))
                spmat = sp_types[format](random_lil((4, 3), dtype, 3))
                mat = numpy.asarray(numpy.random.rand(3), dtype=dtype)
                out = f(spmat, mat)
                # Reference: dense broadcasted row-wise multiplication.
                assert numpy.allclose(out.toarray(), spmat.toarray() * mat)
class StructuredAddSVTester(unittest.TestCase):
    """Tests for ``structured_add_s_v``: add a dense vector to a sparse
    matrix, but only at the matrix's stored non-zero positions."""

    def setUp(self):
        utt.seed_rng()

    def test_structured_add_s_v_grad(self):
        sp_types = {'csc': sp.csc_matrix,
                    'csr': sp.csr_matrix}
        for format in ['csr', 'csc']:
            for dtype in ['float32', 'float64']:
                spmat = sp_types[format](random_lil((4, 3), dtype, 3))
                mat = numpy.asarray(numpy.random.rand(3), dtype=dtype)
                theano.sparse.verify_grad_sparse(structured_add_s_v,
                                                 [spmat, mat],
                                                 structured=True)

    def test_structured_add_s_v(self):
        sp_types = {'csc': sp.csc_matrix,
                    'csr': sp.csr_matrix}
        for format in ['csr', 'csc']:
            for dtype in ['float32', 'float64']:
                x = theano.sparse.SparseType(format, dtype=dtype)()
                y = tensor.vector(dtype=dtype)
                f = theano.function([x, y], structured_add_s_v(x, y))
                spmat = sp_types[format](random_lil((4, 3), dtype, 3))
                # Mask of the sparsity pattern: ones at stored entries.
                spones = spmat.copy()
                spones.data = numpy.ones_like(spones.data)
                mat = numpy.asarray(numpy.random.rand(3), dtype=dtype)
                out = f(spmat, mat)
                # Reference: (x + y) restricted to x's sparsity pattern.
                assert numpy.allclose(out.toarray(),
                                      spones.multiply(spmat + mat))
class SamplingDotTester(utt.InferShapeTester):
    """Tests for ``sampling_dot``: compute dot(x, y.T) only at the
    positions where the sparse pattern matrix p is non-zero."""
    # Two dense operands plus a sparse pattern matrix.
    x = [tensor.matrix() for t in range(2)]
    x.append(sparse.csr_matrix())
    a = [numpy.array(numpy.random.random_integers(maximum, size=(3, 3)) - 1,
                     dtype=theano.config.floatX)
         for maximum in [5, 5, 2]]
    a[2] = sp.csr_matrix(a[2])

    def setUp(self):
        super(SamplingDotTester, self).setUp()
        self.op_class = SamplingDot

    def test_op(self):
        f = theano.function(
            self.x,
            sampling_dot(*self.x))
        tested = f(*self.a)
        x, y, p = self.a
        # Reference: full dot product masked by p's sparsity pattern.
        expected = p.multiply(numpy.dot(x, y.T))
        assert numpy.allclose(tested.toarray(), expected)
        assert tested.format == 'csr'
        assert tested.dtype == expected.dtype

    def test_infer_shape(self):
        # Exclude the specialization so the generic op's infer_shape
        # is the one being exercised.
        self._compile_and_check(self.x,
                                [sampling_dot(*self.x)],
                                self.a,
                                self.op_class,
                                excluding=['local_sampling_dot_csr'])

    def test_grad(self):
        # Gradient w.r.t. the dense operands only; the pattern is fixed.
        def _helper(x, y):
            return sampling_dot(x, y, self.a[2])
        verify_grad_sparse(_helper, self.a[:2])
import theano.tensor.tests.test_sharedvar import theano.tensor.tests.test_sharedvar
test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester( test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester(
shared_constructor_=theano.sparse.shared, shared_constructor_=theano.sparse.shared,
......
...@@ -21,300 +21,39 @@ from theano.sparse.sandbox import sp2 as S2 ...@@ -21,300 +21,39 @@ from theano.sparse.sandbox import sp2 as S2
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.sparse.basic import verify_grad_sparse from theano.sparse.basic import verify_grad_sparse
# Already in test_basic.py
def as_sparse_format(data, format): # Not used here
if format == 'csc': # def as_sparse_format(data, format):
return scipy.sparse.csc_matrix(data) # if format == 'csc':
elif format == 'csr': # return scipy.sparse.csc_matrix(data)
return scipy.sparse.csr_matrix(data) # elif format == 'csr':
else: # return scipy.sparse.csr_matrix(data)
raise NotImplementedError() # else:
# raise NotImplementedError()
def eval_outputs(outputs):
return compile.function([], outputs)()[0] # Already in test_basic.py
# Not used here
# def eval_outputs(outputs):
def random_lil(shape, dtype, nnz): # return compile.function([], outputs)()[0]
rval = sp.lil_matrix(shape, dtype=dtype)
huge = 2 ** 30
for k in range(nnz): # Already in test_basic.py
# set non-zeros in random locations (row x, col y) # Not used here
idx = np.random.random_integers(huge, size=len(shape)) % shape # def random_lil(shape, dtype, nnz):
value = np.random.rand() # rval = sp.lil_matrix(shape, dtype=dtype)
#if dtype *int*, value will always be zeros! # huge = 2 ** 30
if "int" in dtype: # for k in range(nnz):
value = int(value * 100) # # set non-zeros in random locations (row x, col y)
rval.__setitem__( # idx = np.random.random_integers(huge, size=len(shape)) % shape
idx, # value = np.random.rand()
value) # #if dtype *int*, value will always be zeros!
return rval # if "int" in dtype:
# value = int(value * 100)
# rval.__setitem__(
class TestCast(utt.InferShapeTester): # idx,
compatible_types = (tensor.int_dtypes + # value)
tensor.continuous_dtypes) # return rval
x_csc = [theano.sparse.csc_matrix(dtype=t) for t in compatible_types]
x_csr = [theano.sparse.csr_matrix(dtype=t) for t in compatible_types]
indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
properties = (data, indices, indptr)
def setUp(self):
super(TestCast, self).setUp()
self.op_class = S2.Cast
def test_cast(self):
cast_csc = dict([
(x, [theano.function([x], S2.Cast(t)(x))
for t in self.compatible_types])
for x in self.x_csc])
cast_csr = dict([
(x, [theano.function([x], S2.Cast(t)(x))
for t in self.compatible_types])
for x in self.x_csr])
for x in self.x_csc:
for f, t in zip(cast_csc[x], self.compatible_types):
a = sp.csc_matrix(self.properties, dtype=x.dtype).copy()
assert f(a).dtype == t
for x in self.x_csr:
for f, t in zip(cast_csr[x], self.compatible_types):
a = sp.csr_matrix(self.properties, dtype=x.dtype)
assert f(a).dtype == t
def test_infer_shape(self):
for x in self.x_csc:
for t in self.compatible_types:
a = sp.csc_matrix(self.properties, dtype=x.dtype)
self._compile_and_check([x],
[S2.Cast(t)(x)],
[a],
self.op_class)
for x in self.x_csr:
for t in self.compatible_types:
a = sp.csr_matrix(self.properties, dtype=x.dtype)
self._compile_and_check([x],
[S2.Cast(t)(x)],
[a],
self.op_class)
def test_grad(self):
for dtype in tensor.float_dtypes:
for t in tensor.float_dtypes:
eps = None
if t == 'float32':
eps = 7e-4
a = sp.csc_matrix(self.properties, dtype=dtype)
verify_grad_sparse(S2.Cast(t), [a], eps=eps)
for dtype in tensor.float_dtypes:
for t in tensor.float_dtypes:
eps = None
if t == 'float32':
eps = 7e-4
a = sp.csr_matrix(self.properties, dtype=dtype)
verify_grad_sparse(S2.Cast(t), [a], eps=eps)
class _HVStackTester(utt.InferShapeTester):
"""Test for both HStack and VStack.
"""
nb = 3 # Number of sparse matrix to stack
x = {}
mat = {}
for format in sparse.sparse_formats:
variable = getattr(theano.sparse, format + '_matrix')
spa = getattr(sp, format + '_matrix')
x[format] = [variable() for t in range(nb)]
mat[format] = [spa(np.random.random_integers(5, size=(3, 4)) - 1,
dtype=theano.config.floatX)
for t in range(nb)]
def test_op(self):
for format in sparse.sparse_formats:
for out_f in sparse.sparse_formats:
for dtype in sparse.all_dtypes:
blocks = self.mat[format]
f = theano.function(
self.x[format],
self.op_class(
format=out_f, dtype=dtype)(*self.x[format]),
allow_input_downcast=True)
tested = f(*blocks)
expected = self.expected_f(blocks,
format=out_f,
dtype=dtype)
assert np.allclose(tested.toarray(), expected.toarray())
assert tested.format == expected.format
assert tested.dtype == expected.dtype
def test_infer_shape(self):
for format in sparse.sparse_formats:
self._compile_and_check(self.x[format],
[self.op_class(dtype='float64')(*self.x[format])],
self.mat[format],
self.op_class)
def test_grad(self):
for format in sparse.sparse_formats:
for out_f in sparse.sparse_formats:
for dtype in sparse.float_dtypes:
verify_grad_sparse(
self.op_class(format=out_f, dtype=dtype),
self.mat[format],
structured=False,
eps=7e-4)
def _hv_switch(op, expected_function):
"""Return the right test class for HStack or VStack.
:Parameters:
- `op`: HStack or VStack class.
- `expected_function`: function from scipy for comparaison.
"""
class XStackTester(_HVStackTester):
op_class = op
def expected_f(self, a, format=None, dtype=None):
return expected_function(a, format, dtype)
return XStackTester
HStackTester = _hv_switch(S2.HStack, sp.hstack)
VStackTester = _hv_switch(S2.VStack, sp.vstack)
class AddSSDataTester(utt.InferShapeTester):
x = {}
a = {}
def setUp(self):
super(AddSSDataTester, self).setUp()
self.op_class = S2.AddSSData
for format in sparse.sparse_formats:
variable = getattr(theano.sparse, format + '_matrix')
rand = np.array(np.random.random_integers(3, size=(3, 4)) - 1,
dtype=theano.config.floatX)
constant = as_sparse_format(rand, format)
self.x[format] = [variable() for t in range(2)]
self.a[format] = [constant for t in range(2)]
def test_op(self):
for format in sparse.sparse_formats:
f = theano.function(
self.x[format],
S2.add_s_s_data(*self.x[format]))
tested = f(*self.a[format])
expected = 2 * self.a[format][0]
assert np.allclose(tested.toarray(), expected.toarray())
assert tested.format == expected.format
assert tested.dtype == expected.dtype
def test_infer_shape(self):
for format in sparse.sparse_formats:
self._compile_and_check(self.x[format],
[S2.add_s_s_data(*self.x[format])],
self.a[format],
self.op_class)
def test_grad(self):
for format in sparse.sparse_formats:
verify_grad_sparse(self.op_class(),
self.a[format],
structured=True)
class PoissonTester(utt.InferShapeTester):
x = {}
a = {}
for format in sparse.sparse_formats:
variable = getattr(theano.sparse, format + '_matrix')
rand = np.array(np.random.random_integers(3, size=(3, 4)) - 1,
dtype=theano.config.floatX)
x[format] = variable()
a[format] = as_sparse_format(rand, format)
def setUp(self):
super(PoissonTester, self).setUp()
self.op_class = S2.Poisson
def test_op(self):
for format in sparse.sparse_formats:
f = theano.function(
[self.x[format]],
S2.poisson(self.x[format]))
tested = f(self.a[format])
assert tested.format == format
assert tested.dtype == self.a[format].dtype
assert np.allclose(np.floor(tested.data), tested.data)
assert tested.shape == self.a[format].shape
def test_infer_shape(self):
for format in sparse.sparse_formats:
self._compile_and_check([self.x[format]],
[S2.poisson(self.x[format])],
[self.a[format]],
self.op_class)
class MultinomialTester(utt.InferShapeTester):
p = sparse.csr_matrix()
_p = sp.csr_matrix(np.asarray([[0.0, 0.5, 0.0, 0.5],
[0.1, 0.2, 0.3, 0.4],
[0.0, 1.0, 0.0, 0.0],
[0.3, 0.3, 0.0, 0.4]]))
def setUp(self):
super(MultinomialTester, self).setUp()
self.op_class = S2.Multinomial
def test_op(self):
n = tensor.lscalar()
f = theano.function([self.p, n], S2.multinomial(n, self.p))
_n = 5
tested = f(self._p, _n)
assert tested.shape == self._p.shape
assert np.allclose(np.floor(tested.todense()), tested.todense())
assert tested[2, 1] == _n
n = tensor.lvector()
f = theano.function([self.p, n], S2.multinomial(n, self.p))
_n = np.asarray([1, 2, 3, 4], dtype='int64')
tested = f(self._p, _n)
assert tested.shape == self._p.shape
assert np.allclose(np.floor(tested.todense()), tested.todense())
assert tested[2, 1] == _n[2]
def test_infer_shape(self):
self._compile_and_check([self.p],
[S2.multinomial(5, self.p)],
[self._p],
self.op_class)
class BinomialTester(utt.InferShapeTester): class BinomialTester(utt.InferShapeTester):
...@@ -357,188 +96,9 @@ class BinomialTester(utt.InferShapeTester): ...@@ -357,188 +96,9 @@ class BinomialTester(utt.InferShapeTester):
self.op_class) self.op_class)
class _StructuredMonoidUnaryTester(unittest.TestCase):
def test_op(self):
for format in sparse.sparse_formats:
x = getattr(theano.sparse, format + '_matrix')()
spa = getattr(sp, format + '_matrix')
a = spa(np.random.random_integers(5, size=(3, 4)) - 1,
dtype=theano.config.floatX)
f = theano.function([x], self.op(x))
tested = f(a)
expected = self.expected_op(a.todense())
expected[a.todense() == 0] = 0
assert tested.shape == expected.shape
assert tested.dtype == expected.dtype
assert np.allclose(tested.todense(), expected)
class StructuredSigmoidTester(_StructuredMonoidUnaryTester):
def setUp(self):
super(StructuredSigmoidTester, self).setUp()
self.op = S2.structured_sigmoid
self.expected_op = lambda x: 1.0 / (1.0 + np.exp(-x))
class StructuredExpTester(_StructuredMonoidUnaryTester):
def setUp(self):
super(StructuredExpTester, self).setUp()
self.op = S2.structured_exp
self.expected_op = np.exp
class StructuredLogTester(_StructuredMonoidUnaryTester):
def setUp(self):
super(StructuredLogTester, self).setUp()
self.op = S2.structured_log
self.expected_op = np.log
class StructuredPowTester(_StructuredMonoidUnaryTester):
def setUp(self):
super(StructuredPowTester, self).setUp()
self.op = lambda x: S2.structured_pow(x, 2)
self.expected_op = lambda x: np.power(x, 2)
class StructuredMinimumTester(_StructuredMonoidUnaryTester):
def setUp(self):
super(StructuredMinimumTester, self).setUp()
self.op = lambda x: S2.structured_minimum(x, 2)
self.expected_op = lambda x: np.minimum(x, 2)
class StructuredMaximumTester(_StructuredMonoidUnaryTester):
def setUp(self):
super(StructuredMaximumTester, self).setUp()
self.op = lambda x: S2.structured_maximum(x, 2)
self.expected_op = lambda x: np.maximum(x, 2)
class StructuredAddTester(_StructuredMonoidUnaryTester):
def setUp(self):
super(StructuredAddTester, self).setUp()
self.op = lambda x: S2.structured_add(x, 2)
self.expected_op = lambda x: np.add(x, 2)
class MulSVTester(unittest.TestCase):
def setUp(self):
utt.seed_rng()
def test_mul_s_v_grad(self):
sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix}
for format in ['csr', 'csc']:
for dtype in ['float32', 'float64']:
spmat = sp_types[format](random_lil((4, 3), dtype, 3))
mat = np.asarray(np.random.rand(3), dtype=dtype)
theano.sparse.verify_grad_sparse(S2.mul_s_v,
[spmat, mat], structured=True)
def test_mul_s_v(self):
sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix}
for format in ['csr', 'csc']:
for dtype in ['float32', 'float64']:
x = theano.sparse.SparseType(format, dtype=dtype)()
y = tensor.vector(dtype=dtype)
f = theano.function([x, y], S2.mul_s_v(x, y))
spmat = sp_types[format](random_lil((4, 3), dtype, 3))
mat = np.asarray(np.random.rand(3), dtype=dtype)
out = f(spmat, mat)
assert np.allclose(out.toarray(), spmat.toarray() * mat)
class StructuredAddSVTester(unittest.TestCase):
def setUp(self):
utt.seed_rng()
def test_structured_add_s_v_grad(self):
sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix}
for format in ['csr', 'csc']:
for dtype in ['float32', 'float64']:
spmat = sp_types[format](random_lil((4, 3), dtype, 3))
mat = np.asarray(np.random.rand(3), dtype=dtype)
theano.sparse.verify_grad_sparse(S2.structured_add_s_v,
[spmat, mat], structured=True)
def test_structured_add_s_v(self):
sp_types = {'csc': sp.csc_matrix,
'csr': sp.csr_matrix}
for format in ['csr', 'csc']:
for dtype in ['float32', 'float64']:
x = theano.sparse.SparseType(format, dtype=dtype)()
y = tensor.vector(dtype=dtype)
f = theano.function([x, y], S2.structured_add_s_v(x, y))
spmat = sp_types[format](random_lil((4, 3), dtype, 3))
spones = spmat.copy()
spones.data = np.ones_like(spones.data)
mat = np.asarray(np.random.rand(3), dtype=dtype)
out = f(spmat, mat)
assert np.allclose(out.toarray(), spones.multiply(spmat + mat))
class SamplingDotTester(utt.InferShapeTester):
x = [tensor.matrix() for t in range(2)]
x.append(sparse.csr_matrix())
a = [np.array(np.random.random_integers(maximum, size=(3, 3)) - 1,
dtype=theano.config.floatX)
for maximum in [5, 5, 2]]
a[2] = sp.csr_matrix(a[2])
def setUp(self):
super(SamplingDotTester, self).setUp()
self.op_class = S2.SamplingDot
def test_op(self):
f = theano.function(
self.x,
S2.sampling_dot(*self.x))
tested = f(*self.a)
x, y, p = self.a
expected = p.multiply(np.dot(x, y.T))
assert np.allclose(tested.toarray(), expected)
assert tested.format == 'csr'
assert tested.dtype == expected.dtype
def test_infer_shape(self):
self._compile_and_check(self.x,
[S2.sampling_dot(*self.x)],
self.a,
self.op_class,
excluding=['local_sampling_dot_csr'])
def test_grad(self):
def _helper(x, y):
return S2.sampling_dot(x, y, self.a[2])
verify_grad_sparse(_helper, self.a[:2])
# ################## # ##################
# Optimization tests # Optimization tests
# ################## # ##################
def test_local_mul_s_d(): def test_local_mul_s_d():
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
mode = mode.including("specialize", "local_mul_s_d") mode = mode.including("specialize", "local_mul_s_d")
...@@ -554,6 +114,7 @@ def test_local_mul_s_d(): ...@@ -554,6 +114,7 @@ def test_local_mul_s_d():
assert not any(isinstance(node.op, sparse.MulSD) for node assert not any(isinstance(node.op, sparse.MulSD) for node
in f.maker.env.toposort()) in f.maker.env.toposort())
def test_local_mul_s_v(): def test_local_mul_s_v():
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
mode = mode.including("specialize", "local_mul_s_v") mode = mode.including("specialize", "local_mul_s_v")
...@@ -569,6 +130,7 @@ def test_local_mul_s_v(): ...@@ -569,6 +130,7 @@ def test_local_mul_s_v():
assert not any(isinstance(node.op, S2.MulSV) for node assert not any(isinstance(node.op, S2.MulSV) for node
in f.maker.env.toposort()) in f.maker.env.toposort())
def test_local_structured_add_s_v(): def test_local_structured_add_s_v():
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
mode = mode.including("specialize", "local_structured_add_s_v") mode = mode.including("specialize", "local_structured_add_s_v")
...@@ -584,6 +146,7 @@ def test_local_structured_add_s_v(): ...@@ -584,6 +146,7 @@ def test_local_structured_add_s_v():
assert not any(isinstance(node.op, S2.StructuredAddSV) for node assert not any(isinstance(node.op, S2.StructuredAddSV) for node
in f.maker.env.toposort()) in f.maker.env.toposort())
def test_local_sampling_dot_csr(): def test_local_sampling_dot_csr():
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
mode = mode.including("specialize", "local_sampling_dot_csr") mode = mode.including("specialize", "local_sampling_dot_csr")
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论