added sparse dot

cf80a8ef · Yann N. Dauphin · 11ee6cee · cf80a8ef · cf80a8ef
--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -18,6 +18,8 @@ from theano import scalar
 from theano import config
 from theano.gof.python25 import all, any

+sparse_formats = ['csc', 'csr']
+
 #TODO: move this decorator to the compile submodule
 def register_specialize(lopt, *tags, **kwargs):
    compile.optdb['specialize'].register((kwargs and kwargs.pop('name')) or lopt.__name__, lopt, 'fast_run', *tags)
@@ -1449,3 +1451,385 @@ class StructuredDotGradCSR(gof.Op):

        """% dict(locals(), **sub)
 sdg_csr = StructuredDotGradCSR()
+
+class Dot(gof.op.Op):
+    """
+    Operation for efficiently calculating the dot product when
+    one or all operands is sparse. Supported format are CSC and CSR.
+    The output of the operation is dense.
+    """
+    def __eq__(self, other):
+        return type(self) == type(other)
+    
+    def __hash__(self):
+        return hash(type(self))
+    
+    def __ne__(self, other):
+        return not (self == other)
+    
+    def make_node(self, x, y):
+        dtype_out = scalar.upcast(x.type.dtype, y.type.dtype)
+        
+        if not _is_sparse_variable(x) and not _is_sparse_variable(y):
+            raise TypeError(x)
+
+        return gof.Apply(self, [x, y], [tensor.tensor(dtype=dtype_out, broadcastable=(False, False))])
+    
+    def perform(self, node, (x, y), (out, )):
+        x_is_sparse = _is_sparse(x)
+        y_is_sparse = _is_sparse(y)
+        
+        if not x_is_sparse and not y_is_sparse:
+            raise TypeError(x)
+                
+        rval = x * y
+        
+        if x_is_sparse and y_is_sparse:
+            rval = rval.todense()
+        
+        out[0] = rval
+    
+    def grad(self, (x, y), (gz,)):
+        assert _is_sparse_variable(x) or _is_sparse_variable(y)
+        
+        rval = [
+            tensor.dot(gz, y.T) if _is_dense_variable(y) else dot(gz, y.T),
+            tensor.dot(x.T, gz) if _is_dense_variable(x) else dot(x.T, gz)
+        ]
+        
+        return rval
+_dot = Dot()
+
+def dot(x, y):
+    """
+    Operation for efficiently calculating the dot product when
+    one or all operands is sparse. Supported format are CSC and CSR.
+    The output of the operation is dense.
+    """
+    if hasattr(x, 'getnnz'): x = as_sparse_variable(x)
+    if hasattr(y, 'getnnz'): y = as_sparse_variable(y)
+
+    x_is_sparse_variable = _is_sparse_variable(x)
+    y_is_sparse_variable = _is_sparse_variable(y)
+    
+    if not x_is_sparse_variable and not y_is_sparse_variable:
+        raise TypeError()
+    
+    return _dot(x, y)
+
+
+class Usmm(gof.op.Op):
+    """
+    Performs the expression is alpha * x y + z
+
+    x or y are sparse matrix(the other can be sparse or dense)
+    z is a dense matrix
+    alpha is a scalar
+    """
+    def __eq__(self, other):
+        return type(self) == type(other)
+    
+    def __hash__(self):
+        return hash(type(self))
+    
+    def __ne__(self, other):
+        return not (self == other)
+    
+    def __str__(self):
+        return 'Usmm{no_inplace}'
+    
+    def make_node(self, alpha, x, y, z):
+        if not _is_sparse_variable(x) and not _is_sparse_variable(y):
+            # If x and y are tensor, we don't want to use this class
+            # We should use Dot22 and Gemm in that case.
+            raise TypeError(x)
+
+        dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype, y.type.dtype, z.type.dtype)
+        alpha = tensor.as_tensor_variable(alpha)
+        z = tensor.as_tensor_variable(z)
+
+        assert z.ndim == 2
+        assert alpha.type.broadcastable == (True,)* alpha.ndim
+        if not _is_sparse_variable(x):
+            x = tensor.as_tensor_variable(x)
+            assert x.ndim == 2
+        if not _is_sparse_variable(y):
+            y = tensor.as_tensor_variable(y)
+            assert y.ndim == 2
+
+        return gof.Apply(self, [alpha, x, y, z], [tensor.tensor(dtype=dtype_out, broadcastable=(False, False))])
+    
+    def perform(self, node, (alpha, x, y, z), (out, )):
+        x_is_sparse = _is_sparse(x)
+        y_is_sparse = _is_sparse(y)
+        
+        if not x_is_sparse and not y_is_sparse:
+            raise TypeError(x)
+        
+        rval = x * y
+        if isinstance(rval, scipy.sparse.spmatrix):
+            rval = rval.toarray()
+        if rval.dtype == alpha.dtype:
+            rval *= alpha # Faster because operation is inplace
+        else:
+            rval = rval * alpha
+        if rval.dtype == z.dtype:
+            rval += z # Faster because operation is inplace
+        else:
+            rval = rval + z
+        
+        out[0] = rval
+usmm = Usmm()
+
+class UsmmCscDense(gof.Op):
+    """
+    Performs the expression is alpha * x y + z
+    This is an optimized operation for the case when x is in CSC format.
+        
+    x are sparse matrix
+    y, z is a dense matrix
+    alpha is a scalar
+    """
+    def __init__(self, inplace):
+        self.inplace = inplace
+        if inplace:
+            self.destroy_map={ 0 : [6] }
+    def __str__(self):
+        if self.inplace:
+            return 'UsmmCscDense{inplace}'
+        else:
+            return 'UsmmCscDense{no_inplace}'
+    def __eq__(self, other):
+        return (type(self) == type(other)) and self.inplace == other.inplace
+    def __hash__(self):
+        return hash(type(self))
+    def make_node(self, alpha, x_val, x_ind, x_ptr, x_nrows, y, z):
+        x_val = tensor.as_tensor_variable(x_val)
+        x_ind = tensor.as_tensor_variable(x_ind)
+        x_ptr = tensor.as_tensor_variable(x_ptr)
+        y = tensor.as_tensor_variable(y)
+        z = tensor.as_tensor_variable(z)
+        assert x_ind.dtype == 'int32'
+        assert x_ptr.dtype == 'int32'
+        assert x_nrows.dtype == 'int32'
+        assert alpha.ndim == 2 and alpha.type.broadcastable == (True, True)
+        assert x_val.ndim == 1
+        assert y.ndim == 2
+        assert z.ndim == 2
+
+        dtype_out = scalar.upcast(alpha.type.dtype, x_val.type.dtype, y.type.dtype, z.type.dtype)
+        alpha = tensor.as_tensor_variable(alpha)
+
+        if self.inplace:
+            assert z.type.dtype == dtype_out
+
+        # axpy work only with the same dtype, so we should upcast the input
+        if dtype_out != alpha.type.dtype:
+            alpha = tensor.cast(alpha, dtype_out)
+        if dtype_out != x_val.type.dtype:
+            x_val = tensor.cast(x_val, dtype_out)
+        if dtype_out != y.type.dtype:
+            raise NotImplementedError("We need sparse cast to be implemented!")
+        if dtype_out != z.type.dtype:
+            z = tensor.cast(z, dtype_out)
+
+
+        r = gof.Apply(self, [alpha, x_val, x_ind, x_ptr, x_nrows, y, z], 
+                [tensor.tensor(dtype_out, (False, y.type.broadcastable[1]))])
+        return r
+
+    #def perform(self, node, (alpha, x_val, x_ind, x_ptr, x_nrows, y, z), (out,)):
+    #    raise NotImplemented()
+    
+    def c_support_code(self):
+        return blas.blas_header_text()
+    
+    def c_libraries(self):
+        return blas.ldflags()
+
+    def c_compile_args(self):
+        return blas.ldflags(libs=False, flags=True)
+
+    def c_lib_dirs(self):
+        return blas.ldflags(libs=False, libs_dir=True)
+    
+    def c_header_dirs(self):
+        return blas.ldflags(libs=False, include_dir=True)
+    
+    def c_code(self, node, name, (alpha, x_val, x_ind, x_ptr, x_nrows, y, z), (zn,), sub):
+        if node.inputs[1].type.dtype in ('complex64', 'complex128'):
+            raise NotImplementedError('Complex types are not supported for x_val')
+        if node.inputs[5].type.dtype in ('complex64', 'complex128'):
+            raise NotImplementedError('Complex types are not supported for y')
+        if node.inputs[6].type.dtype != node.outputs[0].type.dtype:
+            raise NotImplementedError('z and output must have same type')
+        
+        if node.inputs[1].type.dtype == "float32":
+            conv_type = "float"
+            axpy = "saxpy_"
+        else:
+            conv_type = "double"
+            axpy = "daxpy_"
+        
+        typenum_alpha = node.inputs[0].type.dtype_specs()[-1] # retrieve dtype number
+        typenum_x_val = node.inputs[1].type.dtype_specs()[-1] # retrieve dtype number
+        typenum_y = node.inputs[5].type.dtype_specs()[-1] # retrieve dtype number
+        typenum_z = node.inputs[6].type.dtype_specs()[-1] # retrieve dtype number
+        typenum_zn = node.outputs[0].type.dtype_specs()[-1] # retrieve dtype number
+        
+        inplace = int(self.inplace)
+        
+        rval = """
+        if (%(x_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_val) != 1"); %(fail)s;}
+        if (%(x_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ind) != 1"); %(fail)s;}
+        if (%(x_ptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ptr) != 1"); %(fail)s;}
+        if (%(x_nrows)s->nd != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_nrows) != 0"); %(fail)s;}
+        if (%(y)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
+
+        if (%(x_val)s->descr->type_num != %(typenum_x_val)s) {
+        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for x_val"); %(fail)s;}
+
+        if (%(y)s->descr->type_num != %(typenum_y)s) {
+        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for y"); %(fail)s;}
+
+        if (%(z)s->descr->type_num != %(typenum_z)s) {
+        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for z"); %(fail)s;}
+
+        if (%(alpha)s->descr->type_num != %(typenum_alpha)s) {
+        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for alpha"); %(fail)s;}
+
+        if (%(x_ind)s->descr->type_num != PyArray_INT32) {
+        PyErr_SetString(PyExc_NotImplementedError, "x_ind dtype not INT32"); %(fail)s;}
+
+        if (%(x_ptr)s->descr->type_num != PyArray_INT32)
+        {PyErr_SetString(PyExc_NotImplementedError, "x_ptr dtype not INT32"); %(fail)s;}
+
+        if (%(x_nrows)s->descr->type_num != PyArray_INT32)
+        {PyErr_SetString(PyExc_NotImplementedError, "x_nrows dtype not INT32"); %(fail)s;}
+
+        if (%(x_val)s->dimensions[0] != %(x_ind)s->dimensions[0])
+        {PyErr_SetString(PyExc_NotImplementedError, "x_val and x_ind have different lengths"); %(fail)s;}
+
+        if (%(x_ptr)s->dimensions[0] != %(y)s->dimensions[0]+1)
+        {PyErr_SetString(PyExc_NotImplementedError, "x's number of columns doesn't match y's rows"); %(fail)s;}
+        
+        if (%(z)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0] || %(z)s->dimensions[1] != %(y)s->dimensions[1])
+        {PyErr_SetString(PyExc_NotImplementedError, "The dimension of z and the output must match"); %(fail)s;}
+        
+        if (PyArray_SIZE(%(alpha)s) != 1)
+        {PyErr_SetString(PyExc_NotImplementedError, "The number of element in alpha must be 1"); %(fail)s;}
+
+        if (%(alpha)s->nd != 2)
+        {PyErr_SetString(PyExc_NotImplementedError, "The number dimension of alpha must be 2"); %(fail)s;}
+
+        if (%(x_val)s->nd != 1)
+        {PyErr_SetString(PyExc_NotImplementedError, "The number dimension of x_val must be 1"); %(fail)s;}
+
+        if (%(y)s->nd != 2)
+        {PyErr_SetString(PyExc_NotImplementedError, "The number dimension of y must be 2"); %(fail)s;}
+
+        if (%(z)s->nd != 2)
+        {PyErr_SetString(PyExc_NotImplementedError, "The number dimension of z must be 2"); %(fail)s;}
+
+        if (%(inplace)s)
+        {
+            if (%(typenum_zn)s != %(typenum_z)s) {
+            PyErr_SetString(PyExc_NotImplementedError, "When inplace the output dtype must be the same as the input"); %(fail)s;}
+
+            Py_XDECREF(%(zn)s);
+            %(zn)s = %(z)s;
+            Py_INCREF(%(zn)s);
+        }        
+        else if (!%(zn)s
+            || (%(zn)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0])
+            || (%(zn)s->dimensions[1] != %(y)s->dimensions[1])
+            )
+        {
+            {Py_XDECREF(%(zn)s);}
+            npy_intp dims[] = {0,0};
+            dims[0] = ((npy_int32 *)%(x_nrows)s->data)[0];
+            dims[1] = %(y)s->dimensions[1];
+            %(zn)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_zn)s);
+        }
+        
+        {
+            // sparse array has size MxK, dense KxN, output MxN
+            npy_intp M = %(zn)s->dimensions[0];
+            npy_intp N = %(zn)s->dimensions[1];
+            npy_intp K = %(y)s->dimensions[0];
+            
+            // pointers to access actual data in the arrays passed as params.
+            dtype_%(z)s* __restrict__ Dz   = (dtype_%(z)s*)%(z)s->data;
+            dtype_%(zn)s* __restrict__ Dzn   = (dtype_%(zn)s*)%(zn)s->data;
+            const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
+            const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
+            const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
+            const dtype_%(alpha)s alpha = ((dtype_%(alpha)s*)%(alpha)s->data)[0];
+            
+            npy_intp Sz = %(z)s->strides[1] / %(z)s->descr->elsize;
+            npy_intp Szn = %(zn)s->strides[1] / %(zn)s->descr->elsize;
+            npy_intp Sval = %(x_val)s->strides[0] / %(x_val)s->descr->elsize;
+            npy_intp Sind = %(x_ind)s->strides[0] / %(x_ind)s->descr->elsize;
+            npy_intp Sptr = %(x_ptr)s->strides[0] / %(x_ptr)s->descr->elsize;
+            npy_intp Sy = %(y)s->strides[1] / %(y)s->descr->elsize;
+            
+            
+            if (!(%(inplace)s))
+            {
+                memcpy(Dzn, Dz, M*N*sizeof(dtype_%(zn)s));
+            }
+            
+            for (npy_int32 k = 0; k < K; ++k)
+            {
+                for (npy_int32 m_idx = Dptr[k * Sptr]; m_idx < Dptr[(k+1)*Sptr]; ++m_idx)
+                {
+                    const npy_int32 m = Dind[m_idx * Sind]; // row index of non-null value for column K
+                    
+                    const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
+                    
+                    const dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
+                    
+                    const dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
+                    
+                    %(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
+                }
+            }
+        }
+        """% dict(locals(), **sub)
+
+        return rval
+usmm_csc_dense = UsmmCscDense(inplace=False)
+usmm_csc_dense_inplace = UsmmCscDense(inplace=True)
+
+local_usmm = gof.opt.PatternSub((tensor.sub, 'z', (tensor.mul, {'pattern' : 'alpha', 'constraint' : lambda expr: numpy.all(expr.type.broadcastable) },
+                                                               (_dot, 'x', 'y'))),
+                                (usmm, (tensor.neg, 'alpha'), 'x', 'y', 'z'))
+register_specialize(local_usmm, name="local_usmm")
+
+
+@gof.local_optimizer([usmm])
+def local_usmm_csx(node):
+    if node.op == usmm:
+        alpha, x, y, z = node.inputs
+        
+        x_is_sparse_variable = _is_sparse_variable(x)
+        y_is_sparse_variable = _is_sparse_variable(y)
+        
+        if x_is_sparse_variable and not y_is_sparse_variable:
+            if x.type.format == 'csc':
+                x_val, x_ind, x_ptr, x_shape = csm_properties(x)
+                x_nsparse = x_shape[0]
+                dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype, y.type.dtype, z.type.dtype)
+                # Sparse cast is not implemented.
+                if y.type.dtype != dtype_out:
+                    return False
+
+                return [usmm_csc_dense(alpha, x_val, x_ind, x_ptr, x_nsparse, y, z)]
+    return False
+register_specialize(local_usmm_csx)
+
+@gof.local_optimizer([usmm_csc_dense])
+def local_usmm_csc_dense_inplace(node):
+    if node.op == usmm_csc_dense:
+        return [usmm_csc_dense_inplace(*node.inputs)]
+register_specialize(local_usmm_csc_dense_inplace, 'inplace')
--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -26,6 +26,15 @@ from theano import tensor
 from theano.tensor.basic import _allclose


+def as_sparse_format(data, format):
+    if format == 'csc':
+        return scipy.sparse.csc_matrix(data)
+    elif format == 'csr':
+        return scipy.sparse.csr_matrix(data)
+    else:
+        raise NotImplementedError()
+
+
 def eval_outputs(outputs):
    return compile.function([], outputs)()[0]

@@ -513,6 +522,140 @@ class test_structureddot(unittest.TestCase):
            if not theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
                self.assertFalse(theano_time > overhead_rtol*scipy_time + overhead_tol)

+
+class DotTests(unittest.TestCase):
+    def setUp(self):
+        x_size = (10, 1000)
+        y_size = (1000, 10000)
+        
+        self.x_csr = scipy.sparse.csr_matrix(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
+        self.x_csc = scipy.sparse.csc_matrix(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
+        self.y = numpy.asarray(numpy.random.uniform(-1, 1, y_size), dtype=theano.config.floatX)
+        self.y_csr = scipy.sparse.csr_matrix(numpy.random.binomial(1, 0.5, y_size), dtype=theano.config.floatX)
+        self.y_csc = scipy.sparse.csc_matrix(numpy.random.binomial(1, 0.5, y_size), dtype=theano.config.floatX)
+
+    
+    def test_csr_dense(self):
+        x = theano.sparse.csr_matrix('x')
+        y = theano.tensor.matrix('y')
+    
+        f_a = theano.function([x, y], theano.sparse.dot(x, y))
+        f_b = lambda x, y: x * y
+        
+        assert abs(f_a(self.x_csr, self.y) - f_b(self.x_csr, self.y)).max() < 10**-4
+
+    def test_csc_dense(self):
+        x = theano.sparse.csc_matrix('x')
+        y = theano.tensor.matrix('y')
+    
+        f_a = theano.function([x, y], theano.sparse.dot(x, y))
+        f_b = lambda x, y: x * y
+        
+        assert abs(f_a(self.x_csc, self.y) - f_b(self.x_csc, self.y)).max() < 10**-4
+    def test_sparse_sparse(self):
+        for d1, d2 in [('float32', 'float32'),
+                       ('float32', 'float64'),
+                       ('float64', 'float32'),
+                       ('float64', 'float64'),
+                       ]:
+            for x_f, y_f in [('csc','csc'),
+                             ('csc','csr'),
+                             ('csr','csc'),
+                             ('csr','csr'),
+                             ]:
+                x = theano.sparse.SparseType(format=x_f, dtype=d1)('x')
+                y = theano.sparse.SparseType(format=x_f, dtype=d2)('x')
+
+                f_a = theano.function([x, y], theano.sparse.dot(x, y))
+                f_b = lambda x, y: x * y
+
+                vx = getattr(self,'x_'+x_f).astype(d1)
+                vy = getattr(self,'y_'+y_f).astype(d2)
+                assert abs(f_a(vx, vy) - f_b(vx, vy)).max() < 10**-4
+
+
+class UsmmTests(unittest.TestCase):
+    def setUp(self):
+        x_size = (10, 200)
+        y_size = (200, 2000)
+        z_size = (x_size[0], y_size[1])
+        
+        self.x = numpy.asarray(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
+        self.y = numpy.asarray(numpy.random.uniform(-1, 1, y_size), dtype=theano.config.floatX)
+        self.z = numpy.asarray(numpy.random.uniform(-1, 1, z_size), dtype=theano.config.floatX)
+    
+    def test(self):
+        def mat(format, name, dtype):
+            if format == 'dense':
+                return theano.tensor.matrix(name, dtype=dtype)
+            else:
+                return theano.sparse.matrix(format, name, dtype=dtype)
+
+        for dtype1 in ['float32', 'float64']:
+            for dtype2 in ['float32', 'float64']:
+                for dtype3 in ['float32', 'float64']:
+                    for dtype4 in ['float32', 'float64']:
+                        for format1 in ['dense', 'csc','csr']:
+                            for format2 in ['dense', 'csc','csr']:
+                                if format1 == 'dense' and format2 == 'dense':
+                                    # Usmm won't be used!
+                                    continue
+                                x = mat(format1, 'x', dtype1)
+                                y = mat(format2, 'y', dtype2)
+                                a = theano.tensor.scalar('a', dtype=dtype3)
+                                z = theano.tensor.shared(numpy.asarray(self.z,dtype=dtype4).copy())
+
+                                f_b = lambda z, a, x, y: z - a * (x * y)
+                                x_data = numpy.asarray(self.x, dtype = dtype1)
+                                if format1 != 'dense':
+                                    x_data = as_sparse_format(x_data, format1)
+                                y_data = numpy.asarray(self.y, dtype = dtype2)
+                                if format2 != 'dense':
+                                    y_data = as_sparse_format(y_data, format2)
+                                z_data = numpy.asarray(self.z, dtype = dtype3)
+
+                                f_b_out = f_b(z_data, 1, x_data, y_data)
+
+                                # Can it work inplace?
+                                inplace = dtype4 == theano.scalar.upcast(dtype1, dtype2, dtype3)
+
+                                # To make it easier to check the toposort
+                                mode = theano.compile.mode.get_default_mode().excluding('fusion')
+
+                                if inplace:
+                                    f_a = theano.function([a, x, y], [],
+                                                          updates={ z : z - a * theano.sparse.dot(x, y)},
+                                                          mode = mode)
+                                    f_a(1, x_data, y_data)
+                                    assert abs(z.get_value(borrow=True) - f_b_out).max() < 10**-4
+                                else:
+                                    f_a = theano.function([a, x, y], z - a * theano.sparse.dot(x, y),
+                                                          mode = mode)
+                                    f_a_out = f_a(1, x_data, y_data)
+                                    assert abs(f_a_out - f_b_out).max() < 10**-4
+                                topo = f_a.maker.env.toposort()
+                                if (y.type.dtype == theano.scalar.upcast(dtype1, dtype2, dtype3, dtype4)
+                                    and format1=='csc' and format2=='dense'):
+
+                                    assert sum([isinstance(node.op, tensor.Elemwise) and isinstance(node.op.scalar_op, theano.scalar.basic.Cast) for node in topo])==len(topo)-5
+                                    topo = [node for node in topo if not(isinstance(node.op, tensor.Elemwise) and isinstance(node.op.scalar_op, theano.scalar.basic.Cast))]
+                                    assert len(topo)==5, topo
+                                    # Usmm is tested at the same time in debugmode
+                                    # Check if the optimization local_usmm and local_usmm_csx is applied
+                                    assert isinstance(topo[0].op, theano.sparse.basic.CSMProperties)
+                                    assert isinstance(topo[1].op, theano.tensor.DimShuffle)
+                                    assert isinstance(topo[2].op, theano.tensor.Subtensor)
+                                    assert topo[3].op == theano.tensor.neg
+                                    assert isinstance(topo[4].op, theano.sparse.UsmmCscDense)
+                                    if inplace:
+                                        assert topo[4].op.inplace
+                                else:
+                                    assert len(topo)==3, topo
+                                    assert isinstance(topo[0].op, theano.tensor.DimShuffle)
+                                    assert topo[1].op == theano.tensor.neg
+                                    assert isinstance(topo[2].op, theano.sparse.Usmm)
+
+
 def test_shape_i():
    sparse_dtype = 'float32'