Merge branch 'ynd_sparse' of https://github.com/dwf/Theano into sparse

Conflicts: theano/sparse/basic.py

Merge branch 'ynd_sparse' of https://github.com/dwf/Theano into sparse
34836892 · Yann N. Dauphin · 8498dfdd · d1c3a6bb · 34836892 · 34836892
--- a/theano/gof/python25.py
+++ b/theano/gof/python25.py
@@ -91,5 +91,17 @@ if sys.version_info[:2] < (2,6):
            for j in range(i+1, r):
                indices[j] = indices[j-1] + 1
            yield tuple(pool[i] for i in indices)
+
+
+    def product(*args, **kwds):
+        # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
+        # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111
+        pools = map(tuple, args) * kwds.get('repeat', 1)
+        result = [[]]
+        for pool in pools:
+            result = [x+[y] for x in result for y in pool]
+        for prod in result:
+            yield tuple(prod)
+
 else:
-    from itertools import combinations
+    from itertools import combinations, product
--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -1451,8 +1451,11 @@ class StructuredDotGradCSR(gof.Op):
        }

        """% dict(locals(), **sub)
+
+
 sdg_csr = StructuredDotGradCSR()

+
 class Dot(gof.op.Op):
    """
    Operation for efficiently calculating the dot product when
@@ -1461,13 +1464,13 @@ class Dot(gof.op.Op):
    """
    def __eq__(self, other):
        return type(self) == type(other)
-    
+
    def __hash__(self):
        return hash(type(self))
-    
+
    def __ne__(self, other):
        return not (self == other)
-    
+
    def infer_shape(self, node, shapes):
        xshp, yshp = shapes
        x, y = node.inputs
@@ -1480,56 +1483,66 @@ class Dot(gof.op.Op):
        if x.ndim == 1 and y.ndim == 1:
            return [()]
        raise NotImplementedError()
-    
+
    def make_node(self, x, y):
        dtype_out = scalar.upcast(x.type.dtype, y.type.dtype)
-        
+
        if not _is_sparse_variable(x) and not _is_sparse_variable(y):
            raise TypeError(x)

        return gof.Apply(self, [x, y], [tensor.tensor(dtype=dtype_out,
                         broadcastable=(False, False))])
-    
-    def perform(self, node, (x, y), (out, )):
+
+    def perform(self, node, inputs, out):
+        x, y = inputs
+        out = out[0]
        x_is_sparse = _is_sparse(x)
        y_is_sparse = _is_sparse(y)
-        
+
        if not x_is_sparse and not y_is_sparse:
            raise TypeError(x)
-                
+
        rval = x * y
-        
+
        if x_is_sparse and y_is_sparse:
            rval = rval.todense()
-        
+
        out[0] = rval
-    
+
    def grad(self, (x, y), (gz,)):
        assert _is_sparse_variable(x) or _is_sparse_variable(y)
-        
-        rval = [
-            tensor.dot(gz, y.T) if _is_dense_variable(y) else dot(gz, y.T),
-            tensor.dot(x.T, gz) if _is_dense_variable(x) else dot(x.T, gz)
-        ]
-        
+        rval = []
+
+        if _is_dense_variable(y):
+            rval.append(tensor.dot(gz, y.T))
+        else:
+            rval.append(dot(gz, y.T))
+        if _is_dense_variable(x):
+            rval.append(tensor.dot(x.T, gz))
+        else:
+            rval.append(dot(x.T, gz))
+
        return rval
 _dot = Dot()

+
 def dot(x, y):
    """
    Operation for efficiently calculating the dot product when
    one or all operands is sparse. Supported format are CSC and CSR.
    The output of the operation is dense.
    """
-    if hasattr(x, 'getnnz'): x = as_sparse_variable(x)
-    if hasattr(y, 'getnnz'): y = as_sparse_variable(y)
+    if hasattr(x, 'getnnz'):
+        x = as_sparse_variable(x)
+    if hasattr(y, 'getnnz'):
+        y = as_sparse_variable(y)

    x_is_sparse_variable = _is_sparse_variable(x)
    y_is_sparse_variable = _is_sparse_variable(y)
-    
+
    if not x_is_sparse_variable and not y_is_sparse_variable:
        raise TypeError()
-    
+
    return _dot(x, y)


@@ -1543,16 +1556,16 @@ class Usmm(gof.op.Op):
    """
    def __eq__(self, other):
        return type(self) == type(other)
-    
+
    def __hash__(self):
        return hash(type(self))
-    
+
    def __ne__(self, other):
        return not (self == other)
-    
+
    def __str__(self):
        return 'Usmm{no_inplace}'
-    
+
    def infer_shape(self, node, shapes):
        xshp, yshp = shapes
        x, y = node.inputs
@@ -1565,19 +1578,20 @@ class Usmm(gof.op.Op):
        if x.ndim == 1 and y.ndim == 1:
            return [()]
        raise NotImplementedError()
-    
+
    def make_node(self, alpha, x, y, z):
        if not _is_sparse_variable(x) and not _is_sparse_variable(y):
            # If x and y are tensor, we don't want to use this class
            # We should use Dot22 and Gemm in that case.
            raise TypeError(x)

-        dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype, y.type.dtype, z.type.dtype)
+        dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype,
+                                  y.type.dtype, z.type.dtype)
        alpha = tensor.as_tensor_variable(alpha)
        z = tensor.as_tensor_variable(z)

        assert z.ndim == 2
-        assert alpha.type.broadcastable == (True,)* alpha.ndim
+        assert alpha.type.broadcastable == (True,) * alpha.ndim
        if not _is_sparse_variable(x):
            x = tensor.as_tensor_variable(x)
            assert x.ndim == 2
@@ -1585,35 +1599,38 @@ class Usmm(gof.op.Op):
            y = tensor.as_tensor_variable(y)
            assert y.ndim == 2

-        return gof.Apply(self, [alpha, x, y, z], [tensor.tensor(dtype=dtype_out, broadcastable=(False, False))])
-    
+        return gof.Apply(self, [alpha, x, y, z],
+                         [tensor.tensor(dtype=dtype_out,
+                                        broadcastable=(False, False))])
+
    def perform(self, node, (alpha, x, y, z), (out, )):
        x_is_sparse = _is_sparse(x)
        y_is_sparse = _is_sparse(y)
-        
+
        if not x_is_sparse and not y_is_sparse:
            raise TypeError(x)
-        
+
        rval = x * y
        if isinstance(rval, scipy.sparse.spmatrix):
            rval = rval.toarray()
        if rval.dtype == alpha.dtype:
-            rval *= alpha # Faster because operation is inplace
+            rval *= alpha  # Faster because operation is inplace
        else:
            rval = rval * alpha
        if rval.dtype == z.dtype:
-            rval += z # Faster because operation is inplace
+            rval += z   # Faster because operation is inplace
        else:
            rval = rval + z
-        
+
        out[0] = rval
 usmm = Usmm()

+
 class UsmmCscDense(gof.Op):
    """
    Performs the expression is alpha * x y + z
    This is an optimized operation for the case when x is in CSC format.
-        
+
    x are sparse matrix
    y, z is a dense matrix
    alpha is a scalar
@@ -1621,16 +1638,20 @@ class UsmmCscDense(gof.Op):
    def __init__(self, inplace):
        self.inplace = inplace
        if inplace:
-            self.destroy_map={ 0 : [6] }
+            self.destroy_map = {0: [6]}
+
    def __str__(self):
        if self.inplace:
            return 'UsmmCscDense{inplace}'
        else:
            return 'UsmmCscDense{no_inplace}'
+
    def __eq__(self, other):
        return (type(self) == type(other)) and self.inplace == other.inplace
+
    def __hash__(self):
        return hash(type(self)) ^ self.inplace
+
    def infer_shape(self, node, shapes):
        xshp, yshp = shapes
        x, y = node.inputs
@@ -1643,6 +1664,7 @@ class UsmmCscDense(gof.Op):
        if x.ndim == 1 and y.ndim == 1:
            return [()]
        raise NotImplementedError()
+
    def make_node(self, alpha, x_val, x_ind, x_ptr, x_nrows, y, z):
        alpha = tensor.as_tensor_variable(alpha)
        x_val = tensor.as_tensor_variable(x_val)
@@ -1678,16 +1700,13 @@ class UsmmCscDense(gof.Op):
        if dtype_out != z.type.dtype:
            z = tensor.cast(z, dtype_out)
        
-        r = gof.Apply(self, [alpha, x_val, x_ind, x_ptr, x_nrows, y, z], 
+        r = gof.Apply(self, [alpha, x_val, x_ind, x_ptr, x_nrows, y, z],
                [tensor.tensor(dtype_out, (False, y.type.broadcastable[1]))])
        return r

-    #def perform(self, node, (alpha, x_val, x_ind, x_ptr, x_nrows, y, z), (out,)):
-    #    raise NotImplemented()
-    
    def c_support_code(self):
        return blas.blas_header_text()
-    
+
    def c_libraries(self):
        return blas.ldflags()

@@ -1696,33 +1715,36 @@ class UsmmCscDense(gof.Op):

    def c_lib_dirs(self):
        return blas.ldflags(libs=False, libs_dir=True)
-    
+
    def c_header_dirs(self):
        return blas.ldflags(libs=False, include_dir=True)
-    
-    def c_code(self, node, name, (alpha, x_val, x_ind, x_ptr, x_nrows, y, z), (zn,), sub):
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        alpha, x_val, x_ind, x_ptr, x_nrows, y, z = inputs
+        zn = outputs[0]
        if node.inputs[1].type.dtype in ('complex64', 'complex128'):
-            raise NotImplementedError('Complex types are not supported for x_val')
+            raise NotImplementedError('Complex types are not supported for '
+                                      'x_val')
        if node.inputs[5].type.dtype in ('complex64', 'complex128'):
            raise NotImplementedError('Complex types are not supported for y')
        if node.inputs[6].type.dtype != node.outputs[0].type.dtype:
            raise NotImplementedError('z and output must have same type')
-        
+
        if node.inputs[1].type.dtype == "float32":
            conv_type = "float"
            axpy = "saxpy_"
        else:
            conv_type = "double"
            axpy = "daxpy_"
-        
-        typenum_alpha = node.inputs[0].type.dtype_specs()[-1] # retrieve dtype number
-        typenum_x_val = node.inputs[1].type.dtype_specs()[-1] # retrieve dtype number
-        typenum_y = node.inputs[5].type.dtype_specs()[-1] # retrieve dtype number
-        typenum_z = node.inputs[6].type.dtype_specs()[-1] # retrieve dtype number
-        typenum_zn = node.outputs[0].type.dtype_specs()[-1] # retrieve dtype number
-        
+        # retrieve dtype numbers
+        typenum_alpha = node.inputs[0].type.dtype_specs()[-1]
+        typenum_x_val = node.inputs[1].type.dtype_specs()[-1]
+        typenum_y = node.inputs[5].type.dtype_specs()[-1]
+        typenum_z = node.inputs[6].type.dtype_specs()[-1]
+        typenum_zn = node.outputs[0].type.dtype_specs()[-1]
+
        inplace = int(self.inplace)
-        
+
        rval = """
        if (%(x_val)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_val) != 1"); %(fail)s;}
        if (%(x_ind)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(x_ind) != 1"); %(fail)s;}
@@ -1756,10 +1778,10 @@ class UsmmCscDense(gof.Op):

        if (%(x_ptr)s->dimensions[0] != %(y)s->dimensions[0]+1)
        {PyErr_SetString(PyExc_NotImplementedError, "x's number of columns doesn't match y's rows"); %(fail)s;}
-        
+
        if (%(z)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0] || %(z)s->dimensions[1] != %(y)s->dimensions[1])
        {PyErr_SetString(PyExc_NotImplementedError, "The dimension of the allocated output doesn't match the correct output size."); %(fail)s;}
-        
+
        if (PyArray_SIZE(%(alpha)s) != 1)
        {PyErr_SetString(PyExc_NotImplementedError, "The number of element in alpha must be 1"); %(fail)s;}

@@ -1783,7 +1805,7 @@ class UsmmCscDense(gof.Op):
            Py_XDECREF(%(zn)s);
            %(zn)s = %(z)s;
            Py_INCREF(%(zn)s);
-        }        
+        }
        else if (!%(zn)s
            || (%(zn)s->dimensions[0] != ((npy_int32 *)%(x_nrows)s->data)[0])
            || (%(zn)s->dimensions[1] != %(y)s->dimensions[1])
@@ -1795,13 +1817,13 @@ class UsmmCscDense(gof.Op):
            dims[1] = %(y)s->dimensions[1];
            %(zn)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_zn)s);
        }
-        
+
        {
            // sparse array has size MxK, dense KxN, output MxN
            npy_intp M = %(zn)s->dimensions[0];
            npy_intp N = %(zn)s->dimensions[1];
            npy_intp K = %(y)s->dimensions[0];
-            
+
            // pointers to access actual data in the arrays passed as params.
            dtype_%(z)s* __restrict__ Dz   = (dtype_%(z)s*)%(z)s->data;
            dtype_%(zn)s* __restrict__ Dzn   = (dtype_%(zn)s*)%(zn)s->data;
@@ -1809,45 +1831,54 @@ class UsmmCscDense(gof.Op):
            const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
            const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
            const dtype_%(alpha)s alpha = ((dtype_%(alpha)s*)%(alpha)s->data)[0];
-            
+
            npy_intp Sz = %(z)s->strides[1] / %(z)s->descr->elsize;
            npy_intp Szn = %(zn)s->strides[1] / %(zn)s->descr->elsize;
            npy_intp Sval = %(x_val)s->strides[0] / %(x_val)s->descr->elsize;
            npy_intp Sind = %(x_ind)s->strides[0] / %(x_ind)s->descr->elsize;
            npy_intp Sptr = %(x_ptr)s->strides[0] / %(x_ptr)s->descr->elsize;
            npy_intp Sy = %(y)s->strides[1] / %(y)s->descr->elsize;
-            
-            
+
+
            if (!(%(inplace)s))
            {
                memcpy(Dzn, Dz, M*N*sizeof(dtype_%(zn)s));
            }
-            
+
            for (npy_int32 k = 0; k < K; ++k)
            {
                for (npy_int32 m_idx = Dptr[k * Sptr]; m_idx < Dptr[(k+1)*Sptr]; ++m_idx)
                {
                    const npy_int32 m = Dind[m_idx * Sind]; // row index of non-null value for column K
-                    
+
                    const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
-                    
+
                    const dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
-                    
+
                    const dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
-                    
+
                    %(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
                }
            }
        }
-        """% dict(locals(), **sub)
+        """ % dict(locals(), **sub)

        return rval
+
+
 usmm_csc_dense = UsmmCscDense(inplace=False)
 usmm_csc_dense_inplace = UsmmCscDense(inplace=True)

-local_usmm = gof.opt.PatternSub((tensor.sub, 'z', (tensor.mul, {'pattern' : 'alpha', 'constraint' : lambda expr: numpy.all(expr.type.broadcastable) },
-                                                               (_dot, 'x', 'y'))),
-                                (usmm, (tensor.neg, 'alpha'), 'x', 'y', 'z'))
+
+local_usmm = gof.opt.PatternSub(
+    (tensor.sub, 'z',
+     (tensor.mul,
+      {'pattern': 'alpha',
+       'constraint': lambda expr: numpy.all(expr.type.broadcastable)},
+    (_dot, 'x', 'y'))),
+    (usmm, (tensor.neg, 'alpha'), 'x', 'y', 'z'))
+
+
 register_specialize(local_usmm, name="local_usmm")


@@ -1855,23 +1886,26 @@ register_specialize(local_usmm, name="local_usmm")
 def local_usmm_csx(node):
    if node.op == usmm:
        alpha, x, y, z = node.inputs
-        
+
        x_is_sparse_variable = _is_sparse_variable(x)
        y_is_sparse_variable = _is_sparse_variable(y)
-        
+
        if x_is_sparse_variable and not y_is_sparse_variable:
            if x.type.format == 'csc':
                x_val, x_ind, x_ptr, x_shape = csm_properties(x)
                x_nsparse = x_shape[0]
-                dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype, y.type.dtype, z.type.dtype)
+                dtype_out = scalar.upcast(alpha.type.dtype, x.type.dtype,
+                                          y.type.dtype, z.type.dtype)
                # Sparse cast is not implemented.
                if y.type.dtype != dtype_out:
                    return False

-                return [usmm_csc_dense(alpha, x_val, x_ind, x_ptr, x_nsparse, y, z)]
+                return [usmm_csc_dense(alpha, x_val, x_ind, x_ptr,
+                                       x_nsparse, y, z)]
    return False
 register_specialize(local_usmm_csx)

+
 @gof.local_optimizer([usmm_csc_dense])
 def local_usmm_csc_dense_inplace(node):
    if node.op == usmm_csc_dense:

--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -12,6 +12,8 @@ except ImportError:
 import theano
 from theano import compile, config
 from theano.sparse import enable_sparse
+from theano.gof.python25 import product
+
 if enable_sparse == False:
    raise SkipTest('Optional package sparse disabled')

@@ -527,31 +529,32 @@ class DotTests(unittest.TestCase):
    def setUp(self):
        x_size = (10, 1000)
        y_size = (1000, 10000)
-        
+
        self.x_csr = scipy.sparse.csr_matrix(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
        self.x_csc = scipy.sparse.csc_matrix(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
        self.y = numpy.asarray(numpy.random.uniform(-1, 1, y_size), dtype=theano.config.floatX)
        self.y_csr = scipy.sparse.csr_matrix(numpy.random.binomial(1, 0.5, y_size), dtype=theano.config.floatX)
        self.y_csc = scipy.sparse.csc_matrix(numpy.random.binomial(1, 0.5, y_size), dtype=theano.config.floatX)

-    
    def test_csr_dense(self):
        x = theano.sparse.csr_matrix('x')
        y = theano.tensor.matrix('y')
-    
+
        f_a = theano.function([x, y], theano.sparse.dot(x, y))
        f_b = lambda x, y: x * y
-        
-        assert abs(f_a(self.x_csr, self.y) - f_b(self.x_csr, self.y)).max() < 10**-4
+
+        assert abs(f_a(self.x_csr, self.y) - f_b(self.x_csr, self.y)).max() < 1e-4

    def test_csc_dense(self):
        x = theano.sparse.csc_matrix('x')
        y = theano.tensor.matrix('y')
-    
+
        f_a = theano.function([x, y], theano.sparse.dot(x, y))
        f_b = lambda x, y: x * y
-        
-        assert abs(f_a(self.x_csc, self.y) - f_b(self.x_csc, self.y)).max() < 10**-4
+
+        assert (abs(f_a(self.x_csc, self.y) - f_b(self.x_csc, self.y)).max()
+                < 1e-4)
+
    def test_sparse_sparse(self):
        for d1, d2 in [('float32', 'float32'),
                       ('float32', 'float64'),
@@ -571,7 +574,7 @@ class DotTests(unittest.TestCase):

                vx = getattr(self,'x_'+x_f).astype(d1)
                vy = getattr(self,'y_'+y_f).astype(d2)
-                assert abs(f_a(vx, vy) - f_b(vx, vy)).max() < 10**-4
+                assert abs(f_a(vx, vy) - f_b(vx, vy)).max() < 1e-4


 class UsmmTests(unittest.TestCase):
@@ -579,11 +582,11 @@ class UsmmTests(unittest.TestCase):
        x_size = (10, 200)
        y_size = (200, 2000)
        z_size = (x_size[0], y_size[1])
-        
+
        self.x = numpy.asarray(numpy.random.binomial(1, 0.5, x_size), dtype=theano.config.floatX)
        self.y = numpy.asarray(numpy.random.uniform(-1, 1, y_size), dtype=theano.config.floatX)
        self.z = numpy.asarray(numpy.random.uniform(-1, 1, z_size), dtype=theano.config.floatX)
-    
+
    def test(self):
        def mat(format, name, dtype):
            if format == 'dense':
@@ -591,69 +594,80 @@ class UsmmTests(unittest.TestCase):
            else:
                return theano.sparse.matrix(format, name, dtype=dtype)

-        for dtype1 in ['float32', 'float64']:
-            for dtype2 in ['float32', 'float64']:
-                for dtype3 in ['float32', 'float64']:
-                    for dtype4 in ['float32', 'float64']:
-                        for format1 in ['dense', 'csc','csr']:
-                            for format2 in ['dense', 'csc','csr']:
-                                if format1 == 'dense' and format2 == 'dense':
-                                    # Usmm won't be used!
-                                    continue
-                                x = mat(format1, 'x', dtype1)
-                                y = mat(format2, 'y', dtype2)
-                                a = theano.tensor.scalar('a', dtype=dtype3)
-                                z = theano.tensor.shared(numpy.asarray(self.z,dtype=dtype4).copy())
-
-                                f_b = lambda z, a, x, y: z - a * (x * y)
-                                x_data = numpy.asarray(self.x, dtype = dtype1)
-                                if format1 != 'dense':
-                                    x_data = as_sparse_format(x_data, format1)
-                                y_data = numpy.asarray(self.y, dtype = dtype2)
-                                if format2 != 'dense':
-                                    y_data = as_sparse_format(y_data, format2)
-                                z_data = numpy.asarray(self.z, dtype = dtype3)
-
-                                f_b_out = f_b(z_data, 1, x_data, y_data)
-
-                                # Can it work inplace?
-                                inplace = dtype4 == theano.scalar.upcast(dtype1, dtype2, dtype3)
-
-                                # To make it easier to check the toposort
-                                mode = theano.compile.mode.get_default_mode().excluding('fusion')
-
-                                if inplace:
-                                    f_a = theano.function([a, x, y], [],
-                                                          updates={ z : z - a * theano.sparse.dot(x, y)},
-                                                          mode = mode)
-                                    f_a(1, x_data, y_data)
-                                    assert abs(z.get_value(borrow=True) - f_b_out).max() < 10**-4
-                                else:
-                                    f_a = theano.function([a, x, y], z - a * theano.sparse.dot(x, y),
-                                                          mode = mode)
-                                    f_a_out = f_a(1, x_data, y_data)
-                                    assert abs(f_a_out - f_b_out).max() < 10**-4
-                                topo = f_a.maker.env.toposort()
-                                if (y.type.dtype == theano.scalar.upcast(dtype1, dtype2, dtype3, dtype4)
-                                    and format1=='csc' and format2=='dense'):
-
-                                    assert sum([isinstance(node.op, tensor.Elemwise) and isinstance(node.op.scalar_op, theano.scalar.basic.Cast) for node in topo])==len(topo)-5
-                                    topo = [node for node in topo if not(isinstance(node.op, tensor.Elemwise) and isinstance(node.op.scalar_op, theano.scalar.basic.Cast))]
-                                    assert len(topo)==5, topo
-                                    # Usmm is tested at the same time in debugmode
-                                    # Check if the optimization local_usmm and local_usmm_csx is applied
-                                    assert isinstance(topo[0].op, theano.sparse.basic.CSMProperties)
-                                    assert isinstance(topo[1].op, theano.tensor.DimShuffle)
-                                    assert isinstance(topo[2].op, theano.tensor.Subtensor)
-                                    assert topo[3].op == theano.tensor.neg
-                                    assert isinstance(topo[4].op, theano.sparse.UsmmCscDense)
-                                    if inplace:
-                                        assert topo[4].op.inplace
-                                else:
-                                    assert len(topo)==3, topo
-                                    assert isinstance(topo[0].op, theano.tensor.DimShuffle)
-                                    assert topo[1].op == theano.tensor.neg
-                                    assert isinstance(topo[2].op, theano.sparse.Usmm)
+        params = product(*([['float32', 'float64']] * 4 +
+                           [['dense', 'csc', 'csr']] * 2))
+
+        for dtype1, dtype2, dtype3, dtype4, format1, format2 in params:
+            if format1 == 'dense' and format2 == 'dense':
+                # Usmm won't be used!
+                continue
+            x = mat(format1, 'x', dtype1)
+            y = mat(format2, 'y', dtype2)
+            a = theano.tensor.scalar('a', dtype=dtype3)
+            z = theano.tensor.shared(
+                numpy.asarray(self.z, dtype=dtype4).copy()
+            )
+
+            f_b = lambda z, a, x, y: z - a * (x * y)
+            x_data = numpy.asarray(self.x, dtype=dtype1)
+            if format1 != 'dense':
+                x_data = as_sparse_format(x_data, format1)
+            y_data = numpy.asarray(self.y, dtype=dtype2)
+            if format2 != 'dense':
+                y_data = as_sparse_format(y_data, format2)
+            z_data = numpy.asarray(self.z, dtype=dtype3)
+
+            f_b_out = f_b(z_data, 1, x_data, y_data)
+
+            # Can it work inplace?
+            inplace = dtype4 == theano.scalar.upcast(dtype1, dtype2, dtype3)
+
+            # To make it easier to check the toposort
+            mode = theano.compile.mode.get_default_mode().excluding('fusion')
+
+            if inplace:
+                updates = {z: z - a * theano.sparse.dot(x, y)}
+                f_a = theano.function([a, x, y], [],
+                                      updates=updates,
+                                      mode=mode)
+                f_a(1, x_data, y_data)
+                assert abs(z.get_value(borrow=True) - f_b_out).max() < 1e-4
+            else:
+                f_a = theano.function([a, x, y],
+                                      z - a * theano.sparse.dot(x, y),
+                                      mode=mode)
+                f_a_out = f_a(1, x_data, y_data)
+                assert abs(f_a_out - f_b_out).max() < 1e-4
+            topo = f_a.maker.env.toposort()
+            up = theano.scalar.upcast(dtype1, dtype2, dtype3, dtype4)
+            if y.type.dtype == up and format1 == 'csc' and format2 == 'dense':
+                assert (sum([isinstance(node.op, tensor.Elemwise) and
+                             isinstance(node.op.scalar_op,
+                                        theano.scalar.basic.Cast)
+                             for node in topo]) == len(topo) - 5)
+                new_topo = []
+                for node in topo:
+                    if not isinstance(node.op, tensor.Elemwise) and \
+                       isinstance(node.op.scalar_op, theano.scalar.basic.Cast):
+                        new_topo.append(node)
+                topo = new_topo
+                assert len(topo) == 5, topo
+                # Usmm is tested at the same time in debugmode
+                # Check if the optimization local_usmm and local_usmm_csx is
+                # applied
+                assert isinstance(topo[0].op,
+                                  theano.sparse.basic.CSMProperties)
+                assert isinstance(topo[1].op, theano.tensor.DimShuffle)
+                assert isinstance(topo[2].op, theano.tensor.Subtensor)
+                assert topo[3].op == theano.tensor.neg
+                assert isinstance(topo[4].op, theano.sparse.UsmmCscDense)
+                if inplace:
+                    assert topo[4].op.inplace
+            else:
+                assert len(topo)==3, topo
+                assert isinstance(topo[0].op, theano.tensor.DimShuffle)
+                assert topo[1].op == theano.tensor.neg
+                assert isinstance(topo[2].op, theano.sparse.Usmm)


 def test_shape_i():