Mostly gradient for CSR matrix (both for the py and c linkers), along with an updated test file, plus misc. tweaks.
Parent: b594140c
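For context: structured_dot(a, b) multiplies a sparse a by a dense b while treating a's sparsity pattern as fixed, so the gradient with respect to a's stored values is the dense gradient g_ab · b^T sampled at that pattern. A minimal NumPy sketch of this identity (editor's illustration; the names here are made up, not code from this commit):

import numpy
import scipy.sparse as sp

a = sp.csr_matrix(numpy.array([[0., 2., 0.], [1., 0., 3.]]))
b = numpy.arange(12.).reshape(3, 4)
g_ab = numpy.ones((2, 4))                 # upstream gradient of a.dot(b)

dense_grad = numpy.dot(g_ab, b.T)         # gradient wrt a dense version of a
row = numpy.repeat(numpy.arange(a.shape[0]), numpy.diff(a.indptr))
g_a_data = dense_grad[row, a.indices]     # one value per stored entry of a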
@@ -294,7 +294,10 @@ class CSM(gof.Op):
    def perform(self, node, (data, indices, indptr, shape), (out,)):
        """Build a csc_matrix"""
        #assert len(data.flatten()) == len(indices.flatten())
-       data = data[self.map] if self.map!=None else data
+       # for efficiency: if the remap does nothing, do not apply it
+       if self.map is not None and not numpy.all(self.map == numpy.arange(numpy.size(self.map))):
+           data = data[self.map]
        if len(shape) != 2:
            raise ValueError('Shape should be an array of length 2')
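The intent of the new branch is to skip the fancy-indexing copy when the remap is an identity permutation, since applying it would copy the data without reordering anything. A quick illustration of the check (editor's sketch, not part of the commit):

import numpy

m = numpy.array([0, 1, 2])
numpy.all(m == numpy.arange(numpy.size(m)))   # True: data[m] would be a no-op copy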
@@ -706,6 +709,7 @@ class StructuredDot(gof.Op):
        #gb = a.T x g_out
        return structured_dot_grad(a, b, g_out), structured_dot(a.T, g_out)

_structured_dot = StructuredDot()

def structured_dot(x, y):
    """
    @todo: Maybe the triple-transposition formulation (when x is dense)
@@ -880,7 +884,7 @@ class StructuredDotGrad(gof.Op):
            raise TypeError()

_structured_dot_grad = StructuredDotGrad()

-class StructureDotGradCSC(gof.Op):
+class StructuredDotGradCSC(gof.Op):

    def make_node(self, a_indices, a_indptr, b, g_ab):
        return gof.Apply(self, [a_indices, a_indptr, b, g_ab], [tensor.tensor(b.dtype, (False,))])

    def perform(self, node, (a_indices, a_indptr, b, g_ab), (out,)):
@@ -940,31 +944,145 @@ class StructureDotGradCSC(gof.Op):
            const npy_int32 * __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * __restrict__ indices = (npy_int32 *)%(_indices)s->data;

            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // extract the j-th row of the dense matrix
                const npy_double * __restrict__ d_row = (npy_double *)(%(_d)s->data + %(_d)s->strides[0] * j);
                if (j >= %(_d)s->dimensions[0]) {PyErr_SetString(PyExc_NotImplementedError, "G"); %(fail)s;}

                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j * Sindptr]; i_idx < indptr[(j+1) * Sindptr]; ++i_idx)
                {
                    // extract the row index of the non-null value
                    npy_int32 i = indices[i_idx * Sindices];

                    // extract the corresponding row of the gradient
                    const npy_double * __restrict__ g_row = (npy_double *)(%(_g)s->data + %(_g)s->strides[0] * i);
                    double ip = 0.0;

                    // guard: the row index must not exceed the number of rows in g
                    // (the pointer arithmetic above does not dereference; the loop below does)
                    if (i >= %(_g)s->dimensions[0])
                    {PyErr_SetString(PyExc_NotImplementedError, "H"); %(fail)s;}

                    // dot product of the dense row with the gradient row
                    for (int k = 0; k < K; ++k)
                    {
                        ip += d_row[k * Sd1] * g_row[k * Sg1];
                    }

                    // write the resulting gradient to the sparse output
                    ((npy_double * __restrict__)(%(_zout)s->data + i_idx * %(_zout)s->strides[0]))[0] = ip;
                }
            }
        }
        """ % dict(locals(), **sub)
-_sdgcsc = StructureDotGradCSC()
+_sdgcsc = StructuredDotGradCSC()
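In Python terms, the CSC kernel above computes, for every value stored at row i of column j, the dot product of row i of the gradient with row j of the dense matrix. A rough pure-Python equivalent (editor's sketch under the same shape assumptions; not code from the commit):

import numpy

def csc_grad_data(a_indices, a_indptr, b, g_ab):
    g_a_data = numpy.zeros(a_indices.shape, dtype=g_ab.dtype)
    for j in xrange(len(a_indptr) - 1):              # loop over columns
        for i_idx in xrange(a_indptr[j], a_indptr[j + 1]):
            i = a_indices[i_idx]                     # row of the stored value
            g_a_data[i_idx] = numpy.dot(g_ab[i], b[j])
    return g_a_data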
class StructuredDotGradCSR(gof.Op):
    def make_node(self, a_indices, a_indptr, b, g_ab):
        return gof.Apply(self, [a_indices, a_indptr, b, g_ab], [tensor.tensor(b.dtype, (False,))])

    def perform(self, node, (a_indices, a_indptr, b, g_ab), (out,)):
        g_a_data = numpy.zeros(a_indices.shape, dtype=g_ab.dtype)
        for i in xrange(len(a_indptr) - 1):        # loop over rows
            ind0 = a_indptr[i]
            ind1 = a_indptr[i + 1]
            for j_idx in xrange(ind0, ind1):       # loop over the values stored in that row (columns)
                j = a_indices[j_idx]
                # grad is the dot product of the i-th row of the gradient with the j-th row of b
                g_a_data[j_idx] = numpy.dot(g_ab[i], b[j])
        out[0] = g_a_data
    def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout,), sub):
        return """
        if (%(_d)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(d) != 2"); %(fail)s;}
        if (%(_g)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(g) != 2"); %(fail)s;}
        if (%(_indices)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1"); %(fail)s;}
        if (%(_indptr)s->nd != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1"); %(fail)s;}

        if (%(_indices)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
        if (%(_indptr)s->descr->type_num != PyArray_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (%(_d)s->descr->type_num != PyArray_DOUBLE)
        {PyErr_SetString(PyExc_NotImplementedError, "d's dtype not NPY_DOUBLE"); %(fail)s;}
        if (%(_g)s->descr->type_num != PyArray_DOUBLE)
        {PyErr_SetString(PyExc_NotImplementedError, "g's dtype not NPY_DOUBLE"); %(fail)s;}
        if (%(_d)s->dimensions[1] != %(_g)s->dimensions[1])
        {PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;}

        if (!%(_zout)s)
        {
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num);
        }
        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
        {
            PyErr_SetString(PyExc_NotImplementedError, "somehow _zout got the wrong size... and I don't know how to resize it.");
            %(fail)s;
        }

        { // this scope lets the code compile even though the error labels jump over variable definitions
            npy_intp nnz = %(_indices)s->dimensions[0];
            // extract number of rows
            npy_intp N = %(_indptr)s->dimensions[0] - 1; //TODO: error checking with this

            npy_intp Sindices = %(_indices)s->strides[0] / %(_indices)s->descr->elsize;
            npy_intp Sindptr = %(_indptr)s->strides[0] / %(_indptr)s->descr->elsize;

            const npy_intp Sd1 = %(_d)s->strides[1] / %(_d)s->descr->elsize;
            const npy_intp Sg1 = %(_g)s->strides[1] / %(_g)s->descr->elsize;
            const npy_intp K = %(_d)s->dimensions[1];

            const npy_int32 * __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
            const npy_int32 * __restrict__ indices = (npy_int32 *)%(_indices)s->data;

            // loop over rows
            for (npy_int32 i = 0; i < N; ++i)
            {
                // for each non-null value in the sparse row
                for (npy_int32 j_idx = indptr[i * Sindptr]; j_idx < indptr[(i+1) * Sindptr]; ++j_idx)
                {
                    // extract the column index of the non-null value
                    npy_int32 j = indices[j_idx * Sindices];

                    // extract the j-th row of the dense matrix
                    const npy_double * __restrict__ d_row = (npy_double *)(%(_d)s->data + %(_d)s->strides[0] * j);
                    if (j >= %(_d)s->dimensions[0]) {PyErr_SetString(PyExc_NotImplementedError, "G"); %(fail)s;}

                    // extract the corresponding row of the gradient
                    const npy_double * __restrict__ g_row = (npy_double *)(%(_g)s->data + %(_g)s->strides[0] * i);
                    double ip = 0.0;

                    // guard: the row index must not exceed the number of rows in g
                    // (the pointer arithmetic above does not dereference; the loop below does)
                    if (i >= %(_g)s->dimensions[0])
                    {PyErr_SetString(PyExc_NotImplementedError, "H"); %(fail)s;}

                    // dot product of the dense row with the gradient row
                    for (int k = 0; k < K; ++k)
                    {
                        ip += d_row[k * Sd1] * g_row[k * Sg1];
                    }

                    // write the resulting gradient to the sparse output
                    ((npy_double * __restrict__)(%(_zout)s->data + j_idx * %(_zout)s->strides[0]))[0] = ip;
                }
            }
        }
        """ % dict(locals(), **sub)
_sdgcsr = StructuredDotGradCSR()
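StructuredDotGradCSR.perform above does one numpy.dot per stored entry; the same values can be computed in a single vectorized expression, which is convenient for cross-checking the op (editor's sketch, illustrative only):

import numpy

def csr_grad_data(a_indices, a_indptr, b, g_ab):
    # row index of each stored value, expanded from indptr
    row = numpy.repeat(numpy.arange(len(a_indptr) - 1), numpy.diff(a_indptr))
    # one dot product per stored (row, column) pair
    return (g_ab[row] * b[a_indices]).sum(axis=1)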
def structured_dot_grad(sparse_A, dense_B, ga):
    #TODO: 1. move this switch to be a specialization of StructuredDotGrad
@@ -972,10 +1090,14 @@ def structured_dot_grad(sparse_A, dense_B, ga):
    if 0:
        return _structured_dot_grad(sparse_A, dense_B, ga)
    else:
-       if sparse_A.type.format == 'csc':
-           g_A_data = _sdgcsc(csm_indices(sparse_A),\
+       if sparse_A.type.format in ('csc', 'csr'):
+           sdgcsx = _sdgcsc if sparse_A.type.format == 'csc' else _sdgcsr
+           CSx = CSC if sparse_A.type.format == 'csc' else CSR
+           g_A_data = sdgcsx(csm_indices(sparse_A),\
                csm_indptr(sparse_A), dense_B, ga)
-           return CSC(g_A_data, csm_indices(sparse_A),\
+           return CSx(g_A_data, csm_indices(sparse_A),\
                csm_indptr(sparse_A),\
                csm_shape(sparse_A))
        else:
......
from theano.sparse import *
import random
import unittest
import theano
from theano import compile
from theano import gradient
from theano import gof
from theano.sparse.basic import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result
from theano.sparse.basic import _mtypes, _mtype_to_str

def eval_outputs(outputs):
    return compile.function([], outputs)()[0]
@@ -228,7 +230,7 @@ class test_true_dot(unittest.TestCase):
        x.data = x.data.T
        y.data = y.data.T
        # zop = true_dot(y, x)
-       zop = true_dot(y, x)
+       zop = transpose(true_dot(y, x))
        self.failUnless(_is_sparse_result(zop))
        z = eval_outputs([zop])
@@ -304,5 +306,59 @@ class test_true_dot(unittest.TestCase):
        self.failUnless(origloss > loss)

import scipy.sparse as sp

class test_structureddot(unittest.TestCase):
    def test_structureddot(self):
        #bsize = 5
        #spmat = sp.csc_matrix((8,15))
        #spmat[1,2] = 3
        #spmat[4,7] = 6
        #spmat[2,7] = 72
        #spmat[1,9] = 2
        #spmat[7,12] = 1
        #spmat[4,2] = 7
        bsize = 2
        spmat = sp.csc_matrix((5,5))
        spmat[1,2] = 1
        spmat[0,1] = 2
        spmat[0,2] = 3

        kerns = tensor.dvector()
        images = tensor.dmatrix()

        def buildgraphCSC(kerns, images):
            csc = CSC(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
            return structured_dot(csc, images.T)

        out = buildgraphCSC(kerns, images)
        for mode in 'FAST_COMPILE', 'FAST_RUN':
            f = theano.function([kerns, images], out, mode=mode)
            kernvals = spmat.data[:spmat.size]
            imvals = 1.0 * numpy.arange(bsize * spmat.shape[1]).reshape(bsize, spmat.shape[1])
            outvals = f(kernvals, imvals)
            assert numpy.all(outvals == spmat.dot(imvals.T).todense())
            tensor.verify_grad(None, buildgraphCSC, [kernvals, imvals], mode=mode)

        spmat = spmat.tocsr()

        def buildgraphCSR(kerns, images):
            csr = CSR(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
            return structured_dot(csr, images.T)

        out = buildgraphCSR(kerns, images)
        for mode in 'FAST_COMPILE', 'FAST_RUN':
            f = theano.function([kerns, images], out, mode=mode)
            kernvals = spmat.data[:spmat.size]
            imvals = 1.0 * numpy.arange(bsize * spmat.shape[1]).reshape(bsize, spmat.shape[1])
            outvals = f(kernvals, imvals)
            assert numpy.all(outvals == spmat.dot(imvals.T).todense())
            tensor.verify_grad(None, buildgraphCSR, [kernvals, imvals], mode=mode)

if __name__ == '__main__':
    unittest.main()
@@ -543,11 +543,11 @@ class GemmLocalOptimizer(LocalOptimizer):
# TODO: This could be an EquilibriumOptimizer, but I don't know how to combine an OpKeyOptimizer and
# an EquilibriumOptimizer.
compile.optdb.register('inplace_gemm_0', OpKeyOptimizer(GemmLocalOptimizer(),
-       failure_callback=GemmLocalOptimizer.failure_callback), 70.00, 'fast_run', 'inplace')
+       failure_callback=GemmLocalOptimizer.failure_callback), 70.00, 'fast_run', 'inplace', 'gemm')
compile.optdb.register('inplace_gemm_1', OpKeyOptimizer(GemmLocalOptimizer(),
-       failure_callback=GemmLocalOptimizer.failure_callback), 70.01, 'fast_run', 'inplace')
+       failure_callback=GemmLocalOptimizer.failure_callback), 70.01, 'fast_run', 'inplace', 'gemm')
compile.optdb.register('inplace_gemm_2', OpKeyOptimizer(GemmLocalOptimizer(),
-       failure_callback=GemmLocalOptimizer.failure_callback), 70.02, 'fast_run', 'inplace')
+       failure_callback=GemmLocalOptimizer.failure_callback), 70.02, 'fast_run', 'inplace', 'gemm')
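The new 'gemm' tag makes the three registrations addressable as a group, so a user can presumably compile with them disabled; a hedged sketch (assuming Theano's Mode.excluding API, which filters registered optimizations by tag):

import theano
import theano.tensor as T

x, y = T.dmatrix('x'), T.dmatrix('y')
mode = theano.compile.mode.get_default_mode().excluding('gemm')
f = theano.function([x, y], T.dot(x, y), mode=mode)   # skips the inplace_gemm_* rewrites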
class Dot22(GemmRelated):
    """Compute a matrix-matrix product.
......
@@ -588,6 +588,7 @@ def mul_calculate(num, denum, aslist = False):
    return v

local_mul_canonizer = Canonizer(T.mul, T.div, T.inv, mul_calculate, False)
+register_canonicalize(local_mul_canonizer, name = 'local_mul_canonizer')

@gof.local_optimizer([T.neg])
def local_neg_to_mul(node):
@@ -693,7 +694,6 @@ def local_mul_specialize(node):
        return False

register_specialize(local_mul_specialize)
-register_canonicalize(local_mul_canonizer, name = 'local_mul_canonizer')

# neg_to_mul = out2in(gof.LocalOptGroup(local_neg_to_mul))
......