Use the non-scan version (sparse_block_gemv_ss) in sparse_block_dot_SS by default, since it's faster.

6b162a92 · Arnaud Bergeron · 9432f511 · 6b162a92 · 6b162a92
--- a/theano/sandbox/cuda/blocksparse.py
+++ b/theano/sandbox/cuda/blocksparse.py
@@ -127,11 +127,20 @@ class SparseBlockGemvSS(GpuOp):
 sparse_block_gemv_ss = SparseBlockGemvSS(False)
+sparse_block_gemv_ss_outer = SparseBlockGemvSS(True)
 class SparseBlockOuterSS(GpuOp):
-    def __init__(self):
+    def __init__(self, inplace=False):
-        self.inplace = False
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+    def __eq__(self, other):
+        return type(self) == type(other) and self.inplace == other.inplace
+    def __hash__(self):
+        return hash(type(self)) ^ hash(self.inplace)
    def make_node(self, o, x, y, xIdx, yIdx):
        o = basic_ops.as_cuda_ndarray_variable(o)
@@ -175,15 +184,5 @@ def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
    returns (oBlocks, oSize), dot(W[i, j], h[i]) + b[j]
         but b[j] is only added once
    """
-    o = b.take(outputIdx, axis=0)
+    return sparse_block_gemv_ss(b.take(outputIdx, axis=0), W, h,
-    def outer_fn(out_id, W, h, b, iIdx):
+                                inputIdx, outputIdx)
-        def inner_fn(inp_id, h_i, out_id, W):
-            return tensor.dot(W[inp_id, out_id], h_i)
-        return theano.scan(inner_fn, sequences=[iIdx, h],
-                           outputs_info=None,
-                           non_sequences=[out_id, W],
-                           n_steps=iIdx.shape[0])[0].sum(axis=0) + b[out_id]
-    return theano.scan(outer_fn, sequences=[outputIdx],
-                       outputs_info=None,
-                       non_sequences=[W, h, b, inputIdx],
-                       n_steps=outputIdx.shape[0])[0]
--- a/theano/sandbox/cuda/tests/test_blocksparse.py
+++ b/theano/sandbox/cuda/tests/test_blocksparse.py
@@ -57,26 +57,7 @@ def test_blocksparse():
    utt.assert_allclose(ref_out, th_out)
-def test_blocksparse_op():
+def test_blocksparse_grad():
-    b = tensor.fmatrix()
-    W = tensor.ftensor4()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
-    o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
-    f = theano.function([W, h, iIdx, b, oIdx], o)
-    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
-    th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
-    ref_out = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
-    utt.assert_allclose(ref_out, th_out)
-def test_blocksparse_op_grad():
    h_val = randn(2, 3).astype('float32')
    iIdx_val = numpy.random.permutation(3)[:2]
    oIdx_val = numpy.random.permutation(3)[:2]
@@ -92,7 +73,7 @@ def test_blocksparse_op_grad():
    utt.verify_grad(f, [b_val, h_val, W_val])
-def test_blocksparse_op_grad_shape():
+def test_blocksparse_grad_shape():
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.fmatrix()