Merge pull request #1929 from abergeron/sparse_blockdot

Add a blocksparse multiplication implementation

Merge pull request #1929 from abergeron/sparse_blockdot
22c4a29b · Pascal Lamblin · 081f64c8 · 94a43515 · 22c4a29b · 22c4a29b
--- a/doc/extending/unittest.txt
+++ b/doc/extending/unittest.txt
@@ -350,9 +350,9 @@ The behaviour of seed_rng is as follows:

 * If an explicit seed is given, it will be used for seeding numpy's rng.

-* If not, it will use ``config.unittest.rseed`` (its default value is 666).
+* If not, it will use ``config.unittests.rseed`` (its default value is 666).

-* If config.unittest.rseed is set to "random", it will seed the rng with
+* If config.unittests.rseed is set to "random", it will seed the rng with
  None, which is equivalent to seeding with a random seed.


@@ -364,7 +364,7 @@ a higher confidence that the variables are correct), while still
 making sure unittests are deterministic.

 Users who prefer their unittests to be random (when run on their local
-machine) can simply set ``config.unittest.rseed`` to 'random' (see
+machine) can simply set ``config.unittests.rseed`` to 'random' (see
 :mod:`config`).

 Similarly, to provide a seed to numpy.random.RandomState, simply use:

--- a/theano/misc/pycuda_init.py
+++ b/theano/misc/pycuda_init.py
@@ -49,6 +49,8 @@ else:
    if pycuda_available:
        if hasattr(pycuda.driver.Context, "attach"):
            pycuda.driver.Context.attach()
+            import atexit
+            atexit.register(pycuda.driver.Context.pop)
        else:
            # Now we always import this file when we call
            # theano.sandbox.cuda.use. So this should not happen

--- a/theano/sandbox/cuda/blocksparse.py
+++ b/theano/sandbox/cuda/blocksparse.py
--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -303,7 +303,7 @@ class NVCC_compiler(object):
        preargs2 = [pa for pa in preargs
                    if pa not in preargs1]  # other arguments

-        cmd = [nvcc_path, '-shared', '-g'] + preargs1
+        cmd = [nvcc_path, '-shared', '-g', '-G'] + preargs1
        if config.nvcc.compiler_bindir:
            cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])


--- a/theano/sandbox/cuda/tests/test_blocksparse.py
+++ b/theano/sandbox/cuda/tests/test_blocksparse.py
+import numpy
+from numpy.random import randn
+
+from unittest import TestCase
+
+from nose.plugins.skip import SkipTest
+
+import theano
+from theano import tensor
+import theano.tests.unittest_tools as utt
+
+import theano.sandbox.cuda as cuda_ndarray
+if cuda_ndarray.cuda_available == False:
+    raise SkipTest('Optional package cuda disabled')
+
+from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
+                                           as_cuda_ndarray_variable)
+from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
+                                             sparse_block_gemv_ss,
+                                             sparse_block_outer_ss,
+                                             sparse_block_outer_ss_inplace)
+from theano.sandbox.cuda.var import float32_shared_constructor
+
+
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
+
+
+def setup():
+    utt.seed_rng()
+
+def blocksparse_data():
+    nInputBlock = 128
+    nOutputBlock = 64
+    inputSize = 40
+    outputSize = 30
+    inputWindowSize = 7
+    outputWindowSize = 9
+    batchSize = 4
+
+    input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
+    inputIndice = numpy.vstack(numpy.random.permutation(nInputBlock)[:inputWindowSize] for _ in range(batchSize))
+    outputIndice = numpy.vstack(numpy.random.permutation(nOutputBlock)[:outputWindowSize] for _ in range(batchSize))
+    weight = randn(nInputBlock, nOutputBlock, inputSize, outputSize).astype('float32')
+    bias = randn(nOutputBlock, outputSize).astype('float32')
+
+    return weight, input, inputIndice, bias, outputIndice
+
+def blocksparse(W, h, iIdx, b, oIdx):
+    o = b.take(oIdx, axis=0)
+
+    for b in range(o.shape[0]):
+        for j in range(o.shape[1]):
+            outputIdx = oIdx[b, j]
+
+            for i in range(h.shape[1]):
+                inputIdx = iIdx[b, i]
+                w = W[inputIdx, outputIdx]
+                # this below is a gemv I think
+                o[b, j, :] += numpy.dot(h[b, i], w)
+    return o
+
+
+def test_blocksparse():
+    b = tensor.fmatrix()
+    W = tensor.ftensor4()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()
+
+    o = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
+
+    f = theano.function([W, h, iIdx, b, oIdx], o)
+
+    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
+
+    th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
+    ref_out = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
+
+    utt.assert_allclose(ref_out, th_out)
+
+test_blocksparse.setup = setup
+
+
+# test the fortan order for W (which can happen in the grad for some graphs).
+def test_blocksparseF():
+    b = tensor.fmatrix()
+    W = tensor.ftensor4()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()
+
+    o = sparse_block_dot_SS(GpuDimShuffle((False, False, False, False),
+                                          (0, 1, 3, 2))(
+            as_cuda_ndarray_variable(W)),
+                            h, iIdx, b, oIdx)
+
+    f = theano.function([W, h, iIdx, b, oIdx], o)
+
+    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
+
+    th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
+    ref_out = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
+
+    utt.assert_allclose(ref_out, th_out)
+
+
+def test_blocksparse_grad():
+    h_val = randn(1, 2, 3).astype('float32')
+    iIdx_val = numpy.random.permutation(3)[:2][None, :]
+    oIdx_val = numpy.random.permutation(3)[:2][None, :]
+    W_val = randn(3, 3, 3, 4).astype('float32')
+    b_val = randn(3, 4).astype('float32')
+
+    iIdx = theano.tensor.constant(iIdx_val)
+    oIdx = theano.tensor.constant(oIdx_val)
+
+    def f(b, h, W):
+        return sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
+
+    utt.verify_grad(f, [b_val, h_val, W_val])
+
+def test_blocksparse_grad_1():
+    # This tests that we correctly handle cases where dimensions are 1.
+    h_val = randn(1, 1, 1).astype('float32')
+    iIdx_val = numpy.random.permutation(1)[:1][None, :]
+    oIdx_val = numpy.random.permutation(1)[:1][None, :]
+    W_val = randn(1, 1, 1, 1).astype('float32')
+    b_val = randn(1, 1).astype('float32')
+
+    iIdx = theano.tensor.constant(iIdx_val)
+    oIdx = theano.tensor.constant(oIdx_val)
+
+    def f(b, h, W):
+        return sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
+
+    utt.verify_grad(f, [b_val, h_val, W_val])
+
+
+def test_blocksparse_grad_shape():
+    b = tensor.fmatrix()
+    W = tensor.ftensor4()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()
+
+    o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
+    go = theano.grad(o.sum(), [b, W, h])
+
+    f = theano.function([W, h, iIdx, b, oIdx], go, mode=mode_with_gpu)
+
+    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
+
+    # just make sure that it runs correcly and all the shapes are ok.
+    b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
+
+    assert b_g.shape == b_val.shape
+    assert h_g.shape == h_val.shape
+    assert W_g.shape == W_val.shape
+
+
+def test_blocksparse_grad_merge():
+    b = tensor.fmatrix()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()
+
+    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
+    W = float32_shared_constructor(W_val)
+
+    o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
+    gW = theano.grad(o.sum(), W)
+
+    lr = numpy.asarray(0.05, dtype='float32')
+
+    upd = W - lr * gW
+
+    f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
+                         mode=mode_with_gpu)
+    # not running with mode=gpu ensures that the elemwise is not merged in
+    f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)])
+
+    f2(h_val, iIdx_val, b_val, oIdx_val)
+    W_ref = W.get_value()
+
+    # reset the var
+    W.set_value(W_val)
+    f1(h_val, iIdx_val, b_val, oIdx_val)
+    W_opt = W.get_value()
+
+    utt.assert_allclose(W_ref, W_opt)