提交 565650e4 authored 作者: abergeron's avatar abergeron

Merge pull request #3311 from bouthilx/sparse_block_dot

Sparse block dot
.. _libdoc_blocksparse:
===========================================================================
:mod:`sandbox.blocksparse` -- Block sparse dot operations (gemv and outer)
===========================================================================
.. module:: sandbox.blocksparse
:platform: Unix, Windows
:synopsis: Block sparse dot
.. moduleauthor:: LISA
API
===
.. automodule:: theano.sandbox.blocksparse
:members:
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors:

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../images/blocksparse.png
        :scale: 50 %
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses and destroys the storage of the
        # first input `o` (declared through destroy_map).
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (batch, oWin, oSize) output vector
        W: (iBlocks, oBlocks, iSize, oSize), weight matrix
        h: (batch, iWin, iSize), input from lower layer (sparse)
        inputIdx: (batch, iWin), indexes of the input blocks
        outputIdx: (batch, oWin), indexes of the output blocks

        returns (batch, oWin, oSize), dot(W[i, j], h[i]) + o[j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed. Which blocks will be computed is specified in
          `outputIdx`.
        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            # BUGFIX: the message used to claim "2D" while the check
            # requires a 3D tensor.
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Output has the same dtype/variable class as `o` but with no
        # broadcastable dimensions.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        # Reference CPU implementation (the GPU version lives in
        # theano.sandbox.cuda.blocksparse).
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            o = o.copy()

        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    # Accumulate one small gemv per (input, output) block.
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW is a block outer product of h and the output gradient; dh is
        # another block gemv with W's block and inner dims transposed.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results:

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses and destroys the storage of the
        # first input `o`.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (xBlocks, yBlocks, xSize, ySize)
        x: (batch, xWin, xSize)
        y: (batch, yWin, ySize)
        xIdx: (batch, iWin), indexes of the x blocks
        yIdx: (batch, oWin), indexes of the y blocks

        returns (xBlocks, yBlocks, xSize, ySize), outer(x[i], y[j]) + o[i, j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)

        if alpha is None:
            # Default scale factor: 1.0.
            alpha = one

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # BUGFIX: apply the documented `alpha` scaling (it was
                    # silently ignored before).  `alpha` defaults to 1.0,
                    # so existing callers are unaffected.
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Module-level singleton instances.  Reusing these (rather than
# constructing new ops at each call site) lets graph optimizations match
# nodes by op identity.
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    Parameters
    ----------
    var: shape, comment
    W: (iBlocks, oBlocks, iSize, oSize), weight matrix
    h: (batch, iWin, iSize), input from lower layer (sparse)
    inputIdx: (batch, iWin), indexes of the input blocks
    b: (oBlocks, oSize), bias vector
    outputIdx: (batch, oWin), indexes of the output blocks

    returns (batch, oWin, oSize), dot(W[i, j], h[i]) + b[j]
        but b[j] is only added once

    Notation
    --------
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Promote unbatched inputs to a batch of size 1.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # CONSISTENCY FIX: use the module-level singleton instead of building
    # a fresh SparseBlockGemv() per call, so that identical nodes can be
    # recognized by op identity during optimization.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
import logging
import numpy
import theano
from theano import Apply, tensor, scalar
from theano import Apply, tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp, GpuElemwise
from theano.sandbox.cuda import cuda_available, GpuOp
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')
if cuda_available:
from theano.sandbox.cuda import (basic_ops,
opt, GpuFromHost,
HostFromGpu, host_from_gpu,
GpuDimShuffle)
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
from theano.sandbox.cuda import basic_ops
class SparseBlockGemvSS(GpuOp):
class GpuSparseBlockGemv(GpuOp):
"""
This op computes the dot product of specified pieces of vectors
and matrices, returning pieces of vectors.
It computes something like this for each j:
o[j] = sum_over_i(dot(W[i, j], h[i])) + o[j]
The i and j are taken from the inputIdx and outputIdx lists
respectively.
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
This should not be directly called since the interface is subject
to change without notice. Use the sparse_block_dot_SS() function
for a stable interface.
to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
function for a stable interface.
"""
def __init__(self, inplace=False):
......@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockGemv%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, W, h, inputIdx, outputIdx):
o = basic_ops.as_cuda_ndarray_variable(o)
......@@ -340,12 +331,12 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
o, W, h, inputIdx, outputIdx = inputs
go = grads[0]
Wgrad = sparse_block_outer_ss(W.zeros_like(),
h, go, inputIdx, outputIdx)
hgrad = sparse_block_gemv_ss(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)),
go,
outputIdx, inputIdx)
Wgrad = gpu_sparse_block_outer(W.zeros_like(),
h, go, inputIdx, outputIdx)
hgrad = gpu_sparse_block_gemv(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)),
go,
outputIdx, inputIdx)
return [go, Wgrad, hgrad,
grad_undefined(self, 3, inputIdx,
"grad of inputIdx makes no sense"),
......@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
"grad of outputIdx makes no sense")]
sparse_block_gemv_ss = SparseBlockGemvSS(False)
sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class SparseBlockOuterSS(GpuOp):
class GpuSparseBlockOuter(GpuOp):
"""
This computes the outer product of two sets of pieces of vectors
updating a full matrix with the results.
It computes something like this:
o[i, j] = (alpha * outer(x[i], y[j])) + o[i, j]
The i and j are taken from the xIdx and yIdx lists respectively.
CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
This op should not be called directly since its interface is
subject to change without notice. It is involved in the gradient
of SparseBlockGemvSS.
subject to change without notice. It is involved in the gradient
of GpuSparseBlockGemv. The gradient is not implemented.
"""
def __init__(self, inplace=False):
......@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockOuter%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list,
%(name)s_y_list,
%(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
......@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
return (11,)
sparse_block_outer_ss = SparseBlockOuterSS(False)
sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)
if cuda_available:
    # Replace the non-inplace GPU block-sparse ops with their inplace
    # versions.  Registered only when CUDA is available, since the ops
    # themselves are GPU-only.
    @opt.register_opt()
    @opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
    def local_inplace_blocksparse_gemv(node):
        # Singleton comparison: matches only the non-inplace instance.
        if node.op == sparse_block_gemv_ss:
            return [sparse_block_gemv_ss_inplace(*node.inputs)]

    @opt.register_opt()
    @opt.local_optimizer([sparse_block_outer_ss], inplace=True)
    def local_inplace_blocksparse_outer(node):
        if node.op == sparse_block_outer_ss:
            return [sparse_block_outer_ss_inplace(*node.inputs)]
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @opt.register_opt()
# @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
# """
# return [sparse_block_outer_ss(*inputs)]
# @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices.

    Parameters
    ----------
    W : (iBlocks, oBlocks, iSize, oSize)
        Weight matrix.
    h : (batch, iWin, iSize)
        Input from lower layer (sparse).
    inputIdx : (batch, iWin)
        Indexes of the input blocks.
    b : (oBlocks, oSize)
        Bias vector.
    outputIdx : (batch, oWin)
        Indexes of the output blocks.

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j], but b[j] is only added once.

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Unbatched case: add a broadcast batch dimension of size 1.
        h, inputIdx, outputIdx = (h.dimshuffle('x', 0, 1),
                                  inputIdx.dimshuffle('x', 0),
                                  outputIdx.dimshuffle('x', 0))
    # The bias is gathered per output block and used as the accumulator.
    return sparse_block_gemv_ss(b.take(outputIdx, axis=0),
                                W, h, inputIdx, outputIdx)
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
......@@ -220,7 +220,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
# return ()
return (4,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
gpu_crossentropy_softmax_argmax_1hot_with_bias = \
GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
......@@ -391,7 +392,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
}
""" % locals()
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
gpu_crossentropy_softmax_1hot_with_bias_dx = \
GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuOp):
......
差异被折叠。
import numpy
from numpy.random import randn
from unittest import TestCase
from nose.plugins.skip import SkipTest
import theano
from theano import tensor
import theano.tests.unittest_tools as utt
import theano.sandbox.tests.test_blocksparse
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
as_cuda_ndarray_variable)
from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
sparse_block_gemv_ss,
sparse_block_outer_ss,
sparse_block_outer_ss_inplace,
SparseBlockOuterSS)
from theano.sandbox.cuda.blocksparse import (GpuSparseBlockOuter,
gpu_sparse_block_gemv,
gpu_sparse_block_outer)
from theano.sandbox.cuda.var import float32_shared_constructor
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -29,187 +21,56 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def setup():
    # Seed the unittest RNG so the nose-run tests are reproducible.
    utt.seed_rng()
def blocksparse_data():
nInputBlock = 128
nOutputBlock = 64
inputSize = 40
outputSize = 30
inputWindowSize = 7
outputWindowSize = 9
batchSize = 2
input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
permutation = numpy.random.permutation
inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
for _ in range(batchSize))
outputIndice = numpy.vstack(permutation(nOutputBlock)[:outputWindowSize]
for _ in range(batchSize))
weight = randn(nInputBlock, nOutputBlock,
inputSize, outputSize).astype('float32')
bias = randn(nOutputBlock, outputSize).astype('float32')
return weight, input, inputIndice, bias, outputIndice
def blocksparse(W, h, iIdx, b, oIdx):
    """Pure-numpy reference implementation of the block-sparse dot.

    o[n, j] = b[oIdx[n, j]] + sum_i dot(h[n, i], W[iIdx[n, i], oIdx[n, j]])

    Shapes follow SparseBlockGemv.make_node: W (iBlocks, oBlocks, iSize,
    oSize), h (batch, iWin, iSize), iIdx (batch, iWin), b (oBlocks, oSize),
    oIdx (batch, oWin); returns (batch, oWin, oSize).
    """
    o = b.take(oIdx, axis=0)
    # FIX: use a distinct batch-index name; the original reused `b`,
    # shadowing the bias argument inside the loop.
    for n in range(o.shape[0]):
        for j in range(o.shape[1]):
            outputIdx = oIdx[n, j]
            for i in range(h.shape[1]):
                inputIdx = iIdx[n, i]
                w = W[inputIdx, outputIdx]
                # This inner update is a gemv per block pair.
                o[n, j, :] += numpy.dot(h[n, i], w)
    return o
def test_blocksparse():
    """GPU sparse_block_dot_SS must agree with the numpy reference."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
    f = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)

    data = blocksparse_data()
    got = f(*data)
    expected = blocksparse(*data)
    utt.assert_allclose(expected, got)
test_blocksparse.setup = setup
# Test the fortran order for W (which can happen in the grad for some graphs).
def test_blocksparseF():
    """A fortran-ordered (axis-swapped) W must give the same result."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    # Transpose the two inner axes on the GPU so the kernel sees a
    # non-C-contiguous weight layout.
    W_t = GpuDimShuffle((False,) * 4,
                        (0, 1, 3, 2))(as_cuda_ndarray_variable(W))
    out = sparse_block_dot_SS(W_t, h, iIdx, b, oIdx)
    f = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    got = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
    expected = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
    utt.assert_allclose(expected, got)
def test_blocksparse_grad():
    """Numerical gradient check of the gemv op on small random data."""
    h_val = randn(1, 2, 3).astype('float32')
    W_val = randn(3, 3, 3, 4).astype('float32')
    b_val = randn(3, 4).astype('float32')
    iIdx = theano.tensor.constant(
        numpy.random.permutation(3)[:2][None, :])
    oIdx = theano.tensor.constant(
        numpy.random.permutation(3)[:2][None, :])

    utt.verify_grad(
        lambda b, h, W: sparse_block_gemv_ss(b.take(oIdx, axis=0),
                                             W, h, iIdx, oIdx),
        [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_1():
    """Gradient check for the degenerate case where every dim is 1."""
    h_val = randn(1, 1, 1).astype('float32')
    W_val = randn(1, 1, 1, 1).astype('float32')
    b_val = randn(1, 1).astype('float32')
    iIdx = theano.tensor.constant(
        numpy.random.permutation(1)[:1][None, :])
    oIdx = theano.tensor.constant(
        numpy.random.permutation(1)[:1][None, :])

    utt.verify_grad(
        lambda b, h, W: sparse_block_gemv_ss(b.take(oIdx, axis=0),
                                             W, h, iIdx, oIdx),
        [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_shape():
    """Each gradient must come back with the shape of its input."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    grads = theano.grad(out.sum(), [b, W, h])
    f = theano.function([W, h, iIdx, b, oIdx], grads, mode=mode_with_gpu)

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()

    # Just make sure that it runs correctly and all the shapes are ok.
    b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
    for grad_val, in_val in ((b_g, b_val), (W_g, W_val), (h_g, h_val)):
        assert grad_val.shape == in_val.shape
class BlockSparse_Gemv_and_Outer(
        theano.sandbox.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
    # GPU variant of the generic block-sparse test suite: rerun the base
    # tests with the GPU ops substituted in.
    def setUp(self):
        utt.seed_rng()
        # constant_folding is excluded, presumably so the ops stay visible
        # in the compiled graph for the assertions -- TODO confirm.
        self.mode = mode_with_gpu.excluding('constant_folding')
        self.gemv_op = gpu_sparse_block_gemv
        self.outer_op = gpu_sparse_block_outer
# This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge():
b = tensor.fmatrix()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
# This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge(self):
b = tensor.fmatrix()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
W = float32_shared_constructor(W_val)
W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
W = float32_shared_constructor(W_val)
o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
gW = theano.grad(o.sum(), W)
o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
gW = theano.grad(o.sum(), W)
lr = numpy.asarray(0.05, dtype='float32')
lr = numpy.asarray(0.05, dtype='float32')
upd = W - lr * gW
upd = W - lr * gW
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu)
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu)
# Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op, SparseBlockOuterSS)
# Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
GpuSparseBlockOuter)
# Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output')
# Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output')
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
# Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
SparseBlockOuterSS)
# Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
GpuSparseBlockOuter)
f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value()
f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value()
# reset the var
W.set_value(W_val)
f1(h_val, iIdx_val, b_val, oIdx_val)
W_opt = W.get_value()
# reset the var
W.set_value(W_val)
f1(h_val, iIdx_val, b_val, oIdx_val)
W_opt = W.get_value()
utt.assert_allclose(W_ref, W_opt)
utt.assert_allclose(W_ref, W_opt)
......@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
......@@ -740,6 +743,36 @@ def test_local_gpu_dot_to_dot22dot():
cmp((3, 4), (4,))
def test_blocksparse_gpu_gemv_opt():
    """sparse_block_dot must be lifted to GpuSparseBlockGemv on GPU."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    f = theano.function([W, h, iIdx, b, oIdx],
                        sparse_block_dot(W, h, iIdx, b, oIdx),
                        mode=mode_with_gpu)

    # The gemv sits just before the final host transfer in the toposort.
    assert isinstance(f.maker.fgraph.toposort()[-2].op, GpuSparseBlockGemv)
def test_blocksparse_gpu_outer_opt():
    """The W gradient of sparse_block_dot must use GpuSparseBlockOuter."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    f = theano.function([W, h, iIdx, b, oIdx],
                        [out, tensor.grad(out.sum(), wrt=W)],
                        mode=mode_with_gpu)

    # The outer op computing the W gradient sits just before the final
    # host transfer in the toposorted graph.
    assert isinstance(f.maker.fgraph.toposort()[-2].op, GpuSparseBlockOuter)
class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
    # Rerun the nlinalg diag tests on the GPU, using cuda shared
    # variables instead of the default constructor.
    mode = mode_with_gpu
    shared = staticmethod(cuda.shared_constructor)
......@@ -756,4 +789,3 @@ if __name__ == '__main__':
test_opt_gpujoin_onlyajoin()
test_opt_gpujoin_joinvectors_elemwise_then_minusone()
test_opt_gpujoin_joinvectors_negativeaxes()
"""
Optimizations addressing the ops in sandbox root directory
"""
from theano import compile # to register the optimizer built by this file
from theano import gof
from theano.sandbox.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv_inplace,
sparse_block_outer_inplace)
@gof.local_optimizer([SparseBlockGemv], inplace=True)
def local_inplace_sparse_block_gemv(node):
    """
    SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
    """
    op = node.op
    if isinstance(op, SparseBlockGemv) and not op.inplace:
        return [sparse_block_gemv_inplace(*node.inputs)]
    return False


# Priority 60 places this after the main optimizations, with the other
# inplace substitutions.
compile.optdb.register(
    'local_inplace_sparse_block_gemv',
    gof.TopoOptimizer(local_inplace_sparse_block_gemv,
                      failure_callback=gof.TopoOptimizer.warn_inplace),
    60, 'fast_run', 'inplace')
@gof.local_optimizer([SparseBlockOuter], inplace=True)
def local_inplace_sparse_block_outer(node):
    """
    SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
    """
    op = node.op
    if isinstance(op, SparseBlockOuter) and not op.inplace:
        return [sparse_block_outer_inplace(*node.inputs)]
    return False


# Same priority/tags as the gemv inplace substitution above.
compile.optdb.register(
    'local_inplace_sparse_block_outer',
    gof.TopoOptimizer(local_inplace_sparse_block_outer,
                      failure_callback=gof.TopoOptimizer.warn_inplace),
    60, 'fast_run', 'inplace')
差异被折叠。
......@@ -4,7 +4,7 @@ import numpy
import theano
from theano import config, function, tensor
from . import multinomial
from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode, predefined_linkers
import theano.sandbox.cuda as cuda
......
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot
def test_blocksparse_inplace_gemv_opt():
    """The gemv op is replaced by its inplace version except under
    FAST_COMPILE (which skips the inplace optimizations)."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    f = theano.function([W, h, iIdx, b, oIdx],
                        sparse_block_dot(W, h, iIdx, b, oIdx))

    last_op = f.maker.fgraph.toposort()[-1].op
    if theano.config.mode == "FAST_COMPILE":
        assert not last_op.inplace
    else:
        assert last_op.inplace
def test_blocksparse_inplace_outer_opt():
    """The outer op produced by the W gradient must be made inplace
    under the default mode (but not under FAST_COMPILE)."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    # FIX: removed a leftover theano.printing.debugprint() call that
    # spammed the test output and served no assertion.
    f = theano.function([W, h, iIdx, b, oIdx],
                        [o, tensor.grad(o.sum(), wrt=W)])

    if theano.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
import theano
import numpy
from . import scan
from theano.sandbox import scan
def test_001():
......
from __future__ import print_function
from .theano_object import *
from theano.sandbox.theano_object import *
RUN_TESTS = False
......
......@@ -98,17 +98,19 @@ whitelist_flake8 = [
"tensor/nnet/tests/test_sigm.py",
"scalar/__init__.py",
"scalar/tests/test_basic.py",
"sandbox/test_theano_object.py",
"sandbox/test_scan.py",
"sandbox/__init__.py",
"sandbox/rng_mrg.py",
"sandbox/theano_object.py",
"sandbox/scan.py",
"sandbox/test_multinomial.py",
"sandbox/test_rng_mrg.py",
"sandbox/test_neighbourhoods.py",
"sandbox/symbolic_module.py",
"sandbox/conv.py",
"sandbox/debug.py",
"sandbox/tests/test_theano_object.py",
"sandbox/tests/test_scan.py",
"sandbox/tests/test_rng_mrg.py",
"sandbox/tests/test_neighbourhoods.py",
"sandbox/tests/test_multinomial.py",
"sandbox/tests/__init__.py",
"sandbox/cuda/dnn.py",
"sandbox/cuda/var.py",
"sandbox/cuda/GpuConvGrad3D.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论