Commit 565650e4, authored by abergeron

Merge pull request #3311 from bouthilx/sparse_block_dot

Sparse block dot
.. _libdoc_blocksparse:
===========================================================================
:mod:`sandbox.blocksparse` -- Block sparse dot operations (gemv and outer)
===========================================================================
.. module:: sandbox.blocksparse
:platform: Unix, Windows
:synopsis: Block sparse dot
.. moduleauthor:: LISA
API
===
.. automodule:: theano.sandbox.blocksparse
:members:
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors:

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../images/blocksparse.png
        :scale: 50 %
    """
    # Optimizations registered against this op (filled in elsewhere).
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses the storage of the first
        # input (o); destroy_map declares this to the graph machinery.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (batch, oWin, oSize) output vector
        W: (iBlocks, oBlocks, iSize, oSize), weight matrix
        h: (batch, iWin, iSize), input from lower layer (sparse)
        inputIdx: (batch, iWin), indexes of the input blocks
        outputIdx: (batch, oWin), indexes of the output blocks

        returns (batch, oWin, oSize), dot(W[i, j], h[i]) + o[j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed. Which blocks will be computed is specified in
          `outputIdx`.
        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            # Fixed: the message used to say "2D" while the check is ndim != 3.
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        # Index tensors must hold integers to be usable as block indices.
        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Output has the same dtype/ndim as o, with no broadcastable dims.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        """Reference (CPU) implementation: accumulate block dot products."""
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            # Preserve the caller's o; accumulate into a private copy.
            o = o.copy()

        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    # This is a gemv I think
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        """Gradient w.r.t. o, W and h; index inputs have no gradient."""
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW = outer(h, go) scattered into the selected blocks.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        # dh = gemv with W transposed block-wise (swap block axes and
        # the per-block matrix axes) and the index roles exchanged.
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results:

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """
    # Optimizations registered against this op (filled in elsewhere).
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses the storage of the first
        # input (o); destroy_map declares this to the graph machinery.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (xBlocks, yBlocks, xSize, ySize)
        x: (batch, xWin, xSize)
        y: (batch, yWin, ySize)
        xIdx: (batch, iWin), indexes of the x blocks
        yIdx: (batch, oWin), indexes of the y blocks

        returns (xBlocks, yBlocks, xSize, ySize), outer(x[i], y[j]) + o[i, j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        # Convert indices too, for consistency with SparseBlockGemv.
        xIdx = theano.tensor.as_tensor_variable(xIdx)
        yIdx = theano.tensor.as_tensor_variable(yIdx)

        if alpha is None:
            # Default scaling factor: 1.0 (no scaling).
            alpha = one

        # Output has the same dtype/ndim as o, with no broadcastable dims.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        """Reference (CPU) implementation: scatter scaled outer products."""
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            # Preserve the caller's o; accumulate into a private copy.
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Apply alpha, as documented in the class docstring and
                    # done by the GPU implementation (was previously ignored
                    # here).
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Canonical op instances; prefer these over constructing new ops so that
# identical applications can be merged by the graph optimizer.
sparse_block_gemv = SparseBlockGemv(False)
# Inplace variant: overwrites its first input (introduced by optimizations).
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
# Inplace variant: overwrites its first input (introduced by optimizations).
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    Parameters
    ----------
    var: shape, comment
    W: (iBlocks, oBlocks, iSize, oSize), weight matrix
    h: (batch, iWin, iSize), input from lower layer (sparse)
    inputIdx: (batch, iWin), indexes of the input blocks
    b: (oBlocks, oSize), bias vector
    outputIdx: (batch, oWin), indexes of the output blocks

    returns (batch, oWin, oSize), dot(W[i, j], h[i]) + b[j]
        but b[j] is only added once

    Notation
    --------
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Unbatched inputs: add a broadcastable batch dimension of size 1.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # Seed the accumulator with the bias gathered for the selected output
    # blocks, so b[j] is added exactly once. Reuse the canonical op instance
    # (instead of SparseBlockGemv()) so identical applications can be merged
    # by the graph optimizer.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
import logging
import numpy import numpy
import theano from theano import Apply, tensor
from theano import Apply, tensor, scalar
from theano.tensor import discrete_dtypes from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp, GpuElemwise from theano.sandbox.cuda import cuda_available, GpuOp
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')
if cuda_available: if cuda_available:
from theano.sandbox.cuda import (basic_ops, from theano.sandbox.cuda import basic_ops
opt, GpuFromHost,
HostFromGpu, host_from_gpu,
GpuDimShuffle)
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
class SparseBlockGemvSS(GpuOp): class GpuSparseBlockGemv(GpuOp):
""" """
This op computes the dot product of specified pieces of vectors GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
and matrices, returning pieces of vectors. information.
It computes something like this for each j:
o[j] = sum_over_i(dot(W[i, j], h[i])) + o[j]
The i and j are taken from the inputIdx and outputIdx lists
respectively.
This should not be directly called since the interface is subject This should not be directly called since the interface is subject
to change without notice. Use the sparse_block_dot_SS() function to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
for a stable interface. function for a stable interface.
""" """
def __init__(self, inplace=False): def __init__(self, inplace=False):
...@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp): ...@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
def __str__(self): def __str__(self):
return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "") return "GpuSparseBlockGemv%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, W, h, inputIdx, outputIdx): def make_node(self, o, W, h, inputIdx, outputIdx):
o = basic_ops.as_cuda_ndarray_variable(o) o = basic_ops.as_cuda_ndarray_variable(o)
...@@ -340,12 +331,12 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -340,12 +331,12 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
o, W, h, inputIdx, outputIdx = inputs o, W, h, inputIdx, outputIdx = inputs
go = grads[0] go = grads[0]
Wgrad = sparse_block_outer_ss(W.zeros_like(), Wgrad = gpu_sparse_block_outer(W.zeros_like(),
h, go, inputIdx, outputIdx) h, go, inputIdx, outputIdx)
hgrad = sparse_block_gemv_ss(h.zeros_like(), hgrad = gpu_sparse_block_gemv(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)), W.dimshuffle((1, 0, 3, 2)),
go, go,
outputIdx, inputIdx) outputIdx, inputIdx)
return [go, Wgrad, hgrad, return [go, Wgrad, hgrad,
grad_undefined(self, 3, inputIdx, grad_undefined(self, 3, inputIdx,
"grad of inputIdx makes no sense"), "grad of inputIdx makes no sense"),
...@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
"grad of outputIdx makes no sense")] "grad of outputIdx makes no sense")]
sparse_block_gemv_ss = SparseBlockGemvSS(False) gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True) gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class SparseBlockOuterSS(GpuOp): class GpuSparseBlockOuter(GpuOp):
""" """
This computes the outer product of two sets of pieces of vectors CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
updating a full matrix with the results. information.
It computes something like this:
o[i, j] = (alpha * outer(x[i], y[j])) + o[i, j]
The i and j are taken from the xIdx and yIdx lists respectively.
This op should not be called directly since its interface is This op should not be called directly since its interface is
subject to change without notice. It is involved in the gradient subject to change without notice. It is involved in the gradient
of SparseBlockGemvSS. of GpuSparseBlockGemv. The gradient is not implemented.
""" """
def __init__(self, inplace=False): def __init__(self, inplace=False):
...@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp): ...@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
def __str__(self): def __str__(self):
return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "") return "GpuSparseBlockOuter%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, x, y, xIdx, yIdx, alpha=None): def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
one = tensor.constant(numpy.asarray(1.0, dtype='float32')) one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1], ...@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list, %(name)s_x_list,
%(name)s_y_list, %(name)s_y_list,
%(name)s_out_list, %(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s), CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1), %(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
...@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
return (11,) return (11,)
sparse_block_outer_ss = SparseBlockOuterSS(False) gpu_sparse_block_outer = GpuSparseBlockOuter(False)
sparse_block_outer_ss_inplace = SparseBlockOuterSS(True) gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
if cuda_available:
@opt.register_opt()
@opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
def local_inplace_blocksparse_gemv(node):
if node.op == sparse_block_gemv_ss:
return [sparse_block_gemv_ss_inplace(*node.inputs)]
@opt.register_opt()
@opt.local_optimizer([sparse_block_outer_ss], inplace=True)
def local_inplace_blocksparse_outer(node):
if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)]
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @opt.register_opt()
# @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
# """
# return [sparse_block_outer_ss(*inputs)]
# @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
"""
Compute the dot product (plus bias) of the specified pieces of vectors
and matrices.
Parameters
----------
W : (iBlocks, oBlocks, iSize, oSize)
Weight matrix.
h : (batch, iWin, iSize)
Input from lower layer (sparse).
inputIdx : (batch, iWin)
Indexes of the input blocks.
b : (oBlocks, oSize)
Bias vector.
outputIdx : (batch, oWin)
Indexes of the output blocks.
Returns
-------
(batch, oWin, oSize)
dot(W[i, j], h[i]) + b[j], but b[j] is only added once.
Notes
-----
- `batch` is the number of examples in a minibatch (batch size).
- `iBlocks` is the total number of blocks in the input (from lower layer).
- `iSize` is the size of each of these input blocks.
- `iWin` is the number of blocks that will be used as inputs. Which blocks
will be used is specified in `inputIdx`.
- `oBlocks` is the number or possible output blocks.
- `oSize` is the size of each of these output blocks.
- `oWin` is the number of output blocks that will actually be computed.
Which blocks will be computed is specified in `outputIdx`.
"""
assert inputIdx.ndim == h.ndim - 1
assert outputIdx.ndim == inputIdx.ndim
if h.ndim == 2:
h = h.dimshuffle('x', 0, 1)
inputIdx = inputIdx.dimshuffle('x', 0)
outputIdx = outputIdx.dimshuffle('x', 0)
return sparse_block_gemv_ss(b.take(outputIdx, axis=0), W, h,
inputIdx, outputIdx)
...@@ -220,7 +220,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp): ...@@ -220,7 +220,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
# return () # return ()
return (4,) return (4,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = \
GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp): class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
...@@ -391,7 +392,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp): ...@@ -391,7 +392,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
} }
""" % locals() """ % locals()
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = \
GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuOp): class GpuSoftmax(GpuOp):
......
...@@ -18,7 +18,7 @@ import theano.ifelse ...@@ -18,7 +18,7 @@ import theano.ifelse
from six.moves import reduce, xrange from six.moves import reduce, xrange
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, toolbox) Optimizer, TopoOptimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import LocalMetaOptimizer
from theano.sandbox.cuda import as_cuda_ndarray_variable from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
...@@ -31,10 +31,10 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -31,10 +31,10 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve from theano.sandbox.cuda.cula import gpu_solve
...@@ -42,13 +42,22 @@ from theano.sandbox.cuda.cula import gpu_solve ...@@ -42,13 +42,22 @@ from theano.sandbox.cuda.cula import gpu_solve
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad) GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad,
GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.sandbox.cuda.blocksparse import (
GpuSparseBlockGemv,
GpuSparseBlockOuter,
gpu_sparse_block_gemv_inplace,
gpu_sparse_block_outer_inplace)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias) GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv from theano.scalar.basic_scipy import Erfinv
...@@ -77,10 +86,11 @@ except ImportError: ...@@ -77,10 +86,11 @@ except ImportError:
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1, gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
'fast_run', 'fast_compile', 'inplace', 'gpu') 'fast_run', 'fast_compile', 'inplace', 'gpu')
gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
'fast_run', 'fast_compile', 'gpu') 'fast_run', 'fast_compile', 'gpu')
# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS enable the GPU! # DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS
# enable the GPU!
optdb.register('gpu_opt', optdb.register('gpu_opt',
gpu_seqopt, gpu_seqopt,
optdb.__position__.get('add_destroy_handler', 49.5) - 1, optdb.__position__.get('add_destroy_handler', 49.5) - 1,
...@@ -266,8 +276,8 @@ def local_gpu_elemwise_0(node): ...@@ -266,8 +276,8 @@ def local_gpu_elemwise_0(node):
'uint16']) 'uint16'])
# case 1 - all inputs are already float32 # case 1 - all inputs are already float32
if all([i.type.dtype == 'float32' for i in node.inputs]): if all([i.type.dtype == 'float32' for i in node.inputs]):
# TODO: change this when fusion makes Elemwise with multiple # TODO: change this when fusion makes Elemwise with
# outputs # multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs)) for i in node.inputs))
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
...@@ -346,8 +356,8 @@ def local_gpu_split(node): ...@@ -346,8 +356,8 @@ def local_gpu_split(node):
any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
in outs_clients])): in outs_clients])):
new_op = GpuSplit(node.op.len_splits) new_op = GpuSplit(node.op.len_splits)
split_res = new_op(as_cuda_ndarray_variable(input), *node.inputs[1:], split_res = new_op(as_cuda_ndarray_variable(input),
return_list=True) *node.inputs[1:], return_list=True)
return [host_from_gpu(o) for o in split_res] return [host_from_gpu(o) for o in split_res]
return False return False
...@@ -374,7 +384,8 @@ def local_gpu_dimshuffle_0(node): ...@@ -374,7 +384,8 @@ def local_gpu_dimshuffle_0(node):
dimshuffle_node = host_input.owner dimshuffle_node = host_input.owner
new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable, new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
dimshuffle_node.op.new_order) dimshuffle_node.op.new_order)
return [new_op(as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))] return [new_op(
as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
return False return False
...@@ -389,8 +400,8 @@ def local_gpu_specifyShape_0(node): ...@@ -389,8 +400,8 @@ def local_gpu_specifyShape_0(node):
if isinstance(node.op, tensor.SpecifyShape): if isinstance(node.op, tensor.SpecifyShape):
input = node.inputs[0] input = node.inputs[0]
if input.owner and isinstance(input.owner.op, HostFromGpu): if input.owner and isinstance(input.owner.op, HostFromGpu):
return [host_from_gpu(tensor.specify_shape(as_cuda_ndarray_variable(input), return [host_from_gpu(tensor.specify_shape(
*node.inputs[1:]))] as_cuda_ndarray_variable(input), *node.inputs[1:]))]
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
...@@ -467,11 +478,15 @@ def local_gpu_dot_to_dot22(node): ...@@ -467,11 +478,15 @@ def local_gpu_dot_to_dot22(node):
shape_out))] shape_out))]
return False return False
@local_optimizer(None) @local_optimizer(None)
def local_assert_no_cpu_op(node): def local_assert_no_cpu_op(node):
if not isinstance(node.op, GpuOp) and all([var.owner and isinstance(var.owner.op, if (not isinstance(node.op, GpuOp) and
HostFromGpu) for var in node.inputs]) and any([[c for c in var.clients all([var.owner and isinstance(var.owner.op, HostFromGpu)
if isinstance(c[0].op, GpuFromHost)] for var in node.outputs]): for var in node.inputs]) and
any([[c for c in var.clients if isinstance(c[0].op, GpuFromHost)]
for var in node.outputs])):
if config.assert_no_cpu_op == "warn": if config.assert_no_cpu_op == "warn":
_logger.warning(("CPU op %s is detected in the computational" _logger.warning(("CPU op %s is detected in the computational"
" graph") % node) " graph") % node)
...@@ -492,7 +507,7 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2) ...@@ -492,7 +507,7 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2)
@register_opt() @register_opt()
@local_optimizer([theano.ifelse.IfElse, gpu_from_host]) @local_optimizer([theano.ifelse.IfElse, gpu_from_host])
def local_gpu_lazy_ifelse(node): def local_gpu_lazy_ifelse(node):
""" """
gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host) gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host)
ifelse(host_from_gpu) -> host_from_gpu(ifelse) ifelse(host_from_gpu) -> host_from_gpu(ifelse)
...@@ -572,7 +587,8 @@ def local_gpu_dot22(node): ...@@ -572,7 +587,8 @@ def local_gpu_dot22(node):
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Dot22): tensor.blas.Dot22):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_dot22(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))] return [gpu_dot22(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y))]
if isinstance(node.op, tensor.blas.Dot22): if isinstance(node.op, tensor.blas.Dot22):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
...@@ -597,7 +613,8 @@ def local_gpu_dot22scalar(node): ...@@ -597,7 +613,8 @@ def local_gpu_dot22scalar(node):
isinstance(host_input.owner.op, isinstance(host_input.owner.op,
tensor.blas.Dot22Scalar)): tensor.blas.Dot22Scalar)):
x, y, scalar = host_input.owner.inputs x, y, scalar = host_input.owner.inputs
return [gpu_dot22scalar(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), return [gpu_dot22scalar(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y),
tensor.blas._as_scalar(scalar))] tensor.blas._as_scalar(scalar))]
if isinstance(node.op, tensor.blas.Dot22Scalar): if isinstance(node.op, tensor.blas.Dot22Scalar):
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
...@@ -625,7 +642,8 @@ def local_gpu_solve(node): ...@@ -625,7 +642,8 @@ def local_gpu_solve(node):
isinstance(host_input.owner.op, isinstance(host_input.owner.op,
slinalg.Solve)): slinalg.Solve)):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_solve(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))] return [gpu_solve(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y))]
if isinstance(node.op, slinalg.Solve): if isinstance(node.op, slinalg.Solve):
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
...@@ -633,7 +651,7 @@ def local_gpu_solve(node): ...@@ -633,7 +651,7 @@ def local_gpu_solve(node):
x, y = node.inputs x, y = node.inputs
return [host_from_gpu( return [host_from_gpu(
gpu_solve(as_cuda_ndarray_variable(x), gpu_solve(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y)))] as_cuda_ndarray_variable(y)))]
return False return False
...@@ -648,7 +666,7 @@ def local_gpu_gemv(node): ...@@ -648,7 +666,7 @@ def local_gpu_gemv(node):
""" """
gemvs = (tensor.blas.Gemv, gemvs = (tensor.blas.Gemv,
tensor.blas_c.CGemv, tensor.blas_c.CGemv,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, gemvs): if host_input.owner and isinstance(host_input.owner.op, gemvs):
...@@ -688,7 +706,7 @@ def local_gpu_ger(node): ...@@ -688,7 +706,7 @@ def local_gpu_ger(node):
gers = (tensor.blas_c.CGer, gers = (tensor.blas_c.CGer,
tensor.blas.Ger, tensor.blas.Ger,
tensor.blas_scipy.ScipyGer, tensor.blas_scipy.ScipyGer,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -711,8 +729,7 @@ def local_gpu_ger(node): ...@@ -711,8 +729,7 @@ def local_gpu_ger(node):
as_cuda_ndarray_variable(z), as_cuda_ndarray_variable(z),
a, a,
as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y) as_cuda_ndarray_variable(y)))]
))]
return False return False
...@@ -741,11 +758,12 @@ def local_gpu_gemm(node): ...@@ -741,11 +758,12 @@ def local_gpu_gemm(node):
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu)) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu)) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm_no_inplace(as_cuda_ndarray_variable(z), return [host_from_gpu(gpu_gemm_no_inplace(
a, as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(x), a,
as_cuda_ndarray_variable(y), as_cuda_ndarray_variable(x),
b))] as_cuda_ndarray_variable(y),
b))]
return False return False
...@@ -882,8 +900,8 @@ def local_gpu_elemwise_careduce(node): ...@@ -882,8 +900,8 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result # operation with some reduction pattern will probably result
# to slow down. # to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr) isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)] return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)]
...@@ -898,7 +916,8 @@ def local_gpu_reshape(node): ...@@ -898,7 +916,8 @@ def local_gpu_reshape(node):
isinstance(host_input.owner.op, tensor.Reshape): isinstance(host_input.owner.op, tensor.Reshape):
rshp = host_input.owner.op rshp = host_input.owner.op
x, shp = host_input.owner.inputs x, shp = host_input.owner.inputs
gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x), shp) gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x),
shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable: if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
# this can happen as we always return False for all broadcast # this can happen as we always return False for all broadcast
# dim in GpuReshape but not for Reshape # dim in GpuReshape but not for Reshape
...@@ -957,23 +976,27 @@ def local_gpu_subtensor(node): ...@@ -957,23 +976,27 @@ def local_gpu_subtensor(node):
# to the GPU in that case. # to the GPU in that case.
return return
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x), *coords)] return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x),
*coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and if (x.owner and
isinstance(x.owner.op, HostFromGpu) and isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"): x.dtype == "float32"):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and
isinstance(gpu_x.owner.op, GpuFromHost) and isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph. # And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner): not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or isinstance(n.op, GpuOp) if any([n == 'output' or isinstance(n.op, GpuOp)
for n, _ in node.outputs[0].clients]): for n, _ in node.outputs[0].clients]):
return return
else: else:
return [host_from_gpu(as_cuda_ndarray_variable(node.outputs[0]))] return [host_from_gpu(as_cuda_ndarray_variable(
node.outputs[0]))]
return return
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
...@@ -992,11 +1015,13 @@ def local_gpu_advanced_subtensor1(node): ...@@ -992,11 +1015,13 @@ def local_gpu_advanced_subtensor1(node):
host_input.owner.op.__class__ is tensor.AdvancedSubtensor1: host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
x = host_input.owner.inputs[0] x = host_input.owner.inputs[0]
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x), *coords)] return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x),
*coords)]
if node.op.__class__ is tensor.AdvancedSubtensor1: if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32": if (x.owner and isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))] return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
return False return False
...@@ -1027,12 +1052,14 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1027,12 +1052,14 @@ def local_gpu_advanced_incsubtensor1(node):
if (compute_capability < 2 or if (compute_capability < 2 or
x.ndim != 2 or x.ndim != 2 or
y.ndim != 2): y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
else: else:
gpu_op = GpuAdvancedIncSubtensor1_dev20( gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)] return [gpu_op(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
...@@ -1183,7 +1210,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -1183,7 +1210,7 @@ def local_gpu_pdbbreakpoint_op(node):
nb_monitored_vars = len(node.outputs) nb_monitored_vars = len(node.outputs)
for i in range(nb_monitored_vars): for i in range(nb_monitored_vars):
inp = old_inputs[i+1] inp = old_inputs[i + 1]
out = old_outputs[i] out = old_outputs[i]
input_is_from_gpu = (inp.owner and input_is_from_gpu = (inp.owner and
...@@ -1248,18 +1275,17 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -1248,18 +1275,17 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
# thing if we want, since this gpu op will cast to integers # thing if we want, since this gpu op will cast to integers
# internally anyway # internally anyway
int_cast_ops = ( int_cast_ops = (
tensor.basic._convert_to_int32, tensor.basic._convert_to_int32,
tensor.basic._convert_to_int8, tensor.basic._convert_to_int8,
tensor.basic._convert_to_int16, tensor.basic._convert_to_int16,
tensor.basic._convert_to_int64, tensor.basic._convert_to_int64)
)
while y.owner and y.owner.op in int_cast_ops: while y.owner and y.owner.op in int_cast_ops:
y = y.owner.inputs[0] y = y.owner.inputs[0]
gpu_nll, gpu_sm, gpu_am = \ gpu_nll, gpu_sm, gpu_am = \
GpuCrossentropySoftmaxArgmax1HotWithBias()( GpuCrossentropySoftmaxArgmax1HotWithBias()(
gpu_x, gpu_x,
as_cuda_ndarray_variable(b), as_cuda_ndarray_variable(b),
as_cuda_ndarray_variable(cast(y, 'float32'))) as_cuda_ndarray_variable(cast(y, 'float32')))
am_dtype = node.outputs[2].type.dtype am_dtype = node.outputs[2].type.dtype
return [host_from_gpu(gpu_nll), return [host_from_gpu(gpu_nll),
host_from_gpu(gpu_sm), host_from_gpu(gpu_sm),
...@@ -1302,7 +1328,8 @@ def local_gpu_softmax_with_bias(node): ...@@ -1302,7 +1328,8 @@ def local_gpu_softmax_with_bias(node):
x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu) x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu) b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
if x_on_gpu or b_on_gpu: if x_on_gpu or b_on_gpu:
gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(b)) gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(b))
return [host_from_gpu(gpu_sm)] return [host_from_gpu(gpu_sm)]
return False return False
...@@ -1319,6 +1346,7 @@ def _gpu_conv_to_fftconv(node): ...@@ -1319,6 +1346,7 @@ def _gpu_conv_to_fftconv(node):
if (node.op.imshp is not None and if (node.op.imshp is not None and
node.op.imshp[-1] is not None and node.op.imshp[-1] is not None and
node.op.imshp[-1] % 2 == 1): node.op.imshp[-1] % 2 == 1):
kwargs['pad_last_dim'] = True kwargs['pad_last_dim'] = True
# If the user supplied the full nonsymbolic image_shape and # If the user supplied the full nonsymbolic image_shape and
# filter_shape in conv2d(), we can pass it on to conv2d_fft(). # filter_shape in conv2d(), we can pass it on to conv2d_fft().
...@@ -1332,7 +1360,8 @@ def _gpu_conv_to_fftconv(node): ...@@ -1332,7 +1360,8 @@ def _gpu_conv_to_fftconv(node):
(node.op.nkern is not None) and (node.op.nkern is not None) and
(len(node.op.imshp) == 3) and (len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)): (node.op.imshp[0] is not None)):
kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + node.op.kshp kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + \
node.op.kshp
rval = conv2d_fft(node.inputs[0], node.inputs[1], **kwargs) rval = conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
if node.outputs[0].broadcastable != rval.broadcastable: if node.outputs[0].broadcastable != rval.broadcastable:
# With given shape information, conv2d_fft may return a different # With given shape information, conv2d_fft may return a different
...@@ -1348,6 +1377,7 @@ def local_conv_fft_valid(node): ...@@ -1348,6 +1377,7 @@ def local_conv_fft_valid(node):
if (node.op.border_mode == 'valid' and if (node.op.border_mode == 'valid' and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1) and
node.op.fft_opt): node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return False return False
...@@ -1358,6 +1388,7 @@ def local_conv_fft_full(node): ...@@ -1358,6 +1388,7 @@ def local_conv_fft_full(node):
if (node.op.border_mode == 'full' and if (node.op.border_mode == 'full' and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1) and
node.op.fft_opt): node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return return
...@@ -1396,19 +1427,19 @@ def local_gpu_conv(node): ...@@ -1396,19 +1427,19 @@ def local_gpu_conv(node):
# print op.kshp, op.imshp[1:3] # print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw # print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode, ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy), subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw, logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical, logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned, logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp, kshp=op.kshp,
version=op.version, version=op.version,
direction_hint=op.direction_hint, direction_hint=op.direction_hint,
verbose=op.verbose, verbose=op.verbose,
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern, nkern=op.nkern,
bsize=op.bsize, bsize=op.bsize,
fft_opt=op.fft_opt fft_opt=op.fft_opt
) )
if op.imshp_logical is not None: if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3] logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]: if logical_img_hw != op.imshp[1:3]:
...@@ -1471,6 +1502,7 @@ def local_gpu_conv(node): ...@@ -1471,6 +1502,7 @@ def local_gpu_conv(node):
def local_conv_gemm(node): def local_conv_gemm(node):
if (isinstance(node.op, GpuConv) and if (isinstance(node.op, GpuConv) and
node.op.border_mode in ['full', 'valid']): node.op.border_mode in ['full', 'valid']):
img, kern = node.inputs img, kern = node.inputs
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
...@@ -1494,7 +1526,7 @@ def local_conv_gemm(node): ...@@ -1494,7 +1526,7 @@ def local_conv_gemm(node):
# we know the kernel and output size # we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1] prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) * prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1)) (node.op.imshp[-1] - node.op.kshp[1] + 1))
if ((node.op.bsize is not None) and if ((node.op.bsize is not None) and
(len(node.op.imshp) == 3) and (len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)): (node.op.imshp[0] is not None)):
...@@ -1516,7 +1548,7 @@ def local_conv_gemm(node): ...@@ -1516,7 +1548,7 @@ def local_conv_gemm(node):
kern = kern.dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs # call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)( rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img)) gpu_contiguous(kern), gpu_contiguous(img))
if node.outputs[0].broadcastable != rval.broadcastable: if node.outputs[0].broadcastable != rval.broadcastable:
# With given shape information, conv2d_fft may return a different # With given shape information, conv2d_fft may return a different
# broadcast pattern than GpuConv. This is forbidden, so we fix it. # broadcast pattern than GpuConv. This is forbidden, so we fix it.
...@@ -1594,10 +1626,12 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer): ...@@ -1594,10 +1626,12 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
if ((var in inputs) and if ((var in inputs) and
(shape is not None) and (shape is not None) and
not any(s is None for s in shape)): not any(s is None for s in shape)):
result[var] = theano.shared( result[var] = theano.shared(
# TODO: Use var.type.filter when cuda_ndarray.filter supports non-strict casts # TODO: Use var.type.filter when cuda_ndarray.filter
# var.type.filter(numpy.random.randn(*shape), # supports non-strict casts
# allow_downcast=True), # var.type.filter(numpy.random.randn(*shape),
# allow_downcast=True),
numpy.require(numpy.random.randn(*shape), numpy.require(numpy.random.randn(*shape),
dtype=var.dtype), dtype=var.dtype),
var.name, var.name,
...@@ -1608,10 +1642,11 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer): ...@@ -1608,10 +1642,11 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
# We just register all optimizers from conv_groupopt with the metaoptimizer # We just register all optimizers from conv_groupopt with the metaoptimizer
conv_metaopt = ConvMetaOptimizer( conv_metaopt = ConvMetaOptimizer(
conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts) conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts)
# Then we add some optimizers that try less obvious options # Then we add some optimizers that try less obvious options
conv_metaopt.register(dnn.local_conv_dnn_alternative) conv_metaopt.register(dnn.local_conv_dnn_alternative)
# Finally, we register the metaoptimizer as the first optimizer in conv_groupopt # Finally, we register the metaoptimizer as the first optimizer in
# conv_groupopt
conv_groupopt.register('conv_meta', conv_metaopt, 0) conv_groupopt.register('conv_meta', conv_metaopt, 0)
...@@ -1656,6 +1691,7 @@ def local_convgrad3d_fft(node): ...@@ -1656,6 +1691,7 @@ def local_convgrad3d_fft(node):
return False return False
if (isinstance(node.op, ConvGrad3D) and if (isinstance(node.op, ConvGrad3D) and
(stride_x, stride_y, stride_z) == (1, 1, 1)): (stride_x, stride_y, stride_z) == (1, 1, 1)):
# we import conv3d_fft locally to avoid pycuda warnings # we import conv3d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv3d_fft from theano.sandbox.cuda.fftconv import conv3d_fft
# Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t) # Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t)
...@@ -1742,8 +1778,8 @@ def local_convgrad3d_gemm(node): ...@@ -1742,8 +1778,8 @@ def local_convgrad3d_gemm(node):
f = node.inputs[3] f = node.inputs[3]
f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3)) f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(x, f, rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(
shape=node.inputs[2][1:4]) x, f, shape=node.inputs[2][1:4])
# Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic) # Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic)
return [rval.dimshuffle(0, 2, 3, 4, 1)] return [rval.dimshuffle(0, 2, 3, 4, 1)]
...@@ -1765,7 +1801,8 @@ def local_convtransp3d_gemm(node): ...@@ -1765,7 +1801,8 @@ def local_convtransp3d_gemm(node):
# Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t) # Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t)
f = node.inputs[3] f = node.inputs[3]
f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3)) f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x, topgrad=f) rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x,
topgrad=f)
# Shuffle from (ic, b, 0, 1, t) to (b, 0, 1, t, ic) # Shuffle from (ic, b, 0, 1, t) to (b, 0, 1, t, ic)
return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[1]] return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[1]]
...@@ -1781,6 +1818,7 @@ import theano.tensor.signal.downsample as downsample ...@@ -1781,6 +1818,7 @@ import theano.tensor.signal.downsample as downsample
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
if (isinstance(node.op, downsample.DownsampleFactorMax) if (isinstance(node.op, downsample.DownsampleFactorMax)
and node.op.ds == node.op.st): and node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
if node.op.padding != (0, 0) or node.op.mode != 'max': if node.op.padding != (0, 0) or node.op.mode != 'max':
...@@ -1796,11 +1834,13 @@ def local_gpu_downsample_factor_max(node): ...@@ -1796,11 +1834,13 @@ def local_gpu_downsample_factor_max(node):
def local_gpu_downsample_factor_max_grad(node): def local_gpu_downsample_factor_max_grad(node):
if (isinstance(node.op, downsample.MaxPoolGrad) and if (isinstance(node.op, downsample.MaxPoolGrad) and
node.op.ds == node.op.st): node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
if (node.op.padding != (0, 0) or if (node.op.padding != (0, 0) or
node.op.mode != 'max' or node.op.mode != 'max' or
node.op.st != node.op.ds): node.op.st != node.op.ds):
return return
x, z, gz = node.inputs x, z, gz = node.inputs
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
...@@ -1871,7 +1911,8 @@ def local_gpu_join(node): ...@@ -1871,7 +1911,8 @@ def local_gpu_join(node):
# print "OPT: axis_and_tensors=", axis_and_tensors # print "OPT: axis_and_tensors=", axis_and_tensors
matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or matches = [(t.owner is not None and
isinstance(t.owner.op, HostFromGpu)) or
isinstance(t, gof.Constant) for t in axis_and_tensors[1:]] isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
# print "OPT: matches =", matches # print "OPT: matches =", matches
...@@ -1879,7 +1920,8 @@ def local_gpu_join(node): ...@@ -1879,7 +1920,8 @@ def local_gpu_join(node):
if all(matches): if all(matches):
# the extra gpu_from_host introduced here will # the extra gpu_from_host introduced here will
# be removed by further optimizations # be removed by further optimizations
new_tensors = [as_cuda_ndarray_variable(t) for t in axis_and_tensors[1:]] new_tensors = [as_cuda_ndarray_variable(t)
for t in axis_and_tensors[1:]]
new_a_and_t = [axis_and_tensors[0]] + new_tensors new_a_and_t = [axis_and_tensors[0]] + new_tensors
replacement_node = host_from_gpu(gpu_join(*new_a_and_t)) replacement_node = host_from_gpu(gpu_join(*new_a_and_t))
...@@ -1936,7 +1978,6 @@ optdb.register('InplaceGpuBlasOpt', ...@@ -1936,7 +1978,6 @@ optdb.register('InplaceGpuBlasOpt',
def get_device_type_sizes(): def get_device_type_sizes():
""" """
Returns Returns
------- -------
tuple tuple
...@@ -1957,7 +1998,8 @@ def get_device_type_sizes(): ...@@ -1957,7 +1998,8 @@ def get_device_type_sizes():
del gpu_int_size del gpu_int_size
del t del t
except Exception as e: except Exception as e:
_logger.warning(("Optimization Warning: " _logger.warning((
"Optimization Warning: "
"Got the following error, but you can ignore it. " "Got the following error, but you can ignore it. "
"This could cause less GpuElemwise fused together.\n" "This could cause less GpuElemwise fused together.\n"
"%s") % e) "%s") % e)
...@@ -1992,7 +2034,7 @@ def max_inputs_to_GpuElemwise(node): ...@@ -1992,7 +2034,7 @@ def max_inputs_to_GpuElemwise(node):
size_param_mandatory = int_size # for numels size_param_mandatory = int_size # for numels
size_param_mandatory += int_size * ndim # for the shape size_param_mandatory += int_size * ndim # for the shape
size_param_mandatory += sum((gpu_ptr_size + int_size * ndim) size_param_mandatory += sum((gpu_ptr_size + int_size * ndim)
for i in node.outputs) for i in node.outputs)
nb_bytes_avail = argument_limit - size_param_mandatory nb_bytes_avail = argument_limit - size_param_mandatory
nb_bytes_per_inputs = (ndim * int_size) + gpu_ptr_size nb_bytes_per_inputs = (ndim * int_size) + gpu_ptr_size
...@@ -2032,11 +2074,11 @@ def split_huge_add_or_mul(node): ...@@ -2032,11 +2074,11 @@ def split_huge_add_or_mul(node):
# GpuElemwise fusion # GpuElemwise fusion
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise, GpuElemwise, max_inputs_to_GpuElemwise)
max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion: if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
# Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5 # Must be after cpu fusion at 40, gpu at 48.5 and before
# AddDestroyHandler at 49.5
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
49, 'fast_run', 'fusion', 49, 'fast_run', 'fusion',
...@@ -2050,7 +2092,7 @@ else: ...@@ -2050,7 +2092,7 @@ else:
# GpuElemwise inplace # GpuElemwise inplace
gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op( gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
GpuElemwise) GpuElemwise)
# DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile. # DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile.
# It still will be run in fast_run with device=gpu with the current tag. # It still will be run in fast_run with device=gpu with the current tag.
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75, optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
...@@ -2064,7 +2106,8 @@ gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])( ...@@ -2064,7 +2106,8 @@ gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle) tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
) )
register_opt()(gpu_elemwise_alloc) register_opt()(gpu_elemwise_alloc)
register_opt()(tensor.opt.local_useless_elemwise) # needed by gpu_elemwise_alloc # needed by gpu_elemwise_alloc
register_opt()(tensor.opt.local_useless_elemwise)
tensor.opt.register_specialize_device(gpu_elemwise_alloc) tensor.opt.register_specialize_device(gpu_elemwise_alloc)
...@@ -2110,8 +2153,7 @@ def local_gpualloc(node): ...@@ -2110,8 +2153,7 @@ def local_gpualloc(node):
new_out.type.broadcastable): new_out.type.broadcastable):
assert b_new or (not b_old) assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out, old_out.broadcastable) new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
# if old_out.type != new_out.type:
#import pdb; pdb.set_trace()
return [new_out] return [new_out]
...@@ -2134,12 +2176,14 @@ def local_gpualloc_memset_0(node): ...@@ -2134,12 +2176,14 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, CudaNdarrayConstant) and if (isinstance(inp, CudaNdarrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs) new_out = GpuAlloc(memset_0=True)(*node.inputs)
old_bcast = node.outputs[0].type.broadcastable old_bcast = node.outputs[0].type.broadcastable
if new_out.type.broadcastable != old_bcast: if new_out.type.broadcastable != old_bcast:
# check that we did not try discarding a broadcastable dimension # check that we did not try discarding a broadcastable
assert not any(b_old and not b_new for b_old, b_new in zip( # dimension
old_bcast, new_out.type.broadcastable)) assert not any(b_old and not b_new for b_old, b_new in
zip(old_bcast, new_out.type.broadcastable))
# force old broadcasting pattern; we must not change it here # force old broadcasting pattern; we must not change it here
new_out = tensor.patternbroadcast(new_out, old_bcast) new_out = tensor.patternbroadcast(new_out, old_bcast)
return [new_out] return [new_out]
...@@ -2172,6 +2216,7 @@ def local_gpu_eye(node): ...@@ -2172,6 +2216,7 @@ def local_gpu_eye(node):
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, tensor.Eye) and isinstance(host_input.owner.op, tensor.Eye) and
host_input.owner.op.dtype == "float32"): host_input.owner.op.dtype == "float32"):
return [gpu_eye(*host_input.owner.inputs)] return [gpu_eye(*host_input.owner.inputs)]
if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32": if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
...@@ -2183,6 +2228,7 @@ def local_gpu_eye(node): ...@@ -2183,6 +2228,7 @@ def local_gpu_eye(node):
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
return as_cuda_ndarray_variable(x) return as_cuda_ndarray_variable(x)
else: else:
return x return x
...@@ -2237,6 +2283,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -2237,6 +2283,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
def tensor_to_cuda(x): def tensor_to_cuda(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
y = CudaNdarrayType(broadcastable=x.type.broadcastable)() y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
if x.name: if x.name:
y.name = x.name + '[cuda]' y.name = x.name + '[cuda]'
...@@ -2259,7 +2306,8 @@ def local_gpu_extract_diagonal(node): ...@@ -2259,7 +2306,8 @@ def local_gpu_extract_diagonal(node):
theano.tensor.TensorType)): theano.tensor.TensorType)):
inp = node.inputs[0] inp = node.inputs[0]
if inp.owner and isinstance(inp.owner.op, HostFromGpu): if inp.owner and isinstance(inp.owner.op, HostFromGpu):
return [host_from_gpu(nlinalg.extract_diag(as_cuda_ndarray_variable(inp)))] return [host_from_gpu(nlinalg.extract_diag(
as_cuda_ndarray_variable(inp)))]
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
...@@ -2295,6 +2343,7 @@ def gpuScanOptimization(node): ...@@ -2295,6 +2343,7 @@ def gpuScanOptimization(node):
isinstance(host_input.owner.op, scan_op.Scan) and isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu'] and not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1): len(host_input.owner.outputs) == 1):
# Note that we are not doing the right thing here !! # Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one # This is because the local optimizer expects only one
# output that corresponds to the input of ``node`` # output that corresponds to the input of ``node``
...@@ -2348,6 +2397,7 @@ def gpuScanOptimization(node): ...@@ -2348,6 +2397,7 @@ def gpuScanOptimization(node):
# scan(host_from_gpu) -> host_from_gpu(GPUscan) # scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan if (type(node.op) == scan_op.Scan
and not node.op.info['gpu']): and not node.op.info['gpu']):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
...@@ -2420,4 +2470,132 @@ optdb.register('gpu_scanOp_make_inplace', ...@@ -2420,4 +2470,132 @@ optdb.register('gpu_scanOp_make_inplace',
'inplace', 'inplace',
'scan') 'scan')
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @register_opt()
# @alpha_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, GpuSparseBlockOuter) ->
# GpuSparseBlockOuter(..., alpha=lr)
# """
# return [gpu_sparse_block_outer(*inputs)]
# @register_opt()
# @output_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [gpu_sparse_block_outer(*inputs)]
def _owner_isinstance(inp, test_class):
"""
Tests whether input has an owner and if its owner is
of type `test_class`
"""
return bool(inp.owner) and isinstance(inp.owner.op, test_class)
def _clear_host_from_gpu(inputs):
    """
    Return a copy of `inputs` where every variable produced by a
    HostFromGpu node is replaced by that node's input (i.e. the
    underlying GPU variable); other variables are kept as-is.
    """
    # Comprehension instead of the manual append loop (same semantics).
    return [inp.owner.inputs[0] if _owner_isinstance(inp, HostFromGpu)
            else inp
            for inp in inputs]
@register_opt()
@local_optimizer([SparseBlockGemv, GpuFromHost])
def gpu_sparse_block_gemv_opt(node):
    """
    Move SparseBlockGemv to the GPU:

    SparseBlockGemv(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockGemv(input))
    or
    GpuFromHost(SparseBlockGemv) -> GpuSparseBlockGemv
    """
    op = node.op
    if isinstance(op, SparseBlockGemv):
        # Only lift when at least one input already lives on the GPU.
        if any(_owner_isinstance(i, HostFromGpu) for i in node.inputs):
            gpu_inputs = _clear_host_from_gpu(node.inputs)
            gpu_out = GpuSparseBlockGemv(op.inplace)(*gpu_inputs)
            return [host_from_gpu(gpu_out)]
    if isinstance(op, GpuFromHost):
        host_out = node.inputs[0]
        if _owner_isinstance(host_out, SparseBlockGemv):
            gemv_node = host_out.owner
            gpu_inputs = _clear_host_from_gpu(gemv_node.inputs)
            return [GpuSparseBlockGemv(gemv_node.op.inplace)(*gpu_inputs)]
@register_opt()
@local_optimizer([SparseBlockOuter, GpuFromHost])
def gpu_sparse_block_outer_opt(node):
    """
    Move SparseBlockOuter to the GPU:

    SparseBlockOuter(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockOuter(input))
    or
    GpuFromHost(SparseBlockOuter) -> GpuSparseBlockOuter
    """
    op = node.op
    if isinstance(op, SparseBlockOuter):
        # Only lift when at least one input already lives on the GPU.
        if any(_owner_isinstance(i, HostFromGpu) for i in node.inputs):
            gpu_inputs = _clear_host_from_gpu(node.inputs)
            gpu_out = GpuSparseBlockOuter(op.inplace)(*gpu_inputs)
            return [host_from_gpu(gpu_out)]
    if isinstance(op, GpuFromHost):
        host_out = node.inputs[0]
        if _owner_isinstance(host_out, SparseBlockOuter):
            outer_node = host_out.owner
            gpu_inputs = _clear_host_from_gpu(outer_node.inputs)
            return [GpuSparseBlockOuter(outer_node.op.inplace)(*gpu_inputs)]
@local_optimizer([GpuSparseBlockGemv], inplace=True)
def local_inplace_gpu_sparse_block_gemv(node):
    """
    GpuSparseBlockGemv(inplace=False) -> GpuSparseBlockGemv(inplace=True)
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockGemv) or op.inplace:
        return False
    return [gpu_sparse_block_gemv_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run', 'inplace' and 'gpu' tags.
# warn_inplace downgrades a failed inplace substitution to a warning.
compile.optdb.register('local_inplace_gpu_sparse_block_gemv',
                       TopoOptimizer(
                           local_inplace_gpu_sparse_block_gemv,
                           failure_callback=TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
@local_optimizer([GpuSparseBlockOuter], inplace=True)
def local_inplace_gpu_sparse_block_outer(node):
    """
    GpuSparseBlockOuter(inplace=False) -> GpuSparseBlockOuter(inplace=True)
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockOuter) or op.inplace:
        return False
    return [gpu_sparse_block_outer_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run', 'inplace' and 'gpu' tags.
# warn_inplace downgrades a failed inplace substitution to a warning.
compile.optdb.register('local_inplace_gpu_sparse_block_outer',
                       TopoOptimizer(
                           local_inplace_gpu_sparse_block_outer,
                           failure_callback=TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
import theano.sandbox.cuda.extra_ops import theano.sandbox.cuda.extra_ops
import numpy import numpy
from numpy.random import randn
from unittest import TestCase
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano import theano
from theano import tensor from theano import tensor
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
import theano.sandbox.tests.test_blocksparse
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available: from theano.sandbox.cuda.blocksparse import (GpuSparseBlockOuter,
raise SkipTest('Optional package cuda disabled') gpu_sparse_block_gemv,
gpu_sparse_block_outer)
from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
as_cuda_ndarray_variable)
from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
sparse_block_gemv_ss,
sparse_block_outer_ss,
sparse_block_outer_ss_inplace,
SparseBlockOuterSS)
from theano.sandbox.cuda.var import float32_shared_constructor from theano.sandbox.cuda.var import float32_shared_constructor
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -29,187 +21,56 @@ else: ...@@ -29,187 +21,56 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def setup(): class BlockSparse_Gemv_and_Outer(
utt.seed_rng() theano.sandbox.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
def setUp(self):
utt.seed_rng()
def blocksparse_data(): self.mode = mode_with_gpu.excluding('constant_folding')
nInputBlock = 128 self.gemv_op = gpu_sparse_block_gemv
nOutputBlock = 64 self.outer_op = gpu_sparse_block_outer
inputSize = 40
outputSize = 30
inputWindowSize = 7
outputWindowSize = 9
batchSize = 2
input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
permutation = numpy.random.permutation
inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
for _ in range(batchSize))
outputIndice = numpy.vstack(permutation(nOutputBlock)[:outputWindowSize]
for _ in range(batchSize))
weight = randn(nInputBlock, nOutputBlock,
inputSize, outputSize).astype('float32')
bias = randn(nOutputBlock, outputSize).astype('float32')
return weight, input, inputIndice, bias, outputIndice
def blocksparse(W, h, iIdx, b, oIdx):
o = b.take(oIdx, axis=0)
for b in range(o.shape[0]):
for j in range(o.shape[1]):
outputIdx = oIdx[b, j]
for i in range(h.shape[1]):
inputIdx = iIdx[b, i]
w = W[inputIdx, outputIdx]
# this below is a gemv I think
o[b, j, :] += numpy.dot(h[b, i], w)
return o
def test_blocksparse():
b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
o = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode_with_gpu)
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
ref_out = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
utt.assert_allclose(ref_out, th_out)
test_blocksparse.setup = setup
# test the fortan order for W (which can happen in the grad for some graphs).
def test_blocksparseF():
b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
o = sparse_block_dot_SS(GpuDimShuffle((False, False, False, False),
(0, 1, 3, 2))(
as_cuda_ndarray_variable(W)),
h, iIdx, b, oIdx)
f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode_with_gpu)
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
ref_out = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
utt.assert_allclose(ref_out, th_out)
def test_blocksparse_grad():
h_val = randn(1, 2, 3).astype('float32')
iIdx_val = numpy.random.permutation(3)[:2][None, :]
oIdx_val = numpy.random.permutation(3)[:2][None, :]
W_val = randn(3, 3, 3, 4).astype('float32')
b_val = randn(3, 4).astype('float32')
iIdx = theano.tensor.constant(iIdx_val)
oIdx = theano.tensor.constant(oIdx_val)
def f(b, h, W):
return sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
utt.verify_grad(f, [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_1():
# This tests that we correctly handle cases where dimensions are 1.
h_val = randn(1, 1, 1).astype('float32')
iIdx_val = numpy.random.permutation(1)[:1][None, :]
oIdx_val = numpy.random.permutation(1)[:1][None, :]
W_val = randn(1, 1, 1, 1).astype('float32')
b_val = randn(1, 1).astype('float32')
iIdx = theano.tensor.constant(iIdx_val)
oIdx = theano.tensor.constant(oIdx_val)
def f(b, h, W):
return sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
utt.verify_grad(f, [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_shape():
b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
go = theano.grad(o.sum(), [b, W, h])
f = theano.function([W, h, iIdx, b, oIdx], go, mode=mode_with_gpu)
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
# just make sure that it runs correcly and all the shapes are ok.
b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
assert b_g.shape == b_val.shape
assert h_g.shape == h_val.shape
assert W_g.shape == W_val.shape
# This test is temporarily disabled since we disabled the output_merge # This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness. # and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added. # Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge(): def Xtest_blocksparse_grad_merge(self):
b = tensor.fmatrix() b = tensor.fmatrix()
h = tensor.ftensor3() h = tensor.ftensor3()
iIdx = tensor.lmatrix() iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix() oIdx = tensor.lmatrix()
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data() W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
W = float32_shared_constructor(W_val) W = float32_shared_constructor(W_val)
o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx) o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
gW = theano.grad(o.sum(), W) gW = theano.grad(o.sum(), W)
lr = numpy.asarray(0.05, dtype='float32') lr = numpy.asarray(0.05, dtype='float32')
upd = W - lr * gW upd = W - lr * gW
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu) mode=mode_with_gpu)
# Make sure the lr update was merged. # Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op, SparseBlockOuterSS) assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
GpuSparseBlockOuter)
# Exclude the merge optimizations. # Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha') mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output') mode = mode.excluding('local_merge_blocksparse_output')
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode) f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
# Make sure the lr update is not merged. # Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op, assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
SparseBlockOuterSS) GpuSparseBlockOuter)
f2(h_val, iIdx_val, b_val, oIdx_val) f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value() W_ref = W.get_value()
# reset the var # reset the var
W.set_value(W_val) W.set_value(W_val)
f1(h_val, iIdx_val, b_val, oIdx_val) f1(h_val, iIdx_val, b_val, oIdx_val)
W_opt = W.get_value() W_opt = W.get_value()
utt.assert_allclose(W_ref, W_opt) utt.assert_allclose(W_ref, W_opt)
...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops ...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
...@@ -740,6 +743,36 @@ def test_local_gpu_dot_to_dot22dot(): ...@@ -740,6 +743,36 @@ def test_local_gpu_dot_to_dot22dot():
cmp((3, 4), (4,)) cmp((3, 4), (4,))
def test_blocksparse_gpu_gemv_opt():
    """
    Compiling sparse_block_dot in GPU mode should leave a
    GpuSparseBlockGemv node as the next-to-last node of the graph
    (the last one transfers the result back to the host).
    """
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    b = tensor.fmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fn = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)

    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[-2].op, GpuSparseBlockGemv)
def test_blocksparse_gpu_outer_opt():
    """
    Compiling the gradient of sparse_block_dot w.r.t. W in GPU mode
    should leave a GpuSparseBlockOuter node as the next-to-last node.
    """
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    b = tensor.fmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    grad_W = tensor.grad(out.sum(), wrt=W)
    fn = theano.function([W, h, iIdx, b, oIdx], [out, grad_W],
                         mode=mode_with_gpu)

    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[-2].op, GpuSparseBlockOuter)
class test_diag(theano.tensor.tests.test_nlinalg.test_diag): class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
mode = mode_with_gpu mode = mode_with_gpu
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
...@@ -756,4 +789,3 @@ if __name__ == '__main__': ...@@ -756,4 +789,3 @@ if __name__ == '__main__':
test_opt_gpujoin_onlyajoin() test_opt_gpujoin_onlyajoin()
test_opt_gpujoin_joinvectors_elemwise_then_minusone() test_opt_gpujoin_joinvectors_elemwise_then_minusone()
test_opt_gpujoin_joinvectors_negativeaxes() test_opt_gpujoin_joinvectors_negativeaxes()
"""
Optimizations addressing the ops in sandbox root directory
"""
from theano import compile # to register the optimizer built by this file
from theano import gof
from theano.sandbox.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv_inplace,
sparse_block_outer_inplace)
@gof.local_optimizer([SparseBlockGemv], inplace=True)
def local_inplace_sparse_block_gemv(node):
    """
    SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
    """
    op = node.op
    if not isinstance(op, SparseBlockGemv) or op.inplace:
        return False
    return [sparse_block_gemv_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run' and 'inplace' tags, so it only runs
# when inplace optimizations are enabled.
compile.optdb.register('local_inplace_sparse_block_gemv',
                       gof.TopoOptimizer(
                           local_inplace_sparse_block_gemv,
                           failure_callback=gof.TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace')  # DEBUG
@gof.local_optimizer([SparseBlockOuter], inplace=True)
def local_inplace_sparse_block_outer(node):
    """
    SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
    """
    op = node.op
    if not isinstance(op, SparseBlockOuter) or op.inplace:
        return False
    return [sparse_block_outer_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run' and 'inplace' tags, so it only runs
# when inplace optimizations are enabled.
compile.optdb.register('local_inplace_sparse_block_outer',
                       gof.TopoOptimizer(
                           local_inplace_sparse_block_outer,
                           failure_callback=gof.TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace')  # DEBUG
"""
Tests for block sparse dot
"""
import unittest
import numpy
from numpy.random import randn
import theano
from theano import tensor
import theano.tests.unittest_tools as utt
from theano.sandbox.blocksparse import sparse_block_dot, \
sparse_block_gemv, sparse_block_outer
class BlockSparse_Gemv_and_Outer(unittest.TestCase):
    """
    Tests for the sparse block gemv and outer ops.

    `setUp` installs the CPU ops and a CPU compilation mode; a subclass
    can presumably override it to test another backend (e.g. GPU) —
    the GPU test file appears to do exactly that.
    """

    def runTest(self):
        # Present so the TestCase can be instantiated directly.
        pass

    def setUp(self):
        utt.seed_rng()
        # constant_folding is excluded so the ops under test remain in
        # the compiled graph instead of being folded away.
        self.mode = theano.compile.get_default_mode().excluding(
            'constant_folding'
        )
        self.gemv_op = sparse_block_gemv
        self.outer_op = sparse_block_outer

    @staticmethod
    def gemv_data():
        # Random data for the gemv tests:
        #   weight:       (nInputBlock, nOutputBlock, inputSize, outputSize)
        #   input:        (batchSize, inputWindowSize, inputSize)
        #   inputIndice:  (batchSize, inputWindowSize) int32 block indices
        #   bias:         (nOutputBlock, outputSize)
        #   outputIndice: (batchSize, outputWindowSize) int32 block indices
        nInputBlock = 8
        nOutputBlock = 7
        inputSize = 6
        outputSize = 5
        inputWindowSize = 4
        outputWindowSize = 3
        batchSize = 2

        # Smaller alternative sizes, kept for manual debugging:
        # nInputBlock = 2
        # nOutputBlock = 2
        # inputSize = 2
        # outputSize = 2
        # inputWindowSize = 1
        # outputWindowSize = 1
        # batchSize = 1

        input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
        permutation = numpy.random.permutation
        # Indices come from permutations, so each block index appears at
        # most once per sample.
        # NOTE(review): numpy.vstack over a generator expression is
        # deprecated in recent NumPy — wrap in list() if upgrading.
        inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
                                   for _ in range(batchSize)).astype('int32')
        outputIndice = numpy.vstack(
            permutation(nOutputBlock)[:outputWindowSize]
            for _ in range(batchSize)).astype('int32')
        weight = randn(nInputBlock, nOutputBlock,
                       inputSize, outputSize).astype('float32')
        bias = randn(nOutputBlock, outputSize).astype('float32')

        return weight, input, inputIndice, bias, outputIndice

    @staticmethod
    def outer_data():
        # Random data for the outer tests. Unlike gemv_data, indices are
        # drawn with randint, so a block index may repeat within a sample.
        nInputBlock = 8
        nOutputBlock = 7
        xSize = 6
        ySize = 5
        xWindowSize = 4
        yWindowSize = 3
        batchSize = 2

        o = randn(nInputBlock, nOutputBlock, xSize, ySize).astype('float32')
        x = randn(batchSize, xWindowSize, xSize).astype('float32')
        y = randn(batchSize, yWindowSize, ySize).astype('float32')
        randint = numpy.random.randint

        xIdx = numpy.vstack(randint(0, nInputBlock, size=xWindowSize)
                            for _ in range(batchSize)).astype('int32')
        yIdx = numpy.vstack(randint(0, nOutputBlock, size=yWindowSize)
                            for _ in range(batchSize)).astype('int32')

        return o, x, y, xIdx, yIdx

    @staticmethod
    def gemv_numpy(o, W, h, iIdx, oIdx):
        # Reference implementation: for each sample and each selected
        # output block, accumulate the dot product of every input block
        # with its weight block.  NOTE: mutates and returns `o`.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += numpy.dot(h[b, i], w)
        return o

    @staticmethod
    def gemv_numpy2(o, W, h, iIdx, oIdx):
        """
        Other implementation: one flattened matrix product per sample.
        """
        from numpy import ix_
        for b in range(o.shape[0]):
            w = W[ix_(iIdx[b], oIdx[b])].swapaxes(1, 2)
            w = w.reshape((w.shape[0] * w.shape[1], w.shape[2] * w.shape[3]))
            o[b] += numpy.dot(h[b].ravel(), w).reshape(o.shape[1:])
        return o

    @staticmethod
    def gemv_numpy3(o, W, h, iIdx, oIdx):
        """
        Other implementation: einsum over the selected weight blocks.
        """
        from numpy import ix_
        for b in range(o.shape[0]):
            w = W[ix_(iIdx[b], oIdx[b])]
            # The next three lines do the same operation. The last one is the
            # fastest.
            # o[b] += (h[b][:, None, :, None] * w).sum(axis=(0, 2))
            # o[b] += numpy.tensordot(h[b], w, [(0,1),(0,2)])
            o[b] += numpy.einsum('ik,ijkl', h[b], w)
        return o

    @staticmethod
    def gemv_data2():
        # Larger problem sizes (single sample) — presumably for
        # performance experiments; not referenced by the tests below.
        nInputBlock = 100
        nOutputBlock = 100
        inputSize = 50
        outputSize = 50
        inputWindowSize = 30
        outputWindowSize = 30
        batchSize = 1

        input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
        permutation = numpy.random.permutation
        inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
                                   for _ in range(batchSize)).astype('int32')
        outputIndice = numpy.vstack(
            permutation(nOutputBlock)[:outputWindowSize]
            for _ in range(batchSize)).astype('int32')
        weight = randn(nInputBlock, nOutputBlock,
                       inputSize, outputSize).astype('float32')
        bias = randn(nOutputBlock, outputSize).astype('float32')

        return weight, input, inputIndice, bias, outputIndice

    @staticmethod
    def outer_numpy(o, x, y, xIdx, yIdx):
        # Reference implementation of the block outer product.
        # NOTE: mutates and returns `o`.
        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    o[xIdx[b, i], yIdx[b, j]] += numpy.outer(x[b, i, :],
                                                             y[b, j, :])
        return o

    def test_sparseblockdot(self):
        """
        Compares the numpy version of sparseblockgemv to sparse_block_dot.
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = sparse_block_dot(W, h, iIdx, b, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemv(self):
        """
        Compares the numpy and theano versions of sparseblockgemv.
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemvF(self):
        """
        Test the fortran order for W (which can happen in the grad for some
        graphs).
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        # DimShuffle swaps the two inner axes, so the op receives a
        # non-contiguous (Fortran-ordered) view of W.
        o = self.gemv_op(b.take(oIdx, axis=0),
                         tensor.DimShuffle((False, False, False, False),
                                           (0, 1, 3, 2))
                         (tensor.as_tensor_variable(W)),
                         h, iIdx, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        # The swapaxes below undoes the symbolic DimShuffle above.
        th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val,
                   oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemv_grad(self):
        # NOTE(review): the gemv_data() result is immediately overwritten
        # by the hard-coded 1-sized values below, which makes this test a
        # duplicate of test_sparseblockgemv_grad_1 — confirm whether the
        # gemv_data() values were meant to be used here.
        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        h_val = randn(1, 1, 1).astype('float32')
        iIdx_val = numpy.random.permutation(1)[:1][None, :]
        oIdx_val = numpy.random.permutation(1)[:1][None, :]
        W_val = randn(1, 1, 1, 1).astype('float32')
        b_val = randn(1, 1).astype('float32')

        iIdx = theano.tensor.constant(iIdx_val)
        oIdx = theano.tensor.constant(oIdx_val)

        def metaop(b, h, W):
            return sparse_block_dot(W, h, iIdx, b, oIdx)

        def op(b, h, W):
            return self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        # Check the gradient of both the user-facing meta op and the
        # underlying gemv op.
        utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode)
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode)

    def test_sparseblockgemv_grad_1(self):
        """
        Test that we correctly handle cases where dimensions are 1.
        """
        h_val = randn(1, 1, 1).astype('float32')
        iIdx_val = numpy.random.permutation(1)[:1][None, :]
        oIdx_val = numpy.random.permutation(1)[:1][None, :]
        W_val = randn(1, 1, 1, 1).astype('float32')
        b_val = randn(1, 1).astype('float32')

        iIdx = theano.tensor.constant(iIdx_val)
        oIdx = theano.tensor.constant(oIdx_val)

        def metaop(b, h, W):
            return sparse_block_dot(W, h, iIdx, b, oIdx)

        def op(b, h, W):
            return self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode)
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode)

    def test_sparseblockgemv_grad_shape(self):
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        go = theano.grad(o.sum(), [b, W, h])

        f = theano.function([W, h, iIdx, b, oIdx], go, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        # just make sure that it runs correctly and all the shapes are ok.
        b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)

        assert b_g.shape == b_val.shape
        assert h_g.shape == h_val.shape
        assert W_g.shape == W_val.shape

    def test_sparseblockouter(self):
        o = tensor.ftensor4()
        x = tensor.ftensor3()
        y = tensor.ftensor3()
        xIdx = tensor.imatrix()
        yIdx = tensor.imatrix()

        out = self.outer_op(o, x, y, xIdx, yIdx)

        # NOTE(review): unlike the other tests, no mode=self.mode is
        # passed here — confirm whether that is intentional.
        f = theano.function([o, x, y, xIdx, yIdx], out,
                            on_unused_input="warn")

        o_val, x_val, y_val, xIdx_val, yIdx_val = \
            BlockSparse_Gemv_and_Outer.outer_data()

        th_out = f(o_val, x_val, y_val, xIdx_val, yIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.outer_numpy(
            o_val, x_val, y_val, xIdx_val, yIdx_val)

        utt.assert_allclose(ref_out, th_out)
...@@ -4,7 +4,7 @@ import numpy ...@@ -4,7 +4,7 @@ import numpy
import theano import theano
from theano import config, function, tensor from theano import config, function, tensor
from . import multinomial from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode, predefined_linkers from theano.compile.mode import get_default_mode, predefined_linkers
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
......
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot
def test_blocksparse_inplace_gemv_opt():
    """
    The SparseBlockGemv node should be replaced by its inplace version,
    except under FAST_COMPILE where inplace optimizations are off.
    """
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    b = tensor.fmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fn = theano.function([W, h, iIdx, b, oIdx], out)

    last_op = fn.maker.fgraph.toposort()[-1].op
    if theano.config.mode == "FAST_COMPILE":
        assert not last_op.inplace
    else:
        assert last_op.inplace
def test_blocksparse_inplace_outer_opt():
    """
    The SparseBlockOuter node (from the gradient w.r.t. W) should be
    replaced by its inplace version, except under FAST_COMPILE where
    inplace optimizations are off.
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    # Build the gradient once and reuse it; the debugprint() call that
    # used to be here was a leftover that spammed test output.
    f = theano.function([W, h, iIdx, b, oIdx],
                        [o, tensor.grad(o.sum(), wrt=W)])

    if theano.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
import theano import theano
import numpy import numpy
from . import scan from theano.sandbox import scan
def test_001(): def test_001():
......
from __future__ import print_function from __future__ import print_function
from .theano_object import * from theano.sandbox.theano_object import *
RUN_TESTS = False RUN_TESTS = False
......
...@@ -98,17 +98,19 @@ whitelist_flake8 = [ ...@@ -98,17 +98,19 @@ whitelist_flake8 = [
"tensor/nnet/tests/test_sigm.py", "tensor/nnet/tests/test_sigm.py",
"scalar/__init__.py", "scalar/__init__.py",
"scalar/tests/test_basic.py", "scalar/tests/test_basic.py",
"sandbox/test_theano_object.py", "sandbox/__init__.py",
"sandbox/test_scan.py",
"sandbox/rng_mrg.py", "sandbox/rng_mrg.py",
"sandbox/theano_object.py", "sandbox/theano_object.py",
"sandbox/scan.py", "sandbox/scan.py",
"sandbox/test_multinomial.py",
"sandbox/test_rng_mrg.py",
"sandbox/test_neighbourhoods.py",
"sandbox/symbolic_module.py", "sandbox/symbolic_module.py",
"sandbox/conv.py", "sandbox/conv.py",
"sandbox/debug.py", "sandbox/debug.py",
"sandbox/tests/test_theano_object.py",
"sandbox/tests/test_scan.py",
"sandbox/tests/test_rng_mrg.py",
"sandbox/tests/test_neighbourhoods.py",
"sandbox/tests/test_multinomial.py",
"sandbox/tests/__init__.py",
"sandbox/cuda/dnn.py", "sandbox/cuda/dnn.py",
"sandbox/cuda/var.py", "sandbox/cuda/var.py",
"sandbox/cuda/GpuConvGrad3D.py", "sandbox/cuda/GpuConvGrad3D.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论