Commit ed4e0095
Authored Jul 03, 2015 by Alexandre de Brebisson; committed by Xavier Bouthillier on Aug 21, 2015
Add a meta op BlockSparseDot
Conflicts: theano/sandbox/cuda/blocksparse.py
Parent: 5e536853

Showing 2 changed files with 294 additions and 120 deletions:
theano/sandbox/blocksparse.py        +266  -0
theano/sandbox/cuda/blocksparse.py   +28   -120
theano/sandbox/blocksparse.py (new file, 0 → 100644)
import numpy

import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined


class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors:

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    .. image:: ../../images/blocksparse.png
    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (batch, oWin, oSize) output vector
        W: (iBlocks, oBlocks, iSize, oSize), weight matrix
        h: (batch, iWin, iSize), input from lower layer (sparse)
        inputIdx: (batch, iWin), indexes of the input blocks
        outputIdx: (batch, oWin), indexes of the output blocks

        returns (batch, oWin, oSize), dot(W[i, j], h[i]) + o[j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be computed.
          Which blocks will be computed is specified in `outputIdx`.
        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        raise NotImplementedError('Optimization of SparseBlockGemv failed.')

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This op computes the outer product of two sets of pieces of vectors,
    updating a full matrix with the results:

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """
    registered_opts = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the outer product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (xBlocks, yBlocks, xSize, ySize)
        x: (batch, xWin, xSize)
        y: (batch, yWin, ySize)
        xIdx: (batch, iWin), indexes of the x blocks
        yIdx: (batch, oWin), indexes of the y blocks

        returns (xBlocks, yBlocks, xSize, ySize), outer(x[i], y[j]) + o[i, j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)

        if alpha is None:
            alpha = one

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [output])

    def perform(self, node, inp, out_):
        raise NotImplementedError('Optimization of SparseBlockOuter failed.')

    def grad(self, inputs, output_gradients):
        raise NotImplementedError("SparseBlockOuter has no gradient "
                                  "implemented")
class CpuSparseBlockGemv(SparseBlockGemv):
    """
    CPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """
    def perform(self, node, inp, out_):
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            o = o.copy()

        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o
class CpuSparseBlockOuter(SparseBlockOuter):
    """
    CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """
    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]
        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    o[xIdx[b, i], yIdx[b, j]] += numpy.outer(x[b, i],
                                                             y[b, j, :])
        out_[0][0] = o
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
cpu_sparse_block_gemv = CpuSparseBlockGemv(False)
cpu_sparse_block_gemv_inplace = CpuSparseBlockGemv(True)
cpu_sparse_block_outer = CpuSparseBlockOuter(False)
cpu_sparse_block_outer_inplace = CpuSparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx, inplace=False):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    Parameters
    ----------
    var: shape, comment
    W: (iBlocks, oBlocks, iSize, oSize), weight matrix
    h: (batch, iWin, iSize), input from lower layer (sparse)
    inputIdx: (batch, iWin), indexes of the input blocks
    b: (oBlocks, oSize), bias vector
    outputIdx: (batch, oWin), indexes of the output blocks

    returns (batch, oWin, oSize), dot(W[i, j], h[i]) + b[j]
        but b[j] is only added once

    Notation
    --------
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    return SparseBlockGemv(inplace)(b.take(outputIdx, axis=0), W, h,
                                    inputIdx, outputIdx)
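
To make the indexing concrete, here is a minimal NumPy sketch of the semantics documented above. The shapes and data are toy values invented for illustration; it checks the docstring's reference loop (with the bias gathered once per selected output block, as b.take(outputIdx, axis=0) does) against an equivalent dense einsum over the gathered weight blocks.

import numpy

# Toy dimensions; every name here is made up for illustration.
batch, iBlocks, iSize, oBlocks, oSize, iWin, oWin = 2, 3, 4, 5, 6, 2, 3

rng = numpy.random.RandomState(0)
W = rng.randn(iBlocks, oBlocks, iSize, oSize)
h = rng.randn(batch, iWin, iSize)
bias = rng.randn(oBlocks, oSize)
inputIdx = rng.randint(0, iBlocks, size=(batch, iWin))
outputIdx = rng.randint(0, oBlocks, size=(batch, oWin))

# Reference loop from the SparseBlockGemv docstring, starting from the
# gathered bias, which is added exactly once per selected output block.
o = bias[outputIdx].copy()               # (batch, oWin, oSize)
for n in range(batch):
    for j in range(oWin):
        for i in range(iWin):
            o[n, j] += numpy.dot(h[n, i], W[inputIdx[n, i], outputIdx[n, j]])

# The same computation as one einsum over the gathered weight blocks.
Wsel = W[inputIdx[:, :, None], outputIdx[:, None, :]]  # (batch, iWin, oWin, iSize, oSize)
o2 = bias[outputIdx] + numpy.einsum('nis,nijst->njt', h, Wsel)
assert numpy.allclose(o, o2)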
theano/sandbox/cuda/blocksparse.py
-import logging
-
 import numpy
 
-import theano
-from theano import Apply, tensor
+from theano import Apply, tensor, scalar
 from theano.tensor import discrete_dtypes
 from theano.gradient import grad_undefined
-from theano.sandbox.cuda import cuda_available, GpuOp, GpuElemwise
+from theano.sandbox.cuda import cuda_available, GpuOp
 
-_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')
 
 if cuda_available:
-    from theano.sandbox.cuda import (basic_ops, opt, GpuFromHost, HostFromGpu,
-                                     host_from_gpu, GpuDimShuffle)
-    from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
+    from theano.sandbox.cuda import basic_ops
 
 
-class SparseBlockGemvSS(GpuOp):
+class GpuSparseBlockGemv(GpuOp):
     """
-    This op computes the dot product of specified pieces of vectors
-    and matrices, returning pieces of vectors.
-
-    It computes something like this for each j:
-        o[j] = sum_over_i(dot(W[i, j], h[i])) + o[j]
-
-    The i and j are taken from the inputIdx and outputIdx lists
-    respectively.
+    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
+    information.
 
     This should not be directly called since the interface is subject
-    to change without notice. Use the sparse_block_dot_SS() function
-    for a stable interface.
+    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
+    function for a stable interface.
     """
 
     def __init__(self, inplace=False):
@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp):
         return hash(type(self)) ^ hash(self.inplace)
 
     def __str__(self):
-        return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")
+        return "GpuSparseBlockGemv%s" % ("{inplace}" if self.inplace else "")
 
     def make_node(self, o, W, h, inputIdx, outputIdx):
         o = basic_ops.as_cuda_ndarray_variable(o)
@@ -340,9 +331,9 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
         o, W, h, inputIdx, outputIdx = inputs
         go = grads[0]
 
-        Wgrad = sparse_block_outer_ss(W.zeros_like(),
-                                      h, go, inputIdx, outputIdx)
-        hgrad = sparse_block_gemv_ss(h.zeros_like(),
-                                     W.dimshuffle((1, 0, 3, 2)),
-                                     go,
-                                     outputIdx, inputIdx)
+        Wgrad = gpu_sparse_block_outer(W.zeros_like(),
+                                       h, go, inputIdx, outputIdx)
+        hgrad = gpu_sparse_block_gemv(h.zeros_like(),
+                                      W.dimshuffle((1, 0, 3, 2)),
+                                      go,
+                                      outputIdx, inputIdx)
@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
                                "grad of outputIdx makes no sense")]
 
 
-sparse_block_gemv_ss = SparseBlockGemvSS(False)
-sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)
+gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
+gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
 
 
-class SparseBlockOuterSS(GpuOp):
+class GpuSparseBlockOuter(GpuOp):
     """
-    This computes the outer product of two sets of pieces of vectors
-    updating a full matrix with the results.
-
-    It computes something like this:
-        o[i, j] = (alpha * outer(x[i], y[j])) + o[i, j]
-
-    The i and j are taken from the xIdx and yIdx lists respectively.
+    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
+    information.
 
     This op should not be called directly since its interface is
     subject to change without notice. It is involved in the gradient
-    of SparseBlockGemvSS.
+    of GpuSparseBlockGemv. The gradient is not implemented.
     """
 
     def __init__(self, inplace=False):
@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp):
         return hash(type(self)) ^ hash(self.inplace)
 
     def __str__(self):
-        return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "")
+        return "GpuSparseBlockOuter%s" % ("{inplace}" if self.inplace else "")
 
     def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
         one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
             %(name)s_x_list,
             %(name)s_y_list,
             %(name)s_out_list,
-            CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
-            CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1],
+            CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
+            CudaNdarray_HOST_STRIDES(%(x)s)[1],
+            CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
+            CudaNdarray_HOST_STRIDES(%(y)s)[1],
             CudaNdarray_DEV_DATA(%(out)s),
             CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
             %(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
         return (11,)
 
 
-sparse_block_outer_ss = SparseBlockOuterSS(False)
-sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)
+gpu_sparse_block_outer = GpuSparseBlockOuter(False)
+gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
-
-if cuda_available:
-    @opt.register_opt()
-    @opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
-    def local_inplace_blocksparse_gemv(node):
-        if node.op == sparse_block_gemv_ss:
-            return [sparse_block_gemv_ss_inplace(*node.inputs)]
-
-    @opt.register_opt()
-    @opt.local_optimizer([sparse_block_outer_ss], inplace=True)
-    def local_inplace_blocksparse_outer(node):
-        if node.op == sparse_block_outer_ss:
-            return [sparse_block_outer_ss_inplace(*node.inputs)]
-
-    # XXX: these optimisations were badly broken and now require a working
-    # beta param (could only be a 0/1 thing for outer_merge, but
-    # alpha_merge needs the full range).
-
-    # @opt.register_opt()
-    # @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
-    # def local_merge_blocksparse_alpha(node, *inputs):
-    #     """
-    #     GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
-    #     """
-    #     return [sparse_block_outer_ss(*inputs)]
-
-    # @opt.register_opt()
-    # @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
-    # def local_merge_blocksparse_output(node, *inputs):
-    #     return [sparse_block_outer_ss(*inputs)]
-
-
-def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
-    """
-    Compute the dot product (plus bias) of the specified pieces of vectors
-    and matrices.
-
-    Parameters
-    ----------
-    W : (iBlocks, oBlocks, iSize, oSize)
-        Weight matrix.
-    h : (batch, iWin, iSize)
-        Input from lower layer (sparse).
-    inputIdx : (batch, iWin)
-        Indexes of the input blocks.
-    b : (oBlocks, oSize)
-        Bias vector.
-    outputIdx : (batch, oWin)
-        Indexes of the output blocks.
-
-    Returns
-    -------
-    (batch, oWin, oSize)
-        dot(W[i, j], h[i]) + b[j], but b[j] is only added once.
-
-    Notes
-    -----
-    - `batch` is the number of examples in a minibatch (batch size).
-    - `iBlocks` is the total number of blocks in the input (from lower layer).
-    - `iSize` is the size of each of these input blocks.
-    - `iWin` is the number of blocks that will be used as inputs. Which blocks
-      will be used is specified in `inputIdx`.
-    - `oBlocks` is the number of possible output blocks.
-    - `oSize` is the size of each of these output blocks.
-    - `oWin` is the number of output blocks that will actually be computed.
-      Which blocks will be computed is specified in `outputIdx`.
-    """
-    assert inputIdx.ndim == h.ndim - 1
-    assert outputIdx.ndim == inputIdx.ndim
-    if h.ndim == 2:
-        h = h.dimshuffle('x', 0, 1)
-        inputIdx = inputIdx.dimshuffle('x', 0)
-        outputIdx = outputIdx.dimshuffle('x', 0)
-    return sparse_block_gemv_ss(b.take(outputIdx, axis=0), W, h,
-                                inputIdx, outputIdx)
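
A note on the design: the "meta op" in the commit title means the base SparseBlockGemv never executes on its own; its perform() raises NotImplementedError, and a graph optimization is expected to rewrite the node into the CPU or GPU instance. As an illustration only, such a rewrite could mirror the @opt.register_opt()/@opt.local_optimizer pattern of the local_inplace_blocksparse_gemv optimizer removed above. The registration below is not part of this commit, the function name is hypothetical, and a real GPU rewrite would also have to insert host/GPU transfers around the node.

from theano.sandbox.cuda import opt
from theano.sandbox.blocksparse import sparse_block_gemv
from theano.sandbox.cuda.blocksparse import gpu_sparse_block_gemv

# Hypothetical sketch: substitute the abstract meta op with the GPU
# instance, so the node no longer hits the NotImplementedError perform().
@opt.register_opt()
@opt.local_optimizer([sparse_block_gemv])
def local_gpu_sparse_block_gemv(node):
    if node.op == sparse_block_gemv:
        return [gpu_sparse_block_gemv(*node.inputs)]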