提交 ed4e0095 authored 作者: Alexandre de Brebisson's avatar Alexandre de Brebisson 提交者: Xavier Bouthillier

Add a meta op BlockSparseDot

Conflicts: theano/sandbox/cuda/blocksparse.py
上级 5e536853
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors:

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    .. image:: ../../images/blocksparse.png
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, input 0 (o) is destroyed and recycled as the
        # output buffer; destroy_map tells Theano about that aliasing.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (batch, oWin, oSize) output vector
        W: (iBlocks, oBlocks, iSize, oSize), weight matrix
        h: (batch, iWin, iSize), input from lower layer (sparse)
        inputIdx: (batch, iWin), indexes of the input blocks
        outputIdx: (batch, oWin), indexes of the output blocks

        returns (batch, oWin, oSize), dot(W[i, j], h[i]) + o[j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed. Which blocks will be computed is specified in
          `outputIdx`.
        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            # Bug fix: the message used to say "2D" although the check
            # requires a 3D (batch, oWin, oSize) tensor.
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Output keeps o's dtype but drops all broadcastable dimensions.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        # This abstract op must be replaced by a device-specific version
        # (e.g. CpuSparseBlockGemv) during graph optimization.
        raise NotImplementedError('Optimization of SparseBlockGemv failed.')

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW accumulates outer(h, go) at the used block positions; dh is a
        # gemv against W with both block axes and both inner axes swapped.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    Accumulate outer products of pairs of vector pieces into a full block
    matrix:

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # The inplace variant destroys input 0 (o) and reuses it as output.
        self.inplace = inplace
        if inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Build the Apply node for the sparse block outer product.

        Parameters
        ----------
        var: shape, comment
        o: (xBlocks, yBlocks, xSize, ySize)
        x: (batch, xWin, xSize)
        y: (batch, yWin, ySize)
        xIdx: (batch, iWin), indexes of the x blocks
        yIdx: (batch, oWin), indexes of the y blocks

        returns (xBlocks, yBlocks, xSize, ySize), outer(x[i], y[j]) + o[i, j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which
          blocks will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        if alpha is None:
            # Default scaling factor is a float32 1.0 constant.
            alpha = one
        out_var = o.type.__class__(dtype=o.type.dtype,
                                   broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [out_var])

    def perform(self, node, inp, out_):
        # Abstract op: a device-specific replacement must be substituted in.
        raise NotImplementedError('Optimization of SparseBlockOuter failed.')

    def grad(self, inputs, output_gradients):
        raise NotImplementedError("SparseBlockOuter has no gradient "
                                  "implemented")
class CpuSparseBlockGemv(SparseBlockGemv):
    """
    CPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for
    more information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """

    def perform(self, node, inp, out_):
        o, W, h, iIdx, oIdx = inp[:5]
        if not self.inplace:
            # Work on a copy so the caller's buffer is left untouched.
            o = o.copy()
        # For every (example, output block) pair, accumulate the
        # vector-matrix product of each used input block.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                out_block = oIdx[b, j]
                for i in range(h.shape[1]):
                    in_block = iIdx[b, i]
                    o[b, j, :] += numpy.dot(h[b, i], W[in_block, out_block])
        out_[0][0] = o
class CpuSparseBlockOuter(SparseBlockOuter):
    """
    CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for
    more information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]
        if not self.inplace:
            # Work on a copy so the caller's buffer is left untouched.
            o = o.copy()
        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Bug fix: `alpha` was unpacked but never applied,
                    # contradicting SparseBlockOuter's documented
                    # `alpha * outer(...)` semantics. The default alpha is
                    # 1.0, so the common (gradient) case is unchanged.
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Pre-instantiated singletons of each op, in non-inplace and inplace form.
# The non-inplace variants are the safe defaults; the inplace ones destroy
# their first input (see destroy_map in __init__).
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
# CPU (numpy) implementations of the same ops.
cpu_sparse_block_gemv = CpuSparseBlockGemv(False)
cpu_sparse_block_gemv_inplace = CpuSparseBlockGemv(True)
cpu_sparse_block_outer = CpuSparseBlockOuter(False)
cpu_sparse_block_outer_inplace = CpuSparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx, inplace=False):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    Parameters
    ----------
    var: shape, comment
    W: (iBlocks, oBlocks, iSize, oSize), weight matrix
    h: (batch, iWin, iSize), input from lower layer (sparse)
    inputIdx: (batch, iWin), indexes of the input blocks
    b: (oBlocks, oSize), bias vector
    outputIdx: (batch, oWin), indexes of the output blocks

    returns (batch, oWin, oSize), dot(W[i, j], h[i]) + b[j]
        but b[j] is only added once

    Notation
    --------
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower
      layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which
      blocks will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Single example: prepend a broadcastable batch axis of size one.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # Seed the output with the bias rows of the selected output blocks;
    # the gemv op then accumulates the block dot products on top of them.
    seeded = b.take(outputIdx, axis=0)
    return SparseBlockGemv(inplace)(seeded, W, h, inputIdx, outputIdx)
import logging
import numpy
import theano
from theano import Apply, tensor, scalar
from theano import Apply, tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp, GpuElemwise
from theano.sandbox.cuda import cuda_available, GpuOp
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')
if cuda_available:
from theano.sandbox.cuda import (basic_ops,
opt, GpuFromHost,
HostFromGpu, host_from_gpu,
GpuDimShuffle)
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
from theano.sandbox.cuda import basic_ops
class SparseBlockGemvSS(GpuOp):
class GpuSparseBlockGemv(GpuOp):
"""
This op computes the dot product of specified pieces of vectors
and matrices, returning pieces of vectors.
It computes something like this for each j:
o[j] = sum_over_i(dot(W[i, j], h[i])) + o[j]
The i and j are taken from the inputIdx and outputIdx lists
respectively.
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
This should not be directly called since the interface is subject
to change without notice. Use the sparse_block_dot_SS() function
for a stable interface.
to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
function for a stable interface.
"""
def __init__(self, inplace=False):
......@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockGemv%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, W, h, inputIdx, outputIdx):
o = basic_ops.as_cuda_ndarray_variable(o)
......@@ -340,9 +331,9 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
o, W, h, inputIdx, outputIdx = inputs
go = grads[0]
Wgrad = sparse_block_outer_ss(W.zeros_like(),
Wgrad = gpu_sparse_block_outer(W.zeros_like(),
h, go, inputIdx, outputIdx)
hgrad = sparse_block_gemv_ss(h.zeros_like(),
hgrad = gpu_sparse_block_gemv(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)),
go,
outputIdx, inputIdx)
......@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
"grad of outputIdx makes no sense")]
sparse_block_gemv_ss = SparseBlockGemvSS(False)
sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class SparseBlockOuterSS(GpuOp):
class GpuSparseBlockOuter(GpuOp):
"""
This computes the outer product of two sets of pieces of vectors
updating a full matrix with the results.
It computes something like this:
o[i, j] = (alpha * outer(x[i], y[j])) + o[i, j]
The i and j are taken from the xIdx and yIdx lists respectively.
CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
This op should not be called directly since its interface is
subject to change without notice. It is involved in the gradient
of SparseBlockGemvSS.
of GpuSparseBlockGemv. The gradient is not implemented.
"""
def __init__(self, inplace=False):
......@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockOuter%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list,
%(name)s_y_list,
%(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
......@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
return (11,)
sparse_block_outer_ss = SparseBlockOuterSS(False)
sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)
if cuda_available:
    @opt.register_opt()
    @opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
    def local_inplace_blocksparse_gemv(node):
        # Substitute the destructive variant of the gemv op; the inplace
        # optimizer framework guarantees the substitution is safe.
        if node.op == sparse_block_gemv_ss:
            return [sparse_block_gemv_ss_inplace(*node.inputs)]

    @opt.register_opt()
    @opt.local_optimizer([sparse_block_outer_ss], inplace=True)
    def local_inplace_blocksparse_outer(node):
        # Same inplace substitution for the outer-product op.
        if node.op == sparse_block_outer_ss:
            return [sparse_block_outer_ss_inplace(*node.inputs)]

    # XXX: these optimisations were badly broken and now require a working
    # beta param (could only be a 0/1 thing for outer_merge, but
    # alpha_merge needs the full range).
    # @opt.register_opt()
    # @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
    # def local_merge_blocksparse_alpha(node, *inputs):
    #     """
    #     GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
    #     """
    #     return [sparse_block_outer_ss(*inputs)]

    # @opt.register_opt()
    # @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
    # def local_merge_blocksparse_output(node, *inputs):
    #     return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices.

    Parameters
    ----------
    W : (iBlocks, oBlocks, iSize, oSize)
        Weight matrix.
    h : (batch, iWin, iSize)
        Input from lower layer (sparse).
    inputIdx : (batch, iWin)
        Indexes of the input blocks.
    b : (oBlocks, oSize)
        Bias vector.
    outputIdx : (batch, oWin)
        Indexes of the output blocks.

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j], but b[j] is only added once.

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Single example: promote inputs to a batch of one so the GPU op
        # always receives batched (3D h / 2D index) arguments.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # b.take(outputIdx, axis=0) seeds the output with the bias of each
    # selected output block; the op accumulates the dot products onto it.
    return sparse_block_gemv_ss(b.take(outputIdx, axis=0), W, h,
                                inputIdx, outputIdx)
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论