提交 ed4e0095 authored 作者: Alexandre de Brebisson's avatar Alexandre de Brebisson 提交者: Xavier Bouthillier

Add a meta op BlockSparseDot

Conflicts: theano/sandbox/cuda/blocksparse.py
上级 5e536853
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors:

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    .. image:: ../../images/blocksparse.png
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, input 0 (o) is destroyed and recycled as the
        # output buffer; destroy_map tells Theano about that aliasing.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (batch, oWin, oSize) output vector
        W: (iBlocks, oBlocks, iSize, oSize), weight matrix
        h: (batch, iWin, iSize), input from lower layer (sparse)
        inputIdx: (batch, iWin), indexes of the input blocks
        outputIdx: (batch, oWin), indexes of the output blocks

        returns (batch, oWin, oSize), dot(W[i, j], h[i]) + o[j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed. Which blocks will be computed is specified in
          `outputIdx`.
        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            # Bug fix: the message used to say "2D" although the check
            # requires a 3D (batch, oWin, oSize) tensor.
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Output keeps o's dtype but drops all broadcastable dimensions.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        # This abstract op must be replaced by a device-specific version
        # (e.g. CpuSparseBlockGemv) during graph optimization.
        raise NotImplementedError('Optimization of SparseBlockGemv failed.')

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW accumulates outer(h, go) at the used block positions; dh is a
        # gemv against W with both block axes and both inner axes swapped.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    Accumulate outer products of pairs of vector pieces into a full block
    matrix:

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # The inplace variant destroys input 0 (o) and reuses it as output.
        self.inplace = inplace
        if inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Build the Apply node for the sparse block outer product.

        Parameters
        ----------
        var: shape, comment
        o: (xBlocks, yBlocks, xSize, ySize)
        x: (batch, xWin, xSize)
        y: (batch, yWin, ySize)
        xIdx: (batch, iWin), indexes of the x blocks
        yIdx: (batch, oWin), indexes of the y blocks

        returns (xBlocks, yBlocks, xSize, ySize), outer(x[i], y[j]) + o[i, j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which
          blocks will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        if alpha is None:
            # Default scaling factor is a float32 1.0 constant.
            alpha = one
        out_var = o.type.__class__(dtype=o.type.dtype,
                                   broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [out_var])

    def perform(self, node, inp, out_):
        # Abstract op: a device-specific replacement must be substituted in.
        raise NotImplementedError('Optimization of SparseBlockOuter failed.')

    def grad(self, inputs, output_gradients):
        raise NotImplementedError("SparseBlockOuter has no gradient "
                                  "implemented")
class CpuSparseBlockGemv(SparseBlockGemv):
    """
    CPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for
    more information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """

    def perform(self, node, inp, out_):
        o, W, h, iIdx, oIdx = inp[:5]
        if not self.inplace:
            # Work on a copy so the caller's buffer is left untouched.
            o = o.copy()
        # For every (example, output block) pair, accumulate the
        # vector-matrix product of each used input block.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                out_block = oIdx[b, j]
                for i in range(h.shape[1]):
                    in_block = iIdx[b, i]
                    o[b, j, :] += numpy.dot(h[b, i], W[in_block, out_block])
        out_[0][0] = o
class CpuSparseBlockOuter(SparseBlockOuter):
    """
    CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for
    more information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]
        if not self.inplace:
            # Work on a copy so the caller's buffer is left untouched.
            o = o.copy()
        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Bug fix: `alpha` was unpacked but never applied,
                    # contradicting SparseBlockOuter's documented
                    # `alpha * outer(...)` semantics. The default alpha is
                    # 1.0, so the common (gradient) case is unchanged.
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Pre-instantiated singletons of each op, in non-inplace and inplace form.
# The non-inplace variants are the safe defaults; the inplace ones destroy
# their first input (see destroy_map in __init__).
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
# CPU (numpy) implementations of the same ops.
cpu_sparse_block_gemv = CpuSparseBlockGemv(False)
cpu_sparse_block_gemv_inplace = CpuSparseBlockGemv(True)
cpu_sparse_block_outer = CpuSparseBlockOuter(False)
cpu_sparse_block_outer_inplace = CpuSparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx, inplace=False):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    Parameters
    ----------
    var: shape, comment
    W: (iBlocks, oBlocks, iSize, oSize), weight matrix
    h: (batch, iWin, iSize), input from lower layer (sparse)
    inputIdx: (batch, iWin), indexes of the input blocks
    b: (oBlocks, oSize), bias vector
    outputIdx: (batch, oWin), indexes of the output blocks

    returns (batch, oWin, oSize), dot(W[i, j], h[i]) + b[j]
        but b[j] is only added once

    Notation
    --------
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower
      layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which
      blocks will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Single example: prepend a broadcastable batch axis of size one.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # Seed the output with the bias rows of the selected output blocks;
    # the gemv op then accumulates the block dot products on top of them.
    seeded = b.take(outputIdx, axis=0)
    return SparseBlockGemv(inplace)(seeded, W, h, inputIdx, outputIdx)
import logging
import numpy
import theano
from theano import Apply, tensor, scalar
from theano import Apply, tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp, GpuElemwise
from theano.sandbox.cuda import cuda_available, GpuOp
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')
if cuda_available:
from theano.sandbox.cuda import (basic_ops,
opt, GpuFromHost,
HostFromGpu, host_from_gpu,
GpuDimShuffle)
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
from theano.sandbox.cuda import basic_ops
class SparseBlockGemvSS(GpuOp):
class GpuSparseBlockGemv(GpuOp):
"""
This op computes the dot product of specified pieces of vectors
and matrices, returning pieces of vectors.
It computes something like this for each j:
o[j] = sum_over_i(dot(W[i, j], h[i])) + o[j]
The i and j are taken from the inputIdx and outputIdx lists
respectively.
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
This should not be directly called since the interface is subject
to change without notice. Use the sparse_block_dot_SS() function
for a stable interface.
to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
function for a stable interface.
"""
def __init__(self, inplace=False):
......@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockGemv%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, W, h, inputIdx, outputIdx):
o = basic_ops.as_cuda_ndarray_variable(o)
......@@ -340,9 +331,9 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
o, W, h, inputIdx, outputIdx = inputs
go = grads[0]
Wgrad = sparse_block_outer_ss(W.zeros_like(),
Wgrad = gpu_sparse_block_outer(W.zeros_like(),
h, go, inputIdx, outputIdx)
hgrad = sparse_block_gemv_ss(h.zeros_like(),
hgrad = gpu_sparse_block_gemv(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)),
go,
outputIdx, inputIdx)
......@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
"grad of outputIdx makes no sense")]
sparse_block_gemv_ss = SparseBlockGemvSS(False)
sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class SparseBlockOuterSS(GpuOp):
class GpuSparseBlockOuter(GpuOp):
"""
This computes the outer product of two sets of pieces of vectors
updating a full matrix with the results.
It computes something like this:
o[i, j] = (alpha * outer(x[i], y[j])) + o[i, j]
The i and j are taken from the xIdx and yIdx lists respectively.
CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
This op should not be called directly since its interface is
subject to change without notice. It is involved in the gradient
of SparseBlockGemvSS.
of GpuSparseBlockGemv. The gradient is not implemented.
"""
def __init__(self, inplace=False):
......@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockOuter%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list,
%(name)s_y_list,
%(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
......@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
return (11,)
sparse_block_outer_ss = SparseBlockOuterSS(False)
sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)
if cuda_available:
    @opt.register_opt()
    @opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
    def local_inplace_blocksparse_gemv(node):
        # Substitute the destructive variant of the gemv op; the inplace
        # optimizer framework guarantees the substitution is safe.
        if node.op == sparse_block_gemv_ss:
            return [sparse_block_gemv_ss_inplace(*node.inputs)]

    @opt.register_opt()
    @opt.local_optimizer([sparse_block_outer_ss], inplace=True)
    def local_inplace_blocksparse_outer(node):
        # Same inplace substitution for the outer-product op.
        if node.op == sparse_block_outer_ss:
            return [sparse_block_outer_ss_inplace(*node.inputs)]

    # XXX: these optimisations were badly broken and now require a working
    # beta param (could only be a 0/1 thing for outer_merge, but
    # alpha_merge needs the full range).
    # @opt.register_opt()
    # @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
    # def local_merge_blocksparse_alpha(node, *inputs):
    #     """
    #     GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
    #     """
    #     return [sparse_block_outer_ss(*inputs)]

    # @opt.register_opt()
    # @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
    # def local_merge_blocksparse_output(node, *inputs):
    #     return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices.

    Parameters
    ----------
    W : (iBlocks, oBlocks, iSize, oSize)
        Weight matrix.
    h : (batch, iWin, iSize)
        Input from lower layer (sparse).
    inputIdx : (batch, iWin)
        Indexes of the input blocks.
    b : (oBlocks, oSize)
        Bias vector.
    outputIdx : (batch, oWin)
        Indexes of the output blocks.

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j], but b[j] is only added once.

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Single example: promote inputs to a batch of one so the GPU op
        # always receives batched (3D h / 2D index) arguments.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # b.take(outputIdx, axis=0) seeds the output with the bias of each
    # selected output block; the op accumulates the dot products onto it.
    return sparse_block_gemv_ss(b.take(outputIdx, axis=0), W, h,
                                inputIdx, outputIdx)
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论