提交 565650e4 authored 作者: abergeron's avatar abergeron

Merge pull request #3311 from bouthilx/sparse_block_dot

Sparse block dot
.. _libdoc_blocksparse:
===========================================================================
:mod:`sandbox.blocksparse` -- Block sparse dot operations (gemv and outer)
===========================================================================
.. module:: sandbox.blocksparse
:platform: Unix, Windows
:synopsis: Block sparse dot
.. moduleauthor:: LISA
API
===
.. automodule:: theano.sandbox.blocksparse
:members:
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors:

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../images/blocksparse.png
        :scale: 50 %
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses and destroys the storage of the
        # first input `o` (declared through destroy_map).
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (batch, oWin, oSize) output vector
        W: (iBlocks, oBlocks, iSize, oSize), weight matrix
        h: (batch, iWin, iSize), input from lower layer (sparse)
        inputIdx: (batch, iWin), indexes of the input blocks
        outputIdx: (batch, oWin), indexes of the output blocks

        returns (batch, oWin, oSize), dot(W[i, j], h[i]) + o[j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed. Which blocks will be computed is specified in
          `outputIdx`.
        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            # BUGFIX: the message used to claim "2D" while the check
            # requires a 3D tensor.
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Output has the same dtype/variable class as `o` but with no
        # broadcastable dimensions.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        # Reference CPU implementation (the GPU version lives in
        # theano.sandbox.cuda.blocksparse).
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            o = o.copy()

        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    # Accumulate one small gemv per (input, output) block.
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW is a block outer product of h and the output gradient; dh is
        # another block gemv with W's block and inner dims transposed.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results:

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses and destroys the storage of the
        # first input `o`.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (xBlocks, yBlocks, xSize, ySize)
        x: (batch, xWin, xSize)
        y: (batch, yWin, ySize)
        xIdx: (batch, iWin), indexes of the x blocks
        yIdx: (batch, oWin), indexes of the y blocks

        returns (xBlocks, yBlocks, xSize, ySize), outer(x[i], y[j]) + o[i, j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)

        if alpha is None:
            # Default scale factor: 1.0.
            alpha = one

        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()
        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # BUGFIX: apply the documented `alpha` scaling (it was
                    # silently ignored before).  `alpha` defaults to 1.0,
                    # so existing callers are unaffected.
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Module-level singleton instances.  Reusing these (rather than
# constructing new ops at each call site) lets graph optimizations match
# nodes by op identity.
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    Parameters
    ----------
    var: shape, comment
    W: (iBlocks, oBlocks, iSize, oSize), weight matrix
    h: (batch, iWin, iSize), input from lower layer (sparse)
    inputIdx: (batch, iWin), indexes of the input blocks
    b: (oBlocks, oSize), bias vector
    outputIdx: (batch, oWin), indexes of the output blocks

    returns (batch, oWin, oSize), dot(W[i, j], h[i]) + b[j]
        but b[j] is only added once

    Notation
    --------
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Promote unbatched inputs to a batch of size 1.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # CONSISTENCY FIX: use the module-level singleton instead of building
    # a fresh SparseBlockGemv() per call, so that identical nodes can be
    # recognized by op identity during optimization.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
import logging
import numpy
import theano
from theano import Apply, tensor, scalar
from theano import Apply, tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp, GpuElemwise
from theano.sandbox.cuda import cuda_available, GpuOp
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')
if cuda_available:
from theano.sandbox.cuda import (basic_ops,
opt, GpuFromHost,
HostFromGpu, host_from_gpu,
GpuDimShuffle)
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
from theano.sandbox.cuda import basic_ops
class SparseBlockGemvSS(GpuOp):
class GpuSparseBlockGemv(GpuOp):
"""
This op computes the dot product of specified pieces of vectors
and matrices, returning pieces of vectors.
It computes something like this for each j:
o[j] = sum_over_i(dot(W[i, j], h[i])) + o[j]
The i and j are taken from the inputIdx and outputIdx lists
respectively.
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
This should not be directly called since the interface is subject
to change without notice. Use the sparse_block_dot_SS() function
for a stable interface.
to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
function for a stable interface.
"""
def __init__(self, inplace=False):
......@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockGemv%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, W, h, inputIdx, outputIdx):
o = basic_ops.as_cuda_ndarray_variable(o)
......@@ -340,12 +331,12 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
o, W, h, inputIdx, outputIdx = inputs
go = grads[0]
Wgrad = sparse_block_outer_ss(W.zeros_like(),
h, go, inputIdx, outputIdx)
hgrad = sparse_block_gemv_ss(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)),
go,
outputIdx, inputIdx)
Wgrad = gpu_sparse_block_outer(W.zeros_like(),
h, go, inputIdx, outputIdx)
hgrad = gpu_sparse_block_gemv(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)),
go,
outputIdx, inputIdx)
return [go, Wgrad, hgrad,
grad_undefined(self, 3, inputIdx,
"grad of inputIdx makes no sense"),
......@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
"grad of outputIdx makes no sense")]
sparse_block_gemv_ss = SparseBlockGemvSS(False)
sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class SparseBlockOuterSS(GpuOp):
class GpuSparseBlockOuter(GpuOp):
"""
This computes the outer product of two sets of pieces of vectors
updating a full matrix with the results.
It computes something like this:
o[i, j] = (alpha * outer(x[i], y[j])) + o[i, j]
The i and j are taken from the xIdx and yIdx lists respectively.
CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
This op should not be called directly since its interface is
subject to change without notice. It is involved in the gradient
of SparseBlockGemvSS.
subject to change without notice. It is involved in the gradient
of GpuSparseBlockGemv. The gradient is not implemented.
"""
def __init__(self, inplace=False):
......@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace)
def __str__(self):
return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "")
return "GpuSparseBlockOuter%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list,
%(name)s_y_list,
%(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
......@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
return (11,)
sparse_block_outer_ss = SparseBlockOuterSS(False)
sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)
if cuda_available:
    # Replace the non-inplace GPU block-sparse ops with their inplace
    # versions.  Registered only when CUDA is available, since the ops
    # themselves are GPU-only.
    @opt.register_opt()
    @opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
    def local_inplace_blocksparse_gemv(node):
        # Singleton comparison: matches only the non-inplace instance.
        if node.op == sparse_block_gemv_ss:
            return [sparse_block_gemv_ss_inplace(*node.inputs)]

    @opt.register_opt()
    @opt.local_optimizer([sparse_block_outer_ss], inplace=True)
    def local_inplace_blocksparse_outer(node):
        if node.op == sparse_block_outer_ss:
            return [sparse_block_outer_ss_inplace(*node.inputs)]
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @opt.register_opt()
# @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
# """
# return [sparse_block_outer_ss(*inputs)]
# @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices.

    Parameters
    ----------
    W : (iBlocks, oBlocks, iSize, oSize)
        Weight matrix.
    h : (batch, iWin, iSize)
        Input from lower layer (sparse).
    inputIdx : (batch, iWin)
        Indexes of the input blocks.
    b : (oBlocks, oSize)
        Bias vector.
    outputIdx : (batch, oWin)
        Indexes of the output blocks.

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j], but b[j] is only added once.

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Unbatched case: add a broadcast batch dimension of size 1.
        h, inputIdx, outputIdx = (h.dimshuffle('x', 0, 1),
                                  inputIdx.dimshuffle('x', 0),
                                  outputIdx.dimshuffle('x', 0))
    # The bias is gathered per output block and used as the accumulator.
    return sparse_block_gemv_ss(b.take(outputIdx, axis=0),
                                W, h, inputIdx, outputIdx)
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
......@@ -220,7 +220,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
# return ()
return (4,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
gpu_crossentropy_softmax_argmax_1hot_with_bias = \
GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
......@@ -391,7 +392,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
}
""" % locals()
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
gpu_crossentropy_softmax_1hot_with_bias_dx = \
GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuOp):
......
差异被折叠。
import numpy
from numpy.random import randn
from unittest import TestCase
from nose.plugins.skip import SkipTest
import theano
from theano import tensor
import theano.tests.unittest_tools as utt
import theano.sandbox.tests.test_blocksparse
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
as_cuda_ndarray_variable)
from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
sparse_block_gemv_ss,
sparse_block_outer_ss,
sparse_block_outer_ss_inplace,
SparseBlockOuterSS)
from theano.sandbox.cuda.blocksparse import (GpuSparseBlockOuter,
gpu_sparse_block_gemv,
gpu_sparse_block_outer)
from theano.sandbox.cuda.var import float32_shared_constructor
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -29,187 +21,56 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def setup():
    # Seed the unittest RNG so the nose-run tests are reproducible.
    utt.seed_rng()
def blocksparse_data():
nInputBlock = 128
nOutputBlock = 64
inputSize = 40
outputSize = 30
inputWindowSize = 7
outputWindowSize = 9
batchSize = 2
input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
permutation = numpy.random.permutation
inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
for _ in range(batchSize))
outputIndice = numpy.vstack(permutation(nOutputBlock)[:outputWindowSize]
for _ in range(batchSize))
weight = randn(nInputBlock, nOutputBlock,
inputSize, outputSize).astype('float32')
bias = randn(nOutputBlock, outputSize).astype('float32')
return weight, input, inputIndice, bias, outputIndice
def blocksparse(W, h, iIdx, b, oIdx):
    """Pure-numpy reference implementation of the block-sparse dot.

    o[n, j] = b[oIdx[n, j]] + sum_i dot(h[n, i], W[iIdx[n, i], oIdx[n, j]])

    Shapes follow SparseBlockGemv.make_node: W (iBlocks, oBlocks, iSize,
    oSize), h (batch, iWin, iSize), iIdx (batch, iWin), b (oBlocks, oSize),
    oIdx (batch, oWin); returns (batch, oWin, oSize).
    """
    o = b.take(oIdx, axis=0)
    # FIX: use a distinct batch-index name; the original reused `b`,
    # shadowing the bias argument inside the loop.
    for n in range(o.shape[0]):
        for j in range(o.shape[1]):
            outputIdx = oIdx[n, j]
            for i in range(h.shape[1]):
                inputIdx = iIdx[n, i]
                w = W[inputIdx, outputIdx]
                # This inner update is a gemv per block pair.
                o[n, j, :] += numpy.dot(h[n, i], w)
    return o
def test_blocksparse():
    """GPU sparse_block_dot_SS must agree with the numpy reference."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
    f = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)

    data = blocksparse_data()
    got = f(*data)
    expected = blocksparse(*data)
    utt.assert_allclose(expected, got)
test_blocksparse.setup = setup
# Test the fortran order for W (which can happen in the grad for some graphs).
def test_blocksparseF():
    """A fortran-ordered (axis-swapped) W must give the same result."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    # Transpose the two inner axes on the GPU so the kernel sees a
    # non-C-contiguous weight layout.
    W_t = GpuDimShuffle((False,) * 4,
                        (0, 1, 3, 2))(as_cuda_ndarray_variable(W))
    out = sparse_block_dot_SS(W_t, h, iIdx, b, oIdx)
    f = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    got = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
    expected = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
    utt.assert_allclose(expected, got)
def test_blocksparse_grad():
    """Numerical gradient check of the gemv op on small random data."""
    h_val = randn(1, 2, 3).astype('float32')
    W_val = randn(3, 3, 3, 4).astype('float32')
    b_val = randn(3, 4).astype('float32')
    iIdx = theano.tensor.constant(
        numpy.random.permutation(3)[:2][None, :])
    oIdx = theano.tensor.constant(
        numpy.random.permutation(3)[:2][None, :])

    utt.verify_grad(
        lambda b, h, W: sparse_block_gemv_ss(b.take(oIdx, axis=0),
                                             W, h, iIdx, oIdx),
        [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_1():
    """Gradient check for the degenerate case where every dim is 1."""
    h_val = randn(1, 1, 1).astype('float32')
    W_val = randn(1, 1, 1, 1).astype('float32')
    b_val = randn(1, 1).astype('float32')
    iIdx = theano.tensor.constant(
        numpy.random.permutation(1)[:1][None, :])
    oIdx = theano.tensor.constant(
        numpy.random.permutation(1)[:1][None, :])

    utt.verify_grad(
        lambda b, h, W: sparse_block_gemv_ss(b.take(oIdx, axis=0),
                                             W, h, iIdx, oIdx),
        [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_shape():
    """Each gradient must come back with the shape of its input."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    grads = theano.grad(out.sum(), [b, W, h])
    f = theano.function([W, h, iIdx, b, oIdx], grads, mode=mode_with_gpu)

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()

    # Just make sure that it runs correctly and all the shapes are ok.
    b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
    for grad_val, in_val in ((b_g, b_val), (W_g, W_val), (h_g, h_val)):
        assert grad_val.shape == in_val.shape
class BlockSparse_Gemv_and_Outer(
        theano.sandbox.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
    # GPU variant of the generic block-sparse test suite: rerun the base
    # tests with the GPU ops substituted in.
    def setUp(self):
        utt.seed_rng()
        # constant_folding is excluded, presumably so the ops stay visible
        # in the compiled graph for the assertions -- TODO confirm.
        self.mode = mode_with_gpu.excluding('constant_folding')
        self.gemv_op = gpu_sparse_block_gemv
        self.outer_op = gpu_sparse_block_outer
# This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge():
b = tensor.fmatrix()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
# This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge(self):
b = tensor.fmatrix()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
W = float32_shared_constructor(W_val)
W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
W = float32_shared_constructor(W_val)
o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
gW = theano.grad(o.sum(), W)
o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
gW = theano.grad(o.sum(), W)
lr = numpy.asarray(0.05, dtype='float32')
lr = numpy.asarray(0.05, dtype='float32')
upd = W - lr * gW
upd = W - lr * gW
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu)
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu)
# Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op, SparseBlockOuterSS)
# Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
GpuSparseBlockOuter)
# Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output')
# Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output')
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
# Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
SparseBlockOuterSS)
# Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
GpuSparseBlockOuter)
f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value()
f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value()
# reset the var
W.set_value(W_val)
f1(h_val, iIdx_val, b_val, oIdx_val)
W_opt = W.get_value()
# reset the var
W.set_value(W_val)
f1(h_val, iIdx_val, b_val, oIdx_val)
W_opt = W.get_value()
utt.assert_allclose(W_ref, W_opt)
utt.assert_allclose(W_ref, W_opt)
......@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
......@@ -740,6 +743,36 @@ def test_local_gpu_dot_to_dot22dot():
cmp((3, 4), (4,))
def test_blocksparse_gpu_gemv_opt():
    """sparse_block_dot must be lifted to GpuSparseBlockGemv on GPU."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    f = theano.function([W, h, iIdx, b, oIdx],
                        sparse_block_dot(W, h, iIdx, b, oIdx),
                        mode=mode_with_gpu)

    # The gemv sits just before the final host transfer in the toposort.
    assert isinstance(f.maker.fgraph.toposort()[-2].op, GpuSparseBlockGemv)
def test_blocksparse_gpu_outer_opt():
    """The W gradient of sparse_block_dot must use GpuSparseBlockOuter."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    f = theano.function([W, h, iIdx, b, oIdx],
                        [out, tensor.grad(out.sum(), wrt=W)],
                        mode=mode_with_gpu)

    # The outer op computing the W gradient sits just before the final
    # host transfer in the toposorted graph.
    assert isinstance(f.maker.fgraph.toposort()[-2].op, GpuSparseBlockOuter)
class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
    # Rerun the nlinalg diag tests on the GPU, using cuda shared
    # variables instead of the default constructor.
    mode = mode_with_gpu
    shared = staticmethod(cuda.shared_constructor)
......@@ -756,4 +789,3 @@ if __name__ == '__main__':
test_opt_gpujoin_onlyajoin()
test_opt_gpujoin_joinvectors_elemwise_then_minusone()
test_opt_gpujoin_joinvectors_negativeaxes()
"""
Optimizations addressing the ops in sandbox root directory
"""
from theano import compile # to register the optimizer built by this file
from theano import gof
from theano.sandbox.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv_inplace,
sparse_block_outer_inplace)
@gof.local_optimizer([SparseBlockGemv], inplace=True)
def local_inplace_sparse_block_gemv(node):
    """
    SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
    """
    op = node.op
    if isinstance(op, SparseBlockGemv) and not op.inplace:
        return [sparse_block_gemv_inplace(*node.inputs)]
    return False


# Priority 60 places this after the main optimizations, with the other
# inplace substitutions.
compile.optdb.register(
    'local_inplace_sparse_block_gemv',
    gof.TopoOptimizer(local_inplace_sparse_block_gemv,
                      failure_callback=gof.TopoOptimizer.warn_inplace),
    60, 'fast_run', 'inplace')
@gof.local_optimizer([SparseBlockOuter], inplace=True)
def local_inplace_sparse_block_outer(node):
    """
    SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
    """
    op = node.op
    if isinstance(op, SparseBlockOuter) and not op.inplace:
        return [sparse_block_outer_inplace(*node.inputs)]
    return False


# Same priority/tags as the gemv inplace substitution above.
compile.optdb.register(
    'local_inplace_sparse_block_outer',
    gof.TopoOptimizer(local_inplace_sparse_block_outer,
                      failure_callback=gof.TopoOptimizer.warn_inplace),
    60, 'fast_run', 'inplace')
差异被折叠。
......@@ -4,7 +4,7 @@ import numpy
import theano
from theano import config, function, tensor
from . import multinomial
from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode, predefined_linkers
import theano.sandbox.cuda as cuda
......
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot
def test_blocksparse_inplace_gemv_opt():
    """The gemv op is replaced by its inplace version except under
    FAST_COMPILE (which skips the inplace optimizations)."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    f = theano.function([W, h, iIdx, b, oIdx],
                        sparse_block_dot(W, h, iIdx, b, oIdx))

    last_op = f.maker.fgraph.toposort()[-1].op
    if theano.config.mode == "FAST_COMPILE":
        assert not last_op.inplace
    else:
        assert last_op.inplace
def test_blocksparse_inplace_outer_opt():
    """The outer op produced by the W gradient must be made inplace
    under the default mode (but not under FAST_COMPILE)."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    # FIX: removed a leftover theano.printing.debugprint() call that
    # spammed the test output and served no assertion.
    f = theano.function([W, h, iIdx, b, oIdx],
                        [o, tensor.grad(o.sum(), wrt=W)])

    if theano.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
import theano
import numpy
from . import scan
from theano.sandbox import scan
def test_001():
......
from __future__ import print_function
from .theano_object import *
from theano.sandbox.theano_object import *
RUN_TESTS = False
......
......@@ -98,17 +98,19 @@ whitelist_flake8 = [
"tensor/nnet/tests/test_sigm.py",
"scalar/__init__.py",
"scalar/tests/test_basic.py",
"sandbox/test_theano_object.py",
"sandbox/test_scan.py",
"sandbox/__init__.py",
"sandbox/rng_mrg.py",
"sandbox/theano_object.py",
"sandbox/scan.py",
"sandbox/test_multinomial.py",
"sandbox/test_rng_mrg.py",
"sandbox/test_neighbourhoods.py",
"sandbox/symbolic_module.py",
"sandbox/conv.py",
"sandbox/debug.py",
"sandbox/tests/test_theano_object.py",
"sandbox/tests/test_scan.py",
"sandbox/tests/test_rng_mrg.py",
"sandbox/tests/test_neighbourhoods.py",
"sandbox/tests/test_multinomial.py",
"sandbox/tests/__init__.py",
"sandbox/cuda/dnn.py",
"sandbox/cuda/var.py",
"sandbox/cuda/GpuConvGrad3D.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论