提交 b192515a authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Thomas Wiecki

Move theano.tensor.basic.batched_*dot functions to theano.tensor.blas

Because tests were moved, the RNG for subsequent tests was altered, which required a change to `utt.assert_allclose` in `tests.tensor.test_basic` for more reasonable `rtol` settings.
上级 ef854897
...@@ -22,7 +22,7 @@ from theano.gpuarray.blas import ( ...@@ -22,7 +22,7 @@ from theano.gpuarray.blas import (
gpuger_inplace, gpuger_inplace,
gpuger_no_inplace, gpuger_no_inplace,
) )
from theano.tensor.blas import _dot22, batched_dot, gemm_inplace, gemv, gemv_inplace from theano.tensor.blas import BatchedDot, _dot22, gemm_inplace, gemv, gemv_inplace
from theano.tensor.type import matrix, tensor, tensor3, vector from theano.tensor.type import matrix, tensor, tensor3, vector
...@@ -186,7 +186,7 @@ gemm_batched_tests["float64"] = [ ...@@ -186,7 +186,7 @@ gemm_batched_tests["float64"] = [
TestGpuGemmBatch = makeTester( TestGpuGemmBatch = makeTester(
"GpuGemmBatchTester", "GpuGemmBatchTester",
op=lambda z, alpha, x, y, beta: alpha * batched_dot(x, y) + beta * z, op=lambda z, alpha, x, y, beta: alpha * BatchedDot()(x, y) + beta * z,
gpu_op=gpugemmbatch_inplace, gpu_op=gpugemmbatch_inplace,
cases=gemm_batched_tests, cases=gemm_batched_tests,
) )
......
...@@ -8,7 +8,7 @@ from tempfile import mkstemp ...@@ -8,7 +8,7 @@ from tempfile import mkstemp
import numpy as np import numpy as np
import pytest import pytest
from numpy.testing import assert_allclose, assert_almost_equal, assert_array_equal from numpy.testing import assert_almost_equal, assert_array_equal
import theano import theano
import theano.scalar as ts import theano.scalar as ts
...@@ -113,7 +113,6 @@ from theano.tensor.basic import ( ...@@ -113,7 +113,6 @@ from theano.tensor.basic import (
argmax, argmax,
argmin, argmin,
as_tensor_variable, as_tensor_variable,
batched_dot,
cast, cast,
choose, choose,
clip, clip,
...@@ -743,67 +742,6 @@ TestDenseDot = makeTester( ...@@ -743,67 +742,6 @@ TestDenseDot = makeTester(
bad_runtime=dict(bad1=(rand(5, 7), rand(5, 7)), bad2=(rand(5, 7), rand(8, 3))), bad_runtime=dict(bad1=(rand(5, 7), rand(5, 7)), bad2=(rand(5, 7), rand(8, 3))),
) )
# Parametrized tester for ``batched_dot``: covers every supported input rank
# (vectors through 4-tensors), mixed dtypes, and the build-time/run-time
# failure modes.  The ``rand(...)`` calls must stay in this exact order so the
# test RNG stream is unchanged.
TestBatchedDot = makeTester(
    name="BatchedDotTester",
    op=batched_dot,
    # Reference implementation: a per-batch ``np.dot`` (plain elementwise
    # product when either batch entry is a scalar), upcast to the common dtype.
    expected=(
        lambda a_batch, b_batch: np.asarray(
            [
                u * v if u.ndim == 0 or v.ndim == 0 else np.dot(u, v)
                for u, v in zip(a_batch, b_batch)
            ],
            dtype=ts.upcast(a_batch.dtype, b_batch.dtype),
        )
    ),
    checks={},
    grad={
        "correct1": (rand(3, 5, 7), rand(3, 7, 5)),
        "correct2": (rand(3, 5, 7), rand(3, 7, 9)),
        "correct3": (rand(3, 5, 7), rand(3, 7)),
        "correct4": (rand(3, 5), rand(3, 5, 7)),
        "correct5": (rand(3), rand(3, 5, 7)),
        "correct6": (rand(3, 5), rand(3)),
        "correct7": (rand(3, 5), rand(3, 5)),
        "correct8": (rand(3), rand(3)),
        "correct9": (rand(3, 5, 7, 11), rand(3)),
        "correct10": (rand(3, 2, 6, 5), rand(3, 5)),
        "correct11": (rand(3, 2, 6, 5), rand(3, 5, 7)),
        "correct12": (rand(3, 2, 6, 5), rand(3, 7, 5, 8)),
        "mixed1": (rand(3, 5).astype("float32"), rand(3, 5, 7)),
        "mixed2": (rand(3, 5).astype("float64"), rand(3, 5, 7)),
    },
    good={
        "correct1": (rand(3, 5, 7), rand(3, 7, 5)),
        "correct2": (rand(3, 5, 7), rand(3, 7, 9)),
        "correct3": (rand(3, 5, 7), rand(3, 7)),
        "correct4": (rand(3, 5), rand(3, 5, 7)),
        "correct5": (rand(3), rand(3, 5, 7)),
        "correct6": (rand(3, 5), rand(3)),
        "correct7": (rand(3, 5), rand(3, 5)),
        "correct8": (rand(3), rand(3)),
        "correct9": (rand(3, 5, 7, 11), rand(3)),
        "correct10": (rand(3, 7, 11, 5), rand(3, 5)),
        "correct11": (rand(3, 7, 11, 5), rand(3, 5, 13)),
        "correct12": (rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
        "mixed1": (rand(3, 5).astype("float32"), rand(3, 5, 7)),
        "mixed2": (rand(3, 5).astype("float64"), rand(3, 5, 7)),
    },
    # A 0-d operand has no batch axis and must be rejected at build time.
    bad_build={
        "no_batch_axis2": (rand(), rand(3, 5)),
        "no_batch_axis3": (rand(3, 5), rand()),
    },
    # Mismatched batch sizes or incompatible contraction dims fail at runtime.
    bad_runtime={
        "batch_dim_mismatch1": (rand(2, 5, 7), rand(3, 7, 9)),
        "batch_dim_mismatch2": (rand(3, 5, 7), rand(2, 7, 9)),
        "batch_dim_mismatch3": (rand(3), rand(5)),
        "bad_dim1": (rand(3, 5, 7), rand(3, 5, 7)),
        "bad_dim2": (rand(3, 5, 7), rand(3, 8, 3)),
        "bad_dim3": (rand(3, 5), rand(3, 7)),
        "bad_dim4": (rand(3, 5, 7, 11), rand(3, 5)),
        "bad_dim5": (rand(3, 5, 7, 11), rand(3, 5, 13)),
        "bad_dim6": (rand(3, 5, 7, 11), rand(3, 13, 5, 17)),
    },
)
def _numpy_second(x, y): def _numpy_second(x, y):
return np.broadcast_arrays(x, y)[1] return np.broadcast_arrays(x, y)[1]
...@@ -1595,84 +1533,6 @@ class TestClip: ...@@ -1595,84 +1533,6 @@ class TestClip:
# gradient numerically # gradient numerically
def test_batched_dot():
    """Check the output shape of ``batched_dot`` on 3d and 2d inputs."""
    # 3d case: (10, 10, 20) batched-dot (10, 20, 5) -> (10, 10, 5).
    inp_a = tensor3("first")
    inp_b = tensor3("second")
    val_a = np.random.rand(10, 10, 20).astype(config.floatX)
    val_b = np.random.rand(10, 20, 5).astype(config.floatX)
    fn = theano.function([inp_a, inp_b], batched_dot(inp_a, inp_b))
    out = fn(val_a, val_b)
    assert out.shape[0] == val_a.shape[0]
    assert out.shape[1] == val_a.shape[1]
    assert out.shape[2] == val_b.shape[2]

    # 2d case: per-row dot of two (10, 10) matrices -> 1d result.
    mat_a = dmatrix("first")
    mat_b = dmatrix("second")
    val_mat_a = np.random.rand(10, 10).astype(config.floatX)
    val_mat_b = np.random.rand(10, 10).astype(config.floatX)
    fn = theano.function([mat_a, mat_b], batched_dot(mat_a, mat_b))
    out = fn(val_mat_a, val_mat_b)
    assert out.shape[0] == val_mat_a.shape[0]
def test_batched_dot_not_contiguous():
    """``batched_dot`` must accept inputs whose batch axis is strided
    (neither C- nor F-contiguous), including a negative stride."""

    def arange_shaped(*shape):
        # Deterministic data: 0..n-1 reshaped to ``shape``.
        total = int(np.prod(shape))
        return np.arange(total, dtype=config.floatX).reshape(shape)

    X = tensor3()
    W = tensor3()
    f = function([X, W], batched_dot(X, W))

    w = arange_shaped(30, 10, 5)
    # A transposed view, so slices of it are non-contiguous from the start.
    x_container = arange_shaped(20, 40, 30).T

    def check_first_dim(inverted):
        step = -1 if inverted else 1
        x = x_container[::step, ::2, ::2]
        # Sanity-check the fixture really is strided / non-contiguous.
        assert x.shape == (30, 20, 10)
        assert x.strides[0] == step * np.dtype(config.floatX).itemsize
        assert not (x.flags["C_CONTIGUOUS"] or x.flags["F_CONTIGUOUS"])
        expected = np.asarray([np.dot(u, v) for u, v in zip(x, w)])
        utt.assert_allclose(expected, f(x, w))

    check_first_dim(False)
    check_first_dim(True)
def test_batched_tensordot():
    """Check output shapes of ``tt.batched_tensordot`` for an explicit axes
    list on 4d inputs and an integer ``axes`` on matrices."""
    # 4d case: contract axes (1, 2) of the first input against (3, 1) of
    # the second; remaining axes give shape (8, 3, 5).
    inp_a = tensor4("first")
    inp_b = tensor4("second")
    out = tt.batched_tensordot(inp_a, inp_b, [[1, 2], [3, 1]])
    val_a = np.random.rand(8, 10, 20, 3).astype(config.floatX)
    val_b = np.random.rand(8, 20, 5, 10).astype(config.floatX)
    res = theano.function([inp_a, inp_b], out)(val_a, val_b)
    assert res.shape[0] == val_a.shape[0]
    assert res.shape[1] == val_a.shape[3]
    assert res.shape[2] == val_b.shape[2]

    # Matrix case with integer axes=1: contracts the single non-batch axis,
    # leaving a vector of length batch-size.
    mat_a = dmatrix("first")
    mat_b = dmatrix("second")
    out = tt.batched_tensordot(mat_a, mat_b, 1)
    val_mat_a = np.random.rand(10, 4).astype(config.floatX)
    val_mat_b = np.random.rand(10, 4).astype(config.floatX)
    res = theano.function([mat_a, mat_b], out)(val_mat_a, val_mat_b)
    assert res.shape == (val_mat_a.shape[0],)
def test_tensor_values_eq_approx(): def test_tensor_values_eq_approx():
# test, inf, -inf and nan equal themself # test, inf, -inf and nan equal themself
a = np.asarray([-np.inf, -1, 0, 1, np.inf, np.nan]) a = np.asarray([-np.inf, -1, 0, 1, np.inf, np.nan])
...@@ -2369,7 +2229,7 @@ class TestOuter: ...@@ -2369,7 +2229,7 @@ class TestOuter:
v1 = np.asarray(np.random.rand(*s1)).astype(config.floatX) v1 = np.asarray(np.random.rand(*s1)).astype(config.floatX)
v2 = np.asarray(np.random.rand(*s2)).astype(config.floatX) v2 = np.asarray(np.random.rand(*s2)).astype(config.floatX)
o = tt.outer(x, y).eval({x: v1, y: v2}) o = tt.outer(x, y).eval({x: v1, y: v2})
assert_allclose(o, np.outer(v1, v2)) utt.assert_allclose(o, np.outer(v1, v2))
def test_grad(self): def test_grad(self):
# Test the combined graph of the graph of outer # Test the combined graph of the graph of outer
...@@ -6719,10 +6579,10 @@ class TestTensorInstanceMethods: ...@@ -6719,10 +6579,10 @@ class TestTensorInstanceMethods:
x, y = self.vals x, y = self.vals
# Use allclose comparison as a user reported on the mailing # Use allclose comparison as a user reported on the mailing
# list failure otherwise with array that print exactly the same. # list failure otherwise with array that print exactly the same.
assert_allclose(x.dot(y), X.dot(Y).eval({X: x, Y: y})) utt.assert_allclose(x.dot(y), X.dot(Y).eval({X: x, Y: y}))
Z = X.dot(Y) Z = X.dot(Y)
z = x.dot(y) z = x.dot(y)
assert_allclose(x.dot(z), X.dot(Z).eval({X: x, Z: z})) utt.assert_allclose(x.dot(z), X.dot(Z).eval({X: x, Z: z}))
def test_real_imag(self): def test_real_imag(self):
X, Y = self.vars X, Y = self.vars
...@@ -6751,7 +6611,7 @@ class TestTensorInstanceMethods: ...@@ -6751,7 +6611,7 @@ class TestTensorInstanceMethods:
# std() is implemented as theano tree and does not pass its # std() is implemented as theano tree and does not pass its
# args directly to numpy. This sometimes results in small # args directly to numpy. This sometimes results in small
# difference, so we use allclose test. # difference, so we use allclose test.
assert_allclose(X.std().eval({X: x}), x.std()) utt.assert_allclose(X.std().eval({X: x}), x.std())
def test_repeat(self): def test_repeat(self):
X, _ = self.vars X, _ = self.vars
......
差异被折叠。
...@@ -20,6 +20,7 @@ from theano.tensor import ( ...@@ -20,6 +20,7 @@ from theano.tensor import (
xlogx, xlogx,
) )
from theano.tensor.basic import * from theano.tensor.basic import *
from theano.tensor.blas import batched_dot, batched_tensordot
from theano.tensor.extra_ops import ( from theano.tensor.extra_ops import (
bartlett, bartlett,
bincount, bincount,
......
...@@ -3365,82 +3365,6 @@ def transpose(x, axes=None): ...@@ -3365,82 +3365,6 @@ def transpose(x, axes=None):
return ret return ret
def batched_dot(a, b):
    """Compute the batched dot product of two variables.

    ``batched_dot(a, b)[i] = dot(a[i], b[i])``

    The leading axis of both inputs is the batch axis.  One of three
    strategies is used, checked in order:

    1. If either input is 1d (a batch of scalars), the result is a
       broadcasted elementwise product; no ``BatchedDot`` op is built.
    2. If both inputs have 2 or 3 dimensions, Theano's ``BatchedDot`` op
       is applied directly.
    3. If either input has more than 3 dimensions, the product is
       rewritten via ``batched_tensordot``, which expresses
       high-dimensional batched dot products as batched matrix-matrix
       products, so it may be possible to further optimize it for
       performance.

    Parameters
    ----------
    a, b
        Inputs convertible to ``TensorVariable``, each with at least one
        (batch) dimension.

    Raises
    ------
    TypeError
        If either input is 0-dimensional (has no batch axis).
    """
    a = as_tensor_variable(a)
    b = as_tensor_variable(b)

    if a.ndim == 0:
        raise TypeError("a must have at least one (batch) axis")
    if b.ndim == 0:
        raise TypeError("b must have at least one (batch) axis")

    if a.ndim == 1:
        # Broadcast the batch of scalars against every non-batch axis of b.
        return a.dimshuffle(0, *(["x"] * (b.ndim - 1))) * b
    if b.ndim == 1:
        return a * b.dimshuffle(0, *(["x"] * (a.ndim - 1)))
    if a.ndim > 3 or b.ndim > 3:
        # Contract the last axis of a with the second-to-last axis of b
        # (but never b's batch axis itself).
        return batched_tensordot(a, b, [[a.ndim - 1], [np.maximum(1, b.ndim - 2)]])
    # 2d/3d operands: dispatch straight to the op.  The lazy attribute
    # access avoids a circular import with theano.tensor.blas.
    return theano.tensor.blas.BatchedDot()(a, b)
def batched_tensordot(x, y, axes=2):
    """Compute a batched tensordot product.

    A hybrid of batched_dot and tensordot: computes the tensordot product
    between the two tensors while iterating over the first (batch)
    dimension, i.e. it performs one tensordot per batch entry.

    Parameters
    ----------
    x: TensorVariable
        A tensor with sizes e.g.: for 3D (dim1, dim3, dim2)
    y: TensorVariable
        A tensor with sizes e.g.: for 3D (dim1, dim2, dim4)
    axes: int or array-like of length 2
        If an integer ``i``, the number of axes to sum over; it is
        converted to an array containing the last ``i`` dimensions of
        ``x`` and the first ``i`` non-batch dimensions of ``y``
        (the first, batch, dimension is excluded):

            axes = [list(range(x.ndim - i, x.ndim)), list(range(1, i + 1))]

        If an array, it must have two elements containing the axes to
        sum over in each tensor.  For example, [[1, 2], [2, 4]] means sum
        over the 2nd and 3rd axes of ``x`` and the 3rd and 5th axes of
        ``y``.  (Remember axes are zero-indexed!)  The 2nd axis of ``x``
        and the 3rd axis of ``y`` must have the same shape; the same is
        true for the 3rd axis of ``x`` and the 5th axis of ``y``.

    Like tensordot, this function uses a series of dimshuffles and
    reshapes to reduce the tensor dot product to a matrix or vector
    dot product.  All the real work is delegated to `_tensordot_as_dot`
    with ``batched=True``, which finally calls `batched_dot` to compute
    the result.
    """
    return _tensordot_as_dot(x, y, axes, dot=batched_dot, batched=True)
def split(x, splits_size, n_splits, axis=0): def split(x, splits_size, n_splits, axis=0):
the_split = Split(n_splits) the_split = Split(n_splits)
return the_split(x, axis, splits_size) return the_split(x, axis, splits_size)
...@@ -6771,8 +6695,6 @@ __all__ = [ ...@@ -6771,8 +6695,6 @@ __all__ = [
"unbroadcast", "unbroadcast",
"addbroadcast", "addbroadcast",
"split", "split",
"batched_tensordot",
"batched_dot",
"transpose", "transpose",
"extract_constant", "extract_constant",
"clip", "clip",
......
...@@ -2071,13 +2071,13 @@ class BatchedDot(COp): ...@@ -2071,13 +2071,13 @@ class BatchedDot(COp):
raise TypeError( raise TypeError(
"theano.tensor.blas.BatchedDot: input 0 (0-indexed)" "theano.tensor.blas.BatchedDot: input 0 (0-indexed)"
f" must have ndim of 2 or 3, {int(inputs[0].ndim)} given. Consider" f" must have ndim of 2 or 3, {int(inputs[0].ndim)} given. Consider"
" calling theano.tensor.batched_dot instead." " calling theano.tensor.blas.batched_dot instead."
) )
if inputs[1].ndim not in (2, 3): if inputs[1].ndim not in (2, 3):
raise TypeError( raise TypeError(
"theano.tensor.blas.BatchedDot: input 1 (0-indexed)" "theano.tensor.blas.BatchedDot: input 1 (0-indexed)"
f" must have ndim of 2 or 3, {int(inputs[1].ndim)} given. Consider" f" must have ndim of 2 or 3, {int(inputs[1].ndim)} given. Consider"
" calling theano.tensor.batched_dot instead." " calling theano.tensor.blas.batched_dot instead."
) )
dtype = theano.scalar.upcast(*[input.type.dtype for input in inputs]) dtype = theano.scalar.upcast(*[input.type.dtype for input in inputs])
...@@ -2424,18 +2424,18 @@ class BatchedDot(COp): ...@@ -2424,18 +2424,18 @@ class BatchedDot(COp):
# x is a matrix, y is a tensor3, grad is a matrix # x is a matrix, y is a tensor3, grad is a matrix
elif xdim == 2 and ydim == 3: elif xdim == 2 and ydim == 3:
xgrad = tt.batched_dot(gz, y.dimshuffle(0, 2, 1)) xgrad = batched_dot(gz, y.dimshuffle(0, 2, 1))
ygrad = x.dimshuffle(0, 1, "x") * gz.dimshuffle(0, "x", 1) ygrad = x.dimshuffle(0, 1, "x") * gz.dimshuffle(0, "x", 1)
# x is a tensor3, y is a matrix, grad is a matrix # x is a tensor3, y is a matrix, grad is a matrix
elif xdim == 3 and ydim == 2: elif xdim == 3 and ydim == 2:
xgrad = gz.dimshuffle(0, 1, "x") * y.dimshuffle(0, "x", 1) xgrad = gz.dimshuffle(0, 1, "x") * y.dimshuffle(0, "x", 1)
ygrad = tt.batched_dot(x.dimshuffle(0, 2, 1), gz) ygrad = batched_dot(x.dimshuffle(0, 2, 1), gz)
# x is a tensor3, y is a tensor3, grad is a tensor3 # x is a tensor3, y is a tensor3, grad is a tensor3
elif xdim == ydim == 3: elif xdim == ydim == 3:
xgrad = tt.batched_dot(gz, y.dimshuffle(0, 2, 1)) xgrad = batched_dot(gz, y.dimshuffle(0, 2, 1))
ygrad = tt.batched_dot(x.dimshuffle(0, 2, 1), gz) ygrad = batched_dot(x.dimshuffle(0, 2, 1), gz)
# If x or y contain broadcastable dimensions but only one of # If x or y contain broadcastable dimensions but only one of
# them know that a matching dimensions is broadcastable, the # them know that a matching dimensions is broadcastable, the
...@@ -2532,7 +2532,7 @@ class BatchedDot(COp): ...@@ -2532,7 +2532,7 @@ class BatchedDot(COp):
return [xshp[:-1] + yshp[2:]] return [xshp[:-1] + yshp[2:]]
batched_dot = BatchedDot() _batched_dot = BatchedDot()
# from opt import register_specialize, register_canonicalize # from opt import register_specialize, register_canonicalize
...@@ -2541,3 +2541,82 @@ batched_dot = BatchedDot() ...@@ -2541,3 +2541,82 @@ batched_dot = BatchedDot()
def local_print_as_we_go_along(fgraph, node): def local_print_as_we_go_along(fgraph, node):
if node.op in (tt.sub, tt.add): if node.op in (tt.sub, tt.add):
debugprint(node) debugprint(node)
def batched_dot(a, b):
    """Compute the batched dot product of two variables.

    ``batched_dot(a, b)[i] = dot(a[i], b[i])``

    The leading axis of both inputs is the batch axis.  Depending on the
    input ranks, one of three strategies is chosen:

    1. Either input 1d (a batch of scalars): return the broadcasted
       elementwise product without building a ``BatchedDot`` op.
    2. Both inputs 2d or 3d: apply the ``BatchedDot`` op directly.
    3. Either input with more than 3 dimensions: rewrite the product via
       ``batched_tensordot``, which expresses high-dimensional batched
       dot products as batched matrix-matrix products, so it may be
       possible to further optimize it for performance.

    Parameters
    ----------
    a, b
        Inputs convertible to ``TensorVariable``, each with at least one
        (batch) dimension.

    Raises
    ------
    TypeError
        If either input is 0-dimensional (has no batch axis).
    """
    a, b = tt.as_tensor_variable(a), tt.as_tensor_variable(b)

    if a.ndim == 0:
        raise TypeError("a must have at least one (batch) axis")
    if b.ndim == 0:
        raise TypeError("b must have at least one (batch) axis")

    if a.ndim == 1:
        # Broadcast the batch of scalars over b's non-batch axes.
        return a.dimshuffle(*([0] + ["x"] * (b.ndim - 1))) * b
    if b.ndim == 1:
        return a * b.dimshuffle(*([0] + ["x"] * (a.ndim - 1)))
    if a.ndim > 3 or b.ndim > 3:
        # Contract a's last axis with b's second-to-last (never b's batch axis).
        return batched_tensordot(a, b, [[a.ndim - 1], [np.maximum(1, b.ndim - 2)]])
    return _batched_dot(a, b)
def batched_tensordot(x, y, axes=2):
    """Compute a batched tensordot product.

    A hybrid of batched_dot and tensordot: computes the tensordot product
    between the two tensors while iterating over the first (batch)
    dimension, i.e. it performs one tensordot per batch entry.

    Parameters
    ----------
    x: TensorVariable
        A tensor with sizes e.g.: for 3D (dim1, dim3, dim2)
    y: TensorVariable
        A tensor with sizes e.g.: for 3D (dim1, dim2, dim4)
    axes: int or array-like of length 2
        If an integer ``i``, the number of axes to sum over; it is
        converted to an array containing the last ``i`` dimensions of
        ``x`` and the first ``i`` non-batch dimensions of ``y``
        (the first, batch, dimension is excluded):

            axes = [list(range(x.ndim - i, x.ndim)), list(range(1, i + 1))]

        If an array, it must have two elements containing the axes to
        sum over in each tensor.  For example, [[1, 2], [2, 4]] means sum
        over the 2nd and 3rd axes of ``x`` and the 3rd and 5th axes of
        ``y``.  (Remember axes are zero-indexed!)  The 2nd axis of ``x``
        and the 3rd axis of ``y`` must have the same shape; the same is
        true for the 3rd axis of ``x`` and the 5th axis of ``y``.

    Like tensordot, this function uses a series of dimshuffles and
    reshapes to reduce the tensor dot product to a matrix or vector
    dot product.  Finally, it calls batched_dot to compute the result.
    """
    # Imported locally, presumably to break the import cycle between this
    # module and `theano.tensor.basic` — NOTE(review): confirm.
    from theano.tensor.basic import _tensordot_as_dot

    return _tensordot_as_dot(x, y, axes, dot=batched_dot, batched=True)
...@@ -8,6 +8,7 @@ from theano.graph.basic import Apply ...@@ -8,6 +8,7 @@ from theano.graph.basic import Apply
from theano.graph.op import ExternalCOp, OpenMPOp from theano.graph.op import ExternalCOp, OpenMPOp
from theano.graph.opt import local_optimizer from theano.graph.opt import local_optimizer
from theano.link.c.cmodule import GCC_compiler from theano.link.c.cmodule import GCC_compiler
from theano.tensor.blas import batched_dot
from theano.tensor.extra_ops import cpu_contiguous from theano.tensor.extra_ops import cpu_contiguous
from theano.tensor.opt import register_canonicalize from theano.tensor.opt import register_canonicalize
from theano.tensor.type import ftensor3, fvector from theano.tensor.type import ftensor3, fvector
...@@ -200,7 +201,7 @@ class ConnectionistTemporalClassification(ExternalCOp, OpenMPOp): ...@@ -200,7 +201,7 @@ class ConnectionistTemporalClassification(ExternalCOp, OpenMPOp):
assert gradients is not None assert gradients is not None
grad_op = output_grads[0] grad_op = output_grads[0]
total_grad = tt.batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle( total_grad = batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(
1, 0, 2 1, 0, 2
) )
return [ return [
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论