提交 f87853c5 authored 作者: carriepl's avatar carriepl

Merge pull request #3508 from cooijmanstim/batched_gemm

WIP: BatchedDotOp
......@@ -212,6 +212,10 @@ class BatchedDotOp(GpuOp):
def c_code_cache_version(self):
    """
    Return the version tuple reported for this op's C code.

    The value is the constant ``(1,)``.
    """
    return (1,)
def infer_shape(self, node, shapes):
    """
    Return the output shape implied by the shapes of the two inputs.

    Parameters
    ----------
    node
        Unused here.
    shapes
        A pair of shape sequences, one per input.

    Returns
    -------
    list
        A one-element list: every axis of the first shape except its last,
        followed by the axes of the second shape from index 2 onward.  For
        ``(b, m, k)`` and ``(b, k, n)`` this is ``(b, m, n)``.
    """
    x_shape, y_shape = shapes
    return [x_shape[:-1] + y_shape[2:]]
batched_dot = BatchedDotOp()
"""
Call cublasSgemmBatched. Takes two 3D tensors as input.
......
......@@ -33,7 +33,7 @@ from theano.sandbox.cuda.basic_ops import (
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (
gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
BatchedDotOp, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
......@@ -156,7 +156,7 @@ cpu_ops_moved_to_gpu = [
tensor.Reshape, tensor.flatten, tensor.Subtensor,
tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
tensor.IncSubtensor, tensor.Shape, tensor.Join,
tensor.Alloc, tensor.Eye]
tensor.Alloc, tensor.Eye, tensor.BatchedDot]
class InputToGpuOptimizer(Optimizer):
......@@ -613,6 +613,45 @@ def local_gpu_dot22(node):
return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.BatchedDot])
def local_gpu_batched_dot(node):
    """
    Rewrite a host BatchedDot so that it is computed by BatchedDotOp.

    gpu_from_host(batched_dot) -> gpu_batched_dot(gpu_from_host)
    batched_dot(host_from_gpu) -> host_from_gpu(gpu_batched_dot)

    Returns a one-element list holding the replacement output when one of
    the two patterns above matches ``node``, and ``False`` otherwise.
    """
    def gpu_batched_dot(x, y):
        # Apply BatchedDotOp to x and y; 2-d inputs are padded to 3-d
        # beforehand and the padding axes are dropped from the result.

        # pad x and y shapes to be third-order tensors
        x_, y_ = x, y
        if x.ndim == 2:
            x_ = x_.dimshuffle(0, "x", 1)
        if y.ndim == 2:
            y_ = y_.dimshuffle(0, 1, "x")
        z = BatchedDotOp()(as_cuda_ndarray_variable(x_),
                           as_cuda_ndarray_variable(y_))
        # unpad z shape
        if x.ndim == 2:
            # drop axis 1, which was inserted into x above
            z = z.dimshuffle(0, *range(2, z.ndim))
        if y.ndim == 2:
            # drop the last axis, which was inserted into y above
            z = z.dimshuffle(*range(z.ndim - 1))
        return as_cuda_ndarray_variable(z)

    # Pattern 1: the result of a host BatchedDot is being moved to the GPU.
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           tensor.BatchedDot):
            x, y = host_input.owner.inputs
            return [gpu_batched_dot(x, y)]

    # Pattern 2: a host BatchedDot with at least one input coming from
    # a HostFromGpu transfer.
    if isinstance(node.op, tensor.BatchedDot):
        if any(i.owner and isinstance(i.owner.op, HostFromGpu)
               for i in node.inputs):
            x, y = node.inputs
            return [host_from_gpu(gpu_batched_dot(x, y))]
    return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
def local_gpu_dot22scalar(node):
......
......@@ -23,7 +23,7 @@ import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
from theano.sandbox.cuda.blas import batched_dot
from theano.sandbox.cuda.blas import batched_dot, BatchedDotOp
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -44,7 +44,7 @@ def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape), dtype='float32')
class TestBatchedDot(TestCase):
class TestBatchedDot(unittest_tools.InferShapeTester):
def test_batched_dot_correctness(self):
......@@ -114,6 +114,17 @@ class TestBatchedDot(TestCase):
numpy.random.randn(5,2,6).astype(numpy.float32)],
mode=mode_with_gpu)
def test_infer_shape(self):
    # Run self._compile_and_check on BatchedDotOp applied to two
    # ftensor3 inputs, fed with values of shape (7, 4, 5) and (7, 5, 3).

    # only matrix/matrix is supported
    admat = tensor.ftensor3()
    bdmat = tensor.ftensor3()
    admat_val = my_rand(7, 4, 5)
    bdmat_val = my_rand(7, 5, 3)
    self._compile_and_check([admat, bdmat],
                            [BatchedDotOp()(admat, bdmat)],
                            [admat_val, bdmat_val],
                            BatchedDotOp)
def test_dot22():
def cmp(a_shp, b_shp):
......
......@@ -3373,52 +3373,52 @@ def transpose(x, axes=None):
return ret
def batched_dot(x, y):
def batched_dot(a, b):
"""
This function computes the dot product between the two tensors, by
iterating over the first dimension using scan.
Compute the batched dot product of two variables:
Parameters
----------
x : tensor
A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2).
y : tensor
A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4).
batched_dot(a, b)[i] = dot(a[i], b[i])
Returns
-------
tensor
A tensor of size e.g. if it is 3D: (dim1, dim3, dim4).
Note that this batched_dot function does one of three things, in the
following sequence:
Notes
-----
This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot:
http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
1. If either a or b is a vector, it returns the batched elementwise
product without calling the Theano BatchedDot op.
Examples
--------
>>> first = tensor.tensor3('first')
>>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second)
2. If both a and b have either 2 or 3 dimensions, it calls Theano's
BatchedDot op on a and b.
3. If either a or b has more than 3 dimensions, it calls Theano's
batched_tensordot function with appropriate axes. The
batched_tensordot function expresses high-dimensional batched
dot products in terms of batched matrix-matrix dot products, so
it may be possible to further optimize for performance.
"""
result, updates = theano.scan(
fn=lambda x_mat, y_mat:
theano.tensor.dot(x_mat, y_mat),
outputs_info=None,
sequences=[x, y],
non_sequences=None)
return result
a, b = as_tensor_variable(a), as_tensor_variable(b)
if a.ndim == 0:
raise TypeError("a must have at least one (batch) axis")
elif b.ndim == 0:
raise TypeError("b must have at least one (batch) axis")
elif a.ndim == 1:
return a.dimshuffle(*([0] + ["x"] * (b.ndim - 1))) * b
elif b.ndim == 1:
return a * b.dimshuffle(*([0] + ["x"] * (a.ndim - 1)))
elif a.ndim > 3 or b.ndim > 3:
return batched_tensordot(
a, b, [[a.ndim - 1], [numpy.maximum(1, b.ndim - 2)]])
else:
# avoid circular import
return theano.tensor.blas.BatchedDot()(a, b)
def batched_tensordot(x, y, axes=2):
"""
Compute the tensordot product.
Compute a batched tensordot product.
A hybrid of batch_dot and tensordot, this function computes the
A hybrid of batched_dot and tensordot, this function computes the
tensordot product between the two tensors, by iterating over the
first dimension using scan to perform a sequence of tensordots.
first dimension to perform a sequence of tensordots.
Parameters
----------
......
差异被折叠。
......@@ -31,9 +31,9 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
horizontal_stack, vertical_stack, argmax, get_vector_length,
fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast,
alloc, as_tensor_variable, tensor_from_scalar, ARange, autocast_float,
clip, constant, default, dot,
dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation, Flatten,
tensor4, permute_row_elements, fmatrix, fscalars, grad,
clip, constant, default, dot, batched_dot,
dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation,
tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad,
inplace, iscalar, matrix, minimum, matrices, maximum, mul, neq,
Reshape, row, scalar, scalars, second, smallest, stack, sub, Tensor,
tensor_copy, tensordot, TensorType, Tri, tri, tril, triu, unbroadcast,
......@@ -1938,6 +1938,59 @@ DotTester = makeTester(name='DotTester',
bad_runtime=dict(bad1=(rand(5, 7), rand(5, 7)),
bad2=(rand(5, 7), rand(8, 3))))
# Tester for batched_dot.  The reference result is built one batch item at
# a time with numpy: a plain product when either item is 0-d, otherwise
# numpy.dot, with the result cast to the upcast of the two input dtypes.
BatchedDotTester = makeTester(
    name='BatchedDotTester',
    op=batched_dot,
    expected=(
        lambda xs, ys: numpy.asarray(
            [x * y if x.ndim == 0 or y.ndim == 0 else numpy.dot(x, y)
             for x, y in zip(xs, ys)],
            dtype=theano.scalar.upcast(xs.dtype, ys.dtype))),
    checks={},
    grad={
        'correct1': (rand(3, 5, 7), rand(3, 7, 5)),
        'correct2': (rand(3, 5, 7), rand(3, 7, 9)),
        'correct3': (rand(3, 5, 7), rand(3, 7)),
        'correct4': (rand(3, 5), rand(3, 5, 7)),
        'correct5': (rand(3), rand(3, 5, 7)),
        'correct6': (rand(3, 5), rand(3)),
        'correct7': (rand(3, 5), rand(3, 5)),
        'correct8': (rand(3), rand(3)),
        'correct9': (rand(3, 5, 7, 11), rand(3)),
        'correct10': (rand(3, 7, 11, 5), rand(3, 5)),
        'correct11': (rand(3, 7, 11, 5), rand(3, 5, 13)),
        'correct12': (rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
        'mixed1': (rand(3, 5).astype('float32'),
                   rand(3, 5, 7)),
        'mixed2': (rand(3, 5).astype('float64'),
                   rand(3, 5, 7)),
    },
    good={
        'correct1': (rand(3, 5, 7), rand(3, 7, 5)),
        'correct2': (rand(3, 5, 7), rand(3, 7, 9)),
        'correct3': (rand(3, 5, 7), rand(3, 7)),
        'correct4': (rand(3, 5), rand(3, 5, 7)),
        'correct5': (rand(3), rand(3, 5, 7)),
        'correct6': (rand(3, 5), rand(3)),
        'correct7': (rand(3, 5), rand(3, 5)),
        'correct8': (rand(3), rand(3)),
        'correct9': (rand(3, 5, 7, 11), rand(3)),
        'correct10': (rand(3, 7, 11, 5), rand(3, 5)),
        'correct11': (rand(3, 7, 11, 5), rand(3, 5, 13)),
        'correct12': (rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
        'mixed1': (rand(3, 5).astype('float32'),
                   rand(3, 5, 7)),
        'mixed2': (rand(3, 5).astype('float64'),
                   rand(3, 5, 7)),
    },
    # One operand is 0-d, so there is no batch axis to iterate over.
    bad_build={
        'no_batch_axis2': (rand(), rand(3, 5)),
        'no_batch_axis3': (rand(3, 5), rand()),
    },
    # Leading (batch) sizes differ, or the contracted axes do not line up.
    bad_runtime={
        'batch_dim_mismatch1': (rand(2, 5, 7), rand(3, 7, 9)),
        'batch_dim_mismatch2': (rand(3, 5, 7), rand(2, 7, 9)),
        'batch_dim_mismatch3': (rand(3), rand(5)),
        'bad_dim1': (rand(3, 5, 7), rand(3, 5, 7)),
        'bad_dim2': (rand(3, 5, 7), rand(3, 8, 3)),
        'bad_dim3': (rand(3, 5), rand(3, 7)),
        'bad_dim4': (rand(3, 5, 7, 11), rand(3, 5)),
        'bad_dim5': (rand(3, 5, 7, 11), rand(3, 5, 13)),
        'bad_dim6': (rand(3, 5, 7, 11), rand(3, 13, 5, 17)),
    })
def _numpy_second(x, y):
return numpy.broadcast_arrays(x, y)[1]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论