提交 f87853c5 authored 作者: carriepl's avatar carriepl

Merge pull request #3508 from cooijmanstim/batched_gemm

WIP: BatchedDotOp
...@@ -212,6 +212,10 @@ class BatchedDotOp(GpuOp): ...@@ -212,6 +212,10 @@ class BatchedDotOp(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (1,)
def infer_shape(self, node, shapes):
xshp, yshp = shapes
return [xshp[:-1] + yshp[2:]]
batched_dot = BatchedDotOp() batched_dot = BatchedDotOp()
""" """
Call cublasSgemmBatched. Take 2 3d tensor as input. Call cublasSgemmBatched. Take 2 3d tensor as input.
......
...@@ -33,7 +33,7 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -33,7 +33,7 @@ from theano.sandbox.cuda.basic_ops import (
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import ( from theano.sandbox.cuda.blas import (
gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, BatchedDotOp, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
...@@ -156,7 +156,7 @@ cpu_ops_moved_to_gpu = [ ...@@ -156,7 +156,7 @@ cpu_ops_moved_to_gpu = [
tensor.Reshape, tensor.flatten, tensor.Subtensor, tensor.Reshape, tensor.flatten, tensor.Subtensor,
tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1, tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
tensor.IncSubtensor, tensor.Shape, tensor.Join, tensor.IncSubtensor, tensor.Shape, tensor.Join,
tensor.Alloc, tensor.Eye] tensor.Alloc, tensor.Eye, tensor.BatchedDot]
class InputToGpuOptimizer(Optimizer): class InputToGpuOptimizer(Optimizer):
...@@ -613,6 +613,45 @@ def local_gpu_dot22(node): ...@@ -613,6 +613,45 @@ def local_gpu_dot22(node):
return False return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.BatchedDot])
def local_gpu_batched_dot(node):
"""
gpu_from_host(batched_dot) -> gpu_batched_dot(gpu_from_host)
batched_dot(host_from_gpu) -> host_from_gpu(gpu_batched_dot)
"""
def gpu_batched_dot(x, y):
# pad x and y shapes to be third-order tensors
x_, y_ = x, y
if x.ndim == 2:
x_ = x_.dimshuffle(0, "x", 1)
if y.ndim == 2:
y_ = y_.dimshuffle(0, 1, "x")
z = BatchedDotOp()(as_cuda_ndarray_variable(x_),
as_cuda_ndarray_variable(y_))
# unpad z shape
if x.ndim == 2:
z = z.dimshuffle(0, *range(2, z.ndim))
if y.ndim == 2:
z = z.dimshuffle(*range(z.ndim - 1))
return as_cuda_ndarray_variable(z)
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op,
tensor.BatchedDot):
x, y = host_input.owner.inputs
return [gpu_batched_dot(x, y)]
if isinstance(node.op, tensor.BatchedDot):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]):
x, y = node.inputs
return [host_from_gpu(gpu_batched_dot(x, y))]
return False
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar]) @local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
def local_gpu_dot22scalar(node): def local_gpu_dot22scalar(node):
......
...@@ -23,7 +23,7 @@ import theano.compile.mode ...@@ -23,7 +23,7 @@ import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
from theano.sandbox.cuda.blas import batched_dot from theano.sandbox.cuda.blas import batched_dot, BatchedDotOp
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -44,7 +44,7 @@ def my_rand(*shape): ...@@ -44,7 +44,7 @@ def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape), dtype='float32') return theano._asarray(numpy.random.rand(*shape), dtype='float32')
class TestBatchedDot(TestCase): class TestBatchedDot(unittest_tools.InferShapeTester):
def test_batched_dot_correctness(self): def test_batched_dot_correctness(self):
...@@ -114,6 +114,17 @@ class TestBatchedDot(TestCase): ...@@ -114,6 +114,17 @@ class TestBatchedDot(TestCase):
numpy.random.randn(5,2,6).astype(numpy.float32)], numpy.random.randn(5,2,6).astype(numpy.float32)],
mode=mode_with_gpu) mode=mode_with_gpu)
def test_infer_shape(self):
# only matrix/matrix is supported
admat = tensor.ftensor3()
bdmat = tensor.ftensor3()
admat_val = my_rand(7, 4, 5)
bdmat_val = my_rand(7, 5, 3)
self._compile_and_check([admat, bdmat],
[BatchedDotOp()(admat, bdmat)],
[admat_val, bdmat_val],
BatchedDotOp)
def test_dot22(): def test_dot22():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
......
...@@ -3373,52 +3373,52 @@ def transpose(x, axes=None): ...@@ -3373,52 +3373,52 @@ def transpose(x, axes=None):
return ret return ret
def batched_dot(x, y): def batched_dot(a, b):
""" """
This function computes the dot product between the two tensors, by Compute the batched dot product of two variables:
iterating over the first dimension using scan.
Parameters batched_dot(a, b)[i] = dot(a[i], b[i])
----------
x : tensor
A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2).
y : tensor
A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4).
Returns Note that this batched_dot function does one of three things, in the
------- following sequence:
tensor
A tensor of size e.g. if it is 3D: (dim1, dim3, dim4).
Notes 1. If either a or b is a vector, it returns the batched elementwise
----- product without calling the Theano BatchedDot op.
This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot:
http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
Examples 2. If both a and b have either 2 or 3 dimensions, it calls Theano's
-------- BatchedDot op on a and b.
>>> first = tensor.tensor3('first')
>>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second)
3. If either a or b has more than 3 dimensions, it calls Theano's
batched_tensordot function with appropriate axes. The
batched_tensordot function expresses high-dimensional batched
dot products in terms of batched matrix-matrix dot products, so
it may be possible to futherize optimize for performance.
""" """
result, updates = theano.scan( a, b = as_tensor_variable(a), as_tensor_variable(b)
fn=lambda x_mat, y_mat:
theano.tensor.dot(x_mat, y_mat), if a.ndim == 0:
outputs_info=None, raise TypeError("a must have at least one (batch) axis")
sequences=[x, y], elif b.ndim == 0:
non_sequences=None) raise TypeError("b must have at least one (batch) axis")
return result elif a.ndim == 1:
return a.dimshuffle(*([0] + ["x"] * (b.ndim - 1))) * b
elif b.ndim == 1:
return a * b.dimshuffle(*([0] + ["x"] * (a.ndim - 1)))
elif a.ndim > 3 or b.ndim > 3:
return batched_tensordot(
a, b, [[a.ndim - 1], [numpy.maximum(1, b.ndim - 2)]])
else:
# avoid circular import
return theano.tensor.blas.BatchedDot()(a, b)
def batched_tensordot(x, y, axes=2): def batched_tensordot(x, y, axes=2):
""" """
Compute the tensordot product. Compute a batched tensordot product.
A hybrid of batch_dot and tensordot, this function computes the A hybrid of batched_dot and tensordot, this function computes the
tensordot product between the two tensors, by iterating over the tensordot product between the two tensors, by iterating over the
first dimension using scan to perform a sequence of tensordots. first dimension to perform a sequence of tensordots.
Parameters Parameters
---------- ----------
......
...@@ -2188,6 +2188,440 @@ blas_optdb.register('local_dot22_to_dot22scalar', ...@@ -2188,6 +2188,440 @@ blas_optdb.register('local_dot22_to_dot22scalar',
11, 'fast_run') 11, 'fast_run')
class BatchedDot(Op):
"""
Computes the batched dot product of two variables:
batched_dot(a, b)[i] = dot(a[i], b[i])
"""
__props__ = ()
def make_node(self, *inputs):
inputs = list(map(T.as_tensor_variable, inputs))
if len(inputs) != 2:
raise TypeError("theano.tensor.blas.BatchedDot: 2 arguments"
" required, %d given " % len(inputs))
if inputs[0].ndim not in (2, 3):
raise TypeError("theano.tensor.blas.BatchedDot: input 0 (0-indexed)"
" must have ndim of 2 or 3, %d given. Consider"
" calling theano.tensor.batched_dot instead."
% inputs[0].ndim)
if inputs[1].ndim not in (2, 3):
raise TypeError("theano.tensor.blas.BatchedDot: input 1 (0-indexed)"
" must have ndim of 2 or 3, %d given. Consider"
" calling theano.tensor.batched_dot instead."
% inputs[1].ndim)
dtype = theano.scalar.upcast(*[input.type.dtype for input in inputs])
# upcast inputs to common dtype if needed
upcasted_inputs = [T.cast(input, dtype) for input in inputs]
broadcastable = ((inputs[0].type.broadcastable[0] or
inputs[1].type.broadcastable[0],) +
inputs[0].type.broadcastable[1:-1] +
inputs[1].type.broadcastable[2:])
return Apply(self, upcasted_inputs, [T.tensor(dtype, broadcastable)])
def perform(self, node, inp, out):
x, y = inp
z, = out
if x.shape[0] != y.shape[0]:
raise TypeError(
"theano.tensor.blas.BatchedDot: inputs [%s] must have the"
" same size in axis 0, but have sizes [%s]." %
(", ".join(map(str, inp)),
", ".join([str(i.shape[0]) for i in inp])))
shape = self.infer_shape(node, [i.shape for i in inp])[0]
dtype = node.outputs[0].dtype
z0 = z[0] = numpy.empty(shape, dtype=dtype)
for i in xrange(z0.shape[0]):
z0[i] = numpy.dot(x[i], y[i])
def c_support_code(self):
batch_gemm_defn = """
template<typename dtype, typename function>
bool batch_gemm(function gemm, int type_size,
PyArrayObject* xs, PyArrayObject* ys, PyArrayObject* zs) {
npy_intp *Nx = PyArray_DIMS(xs), *Sx = PyArray_STRIDES(xs);
npy_intp *Ny = PyArray_DIMS(ys), *Sy = PyArray_STRIDES(ys);
npy_intp *Nz = PyArray_DIMS(zs), *Sz = PyArray_STRIDES(zs);
if (Nx[0] != Ny[0]) {
PyErr_Format(PyExc_ValueError,
"Shape mismatch: batch sizes unequal."
" x.shape is (%d, %d, %d),"
" y.shape is (%d, %d, %d).",
Nx[0], Nx[1], Nx[2],
Ny[0], Ny[1], Ny[2]);
return 1;
}
if (Nx[2] != Ny[1]) {
PyErr_Format(PyExc_ValueError,
"Shape mismatch: summation axis sizes unequal."
" x.shape is (%d, %d, %d),"
" y.shape is (%d, %d, %d).",
Nx[0], Nx[1], Nx[2],
Ny[0], Ny[1], Ny[2]);
return 1;
}
/* encode the stride structure of _x,_y,_z into a single integer. */
int unit = 0;
unit |= ((Sx[2] == type_size || Nx[2] == 1) ? 0x0 : (Sx[1] == type_size || Nx[1]==1) ? 0x1 : 0x2) << 8;
unit |= ((Sy[2] == type_size || Ny[2] == 1) ? 0x0 : (Sy[1] == type_size || Ny[1]==1) ? 0x1 : 0x2) << 4;
unit |= ((Sz[2] == type_size || Nz[2] == 1) ? 0x0 : (Sz[1] == type_size || Nz[1]==1) ? 0x1 : 0x2) << 0;
/* create appropriate strides for malformed matrices that are row or column
* vectors, or empty matrices.
* In that case, the value of the stride does not really matter, but
* some versions of BLAS insist that:
* - they are not smaller than the number of elements in the array,
* - they are not 0.
*/
int sx_1 = (Nx[1] > 1) ? Sx[1]/type_size : (Nx[2] + 1);
int sx_2 = (Nx[2] > 1) ? Sx[2]/type_size : (Nx[1] + 1);
int sy_1 = (Ny[1] > 1) ? Sy[1]/type_size : (Ny[2] + 1);
int sy_2 = (Ny[2] > 1) ? Sy[2]/type_size : (Ny[1] + 1);
int sz_1 = (Nz[1] > 1) ? Sz[1]/type_size : (Nz[2] + 1);
int sz_2 = (Nz[2] > 1) ? Sz[2]/type_size : (Nz[1] + 1);
dtype* x = (dtype*)PyArray_DATA(xs);
dtype* y = (dtype*)PyArray_DATA(ys);
dtype* z = (dtype*)PyArray_DATA(zs);
dtype a = 1.0;
dtype b = 0.0;
char N = 'N';
char T = 'T';
int Nz1 = Nz[1], Nz2 = Nz[2], Nx2 = Nx[2];
// loop over batch axis
for (int i = 0; i < Nz[0]; i++) {
switch(unit)
{
case 0x000: gemm(&N, &N, &Nz2, &Nz1, &Nx2, &a, y, &sy_1, x, &sx_1, &b, z, &sz_1); break;
case 0x100: gemm(&N, &T, &Nz2, &Nz1, &Nx2, &a, y, &sy_1, x, &sx_2, &b, z, &sz_1); break;
case 0x010: gemm(&T, &N, &Nz2, &Nz1, &Nx2, &a, y, &sy_2, x, &sx_1, &b, z, &sz_1); break;
case 0x110: gemm(&T, &T, &Nz2, &Nz1, &Nx2, &a, y, &sy_2, x, &sx_2, &b, z, &sz_1); break;
case 0x001: gemm(&T, &T, &Nz1, &Nz2, &Nx2, &a, x, &sx_1, y, &sy_1, &b, z, &sz_2); break;
case 0x101: gemm(&N, &T, &Nz1, &Nz2, &Nx2, &a, x, &sx_2, y, &sy_1, &b, z, &sz_2); break;
case 0x011: gemm(&T, &N, &Nz1, &Nz2, &Nx2, &a, x, &sx_1, y, &sy_2, &b, z, &sz_2); break;
case 0x111: gemm(&N, &N, &Nz1, &Nz2, &Nx2, &a, x, &sx_2, y, &sy_2, &b, z, &sz_2); break;
default: PyErr_SetString(PyExc_ValueError, "some matrix has no unit stride"); return 1;
};
x += Sx[0] / type_size;
y += Sy[0] / type_size;
z += Sz[0] / type_size;
}
return 0;
}
"""
return blas_header_text() + batch_gemm_defn
def c_libraries(self):
return ldflags()
def c_compile_args(self):
return ldflags(libs=False, flags=True)
def c_lib_dirs(self):
return ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
return ldflags(libs=False, include_dir=True)
def c_code_cleanup(self, node, name, inputs, outputs, sub):
return """
// clean up views
Py_XDECREF(xs); xs = 0;
Py_XDECREF(ys); ys = 0;
Py_XDECREF(zs); zs = 0;
"""
def c_code(self, node, name, inp, out, sub):
_x, _y = inp
_z, = out
fail = sub["fail"]
# generate contiguity condition
def contiguous(var, ndim):
strides = "PyArray_STRIDES(%s)" % var
return " && ".join([
" && ".join("{strides}[{i}] > 0 && {strides}[{i}] % type_size == 0"
.format(strides=strides, i=i) for i in range(ndim)),
"(%s)" % " || ".join("{strides}[{i}] == type_size"
.format(strides=strides, i=i) for i in range(ndim)),
])
x_ndim, y_ndim, z_ndim = node.inputs[0].ndim, node.inputs[1].ndim, node.outputs[0].ndim
# generate code to allocate output based on runtime input shapes
z_dims = ["PyArray_DIMS(%s)[0]" % _x]
if x_ndim == 3:
z_dims.append("PyArray_DIMS(%s)[1]" % _x)
if y_ndim == 3:
z_dims.append("PyArray_DIMS(%s)[2]" % _y)
assert len(z_dims) == z_ndim
z_shape_correct = " && ".join("PyArray_DIMS(%s)[%i] == %s"
% (_z, i, dim) for i, dim in enumerate(z_dims))
z_shape = ", ".join(z_dims)
z_contiguous = contiguous(_z, z_ndim)
allocate = """
if (NULL == %(_z)s || !(%(z_shape_correct)s) || !(%(z_contiguous)s))
{
npy_intp dims[%(z_ndim)s] = {%(z_shape)s};
Py_XDECREF(%(_z)s);
%(_z)s = (PyArrayObject*)PyArray_SimpleNew(
%(z_ndim)s, dims, PyArray_TYPE(%(_x)s));
if(!%(_z)s) {
PyErr_SetString(PyExc_MemoryError,
"failed to alloc BatchedDot output");
%(fail)s
}
}
""" % locals()
# code to reallocate inputs contiguously if necessary
contiguate = []
for var, ndim in [(_x, x_ndim), (_y, y_ndim)]:
_contiguous = contiguous(var, ndim)
contiguate.append("""
if (!(%(_contiguous)s)) {
PyArrayObject * _copy = (PyArrayObject *) PyArray_Copy(%(var)s);
if (!_copy)
%(fail)s
Py_XDECREF(%(var)s);
%(var)s = _copy;
}
""" % locals())
contiguate = "\n".join(contiguate)
def c_dimshuffle(newname, oldname, shape):
_fail = fail
_shape = ", ".join("1" if axis is None else "PyArray_DIMS(%s)[%i]" % (oldname, axis)
for axis in shape)
return """{
npy_intp dims[3] = {%(_shape)s};
PyArray_Dims newshape = {dims, 3};
%(newname)s = (PyArrayObject*)PyArray_Newshape(%(oldname)s, &newshape, NPY_ANYORDER);
if (!%(newname)s)
%(_fail)s
// make sure we didn't accidentally copy
assert(PyArray_DATA(%(oldname)s) == PyArray_DATA(%(newname)s));
}""" % locals()
# create tensor3 views for any of x, y, z that are not tensor3, so that
# we only need to implement the tensor3-tensor3 batched dot product.
# xs, ys and zs will point to these views, or to the original array if
# it was already tensor3.
# in the latter case, we artificially increase the reference count of
# the original array so that the c_code_cleanup method can decref them
# all indiscriminately.
upcast = []
if x_ndim == 3:
upcast.append("xs = %(_x)s; Py_XINCREF(xs);")
elif x_ndim == 2:
upcast.append(c_dimshuffle("xs", _x, (0, None, 1)))
if y_ndim == 3:
upcast.append("ys = %(_y)s; Py_XINCREF(ys);")
elif y_ndim == 2:
upcast.append(c_dimshuffle("ys", _y, (0, 1, None)))
if z_ndim == 3:
upcast.append("zs = %(_z)s; Py_XINCREF(zs);")
else:
upcast.append(c_dimshuffle(
"zs", _z, (0,
None if x_ndim == 2 else 1,
None if y_ndim == 2 else 1)))
upcast = "\n".join(upcast) % locals()
return """
int type_num = PyArray_DESCR(%(_x)s)->type_num;
int type_size = PyArray_DESCR(%(_x)s)->elsize; // in bytes
// xs, ys, zs will point to views onto %(_x)s, %(_y)s, %(_z)s
PyArrayObject *xs = 0, *ys = 0, *zs = 0;
if (PyArray_NDIM(%(_x)s) != %(x_ndim)s) {
PyErr_Format(PyExc_NotImplementedError,
"rank(x) != %(x_ndim)s. rank(x) is %%d.",
PyArray_NDIM(%(_x)s));
%(fail)s;
}
if (PyArray_NDIM(%(_y)s) != %(y_ndim)s) {
PyErr_Format(PyExc_NotImplementedError,
"rank(y) != %(y_ndim)s. rank(y) is %%d.",
PyArray_NDIM(%(_y)s));
%(fail)s;
}
if (%(_z)s && PyArray_NDIM(%(_z)s) != %(z_ndim)s) {
PyErr_Format(PyExc_NotImplementedError,
"rank(z) != %(z_ndim)s. rank(z) is %%d.",
PyArray_NDIM(%(_z)s));
%(fail)s;
}
// allocate output
%(allocate)s
// reallocate any noncontiguous arrays or arrays with invalid strides
%(contiguate)s
// add dims to make sure everything is tensor3
%(upcast)s
// from here on, use xs, ys and zs as they are tensor3 and share memory
// with the original %(_x)s, %(_y)s and %(_z)s arrays.
if ((PyArray_DESCR(xs)->type_num != NPY_DOUBLE)
&& (PyArray_DESCR(xs)->type_num != NPY_FLOAT))
{PyErr_SetString(PyExc_NotImplementedError, "type(x) is not double or float"); %(fail)s;}
if ((PyArray_DESCR(ys)->type_num != NPY_DOUBLE)
&& (PyArray_DESCR(ys)->type_num != NPY_FLOAT))
{PyErr_SetString(PyExc_NotImplementedError, "type(y) is not double or float"); %(fail)s;}
if ((PyArray_DESCR(zs)->type_num != NPY_DOUBLE)
&& (PyArray_DESCR(zs)->type_num != NPY_FLOAT))
{PyErr_SetString(PyExc_NotImplementedError, "type(z) is not double or float"); %(fail)s;}
if ((PyArray_DESCR(xs)->type_num != PyArray_DESCR(ys)->type_num)
||(PyArray_DESCR(xs)->type_num != PyArray_DESCR(zs)->type_num))
{ PyErr_SetString(PyExc_NotImplementedError, "type(x), type(y), type(z) are not all the same"); %(fail)s; }
switch (type_num)
{
case NPY_FLOAT:
if (batch_gemm<float>(sgemm_, type_size, xs, ys, zs)) {
%(fail)s;
}
break;
case NPY_DOUBLE:
if (batch_gemm<double>(dgemm_, type_size, xs, ys, zs)) {
%(fail)s;
}
break;
}
""" % locals()
def c_code_cache_version(self):
from theano.tensor.blas_headers import blas_header_version
return (1, blas_header_version())
def grad(self, inp, grads):
x, y = inp
gz, = grads
xdim, ydim, gdim = x.type.ndim, y.type.ndim, gz.type.ndim
# grad is a vector, so x is a matrix and y is a matrix
if gdim == 1:
xgrad = gz.dimshuffle(0, 'x') * y
ygrad = gz.dimshuffle(0, 'x') * x
# x is a matrix, y is a tensor3, grad is a matrix
elif xdim == 2 and ydim == 3:
xgrad = T.batched_dot(gz, y.dimshuffle(0, 2, 1))
ygrad = x.dimshuffle(0, 1, 'x') * gz.dimshuffle(0, 'x', 1)
# x is a tensor3, y is a matrix, grad is a matrix
elif xdim == 3 and ydim == 2:
xgrad = gz.dimshuffle(0, 1, 'x') * y.dimshuffle(0, 'x', 1)
ygrad = T.batched_dot(x.dimshuffle(0, 2, 1), gz)
# x is a tensor3, y is a tensor3, grad is a tensor3
elif xdim == ydim == 3:
xgrad = T.batched_dot(gz, y.dimshuffle(0, 2, 1))
ygrad = T.batched_dot(x.dimshuffle(0, 2, 1), gz)
# If x or y contain broadcastable dimensions but only one of
# them know that a matching dimensions is broadcastable, the
# above code don't always return the right broadcast pattern.
# This cause problem down the road. See gh-1461.
if xgrad.broadcastable != x.broadcastable:
xgrad = T.patternbroadcast(xgrad, x.broadcastable)
if ygrad.broadcastable != y.broadcastable:
ygrad = T.patternbroadcast(ygrad, y.broadcastable)
return xgrad, ygrad
def R_op(self, inputs, eval_points):
# R_op for batched_dot(a, b) evaluted at c for a and d for b is
# simply batched_dot(c, b) + batched_dot(a, d)
assert len(inputs) == 2
assert len(eval_points) == 2
if eval_points[0] is None and eval_points[1] is None:
return [None]
debugger_available = config.compute_test_value != 'off'
if debugger_available:
try:
iv0 = theano.gof.op.get_test_value(inputs[0])
except AttributeError:
theano.gof.op.missing_test_message(
'first input passed to BatchedDot.R_op has no test value')
debugger_available = False
try:
iv1 = theano.gof.op.get_test_value(inputs[1])
except AttributeError:
theano.gof.op.missing_test_message(
'second input passed to BatchedDot.R_op has no test value')
debugger_available = False
if eval_points[0]:
try:
ev0 = theano.gof.op.get_test_value(eval_points[0])
except AttributeError:
theano.gof.op.missing_test_message(
'first eval point passed to BatchedDot.R_op '
'has no test value')
debugger_available = False
if eval_points[1]:
try:
ev1 = theano.gof.op.get_test_value(eval_points[1])
except AttributeError:
theano.gof.op.missing_test_message(
'second eval point passed to BatchedDot.R_op '
'has no test value')
debugger_available = False
if debugger_available:
input_values = [iv0, iv1]
eval_point_values = [ev0, ev1]
for i in xrange(2):
if eval_point_values[i] is not None and \
input_values[i].shape != eval_point_values[i].shape:
raise ValueError(
'input ' + str(i) + ' and eval_point ' + str(i) +
' to BatchedDot.R_op should have the same shape, but '
'their shapes are %s and %s, respectively' % (
str(input_values[i].shape),
str(eval_point_values[i].shape)))
if eval_points[0]:
t1 = self(eval_points[0], inputs[1])
if eval_points[1]:
t2 = self(inputs[0], eval_points[1])
if eval_points[0] and eval_points[1]:
return [t1 + t2]
elif eval_points[0]:
return [t1]
else:
return [t2]
def infer_shape(self, node, shapes):
for shape_ in shapes:
if len(shape_) not in (2, 3):
raise NotImplementedError()
xshp, yshp = shapes
return [xshp[:-1] + yshp[2:]]
# from opt import register_specialize, register_canonicalize # from opt import register_specialize, register_canonicalize
# @register_specialize # @register_specialize
@local_optimizer([T.sub, T.add]) @local_optimizer([T.sub, T.add])
......
...@@ -31,9 +31,9 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as, ...@@ -31,9 +31,9 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
horizontal_stack, vertical_stack, argmax, get_vector_length, horizontal_stack, vertical_stack, argmax, get_vector_length,
fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast, fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast,
alloc, as_tensor_variable, tensor_from_scalar, ARange, autocast_float, alloc, as_tensor_variable, tensor_from_scalar, ARange, autocast_float,
clip, constant, default, dot, clip, constant, default, dot, batched_dot,
dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation, Flatten, dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation,
tensor4, permute_row_elements, fmatrix, fscalars, grad, tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad,
inplace, iscalar, matrix, minimum, matrices, maximum, mul, neq, inplace, iscalar, matrix, minimum, matrices, maximum, mul, neq,
Reshape, row, scalar, scalars, second, smallest, stack, sub, Tensor, Reshape, row, scalar, scalars, second, smallest, stack, sub, Tensor,
tensor_copy, tensordot, TensorType, Tri, tri, tril, triu, unbroadcast, tensor_copy, tensordot, TensorType, Tri, tri, tril, triu, unbroadcast,
...@@ -1938,6 +1938,59 @@ DotTester = makeTester(name='DotTester', ...@@ -1938,6 +1938,59 @@ DotTester = makeTester(name='DotTester',
bad_runtime=dict(bad1=(rand(5, 7), rand(5, 7)), bad_runtime=dict(bad1=(rand(5, 7), rand(5, 7)),
bad2=(rand(5, 7), rand(8, 3)))) bad2=(rand(5, 7), rand(8, 3))))
BatchedDotTester = makeTester(
name='BatchedDotTester',
op=batched_dot,
expected=(lambda xs, ys:
numpy.asarray(
list(x * y if x.ndim == 0 or y.ndim == 0 else numpy.dot(x, y)
for x, y in zip(xs, ys)),
dtype=theano.scalar.upcast(xs.dtype, ys.dtype))),
checks={},
grad=dict(correct1=(rand(3, 5, 7), rand(3, 7, 5)),
correct2=(rand(3, 5, 7), rand(3, 7, 9)),
correct3=(rand(3, 5, 7), rand(3, 7)),
correct4=(rand(3, 5), rand(3, 5, 7)),
correct5=(rand(3), rand(3, 5, 7)),
correct6=(rand(3, 5), rand(3)),
correct7=(rand(3, 5), rand(3, 5)),
correct8=(rand(3), rand(3)),
correct9=(rand(3, 5, 7, 11), rand(3)),
correct10=(rand(3, 7, 11, 5), rand(3, 5)),
correct11=(rand(3, 7, 11, 5), rand(3, 5, 13)),
correct12=(rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
mixed1=(rand(3, 5).astype('float32'),
rand(3, 5, 7)),
mixed2=(rand(3, 5).astype('float64'),
rand(3, 5, 7))),
good=dict(correct1=(rand(3, 5, 7), rand(3, 7, 5)),
correct2=(rand(3, 5, 7), rand(3, 7, 9)),
correct3=(rand(3, 5, 7), rand(3, 7)),
correct4=(rand(3, 5), rand(3, 5, 7)),
correct5=(rand(3), rand(3, 5, 7)),
correct6=(rand(3, 5), rand(3)),
correct7=(rand(3, 5), rand(3, 5)),
correct8=(rand(3), rand(3)),
correct9=(rand(3, 5, 7, 11), rand(3)),
correct10=(rand(3, 7, 11, 5), rand(3, 5)),
correct11=(rand(3, 7, 11, 5), rand(3, 5, 13)),
correct12=(rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
mixed1=(rand(3, 5).astype('float32'),
rand(3, 5, 7)),
mixed2=(rand(3, 5).astype('float64'),
rand(3, 5, 7))),
bad_build=dict(no_batch_axis2=(rand(), rand(3, 5)),
no_batch_axis3=(rand(3, 5), rand())),
bad_runtime=dict(batch_dim_mismatch1=(rand(2, 5, 7), rand(3, 7, 9)),
batch_dim_mismatch2=(rand(3, 5, 7), rand(2, 7, 9)),
batch_dim_mismatch3=(rand(3), rand(5)),
bad_dim1=(rand(3, 5, 7), rand(3, 5, 7)),
bad_dim2=(rand(3, 5, 7), rand(3, 8, 3)),
bad_dim3=(rand(3, 5), rand(3, 7)),
bad_dim4=(rand(3, 5, 7, 11), rand(3, 5)),
bad_dim5=(rand(3, 5, 7, 11), rand(3, 5, 13)),
bad_dim6=(rand(3, 5, 7, 11), rand(3, 13, 5, 17))))
def _numpy_second(x, y): def _numpy_second(x, y):
return numpy.broadcast_arrays(x, y)[1] return numpy.broadcast_arrays(x, y)[1]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论