Merge pull request #3508 from cooijmanstim/batched_gemm

WIP: BatchedDotOp

Merge pull request #3508 from cooijmanstim/batched_gemm
f87853c5 · carriepl · 4800a51c · 264d8bf2 · f87853c5 · f87853c5
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -212,6 +212,10 @@ class BatchedDotOp(GpuOp):
    def c_code_cache_version(self):
        return (1,)

+    def infer_shape(self, node, shapes):
+        xshp, yshp = shapes
+        return [xshp[:-1] + yshp[2:]]
+
 batched_dot = BatchedDotOp()
 """
 Call cublasSgemmBatched. Take 2 3d tensor as input.

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -33,7 +33,7 @@ from theano.sandbox.cuda.basic_ops import (
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.blas import (
    gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
-    GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
+    BatchedDotOp, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
    GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)

 from theano.sandbox.cuda.blas import gpu_gemv_inplace
@@ -156,7 +156,7 @@ cpu_ops_moved_to_gpu = [
    tensor.Reshape, tensor.flatten, tensor.Subtensor,
    tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
    tensor.IncSubtensor, tensor.Shape, tensor.Join,
-    tensor.Alloc, tensor.Eye]
+    tensor.Alloc, tensor.Eye, tensor.BatchedDot]


 class InputToGpuOptimizer(Optimizer):
@@ -613,6 +613,45 @@ def local_gpu_dot22(node):
    return False


+@register_opt()
+@local_optimizer([gpu_from_host, tensor.BatchedDot])
+def local_gpu_batched_dot(node):
+    """
+    gpu_from_host(batched_dot) -> gpu_batched_dot(gpu_from_host)
+
+    batched_dot(host_from_gpu) -> host_from_gpu(gpu_batched_dot)
+
+    """
+    def gpu_batched_dot(x, y):
+        # pad x and y shapes to be third-order tensors
+        x_, y_ = x, y
+        if x.ndim == 2:
+            x_ = x_.dimshuffle(0, "x", 1)
+        if y.ndim == 2:
+            y_ = y_.dimshuffle(0, 1, "x")
+        z = BatchedDotOp()(as_cuda_ndarray_variable(x_),
+                           as_cuda_ndarray_variable(y_))
+        # unpad z shape
+        if x.ndim == 2:
+            z = z.dimshuffle(0, *range(2, z.ndim))
+        if y.ndim == 2:
+            z = z.dimshuffle(*range(z.ndim - 1))
+        return as_cuda_ndarray_variable(z)
+
+    if isinstance(node.op, GpuFromHost):
+        host_input = node.inputs[0]
+        if host_input.owner and isinstance(host_input.owner.op,
+                                           tensor.BatchedDot):
+            x, y = host_input.owner.inputs
+            return [gpu_batched_dot(x, y)]
+    if isinstance(node.op, tensor.BatchedDot):
+        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
+                for i in node.inputs]):
+            x, y = node.inputs
+            return [host_from_gpu(gpu_batched_dot(x, y))]
+    return False
+
+
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
 def local_gpu_dot22scalar(node):

--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -23,7 +23,7 @@ import theano.compile.mode
 from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
-from theano.sandbox.cuda.blas import batched_dot
+from theano.sandbox.cuda.blas import batched_dot, BatchedDotOp

 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
@@ -44,7 +44,7 @@ def my_rand(*shape):
    return theano._asarray(numpy.random.rand(*shape), dtype='float32')


-class TestBatchedDot(TestCase):
+class TestBatchedDot(unittest_tools.InferShapeTester):

    def test_batched_dot_correctness(self):

@@ -114,6 +114,17 @@ class TestBatchedDot(TestCase):
             numpy.random.randn(5,2,6).astype(numpy.float32)],
            mode=mode_with_gpu)

+    def test_infer_shape(self):
+        # only matrix/matrix is supported
+        admat = tensor.ftensor3()
+        bdmat = tensor.ftensor3()
+        admat_val = my_rand(7, 4, 5)
+        bdmat_val = my_rand(7, 5, 3)
+        self._compile_and_check([admat, bdmat],
+                                [BatchedDotOp()(admat, bdmat)],
+                                [admat_val, bdmat_val],
+                                BatchedDotOp)
+

 def test_dot22():
    def cmp(a_shp, b_shp):

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -3373,52 +3373,52 @@ def transpose(x, axes=None):
    return ret


-def batched_dot(x, y):
+def batched_dot(a, b):
    """
-    This function computes the dot product between the two tensors, by
-    iterating over the first dimension using scan.
+    Compute the batched dot product of two variables:

-    Parameters
-    ----------
-    x : tensor
-        A Tensor with sizes e.g.: for  3D (dim1, dim3, dim2).
-    y : tensor
-        A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4).
+        batched_dot(a, b)[i] = dot(a[i], b[i])

-    Returns
-    -------
-    tensor
-        A tensor of size e.g. if it is 3D: (dim1, dim3, dim4).
+    Note that this batched_dot function does one of three things, in the
+    following sequence:

-    Notes
-    -----
-    This is a subset of numpy.einsum, but we do not provide it for now.
-    But numpy einsum is slower than dot or tensordot:
-    http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
+        1.  If either a or b is a vector, it returns the batched elementwise
+            product without calling the Theano BatchedDot op.

-    Examples
-    --------
-    >>> first = tensor.tensor3('first')
-    >>> second = tensor.tensor3('second')
-    >>> result = batched_dot(first, second)
+        2.  If both a and b have either 2 or 3 dimensions, it calls Theano's
+            BatchedDot op on a and b.

+        3.  If either a or b has more than 3 dimensions, it calls Theano's
+            batched_tensordot function with appropriate axes. The
+            batched_tensordot function expresses high-dimensional batched
+            dot products in terms of batched matrix-matrix dot products, so
+            it may be possible to futherize optimize for performance.
    """
-    result, updates = theano.scan(
-        fn=lambda x_mat, y_mat:
-        theano.tensor.dot(x_mat, y_mat),
-        outputs_info=None,
-        sequences=[x, y],
-        non_sequences=None)
-    return result
+    a, b = as_tensor_variable(a), as_tensor_variable(b)
+
+    if a.ndim == 0:
+        raise TypeError("a must have at least one (batch) axis")
+    elif b.ndim == 0:
+        raise TypeError("b must have at least one (batch) axis")
+    elif a.ndim == 1:
+        return a.dimshuffle(*([0] + ["x"] * (b.ndim - 1))) * b
+    elif b.ndim == 1:
+        return a * b.dimshuffle(*([0] + ["x"] * (a.ndim - 1)))
+    elif a.ndim > 3 or b.ndim > 3:
+        return batched_tensordot(
+            a, b, [[a.ndim - 1], [numpy.maximum(1, b.ndim - 2)]])
+    else:
+        # avoid circular import
+        return theano.tensor.blas.BatchedDot()(a, b)


 def batched_tensordot(x, y, axes=2):
    """
-    Compute the tensordot product.
+    Compute a batched tensordot product.

-    A hybrid of batch_dot and tensordot, this function computes the
+    A hybrid of batched_dot and tensordot, this function computes the
    tensordot product between the two tensors, by iterating over the
-    first dimension using scan to perform a sequence of tensordots.
+    first dimension to perform a sequence of tensordots.

    Parameters
    ----------

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -31,9 +31,9 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
        horizontal_stack, vertical_stack, argmax, get_vector_length,
        fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast,
        alloc, as_tensor_variable, tensor_from_scalar, ARange, autocast_float,
-        clip, constant, default, dot,
-        dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation, Flatten,
-        tensor4, permute_row_elements, fmatrix, fscalars, grad,
+        clip, constant, default, dot, batched_dot,
+        dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation,
+        tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad,
        inplace, iscalar, matrix, minimum, matrices, maximum, mul, neq,
        Reshape, row, scalar, scalars, second, smallest, stack, sub, Tensor,
        tensor_copy, tensordot, TensorType, Tri, tri, tril, triu, unbroadcast,
@@ -1938,6 +1938,59 @@ DotTester = makeTester(name='DotTester',
                        bad_runtime=dict(bad1=(rand(5, 7), rand(5, 7)),
                                         bad2=(rand(5, 7), rand(8, 3))))

+BatchedDotTester = makeTester(
+    name='BatchedDotTester',
+    op=batched_dot,
+    expected=(lambda xs, ys:
+        numpy.asarray(
+            list(x * y if x.ndim == 0 or y.ndim == 0 else numpy.dot(x, y)
+                 for x, y in zip(xs, ys)),
+            dtype=theano.scalar.upcast(xs.dtype, ys.dtype))),
+    checks={},
+    grad=dict(correct1=(rand(3, 5, 7), rand(3, 7, 5)),
+              correct2=(rand(3, 5, 7), rand(3, 7, 9)),
+              correct3=(rand(3, 5, 7), rand(3, 7)),
+              correct4=(rand(3, 5), rand(3, 5, 7)),
+              correct5=(rand(3), rand(3, 5, 7)),
+              correct6=(rand(3, 5), rand(3)),
+              correct7=(rand(3, 5), rand(3, 5)),
+              correct8=(rand(3), rand(3)),
+              correct9=(rand(3, 5, 7, 11), rand(3)),
+              correct10=(rand(3, 7, 11, 5), rand(3, 5)),
+              correct11=(rand(3, 7, 11, 5), rand(3, 5, 13)),
+              correct12=(rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
+              mixed1=(rand(3, 5).astype('float32'),
+                      rand(3, 5, 7)),
+              mixed2=(rand(3, 5).astype('float64'),
+                      rand(3, 5, 7))),
+    good=dict(correct1=(rand(3, 5, 7), rand(3, 7, 5)),
+              correct2=(rand(3, 5, 7), rand(3, 7, 9)),
+              correct3=(rand(3, 5, 7), rand(3, 7)),
+              correct4=(rand(3, 5), rand(3, 5, 7)),
+              correct5=(rand(3), rand(3, 5, 7)),
+              correct6=(rand(3, 5), rand(3)),
+              correct7=(rand(3, 5), rand(3, 5)),
+              correct8=(rand(3), rand(3)),
+              correct9=(rand(3, 5, 7, 11), rand(3)),
+              correct10=(rand(3, 7, 11, 5), rand(3, 5)),
+              correct11=(rand(3, 7, 11, 5), rand(3, 5, 13)),
+              correct12=(rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
+              mixed1=(rand(3, 5).astype('float32'),
+                      rand(3, 5, 7)),
+              mixed2=(rand(3, 5).astype('float64'),
+                      rand(3, 5, 7))),
+    bad_build=dict(no_batch_axis2=(rand(), rand(3, 5)),
+                   no_batch_axis3=(rand(3, 5), rand())),
+    bad_runtime=dict(batch_dim_mismatch1=(rand(2, 5, 7), rand(3, 7, 9)),
+                     batch_dim_mismatch2=(rand(3, 5, 7), rand(2, 7, 9)),
+                     batch_dim_mismatch3=(rand(3), rand(5)),
+                     bad_dim1=(rand(3, 5, 7), rand(3, 5, 7)),
+                     bad_dim2=(rand(3, 5, 7), rand(3, 8, 3)),
+                     bad_dim3=(rand(3, 5), rand(3, 7)),
+                     bad_dim4=(rand(3, 5, 7, 11), rand(3, 5)),
+                     bad_dim5=(rand(3, 5, 7, 11), rand(3, 5, 13)),
+                     bad_dim6=(rand(3, 5, 7, 11), rand(3, 13, 5, 17))))
+

 def _numpy_second(x, y):
    return numpy.broadcast_arrays(x, y)[1]