Merge pull request #3508 from cooijmanstim/batched_gemm

WIP: BatchedDotOp

Merge pull request #3508 from cooijmanstim/batched_gemm
f87853c5 · carriepl · 4800a51c · 264d8bf2 · f87853c5 · f87853c5
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -212,6 +212,10 @@ class BatchedDotOp(GpuOp):
    def c_code_cache_version(self):
        return (1,)
+    def infer_shape(self, node, shapes):
+        xshp, yshp = shapes
+        return [xshp[:-1] + yshp[2:]]
 batched_dot = BatchedDotOp()
 """
 Call cublasSgemmBatched. Take 2 3d tensor as input.

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -33,7 +33,7 @@ from theano.sandbox.cuda.basic_ops import (
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.blas import (
    gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
-    GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
+    BatchedDotOp, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
    GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
 from theano.sandbox.cuda.blas import gpu_gemv_inplace
@@ -156,7 +156,7 @@ cpu_ops_moved_to_gpu = [
    tensor.Reshape, tensor.flatten, tensor.Subtensor,
    tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
    tensor.IncSubtensor, tensor.Shape, tensor.Join,
-    tensor.Alloc, tensor.Eye]
+    tensor.Alloc, tensor.Eye, tensor.BatchedDot]
 class InputToGpuOptimizer(Optimizer):
@@ -613,6 +613,45 @@ def local_gpu_dot22(node):
    return False
+@register_opt()
+@local_optimizer([gpu_from_host, tensor.BatchedDot])
+def local_gpu_batched_dot(node):
+    """
+    gpu_from_host(batched_dot) -> gpu_batched_dot(gpu_from_host)
+    batched_dot(host_from_gpu) -> host_from_gpu(gpu_batched_dot)
+    """
+    def gpu_batched_dot(x, y):
+        # pad x and y shapes to be third-order tensors
+        x_, y_ = x, y
+        if x.ndim == 2:
+            x_ = x_.dimshuffle(0, "x", 1)
+        if y.ndim == 2:
+            y_ = y_.dimshuffle(0, 1, "x")
+        z = BatchedDotOp()(as_cuda_ndarray_variable(x_),
+                           as_cuda_ndarray_variable(y_))
+        # unpad z shape
+        if x.ndim == 2:
+            z = z.dimshuffle(0, *range(2, z.ndim))
+        if y.ndim == 2:
+            z = z.dimshuffle(*range(z.ndim - 1))
+        return as_cuda_ndarray_variable(z)
+    if isinstance(node.op, GpuFromHost):
+        host_input = node.inputs[0]
+        if host_input.owner and isinstance(host_input.owner.op,
+                                           tensor.BatchedDot):
+            x, y = host_input.owner.inputs
+            return [gpu_batched_dot(x, y)]
+    if isinstance(node.op, tensor.BatchedDot):
+        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
+                for i in node.inputs]):
+            x, y = node.inputs
+            return [host_from_gpu(gpu_batched_dot(x, y))]
+    return False
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
 def local_gpu_dot22scalar(node):

--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -23,7 +23,7 @@ import theano.compile.mode
 from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
-from theano.sandbox.cuda.blas import batched_dot
+from theano.sandbox.cuda.blas import batched_dot, BatchedDotOp
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
@@ -44,7 +44,7 @@ def my_rand(*shape):
    return theano._asarray(numpy.random.rand(*shape), dtype='float32')
-class TestBatchedDot(TestCase):
+class TestBatchedDot(unittest_tools.InferShapeTester):
    def test_batched_dot_correctness(self):
@@ -114,6 +114,17 @@ class TestBatchedDot(TestCase):
             numpy.random.randn(5,2,6).astype(numpy.float32)],
            mode=mode_with_gpu)
+    def test_infer_shape(self):
+        # only matrix/matrix is supported
+        admat = tensor.ftensor3()
+        bdmat = tensor.ftensor3()
+        admat_val = my_rand(7, 4, 5)
+        bdmat_val = my_rand(7, 5, 3)
+        self._compile_and_check([admat, bdmat],
+                                [BatchedDotOp()(admat, bdmat)],
+                                [admat_val, bdmat_val],
+                                BatchedDotOp)
 def test_dot22():
    def cmp(a_shp, b_shp):

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -3373,52 +3373,52 @@ def transpose(x, axes=None):
    return ret
-def batched_dot(x, y):
+def batched_dot(a, b):
    """
-    This function computes the dot product between the two tensors, by
+    Compute the batched dot product of two variables:
-    iterating over the first dimension using scan.
-    Parameters
+        batched_dot(a, b)[i] = dot(a[i], b[i])
-    ----------
-    x : tensor
-        A Tensor with sizes e.g.: for  3D (dim1, dim3, dim2).
-    y : tensor
-        A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4).
-    Returns
+    Note that this batched_dot function does one of three things, in the
-    -------
+    following sequence:
-    tensor
-        A tensor of size e.g. if it is 3D: (dim1, dim3, dim4).
-    Notes
+        1.  If either a or b is a vector, it returns the batched elementwise
-    -----
+            product without calling the Theano BatchedDot op.
-    This is a subset of numpy.einsum, but we do not provide it for now.
-    But numpy einsum is slower than dot or tensordot:
-    http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
-    Examples
+        2.  If both a and b have either 2 or 3 dimensions, it calls Theano's
-    --------
+            BatchedDot op on a and b.
-    >>> first = tensor.tensor3('first')
-    >>> second = tensor.tensor3('second')
-    >>> result = batched_dot(first, second)
+        3.  If either a or b has more than 3 dimensions, it calls Theano's
+            batched_tensordot function with appropriate axes. The
+            batched_tensordot function expresses high-dimensional batched
+            dot products in terms of batched matrix-matrix dot products, so
+            it may be possible to futherize optimize for performance.
    """
-    result, updates = theano.scan(
+    a, b = as_tensor_variable(a), as_tensor_variable(b)
-        fn=lambda x_mat, y_mat:
-        theano.tensor.dot(x_mat, y_mat),
+    if a.ndim == 0:
-        outputs_info=None,
+        raise TypeError("a must have at least one (batch) axis")
-        sequences=[x, y],
+    elif b.ndim == 0:
-        non_sequences=None)
+        raise TypeError("b must have at least one (batch) axis")
-    return result
+    elif a.ndim == 1:
+        return a.dimshuffle(*([0] + ["x"] * (b.ndim - 1))) * b
+    elif b.ndim == 1:
+        return a * b.dimshuffle(*([0] + ["x"] * (a.ndim - 1)))
+    elif a.ndim > 3 or b.ndim > 3:
+        return batched_tensordot(
+            a, b, [[a.ndim - 1], [numpy.maximum(1, b.ndim - 2)]])
+    else:
+        # avoid circular import
+        return theano.tensor.blas.BatchedDot()(a, b)
 def batched_tensordot(x, y, axes=2):
    """
-    Compute the tensordot product.
+    Compute a batched tensordot product.
-    A hybrid of batch_dot and tensordot, this function computes the
+    A hybrid of batched_dot and tensordot, this function computes the
    tensordot product between the two tensors, by iterating over the
-    first dimension using scan to perform a sequence of tensordots.
+    first dimension to perform a sequence of tensordots.
    Parameters
    ----------

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -2188,6 +2188,440 @@ blas_optdb.register('local_dot22_to_dot22scalar',
                    11, 'fast_run')
+class BatchedDot(Op):
+    """
+    Computes the batched dot product of two variables:
+        batched_dot(a, b)[i] = dot(a[i], b[i])
+    """
+    __props__ = ()
+    def make_node(self, *inputs):
+        inputs = list(map(T.as_tensor_variable, inputs))
+        if len(inputs) != 2:
+            raise TypeError("theano.tensor.blas.BatchedDot: 2 arguments"
+                            " required, %d given " % len(inputs))
+        if inputs[0].ndim not in (2, 3):
+            raise TypeError("theano.tensor.blas.BatchedDot: input 0 (0-indexed)"
+                            " must have ndim of 2 or 3, %d given. Consider"
+                            " calling theano.tensor.batched_dot instead."
+                            % inputs[0].ndim)
+        if inputs[1].ndim not in (2, 3):
+            raise TypeError("theano.tensor.blas.BatchedDot: input 1 (0-indexed)"
+                            " must have ndim of 2 or 3, %d given. Consider"
+                            " calling theano.tensor.batched_dot instead."
+                            % inputs[1].ndim)
+        dtype = theano.scalar.upcast(*[input.type.dtype for input in inputs])
+        # upcast inputs to common dtype if needed
+        upcasted_inputs = [T.cast(input, dtype) for input in inputs]
+        broadcastable = ((inputs[0].type.broadcastable[0] or
+                          inputs[1].type.broadcastable[0],) +
+                         inputs[0].type.broadcastable[1:-1] +
+                         inputs[1].type.broadcastable[2:])
+        return Apply(self, upcasted_inputs, [T.tensor(dtype, broadcastable)])
+    def perform(self, node, inp, out):
+        x, y = inp
+        z, = out
+        if x.shape[0] != y.shape[0]:
+            raise TypeError(
+                "theano.tensor.blas.BatchedDot: inputs [%s] must have the"
+                " same size in axis 0, but have sizes [%s]." %
+                (", ".join(map(str, inp)),
+                 ", ".join([str(i.shape[0]) for i in inp])))
+        shape = self.infer_shape(node, [i.shape for i in inp])[0]
+        dtype = node.outputs[0].dtype
+        z0 = z[0] = numpy.empty(shape, dtype=dtype)
+        for i in xrange(z0.shape[0]):
+            z0[i] = numpy.dot(x[i], y[i])
+    def c_support_code(self):
+        batch_gemm_defn = """
+        template<typename dtype, typename function>
+        bool batch_gemm(function gemm, int type_size,
+                        PyArrayObject* xs, PyArrayObject* ys, PyArrayObject* zs) {
+            npy_intp *Nx = PyArray_DIMS(xs), *Sx = PyArray_STRIDES(xs);
+            npy_intp *Ny = PyArray_DIMS(ys), *Sy = PyArray_STRIDES(ys);
+            npy_intp *Nz = PyArray_DIMS(zs), *Sz = PyArray_STRIDES(zs);
+            if (Nx[0] != Ny[0]) {
+                PyErr_Format(PyExc_ValueError,
+                             "Shape mismatch: batch sizes unequal."
+                             " x.shape is (%d, %d, %d),"
+                             " y.shape is (%d, %d, %d).",
+                             Nx[0], Nx[1], Nx[2],
+                             Ny[0], Ny[1], Ny[2]);
+                return 1;
+            }
+            if (Nx[2] != Ny[1]) {
+                PyErr_Format(PyExc_ValueError,
+                             "Shape mismatch: summation axis sizes unequal."
+                             " x.shape is (%d, %d, %d),"
+                             " y.shape is (%d, %d, %d).",
+                             Nx[0], Nx[1], Nx[2],
+                             Ny[0], Ny[1], Ny[2]);
+                return 1;
+            }
+            /* encode the stride structure of _x,_y,_z into a single integer. */
+            int unit = 0;
+            unit |= ((Sx[2] == type_size || Nx[2] == 1) ? 0x0 : (Sx[1] == type_size || Nx[1]==1) ? 0x1 : 0x2) << 8;
+            unit |= ((Sy[2] == type_size || Ny[2] == 1) ? 0x0 : (Sy[1] == type_size || Ny[1]==1) ? 0x1 : 0x2) << 4;
+            unit |= ((Sz[2] == type_size || Nz[2] == 1) ? 0x0 : (Sz[1] == type_size || Nz[1]==1) ? 0x1 : 0x2) << 0;
+            /* create appropriate strides for malformed matrices that are row or column
+             * vectors, or empty matrices.
+             * In that case, the value of the stride does not really matter, but
+             * some versions of BLAS insist that:
+             *  - they are not smaller than the number of elements in the array,
+             *  - they are not 0.
+             */
+            int sx_1 = (Nx[1] > 1) ? Sx[1]/type_size : (Nx[2] + 1);
+            int sx_2 = (Nx[2] > 1) ? Sx[2]/type_size : (Nx[1] + 1);
+            int sy_1 = (Ny[1] > 1) ? Sy[1]/type_size : (Ny[2] + 1);
+            int sy_2 = (Ny[2] > 1) ? Sy[2]/type_size : (Ny[1] + 1);
+            int sz_1 = (Nz[1] > 1) ? Sz[1]/type_size : (Nz[2] + 1);
+            int sz_2 = (Nz[2] > 1) ? Sz[2]/type_size : (Nz[1] + 1);
+            dtype* x = (dtype*)PyArray_DATA(xs);
+            dtype* y = (dtype*)PyArray_DATA(ys);
+            dtype* z = (dtype*)PyArray_DATA(zs);
+            dtype a = 1.0;
+            dtype b = 0.0;
+            char N = 'N';
+            char T = 'T';
+            int Nz1 = Nz[1], Nz2 = Nz[2], Nx2 = Nx[2];
+            // loop over batch axis
+            for (int i = 0; i < Nz[0]; i++) {
+                switch(unit)
+                {
+                    case 0x000: gemm(&N, &N, &Nz2, &Nz1, &Nx2, &a, y, &sy_1, x, &sx_1, &b, z, &sz_1); break;
+                    case 0x100: gemm(&N, &T, &Nz2, &Nz1, &Nx2, &a, y, &sy_1, x, &sx_2, &b, z, &sz_1); break;
+                    case 0x010: gemm(&T, &N, &Nz2, &Nz1, &Nx2, &a, y, &sy_2, x, &sx_1, &b, z, &sz_1); break;
+                    case 0x110: gemm(&T, &T, &Nz2, &Nz1, &Nx2, &a, y, &sy_2, x, &sx_2, &b, z, &sz_1); break;
+                    case 0x001: gemm(&T, &T, &Nz1, &Nz2, &Nx2, &a, x, &sx_1, y, &sy_1, &b, z, &sz_2); break;
+                    case 0x101: gemm(&N, &T, &Nz1, &Nz2, &Nx2, &a, x, &sx_2, y, &sy_1, &b, z, &sz_2); break;
+                    case 0x011: gemm(&T, &N, &Nz1, &Nz2, &Nx2, &a, x, &sx_1, y, &sy_2, &b, z, &sz_2); break;
+                    case 0x111: gemm(&N, &N, &Nz1, &Nz2, &Nx2, &a, x, &sx_2, y, &sy_2, &b, z, &sz_2); break;
+                    default: PyErr_SetString(PyExc_ValueError, "some matrix has no unit stride"); return 1;
+                };
+                x += Sx[0] / type_size;
+                y += Sy[0] / type_size;
+                z += Sz[0] / type_size;
+            }
+            return 0;
+        }
+        """
+        return blas_header_text() + batch_gemm_defn
+    def c_libraries(self):
+        return ldflags()
+    def c_compile_args(self):
+        return ldflags(libs=False, flags=True)
+    def c_lib_dirs(self):
+        return ldflags(libs=False, libs_dir=True)
+    def c_header_dirs(self):
+        return ldflags(libs=False, include_dir=True)
+    def c_code_cleanup(self, node, name, inputs, outputs, sub):
+        return """
+        // clean up views
+        Py_XDECREF(xs); xs = 0;
+        Py_XDECREF(ys); ys = 0;
+        Py_XDECREF(zs); zs = 0;
+        """
+    def c_code(self, node, name, inp, out, sub):
+        _x, _y = inp
+        _z, = out
+        fail = sub["fail"]
+        # generate contiguity condition
+        def contiguous(var, ndim):
+            strides = "PyArray_STRIDES(%s)" % var
+            return " && ".join([
+                " && ".join("{strides}[{i}] > 0 && {strides}[{i}] % type_size == 0"
+                            .format(strides=strides, i=i) for i in range(ndim)),
+                "(%s)" % " || ".join("{strides}[{i}] == type_size"
+                                     .format(strides=strides, i=i) for i in range(ndim)),
+            ])
+        x_ndim, y_ndim, z_ndim = node.inputs[0].ndim, node.inputs[1].ndim, node.outputs[0].ndim
+        # generate code to allocate output based on runtime input shapes
+        z_dims = ["PyArray_DIMS(%s)[0]" % _x]
+        if x_ndim == 3:
+            z_dims.append("PyArray_DIMS(%s)[1]" % _x)
+        if y_ndim == 3:
+            z_dims.append("PyArray_DIMS(%s)[2]" % _y)
+        assert len(z_dims) == z_ndim
+        z_shape_correct = " && ".join("PyArray_DIMS(%s)[%i] == %s"
+                                      % (_z, i, dim) for i, dim in enumerate(z_dims))
+        z_shape = ", ".join(z_dims)
+        z_contiguous = contiguous(_z, z_ndim)
+        allocate = """
+            if (NULL == %(_z)s || !(%(z_shape_correct)s)  || !(%(z_contiguous)s))
+            {
+                npy_intp dims[%(z_ndim)s] = {%(z_shape)s};
+                Py_XDECREF(%(_z)s);
+                %(_z)s = (PyArrayObject*)PyArray_SimpleNew(
+                    %(z_ndim)s, dims, PyArray_TYPE(%(_x)s));
+                if(!%(_z)s) {
+                    PyErr_SetString(PyExc_MemoryError,
+                                    "failed to alloc BatchedDot output");
+                    %(fail)s
+                }
+            }
+        """ % locals()
+        # code to reallocate inputs contiguously if necessary
+        contiguate = []
+        for var, ndim in [(_x, x_ndim), (_y, y_ndim)]:
+            _contiguous = contiguous(var, ndim)
+            contiguate.append("""
+                if (!(%(_contiguous)s)) {
+                    PyArrayObject * _copy = (PyArrayObject *) PyArray_Copy(%(var)s);
+                    if (!_copy)
+                        %(fail)s
+                    Py_XDECREF(%(var)s);
+                    %(var)s = _copy;
+                }
+            """ % locals())
+        contiguate = "\n".join(contiguate)
+        def c_dimshuffle(newname, oldname, shape):
+            _fail = fail
+            _shape = ", ".join("1" if axis is None else "PyArray_DIMS(%s)[%i]" % (oldname, axis)
+                               for axis in shape)
+            return """{
+                npy_intp dims[3] = {%(_shape)s};
+                PyArray_Dims newshape = {dims, 3};
+                %(newname)s = (PyArrayObject*)PyArray_Newshape(%(oldname)s, &newshape, NPY_ANYORDER);
+                if (!%(newname)s)
+                    %(_fail)s
+                // make sure we didn't accidentally copy
+                assert(PyArray_DATA(%(oldname)s) == PyArray_DATA(%(newname)s));
+            }""" % locals()
+        # create tensor3 views for any of x, y, z that are not tensor3, so that
+        # we only need to implement the tensor3-tensor3 batched dot product.
+        # xs, ys and zs will point to these views, or to the original array if
+        # it was already tensor3.
+        # in the latter case, we artificially increase the reference count of
+        # the original array so that the c_code_cleanup method can decref them
+        # all indiscriminately.
+        upcast = []
+        if x_ndim == 3:
+            upcast.append("xs = %(_x)s; Py_XINCREF(xs);")
+        elif x_ndim == 2:
+            upcast.append(c_dimshuffle("xs", _x, (0, None, 1)))
+        if y_ndim == 3:
+            upcast.append("ys = %(_y)s; Py_XINCREF(ys);")
+        elif y_ndim == 2:
+            upcast.append(c_dimshuffle("ys", _y, (0, 1, None)))
+        if z_ndim == 3:
+            upcast.append("zs = %(_z)s; Py_XINCREF(zs);")
+        else:
+            upcast.append(c_dimshuffle(
+                "zs", _z, (0,
+                           None if x_ndim == 2 else 1,
+                           None if y_ndim == 2 else 1)))
+        upcast = "\n".join(upcast) % locals()
+        return """
+        int type_num = PyArray_DESCR(%(_x)s)->type_num;
+        int type_size = PyArray_DESCR(%(_x)s)->elsize; // in bytes
+        // xs, ys, zs will point to views onto %(_x)s, %(_y)s, %(_z)s
+        PyArrayObject *xs = 0, *ys = 0, *zs = 0;
+        if (PyArray_NDIM(%(_x)s) != %(x_ndim)s) {
+            PyErr_Format(PyExc_NotImplementedError,
+                         "rank(x) != %(x_ndim)s. rank(x) is %%d.",
+                         PyArray_NDIM(%(_x)s));
+            %(fail)s;
+        }
+        if (PyArray_NDIM(%(_y)s) != %(y_ndim)s) {
+            PyErr_Format(PyExc_NotImplementedError,
+                         "rank(y) != %(y_ndim)s. rank(y) is %%d.",
+                         PyArray_NDIM(%(_y)s));
+            %(fail)s;
+        }
+        if (%(_z)s && PyArray_NDIM(%(_z)s) != %(z_ndim)s) {
+            PyErr_Format(PyExc_NotImplementedError,
+                         "rank(z) != %(z_ndim)s. rank(z) is %%d.",
+                         PyArray_NDIM(%(_z)s));
+            %(fail)s;
+        }
+        // allocate output
+        %(allocate)s
+        // reallocate any noncontiguous arrays or arrays with invalid strides
+        %(contiguate)s
+        // add dims to make sure everything is tensor3
+        %(upcast)s
+        // from here on, use xs, ys and zs as they are tensor3 and share memory
+        // with the original %(_x)s, %(_y)s and %(_z)s arrays.
+        if ((PyArray_DESCR(xs)->type_num != NPY_DOUBLE)
+            && (PyArray_DESCR(xs)->type_num != NPY_FLOAT))
+        {PyErr_SetString(PyExc_NotImplementedError, "type(x) is not double or float"); %(fail)s;}
+        if ((PyArray_DESCR(ys)->type_num != NPY_DOUBLE)
+            && (PyArray_DESCR(ys)->type_num != NPY_FLOAT))
+        {PyErr_SetString(PyExc_NotImplementedError, "type(y) is not double or float"); %(fail)s;}
+        if ((PyArray_DESCR(zs)->type_num != NPY_DOUBLE)
+            && (PyArray_DESCR(zs)->type_num != NPY_FLOAT))
+        {PyErr_SetString(PyExc_NotImplementedError, "type(z) is not double or float"); %(fail)s;}
+        if ((PyArray_DESCR(xs)->type_num != PyArray_DESCR(ys)->type_num)
+            ||(PyArray_DESCR(xs)->type_num != PyArray_DESCR(zs)->type_num))
+        { PyErr_SetString(PyExc_NotImplementedError, "type(x), type(y), type(z) are not all the same"); %(fail)s; }
+        switch (type_num)
+        {
+            case NPY_FLOAT:
+            if (batch_gemm<float>(sgemm_, type_size, xs, ys, zs)) {
+                %(fail)s;
+            }
+            break;
+            case NPY_DOUBLE:
+            if (batch_gemm<double>(dgemm_, type_size, xs, ys, zs)) {
+                %(fail)s;
+            }
+            break;
+        }
+        """ % locals()
+    def c_code_cache_version(self):
+        from theano.tensor.blas_headers import blas_header_version
+        return (1, blas_header_version())
+    def grad(self, inp, grads):
+        x, y = inp
+        gz, = grads
+        xdim, ydim, gdim = x.type.ndim, y.type.ndim, gz.type.ndim
+        # grad is a vector, so x is a matrix and y is a matrix
+        if gdim == 1:
+            xgrad = gz.dimshuffle(0, 'x') * y
+            ygrad = gz.dimshuffle(0, 'x') * x
+        # x is a matrix, y is a tensor3, grad is a matrix
+        elif xdim == 2 and ydim == 3:
+            xgrad = T.batched_dot(gz, y.dimshuffle(0, 2, 1))
+            ygrad = x.dimshuffle(0, 1, 'x') * gz.dimshuffle(0, 'x', 1)
+        # x is a tensor3, y is a matrix, grad is a matrix
+        elif xdim == 3 and ydim == 2:
+            xgrad = gz.dimshuffle(0, 1, 'x') * y.dimshuffle(0, 'x', 1)
+            ygrad = T.batched_dot(x.dimshuffle(0, 2, 1), gz)
+        # x is a tensor3, y is a tensor3, grad is a tensor3
+        elif xdim == ydim == 3:
+            xgrad = T.batched_dot(gz, y.dimshuffle(0, 2, 1))
+            ygrad = T.batched_dot(x.dimshuffle(0, 2, 1), gz)
+        # If x or y contain broadcastable dimensions but only one of
+        # them know that a matching dimensions is broadcastable, the
+        # above code don't always return the right broadcast pattern.
+        # This cause problem down the road. See gh-1461.
+        if xgrad.broadcastable != x.broadcastable:
+            xgrad = T.patternbroadcast(xgrad, x.broadcastable)
+        if ygrad.broadcastable != y.broadcastable:
+            ygrad = T.patternbroadcast(ygrad, y.broadcastable)
+        return xgrad, ygrad
+    def R_op(self, inputs, eval_points):
+        # R_op for batched_dot(a, b) evaluted at c for a and d for b is
+        # simply batched_dot(c, b) + batched_dot(a, d)
+        assert len(inputs) == 2
+        assert len(eval_points) == 2
+        if eval_points[0] is None and eval_points[1] is None:
+            return [None]
+        debugger_available = config.compute_test_value != 'off'
+        if debugger_available:
+            try:
+                iv0 = theano.gof.op.get_test_value(inputs[0])
+            except AttributeError:
+                theano.gof.op.missing_test_message(
+                    'first input passed to BatchedDot.R_op has no test value')
+                debugger_available = False
+            try:
+                iv1 = theano.gof.op.get_test_value(inputs[1])
+            except AttributeError:
+                theano.gof.op.missing_test_message(
+                    'second input passed to BatchedDot.R_op has no test value')
+                debugger_available = False
+            if eval_points[0]:
+                try:
+                    ev0 = theano.gof.op.get_test_value(eval_points[0])
+                except AttributeError:
+                    theano.gof.op.missing_test_message(
+                        'first eval point passed to BatchedDot.R_op '
+                        'has no test value')
+                    debugger_available = False
+            if eval_points[1]:
+                try:
+                    ev1 = theano.gof.op.get_test_value(eval_points[1])
+                except AttributeError:
+                    theano.gof.op.missing_test_message(
+                        'second eval point passed to BatchedDot.R_op '
+                        'has no test value')
+                    debugger_available = False
+        if debugger_available:
+            input_values = [iv0, iv1]
+            eval_point_values = [ev0, ev1]
+            for i in xrange(2):
+                if eval_point_values[i] is not None and \
+                   input_values[i].shape != eval_point_values[i].shape:
+                    raise ValueError(
+                        'input ' + str(i) + ' and eval_point ' + str(i) +
+                        ' to BatchedDot.R_op should have the same shape, but '
+                        'their shapes are %s and %s, respectively' % (
+                            str(input_values[i].shape),
+                            str(eval_point_values[i].shape)))
+        if eval_points[0]:
+            t1 = self(eval_points[0], inputs[1])
+        if eval_points[1]:
+            t2 = self(inputs[0], eval_points[1])
+        if eval_points[0] and eval_points[1]:
+            return [t1 + t2]
+        elif eval_points[0]:
+            return [t1]
+        else:
+            return [t2]
+    def infer_shape(self, node, shapes):
+        for shape_ in shapes:
+            if len(shape_) not in (2, 3):
+                raise NotImplementedError()
+        xshp, yshp = shapes
+        return [xshp[:-1] + yshp[2:]]
 # from opt import register_specialize, register_canonicalize
 # @register_specialize
 @local_optimizer([T.sub, T.add])

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -31,9 +31,9 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
        horizontal_stack, vertical_stack, argmax, get_vector_length,
        fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast,
        alloc, as_tensor_variable, tensor_from_scalar, ARange, autocast_float,
-        clip, constant, default, dot,
+        clip, constant, default, dot, batched_dot,
-        dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation, Flatten,
+        dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation,
-        tensor4, permute_row_elements, fmatrix, fscalars, grad,
+        tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad,
        inplace, iscalar, matrix, minimum, matrices, maximum, mul, neq,
        Reshape, row, scalar, scalars, second, smallest, stack, sub, Tensor,
        tensor_copy, tensordot, TensorType, Tri, tri, tril, triu, unbroadcast,
@@ -1938,6 +1938,59 @@ DotTester = makeTester(name='DotTester',
                        bad_runtime=dict(bad1=(rand(5, 7), rand(5, 7)),
                                         bad2=(rand(5, 7), rand(8, 3))))
+BatchedDotTester = makeTester(
+    name='BatchedDotTester',
+    op=batched_dot,
+    expected=(lambda xs, ys:
+        numpy.asarray(
+            list(x * y if x.ndim == 0 or y.ndim == 0 else numpy.dot(x, y)
+                 for x, y in zip(xs, ys)),
+            dtype=theano.scalar.upcast(xs.dtype, ys.dtype))),
+    checks={},
+    grad=dict(correct1=(rand(3, 5, 7), rand(3, 7, 5)),
+              correct2=(rand(3, 5, 7), rand(3, 7, 9)),
+              correct3=(rand(3, 5, 7), rand(3, 7)),
+              correct4=(rand(3, 5), rand(3, 5, 7)),
+              correct5=(rand(3), rand(3, 5, 7)),
+              correct6=(rand(3, 5), rand(3)),
+              correct7=(rand(3, 5), rand(3, 5)),
+              correct8=(rand(3), rand(3)),
+              correct9=(rand(3, 5, 7, 11), rand(3)),
+              correct10=(rand(3, 7, 11, 5), rand(3, 5)),
+              correct11=(rand(3, 7, 11, 5), rand(3, 5, 13)),
+              correct12=(rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
+              mixed1=(rand(3, 5).astype('float32'),
+                      rand(3, 5, 7)),
+              mixed2=(rand(3, 5).astype('float64'),
+                      rand(3, 5, 7))),
+    good=dict(correct1=(rand(3, 5, 7), rand(3, 7, 5)),
+              correct2=(rand(3, 5, 7), rand(3, 7, 9)),
+              correct3=(rand(3, 5, 7), rand(3, 7)),
+              correct4=(rand(3, 5), rand(3, 5, 7)),
+              correct5=(rand(3), rand(3, 5, 7)),
+              correct6=(rand(3, 5), rand(3)),
+              correct7=(rand(3, 5), rand(3, 5)),
+              correct8=(rand(3), rand(3)),
+              correct9=(rand(3, 5, 7, 11), rand(3)),
+              correct10=(rand(3, 7, 11, 5), rand(3, 5)),
+              correct11=(rand(3, 7, 11, 5), rand(3, 5, 13)),
+              correct12=(rand(3, 7, 11, 5), rand(3, 13, 5, 17)),
+              mixed1=(rand(3, 5).astype('float32'),
+                      rand(3, 5, 7)),
+              mixed2=(rand(3, 5).astype('float64'),
+                      rand(3, 5, 7))),
+    bad_build=dict(no_batch_axis2=(rand(), rand(3, 5)),
+                   no_batch_axis3=(rand(3, 5), rand())),
+    bad_runtime=dict(batch_dim_mismatch1=(rand(2, 5, 7), rand(3, 7, 9)),
+                     batch_dim_mismatch2=(rand(3, 5, 7), rand(2, 7, 9)),
+                     batch_dim_mismatch3=(rand(3), rand(5)),
+                     bad_dim1=(rand(3, 5, 7), rand(3, 5, 7)),
+                     bad_dim2=(rand(3, 5, 7), rand(3, 8, 3)),
+                     bad_dim3=(rand(3, 5), rand(3, 7)),
+                     bad_dim4=(rand(3, 5, 7, 11), rand(3, 5)),
+                     bad_dim5=(rand(3, 5, 7, 11), rand(3, 5, 13)),
+                     bad_dim6=(rand(3, 5, 7, 11), rand(3, 13, 5, 17))))
 def _numpy_second(x, y):
    return numpy.broadcast_arrays(x, y)[1]