Merge pull request #5950 from lamblin/fix_5730

Add a test for the fix in libgpuarray, plus a fix in the batched_dot GPU optimization (local_gpua_gemmbatch)

Merge pull request #5950 from lamblin/fix_5730
7c07a3ce · Frédéric Bastien · GitHub · 3fad40aa · dd5ea431 · 7c07a3ce
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1208,9 +1208,31 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs):
    if inputs[0].dtype not in ['float32', 'float64']:
        return
    a, b = inputs
-    c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
+    # GpuGemmBatch only supports 3D inputs and output, so view a 2D `a` as
-    return gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=a.dtype),
+    # a batch of row vectors and a 2D `b` as a batch of column vectors,
-                                   a, b, np.asarray(0.0, dtype=a.dtype))
+    # then drop the dims added this way from the output.
+    output_dims = [0, 1, 2]
+    if a.ndim == 2:
+        a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a)
+        del output_dims[1]  # the axis added to `a`
+    if b.ndim == 2:
+        b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b)
+        del output_dims[-1]  # the axis added to `b`
+    # In case of mismatched dtypes, cast the inputs to the output dtype
+    out_dtype = outputs[0].dtype
+    if a.dtype != out_dtype or b.dtype != out_dtype:
+        gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
+        if a.dtype != out_dtype:
+            a = gpu_cast_op(a)
+        if b.dtype != out_dtype:
+            b = gpu_cast_op(b)
+    c = tensor.AllocEmpty(out_dtype)(a.shape[0], a.shape[1], b.shape[2])
+    out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype),
+                                  a, b, np.asarray(0.0, dtype=out_dtype))
+    if len(output_dims) != 3:
+        out = GpuDimShuffle(out.broadcastable, output_dims)(out)
+    return out
 @register_opt()

--- a/theano/gpuarray/tests/test_blas.py
+++ b/theano/gpuarray/tests/test_blas.py
@@ -5,6 +5,7 @@ import itertools
 import numpy as np
 import theano
+from theano import config
 from theano import tensor
 from theano.tests import unittest_tools as utt
 from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
@@ -13,7 +14,6 @@ from theano.tensor.tests.test_blas import TestGer, BaseGemv
 from .. import gpuarray_shared_constructor
 from .config import mode_with_gpu, test_ctx_name
 from .test_basic_ops import makeTester, rand
 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
                    gpugemm_inplace, gpugemm_no_inplace,
                    gpugemmbatch_no_inplace,
@@ -135,6 +135,18 @@ GpuGemmBatchTester = makeTester(
    )
+class TestGpuGemmBatchStrided(TestCase):
+    # Regression test: batched_dot compiled with mode_with_gpu when the
+    # second operand is built from a slice of a larger tensor.
+    def test0(self):
+        # Reported in https://github.com/Theano/Theano/issues/5730
+        x = tensor.tensor3()
+        y = tensor.tensor3()
+        # Take row 0 of every batch entry of y and append a new last axis,
+        # so the second operand is a 3D tensor derived from a slice of y.
+        z = tensor.batched_dot(x, y[:, 0, :, np.newaxis])
+        f = theano.function([x, y], z, mode=mode_with_gpu)
+        x_num = np.arange(32 * 19 * 600, dtype=config.floatX).reshape((32, 19, 600))
+        y_num = np.arange(7 * 32 * 600, dtype=config.floatX).reshape((32, 7, 600))
+        # Smoke test: only checks that the call completes; the result is
+        # not compared against a reference value.
+        f(x_num, y_num)
 class TestGpuSger(TestGer):
    def setUp(self):
        self.mode = mode_with_gpu

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -656,3 +656,27 @@ def test_local_gpua_advanced_incsubtensor():
    w = tensor.set_subtensor(w[tensor.eq(y, 1.0).nonzero()], 100)
    w = tensor.set_subtensor(w[tensor.eq(y, -1.0).nonzero()], 0)
    theano.function([target], w)
+def test_batched_dot_lifter():
+    # The CPU Op accepts 2D and 3D inputs, as well as mixed dtypes.
+    # Make sure the lifter adds the appropriate dimshuffles and casts
+    rng = np.random.RandomState(utt.fetch_seed())
+    # Helper: random array of the given shape, cast to floatX.
+    def randX(*args):
+        return rng.rand(*args).astype(theano.config.floatX)
+    # Cases, in order: 3D x 2D, 2D x 3D, 2D x 2D, then an explicit float32
+    # and an explicit float64 first operand paired with a floatX second one.
+    cases = [
+        (randX(3, 5, 7), randX(3, 7)),
+        (randX(3, 5), randX(3, 5, 7)),
+        (randX(3, 5), randX(3, 5)),
+        (rng.rand(3, 5, 7).astype('float32'), randX(3, 7, 9)),
+        (rng.rand(3, 5, 7).astype('float64'), randX(3, 7, 9))]
+    for x_val, y_val in cases:
+        # Symbolic inputs mirror each value's dtype and rank; an axis is
+        # flagged broadcastable only when its length is 1.
+        x = tensor.TensorType(broadcastable=[s == 1 for s in x_val.shape],
+                              dtype=x_val.dtype)('x')
+        y = tensor.TensorType(broadcastable=[s == 1 for s in y_val.shape],
+                              dtype=y_val.dtype)('y')
+        z = tensor.batched_dot(x, y)
+        f = theano.function([x, y], z, mode=mode_with_gpu)
+        # Smoke test: compiling and running must not raise; the output
+        # values are not checked.
+        f(x_val, y_val)