gpuarray GpuGemmBatch op and tests

3d0b9980 · Tim Cooijmans · 7444fdd6 · 3d0b9980 · 3d0b9980 · 3d0b9980
--- a/theano/sandbox/gpuarray/blas.py
+++ b/theano/sandbox/gpuarray/blas.py
@@ -331,6 +331,92 @@ class GpuDot22(BlasOp):
 gpu_dot22 = GpuDot22()
+class GpuGemmBatch(BlasOp):
+    """
+    Batched matrix-matrix product with accumulation on the GPU.
+
+    Computes ``alpha * batched_dot(A, B) + beta * C`` where `A`, `B` and `C`
+    are 3d GPU arrays of one common dtype and `alpha`, `beta` are 0-d
+    tensors.  Inputs are ordered ``(C, alpha, A, B, beta)``.
+
+    Parameters
+    ----------
+    inplace : bool
+        When True the op declares (through ``destroy_map``) that output 0
+        may overwrite input 0, i.e. `C`.
+    """
+    __props__ = ('inplace',)
+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if self.inplace:
+            # Output 0 is allowed to reuse (and destroy) input 0, which is C.
+            self.destroy_map = {0: [0]}
+    def make_node(self, C, alpha, A, B, beta):
+        """
+        Build the Apply node after moving A, B, C to a common GPU context.
+
+        The output variable has the same type as `C`.
+        """
+        ctx_name = infer_context_name(C, A, B)
+        A = as_gpuarray_variable(A, ctx_name)
+        B = as_gpuarray_variable(B, ctx_name)
+        C = as_gpuarray_variable(C, ctx_name)
+        # alpha and beta stay host-side tensors; the C code reads them
+        # through PyArray_DATA.
+        alpha = as_tensor_variable(alpha)
+        beta = as_tensor_variable(beta)
+        assert alpha.ndim == 0
+        assert beta.ndim == 0
+        assert A.ndim == 3
+        assert B.ndim == 3
+        assert C.ndim == 3
+        assert A.dtype == B.dtype == C.dtype
+        return Apply(self, [C, alpha, A, B, beta], [C.type()])
+    def perform(self, node, inputs, outputs):
+        """Python implementation: store the result in ``outputs[0][0]``."""
+        C, alpha, A, B, beta = inputs
+        # Only update C in place when the op is in-place AND C reports the
+        # `forc` flag (presumably: C- or F-contiguous -- confirm for
+        # GpuArray); otherwise build a fresh result.
+        if self.inplace and C.flags.forc:
+            C *= beta
+            C += alpha * blas.batched_dot(A, B)
+            outputs[0][0] = C
+        else:
+            outputs[0][0] = alpha * blas.batched_dot(A, B) + beta * C
+    def c_headers(self):
+        """Return the parent op's headers plus ``<gpuarray/blas.h>``."""
+        return super(GpuGemmBatch, self).c_headers() + ['<gpuarray/blas.h>']
+    def c_code(self, node, name, inp, out, sub):
+        """
+        Generate the C code: prepare the output buffer from C, then call
+        ``GpuArray_rgemmBatch_3d`` with the output as accumulator.
+        """
+        vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
+                    beta=inp[4], fail=sub['fail'], name=name)
+        code = """
+        int err;
+        """
+        if self.inplace:
+            # In-place: alias the output to C when C is one contiguous
+            # segment; otherwise fall back to copying C into the output.
+            code += """
+                   if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
+                     %(out)s = theano_try_copy(%(out)s, %(C)s);
+                     if (%(out)s == NULL) {
+                       %(fail)s
+                     }
+                   } else {
+                     Py_XDECREF(%(out)s);
+                     %(out)s = %(C)s;
+                     Py_INCREF(%(out)s);
+                   }
+                   """ % vars
+        else:
+            # Not in-place: always work on a copy of C so the input is
+            # left untouched.
+            code += """
+                   %(out)s = theano_try_copy(%(out)s, %(C)s);
+                   if (%(out)s == NULL) {
+                       %(fail)s
+                   }
+                   """ % vars
+        # The output already holds C's values, so the BLAS call both scales
+        # it by beta and accumulates alpha * A.B into it.  A failure is
+        # reported as a RuntimeError carrying the GpuArray error string.
+        code += """
+        err = GpuArray_rgemmBatch_3d(
+            cb_no_trans, cb_no_trans,
+            ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
+            &%(A)s->ga, &%(B)s->ga,
+            ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
+            &%(out)s->ga, 0);
+        if (err != GA_NO_ERROR) {
+            PyErr_Format(PyExc_RuntimeError,
+                         "%%s", GpuArray_error(&%(A)s->ga, err));
+            %(fail)s;
+        }
+        """ % vars
+        # Optionally append an explicit GpuArray_sync on the output when
+        # the gpuarray.sync config flag is set.
+        if config.gpuarray.sync:
+            code += """
+            GpuArray_sync(&%(out)s->ga);
+            """ % vars
+        return code
+    def c_code_cache_version(self):
+        # NOTE(review): returning None presumably disables caching of the
+        # compiled module for this op -- confirm whether a version tuple
+        # should be returned once the C code is stable.
+        return None
+# Shared op instances: the variant that works on a copy of C and the one
+# that is allowed to overwrite C (see destroy_map in GpuGemmBatch.__init__).
+gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
+gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
 @inplace_allocempty(GpuGemv, 0)
 def local_inplace_gpuagemv(node, inputs):

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -31,8 +31,8 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        GpuSplit, GpuContiguous,
                        GpuAlloc, GpuAllocEmpty, GpuReshape,
                        GpuEye, gpu_join, GpuJoin)
-from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
+from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
-                   gpugemm_no_inplace)
+                   gpugemm_no_inplace, gpugemmbatch_no_inplace)
 from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)
@@ -742,6 +742,14 @@ def local_gpua_gemm(node, context_name):
    return GpuGemm(inplace=node.op.inplace)
+@register_opt('fast_compile')
+@op_lifter([tensor.blas.BatchedDot])
+def local_gpua_gemmbatch(node, context_name):
+    """
+    Lift a BatchedDot node to the GPU as a GpuGemmBatch.
+
+    ``batched_dot(a, b)`` is expressed as ``1.0 * a.b + 0.0 * c`` where `c`
+    is a zero-filled accumulator of shape
+    ``(a.shape[0], a.shape[1], b.shape[2])``.
+    """
+    a, b = node.inputs
+    # Build the accumulator with the operands' dtype.  Without an explicit
+    # dtype tensor.zeros falls back to floatX, and GpuGemmBatch.make_node
+    # asserts A.dtype == B.dtype == C.dtype, so any non-floatX input
+    # (e.g. float64 with floatX=float32) would make the lifting fail.
+    c = tensor.zeros((a.shape[0], a.shape[1], b.shape[2]), dtype=a.dtype)
+    return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
 @register_opt('fast_compile')
 @op_lifter([tensor.basic.Dot])
 def local_gpua_hgemm(node, context_name):
@@ -774,6 +782,18 @@ def local_gpuagemm_output_merge(node, *inputs):
    return [gpugemm_no_inplace(*inputs)]
+@register_opt()
+@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
+def local_gpuagemmbatch_alpha_merge(node, *inputs):
+    """
+    Rebuild a non-inplace GpuGemmBatch from the inputs supplied by the
+    `alpha_merge` decorator.
+
+    The decorator is told that inputs 1 and 4 of GpuGemmBatch are alpha and
+    beta.  NOTE(review): presumably it folds a surrounding scalar
+    multiplication into those two inputs before calling this function --
+    confirm against `alpha_merge`.
+    """
+    return [gpugemmbatch_no_inplace(*inputs)]
+@register_opt()
+@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
+def local_gpuagemmbatch_output_merge(node, *inputs):
+    """
+    Rebuild a non-inplace GpuGemmBatch from the inputs supplied by the
+    `output_merge` decorator.
+
+    The decorator is told that input 0 is the accumulator (C) and inputs 1
+    and 4 are alpha and beta.  NOTE(review): presumably it merges an
+    addition on the op's result into the accumulator input -- confirm
+    against `output_merge`.
+    """
+    return [gpugemmbatch_no_inplace(*inputs)]
 @register_opt('fast_compile')
 @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
 def local_gpua_ger(node, context_name):

--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
 from __future__ import absolute_import, print_function, division
 from unittest import TestCase
 from nose.plugins.skip import SkipTest
+import itertools
 import numpy
 import theano
 from theano import tensor
 from theano.tests import unittest_tools as utt
-from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
+from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
 from theano.tensor.tests.test_blas import TestGer, BaseGemv
 from .. import gpuarray_shared_constructor
@@ -15,7 +16,7 @@ from .config import mode_with_gpu
 from .test_basic_ops import makeTester, rand
 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
-                    gpugemm_inplace,
+                    gpugemm_inplace, gpugemmbatch_no_inplace,
                    gpuger_inplace, gpuger_no_inplace,
                    GpuGer, gpu_dot22, GpuGemm)
@@ -68,6 +69,16 @@ GpuGemmTester = makeTester(
    )
+# Compare gpugemmbatch_no_inplace against the reference expression
+# alpha * batched_dot(x, y) + beta * z.  One case is generated per
+# 4-element combination of the sizes below; since combinations() keeps the
+# input order, b < m < k < n always holds and all four dimensions differ,
+# so z is (b, m, n), x is (b, m, k) and y is (b, k, n) with no equal axes.
+GpuGemmBatchTester = makeTester(
+    'GpuGemmBatchTester',
+    op=lambda z, alpha, x, y, beta: alpha * batched_dot(x, y) + beta * z,
+    gpu_op=gpugemmbatch_no_inplace,
+    cases=dict(
+        ("test_b%im%ik%in%i" % (b, m, k, n),
+         [rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()])
+        for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)))
 class TestGpuSger(TestGer):
    def setUp(self):
        self.mode = mode_with_gpu

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -2434,6 +2434,7 @@ class BatchedDot(Op):
        xshp, yshp = shapes
        return [xshp[:-1] + yshp[2:]]
+# Module-level BatchedDot instance, usable as ``batched_dot(x, y)``.
+batched_dot = BatchedDot()
 # from opt import register_specialize, register_canonicalize
 # @register_specialize