Commit 57ffd6a0 authored by abergeron

Merge pull request #4371 from cooijmanstim/gpuarray_batched_dot

gpuarray GpuGemmBatch op
......@@ -332,6 +332,84 @@ class GpuDot22(BlasOp):
gpu_dot22 = GpuDot22()
class GpuGemmBatch(BlasOp):
    """
    Batched gemm on the GPU: the three matrix inputs are 3d, and the
    underlying libgpuarray call (``GpuArray_rgemmBatch_3d``) is applied
    along the leading (batch) axis — presumably
    ``out[i] = alpha * dot(A[i], B[i]) + beta * C[i]`` following the
    usual BLAS gemm convention.

    Parameters
    ----------
    inplace : bool
        When True the op may overwrite its C input with the output
        (declared via ``destroy_map``); when False C is always copied.
    """
    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 destroys (reuses the storage of) input 0, i.e. C.
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        """Validate inputs and build the Apply node.

        A, B and C must be 3d GPU arrays of the same dtype in the same
        context; alpha and beta must be scalar (0d) tensors.  The output
        has the same type as C.
        """
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 3
        assert B.ndim == 3
        assert C.ndim == 3
        assert A.dtype == B.dtype == C.dtype
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def c_headers(self):
        # Header declaring the batched BLAS entry points.
        return super(GpuGemmBatch, self).c_headers() + ['<gpuarray/blas.h>']

    def c_code(self, node, name, inp, out, sub):
        """Emit C code that prepares the output buffer and calls
        ``GpuArray_rgemmBatch_3d``."""
        # NOTE(review): 'name' is put into the substitution dict but the
        # templates below never use %(name)s.
        vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name)
        code = """
int err;
"""
        if self.inplace:
            # Inplace: alias C as the output when C is one contiguous
            # segment; otherwise fall back to copying C into the output.
            code += """
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
%(fail)s
}
} else {
Py_XDECREF(%(out)s);
%(out)s = %(C)s;
Py_INCREF(%(out)s);
}
""" % vars
        else:
            # Non-inplace: always work on a copy of C.
            code += """
%(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
%(fail)s
}
""" % vars
        # The scalar alpha/beta are read from 0d numpy arrays; neither
        # A nor B is transposed (cb_no_trans, cb_no_trans).
        code += """
err = GpuArray_rgemmBatch_3d(
cb_no_trans, cb_no_trans,
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
&%(A)s->ga, &%(B)s->ga,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
&%(out)s->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"%%s", GpuArray_error(&%(A)s->ga, err));
%(fail)s;
}
""" % vars
        if config.gpuarray.sync:
            # Synchronize on the output when config.gpuarray.sync is set.
            code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
        return code

    def c_code_cache_version(self):
        # Bump when the generated C code above changes.
        return (1,)
# Module-level singleton instances; optimizations below swap the
# non-inplace variant for the inplace one where that is safe.
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Swap a GpuGemv whose output buffer is freshly allocated for the
    inplace variant, which reuses that buffer."""
    replacement = gpugemv_inplace(*inputs)
    return [replacement]
......
......@@ -31,8 +31,8 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuSplit, GpuContiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
......@@ -742,6 +742,14 @@ def local_gpua_gemm(node, context_name):
return GpuGemm(inplace=node.op.inplace)
@register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot])
def local_gpua_gemmbatch(node, context_name):
    """Lift a CPU BatchedDot(a, b) to GpuGemmBatch.

    Expresses the product as gemm with alpha=1, beta=0 and an
    uninitialized output buffer c of shape (batch, a_rows, b_cols).
    """
    a, b = node.inputs
    # AllocEmpty takes the dtype at construction time and the shape
    # scalars at call time; passing the shape tuple to the constructor
    # (as this code previously did) builds a broken graph.
    c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
    return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node, context_name):
......@@ -774,6 +782,18 @@ def local_gpuagemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)]
@register_opt()
@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
def local_gpuagemmbatch_alpha_merge(node, *inputs):
    """Fold a scalar multiplication of a GpuGemmBatch result into the
    op's alpha/beta scalars (inputs 1 and 4)."""
    merged = gpugemmbatch_no_inplace(*inputs)
    return [merged]
@register_opt()
@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemmbatch_output_merge(node, *inputs):
    """Fold an addition with a GpuGemmBatch result into the op's C
    input (input 0) and its alpha/beta scalars (inputs 1 and 4)."""
    merged = gpugemmbatch_no_inplace(*inputs)
    return [merged]
@register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node, context_name):
......
from __future__ import absolute_import, print_function, division
from unittest import TestCase
from nose.plugins.skip import SkipTest
import itertools
import numpy
import theano
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
......@@ -15,7 +16,7 @@ from .config import mode_with_gpu
from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace,
gpugemm_inplace, gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm)
......@@ -68,6 +69,16 @@ GpuGemmTester = makeTester(
)
# Compares the CPU reference expression against gpugemmbatch_no_inplace
# on a spread of (batch, m, k, n) size combinations.
GpuGemmBatchTester = makeTester(
    'GpuGemmBatchTester',
    op=lambda z, alpha, x, y, beta: alpha * batched_dot(x, y) + beta * z,
    gpu_op=gpugemmbatch_no_inplace,
    cases={
        "test_b%im%ik%in%i" % (b, m, k, n):
            [rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()]
        for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)
    })
class TestGpuSger(TestGer):
def setUp(self):
self.mode = mode_with_gpu
......
......@@ -2434,6 +2434,8 @@ class BatchedDot(Op):
xshp, yshp = shapes
return [xshp[:-1] + yshp[2:]]
# Module-level singleton instance of the BatchedDot op.
batched_dot = BatchedDot()
# from opt import register_specialize, register_canonicalize
# @register_specialize
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment