Commit 57ffd6a0 authored by abergeron

Merge pull request #4371 from cooijmanstim/gpuarray_batched_dot

gpuarray GpuGemmBatch op
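Summary of the change: this merge adds a GpuGemmBatch op that wraps libgpuarray's GpuArray_rgemmBatch_3d, an optimization that lifts tensor.blas.BatchedDot onto the new op, alpha/output merge rewrites for it, and a makeTester-based test; it also exposes a module-level batched_dot instance in theano.tensor.blas.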
@@ -332,6 +332,84 @@ class GpuDot22(BlasOp):
gpu_dot22 = GpuDot22()

class GpuGemmBatch(BlasOp):
    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 3
        assert B.ndim == 3
        assert C.ndim == 3
        assert A.dtype == B.dtype == C.dtype
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def c_headers(self):
        return super(GpuGemmBatch, self).c_headers() + ['<gpuarray/blas.h>']

    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name)
        code = """
        int err;
        """
        if self.inplace:
            code += """
        if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
          %(out)s = theano_try_copy(%(out)s, %(C)s);
          if (%(out)s == NULL) {
            %(fail)s
          }
        } else {
          Py_XDECREF(%(out)s);
          %(out)s = %(C)s;
          Py_INCREF(%(out)s);
        }
        """ % vars
        else:
            code += """
        %(out)s = theano_try_copy(%(out)s, %(C)s);
        if (%(out)s == NULL) {
          %(fail)s
        }
        """ % vars
        code += """
        err = GpuArray_rgemmBatch_3d(
            cb_no_trans, cb_no_trans,
            ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
            &%(A)s->ga, &%(B)s->ga,
            ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
            &%(out)s->ga, 0);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "%%s", GpuArray_error(&%(A)s->ga, err));
          %(fail)s;
        }
        """ % vars
        if config.gpuarray.sync:
            code += """
        GpuArray_sync(&%(out)s->ga);
        """ % vars
        return code

    def c_code_cache_version(self):
        return (1,)

gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
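For reference, the op computes a batched GEMM over stacks of matrices: with A of shape (b, m, k), B of shape (b, k, n) and C of shape (b, m, n), each output slice is alpha * dot(A[i], B[i]) + beta * C[i]. A minimal NumPy sketch of these semantics (the helper name is ours, not part of the commit):

import numpy

def gemm_batch_reference(C, alpha, A, B, beta):
    # out[i] = alpha * (A[i] @ B[i]) + beta * C[i] for every batch index i
    out = numpy.empty_like(C)
    for i in range(C.shape[0]):
        out[i] = alpha * numpy.dot(A[i], B[i]) + beta * C[i]
    return out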
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    return [gpugemv_inplace(*inputs)]
...
@@ -31,8 +31,8 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        GpuSplit, GpuContiguous,
                        GpuAlloc, GpuAllocEmpty, GpuReshape,
                        GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
                   gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)

@@ -742,6 +742,14 @@ def local_gpua_gemm(node, context_name):
    return GpuGemm(inplace=node.op.inplace)
@register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot])
def local_gpua_gemmbatch(node, context_name):
    a, b = node.inputs
    # AllocEmpty is parameterized by dtype; the shape goes to the call
    c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
    return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
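A hypothetical check of this lifter (ours, not part of the diff; it assumes mode_with_gpu from the test config and a working GPU context): compiling a BatchedDot graph under the GPU mode should leave a GpuGemmBatch node in the optimized graph.

import theano
from theano import tensor
from theano.tensor.blas import batched_dot
from theano.gpuarray.blas import GpuGemmBatch
from theano.gpuarray.tests.config import mode_with_gpu

x = tensor.tensor3('x')
y = tensor.tensor3('y')
f = theano.function([x, y], batched_dot(x, y), mode=mode_with_gpu)
# the lifter should have replaced BatchedDot with GpuGemmBatch
assert any(isinstance(n.op, GpuGemmBatch)
           for n in f.maker.fgraph.toposort())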
@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node, context_name):
...
@@ -774,6 +782,18 @@ def local_gpuagemm_output_merge(node, *inputs):
    return [gpugemm_no_inplace(*inputs)]
@register_opt()
@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
def local_gpuagemmbatch_alpha_merge(node, *inputs):
    return [gpugemmbatch_no_inplace(*inputs)]

@register_opt()
@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemmbatch_output_merge(node, *inputs):
    return [gpugemmbatch_no_inplace(*inputs)]
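In rough terms (our paraphrase of the alpha_merge/output_merge helpers, not wording from the diff): alpha_merge folds a scalar multiplication of the op's output into the alpha and beta scalars, and output_merge folds an elementwise addition with the output into the C input, along the lines of:

# s * gemmbatch(C, a, A, B, b)       ->  gemmbatch(C, s * a, A, B, s * b)
# gemmbatch(empty, a, A, B, 0) + D   ->  gemmbatch(D, a, A, B, 1)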
@register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node, context_name):
...
from __future__ import absolute_import, print_function, division
from unittest import TestCase
from nose.plugins.skip import SkipTest
import itertools
import numpy
import theano
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
@@ -15,7 +16,7 @@ from .config import mode_with_gpu
from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
                    gpugemm_inplace, gpugemmbatch_no_inplace,
                    gpuger_inplace, gpuger_no_inplace,
                    GpuGer, gpu_dot22, GpuGemm)
@@ -68,6 +69,16 @@ GpuGemmTester = makeTester(
)
GpuGemmBatchTester = makeTester(
    'GpuGemmBatchTester',
    op=lambda z, alpha, x, y, beta: alpha * batched_dot(x, y) + beta * z,
    gpu_op=gpugemmbatch_no_inplace,
    cases=dict(
        ("test_b%im%ik%in%i" % (b, m, k, n),
         [rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()])
        for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)))
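Since itertools.combinations yields its 4-tuples in sorted order, each of the 15 generated cases has four pairwise distinct dimensions, so a mixed-up batch, row, inner, or column axis cannot silently produce a matching output shape.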
class TestGpuSger(TestGer):
    def setUp(self):
        self.mode = mode_with_gpu
...
@@ -2434,6 +2434,8 @@ class BatchedDot(Op):
        xshp, yshp = shapes
        return [xshp[:-1] + yshp[2:]]
batched_dot = BatchedDot()
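A small usage sketch of the new module-level instance (ours, not part of the commit): for 3-D inputs, batched_dot multiplies corresponding matrix slices along the leading batch axis.

import numpy
import theano
from theano import tensor
from theano.tensor.blas import batched_dot

x = tensor.tensor3('x')
y = tensor.tensor3('y')
f = theano.function([x, y], batched_dot(x, y))

a = numpy.random.rand(5, 2, 3).astype(theano.config.floatX)
b = numpy.random.rand(5, 3, 4).astype(theano.config.floatX)
expected = numpy.array([numpy.dot(a[i], b[i]) for i in range(5)])
assert numpy.allclose(f(a, b), expected)  # out[i] = dot(a[i], b[i])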
# from opt import register_specialize, register_canonicalize
# @register_specialize
...