提交 5d7a1999 作者: Arnaud Bergeron

Add GpuGemm op.

上级 5649f8b6
from theano import Op, Apply, config
from theano.tensor.blas import Gemv
from theano.tensor.blas import Gemv, Gemm
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try:
......@@ -10,7 +10,7 @@ except ImportError, e:
# To make sure theano is importable
pass
class BlasOp(HideC, Op):
class BlasOp(HideC):
def c_headers(self):
return ['<blas_api.h>']
......@@ -71,6 +71,58 @@ class GpuGemv(BlasOp, Gemv):
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm):
    """Gemm op operating on gpuarray variables.

    Takes the inputs ``(C, alpha, A, B, beta)`` and produces one output of
    the same type as ``C``.  When ``self.inplace`` is true the output
    aliases ``C``; otherwise the computation is done on a copy of ``C``.
    """

    def make_node(self, C, alpha, A, B, beta):
        """Build the Apply node for this op.

        ``A``, ``B`` and ``C`` are converted with ``as_gpuarray_variable``;
        ``alpha`` and ``beta`` are passed through unchanged.  The single
        output has the type of the converted ``C``.
        """
        # The result is intentionally discarded: the parent implementation
        # is called only for its side effects (presumably input validation
        # -- confirm against Gemm.make_node).
        Gemm.make_node(self, C, alpha, A, B, beta)
        A = as_gpuarray_variable(A)
        B = as_gpuarray_variable(B)
        C = as_gpuarray_variable(C)
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs):
        """Python implementation: delegate to ``blas.gemm``.

        NOTE(review): ``blas`` is not defined in the visible part of this
        file; it is presumably imported in the guarded ``try`` block at the
        top of the module -- confirm.
        """
        C, alpha, A, B, beta = inputs
        outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
                                  overwrite_c=self.inplace)

    def c_code(self, node, name, inp, out, sub):
        """Return the C source implementing this op.

        The generated code first binds the output either to ``C`` itself
        (in-place) or to a fresh copy of ``C``, then calls
        ``pygpu_blas_rgemm`` on it, and finally synchronizes the output
        array when ``config.gpuarray.sync`` is enabled.
        """
        subs = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name)
        if self.inplace:
            # Alias the output to C, taking a new reference to it.
            code = """
               Py_XDECREF(%(out)s);
               %(out)s = %(C)s;
               Py_INCREF(%(out)s);
               """ % subs
        else:
            # Work on a copy so that C is left untouched.
            code = """
               Py_XDECREF(%(out)s);
               %(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
               if (%(out)s == NULL) {
                   %(fail)s
               }
               """ % subs
        # alpha and beta are read as the first element of their array data.
        code += """
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(B)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % subs
        if config.gpuarray.sync:
            # This template must be formatted like the others; without the
            # substitution the literal text "%(out)s" would end up in the
            # generated C source.
            code += """
            GpuArray_sync(&%(out)s->ga);
            """ % subs
        return code

    def c_code_cache_version(self):
        """Return the version tuple of the generated C code."""
        return (0,)
# Shared module-level GpuGemm instances: one that leaves C untouched and one
# that overwrites it.  local_inplace_gpuagemm compares node.op against the
# first and substitutes the second.
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
from theano.compile import optdb
from theano.gof import local_optimizer, EquilibriumOptimizer
......@@ -79,10 +131,15 @@ def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace:
return [gpugemv_inplace(*node.inputs)]
#gpuablas_opt_inplace = EquilibriumOptimzer(
# [local_inplace_gpuagemv],
# failure_callback=EquilibriumOptimizer.warn_inplace,
# max_use_ratio=5)
#optdb.register('InplaceGpuaBlasOpt',
# gpuablas_opt_inplace,
# 70.0, 'fast_run', 'inplace', 'gpuarray')
@local_optimizer([gpugemm_no_inplace])
def local_inplace_gpuagemm(node):
    """Swap an out-of-place GpuGemm node for its in-place variant.

    Returns a one-element list holding the replacement output when
    ``node.op`` equals ``gpugemm_no_inplace``; returns ``None`` otherwise.
    """
    is_candidate = (node.op == gpugemm_no_inplace)
    if not is_candidate:
        return None
    replacement = gpugemm_inplace(*node.inputs)
    return [replacement]
# Group the in-place BLAS rewrites (gemv and gemm) into one equilibrium
# optimizer and register it in optdb at position 70.0.
# The class name was previously misspelled "EquilibriumOptimzer", which
# raised NameError at import time; the imported name is EquilibriumOptimizer.
gpuablas_opt_inplace = EquilibriumOptimizer(
    [local_inplace_gpuagemv, local_inplace_gpuagemm],
    failure_callback=EquilibriumOptimizer.warn_inplace,
    max_use_ratio=5)
optdb.register('InplaceGpuaBlasOpt',
               gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
......@@ -186,3 +186,8 @@ def local_gpua_careduce(node):
@op_lifter(tensor.blas.Gemv)
def local_gpua_gemv(node):
return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter(tensor.blas.Gemm)
def local_gpua_gemm(node):
    """Return the GpuGemm op replacing a ``tensor.blas.Gemm`` node.

    The replacement keeps the ``inplace`` setting of the original op.
    The decorator is ``op_lifter``, the same one applied to
    ``local_gpua_gemv``; it was previously misspelled ``op_listfer``.
    """
    return GpuGemm(inplace=node.op.inplace)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论