提交 dbffc96d authored 作者: Frederic's avatar Frederic

Make the GPU gemv/ger ops in-place.

上级 0a17cf8b
...@@ -386,9 +386,9 @@ def local_gpu_gemv(node): ...@@ -386,9 +386,9 @@ def local_gpu_gemv(node):
""" """
gemvs = { gemvs = {
#tensor.blas.gemv_inplace: gpu_gemv_inplace, tensor.blas.gemv_inplace: gpu_gemv_no_inplace,
tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace, tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace,
#tensor.blas_c.CGemv(inplace=True): gpu_gemv_inplace, tensor.blas_c.CGemv(inplace=True): gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace, tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
} }
if node.op == gpu_from_host: if node.op == gpu_from_host:
...@@ -422,13 +422,15 @@ def local_gpu_gemv(node): ...@@ -422,13 +422,15 @@ def local_gpu_gemv(node):
@local_optimizer([]) @local_optimizer([])
def local_gpu_ger(node): def local_gpu_ger(node):
""" """
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host) gpu_from_host(ger) -> gpu_ger(gpu_from_host)
gemv(host_from_gpu) -> host_from_gpu(gpu_gemv) ger(host_from_gpu) -> host_from_gpu(gpu_ger)
""" """
gers = { gers = {
#tensor.blas_c.CGer(destructive=True): gpu_ger_inplace, tensor.blas_c.CGer(destructive=True): gpu_ger_no_inplace,
tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace, tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=True): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=False): gpu_ger_no_inplace,
} }
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -965,15 +967,32 @@ def local_inplace_gemm(node): ...@@ -965,15 +967,32 @@ def local_inplace_gemm(node):
if node.op == gpu_gemm_no_inplace: if node.op == gpu_gemm_no_inplace:
return [gpu_gemm_inplace(*node.inputs)] return [gpu_gemm_inplace(*node.inputs)]
@local_optimizer([gpu_gemv_no_inplace])
def local_inplace_gemv(node):
    """
    gpu_gemv_no_inplace(*inputs) -> gpu_gemv_inplace(*inputs)

    Returns a one-element list holding the replacement output when the
    node's op is ``gpu_gemv_no_inplace``; returns None for any other node.
    """
    # Keep the `==` comparison (rather than `!=`) so the op's own equality
    # semantics are the only thing consulted.
    if not (node.op == gpu_gemv_no_inplace):
        return None
    inplace_out = gpu_gemv_inplace(*node.inputs)
    return [inplace_out]
# Track the ger op (not gemm): the body below only ever rewrites
# gpu_ger_no_inplace nodes, so the tracked op must match it.
@local_optimizer([gpu_ger_no_inplace])
def local_inplace_ger(node):
    """
    gpu_ger_no_inplace(*inputs) -> gpu_ger_inplace(*inputs)

    Returns a one-element list holding the replacement output when the
    node's op is ``gpu_ger_no_inplace``; returns None for any other node.
    """
    if node.op == gpu_ger_no_inplace:
        return [gpu_ger_inplace(*node.inputs)]
# After destroyhandler is in but before we try to make elemwise things inplace # After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace # Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of # Also, need to make the gemm optimisation(step 70) happen before the fusion of
# elemwise(step 71) # elemwise(step 71)
optdb.register('InplaceGpuBlasOpt', optdb.register('InplaceGpuBlasOpt',
EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace, EquilibriumOptimizer([local_inplace_gemm,
local_inplace_gemv,
local_inplace_ger,
],
failure_callback=EquilibriumOptimizer.warn_inplace,
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace', 'gpu') 70.0, 'fast_run', 'inplace', 'gpu')
def get_device_type_sizes(): def get_device_type_sizes():
""" """
:return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu)) :return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论