提交 43172ab9 作者: abergeron 提交者: GitHub

Merge pull request #6183 from juancamilog/diag_lifters

Added lifter optimizations for AllocDiag and ExtractDiag
...@@ -71,7 +71,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -71,7 +71,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor, GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20,
GpuAllocDiag, GpuExtractDiag)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
...@@ -1117,6 +1118,25 @@ def local_advincsub1_gpua_inplace(node): ...@@ -1117,6 +1118,25 @@ def local_advincsub1_gpua_inplace(node):
return [node.op.clone_inplace()(*node.inputs)] return [node.op.clone_inplace()(*node.inputs)]
# AllocDiag
@register_opt('fast_compile')
@op_lifter([tensor.AllocDiag])
@register_opt2([theano.tensor.AllocDiag], 'fast_compile')
def local_gpu_alloc_diag(op, context_name, inputs, outputs):
    """Choose the GPU replacement for a ``tensor.AllocDiag`` node.

    Returns a ``GpuAllocDiag`` carrying the same ``offset`` as ``op`` when
    the first output is 2-dimensional, and ``False`` otherwise.
    ``context_name`` and ``inputs`` are not used by this function.
    """
    # AllocDiag only supports 2d output
    if outputs[0].ndim == 2:
        return GpuAllocDiag(offset=op.offset)
    return False
# ExtractDiag
@register_opt('fast_compile')
@op_lifter([tensor.ExtractDiag])
@register_opt2([theano.tensor.ExtractDiag], 'fast_compile')
def local_gpu_extract_diag(op, context_name, inputs, outputs):
    """Choose the GPU replacement for a ``tensor.ExtractDiag`` node.

    Returns a ``GpuExtractDiag`` built from the ``offset``, ``axis1``,
    ``axis2`` and ``view`` attributes of ``op``.  ``context_name``,
    ``inputs`` and ``outputs`` are not used by this function.
    """
    # Forward every parameter of the original op unchanged.
    diag_params = {
        'offset': op.offset,
        'axis1': op.axis1,
        'axis2': op.axis2,
        'view': op.view,
    }
    return GpuExtractDiag(**diag_params)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
@register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile') @register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
......
...@@ -274,6 +274,12 @@ def test_adv_subtensor(): ...@@ -274,6 +274,12 @@ def test_adv_subtensor():
class test_gpuextractdiag(unittest.TestCase): class test_gpuextractdiag(unittest.TestCase):
def test_extractdiag_opt(self):
    # Compile ExtractDiag applied to a matrix under the GPU mode, then
    # check that the compiled graph contains a GpuExtractDiag node.
    x = tensor.matrix()
    diag = tensor.ExtractDiag()(x)
    fn = theano.function([x], diag, mode=mode_with_gpu)
    topo = fn.maker.fgraph.toposort()
    assert any(isinstance(node.op, GpuExtractDiag) for node in topo)
def test_matrix(self): def test_matrix(self):
x = tensor.matrix() x = tensor.matrix()
np_x = np.arange(77).reshape(7, 11).astype(theano.config.floatX) np_x = np.arange(77).reshape(7, 11).astype(theano.config.floatX)
...@@ -308,6 +314,12 @@ class test_gpuextractdiag(unittest.TestCase): ...@@ -308,6 +314,12 @@ class test_gpuextractdiag(unittest.TestCase):
class test_gpuallocdiag(unittest.TestCase): class test_gpuallocdiag(unittest.TestCase):
def test_allocdiag_opt(self):
    # Compile AllocDiag applied to a vector under the GPU mode, then
    # check that the compiled graph contains a GpuAllocDiag node.
    x = tensor.vector()
    diag_matrix = tensor.AllocDiag()(x)
    fn = theano.function([x], diag_matrix, mode=mode_with_gpu)
    topo = fn.maker.fgraph.toposort()
    assert any(isinstance(node.op, GpuAllocDiag) for node in topo)
def test_matrix(self): def test_matrix(self):
x = tensor.vector() x = tensor.vector()
np_x = np.arange(7).astype(theano.config.floatX) np_x = np.arange(7).astype(theano.config.floatX)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论