Commit 065e0f5e authored by Arnaud Bergeron

Factor out and generalize grab_ger and grab_lr.

Parent 1586d6d8
import numpy
import theano
from theano import Apply, tensor, scalar, Constant
from theano.tensor import DimShuffle, discrete_dtypes
from theano import Apply, tensor, scalar
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
......@@ -645,19 +645,6 @@ if cuda_available:
if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)]
def grab_ger(v):
    """Return the apply node that produced *v* if its op is SparseBlockOuterSS.

    Digs through a redundant GpuFromHost(HostFromGpu(...)) transfer pair,
    because the cut_transfers optimization may not have run before this one.
    Returns None when no SparseBlockOuterSS node is found.
    """
    node = v.owner
    if node is None:
        return None
    if isinstance(node.op, SparseBlockOuterSS):
        return node
    if isinstance(node.op, GpuFromHost):
        inner = node.inputs[0].owner
        if inner is not None and isinstance(inner.op, HostFromGpu):
            # Skip the round-trip transfer and keep digging.
            return grab_ger(inner.inputs[0])
    return None
# Should be run before elemwise fusion
@opt.register_opt()
@opt.local_optimizer([GpuElemwise])
......@@ -665,33 +652,15 @@ if cuda_available:
"""
GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
"""
def grab_lr(v, nd=4):
    """Extract a CPU scalar (e.g. a learning rate) from *v*, if possible.

    Recognizes a scalar broadcast to *nd* dimensions via DimShuffle /
    GpuDimShuffle (digging through GpuFromHost transfers), or an
    all-broadcastable Constant.

    Parameters
    ----------
    v : Variable
        The variable suspected of being a broadcasted scalar.
    nd : int, optional
        Number of broadcast dimensions to match (default 4, the original
        hard-coded pattern, so existing callers are unaffected).

    Returns
    -------
    Variable or None
        The underlying scalar variable, or None when *v* does not match.
    """
    pattern = ('x',) * nd
    if v.owner is not None:
        n = v.owner
        if (isinstance(n.op, GpuDimShuffle) and
                n.op.new_order == pattern):
            # Scalar lives on the GPU; bring it back to the host.
            return host_from_gpu(n.inputs[0])
        elif (isinstance(n.op, DimShuffle) and
                n.op.new_order == pattern):
            return n.inputs[0]
        elif isinstance(n.op, GpuFromHost):
            # Look through the transfer op.
            return grab_lr(n.inputs[0], nd=nd)
        else:
            return None
    else:
        if (isinstance(v, Constant) and
                v.broadcastable == (True,) * nd):
            return v.dimshuffle(())
        # Explicit, rather than falling off the end of the function.
        return None
if (isinstance(node.op, GpuElemwise) and
node.op.scalar_op == scalar.mul and
node.nin == 2):
ger = grab_ger(node.inputs[0])
ger = opt.find_node(node.inputs[0], SparseBlockOuterSS)
if ger is None:
ger = grab_ger(node.inputs[1])
lr = grab_lr(node.inputs[0])
ger = opt.find_node(node.inputs[1], SparseBlockOuterSS)
lr = opt.grab_cpu_scalar(node.inputs[0], nd=4)
else:
lr = grab_lr(node.inputs[1])
lr = opt.grab_cpu_scalar(node.inputs[1], nd=4)
if lr is None or ger is None:
return None
alpha = lr * ger.inputs[5]
......@@ -704,10 +673,10 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
(node.op.scalar_op == scalar.sub or
node.op.scalar_op == scalar.add) and
node.nin == 2):
ger = grab_ger(node.inputs[0])
ger = opt.find_node(node.inputs[0], SparseBlockOuterSS)
W = node.inputs[1]
if ger is None:
ger = grab_ger(node.inputs[1])
ger = opt.find_node(node.inputs[1], SparseBlockOuterSS)
W = node.inputs[0]
if ger is None:
return None
......
......@@ -10,7 +10,7 @@ import numpy
import theano
from theano import scalar as scal
from theano import config, tensor, gof
from theano import config, tensor, gof, Constant
import theano.ifelse
from theano.compile import optdb
......@@ -47,7 +47,7 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import nlinalg, DimShuffle
from theano.tensor.nnet.Conv3D import Conv3D
try:
......@@ -88,6 +88,38 @@ register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding)
def grab_cpu_scalar(v, nd):
    """Return the CPU scalar underlying *v*, a scalar broadcast to *nd* dims.

    Matches a (Gpu)DimShuffle whose new_order is ('x',) * nd, digging
    through GpuFromHost transfers, or an all-broadcastable Constant.
    Returns None when *v* does not look like a broadcasted scalar.
    """
    pattern = ('x',) * nd
    node = v.owner
    if node is None:
        # No owner: only an all-broadcastable constant qualifies.
        if (isinstance(v, Constant) and
                v.broadcastable == (True,) * nd):
            return v.dimshuffle(())
        return None
    if isinstance(node.op, GpuDimShuffle) and node.op.new_order == pattern:
        # The scalar is on the GPU; move it back to the host.
        return host_from_gpu(node.inputs[0])
    if isinstance(node.op, DimShuffle) and node.op.new_order == pattern:
        return node.inputs[0]
    if isinstance(node.op, GpuFromHost):
        # Look through the transfer op.
        return grab_cpu_scalar(node.inputs[0], nd=nd)
    return None
def find_node(v, cls):
    """Find the apply node producing *v* whose op is an instance of *cls*.

    Digs through possibly redundant GpuFromHost(HostFromGpu(...))
    transfer pairs to reach the node of the requested op class.
    Returns None when no such node is found.
    """
    node = v.owner
    while node is not None:
        if isinstance(node.op, cls):
            return node
        if isinstance(node.op, GpuFromHost):
            inner = node.inputs[0].owner
            if inner is not None and isinstance(inner.op, HostFromGpu):
                # Skip the GPU->host->GPU round trip and keep looking.
                node = inner.inputs[0].owner
                continue
        return None
    return None
# This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date.
......
......@@ -18,7 +18,8 @@ from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
sparse_block_gemv_ss,
sparse_block_outer_ss,
sparse_block_outer_ss_inplace)
sparse_block_outer_ss_inplace,
SparseBlockOuterSS)
from theano.sandbox.cuda.var import float32_shared_constructor
......@@ -186,13 +187,20 @@ def test_blocksparse_grad_merge():
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu)
# not running with mode=gpu ensures that the elemwise is not merged in
mode = None
if theano.config.mode == 'FAST_COMPILE':
mode = theano.compile.mode.get_mode('FAST_RUN')
# Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op, SparseBlockOuterSS)
# Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output')
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
# Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
SparseBlockOuterSS)
f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value()
......
Markdown format supported
0%
You are about to add 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment