提交 065e0f5e authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Factor out and generalize grab_ger and grab_lr.

上级 1586d6d8
import numpy import numpy
import theano import theano
from theano import Apply, tensor, scalar, Constant from theano import Apply, tensor, scalar
from theano.tensor import DimShuffle, discrete_dtypes from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
...@@ -645,19 +645,6 @@ if cuda_available: ...@@ -645,19 +645,6 @@ if cuda_available:
if node.op == sparse_block_outer_ss: if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)] return [sparse_block_outer_ss_inplace(*node.inputs)]
def grab_ger(v, cls=None):
    """Return the apply node of type `cls` that produced `v`, or None.

    We need to do some digging because apparently the cut_transfers
    optimization does not run before us, so the interesting node may be
    hidden behind a redundant GpuFromHost(HostFromGpu(...)) round-trip.

    Parameters
    ----------
    v : Variable
        The variable whose producing node we want to inspect.
    cls : type, optional
        The Op class to look for.  Defaults to SparseBlockOuterSS for
        backward compatibility with the original behaviour.

    Returns
    -------
    Apply or None
        The producing apply node whose op is an instance of `cls`,
        or None when no such node is found.
    """
    if cls is None:
        cls = SparseBlockOuterSS
    if v.owner is None:
        return None
    if isinstance(v.owner.op, cls):
        return v.owner
    if isinstance(v.owner.op, GpuFromHost):
        inner = v.owner.inputs[0]
        # Look through a no-op gpu->host->gpu transfer pair.
        if inner.owner is not None and isinstance(inner.owner.op, HostFromGpu):
            return grab_ger(inner.owner.inputs[0], cls)
    return None
# Should be run before elemwise fusion # Should be run before elemwise fusion
@opt.register_opt() @opt.register_opt()
@opt.local_optimizer([GpuElemwise]) @opt.local_optimizer([GpuElemwise])
...@@ -665,33 +652,15 @@ if cuda_available: ...@@ -665,33 +652,15 @@ if cuda_available:
""" """
GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr) GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
""" """
def grab_lr(v):
    """Recover the scalar behind a broadcast to four dimensions.

    `v` is expected to be a scalar (presumably the learning rate of the
    surrounding GpuElemwise{mul} pattern) dimshuffled to ('x', 'x', 'x',
    'x') and possibly transferred to the GPU.  Returns an equivalent CPU
    scalar variable, or None when `v` does not match that pattern.
    """
    producer = v.owner
    if producer is None:
        # No producing node: a fully-broadcastable constant also counts
        # as a scalar in disguise.
        if (isinstance(v, Constant) and
                v.broadcastable == (True, True, True, True)):
            return v.dimshuffle(())
        return None
    op = producer.op
    if isinstance(op, GpuDimShuffle) and op.new_order == ('x', 'x', 'x', 'x'):
        return host_from_gpu(producer.inputs[0])
    if isinstance(op, DimShuffle) and op.new_order == ('x', 'x', 'x', 'x'):
        return producer.inputs[0]
    if isinstance(op, GpuFromHost):
        # Look through the transfer and keep digging.
        return grab_lr(producer.inputs[0])
    return None
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
node.op.scalar_op == scalar.mul and node.op.scalar_op == scalar.mul and
node.nin == 2): node.nin == 2):
ger = grab_ger(node.inputs[0]) ger = opt.find_node(node.inputs[0], SparseBlockOuterSS)
if ger is None: if ger is None:
ger = grab_ger(node.inputs[1]) ger = opt.find_node(node.inputs[1], SparseBlockOuterSS)
lr = grab_lr(node.inputs[0]) lr = opt.grab_cpu_scalar(node.inputs[0], nd=4)
else: else:
lr = grab_lr(node.inputs[1]) lr = opt.grab_cpu_scalar(node.inputs[1], nd=4)
if lr is None or ger is None: if lr is None or ger is None:
return None return None
alpha = lr * ger.inputs[5] alpha = lr * ger.inputs[5]
...@@ -704,10 +673,10 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr) ...@@ -704,10 +673,10 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
(node.op.scalar_op == scalar.sub or (node.op.scalar_op == scalar.sub or
node.op.scalar_op == scalar.add) and node.op.scalar_op == scalar.add) and
node.nin == 2): node.nin == 2):
ger = grab_ger(node.inputs[0]) ger = opt.find_node(node.inputs[0], SparseBlockOuterSS)
W = node.inputs[1] W = node.inputs[1]
if ger is None: if ger is None:
ger = grab_ger(node.inputs[1]) ger = opt.find_node(node.inputs[1], SparseBlockOuterSS)
W = node.inputs[0] W = node.inputs[0]
if ger is None: if ger is None:
return None return None
......
...@@ -10,7 +10,7 @@ import numpy ...@@ -10,7 +10,7 @@ import numpy
import theano import theano
from theano import scalar as scal from theano import scalar as scal
from theano import config, tensor, gof from theano import config, tensor, gof, Constant
import theano.ifelse import theano.ifelse
from theano.compile import optdb from theano.compile import optdb
...@@ -47,7 +47,7 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant ...@@ -47,7 +47,7 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg from theano.tensor import nlinalg, DimShuffle
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
try: try:
...@@ -88,6 +88,38 @@ register_opt()(theano.tensor.opt.local_track_shape_i) ...@@ -88,6 +88,38 @@ register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')( register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding) tensor.opt.constant_folding)
def grab_cpu_scalar(v, nd):
    """Return a CPU scalar variable equivalent to `v`, or None.

    `v` is expected to be a scalar dimshuffled to `nd` broadcastable
    dimensions, possibly transferred to the GPU along the way.

    Parameters
    ----------
    v : Variable
        The broadcasted variable to unwrap.
    nd : int
        The number of broadcast dimensions that were added.

    Returns
    -------
    Variable or None
        The underlying CPU scalar, or None when `v` does not match
        the expected pattern.
    """
    producer = v.owner
    if producer is None:
        # A constant that is broadcastable over every dimension is a
        # scalar in disguise; strip the broadcast dims.
        if (isinstance(v, Constant) and
                v.broadcastable == (True,) * nd):
            return v.dimshuffle(())
        return None
    op = producer.op
    if isinstance(op, GpuDimShuffle) and op.new_order == ('x',) * nd:
        return host_from_gpu(producer.inputs[0])
    if isinstance(op, DimShuffle) and op.new_order == ('x',) * nd:
        return producer.inputs[0]
    if isinstance(op, GpuFromHost):
        # Look through the host->gpu transfer and keep digging.
        return grab_cpu_scalar(producer.inputs[0], nd=nd)
    return None
def find_node(v, cls):
    """Find the apply node of type `cls` that produced `v`.

    This digs through possibly redundant transfers to find the node
    that has the op class specified: a GpuFromHost(HostFromGpu(...))
    round-trip in between is skipped.  Returns None when no matching
    node is found.
    """
    producer = v.owner
    if producer is None:
        return None
    if isinstance(producer.op, cls):
        return producer
    if isinstance(producer.op, GpuFromHost):
        prev = producer.inputs[0].owner
        # Skip the no-op gpu->host->gpu transfer pair and keep looking.
        if prev is not None and isinstance(prev.op, HostFromGpu):
            return find_node(prev.inputs[0], cls)
    return None
# This is a partial list of CPU ops that can be in some circonstance # This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization. # moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date. # Hopefully, we can keep this list up to date.
......
...@@ -18,7 +18,8 @@ from theano.sandbox.cuda.basic_ops import (GpuDimShuffle, ...@@ -18,7 +18,8 @@ from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS, from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
sparse_block_gemv_ss, sparse_block_gemv_ss,
sparse_block_outer_ss, sparse_block_outer_ss,
sparse_block_outer_ss_inplace) sparse_block_outer_ss_inplace,
SparseBlockOuterSS)
from theano.sandbox.cuda.var import float32_shared_constructor from theano.sandbox.cuda.var import float32_shared_constructor
...@@ -186,13 +187,20 @@ def test_blocksparse_grad_merge(): ...@@ -186,13 +187,20 @@ def test_blocksparse_grad_merge():
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu) mode=mode_with_gpu)
# not running with mode=gpu ensures that the elemwise is not merged in
mode = None # Make sure the lr update was merged.
if theano.config.mode == 'FAST_COMPILE': assert isinstance(f1.maker.fgraph.outputs[0].owner.op, SparseBlockOuterSS)
mode = theano.compile.mode.get_mode('FAST_RUN')
# Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output')
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode) f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
# Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
SparseBlockOuterSS)
f2(h_val, iIdx_val, b_val, oIdx_val) f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value() W_ref = W.get_value()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论