提交 e93c61d1 authored 作者: abergeron's avatar abergeron

Merge pull request #1779 from nouiz/faster_opt

Faster opt
......@@ -1047,10 +1047,6 @@ class PatternSub(LocalOptimizer):
self.__name__ = name
self.pdb = pdb
def skip_identities(self, expr):
if self.skip_identities_fn:
return self.skip_identities_fn(expr)
def op_key(self):
return self.op
......@@ -1064,10 +1060,13 @@ class PatternSub(LocalOptimizer):
"""
if node.op != self.op:
return False
#TODO: if we remove pdb, does this speed things up?
def match(pattern, expr, u, allow_multiple_clients=False, pdb=False):
#TODO move outside match
def retry_with_equiv():
expr_equiv = self.skip_identities(expr)
if not self.skip_identities_fn:
return False
expr_equiv = self.skip_identities_fn(expr)
if expr_equiv is None:
return False
#TODO: Not sure how to handle multiple_clients flag
......@@ -1126,19 +1125,19 @@ class PatternSub(LocalOptimizer):
pdb.set_trace()
return u
def build(pattern, u):
if isinstance(pattern, (list, tuple)):
args = [build(p, u) for p in pattern[1:]]
return pattern[0](*args)
elif isinstance(pattern, basestring):
return u[unify.Var(pattern)]
elif isinstance(pattern, (int, float)):
return pattern
else:
return pattern.clone()
u = match(self.in_pattern, node.out, unify.Unification(), True,
self.pdb)
if u:
def build(pattern, u):
if isinstance(pattern, (list, tuple)):
args = [build(p, u) for p in pattern[1:]]
return pattern[0](*args)
elif isinstance(pattern, basestring):
return u[unify.Var(pattern)]
elif isinstance(pattern, (int, float)):
return pattern
else:
return pattern.clone()
p = self.out_pattern
new = build(p, u)
####print "PatternSub matched:", new
......@@ -1520,19 +1519,23 @@ class EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self,
optimizers,
failure_callback=None,
ignore_newtrees=True,
max_use_ratio=None):
"""
""" Apply optimizations until equilibrium point.
:param optimizers: list or set of local or global optimizations to
apply until equilibrium.
:param max_use_ratio: each optimizer can be applied at most
(size of graph * this number) times
:param ignore_newtrees: See EquilibriumDB ignore_newtrees
parameter definition
"""
super(EquilibriumOptimizer, self).__init__(
None,
ignore_newtrees=True,
ignore_newtrees=ignore_newtrees,
failure_callback=failure_callback)
self.local_optimizers_map = dict()
self.local_optimizers_all = []
......
......@@ -179,23 +179,33 @@ class Query(object):
class EquilibriumDB(DB):
""" A set of potential optimizations which should be applied in an
"""A set of potential optimizations which should be applied in an
arbitrary order until equilibrium is reached.
Canonicalize, Stabilize, and Specialize are all equilibrium optimizations.
:param ignore_newtrees: If False, local optimizations are also
applied to the new nodes introduced while applying a local
optimization. This can result in fewer fgraph iterations, but
that does not mean it will be faster overall.
.. note::
Both LocalOptimizer and Optimizer instances can be registered, as
EquilibriumOptimizer supports both.
"""
def __init__(self, ignore_newtrees=True):
super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees
def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts,
max_use_ratio=config.optdb.max_use_ratio,
failure_callback=opt.NavigatorOptimizer.warn_inplace)
return opt.EquilibriumOptimizer(
opts,
max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace)
class SequenceDB(DB):
......
......@@ -18,7 +18,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import (
device_properties, gpu_eye,
gpu_from_host, host_from_gpu, HostFromGpu,
gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
......@@ -42,10 +42,14 @@ from theano.sandbox.cuda.elemwise import erfinv_gpu
from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
linalg = None
#optdb.print_summary() # shows what is currently registered
gpu_optimizer = EquilibriumDB()
#Setting ignore_newtrees=False speeds up the optimization, as it
#matches the pattern we use for these optimizations. Otherwise, we
#can iterate hundreds of times over the graph and apply only a few
#optimizations each time.
gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
......@@ -65,6 +69,9 @@ optdb.register('gpu_after_fusion',
optdb.__position__.get('elemwise_fusion', 49) + .1,
'gpu')
## Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer, 'fast_run')
def register_opt(*tags, **kwargs):
def f(local_opt):
......@@ -76,6 +83,8 @@ def register_opt(*tags, **kwargs):
#register local_track_shape_i at this level too
#to make multi-level lift of shape work.
register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding)
class InputToGpuOptimizer(Optimizer):
......@@ -128,7 +137,7 @@ def local_cut_gpu_host_gpu(node):
return [node.inputs[0].owner.inputs[0]]
return False
gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu,
'fast_run', 'inplace', 'gpu')
'fast_run', 'gpu')
gpu_cut_copies.register('cut_gpu_constant_transfers',
tensor.opt.constant_folding,
'fast_run', 'gpu')
......@@ -176,10 +185,10 @@ def local_gpu_elemwise_0(node):
"""
if (isinstance(node.op, tensor.Elemwise) and
dtype_in_elemwise_supported(node.op)):
if numpy.any([i.owner and
isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
if any([i.owner and
isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
if all([o.type.dtype == 'float32' for o in node.outputs]):
# Don't set any inplace pattern.
# gpu_inplace_elemwise_optimizer will do it later
......@@ -196,14 +205,14 @@ def local_gpu_elemwise_0(node):
upcastable = set(['float32', 'int8', 'int16', 'uint8',
'uint16'])
# case 1 - all inputs are already float32
if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
if all([i.type.dtype == 'float32' for i in node.inputs]):
#TODO: change this when fusion makes Elemwise with multiple
# outputs
gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs))
# case 2 - it is still ok if some inputs were upcast to float32
elif numpy.all([i.type.dtype in upcastable
for i in node.inputs]):
elif all([i.type.dtype in upcastable
for i in node.inputs]):
# second - establish that a new node with upcasted inputs
# has the same outputs types as the original node
upcasted = node.op.make_node(*[tensor.cast(i, 'float32')
......@@ -233,7 +242,7 @@ def local_gpu_elemwise_1(node):
"""
gpu_from_host(Elemwise) -> GpuElemwise(gpu_from_host(...))
"""
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_i, = node.inputs
if (host_i.owner and
isinstance(host_i.owner.op, tensor.Elemwise) and
......@@ -277,7 +286,7 @@ def local_gpu_dimshuffle_0(node):
new_op = GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order)
return [host_from_gpu(new_op(gpu_from_host(input)))]
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op,
tensor.DimShuffle):
......@@ -300,7 +309,7 @@ def local_gpu_specifyShape_0(node):
if input.owner and isinstance(input.owner.op, HostFromGpu):
return [host_from_gpu(tensor.specify_shape(gpu_from_host(input),
*node.inputs[1:]))]
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op,
tensor.SpecifyShape):
......@@ -327,7 +336,7 @@ def local_gpu_dot_to_dot22(node):
# In case the dot does input upcast, we must check that we can
# make it run on the gpu.
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
if node.outputs[0].type.dtype != 'float32':
return False
host_input = node.inputs[0]
......@@ -352,8 +361,8 @@ def local_gpu_dot_to_dot22(node):
if node.op == tensor.basic.dot:
if node.outputs[0].type.dtype != 'float32':
return False
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
x, y = node.inputs
if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ['x', 0])
......@@ -386,10 +395,10 @@ def local_gpu_lazy_ifelse(node):
gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True)
outs_clients = reduce(list.__add__,
[out.clients for out in node.outputs])
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]) or numpy.any(
[c != 'output' and c.op == gpu_from_host for c, idx
in outs_clients]):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]) or any(
[c != 'output' and c.op == gpu_from_host for c, idx
in outs_clients]):
c = node.inputs[0]
outs = node.inputs[1:]
......@@ -403,7 +412,7 @@ def local_gpu_lazy_ifelse(node):
return [host_from_gpu(out) for out in
gpu_ifelse.make_node(c, *outs).outputs]
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, theano.ifelse.IfElse) and
......@@ -440,14 +449,15 @@ def local_gpu_dot22(node):
dot(host_from_gpu) -> host_from_gpu(gpudot22)
"""
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and host_input.owner.op == tensor.blas._dot22:
if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Dot22):
x, y = host_input.owner.inputs
return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))]
if node.op == tensor.blas._dot22:
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
if isinstance(node.op, tensor.blas.Dot22):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]):
x, y = node.inputs
return [host_from_gpu(gpu_dot22(gpu_from_host(x),
gpu_from_host(y)))]
......@@ -462,16 +472,17 @@ def local_gpu_dot22scalar(node):
dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
"""
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
host_input.owner.op == tensor.blas._dot22scalar):
isinstance(host_input.owner.op,
tensor.blas.Dot22Scalar)):
x, y, scalar = host_input.owner.inputs
return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y),
tensor.blas._as_scalar(scalar))]
if node.op == tensor.blas._dot22scalar:
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
if isinstance(node.op, tensor.blas.Dot22Scalar):
if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
x, y, scalar = node.inputs
return [host_from_gpu(
gpu_dot22scalar(gpu_from_host(x),
......@@ -488,31 +499,28 @@ def local_gpu_gemv(node):
gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
"""
gemvs = {
tensor.blas.gemv_inplace: gpu_gemv_no_inplace,
tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=True): gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
}
if node.op == gpu_from_host:
gemvs = (tensor.blas.Gemv,
tensor.blas_c.CGemv,
)
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemvs:
if host_input.owner and isinstance(host_input.owner.op, gemvs):
op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs
return [gemvs[op](
return [gpu_gemv_no_inplace(
gpu_from_host(z),
a,
gpu_from_host(x),
gpu_from_host(y),
b)]
if node.op in gemvs:
if isinstance(node.op, gemvs):
z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(
gemvs[node.op](
gpu_gemv_no_inplace(
gpu_from_host(z),
a,
gpu_from_host(x),
......@@ -530,33 +538,30 @@ def local_gpu_ger(node):
ger(host_from_gpu) -> host_from_gpu(gpu_ger)
"""
gers = {
tensor.blas_c.CGer(destructive=True): gpu_ger_no_inplace,
tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=True): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=False): gpu_ger_no_inplace,
tensor.blas_scipy.ScipyGer(destructive=True): gpu_ger_no_inplace,
tensor.blas_scipy.ScipyGer(destructive=False): gpu_ger_no_inplace,
}
if node.op == gpu_from_host:
gers = (tensor.blas_c.CGer,
tensor.blas.Ger,
tensor.blas_scipy.ScipyGer,
)
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gers:
if host_input.owner and isinstance(host_input.owner.op, gers):
op = host_input.owner.op
z, a, x, y = host_input.owner.inputs
return [gers[op](
return [gpu_ger_no_inplace(
gpu_from_host(z),
a,
gpu_from_host(x),
gpu_from_host(y)
)]
if node.op in gers:
if isinstance(node.op, gers):
z, a, x, y = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(
gers[node.op](
gpu_ger_no_inplace(
gpu_from_host(z),
a,
gpu_from_host(x),
......@@ -573,26 +578,24 @@ def local_gpu_gemm(node):
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
"""
gemms = {
#tensor.blas.gemm_inplace: gpu_gemm_inplace,
tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemms:
if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Gemm):
op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs
return [gemms[op](gpu_from_host(z),
a,
gpu_from_host(x),
gpu_from_host(y),
b)]
if node.op in gemms:
return [gpu_gemm_no_inplace(gpu_from_host(z),
a,
gpu_from_host(x),
gpu_from_host(y),
b)]
if isinstance(node.op, tensor.blas.Gemm):
z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gemms[node.op](gpu_from_host(z),
return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
a,
gpu_from_host(x),
gpu_from_host(y),
......@@ -613,9 +616,10 @@ def local_gpu_careduce(node):
scalar_op = node.op.scalar_op
# currently, only these four scalar ops (add, mul, maximum, minimum)
# are supported at all, and max does not support all combinations of axes
if node.op.scalar_op in [scal.add, scal.mul, scal.maximum, scal.minimum]:
if isinstance(node.op.scalar_op, (scal.Add, scal.Mul,
scal.Maximum, scal.Minimum)):
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
if node.op.axis is None:
reduce_mask = [1] * x.type.ndim
else:
......@@ -685,7 +689,7 @@ def local_gpu_careduce(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.Reshape])
def local_gpu_reshape(node):
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and \
isinstance(host_input.owner.op, tensor.Reshape):
......@@ -702,7 +706,7 @@ def local_gpu_reshape(node):
return [gpu_reshape]
if isinstance(node.op, tensor.Reshape):
x, shp = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs
gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
......@@ -719,7 +723,7 @@ def local_gpu_reshape(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.Flatten])
def local_gpu_flatten(node):
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and \
isinstance(host_input.owner.op, tensor.Flatten):
......@@ -729,7 +733,7 @@ def local_gpu_flatten(node):
if isinstance(node.op, tensor.Flatten):
x, = node.inputs
outdim = node.op.outdim
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs
return [host_from_gpu(GpuFlatten(outdim)(gpu_x))]
return False
......@@ -738,7 +742,7 @@ def local_gpu_flatten(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.Subtensor])
def local_gpu_subtensor(node):
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and \
isinstance(host_input.owner.op, tensor.Subtensor):
......@@ -748,9 +752,11 @@ def local_gpu_subtensor(node):
return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0]
coords = node.inputs[1:]
if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
if (x.owner and
isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"):
gpu_x, = x.owner.inputs
coords = node.inputs[1:]
return [host_from_gpu(GpuSubtensor(
node.op.idx_list)(gpu_x, *coords))]
return False
......@@ -759,7 +765,7 @@ def local_gpu_subtensor(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.AdvancedSubtensor1])
def local_gpu_advanced_subtensor1(node):
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and \
host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
......@@ -769,7 +775,7 @@ def local_gpu_advanced_subtensor1(node):
if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0]
coords = node.inputs[1:]
if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32":
gpu_x, = x.owner.inputs
return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
return False
......@@ -778,7 +784,7 @@ def local_gpu_advanced_subtensor1(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.AdvancedIncSubtensor1])
def local_gpu_advanced_incsubtensor1(node):
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
# Should not execute for GpuAdvancedIncSubtensor1
if host_input.owner and \
......@@ -813,12 +819,12 @@ def local_gpu_advanced_incsubtensor1(node):
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
go_gpu = True
gpu_x, = x.owner.inputs
else:
gpu_x = gpu_from_host(x)
if y.owner and y.owner.op == host_from_gpu:
if y.owner and isinstance(y.owner.op, HostFromGpu):
go_gpu = True
gpu_y, = y.owner.inputs
else:
......@@ -852,7 +858,7 @@ def local_gpu_advanced_incsubtensor1(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.IncSubtensor])
def local_gpu_incsubtensor(node):
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_output = node.inputs[0]
if host_output.owner and \
type(host_output.owner.op) == tensor.IncSubtensor:
......@@ -876,12 +882,12 @@ def local_gpu_incsubtensor(node):
assert isinstance(y.type, tensor.TensorType)
coords = node.inputs[2:]
go_gpu = False
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
go_gpu = True
gpu_x, = x.owner.inputs
else:
gpu_x = gpu_from_host(x)
if y.owner and y.owner.op == host_from_gpu:
if y.owner and isinstance(y.owner.op, HostFromGpu):
go_gpu = True
gpu_y, = y.owner.inputs
else:
......@@ -901,7 +907,7 @@ def local_gpu_incsubtensor(node):
def local_gpu_shape(node):
if isinstance(node.op, tensor.Shape):
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs
return [gpu_shape(gpu_x)]
return False
......@@ -913,7 +919,7 @@ def local_gpu_rebroadcast(node):
'''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))'''
if isinstance(node.op, tensor.Rebroadcast):
x, = node.inputs
if (x.owner and x.owner.op == host_from_gpu):
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0]
return [host_from_gpu(node.op(gpu_x))]
......@@ -927,7 +933,7 @@ def gpu_print_wrapper(op, cnda):
def local_gpu_print_op(node):
if isinstance(node.op, tensor.printing.Print):
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs
new_op = node.op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op
......@@ -948,7 +954,7 @@ import theano.tensor.nnet
def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
x, b, y = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs
# if y is a cast to integers, we can go to the underlying
# thing if we want, since this gpu op will cast to integers
......@@ -978,7 +984,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
dnll, sm, yidx = node.inputs
if sm.owner and sm.owner.op == host_from_gpu:
if sm.owner and isinstance(sm.owner.op, HostFromGpu):
gpu_sm, = sm.owner.inputs
gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
gpu_from_host(dnll),
......@@ -993,7 +999,7 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
def local_gpu_softmax(node):
if isinstance(node.op, tensor.nnet.Softmax):
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs
gpu_sm = GpuSoftmax()(gpu_x)
return [host_from_gpu(gpu_sm)]
......@@ -1005,8 +1011,8 @@ def local_gpu_softmax(node):
def local_gpu_softmax_with_bias(node):
if isinstance(node.op, tensor.nnet.SoftmaxWithBias):
x, b = node.inputs
x_on_gpu = x.owner and x.owner.op == host_from_gpu
b_on_gpu = b.owner and b.owner.op == host_from_gpu
x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
if x_on_gpu or b_on_gpu:
gpu_sm = GpuSoftmaxWithBias()(gpu_from_host(x), gpu_from_host(b))
return [host_from_gpu(gpu_sm)]
......@@ -1078,7 +1084,7 @@ def local_gpu_conv(node):
atol = 3e-5
return CudaNdarrayType.values_eq_approx(a, b, atol=atol)
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
#gpu_from_host(conv) -> gpu_conv(gpu_from_host)
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, conv.ConvOp):
......@@ -1098,8 +1104,8 @@ def local_gpu_conv(node):
if isinstance(node.op, conv.ConvOp):
#conv(host_from_gpu) -> host_from_gpu(gpu_conv)
img, kern = node.inputs
img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
img_on_gpu = (img.owner and isinstance(img.owner.op, HostFromGpu))
kern_on_gpu = (kern.owner and isinstance(kern.owner.op, HostFromGpu))
if img_on_gpu or kern_on_gpu:
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
......@@ -1122,7 +1128,7 @@ import theano.tensor.signal.downsample as downsample
def local_gpu_downsample_factor_max(node):
if isinstance(node.op, downsample.DownsampleFactorMax):
x, = node.inputs
if (x.owner and x.owner.op == host_from_gpu):
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border)
return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
......@@ -1132,7 +1138,7 @@ def local_gpu_downsample_factor_max(node):
def local_gpu_downsample_factor_max_grad(node):
if isinstance(node.op, downsample.DownsampleFactorMaxGrad):
x, z, gz = node.inputs
if (x.owner and x.owner.op == host_from_gpu):
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds_grad = GpuDownsampleFactorMaxGrad(node.op.ds,
node.op.ignore_border)
return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
......@@ -1184,12 +1190,12 @@ def local_gpu_join(node):
#print "OPT: axis_and_tensors=", axis_and_tensors
matches = [(not t.owner is None and t.owner.op == host_from_gpu) or
matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or
isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
#print "OPT: matches =", matches
# if all input tensors are host_from_gpu'ified
if numpy.all(matches):
if all(matches):
# the extra gpu_from_host introduced here will
# be removed by further optimizations
new_tensors = [gpu_from_host(t) for t in axis_and_tensors[1:]]
......@@ -1363,18 +1369,18 @@ def local_gpualloc(node):
replace = False
if node.op == tensor.alloc:
if node.inputs[0].owner and \
node.inputs[0].owner.op == host_from_gpu:
isinstance(node.inputs[0].owner.op, HostFromGpu):
replace = True
elif all([c != 'output' and c.op == gpu_from_host
for c, idx in node.outputs[0].clients]):
for c, idx in node.outputs[0].clients]):
# if all clients are on gpu
replace = True
elif all([c != 'output' and
c.op == tensor.join and
all([i.owner and
i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]])
for c, idx in node.outputs[0].clients]):
c.op == tensor.join and
all([i.owner and
i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]])
for c, idx in node.outputs[0].clients]):
# if all clients are joins whose other inputs are on gpu or alloc
replace = True
if replace and node.inputs[0].dtype != 'float32':
......@@ -1424,15 +1430,15 @@ def local_gpu_eye(node):
eye(host_from_gpu) -> host_from_gpu(gpueye)
"""
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, tensor.Eye) and
host_input.owner.op.dtype == "float32"):
return [gpu_eye(*host_input.owner.inputs)]
if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]):
return [host_from_gpu(gpu_eye(*node.inputs))]
return False
......@@ -1507,14 +1513,18 @@ def local_gpu_extract_diagonal(node):
extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
"""
from theano.sandbox import linalg
global linalg
if linalg is None:
from theano.sandbox import linalg
linalg = theano.sandbox.linalg
if (isinstance(node.op, linalg.ops.ExtractDiag) and
isinstance(node.inputs[0].type,
theano.tensor.TensorType)):
inp = node.inputs[0]
if inp.owner and isinstance(inp.owner.op, HostFromGpu):
return [host_from_gpu(linalg.extract_diag(gpu_from_host(inp)))]
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, linalg.ops.ExtractDiag) and
......@@ -1535,7 +1545,7 @@ def gpuScanOptimization(node):
"""
#gpu_from_host(scan) -> GPUscan(gpu_from_host)
if node.op == gpu_from_host:
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and
......@@ -1596,8 +1606,8 @@ def gpuScanOptimization(node):
#scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan
and not node.op.info['gpu']):
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]):
thescan = node.op
info = copy.deepcopy(thescan.info)
......
......@@ -1190,32 +1190,31 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip=True):
# it also might be the case that there is a dimshuffle between the +
# and the dot22. local_dot_to_dot22 in particular will put in such things.
if M.owner and isinstance(M.owner.op, T.DimShuffle):
if (M.owner and isinstance(M.owner.op, T.DimShuffle) and
M.owner.inputs[0].owner and
isinstance(M.owner.inputs[0].owner.op, Dot22)):
MM = M.owner.inputs[0]
if tuple(M.owner.op.new_order) == (0,):
if M.owner.op.new_order == (0,):
# it is making a column MM into a vector
if MM.owner and MM.owner.op == _dot22:
MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle(0, 'x'),
alpha, MMl, MMr, beta)
rval = [g.dimshuffle(0)]
return rval, MM
if tuple(M.owner.op.new_order) == (1,):
MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle(0, 'x'),
alpha, MMl, MMr, beta)
rval = [g.dimshuffle(0)]
return rval, MM
if M.owner.op.new_order == (1,):
# it is making a row MM into a vector
if MM.owner and MM.owner.op == _dot22:
MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle('x', 0),
alpha, MMl, MMr, beta)
rval = [g.dimshuffle(1)]
return rval, MM
if tuple(M.owner.op.new_order) == ():
MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle('x', 0),
alpha, MMl, MMr, beta)
rval = [g.dimshuffle(1)]
return rval, MM
if len(M.owner.op.new_order) == 0:
# it is making a 1x1 matrix MM into a scalar
if MM.owner and MM.owner.op == _dot22:
MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle('x', 'x'),
alpha, MMl, MMr, beta)
rval = [g.dimshuffle()]
return rval, MM
MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle('x', 'x'),
alpha, MMl, MMr, beta)
rval = [g.dimshuffle()]
return rval, MM
# this is False'd out because of inadequate testing.
# TODO see ticket #237
......@@ -1379,29 +1378,31 @@ def _gemm_from_factored_list(lst):
"""Returns None, or a list to replace node.outputs
"""
# Make every pair in list have matching dtypes
# sM can be a tuple of 2 elements or a theano variable.
# We should not use __len__ as theano variables don't support
# it. I don't want to change this to isinstance(sM, tuple)
# as I'm not able to make a test that triggers this case.
def is_pair(sM):
try:
s, M = sM
return True
except Exception:
return False
lst2 = []
# Remove the tuple that can't be cast correctly.
# This can happen when we try to cast a complex to a real
for sM in lst:
if is_pair(sM):
# Make every pair in list have matching dtypes
# sM can be a tuple of 2 elements or a theano variable.
if isinstance(sM, tuple):
sm0, sm1 = sM
sm0 = T.as_tensor_variable(sm0)
if theano.scalar.upcast(sm0.dtype, sm1.dtype) == sm1.dtype:
lst2.append((T.cast(sm0, sm1.dtype), sM[1]))
lst = lst2
def item_to_var(t):
try:
s, M = t
except Exception:
return t
if s == 1:
return M
if s == -1:
return -M
return s * M
# Try every pair in the sM_list, trying to turn it into a gemm operation
for i in xrange(len(lst) - 1):
s_i, M_i = lst[i]
......@@ -1418,16 +1419,6 @@ def _gemm_from_factored_list(lst):
s_j, M_j)
#print 'GOT IT', gemm_of_sM_list
if gemm_of_sM_list:
def item_to_var(t):
try:
s, M = t
except Exception:
return t
if s == 1:
return M
if s == -1:
return -M
return s * M
assert len(gemm_of_sM_list) == 1
add_inputs = [item_to_var(input)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论