提交 e93c61d1 authored 作者: abergeron's avatar abergeron

Merge pull request #1779 from nouiz/faster_opt

Faster opt
...@@ -1047,10 +1047,6 @@ class PatternSub(LocalOptimizer): ...@@ -1047,10 +1047,6 @@ class PatternSub(LocalOptimizer):
self.__name__ = name self.__name__ = name
self.pdb = pdb self.pdb = pdb
def skip_identities(self, expr):
if self.skip_identities_fn:
return self.skip_identities_fn(expr)
def op_key(self): def op_key(self):
return self.op return self.op
...@@ -1064,10 +1060,13 @@ class PatternSub(LocalOptimizer): ...@@ -1064,10 +1060,13 @@ class PatternSub(LocalOptimizer):
""" """
if node.op != self.op: if node.op != self.op:
return False return False
#TODO: if we remove pdb, does this speed things up?
def match(pattern, expr, u, allow_multiple_clients=False, pdb=False): def match(pattern, expr, u, allow_multiple_clients=False, pdb=False):
#TODO move outside match
def retry_with_equiv(): def retry_with_equiv():
expr_equiv = self.skip_identities(expr) if not self.skip_identities_fn:
return False
expr_equiv = self.skip_identities_fn(expr)
if expr_equiv is None: if expr_equiv is None:
return False return False
#TODO: Not sure how to handle multiple_clients flag #TODO: Not sure how to handle multiple_clients flag
...@@ -1126,6 +1125,9 @@ class PatternSub(LocalOptimizer): ...@@ -1126,6 +1125,9 @@ class PatternSub(LocalOptimizer):
pdb.set_trace() pdb.set_trace()
return u return u
u = match(self.in_pattern, node.out, unify.Unification(), True,
self.pdb)
if u:
def build(pattern, u): def build(pattern, u):
if isinstance(pattern, (list, tuple)): if isinstance(pattern, (list, tuple)):
args = [build(p, u) for p in pattern[1:]] args = [build(p, u) for p in pattern[1:]]
...@@ -1136,9 +1138,6 @@ class PatternSub(LocalOptimizer): ...@@ -1136,9 +1138,6 @@ class PatternSub(LocalOptimizer):
return pattern return pattern
else: else:
return pattern.clone() return pattern.clone()
u = match(self.in_pattern, node.out, unify.Unification(), True,
self.pdb)
if u:
p = self.out_pattern p = self.out_pattern
new = build(p, u) new = build(p, u)
####print "PatternSub matched:", new ####print "PatternSub matched:", new
...@@ -1520,19 +1519,23 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1520,19 +1519,23 @@ class EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self, def __init__(self,
optimizers, optimizers,
failure_callback=None, failure_callback=None,
ignore_newtrees=True,
max_use_ratio=None): max_use_ratio=None):
""" """ Apply optimizations until equilibrium point.
:param optimizers: list or set of local or global optimizations to :param optimizers: list or set of local or global optimizations to
apply until equilibrium. apply until equilibrium.
:param max_use_ratio: each optimizer can be applied at most :param max_use_ratio: each optimizer can be applied at most
(size of graph * this number) times (size of graph * this number) times
:param ignore_newtrees: See EquilibriumDB ignore_newtrees
parameter definition
""" """
super(EquilibriumOptimizer, self).__init__( super(EquilibriumOptimizer, self).__init__(
None, None,
ignore_newtrees=True, ignore_newtrees=ignore_newtrees,
failure_callback=failure_callback) failure_callback=failure_callback)
self.local_optimizers_map = dict() self.local_optimizers_map = dict()
self.local_optimizers_all = [] self.local_optimizers_all = []
......
...@@ -179,22 +179,32 @@ class Query(object): ...@@ -179,22 +179,32 @@ class Query(object):
class EquilibriumDB(DB): class EquilibriumDB(DB):
""" A set of potential optimizations which should be applied in an """A set of potential optimizations which should be applied in an
arbitrary order until equilibrium is reached. arbitrary order until equilibrium is reached.
Canonicalize, Stabilize, and Specialize are all equilibrium optimizations. Canonicalize, Stabilize, and Specialize are all equilibrium optimizations.
:param ignore_newtrees: If False, we will apply local opts on new
nodes introduced during local optimization application. This
could result in fewer fgraph iterations, but this doesn't mean it
will be faster globally.
.. note:: .. note::
We can put LocalOptimizer and Optimizer as EquilibriumOptimizer We can put LocalOptimizer and Optimizer as EquilibriumOptimizer
supports both. supports both.
""" """
def __init__(self, ignore_newtrees=True):
super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees
def query(self, *tags, **kwtags): def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags) opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts, return opt.EquilibriumOptimizer(
opts,
max_use_ratio=config.optdb.max_use_ratio, max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace) failure_callback=opt.NavigatorOptimizer.warn_inplace)
......
...@@ -18,7 +18,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, ...@@ -18,7 +18,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
device_properties, gpu_eye, device_properties, gpu_eye,
gpu_from_host, host_from_gpu, HostFromGpu, gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten, GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
GpuSubtensor, GpuAdvancedSubtensor1, GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
...@@ -42,10 +42,14 @@ from theano.sandbox.cuda.elemwise import erfinv_gpu ...@@ -42,10 +42,14 @@ from theano.sandbox.cuda.elemwise import erfinv_gpu
from theano.sandbox.cuda.var import CudaNdarrayConstant from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix from theano.tensor.blas import _is_real_vector, _is_real_matrix
linalg = None
#optdb.print_summary() # shows what is currently registered #optdb.print_summary() # shows what is currently registered
gpu_optimizer = EquilibriumDB() #ignore_newtrees is to speed the optimization as this is the pattern
#we use for optimization. Otherwise, we can iterate 100s of times on
#the graph and apply only a few optimizations each time.
gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1, gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
...@@ -65,6 +69,9 @@ optdb.register('gpu_after_fusion', ...@@ -65,6 +69,9 @@ optdb.register('gpu_after_fusion',
optdb.__position__.get('elemwise_fusion', 49) + .1, optdb.__position__.get('elemwise_fusion', 49) + .1,
'gpu') 'gpu')
## Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer, 'fast_run')
def register_opt(*tags, **kwargs): def register_opt(*tags, **kwargs):
def f(local_opt): def f(local_opt):
...@@ -76,6 +83,8 @@ def register_opt(*tags, **kwargs): ...@@ -76,6 +83,8 @@ def register_opt(*tags, **kwargs):
#register local_track_shape_i at this level too #register local_track_shape_i at this level too
#to make multi-level lift of shape work. #to make multi-level lift of shape work.
register_opt()(theano.tensor.opt.local_track_shape_i) register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding)
class InputToGpuOptimizer(Optimizer): class InputToGpuOptimizer(Optimizer):
...@@ -128,7 +137,7 @@ def local_cut_gpu_host_gpu(node): ...@@ -128,7 +137,7 @@ def local_cut_gpu_host_gpu(node):
return [node.inputs[0].owner.inputs[0]] return [node.inputs[0].owner.inputs[0]]
return False return False
gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu, gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu,
'fast_run', 'inplace', 'gpu') 'fast_run', 'gpu')
gpu_cut_copies.register('cut_gpu_constant_transfers', gpu_cut_copies.register('cut_gpu_constant_transfers',
tensor.opt.constant_folding, tensor.opt.constant_folding,
'fast_run', 'gpu') 'fast_run', 'gpu')
...@@ -176,10 +185,10 @@ def local_gpu_elemwise_0(node): ...@@ -176,10 +185,10 @@ def local_gpu_elemwise_0(node):
""" """
if (isinstance(node.op, tensor.Elemwise) and if (isinstance(node.op, tensor.Elemwise) and
dtype_in_elemwise_supported(node.op)): dtype_in_elemwise_supported(node.op)):
if numpy.any([i.owner and if any([i.owner and
isinstance(i.owner.op, HostFromGpu) isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]): for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]): if all([o.type.dtype == 'float32' for o in node.outputs]):
# Don't set any inplace pattern. # Don't set any inplace pattern.
# gpu_inplace_elemwise_optimizer will do it later # gpu_inplace_elemwise_optimizer will do it later
...@@ -196,13 +205,13 @@ def local_gpu_elemwise_0(node): ...@@ -196,13 +205,13 @@ def local_gpu_elemwise_0(node):
upcastable = set(['float32', 'int8', 'int16', 'uint8', upcastable = set(['float32', 'int8', 'int16', 'uint8',
'uint16']) 'uint16'])
# case 1 - all inputs are already float32 # case 1 - all inputs are already float32
if numpy.all([i.type.dtype == 'float32' for i in node.inputs]): if all([i.type.dtype == 'float32' for i in node.inputs]):
#TODO: change this when fusion makes Elemwise with multiple #TODO: change this when fusion makes Elemwise with multiple
# outputs # outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs)) for i in node.inputs))
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
elif numpy.all([i.type.dtype in upcastable elif all([i.type.dtype in upcastable
for i in node.inputs]): for i in node.inputs]):
# second - establish that a new node with upcasted inputs # second - establish that a new node with upcasted inputs
# has the same outputs types as the original node # has the same outputs types as the original node
...@@ -233,7 +242,7 @@ def local_gpu_elemwise_1(node): ...@@ -233,7 +242,7 @@ def local_gpu_elemwise_1(node):
""" """
gpu_from_host(Elemwise) -> GpuElemwise(gpu_from_host(...)) gpu_from_host(Elemwise) -> GpuElemwise(gpu_from_host(...))
""" """
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_i, = node.inputs host_i, = node.inputs
if (host_i.owner and if (host_i.owner and
isinstance(host_i.owner.op, tensor.Elemwise) and isinstance(host_i.owner.op, tensor.Elemwise) and
...@@ -277,7 +286,7 @@ def local_gpu_dimshuffle_0(node): ...@@ -277,7 +286,7 @@ def local_gpu_dimshuffle_0(node):
new_op = GpuDimShuffle(node.op.input_broadcastable, new_op = GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
return [host_from_gpu(new_op(gpu_from_host(input)))] return [host_from_gpu(new_op(gpu_from_host(input)))]
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
tensor.DimShuffle): tensor.DimShuffle):
...@@ -300,7 +309,7 @@ def local_gpu_specifyShape_0(node): ...@@ -300,7 +309,7 @@ def local_gpu_specifyShape_0(node):
if input.owner and isinstance(input.owner.op, HostFromGpu): if input.owner and isinstance(input.owner.op, HostFromGpu):
return [host_from_gpu(tensor.specify_shape(gpu_from_host(input), return [host_from_gpu(tensor.specify_shape(gpu_from_host(input),
*node.inputs[1:]))] *node.inputs[1:]))]
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
tensor.SpecifyShape): tensor.SpecifyShape):
...@@ -327,7 +336,7 @@ def local_gpu_dot_to_dot22(node): ...@@ -327,7 +336,7 @@ def local_gpu_dot_to_dot22(node):
# In case the dot does input upcast, we must check that we can # In case the dot does input upcast, we must check that we can
# make it run on the gpu. # make it run on the gpu.
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
if node.outputs[0].type.dtype != 'float32': if node.outputs[0].type.dtype != 'float32':
return False return False
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -352,7 +361,7 @@ def local_gpu_dot_to_dot22(node): ...@@ -352,7 +361,7 @@ def local_gpu_dot_to_dot22(node):
if node.op == tensor.basic.dot: if node.op == tensor.basic.dot:
if node.outputs[0].type.dtype != 'float32': if node.outputs[0].type.dtype != 'float32':
return False return False
if numpy.any([(i.owner and i.owner.op == host_from_gpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]): for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
...@@ -386,8 +395,8 @@ def local_gpu_lazy_ifelse(node): ...@@ -386,8 +395,8 @@ def local_gpu_lazy_ifelse(node):
gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True) gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True)
outs_clients = reduce(list.__add__, outs_clients = reduce(list.__add__,
[out.clients for out in node.outputs]) [out.clients for out in node.outputs])
if numpy.any([(i.owner and i.owner.op == host_from_gpu) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]) or numpy.any( for i in node.inputs]) or any(
[c != 'output' and c.op == gpu_from_host for c, idx [c != 'output' and c.op == gpu_from_host for c, idx
in outs_clients]): in outs_clients]):
...@@ -403,7 +412,7 @@ def local_gpu_lazy_ifelse(node): ...@@ -403,7 +412,7 @@ def local_gpu_lazy_ifelse(node):
return [host_from_gpu(out) for out in return [host_from_gpu(out) for out in
gpu_ifelse.make_node(c, *outs).outputs] gpu_ifelse.make_node(c, *outs).outputs]
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, theano.ifelse.IfElse) and isinstance(host_input.owner.op, theano.ifelse.IfElse) and
...@@ -440,13 +449,14 @@ def local_gpu_dot22(node): ...@@ -440,13 +449,14 @@ def local_gpu_dot22(node):
dot(host_from_gpu) -> host_from_gpu(gpudot22) dot(host_from_gpu) -> host_from_gpu(gpudot22)
""" """
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op == tensor.blas._dot22: if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Dot22):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))] return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))]
if node.op == tensor.blas._dot22: if isinstance(node.op, tensor.blas.Dot22):
if numpy.any([(i.owner and i.owner.op == host_from_gpu) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
return [host_from_gpu(gpu_dot22(gpu_from_host(x), return [host_from_gpu(gpu_dot22(gpu_from_host(x),
...@@ -462,15 +472,16 @@ def local_gpu_dot22scalar(node): ...@@ -462,15 +472,16 @@ def local_gpu_dot22scalar(node):
dot(host_from_gpu) -> host_from_gpu(gpudot22scalar) dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
""" """
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
host_input.owner.op == tensor.blas._dot22scalar): isinstance(host_input.owner.op,
tensor.blas.Dot22Scalar)):
x, y, scalar = host_input.owner.inputs x, y, scalar = host_input.owner.inputs
return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y), return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y),
tensor.blas._as_scalar(scalar))] tensor.blas._as_scalar(scalar))]
if node.op == tensor.blas._dot22scalar: if isinstance(node.op, tensor.blas.Dot22Scalar):
if numpy.any([(i.owner and i.owner.op == host_from_gpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]): for i in node.inputs]):
x, y, scalar = node.inputs x, y, scalar = node.inputs
return [host_from_gpu( return [host_from_gpu(
...@@ -488,31 +499,28 @@ def local_gpu_gemv(node): ...@@ -488,31 +499,28 @@ def local_gpu_gemv(node):
gemv(host_from_gpu) -> host_from_gpu(gpu_gemv) gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
""" """
gemvs = { gemvs = (tensor.blas.Gemv,
tensor.blas.gemv_inplace: gpu_gemv_no_inplace, tensor.blas_c.CGemv,
tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace, )
tensor.blas_c.CGemv(inplace=True): gpu_gemv_no_inplace, if isinstance(node.op, GpuFromHost):
tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
}
if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemvs: if host_input.owner and isinstance(host_input.owner.op, gemvs):
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [gemvs[op]( return [gpu_gemv_no_inplace(
gpu_from_host(z), gpu_from_host(z),
a, a,
gpu_from_host(x), gpu_from_host(x),
gpu_from_host(y), gpu_from_host(y),
b)] b)]
if node.op in gemvs: if isinstance(node.op, gemvs):
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu( return [host_from_gpu(
gemvs[node.op]( gpu_gemv_no_inplace(
gpu_from_host(z), gpu_from_host(z),
a, a,
gpu_from_host(x), gpu_from_host(x),
...@@ -530,33 +538,30 @@ def local_gpu_ger(node): ...@@ -530,33 +538,30 @@ def local_gpu_ger(node):
ger(host_from_gpu) -> host_from_gpu(gpu_ger) ger(host_from_gpu) -> host_from_gpu(gpu_ger)
""" """
gers = { gers = (tensor.blas_c.CGer,
tensor.blas_c.CGer(destructive=True): gpu_ger_no_inplace, tensor.blas.Ger,
tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace, tensor.blas_scipy.ScipyGer,
tensor.blas.Ger(destructive=True): gpu_ger_no_inplace, )
tensor.blas.Ger(destructive=False): gpu_ger_no_inplace,
tensor.blas_scipy.ScipyGer(destructive=True): gpu_ger_no_inplace, if isinstance(node.op, GpuFromHost):
tensor.blas_scipy.ScipyGer(destructive=False): gpu_ger_no_inplace,
}
if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gers: if host_input.owner and isinstance(host_input.owner.op, gers):
op = host_input.owner.op op = host_input.owner.op
z, a, x, y = host_input.owner.inputs z, a, x, y = host_input.owner.inputs
return [gers[op]( return [gpu_ger_no_inplace(
gpu_from_host(z), gpu_from_host(z),
a, a,
gpu_from_host(x), gpu_from_host(x),
gpu_from_host(y) gpu_from_host(y)
)] )]
if node.op in gers: if isinstance(node.op, gers):
z, a, x, y = node.inputs z, a, x, y = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu( return [host_from_gpu(
gers[node.op]( gpu_ger_no_inplace(
gpu_from_host(z), gpu_from_host(z),
a, a,
gpu_from_host(x), gpu_from_host(x),
...@@ -573,26 +578,24 @@ def local_gpu_gemm(node): ...@@ -573,26 +578,24 @@ def local_gpu_gemm(node):
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm) gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
""" """
gemms = { if isinstance(node.op, GpuFromHost):
#tensor.blas.gemm_inplace: gpu_gemm_inplace,
tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemms: if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Gemm):
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [gemms[op](gpu_from_host(z), return [gpu_gemm_no_inplace(gpu_from_host(z),
a, a,
gpu_from_host(x), gpu_from_host(x),
gpu_from_host(y), gpu_from_host(y),
b)] b)]
if node.op in gemms: if isinstance(node.op, tensor.blas.Gemm):
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gemms[node.op](gpu_from_host(z), return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
a, a,
gpu_from_host(x), gpu_from_host(x),
gpu_from_host(y), gpu_from_host(y),
...@@ -613,9 +616,10 @@ def local_gpu_careduce(node): ...@@ -613,9 +616,10 @@ def local_gpu_careduce(node):
scalar_op = node.op.scalar_op scalar_op = node.op.scalar_op
# currently, only these ops are supported at all, # currently, only these ops are supported at all,
# and max does not support all combinations of axes # and max does not support all combinations of axes
if node.op.scalar_op in [scal.add, scal.mul, scal.maximum, scal.minimum]: if isinstance(node.op.scalar_op, (scal.Add, scal.Mul,
scal.Maximum, scal.Minimum)):
x, = node.inputs x, = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
if node.op.axis is None: if node.op.axis is None:
reduce_mask = [1] * x.type.ndim reduce_mask = [1] * x.type.ndim
else: else:
...@@ -685,7 +689,7 @@ def local_gpu_careduce(node): ...@@ -685,7 +689,7 @@ def local_gpu_careduce(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.Reshape]) @local_optimizer([gpu_from_host, tensor.Reshape])
def local_gpu_reshape(node): def local_gpu_reshape(node):
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and \ if host_input.owner and \
isinstance(host_input.owner.op, tensor.Reshape): isinstance(host_input.owner.op, tensor.Reshape):
...@@ -702,7 +706,7 @@ def local_gpu_reshape(node): ...@@ -702,7 +706,7 @@ def local_gpu_reshape(node):
return [gpu_reshape] return [gpu_reshape]
if isinstance(node.op, tensor.Reshape): if isinstance(node.op, tensor.Reshape):
x, shp = node.inputs x, shp = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp) gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable: if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
...@@ -719,7 +723,7 @@ def local_gpu_reshape(node): ...@@ -719,7 +723,7 @@ def local_gpu_reshape(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.Flatten]) @local_optimizer([gpu_from_host, tensor.Flatten])
def local_gpu_flatten(node): def local_gpu_flatten(node):
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and \ if host_input.owner and \
isinstance(host_input.owner.op, tensor.Flatten): isinstance(host_input.owner.op, tensor.Flatten):
...@@ -729,7 +733,7 @@ def local_gpu_flatten(node): ...@@ -729,7 +733,7 @@ def local_gpu_flatten(node):
if isinstance(node.op, tensor.Flatten): if isinstance(node.op, tensor.Flatten):
x, = node.inputs x, = node.inputs
outdim = node.op.outdim outdim = node.op.outdim
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuFlatten(outdim)(gpu_x))] return [host_from_gpu(GpuFlatten(outdim)(gpu_x))]
return False return False
...@@ -738,7 +742,7 @@ def local_gpu_flatten(node): ...@@ -738,7 +742,7 @@ def local_gpu_flatten(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.Subtensor]) @local_optimizer([gpu_from_host, tensor.Subtensor])
def local_gpu_subtensor(node): def local_gpu_subtensor(node):
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and \ if host_input.owner and \
isinstance(host_input.owner.op, tensor.Subtensor): isinstance(host_input.owner.op, tensor.Subtensor):
...@@ -748,9 +752,11 @@ def local_gpu_subtensor(node): ...@@ -748,9 +752,11 @@ def local_gpu_subtensor(node):
return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)] return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] if (x.owner and
if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32": isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
coords = node.inputs[1:]
return [host_from_gpu(GpuSubtensor( return [host_from_gpu(GpuSubtensor(
node.op.idx_list)(gpu_x, *coords))] node.op.idx_list)(gpu_x, *coords))]
return False return False
...@@ -759,7 +765,7 @@ def local_gpu_subtensor(node): ...@@ -759,7 +765,7 @@ def local_gpu_subtensor(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.AdvancedSubtensor1]) @local_optimizer([gpu_from_host, tensor.AdvancedSubtensor1])
def local_gpu_advanced_subtensor1(node): def local_gpu_advanced_subtensor1(node):
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and \ if host_input.owner and \
host_input.owner.op.__class__ is tensor.AdvancedSubtensor1: host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
...@@ -769,7 +775,7 @@ def local_gpu_advanced_subtensor1(node): ...@@ -769,7 +775,7 @@ def local_gpu_advanced_subtensor1(node):
if node.op.__class__ is tensor.AdvancedSubtensor1: if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32": if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32":
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))] return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
return False return False
...@@ -778,7 +784,7 @@ def local_gpu_advanced_subtensor1(node): ...@@ -778,7 +784,7 @@ def local_gpu_advanced_subtensor1(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.AdvancedIncSubtensor1]) @local_optimizer([gpu_from_host, tensor.AdvancedIncSubtensor1])
def local_gpu_advanced_incsubtensor1(node): def local_gpu_advanced_incsubtensor1(node):
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if host_input.owner and \ if host_input.owner and \
...@@ -813,12 +819,12 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -813,12 +819,12 @@ def local_gpu_advanced_incsubtensor1(node):
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
coords = node.inputs[2:] coords = node.inputs[2:]
go_gpu = False go_gpu = False
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
go_gpu = True go_gpu = True
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
else: else:
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
if y.owner and y.owner.op == host_from_gpu: if y.owner and isinstance(y.owner.op, HostFromGpu):
go_gpu = True go_gpu = True
gpu_y, = y.owner.inputs gpu_y, = y.owner.inputs
else: else:
...@@ -852,7 +858,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -852,7 +858,7 @@ def local_gpu_advanced_incsubtensor1(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.IncSubtensor]) @local_optimizer([gpu_from_host, tensor.IncSubtensor])
def local_gpu_incsubtensor(node): def local_gpu_incsubtensor(node):
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_output = node.inputs[0] host_output = node.inputs[0]
if host_output.owner and \ if host_output.owner and \
type(host_output.owner.op) == tensor.IncSubtensor: type(host_output.owner.op) == tensor.IncSubtensor:
...@@ -876,12 +882,12 @@ def local_gpu_incsubtensor(node): ...@@ -876,12 +882,12 @@ def local_gpu_incsubtensor(node):
assert isinstance(y.type, tensor.TensorType) assert isinstance(y.type, tensor.TensorType)
coords = node.inputs[2:] coords = node.inputs[2:]
go_gpu = False go_gpu = False
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
go_gpu = True go_gpu = True
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
else: else:
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
if y.owner and y.owner.op == host_from_gpu: if y.owner and isinstance(y.owner.op, HostFromGpu):
go_gpu = True go_gpu = True
gpu_y, = y.owner.inputs gpu_y, = y.owner.inputs
else: else:
...@@ -901,7 +907,7 @@ def local_gpu_incsubtensor(node): ...@@ -901,7 +907,7 @@ def local_gpu_incsubtensor(node):
def local_gpu_shape(node): def local_gpu_shape(node):
if isinstance(node.op, tensor.Shape): if isinstance(node.op, tensor.Shape):
x, = node.inputs x, = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [gpu_shape(gpu_x)] return [gpu_shape(gpu_x)]
return False return False
...@@ -913,7 +919,7 @@ def local_gpu_rebroadcast(node): ...@@ -913,7 +919,7 @@ def local_gpu_rebroadcast(node):
'''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))''' '''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))'''
if isinstance(node.op, tensor.Rebroadcast): if isinstance(node.op, tensor.Rebroadcast):
x, = node.inputs x, = node.inputs
if (x.owner and x.owner.op == host_from_gpu): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
return [host_from_gpu(node.op(gpu_x))] return [host_from_gpu(node.op(gpu_x))]
...@@ -927,7 +933,7 @@ def gpu_print_wrapper(op, cnda): ...@@ -927,7 +933,7 @@ def gpu_print_wrapper(op, cnda):
def local_gpu_print_op(node): def local_gpu_print_op(node):
if isinstance(node.op, tensor.printing.Print): if isinstance(node.op, tensor.printing.Print):
x, = node.inputs x, = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = node.op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op new_op.old_op = node.op
...@@ -948,7 +954,7 @@ import theano.tensor.nnet ...@@ -948,7 +954,7 @@ import theano.tensor.nnet
def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias): if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
x, b, y = node.inputs x, b, y = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
# if y is a cast to integers, we can go to the underlying # if y is a cast to integers, we can go to the underlying
# thing if we want, since this gpu op will cast to integers # thing if we want, since this gpu op will cast to integers
...@@ -978,7 +984,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -978,7 +984,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node): def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx): if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
dnll, sm, yidx = node.inputs dnll, sm, yidx = node.inputs
if sm.owner and sm.owner.op == host_from_gpu: if sm.owner and isinstance(sm.owner.op, HostFromGpu):
gpu_sm, = sm.owner.inputs gpu_sm, = sm.owner.inputs
gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()( gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
gpu_from_host(dnll), gpu_from_host(dnll),
...@@ -993,7 +999,7 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node): ...@@ -993,7 +999,7 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
def local_gpu_softmax(node): def local_gpu_softmax(node):
if isinstance(node.op, tensor.nnet.Softmax): if isinstance(node.op, tensor.nnet.Softmax):
x, = node.inputs x, = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
gpu_sm = GpuSoftmax()(gpu_x) gpu_sm = GpuSoftmax()(gpu_x)
return [host_from_gpu(gpu_sm)] return [host_from_gpu(gpu_sm)]
...@@ -1005,8 +1011,8 @@ def local_gpu_softmax(node): ...@@ -1005,8 +1011,8 @@ def local_gpu_softmax(node):
def local_gpu_softmax_with_bias(node): def local_gpu_softmax_with_bias(node):
if isinstance(node.op, tensor.nnet.SoftmaxWithBias): if isinstance(node.op, tensor.nnet.SoftmaxWithBias):
x, b = node.inputs x, b = node.inputs
x_on_gpu = x.owner and x.owner.op == host_from_gpu x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
b_on_gpu = b.owner and b.owner.op == host_from_gpu b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
if x_on_gpu or b_on_gpu: if x_on_gpu or b_on_gpu:
gpu_sm = GpuSoftmaxWithBias()(gpu_from_host(x), gpu_from_host(b)) gpu_sm = GpuSoftmaxWithBias()(gpu_from_host(x), gpu_from_host(b))
return [host_from_gpu(gpu_sm)] return [host_from_gpu(gpu_sm)]
...@@ -1078,7 +1084,7 @@ def local_gpu_conv(node): ...@@ -1078,7 +1084,7 @@ def local_gpu_conv(node):
atol = 3e-5 atol = 3e-5
return CudaNdarrayType.values_eq_approx(a, b, atol=atol) return CudaNdarrayType.values_eq_approx(a, b, atol=atol)
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
#gpu_from_host(conv) -> gpu_conv(gpu_from_host) #gpu_from_host(conv) -> gpu_conv(gpu_from_host)
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, conv.ConvOp): if host_input.owner and isinstance(host_input.owner.op, conv.ConvOp):
...@@ -1098,8 +1104,8 @@ def local_gpu_conv(node): ...@@ -1098,8 +1104,8 @@ def local_gpu_conv(node):
if isinstance(node.op, conv.ConvOp): if isinstance(node.op, conv.ConvOp):
#conv(host_from_gpu) -> host_from_gpu(gpu_conv) #conv(host_from_gpu) -> host_from_gpu(gpu_conv)
img, kern = node.inputs img, kern = node.inputs
img_on_gpu = (img.owner and img.owner.op == host_from_gpu) img_on_gpu = (img.owner and isinstance(img.owner.op, HostFromGpu))
kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu) kern_on_gpu = (kern.owner and isinstance(kern.owner.op, HostFromGpu))
if img_on_gpu or kern_on_gpu: if img_on_gpu or kern_on_gpu:
gpu_conv = GpuConvOp_from_ConvOp(node.op) gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None: if gpu_conv is None:
...@@ -1122,7 +1128,7 @@ import theano.tensor.signal.downsample as downsample ...@@ -1122,7 +1128,7 @@ import theano.tensor.signal.downsample as downsample
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
if isinstance(node.op, downsample.DownsampleFactorMax): if isinstance(node.op, downsample.DownsampleFactorMax):
x, = node.inputs x, = node.inputs
if (x.owner and x.owner.op == host_from_gpu): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border) gpu_ds = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border)
return [host_from_gpu(gpu_ds(x.owner.inputs[0]))] return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
...@@ -1132,7 +1138,7 @@ def local_gpu_downsample_factor_max(node): ...@@ -1132,7 +1138,7 @@ def local_gpu_downsample_factor_max(node):
def local_gpu_downsample_factor_max_grad(node): def local_gpu_downsample_factor_max_grad(node):
if isinstance(node.op, downsample.DownsampleFactorMaxGrad): if isinstance(node.op, downsample.DownsampleFactorMaxGrad):
x, z, gz = node.inputs x, z, gz = node.inputs
if (x.owner and x.owner.op == host_from_gpu): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds_grad = GpuDownsampleFactorMaxGrad(node.op.ds, gpu_ds_grad = GpuDownsampleFactorMaxGrad(node.op.ds,
node.op.ignore_border) node.op.ignore_border)
return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0], return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
...@@ -1184,12 +1190,12 @@ def local_gpu_join(node): ...@@ -1184,12 +1190,12 @@ def local_gpu_join(node):
#print "OPT: axis_and_tensors=", axis_and_tensors #print "OPT: axis_and_tensors=", axis_and_tensors
matches = [(not t.owner is None and t.owner.op == host_from_gpu) or matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or
isinstance(t, gof.Constant) for t in axis_and_tensors[1:]] isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
#print "OPT: matches =", matches #print "OPT: matches =", matches
# if all input tensors are host_from_gpu'ified # if all input tensors are host_from_gpu'ified
if numpy.all(matches): if all(matches):
# the extra gpu_from_host introduced here will # the extra gpu_from_host introduced here will
# be removed by further optimizations # be removed by further optimizations
new_tensors = [gpu_from_host(t) for t in axis_and_tensors[1:]] new_tensors = [gpu_from_host(t) for t in axis_and_tensors[1:]]
...@@ -1363,7 +1369,7 @@ def local_gpualloc(node): ...@@ -1363,7 +1369,7 @@ def local_gpualloc(node):
replace = False replace = False
if node.op == tensor.alloc: if node.op == tensor.alloc:
if node.inputs[0].owner and \ if node.inputs[0].owner and \
node.inputs[0].owner.op == host_from_gpu: isinstance(node.inputs[0].owner.op, HostFromGpu):
replace = True replace = True
elif all([c != 'output' and c.op == gpu_from_host elif all([c != 'output' and c.op == gpu_from_host
for c, idx in node.outputs[0].clients]): for c, idx in node.outputs[0].clients]):
...@@ -1424,14 +1430,14 @@ def local_gpu_eye(node): ...@@ -1424,14 +1430,14 @@ def local_gpu_eye(node):
eye(host_from_gpu) -> host_from_gpu(gpueye) eye(host_from_gpu) -> host_from_gpu(gpueye)
""" """
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, tensor.Eye) and isinstance(host_input.owner.op, tensor.Eye) and
host_input.owner.op.dtype == "float32"): host_input.owner.op.dtype == "float32"):
return [gpu_eye(*host_input.owner.inputs)] return [gpu_eye(*host_input.owner.inputs)]
if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32": if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
if numpy.any([(i.owner and i.owner.op == host_from_gpu) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
return [host_from_gpu(gpu_eye(*node.inputs))] return [host_from_gpu(gpu_eye(*node.inputs))]
return False return False
...@@ -1507,14 +1513,18 @@ def local_gpu_extract_diagonal(node): ...@@ -1507,14 +1513,18 @@ def local_gpu_extract_diagonal(node):
extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal) extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host) gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
""" """
global linalg
if linalg is None:
from theano.sandbox import linalg from theano.sandbox import linalg
linalg = theano.sandbox.linalg
if (isinstance(node.op, linalg.ops.ExtractDiag) and if (isinstance(node.op, linalg.ops.ExtractDiag) and
isinstance(node.inputs[0].type, isinstance(node.inputs[0].type,
theano.tensor.TensorType)): theano.tensor.TensorType)):
inp = node.inputs[0] inp = node.inputs[0]
if inp.owner and isinstance(inp.owner.op, HostFromGpu): if inp.owner and isinstance(inp.owner.op, HostFromGpu):
return [host_from_gpu(linalg.extract_diag(gpu_from_host(inp)))] return [host_from_gpu(linalg.extract_diag(gpu_from_host(inp)))]
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, linalg.ops.ExtractDiag) and isinstance(host_input.owner.op, linalg.ops.ExtractDiag) and
...@@ -1535,7 +1545,7 @@ def gpuScanOptimization(node): ...@@ -1535,7 +1545,7 @@ def gpuScanOptimization(node):
""" """
#gpu_from_host(scan) -> GPUscan(gpu_from_host) #gpu_from_host(scan) -> GPUscan(gpu_from_host)
if node.op == gpu_from_host: if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and isinstance(host_input.owner.op, scan_op.Scan) and
...@@ -1596,7 +1606,7 @@ def gpuScanOptimization(node): ...@@ -1596,7 +1606,7 @@ def gpuScanOptimization(node):
#scan(host_from_gpu) -> host_from_gpu(GPUscan) #scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan if (type(node.op) == scan_op.Scan
and not node.op.info['gpu']): and not node.op.info['gpu']):
if numpy.any([(i.owner and i.owner.op == host_from_gpu) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
thescan = node.op thescan = node.op
......
...@@ -1190,27 +1190,26 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip=True): ...@@ -1190,27 +1190,26 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip=True):
# it also might be the case that there is a dimshuffle between the + # it also might be the case that there is a dimshuffle between the +
# and the dot22. local_dot_to_dot22 in particular will put in such things. # and the dot22. local_dot_to_dot22 in particular will put in such things.
if M.owner and isinstance(M.owner.op, T.DimShuffle): if (M.owner and isinstance(M.owner.op, T.DimShuffle) and
M.owner.inputs[0].owner and
isinstance(M.owner.inputs[0].owner.op, Dot22)):
MM = M.owner.inputs[0] MM = M.owner.inputs[0]
if tuple(M.owner.op.new_order) == (0,): if M.owner.op.new_order == (0,):
# it is making a column MM into a vector # it is making a column MM into a vector
if MM.owner and MM.owner.op == _dot22:
MMl, MMr = MM.owner.inputs MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle(0, 'x'), g = gemm_no_inplace(L.dimshuffle(0, 'x'),
alpha, MMl, MMr, beta) alpha, MMl, MMr, beta)
rval = [g.dimshuffle(0)] rval = [g.dimshuffle(0)]
return rval, MM return rval, MM
if tuple(M.owner.op.new_order) == (1,): if M.owner.op.new_order == (1,):
# it is making a row MM into a vector # it is making a row MM into a vector
if MM.owner and MM.owner.op == _dot22:
MMl, MMr = MM.owner.inputs MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle('x', 0), g = gemm_no_inplace(L.dimshuffle('x', 0),
alpha, MMl, MMr, beta) alpha, MMl, MMr, beta)
rval = [g.dimshuffle(1)] rval = [g.dimshuffle(1)]
return rval, MM return rval, MM
if tuple(M.owner.op.new_order) == (): if len(M.owner.op.new_order) == 0:
# it is making a row MM into a vector # it is making a row MM into a vector
if MM.owner and MM.owner.op == _dot22:
MMl, MMr = MM.owner.inputs MMl, MMr = MM.owner.inputs
g = gemm_no_inplace(L.dimshuffle('x', 'x'), g = gemm_no_inplace(L.dimshuffle('x', 'x'),
alpha, MMl, MMr, beta) alpha, MMl, MMr, beta)
...@@ -1379,29 +1378,31 @@ def _gemm_from_factored_list(lst): ...@@ -1379,29 +1378,31 @@ def _gemm_from_factored_list(lst):
"""Returns None, or a list to replace node.outputs """Returns None, or a list to replace node.outputs
""" """
# Make every pair in list have matching dtypes
# sM can be a tuple of 2 elements or a theano variable.
# We should not use __len__ as theano variables don't support
# it. I don't want to change this to isinstance(sM, tuple)
# as I'm not able to make a test that triggers this case.
def is_pair(sM):
try:
s, M = sM
return True
except Exception:
return False
lst2 = [] lst2 = []
# Remove the tuple that can't be cast correctly. # Remove the tuple that can't be cast correctly.
# This can happen when we try to cast a complex to a real # This can happen when we try to cast a complex to a real
for sM in lst: for sM in lst:
if is_pair(sM): # Make every pair in list have matching dtypes
# sM can be a tuple of 2 elements or a theano variable.
if isinstance(sM, tuple):
sm0, sm1 = sM sm0, sm1 = sM
sm0 = T.as_tensor_variable(sm0) sm0 = T.as_tensor_variable(sm0)
if theano.scalar.upcast(sm0.dtype, sm1.dtype) == sm1.dtype: if theano.scalar.upcast(sm0.dtype, sm1.dtype) == sm1.dtype:
lst2.append((T.cast(sm0, sm1.dtype), sM[1])) lst2.append((T.cast(sm0, sm1.dtype), sM[1]))
lst = lst2 lst = lst2
def item_to_var(t):
try:
s, M = t
except Exception:
return t
if s == 1:
return M
if s == -1:
return -M
return s * M
# Try every pair in the sM_list, trying to turn it into a gemm operation # Try every pair in the sM_list, trying to turn it into a gemm operation
for i in xrange(len(lst) - 1): for i in xrange(len(lst) - 1):
s_i, M_i = lst[i] s_i, M_i = lst[i]
...@@ -1418,16 +1419,6 @@ def _gemm_from_factored_list(lst): ...@@ -1418,16 +1419,6 @@ def _gemm_from_factored_list(lst):
s_j, M_j) s_j, M_j)
#print 'GOT IT', gemm_of_sM_list #print 'GOT IT', gemm_of_sM_list
if gemm_of_sM_list: if gemm_of_sM_list:
def item_to_var(t):
try:
s, M = t
except Exception:
return t
if s == 1:
return M
if s == -1:
return -M
return s * M
assert len(gemm_of_sM_list) == 1 assert len(gemm_of_sM_list) == 1
add_inputs = [item_to_var(input) add_inputs = [item_to_var(input)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论