提交 c887bc14 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix most of opt.py for type context.

Only scan is left.
上级 29615c83
......@@ -3,11 +3,6 @@ import numpy
import logging
from six.moves import xrange
try:
import pygpu
except ImportError:
pass
import theano
from theano import tensor, scalar, gof
from theano.compile import optdb
......@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant
from .type import GpuArrayType, GpuArrayConstant, get_context
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host,
host_from_gpu, GpuToGpu,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
......@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
'unsafe')
def safe_to_gpu(x):
def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType):
return gpu_from_host(x)
return GpuFromHost(ctx_name)(x)
else:
return x
......@@ -102,28 +97,53 @@ def op_lifter(OP, cuda_only=False):
"""
def f(maker):
def local_opt(node):
dev = theano.sandbox.gpuarray.init_dev.device
if cuda_only and not dev.startswith('cuda'):
return
if type(node.op) in OP:
# Either one of our inputs is on the gpu or
# all of our client are on the gpu
if (any([i.owner and i.owner.op == host_from_gpu
for i in node.inputs]) or
all([c != 'output' and c.op == gpu_from_host
for c, idx in node.outputs[0].clients])):
new_op = maker(node)
# This is needed as sometimes new_op inherit from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
return [safe_to_cpu(o) for o in
new_op(*node.inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)]
# all of our clients are on the gpu
replace = False
# TODO: Maybe set context_name with infer_context_name()?
context_name = None
# We replace if any input is a host_from_gpu
for i in node.inputs:
if i.owner and i.owner.op == host_from_gpu:
context_name = i.owner.inputs[0].type.context_name
replace = True
break
if not replace:
# We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients]
replace = len(clients) != 0
for c, idx in clients:
if (c == 'output' or
not isinstance(c.op, GpuFromHost)):
replace = False
# TODO: check that the clients want the same context?
if replace:
# All clients are GpuFromHost and we have at least one
context_name = clients[0][0].op.context_name
# Check if we should replace
if (not replace or
(cuda_only and
get_context(context_name).kind != 'cuda')):
return False
new_op = maker(node, context_name)
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
# tag the inputs with the context in case
# the context was derived from the outputs
def tag(i, ctx):
i.tag.context_name = ctx
return i
inputs = [tag(i, context_name) for i in node.inputs]
return [safe_to_cpu(o) for o in
new_op(*inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)]
return False
local_opt.__name__ = maker.__name__
return local_optimizer(OP)(local_opt)
......@@ -146,35 +166,80 @@ class InputToGpuOptimizer(Optimizer):
if (len(input.clients) == 1 and
(input.clients[0][0] == 'output' or
input.clients[0][0].op == gpu_from_host)):
isinstance(input.clients[0][0].op, GpuFromHost))):
continue
try:
new_input = host_from_gpu(gpu_from_host(input))
ctx = getattr(input.tag, 'context_name', None)
new_input = host_from_gpu(GpuFromHost(ctx)(input))
fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError:
# This could fail if the inputs are not TensorTypes
pass
except ValueError:
# If there is no context tag and no default context
# then it stays on the CPU
assert ctx is None
pass
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([gpu_from_host, host_from_gpu])
def local_cut_gpu_host_gpu(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
return [node.inputs[0].owner.inputs[0]]
if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host):
return [node.inputs[0].owner.inputs[0]]
return False
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu,
@local_optimizer([GpuFromHost, GpuToGpu, host_from_gpu])
def local_cut_gpu_transfers(node):
# gpu[ab] -> host -> gpub
if (isinstance(node.op, GpuFromHost) and
node.inputs[0].owner and
node.inputs[0].owner.op == host_from_gpu):
other = node.inputs[0].owner.inputs[0]
if node.op.context_name == other.type.context_name:
return [other]
else:
return [GpuToGpu(node.op.context_name)(other)]
# ? -> gpua -> host
elif (node.op == host_from_gpu and
node.inputs[0].owner):
n2 = node.inputs[0].owner
# host ->
if isinstance(n2.op, GpuFromHost):
return [n2.inputs[0]]
# gpub ->
if isinstance(n2.op, GpuToGpu):
return [host_from_gpu(n2.inputs[0])]
# ? -> gpua -> gpub
elif isinstance(node.op, GpuToGpu):
# Transfer within same context
if node.inputs[0].type.context_name == node.op.context_name:
return [node.inputs[0]]
if node.inputs[0].owner:
n2 = node.inputs[0].owner
# host ->
if isinstance(n2.op, GpuFromHost):
return [GpuFromHost(node.op.context_name)(n2.inputs[0])]
# gpuc ->
if isinstance(n2.op, GpuToGpu):
if node.op.context_name == n2.inputs[0].type.context_name:
return [n2.inputs[0]]
else:
return [node.op(n2.inputs[0])]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers',
tensor.opt.constant_folding,
'fast_compile', 'fast_run', 'gpuarray')
optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu,
local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'gpuarray')
......@@ -187,6 +252,11 @@ def local_gpuaalloc2(node):
Moves an alloc that is an input to join to the gpu.
"""
try:
get_context(None)
except ValueError:
# If there is no default context then we do not perform the move here.
return
if (isinstance(node.op, tensor.Alloc) and
all(c != 'output' and
c.op == tensor.join and
......@@ -194,23 +264,13 @@ def local_gpuaalloc2(node):
i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:])
for c, idx in node.outputs[0].clients)):
return [host_from_gpu(gpu_alloc(*node.inputs))]
return [host_from_gpu(GpuAlloc(None)(*node.inputs))]
@register_opt('fast_compile')
@op_lifter([tensor.Alloc])
def local_gpuaalloc(node):
new_out = gpu_alloc(*node.inputs)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if new_out.broadcastable != node.outputs[0].broadcastable:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for b_old, b_new in zip(node.outputs[0].broadcastable,
new_out.broadcastable):
assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out,
node.outputs[0].broadcastable)
return (new_out,)
def local_gpuaalloc(node, context_name):
return GpuAlloc(context_name)(*node.inputs)
@register_opt()
......@@ -221,8 +281,8 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs)
return [new_out]
new_op = GpuAlloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)]
@register_opt()
......@@ -240,7 +300,7 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile')
@op_lifter([tensor.Reshape])
def local_gpureshape(node):
def local_gpureshape(node, context_name):
op = node.op
name = op.name
if name:
......@@ -251,14 +311,14 @@ def local_gpureshape(node):
@register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node):
def local_gpu_rebroadcast(node, context_name):
if isinstance(node.inputs[0].owner.op, HostFromGpu):
return node.op(node.inputs[0].owner.inputs[0])
@register_opt('fast_compile')
@op_lifter([tensor.Flatten])
def local_gpuflatten(node):
def local_gpuflatten(node, context_name):
op = node.op
shp = []
if op.outdim != 1:
......@@ -271,7 +331,7 @@ def local_gpuflatten(node):
@register_opt('fast_compile')
@op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node):
def local_gpu_elemwise(node, context_name):
op = node.op
scal_op = op.scalar_op
name = op.name
......@@ -344,28 +404,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile')
@op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node):
def local_gpua_dimshuffle(node, context_name):
return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order)
@register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node):
def local_gpua_specifyShape(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType):
return
inp = [gpu_from_host(node.inputs[0])] + node.inputs[1:]
inp = [GpuFromHost(context_name)(node.inputs[0])] + node.inputs[1:]
return tensor.specify_shape(*inp)
@register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node):
def local_gpua_shape(node, context_name):
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType):
return
return [gpu_from_host(node.inputs[0]).shape]
return [GpuFromHost(context_name)(node.inputs[0]).shape]
def gpu_print_wrapper(op, cnda):
......@@ -374,7 +434,7 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile')
@op_lifter([tensor.printing.Print])
def local_gpu_print_op(node):
def local_gpu_print_op(node, context_name):
x, = node.inputs
gpu_x, = x.owner.inputs
new_op = node.op.__class__(global_fn=gpu_print_wrapper)
......@@ -404,9 +464,13 @@ def local_gpu_pdbbreakpoint_op(node):
input_is_from_gpu = (inp.owner and
isinstance(inp.owner.op, HostFromGpu))
output_goes_to_gpu = any([c[0] != "output" and
isinstance(c[0].op, GpuFromHost)
for c in out.clients])
for c in out.clients:
if c == 'output':
continue
if isinstance(c[0].op, GpuFromHost):
output_goes_to_gpu = True
context_name = c[0].op.context_name
break
if input_is_from_gpu:
# The op should be applied on the GPU version of the input
......@@ -415,7 +479,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu:
# The input should be transfered to the gpu
new_inputs.append(gpu_from_host(inp))
new_inputs.append(GpuFromHost(context_name)(inp))
input_transfered.append(True)
else:
......@@ -447,7 +511,7 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile')
@op_lifter([tensor.Join])
def local_gpua_join(node):
def local_gpua_join(node, context_name):
return gpu_join
......@@ -462,13 +526,13 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile')
@op_lifter([tensor.Split])
def local_gpua_split(node):
def local_gpua_split(node, context_name):
return GpuSplit(node.op.len_splits)
@register_opt('fast_compile')
@op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node):
def local_gpua_subtensor(node, context_name):
x = node.inputs[0]
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0]
......@@ -482,14 +546,14 @@ def local_gpua_subtensor(node):
for n, _ in node.outputs[0].clients]):
return
else:
return [host_from_gpu(gpu_from_host(node.outputs[0]))]
return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))]
return GpuSubtensor(node.op.idx_list)
@register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node):
def local_gpua_incsubtensor(node, context_name):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased)
......@@ -497,16 +561,16 @@ def local_gpua_incsubtensor(node):
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node):
def local_gpua_advanced_subtensor(node, context_name):
return GpuAdvancedSubtensor1()
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node):
def local_gpua_advanced_incsubtensor(node, context_name):
# This optimization is disabled if cuda is not active
if pygpu.get_default_context().kind != "cuda":
# This is disabled on non-cuda contexts
if get_context(context_name).kind != 'cuda':
return None
x, y, ilist = node.inputs
......@@ -535,17 +599,19 @@ def local_gpua_advanced_incsubtensor(node):
@register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
def local_gpua_careduce(node):
def local_gpua_careduce(node, context_name):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device
if dev.startswith('opencl'):
ctx = get_context(context_name)
if ctx.kind == 'opencl':
op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code.
return
else:
elif ctx.kind == 'cuda':
op = GpuCAReduceCuda
else:
return False
x, = node.inputs
greduce = op(
......@@ -556,7 +622,7 @@ def local_gpua_careduce(node):
# We need to have the make node called, otherwise the mask can
# be None
if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
gvar.owner.op.supports_c_code([GpuFromHost(context_name)(x)])):
return greduce
else:
# Try to make a simpler pattern based on reshaping
......@@ -596,7 +662,7 @@ def local_gpua_careduce(node):
acc_dtype=getattr(node.op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = gpu_from_host(reshaped_x)
gpu_reshaped_x = GpuFromHost(context_name)(reshaped_x)
gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can
# be None
......@@ -615,13 +681,13 @@ def local_gpua_careduce(node):
@register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node):
def local_gpua_gemv(node, context_name):
return GpuGemv(inplace=node.op.inplace)
@register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node):
def local_gpua_gemm(node, context_name):
return GpuGemm(inplace=node.op.inplace)
......@@ -658,49 +724,49 @@ def local_gpuagemm_output_merge(node, *inputs):
@register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node):
def local_gpua_ger(node, context_name):
return GpuGer(destructive=node.op.destructive)
@register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node):
def local_gpua_dot22(node, context_name):
return gpu_dot22
@register_opt('fast_compile')
@op_lifter([tensor.basic.Eye])
def local_gpua_eye(node):
def local_gpua_eye(node, context_name):
return GpuEye(dtype=node.op.dtype)
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name):
return GpuCrossentropySoftmaxArgmax1HotWithBias()
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name):
return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node):
def local_gpua_softmax(node, context_name):
return GpuSoftmax()
@register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node):
def local_gpua_softmaxwithbias(node, context_name):
return GpuSoftmaxWithBias()
@register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert])
def local_assert(node):
def local_assert(node, context_name):
if (node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, HostFromGpu)):
return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0],
......@@ -708,21 +774,14 @@ def local_assert(node):
@register_opt('fast_compile')
@op_lifter([gpu_from_host, ConvOp])
def local_gpu_conv(node):
"""
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
@op_lifter([ConvOp])
def local_gpu_conv(node, context_name):
def GpuConvOp_from_ConvOp(op):
logical_img_hw = None
if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None
# print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw,
......@@ -735,13 +794,10 @@ def local_gpu_conv(node):
imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt
)
fft_opt=op.fft_opt)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
# this case is not implemented
# return None
rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] /
......@@ -752,7 +808,7 @@ def local_gpu_conv(node):
img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img)
img = gpu_from_host(img)
img = GpuFromHost(context_name)(img)
return ret(img, kern)
return make_graph
......@@ -779,15 +835,10 @@ def local_gpu_conv(node):
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
return
out = gpu_conv(gpu_from_host(img),
gpu_from_host(kern))
# in some case the ConvOp broadcast the last 2 dimensions
# differently then the gpu ConvOp
out = tensor.patternbroadcast(
host_from_gpu(out),
node.outputs[0].broadcastable)
out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern))
# op_lifter want the output on the GPU.
out = gpu_from_host(out)
out = GpuFromHost(context_name)(out)
out.values_eq_approx = values_eq_approx
return [out]
......@@ -818,9 +869,10 @@ def local_gpu_elemwise_careduce(node):
pre_scalar_op=scalar.basic.sqr)(inp)]
def tensor_to_gpu(x):
def tensor_to_gpu(x, context_name):
if isinstance(x.type, tensor.TensorType):
y = GpuArrayType(broadcastable=x.type.broadcastable,
context_name=context_name,
dtype=x.type.dtype)()
if x.name:
y.name = x.name + '[Gpua]'
......@@ -842,6 +894,7 @@ def gpu_safe_new(x, tag=''):
nw_name = x.name + tag
else:
nw_name = None
if isinstance(x, theano.Constant):
return x.clone()
......@@ -870,7 +923,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan])
def local_scan_to_gpua(node):
def local_scan_to_gpua(node, context_name):
info = copy.deepcopy(node.op.info)
if info.get('gpua', False):
return
......@@ -882,20 +935,20 @@ def local_scan_to_gpua(node):
node.op.n_mit_sot +
node.op.n_sit_sot +
node.op.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in node.inputs[1:e]]
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]]
b = e
e = e + node.op.n_nit_sot
nw_ins += node.inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in node.inputs[e:]]
scan_ins = [tensor_to_gpu(x) for x in node.op.inputs]
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]]
scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs]
# The inner output corresponding to the looping condition should not be
# moved to the gpu
if node.op.info['as_while']:
scan_outs = [safe_to_gpu(x) for x in node.op.outputs[:-1]]
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]]
else:
scan_outs = [safe_to_gpu(x) for x in node.op.outputs]
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs]
scan_outs = scan_utils.clone(
scan_outs,
replace=list(zip(node.op.inputs,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论