提交 c887bc14，作者：Arnaud Bergeron

Fix most of opt.py for type context.

Only scan is left.
上级 29615c83
...@@ -3,11 +3,6 @@ import numpy ...@@ -3,11 +3,6 @@ import numpy
import logging import logging
from six.moves import xrange from six.moves import xrange
try:
import pygpu
except ImportError:
pass
import theano import theano
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
...@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt ...@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant from .type import GpuArrayType, GpuArrayConstant, get_context
from .basic_ops import (as_gpuarray_variable, from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host, host_from_gpu, GpuToGpu,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, GpuSplit, GpuContiguous,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace) gpugemm_no_inplace)
...@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert', ...@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
'unsafe') 'unsafe')
def safe_to_gpu(x): def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return gpu_from_host(x) return GpuFromHost(ctx_name)(x)
else: else:
return x return x
...@@ -102,24 +97,49 @@ def op_lifter(OP, cuda_only=False): ...@@ -102,24 +97,49 @@ def op_lifter(OP, cuda_only=False):
""" """
def f(maker): def f(maker):
def local_opt(node): def local_opt(node):
dev = theano.sandbox.gpuarray.init_dev.device
if cuda_only and not dev.startswith('cuda'):
return
if type(node.op) in OP: if type(node.op) in OP:
# Either one of our inputs is on the gpu or # Either one of our inputs is on the gpu or
# all of our client are on the gpu # all of our clients are on the gpu
if (any([i.owner and i.owner.op == host_from_gpu replace = False
for i in node.inputs]) or # TODO: Maybe set context_name with infer_context_name()?
all([c != 'output' and c.op == gpu_from_host context_name = None
for c, idx in node.outputs[0].clients])): # We replace if any input is a host_from_gpu
new_op = maker(node) for i in node.inputs:
# This is needed as sometimes new_op inherit from OP. if i.owner and i.owner.op == host_from_gpu:
context_name = i.owner.inputs[0].type.context_name
replace = True
break
if not replace:
# We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients]
replace = len(clients) != 0
for c, idx in clients:
if (c == 'output' or
not isinstance(c.op, GpuFromHost)):
replace = False
# TODO: check that the clients want the same context?
if replace:
# All clients are GpuFromHost and we have at least one
context_name = clients[0][0].op.context_name
# Check if we should replace
if (not replace or
(cuda_only and
get_context(context_name).kind != 'cuda')):
return False
new_op = maker(node, context_name)
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
# tag the inputs with the context in case
# the context was derived from the outputs
def tag(i, ctx):
i.tag.context_name = ctx
return i
inputs = [tag(i, context_name) for i in node.inputs]
return [safe_to_cpu(o) for o in return [safe_to_cpu(o) for o in
new_op(*node.inputs, return_list=True)] new_op(*inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)): elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op] return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU else: # suppose it is a variable on the GPU
...@@ -146,35 +166,80 @@ class InputToGpuOptimizer(Optimizer): ...@@ -146,35 +166,80 @@ class InputToGpuOptimizer(Optimizer):
if (len(input.clients) == 1 and if (len(input.clients) == 1 and
(input.clients[0][0] == 'output' or (input.clients[0][0] == 'output' or
input.clients[0][0].op == gpu_from_host)): isinstance(input.clients[0][0].op, GpuFromHost))):
continue continue
try: try:
new_input = host_from_gpu(gpu_from_host(input)) ctx = getattr(input.tag, 'context_name', None)
new_input = host_from_gpu(GpuFromHost(ctx)(input))
fgraph.replace_validate(input, new_input, fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer") "InputToGpuOptimizer")
except TypeError: except TypeError:
# This could fail if the inputs are not TensorTypes # This could fail if the inputs are not TensorTypes
pass pass
except ValueError:
# If there is no context tag and no default context
# then it stays on the CPU
assert ctx is None
pass
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge') 0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([gpu_from_host, host_from_gpu]) @local_optimizer([GpuFromHost, GpuToGpu, host_from_gpu])
def local_cut_gpu_host_gpu(node): def local_cut_gpu_transfers(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu): # gpu[ab] -> host -> gpub
return [node.inputs[0].owner.inputs[0]] if (isinstance(node.op, GpuFromHost) and
if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host): node.inputs[0].owner and
return [node.inputs[0].owner.inputs[0]] node.inputs[0].owner.op == host_from_gpu):
return False other = node.inputs[0].owner.inputs[0]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu, if node.op.context_name == other.type.context_name:
return [other]
else:
return [GpuToGpu(node.op.context_name)(other)]
# ? -> gpua -> host
elif (node.op == host_from_gpu and
node.inputs[0].owner):
n2 = node.inputs[0].owner
# host ->
if isinstance(n2.op, GpuFromHost):
return [n2.inputs[0]]
# gpub ->
if isinstance(n2.op, GpuToGpu):
return [host_from_gpu(n2.inputs[0])]
# ? -> gpua -> gpub
elif isinstance(node.op, GpuToGpu):
# Transfer within same context
if node.inputs[0].type.context_name == node.op.context_name:
return [node.inputs[0]]
if node.inputs[0].owner:
n2 = node.inputs[0].owner
# host ->
if isinstance(n2.op, GpuFromHost):
return [GpuFromHost(node.op.context_name)(n2.inputs[0])]
# gpuc ->
if isinstance(n2.op, GpuToGpu):
if node.op.context_name == n2.inputs[0].type.context_name:
return [n2.inputs[0]]
else:
return [node.op(n2.inputs[0])]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'inplace', 'gpuarray') 'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers', gpu_cut_copies.register('cut_gpua_constant_transfers',
tensor.opt.constant_folding, tensor.opt.constant_folding,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
optdb['canonicalize'].register('local_cut_gpua_host_gpua', optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu, local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
...@@ -187,6 +252,11 @@ def local_gpuaalloc2(node): ...@@ -187,6 +252,11 @@ def local_gpuaalloc2(node):
Moves an alloc that is an input to join to the gpu. Moves an alloc that is an input to join to the gpu.
""" """
try:
get_context(None)
except ValueError:
# If there is no default context then we do not perform the move here.
return
if (isinstance(node.op, tensor.Alloc) and if (isinstance(node.op, tensor.Alloc) and
all(c != 'output' and all(c != 'output' and
c.op == tensor.join and c.op == tensor.join and
...@@ -194,23 +264,13 @@ def local_gpuaalloc2(node): ...@@ -194,23 +264,13 @@ def local_gpuaalloc2(node):
i.owner.op in [host_from_gpu, tensor.alloc] i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]) for i in c.inputs[1:])
for c, idx in node.outputs[0].clients)): for c, idx in node.outputs[0].clients)):
return [host_from_gpu(gpu_alloc(*node.inputs))] return [host_from_gpu(GpuAlloc(None)(*node.inputs))]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpuaalloc(node): def local_gpuaalloc(node, context_name):
new_out = gpu_alloc(*node.inputs) return GpuAlloc(context_name)(*node.inputs)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if new_out.broadcastable != node.outputs[0].broadcastable:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for b_old, b_new in zip(node.outputs[0].broadcastable,
new_out.broadcastable):
assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out,
node.outputs[0].broadcastable)
return (new_out,)
@register_opt() @register_opt()
...@@ -221,8 +281,8 @@ def local_gpualloc_memset_0(node): ...@@ -221,8 +281,8 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, GpuArrayConstant) and if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs) new_op = GpuAlloc(node.op.context_name, memset_0=True)
return [new_out] return [new_op(*node.inputs)]
@register_opt() @register_opt()
...@@ -240,7 +300,7 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -240,7 +300,7 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node): def local_gpureshape(node, context_name):
op = node.op op = node.op
name = op.name name = op.name
if name: if name:
...@@ -251,14 +311,14 @@ def local_gpureshape(node): ...@@ -251,14 +311,14 @@ def local_gpureshape(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node): def local_gpu_rebroadcast(node, context_name):
if isinstance(node.inputs[0].owner.op, HostFromGpu): if isinstance(node.inputs[0].owner.op, HostFromGpu):
return node.op(node.inputs[0].owner.inputs[0]) return node.op(node.inputs[0].owner.inputs[0])
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node): def local_gpuflatten(node, context_name):
op = node.op op = node.op
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
...@@ -271,7 +331,7 @@ def local_gpuflatten(node): ...@@ -271,7 +331,7 @@ def local_gpuflatten(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node): def local_gpu_elemwise(node, context_name):
op = node.op op = node.op
scal_op = op.scalar_op scal_op = op.scalar_op
name = op.name name = op.name
...@@ -344,28 +404,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -344,28 +404,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node): def local_gpua_dimshuffle(node, context_name):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node): def local_gpua_specifyShape(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
inp = [gpu_from_host(node.inputs[0])] + node.inputs[1:] inp = [GpuFromHost(context_name)(node.inputs[0])] + node.inputs[1:]
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape]) @op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node): def local_gpua_shape(node, context_name):
# op_lifter will call this opt too frequently as the output is # op_lifter will call this opt too frequently as the output is
# always on the CPU. # always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
return [gpu_from_host(node.inputs[0]).shape] return [GpuFromHost(context_name)(node.inputs[0]).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
...@@ -374,7 +434,7 @@ def gpu_print_wrapper(op, cnda): ...@@ -374,7 +434,7 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
def local_gpu_print_op(node): def local_gpu_print_op(node, context_name):
x, = node.inputs x, = node.inputs
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = node.op.__class__(global_fn=gpu_print_wrapper)
...@@ -404,9 +464,13 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -404,9 +464,13 @@ def local_gpu_pdbbreakpoint_op(node):
input_is_from_gpu = (inp.owner and input_is_from_gpu = (inp.owner and
isinstance(inp.owner.op, HostFromGpu)) isinstance(inp.owner.op, HostFromGpu))
output_goes_to_gpu = any([c[0] != "output" and for c in out.clients:
isinstance(c[0].op, GpuFromHost) if c == 'output':
for c in out.clients]) continue
if isinstance(c[0].op, GpuFromHost):
output_goes_to_gpu = True
context_name = c[0].op.context_name
break
if input_is_from_gpu: if input_is_from_gpu:
# The op should be applied on the GPU version of the input # The op should be applied on the GPU version of the input
...@@ -415,7 +479,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -415,7 +479,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu: elif output_goes_to_gpu:
# The input should be transfered to the gpu # The input should be transfered to the gpu
new_inputs.append(gpu_from_host(inp)) new_inputs.append(GpuFromHost(context_name)(inp))
input_transfered.append(True) input_transfered.append(True)
else: else:
...@@ -447,7 +511,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -447,7 +511,7 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node): def local_gpua_join(node, context_name):
return gpu_join return gpu_join
...@@ -462,13 +526,13 @@ def local_gpuajoin_1(node): ...@@ -462,13 +526,13 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
def local_gpua_split(node): def local_gpua_split(node, context_name):
return GpuSplit(node.op.len_splits) return GpuSplit(node.op.len_splits)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node): def local_gpua_subtensor(node, context_name):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
...@@ -482,14 +546,14 @@ def local_gpua_subtensor(node): ...@@ -482,14 +546,14 @@ def local_gpua_subtensor(node):
for n, _ in node.outputs[0].clients]): for n, _ in node.outputs[0].clients]):
return return
else: else:
return [host_from_gpu(gpu_from_host(node.outputs[0]))] return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))]
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node): def local_gpua_incsubtensor(node, context_name):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace, return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc, node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
...@@ -497,16 +561,16 @@ def local_gpua_incsubtensor(node): ...@@ -497,16 +561,16 @@ def local_gpua_incsubtensor(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node): def local_gpua_advanced_subtensor(node, context_name):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node): def local_gpua_advanced_incsubtensor(node, context_name):
# This optimization is disabled if cuda is not active # This is disabled on non-cuda contexts
if pygpu.get_default_context().kind != "cuda": if get_context(context_name).kind != 'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = node.inputs
...@@ -535,17 +599,19 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -535,17 +599,19 @@ def local_gpua_advanced_incsubtensor(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
def local_gpua_careduce(node): def local_gpua_careduce(node, context_name):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device ctx = get_context(context_name)
if dev.startswith('opencl'): if ctx.kind == 'opencl':
op = GpuCAReduceCPY op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]: if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code. # We don't support yet all reduction with cpy code.
return return
else: elif ctx.kind == 'cuda':
op = GpuCAReduceCuda op = GpuCAReduceCuda
else:
return False
x, = node.inputs x, = node.inputs
greduce = op( greduce = op(
...@@ -556,7 +622,7 @@ def local_gpua_careduce(node): ...@@ -556,7 +622,7 @@ def local_gpua_careduce(node):
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])): gvar.owner.op.supports_c_code([GpuFromHost(context_name)(x)])):
return greduce return greduce
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
...@@ -596,7 +662,7 @@ def local_gpua_careduce(node): ...@@ -596,7 +662,7 @@ def local_gpua_careduce(node):
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = gpu_from_host(reshaped_x) gpu_reshaped_x = GpuFromHost(context_name)(reshaped_x)
gvar = greduce(gpu_reshaped_x) gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
...@@ -615,13 +681,13 @@ def local_gpua_careduce(node): ...@@ -615,13 +681,13 @@ def local_gpua_careduce(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node): def local_gpua_gemv(node, context_name):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node): def local_gpua_gemm(node, context_name):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
...@@ -658,49 +724,49 @@ def local_gpuagemm_output_merge(node, *inputs): ...@@ -658,49 +724,49 @@ def local_gpuagemm_output_merge(node, *inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node): def local_gpua_ger(node, context_name):
return GpuGer(destructive=node.op.destructive) return GpuGer(destructive=node.op.destructive)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node): def local_gpua_dot22(node, context_name):
return gpu_dot22 return gpu_dot22
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
def local_gpua_eye(node): def local_gpua_eye(node, context_name):
return GpuEye(dtype=node.op.dtype) return GpuEye(dtype=node.op.dtype)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node): def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name):
return GpuCrossentropySoftmaxArgmax1HotWithBias() return GpuCrossentropySoftmaxArgmax1HotWithBias()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node): def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name):
return GpuCrossentropySoftmax1HotWithBiasDx() return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True) @op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node): def local_gpua_softmax(node, context_name):
return GpuSoftmax() return GpuSoftmax()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True) @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node): def local_gpua_softmaxwithbias(node, context_name):
return GpuSoftmaxWithBias() return GpuSoftmaxWithBias()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert]) @op_lifter([theano.tensor.opt.Assert])
def local_assert(node): def local_assert(node, context_name):
if (node.inputs[0].owner and if (node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, HostFromGpu)): isinstance(node.inputs[0].owner.op, HostFromGpu)):
return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0], return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0],
...@@ -708,21 +774,14 @@ def local_assert(node): ...@@ -708,21 +774,14 @@ def local_assert(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([gpu_from_host, ConvOp]) @op_lifter([ConvOp])
def local_gpu_conv(node): def local_gpu_conv(node, context_name):
"""
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
def GpuConvOp_from_ConvOp(op): def GpuConvOp_from_ConvOp(op):
logical_img_hw = None logical_img_hw = None
if op.kshp_logical is not None and op.kshp_logical != op.kshp: if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None return None
# print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode, ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy), subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw, logical_img_hw=logical_img_hw,
...@@ -735,13 +794,10 @@ def local_gpu_conv(node): ...@@ -735,13 +794,10 @@ def local_gpu_conv(node):
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern, nkern=op.nkern,
bsize=op.bsize, bsize=op.bsize,
fft_opt=op.fft_opt fft_opt=op.fft_opt)
)
if op.imshp_logical is not None: if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3] logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]: if logical_img_hw != op.imshp[1:3]:
# this case is not implemented
# return None
rstride = int(numpy.ceil(op.imshp_logical[1] / rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1]))) float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] / cstride = int(numpy.ceil(op.imshp_logical[2] /
...@@ -752,7 +808,7 @@ def local_gpu_conv(node): ...@@ -752,7 +808,7 @@ def local_gpu_conv(node):
img.shape[0], *op.imshp_logical) img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride], img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img) img)
img = gpu_from_host(img) img = GpuFromHost(context_name)(img)
return ret(img, kern) return ret(img, kern)
return make_graph return make_graph
...@@ -779,15 +835,10 @@ def local_gpu_conv(node): ...@@ -779,15 +835,10 @@ def local_gpu_conv(node):
gpu_conv = GpuConvOp_from_ConvOp(node.op) gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None: if gpu_conv is None:
return return
out = gpu_conv(gpu_from_host(img), out = gpu_conv(GpuFromHost(context_name)(img),
gpu_from_host(kern)) GpuFromHost(context_name)(kern))
# in some case the ConvOp broadcast the last 2 dimensions
# differently then the gpu ConvOp
out = tensor.patternbroadcast(
host_from_gpu(out),
node.outputs[0].broadcastable)
# op_lifter want the output on the GPU. # op_lifter want the output on the GPU.
out = gpu_from_host(out) out = GpuFromHost(context_name)(out)
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
...@@ -818,9 +869,10 @@ def local_gpu_elemwise_careduce(node): ...@@ -818,9 +869,10 @@ def local_gpu_elemwise_careduce(node):
pre_scalar_op=scalar.basic.sqr)(inp)] pre_scalar_op=scalar.basic.sqr)(inp)]
def tensor_to_gpu(x): def tensor_to_gpu(x, context_name):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
y = GpuArrayType(broadcastable=x.type.broadcastable, y = GpuArrayType(broadcastable=x.type.broadcastable,
context_name=context_name,
dtype=x.type.dtype)() dtype=x.type.dtype)()
if x.name: if x.name:
y.name = x.name + '[Gpua]' y.name = x.name + '[Gpua]'
...@@ -842,6 +894,7 @@ def gpu_safe_new(x, tag=''): ...@@ -842,6 +894,7 @@ def gpu_safe_new(x, tag=''):
nw_name = x.name + tag nw_name = x.name + tag
else: else:
nw_name = None nw_name = None
if isinstance(x, theano.Constant): if isinstance(x, theano.Constant):
return x.clone() return x.clone()
...@@ -870,7 +923,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -870,7 +923,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
def local_scan_to_gpua(node): def local_scan_to_gpua(node, context_name):
info = copy.deepcopy(node.op.info) info = copy.deepcopy(node.op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
...@@ -882,20 +935,20 @@ def local_scan_to_gpua(node): ...@@ -882,20 +935,20 @@ def local_scan_to_gpua(node):
node.op.n_mit_sot + node.op.n_mit_sot +
node.op.n_sit_sot + node.op.n_sit_sot +
node.op.n_shared_outs) node.op.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in node.inputs[1:e]] nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]]
b = e b = e
e = e + node.op.n_nit_sot e = e + node.op.n_nit_sot
nw_ins += node.inputs[b:e] nw_ins += node.inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in node.inputs[e:]] nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]]
scan_ins = [tensor_to_gpu(x) for x in node.op.inputs] scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs]
# The inner output corresponding to the looping condition should not be # The inner output corresponding to the looping condition should not be
# moved to the gpu # moved to the gpu
if node.op.info['as_while']: if node.op.info['as_while']:
scan_outs = [safe_to_gpu(x) for x in node.op.outputs[:-1]] scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]] scan_outs += [node.op.outputs[-1]]
else: else:
scan_outs = [safe_to_gpu(x) for x in node.op.outputs] scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs, scan_outs,
replace=list(zip(node.op.inputs, replace=list(zip(node.op.inputs,
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论