提交 3f9d0601 authored 作者: sentient07's avatar sentient07

Changed op_lifter to accept Op and Inputs

上级 e45b6cd6
...@@ -1498,35 +1498,35 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1498,35 +1498,35 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([Pool]) @op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name): def local_pool_dnn_alternative(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
img, = node.inputs img, = inputs
img = as_gpuarray_variable(img, ctx_name) img = as_gpuarray_variable(img, ctx_name)
ds = node.op.ds ds = op.ds
stride = node.op.st stride = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode) return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode)
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node, ctx_name): def local_pool_dnn_grad_stride(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, out_grad = node.inputs inp, out, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name) out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
...@@ -1538,18 +1538,18 @@ def local_pool_dnn_grad_stride(node, ctx_name): ...@@ -1538,18 +1538,18 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node, ctx_name): def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
inp, out_grad = node.inputs inp, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
cg = gpu_contiguous(out_grad) cg = gpu_contiguous(out_grad)
...@@ -1591,9 +1591,9 @@ def local_log_softmax_dnn(node): ...@@ -1591,9 +1591,9 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name): def local_logsoftmax_to_dnn(op, ctx_name, inputs):
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0] inp = inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
return return
if not dnn_available(ctx_name) or version(raises=False) < 3000: if not dnn_available(ctx_name) or version(raises=False) < 3000:
...@@ -1629,11 +1629,11 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1629,11 +1629,11 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node, ctx_name): def local_softmax_dnn_grad(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad") raise_no_cudnn("cuDNN needed for SoftmaxGrad")
ins = [] ins = []
for n in node.inputs: for n in inputs:
n = as_gpuarray_variable(n, ctx_name) n = as_gpuarray_variable(n, ctx_name)
if n.ndim != 2: if n.ndim != 2:
return return
......
...@@ -452,10 +452,10 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -452,10 +452,10 @@ class GpuCumsum(GpuKernelBase, Op):
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
def use_gpu_cumsumop(node, ctx_name): def use_gpu_cumsumop(op, ctx_name, inputs):
if node.inputs[0].dtype == 'float32': if inputs[0].dtype == 'float32':
axis = node.op.axis axis = op.axis
x = node.inputs[0] x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS: if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None return None
......
...@@ -229,21 +229,21 @@ KERNEL void k_multi_warp_multinomial( ...@@ -229,21 +229,21 @@ KERNEL void k_multi_warp_multinomial(
@register_opt() @register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform]) @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name): def local_gpua_multinomial(op, context_name, inputs):
# TODO : need description for function # TODO : need description for function
if len(node.inputs) == 2: if len(inputs) == 2:
p, u = node.inputs p, u = inputs
n_samples = 1 n_samples = 1
else: else:
p, u, n_samples = node.inputs p, u, n_samples = inputs
try: try:
if get_scalar_constant_value(n_samples) != 1: if get_scalar_constant_value(n_samples) != 1:
return None return None
except NotScalarConstantError: except NotScalarConstantError:
return None return None
m, = node.outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(node.op.odtype) gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])( return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u)) gpu_op(p, u))
...@@ -469,8 +469,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -469,8 +469,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
def use_gpu_images2neibs(node, context_name): def use_gpu_images2neibs(op, context_name, inputs):
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']: if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(node.op.mode) return GpuImages2Neibs(op.mode)
register_gpu_opt()(use_gpu_images2neibs) register_gpu_opt()(use_gpu_images2neibs)
...@@ -149,14 +149,14 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -149,14 +149,14 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
@opt.register_opt() @opt.register_opt()
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node, ctx_name): def local_dot_to_gemm16(op, ctx_name, inputs):
if nerv is None: if nerv is None:
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)( C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -161,9 +161,18 @@ def op_lifter(OP, cuda_only=False): ...@@ -161,9 +161,18 @@ def op_lifter(OP, cuda_only=False):
context_name = i.owner.inputs[0].type.context_name context_name = i.owner.inputs[0].type.context_name
replace = True replace = True
break break
clients = [c for o in node.outputs for c in o.clients]
# list of list containing clients
# it is clients per node basis
out_clients = []
for o in node.outputs:
if o.clients:
out_clients.append(o.clients)
else:
out_clients.append([])
if not replace: if not replace:
# We replace if *all* clients are on the GPU # We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients]
replace = len(clients) != 0 replace = len(clients) != 0
for c, idx in clients: for c, idx in clients:
if (c == 'output' or if (c == 'output' or
...@@ -184,10 +193,11 @@ def op_lifter(OP, cuda_only=False): ...@@ -184,10 +193,11 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs # the context was derived from the outputs
for i in node.inputs: for i in node.inputs:
i.tag.context_name = context_name i.tag.context_name = context_name
try: try:
new_op = maker(node, context_name, node.inputs) new_op = maker(node.op, context_name, node.inputs)
except TypeError: except TypeError:
new_op = maker(node, context_name) new_op = maker(node.op, context_name, node.inputs, out_clients)
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
...@@ -278,32 +288,45 @@ class GraphToGPU(Optimizer): ...@@ -278,32 +288,45 @@ class GraphToGPU(Optimizer):
continue continue
# Move only if any of the inputs are on the GPU. # Move only if any of the inputs are on the GPU.
move_to_GPU = False move_to_GPU = True
'''
if any([isinstance(i, GpuArrayVariable) or if any([isinstance(i, GpuArrayVariable) or
isinstance(i, GpuArraySharedVariable) isinstance(i, GpuArraySharedVariable)
for i in [mapping[v] for v in node.inputs] + for i in [mapping[v] for v in node.inputs] +
node.outputs]): node.outputs]):
move_to_GPU = True move_to_GPU = True
'''
out_clients = []
for o in node.outputs:
if o.clients:
out_clients.append(o.clients)
else:
out_clients.append([])
context_name = None
for i in [mapping[i] for i in node.inputs]:
if isinstance(i.type, GpuArrayType):
context_name = i.type.context_name
break
new_ops = None new_ops = None
outputs = []
# Apply the lifter # Apply the lifter
for lopt in (self.local_optimizers_all + for lopt in (self.local_optimizers_all +
self.local_optimizers_map.get(type(node.op), []) + self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_map.get(node.op, [])): self.local_optimizers_map.get(node.op, [])):
replace = False
for i in [mapping[i] for i in node.inputs]: if move_to_GPU:
if isinstance(i.type, GpuArrayType):
context_name = i.type.context_name
replace = True
break
if replace:
try: try:
new_ops = lopt.transform( new_ops = lopt.transform(
node, context_name, node.op, context_name,
[mapping[i] for i in node.inputs]) [mapping[i] for i in node.inputs])
except TypeError: except TypeError:
new_ops = lopt.transform(node, context_name) new_ops = lopt.transform(node.op, context_name,
[mapping[i] for i in node.inputs],
out_clients)
if new_ops: if new_ops:
break break
if not new_ops: if not new_ops:
...@@ -422,18 +445,18 @@ def local_gpuaalloc2(node): ...@@ -422,18 +445,18 @@ def local_gpuaalloc2(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
@register_opt2([tensor.Alloc], 'fast_compile') @register_opt2([tensor.Alloc], 'fast_compile')
def local_gpuaalloc(node, context_name): def local_gpuaalloc(op, context_name, inputs):
return GpuAlloc(context_name)(*node.inputs) return GpuAlloc(context_name)(*inputs)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty]) @op_lifter([tensor.AllocEmpty])
@register_opt2([tensor.AllocEmpty], 'fast_compile') @register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpuaallocempty(node, context_name): def local_gpuaallocempty(op, context_name, inputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
return GpuAllocEmpty(context_name=context_name, return GpuAllocEmpty(context_name=context_name,
**node.op._props_dict())(*node.inputs) **op._props_dict())(*inputs)
@register_opt() @register_opt()
...@@ -479,15 +502,14 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -479,15 +502,14 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous]) @op_lifter([tensor.extra_ops.CpuContiguous])
@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile') @register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpu_contiguous(node, context_name): def local_gpu_contiguous(op, context_name, inputs):
return gpu_contiguous return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
@register_opt2([tensor.Reshape], 'fast_compile') @register_opt2([tensor.Reshape], 'fast_compile')
def local_gpureshape(node, context_name): def local_gpureshape(op, context_name, inputs):
op = node.op
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
...@@ -498,15 +520,14 @@ def local_gpureshape(node, context_name): ...@@ -498,15 +520,14 @@ def local_gpureshape(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
@register_opt2([tensor.Rebroadcast], 'fast_compile') @register_opt2([tensor.Rebroadcast], 'fast_compile')
def local_gpu_rebroadcast(node, context_name, inputs): def local_gpu_rebroadcast(op, context_name, inputs):
return node.op(as_gpuarray_variable(inputs[0], context_name)) return op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
@register_opt2([tensor.Flatten], 'fast_compile') @register_opt2([tensor.Flatten], 'fast_compile')
def local_gpuflatten(node, context_name, inputs): def local_gpuflatten(op, context_name, inputs):
op = node.op
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
shp = [inputs[0].shape[i] for i in range(op.outdim - 1)] shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
...@@ -519,10 +540,10 @@ def local_gpuflatten(node, context_name, inputs): ...@@ -519,10 +540,10 @@ def local_gpuflatten(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
@register_opt2([tensor.Elemwise], 'fast_compile') @register_opt2([tensor.Elemwise], 'fast_compile')
def local_gpu_elemwise(node, context_name, inputs): def local_gpu_elemwise(op, context_name, inputs):
op = node.op
scal_op = op.scalar_op scal_op = op.scalar_op
name = op.name name = op.name
node = op.make_node(*inputs)
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
if len(node.outputs) > 1: if len(node.outputs) > 1:
...@@ -593,15 +614,15 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -593,15 +614,15 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
@register_opt2([tensor.DimShuffle], 'fast_compile') @register_opt2([tensor.DimShuffle], 'fast_compile')
def local_gpua_dimshuffle(node, context_name): def local_gpua_dimshuffle(op, context_name, inputs):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(op.input_broadcastable,
node.op.new_order) op.new_order)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
@register_opt2([tensor.SpecifyShape], 'fast_compile') @register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape(node, context_name, inputs): def local_gpua_specifyShape(op, context_name, inputs):
if isinstance(inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
inp = [as_gpuarray_variable(inputs[0], context_name)] inp = [as_gpuarray_variable(inputs[0], context_name)]
...@@ -627,11 +648,11 @@ def gpu_print_wrapper(op, cnda): ...@@ -627,11 +648,11 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
@register_opt2([tensor.printing.Print], 'fast_compile') @register_opt2([tensor.printing.Print], 'fast_compile')
def local_gpu_print_op(node, context_name, inputs): def local_gpu_print_op(op, context_name, inputs):
x, = inputs x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name) gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op new_op.old_op = op
return new_op(gpu_x) return new_op(gpu_x)
...@@ -706,8 +727,8 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -706,8 +727,8 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([IfElse]) @op_lifter([IfElse])
@register_opt2([IfElse], 'fast_compile') @register_opt2([IfElse], 'fast_compile')
def local_gpua_lazy_ifelse(node, context_name, inputs): def local_gpua_lazy_ifelse(op, context_name, inputs):
if node.op.gpu: if op.gpu:
return return
c = inputs[0] c = inputs[0]
inps = [] inps = []
...@@ -716,13 +737,13 @@ def local_gpua_lazy_ifelse(node, context_name, inputs): ...@@ -716,13 +737,13 @@ def local_gpua_lazy_ifelse(node, context_name, inputs):
inps.append(as_gpuarray_variable(v, context_name)) inps.append(as_gpuarray_variable(v, context_name))
else: else:
inps.append(v) inps.append(v)
return IfElse(node.op.n_outs, gpu=True)(c, *inps, return_list=True) return IfElse(op.n_outs, gpu=True)(c, *inps, return_list=True)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
@register_opt2([tensor.Join], 'fast_compile') @register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(node, context_name): def local_gpua_join(op, context_name, inputs):
return gpu_join return gpu_join
...@@ -738,16 +759,17 @@ def local_gpuajoin_1(node): ...@@ -738,16 +759,17 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
@register_opt2([tensor.Split], 'fast_compile') @register_opt2([tensor.Split], 'fast_compile')
def local_gpua_split(node, context_name): def local_gpua_split(op, context_name, inputs):
#TODO use props #TODO use props
return GpuSplit(node.op.len_splits) return GpuSplit(op.len_splits)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
@register_opt2([tensor.Subtensor], 'fast_compile') @register_opt2([tensor.Subtensor], 'fast_compile')
def local_gpua_subtensor(node, context_name): def local_gpua_subtensor(op, context_name, inputs, clients):
x = node.inputs[0] x = inputs[0]
node = op.make_node(*inputs)
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and
...@@ -757,23 +779,23 @@ def local_gpua_subtensor(node, context_name): ...@@ -757,23 +779,23 @@ def local_gpua_subtensor(node, context_name):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType) if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs]) for v in n.inputs + n.outputs])
for n, _ in node.outputs[0].clients]): for n, _ in clients[0]]):
return return
else: else:
return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))] return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))]
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(op.idx_list)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
@register_opt2([tensor.IncSubtensor], 'fast_compile') @register_opt2([tensor.IncSubtensor], 'fast_compile')
def local_gpua_incsubtensor(node, context_name, inputs): def local_gpua_incsubtensor(op, context_name, inputs):
op = GpuIncSubtensor(node.op.idx_list, node.op.inplace, op = GpuIncSubtensor(op.idx_list, op.inplace,
node.op.set_instead_of_inc, op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased) op.destroyhandler_tolerate_aliased)
ret = op(*inputs) ret = op(*inputs)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True) val = getattr(op.make_node(*inputs).outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val ret.tag.nan_guard_mode_check = val
return ret return ret
...@@ -781,20 +803,20 @@ def local_gpua_incsubtensor(node, context_name, inputs): ...@@ -781,20 +803,20 @@ def local_gpua_incsubtensor(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(node, context_name): def local_gpua_advanced_subtensor(op, context_name, inputs):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(node, context_name): def local_gpua_advanced_incsubtensor(op, context_name, inputs):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if context.kind != b'cuda': if context.kind != b'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = inputs
# Gpu Ops needs both inputs to have the same dtype # Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype): if (x.type.dtype != y.type.dtype):
...@@ -804,7 +826,7 @@ def local_gpua_advanced_incsubtensor(node, context_name): ...@@ -804,7 +826,7 @@ def local_gpua_advanced_incsubtensor(node, context_name):
if y.type.dtype != dtype: if y.type.dtype != dtype:
y = tensor.cast(y, dtype) y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = op.set_instead_of_inc
compute_capability = int(context.bin_id[-2]) compute_capability = int(context.bin_id[-2])
...@@ -828,29 +850,31 @@ def local_advincsub1_gpua_inplace(node): ...@@ -828,29 +850,31 @@ def local_advincsub1_gpua_inplace(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
@register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile') @register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
def local_gpua_careduce(node, context_name): def local_gpua_careduce(op, context_name, inputs):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
node = op.make_node(*inputs)
ctx = get_context(context_name) ctx = get_context(context_name)
if ctx.kind == b'opencl': if ctx.kind == b'opencl':
op = GpuCAReduceCPY op2 = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]: if op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code. # We don't support yet all reduction with cpy code.
return return
elif ctx.kind == b'cuda': elif ctx.kind == b'cuda':
op = GpuCAReduceCuda op2 = GpuCAReduceCuda
else: else:
return False return False
x, = node.inputs x, = inputs
greduce = op( greduce = op2(
node.op.scalar_op, axis=node.op.axis, op.scalar_op, axis=op.axis,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op2 is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([ gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])): as_gpuarray_variable(x, context_name)])):
return greduce return greduce
...@@ -861,11 +885,11 @@ def local_gpua_careduce(node, context_name): ...@@ -861,11 +885,11 @@ def local_gpua_careduce(node, context_name):
# to make them a single dimension, do the reduction, and # to make them a single dimension, do the reduction, and
# then reshape to get them back. # then reshape to get them back.
if node.op.axis is None: if op.axis is None:
reduce_mask = [1] * x.type.ndim reduce_mask = [1] * x.type.ndim
else: else:
reduce_mask = [0] * x.type.ndim reduce_mask = [0] * x.type.ndim
for a in node.op.axis: for a in op.axis:
assert reduce_mask[a] == 0 assert reduce_mask[a] == 0
reduce_mask[a] = 1 reduce_mask[a] = 1
...@@ -881,11 +905,11 @@ def local_gpua_careduce(node, context_name): ...@@ -881,11 +905,11 @@ def local_gpua_careduce(node, context_name):
for idx, m in enumerate(new_mask): for idx, m in enumerate(new_mask):
if m == 1: if m == 1:
new_axis.append(idx) new_axis.append(idx)
greduce = op( greduce = op2(
node.op.scalar_op, op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
...@@ -912,8 +936,8 @@ def local_gpua_careduce(node, context_name): ...@@ -912,8 +936,8 @@ def local_gpua_careduce(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile') @register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(node, context_name): def local_gpua_gemv(op, context_name, inputs):
if node.op.inplace: if op.inplace:
return gpugemv_inplace return gpugemv_inplace
else: else:
return gpugemv_no_inplace return gpugemv_no_inplace
...@@ -922,8 +946,8 @@ def local_gpua_gemv(node, context_name): ...@@ -922,8 +946,8 @@ def local_gpua_gemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
@register_opt2([tensor.blas.Gemm], 'fast_compile') @register_opt2([tensor.blas.Gemm], 'fast_compile')
def local_gpua_gemm(node, context_name): def local_gpua_gemm(op, context_name, inputs):
if node.op.inplace: if op.inplace:
return gpugemm_inplace return gpugemm_inplace
else: else:
return gpugemm_no_inplace return gpugemm_no_inplace
...@@ -932,7 +956,7 @@ def local_gpua_gemm(node, context_name): ...@@ -932,7 +956,7 @@ def local_gpua_gemm(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot]) @op_lifter([tensor.blas.BatchedDot])
@register_opt2([tensor.blas.BatchedDot], 'fast_compile') @register_opt2([tensor.blas.BatchedDot], 'fast_compile')
def local_gpua_gemmbatch(node, context_name, inputs): def local_gpua_gemmbatch(op, context_name, inputs):
a, b = inputs a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2]) c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
...@@ -941,7 +965,7 @@ def local_gpua_gemmbatch(node, context_name, inputs): ...@@ -941,7 +965,7 @@ def local_gpua_gemmbatch(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Dot]) @op_lifter([tensor.basic.Dot])
@register_opt2([tensor.basic.Dot], 'fast_compile') @register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(node, context_name, inputs): def local_gpua_hgemm(op, context_name, inputs):
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since " _logger.warning("Not performing dot of float16 on the GPU since "
...@@ -985,20 +1009,20 @@ def local_gpuagemmbatch_output_merge(node, *inputs): ...@@ -985,20 +1009,20 @@ def local_gpuagemmbatch_output_merge(node, *inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node, context_name): def local_gpua_ger(op, context_name, inputs):
return GpuGer(inplace=node.op.destructive) return GpuGer(inplace=op.destructive)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node, context_name): def local_gpua_dot22(op, context_name, inputs):
return gpu_dot22 return gpu_dot22
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22Scalar]) @op_lifter([tensor.blas.Dot22Scalar])
@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile') @register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
def local_gpua_dot22scalar(node, context_name, inputs): def local_gpua_dot22scalar(op, context_name, inputs):
x, y, a = inputs x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
...@@ -1009,28 +1033,28 @@ def local_gpua_dot22scalar(node, context_name, inputs): ...@@ -1009,28 +1033,28 @@ def local_gpua_dot22scalar(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
@register_opt2([tensor.basic.Eye], 'fast_compile') @register_opt2([tensor.basic.Eye], 'fast_compile')
def local_gpua_eye(node, context_name): def local_gpua_eye(op, context_name, inputs):
return GpuEye(dtype=node.op.dtype, context_name=context_name) return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile') @register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name): def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs):
return gpu_crossentropy_softmax_argmax_1hot_with_bias return gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx],
               'fast_compile')
def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs):
    """Replace CrossentropySoftmax1HotWithBiasDx with its GPU op.

    The GPU op instance handles all cases, so it is returned directly.
    """
    return gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True)
@register_opt2([tensor.nnet.Softmax], 'fast_compile')
def local_gpua_softmax(op, context_name, inputs):
    """Replace ``tensor.nnet.Softmax`` with the GPU softmax op.

    The GPU op instance handles all cases, so it is returned directly.
    """
    return gpu_softmax
...@@ -1043,17 +1067,20 @@ def local_gpua_softmaxwithbias(node, context_name): ...@@ -1043,17 +1067,20 @@ def local_gpua_softmaxwithbias(node, context_name):
@register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert])
@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
def local_assert(op, context_name, inputs):
    """Move an ``Assert`` to the GPU when its value input can be lifted.

    Bug fix: the body still referenced ``node.inputs[0]`` even though the
    op_lifter signature was changed to ``(op, context_name, inputs)`` —
    ``node`` is no longer in scope and would raise a NameError when the
    optimization fires.  Use ``inputs[0]`` instead.
    """
    # Nothing to do if the asserted value already lives on the GPU.
    if isinstance(inputs[0].type, GpuArrayType):
        return
    # Lift only the value input; the condition inputs (inputs[1:]) stay
    # on the host, and the result is transferred back so callers see the
    # same host-side variable as before.
    return [host_from_gpu(op(as_gpuarray_variable(inputs[0], context_name),
                             *inputs[1:]))]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
def local_error_convop(node, context_name): def local_error_convop(op, context_name, inputs):
assert False, """ assert False, """
ConvOp does not work with the gpuarray backend. ConvOp does not work with the gpuarray backend.
...@@ -1065,8 +1092,8 @@ theano.tensor.nnet.conv2d() ...@@ -1065,8 +1092,8 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile')
def local_lift_sparseblockgemv(op, context_name, inputs):
    """Swap ``SparseBlockGemv`` for its GPU implementation.

    Preserves the op's inplace flag by selecting the matching GPU
    instance.
    """
    if op.inplace:
        return gpu_sparse_block_gemv_inplace
    return gpu_sparse_block_gemv
...@@ -1075,8 +1102,8 @@ def local_lift_sparseblockgemv(node, context_name): ...@@ -1075,8 +1102,8 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile')
def local_lift_sparseblockouter(op, context_name, inputs):
    """Swap ``SparseBlockOuter`` for its GPU implementation.

    Preserves the op's inplace flag by selecting the matching GPU
    instance.
    """
    if op.inplace:
        return gpu_sparse_block_outer_inplace
    return gpu_sparse_block_outer
...@@ -1104,8 +1131,8 @@ def local_inplace_sparseblockouter(node): ...@@ -1104,8 +1131,8 @@ def local_inplace_sparseblockouter(node):
@register_opt2([AbstractConv2d, @register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile') AbstractConv2d_gradInputs], 'fast_compile')
def local_lift_abstractconv2d(node, context_name, inputs): def local_lift_abstractconv2d(op, context_name, inputs):
if isinstance(node.outputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU. # Don't handle this node here, it's already on the GPU.
return return
inps = list(inputs) inps = list(inputs)
...@@ -1113,7 +1140,7 @@ def local_lift_abstractconv2d(node, context_name, inputs): ...@@ -1113,7 +1140,7 @@ def local_lift_abstractconv2d(node, context_name, inputs):
context_name=context_name) context_name=context_name)
inps[1] = as_gpuarray_variable(inputs[1], inps[1] = as_gpuarray_variable(inputs[1],
context_name=context_name) context_name=context_name)
return [node.op(*inps)] return [op(*inps)]
# Register conv_groupopt here (rather than at its definition site) so that
# it runs after the abstract-conv lifting optimizations above.
register_opt('fast_compile')(conv_groupopt)
...@@ -1220,36 +1247,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -1220,36 +1247,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
#@register_opt2([scan_op.Scan], 'fast_compile') @register_opt2([scan_op.Scan], 'fast_compile')
def local_scan_to_gpua(node, context_name): def local_scan_to_gpua(op, context_name, inputs):
info = copy.deepcopy(node.op.info) info = copy.deepcopy(op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
info['gpua'] = True info['gpua'] = True
nw_ins = [node.inputs[0]] nw_ins = [inputs[0]]
e = (1 + e = (1 +
node.op.n_seqs + op.n_seqs +
node.op.n_mit_mot + op.n_mit_mot +
node.op.n_mit_sot + op.n_mit_sot +
node.op.n_sit_sot + op.n_sit_sot +
node.op.n_shared_outs) op.n_shared_outs)
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[1:e]]
b = e b = e
e = e + node.op.n_nit_sot e = e + op.n_nit_sot
nw_ins += node.inputs[b:e] nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[e:]]
scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs] scan_ins = [tensor_to_gpu(x, context_name) for x in op.inputs]
# The inner output corresponding to the looping condition should not be # The inner output corresponding to the looping condition should not be
# moved to the gpu # moved to the gpu
if node.op.info['as_while']: if op.info['as_while']:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]] scan_outs += [op.outputs[-1]]
else: else:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs, scan_outs,
replace=list(zip(node.op.inputs, replace=list(zip(op.inputs,
(safe_to_cpu(x) for x in scan_ins)))) (safe_to_cpu(x) for x in scan_ins))))
# We need to construct the hash here, because scan # We need to construct the hash here, because scan
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论