提交 ebcf5615 authored 作者: sentient07's avatar sentient07

Standardized optimizer names and return types of GpuAlloc and GpuEye

上级 aa853330
...@@ -54,7 +54,7 @@ def _atexit_print_fn(): ...@@ -54,7 +54,7 @@ def _atexit_print_fn():
destination_file = open(config.profiling.destination, 'w') destination_file = open(config.profiling.destination, 'w')
for ps in _atexit_print_list: for ps in _atexit_print_list:
if ps.fct_callcount > 1 or ps.compile_time > 1: if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file, ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops, n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply) n_apply_to_print=config.profiling.n_apply)
......
...@@ -1550,7 +1550,7 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1550,7 +1550,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([Pool]) @op_lifter([Pool])
@register_opt2([Pool], 'fast_compile', 'cudnn') @register_opt2([Pool], 'fast_compile', 'cudnn')
def local_pool_dnn_alternative(op, ctx_name, inputs, outputs): def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not op.ignore_border: if not op.ignore_border:
...@@ -1567,7 +1567,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs, outputs): ...@@ -1567,7 +1567,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
@register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn') @register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not op.ignore_border: if not op.ignore_border:
...@@ -1592,7 +1592,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): ...@@ -1592,7 +1592,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
@register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn') @register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not op.ignore_border: if not op.ignore_border:
...@@ -1646,7 +1646,7 @@ def local_log_softmax_dnn(node): ...@@ -1646,7 +1646,7 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], 'fast_compile', 'cudnn') @register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
def local_logsoftmax_to_dnn(op, ctx_name, inputs, outputs): def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = inputs[0] inp = inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
...@@ -1685,7 +1685,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1685,7 +1685,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile') @register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_softmax_dnn_grad(op, ctx_name, inputs, outputs): def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad") raise_no_cudnn("cuDNN needed for SoftmaxGrad")
ins = [] ins = []
......
...@@ -456,7 +456,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -456,7 +456,7 @@ class GpuCumsum(GpuKernelBase, Op):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
@register_opt2([CumsumOp], 'fast_compile') @register_opt2([CumsumOp], 'fast_compile')
def use_gpu_cumsumop(op, ctx_name, inputs, outputs): def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
if inputs[0].dtype == 'float32': if inputs[0].dtype == 'float32':
axis = op.axis axis = op.axis
x = inputs[0] x = inputs[0]
......
...@@ -471,6 +471,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -471,6 +471,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
@register_opt2([Images2Neibs], 'fast_compile') @register_opt2([Images2Neibs], 'fast_compile')
def use_gpu_images2neibs(op, context_name, inputs): def local_gpua_images2neibs(op, context_name, inputs):
if op.mode in ['valid', 'ignore_borders', 'wrap_centered']: if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(op.mode) return GpuImages2Neibs(op.mode)
...@@ -150,14 +150,14 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -150,14 +150,14 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
@opt.register_opt('fast_compile') @opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
@opt.register_opt2([tensor.Dot], 'fast_compile') @opt.register_opt2([tensor.Dot], 'fast_compile')
def local_dot_to_gemm16(op, ctx_name, inputs, outputs): def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
if nerv is None: if nerv is None:
return return
A = inputs[0] A = inputs[0]
B = inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = getattr(inputs[0], 'fgraph', None) fgraph = getattr(outputs[0], 'fgraph', None)
C = gpu_alloc_empty(ctx_name, dtype='float16')( C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -299,12 +299,12 @@ class GraphToGPU(NavigatorOptimizer): ...@@ -299,12 +299,12 @@ class GraphToGPU(NavigatorOptimizer):
# Building a new graph # Building a new graph
# Iterating through inputs of graph # Iterating through inputs of graph
target = str(infer_context_name(*fgraph.inputs)) target = infer_context_name(*fgraph.inputs)
for i in fgraph.inputs: for i in fgraph.inputs:
# Do not move *int* scalar to the GPU. # Do not move *int* scalar to the GPU.
if (isinstance(i.type, tensor.TensorType) and if (isinstance(i.type, tensor.TensorType) and
(i.ndim > 0 or 'int' not in i.dtype)): (i.ndim > 0 or 'int' not in i.dtype)):
mapping[i] = i.transfer(getattr(i.tag, target, None)) mapping[i] = i.transfer(getattr(i.tag, 'target', target))
else: else:
mapping[i] = i mapping[i] = i
for i in fgraph.variables: for i in fgraph.variables:
...@@ -551,7 +551,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -551,7 +551,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt('fast_compile') @register_opt('fast_compile')
@local_optimizer([tensor.Alloc]) @local_optimizer([tensor.Alloc])
def local_gpuaalloc2(node): def local_gpua_alloc2(node):
""" """
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...) Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
...@@ -576,17 +576,17 @@ def local_gpuaalloc2(node): ...@@ -576,17 +576,17 @@ def local_gpuaalloc2(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
@register_opt2([tensor.Alloc], 'fast_compile') @register_opt2([tensor.Alloc], 'fast_compile')
def local_gpuaalloc(op, context_name, inputs, outputs): def local_gpua_alloc(op, context_name, inputs, outputs):
return gpu_alloc(context_name)(*inputs) return gpu_alloc(context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty]) @op_lifter([tensor.AllocEmpty])
@register_opt2([tensor.AllocEmpty], 'fast_compile') @register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpuaallocempty(op, context_name, inputs, outputs): def local_gpua_allocempty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
return gpu_alloc_empty(context_name, **op._props_dict())(*inputs) return gpu_alloc_empty(context_name, **op._props_dict())
@register_opt() @register_opt()
...@@ -632,14 +632,14 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -632,14 +632,14 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous]) @op_lifter([tensor.extra_ops.CpuContiguous])
@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile') @register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpu_contiguous(op, context_name, inputs, outputs): def local_gpua_contiguous(op, context_name, inputs, outputs):
return gpu_contiguous return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
@register_opt2([tensor.Reshape], 'fast_compile') @register_opt2([tensor.Reshape], 'fast_compile')
def local_gpureshape(op, context_name, inputs, outputs): def local_gpua_reshape(op, context_name, inputs, outputs):
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
...@@ -650,14 +650,14 @@ def local_gpureshape(op, context_name, inputs, outputs): ...@@ -650,14 +650,14 @@ def local_gpureshape(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
@register_opt2([tensor.Rebroadcast], 'fast_compile') @register_opt2([tensor.Rebroadcast], 'fast_compile')
def local_gpu_rebroadcast(op, context_name, inputs, outputs): def local_gpua_rebroadcast(op, context_name, inputs, outputs):
return op(as_gpuarray_variable(inputs[0], context_name)) return op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
@register_opt2([tensor.Flatten], 'fast_compile') @register_opt2([tensor.Flatten], 'fast_compile')
def local_gpuflatten(op, context_name, inputs, outputs): def local_gpua_flatten(op, context_name, inputs, outputs):
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
shp = [inputs[0].shape[i] for i in range(op.outdim - 1)] shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
...@@ -785,7 +785,7 @@ def gpu_print_wrapper(op, cnda): ...@@ -785,7 +785,7 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
@register_opt2([tensor.printing.Print], 'fast_compile') @register_opt2([tensor.printing.Print], 'fast_compile')
def local_gpu_print_op(op, context_name, inputs, outputs): def local_gpua_print_op(op, context_name, inputs, outputs):
x, = inputs x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name) gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = op.__class__(global_fn=gpu_print_wrapper) new_op = op.__class__(global_fn=gpu_print_wrapper)
...@@ -886,7 +886,7 @@ def local_gpua_join(op, context_name, inputs, outputs): ...@@ -886,7 +886,7 @@ def local_gpua_join(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@local_optimizer([GpuJoin]) @local_optimizer([GpuJoin])
def local_gpuajoin_1(node): def local_gpua_join_1(node):
# join of a single element # join of a single element
if (isinstance(node.op, GpuJoin) and if (isinstance(node.op, GpuJoin) and
len(node.inputs) == 2): len(node.inputs) == 2):
...@@ -1142,25 +1142,25 @@ def local_gpua_hgemm(op, context_name, inputs, outputs): ...@@ -1142,25 +1142,25 @@ def local_gpua_hgemm(op, context_name, inputs, outputs):
@register_opt() @register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4) @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs): def local_gpua_gemm_alpha_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0) @output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs): def local_gpua_gemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4) @alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
def local_gpuagemmbatch_alpha_merge(node, *inputs): def local_gpua_gemmbatch_alpha_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)] return [gpugemmbatch_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0) @output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemmbatch_output_merge(node, *inputs): def local_gpua_gemmbatch_output_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)] return [gpugemmbatch_no_inplace(*inputs)]
...@@ -1193,7 +1193,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs): ...@@ -1193,7 +1193,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs):
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
@register_opt2([tensor.basic.Eye], 'fast_compile') @register_opt2([tensor.basic.Eye], 'fast_compile')
def local_gpua_eye(op, context_name, inputs, outputs): def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name)(*inputs) return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
...@@ -1226,7 +1226,7 @@ def local_gpua_softmaxwithbias(op, context_name, inputs, outputs): ...@@ -1226,7 +1226,7 @@ def local_gpua_softmaxwithbias(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert]) @op_lifter([theano.tensor.opt.Assert])
def local_assert(op, context_name, inputs, outputs): def local_gpua_assert(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
return local_assert_graph(op, context_name, inputs, outputs) return local_assert_graph(op, context_name, inputs, outputs)
...@@ -1241,7 +1241,7 @@ def local_assert_graph(op, context_name, inputs, outputs): ...@@ -1241,7 +1241,7 @@ def local_assert_graph(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
@register_opt2([ConvOp], 'fast_compile') @register_opt2([ConvOp], 'fast_compile')
def local_error_convop(op, context_name, inputs, outputs): def local_gpua_error_convop(op, context_name, inputs, outputs):
assert False, """ assert False, """
ConvOp does not work with the gpuarray backend. ConvOp does not work with the gpuarray backend.
...@@ -1253,7 +1253,7 @@ theano.tensor.nnet.conv2d() ...@@ -1253,7 +1253,7 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockGemv]) @op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile') @register_opt2([SparseBlockGemv], 'fast_compile')
def local_lift_sparseblockgemv(op, context_name, inputs, outputs): def local_gpua_lift_sparseblockgemv(op, context_name, inputs, outputs):
if op.inplace: if op.inplace:
return gpu_sparse_block_gemv_inplace return gpu_sparse_block_gemv_inplace
else: else:
...@@ -1263,7 +1263,7 @@ def local_lift_sparseblockgemv(op, context_name, inputs, outputs): ...@@ -1263,7 +1263,7 @@ def local_lift_sparseblockgemv(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockOuter]) @op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile') @register_opt2([SparseBlockOuter], 'fast_compile')
def local_lift_sparseblockouter(op, context_name, inputs, outputs): def local_gpua_lift_sparseblockouter(op, context_name, inputs, outputs):
if op.inplace: if op.inplace:
return gpu_sparse_block_outer_inplace return gpu_sparse_block_outer_inplace
else: else:
...@@ -1289,17 +1289,17 @@ def local_inplace_sparseblockouter(node): ...@@ -1289,17 +1289,17 @@ def local_inplace_sparseblockouter(node):
@op_lifter([AbstractConv2d, @op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(op, context_name, inputs, outputs): def local_gpua_lift_abstractconv2d(op, context_name, inputs, outputs):
if isinstance(outputs[0].type, GpuArrayType): if isinstance(outputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU. # Don't handle this node here, it's already on the GPU.
return return
return local_lift_abstractconv2d_graph(op, context_name, inputs, outputs) return local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs)
@register_opt2([AbstractConv2d, @register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn') AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_lift_abstractconv2d_graph(op, context_name, inputs, outputs): def local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs):
inps = list(inputs) inps = list(inputs)
inps[0] = as_gpuarray_variable(inputs[0], inps[0] = as_gpuarray_variable(inputs[0],
context_name=context_name) context_name=context_name)
...@@ -1413,7 +1413,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -1413,7 +1413,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
@register_opt2([scan_op.Scan], 'fast_compile') @register_opt2([scan_op.Scan], 'fast_compile')
def local_scan_to_gpua(op, context_name, inputs, outputs): def local_gpua_scan_to_gpua(op, context_name, inputs, outputs):
info = copy.deepcopy(op.info) info = copy.deepcopy(op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论