提交 e8017096 · 作者: sentient07

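Added outputs argument to every local optimizer wrapped by op_lifter / register_opt2, so that op_lifter and GraphToGPU always pass node.outputs instead of retrying the call after a TypeError.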

上级 3510323b
...@@ -1414,7 +1414,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1414,7 +1414,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
@register_opt2([AbstractConv2d, AbstractConv2d_gradWeights, @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile') AbstractConv2d_gradInputs], 'fast_compile')
def local_abstractconv_cudnn_graph(op, context_name, inputs): def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv2d, if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))): AbstractConv2d_gradInputs))):
...@@ -1536,7 +1536,7 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1536,7 +1536,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([Pool]) @op_lifter([Pool])
@register_opt2([Pool], 'fast_compile') @register_opt2([Pool], 'fast_compile')
def local_pool_dnn_alternative(op, ctx_name, inputs): def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not op.ignore_border: if not op.ignore_border:
...@@ -1553,7 +1553,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs): ...@@ -1553,7 +1553,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
@register_opt2([MaxPoolGrad], 'fast_compile') @register_opt2([MaxPoolGrad], 'fast_compile')
def local_pool_dnn_grad_stride(op, ctx_name, inputs): def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not op.ignore_border: if not op.ignore_border:
...@@ -1578,7 +1578,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs): ...@@ -1578,7 +1578,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
@register_opt2([AveragePoolGrad], 'fast_compile') @register_opt2([AveragePoolGrad], 'fast_compile')
def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs): def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not op.ignore_border: if not op.ignore_border:
...@@ -1632,7 +1632,7 @@ def local_log_softmax_dnn(node): ...@@ -1632,7 +1632,7 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], 'fast_compile') @register_opt2([LogSoftmax], 'fast_compile')
def local_logsoftmax_to_dnn(op, ctx_name, inputs): def local_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = inputs[0] inp = inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
...@@ -1671,7 +1671,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1671,7 +1671,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile') @register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_softmax_dnn_grad(op, ctx_name, inputs): def local_softmax_dnn_grad(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad") raise_no_cudnn("cuDNN needed for SoftmaxGrad")
ins = [] ins = []
......
...@@ -454,7 +454,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -454,7 +454,7 @@ class GpuCumsum(GpuKernelBase, Op):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
@register_opt2([CumsumOp], 'fast_compile') @register_opt2([CumsumOp], 'fast_compile')
def use_gpu_cumsumop(op, ctx_name, inputs): def use_gpu_cumsumop(op, ctx_name, inputs, outputs):
if inputs[0].dtype == 'float32': if inputs[0].dtype == 'float32':
axis = op.axis axis = op.axis
x = inputs[0] x = inputs[0]
......
...@@ -230,7 +230,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -230,7 +230,7 @@ KERNEL void k_multi_warp_multinomial(
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform]) @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
@register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile') @register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
def local_gpua_multinomial(op, context_name, inputs): def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function # TODO : need description for function
if len(inputs) == 2: if len(inputs) == 2:
......
...@@ -150,7 +150,7 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -150,7 +150,7 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
@opt.register_opt('fast_compile') @opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
@opt.register_opt2([tensor.Dot], 'fast_compile') @opt.register_opt2([tensor.Dot], 'fast_compile')
def local_dot_to_gemm16(op, ctx_name, inputs): def local_dot_to_gemm16(op, ctx_name, inputs, outputs):
if nerv is None: if nerv is None:
return return
A = inputs[0] A = inputs[0]
......
...@@ -190,12 +190,7 @@ def op_lifter(OP, cuda_only=False): ...@@ -190,12 +190,7 @@ def op_lifter(OP, cuda_only=False):
for i in node.inputs: for i in node.inputs:
i.tag.context_name = context_name i.tag.context_name = context_name
try: new_op = maker(node.op, context_name, node.inputs, node.outputs)
new_op = maker(node.op, context_name, node.inputs)
except TypeError:
# Pass the outputs so that the Local Optimizers don't need to
# build the nodes again.
new_op = maker(node.op, context_name, node.inputs, node.outputs)
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
...@@ -325,22 +320,14 @@ class GraphToGPU(NavigatorOptimizer): ...@@ -325,22 +320,14 @@ class GraphToGPU(NavigatorOptimizer):
for lopt in (self.local_optimizers_map.get(node.op, []) + for lopt in (self.local_optimizers_map.get(node.op, []) +
self.local_optimizers_map.get(type(node.op), []) + self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_all): self.local_optimizers_all):
if move_to_GPU: if move_to_GPU:
t_opt = time.time() t_opt = time.time()
try: new_ops = lopt.transform(node.op, context_name,
new_ops = lopt.transform(node.op, context_name, [mapping[i] for i in node.inputs],
[mapping[i] for i in node.inputs]) node.outputs)
except TypeError: t_opt2 = time.time()
# Updating again because else we'd be counting time_opts[lopt] += t_opt2 - t_opt
# time for two except clauses
t_opt = time.time()
new_ops = lopt.transform(node.op, context_name,
[mapping[i] for i in node.inputs],
node.outputs)
finally:
t_opt2 = time.time()
time_opts[lopt] += t_opt2 - t_opt
if new_ops: if new_ops:
process_count[lopt] += 1 process_count[lopt] += 1
break break
...@@ -402,8 +389,7 @@ class GraphToGPU(NavigatorOptimizer): ...@@ -402,8 +389,7 @@ class GraphToGPU(NavigatorOptimizer):
print(blanc, getattr(opt, "name", print(blanc, getattr(opt, "name",
getattr(opt, "__name__", "")), file=stream) getattr(opt, "__name__", "")), file=stream)
print(blanc, " time io_toposort %.3fs" % sum( print(blanc, " time io_toposort %.3fs" % toposort_timing, file=stream)
toposort_timing), file=stream)
s = sum([v for k, v in time_opts.iteritems()]) s = sum([v for k, v in time_opts.iteritems()])
print(blanc, "Total time taken by local optimizers %.3fs " % s, file=stream) print(blanc, "Total time taken by local optimizers %.3fs " % s, file=stream)
...@@ -562,14 +548,14 @@ def local_gpuaalloc2(node): ...@@ -562,14 +548,14 @@ def local_gpuaalloc2(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
@register_opt2([tensor.Alloc], 'fast_compile') @register_opt2([tensor.Alloc], 'fast_compile')
def local_gpuaalloc(op, context_name, inputs): def local_gpuaalloc(op, context_name, inputs, outputs):
return GpuAlloc(context_name)(*inputs) return GpuAlloc(context_name)(*inputs)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty]) @op_lifter([tensor.AllocEmpty])
@register_opt2([tensor.AllocEmpty], 'fast_compile') @register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpuaallocempty(op, context_name, inputs): def local_gpuaallocempty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
return GpuAllocEmpty(context_name=context_name, return GpuAllocEmpty(context_name=context_name,
...@@ -619,14 +605,14 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -619,14 +605,14 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous]) @op_lifter([tensor.extra_ops.CpuContiguous])
@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile') @register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpu_contiguous(op, context_name, inputs): def local_gpu_contiguous(op, context_name, inputs, outputs):
return gpu_contiguous return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
@register_opt2([tensor.Reshape], 'fast_compile') @register_opt2([tensor.Reshape], 'fast_compile')
def local_gpureshape(op, context_name, inputs): def local_gpureshape(op, context_name, inputs, outputs):
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
...@@ -637,14 +623,14 @@ def local_gpureshape(op, context_name, inputs): ...@@ -637,14 +623,14 @@ def local_gpureshape(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
@register_opt2([tensor.Rebroadcast], 'fast_compile') @register_opt2([tensor.Rebroadcast], 'fast_compile')
def local_gpu_rebroadcast(op, context_name, inputs): def local_gpu_rebroadcast(op, context_name, inputs, outputs):
return op(as_gpuarray_variable(inputs[0], context_name)) return op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
@register_opt2([tensor.Flatten], 'fast_compile') @register_opt2([tensor.Flatten], 'fast_compile')
def local_gpuflatten(op, context_name, inputs): def local_gpuflatten(op, context_name, inputs, outputs):
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
shp = [inputs[0].shape[i] for i in range(op.outdim - 1)] shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
...@@ -730,7 +716,7 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -730,7 +716,7 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
@register_opt2([tensor.DimShuffle], 'fast_compile') @register_opt2([tensor.DimShuffle], 'fast_compile')
def local_gpua_dimshuffle(op, context_name, inputs): def local_gpua_dimshuffle(op, context_name, inputs, outputs):
return GpuDimShuffle(op.input_broadcastable, return GpuDimShuffle(op.input_broadcastable,
op.new_order) op.new_order)
...@@ -738,7 +724,7 @@ def local_gpua_dimshuffle(op, context_name, inputs): ...@@ -738,7 +724,7 @@ def local_gpua_dimshuffle(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
@register_opt2([tensor.SpecifyShape], 'fast_compile') @register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape(op, context_name, inputs): def local_gpua_specifyShape(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
inp = [as_gpuarray_variable(inputs[0], context_name)] inp = [as_gpuarray_variable(inputs[0], context_name)]
...@@ -749,7 +735,7 @@ def local_gpua_specifyShape(op, context_name, inputs): ...@@ -749,7 +735,7 @@ def local_gpua_specifyShape(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape]) @op_lifter([theano.compile.ops.Shape])
@register_opt2([tensor.compile.ops.Shape], 'fast_compile') @register_opt2([tensor.compile.ops.Shape], 'fast_compile')
def local_gpua_shape(node, context_name, inputs): def local_gpua_shape(node, context_name, inputs, outputs):
# op_lifter will call this opt too frequently as the output is # op_lifter will call this opt too frequently as the output is
# always on the CPU. # always on the CPU.
if isinstance(inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
...@@ -764,7 +750,7 @@ def gpu_print_wrapper(op, cnda): ...@@ -764,7 +750,7 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
@register_opt2([tensor.printing.Print], 'fast_compile') @register_opt2([tensor.printing.Print], 'fast_compile')
def local_gpu_print_op(op, context_name, inputs): def local_gpu_print_op(op, context_name, inputs, outputs):
x, = inputs x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name) gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = op.__class__(global_fn=gpu_print_wrapper) new_op = op.__class__(global_fn=gpu_print_wrapper)
...@@ -843,7 +829,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -843,7 +829,7 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([IfElse]) @op_lifter([IfElse])
@register_opt2([IfElse], 'fast_compile') @register_opt2([IfElse], 'fast_compile')
def local_gpua_lazy_ifelse(op, context_name, inputs): def local_gpua_lazy_ifelse(op, context_name, inputs, outputs):
if op.gpu: if op.gpu:
return return
# this node is already on GPU, so don't change the graph # this node is already on GPU, so don't change the graph
...@@ -864,7 +850,7 @@ def local_gpua_lazy_ifelse(op, context_name, inputs): ...@@ -864,7 +850,7 @@ def local_gpua_lazy_ifelse(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
@register_opt2([tensor.Join], 'fast_compile') @register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(op, context_name, inputs): def local_gpua_join(op, context_name, inputs, outputs):
return gpu_join return gpu_join
...@@ -880,7 +866,7 @@ def local_gpuajoin_1(node): ...@@ -880,7 +866,7 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
@register_opt2([tensor.Split], 'fast_compile') @register_opt2([tensor.Split], 'fast_compile')
def local_gpua_split(op, context_name, inputs): def local_gpua_split(op, context_name, inputs, outputs):
# TODO use props # TODO use props
return GpuSplit(op.len_splits) return GpuSplit(op.len_splits)
...@@ -937,7 +923,7 @@ def local_gpua_subtensor_graph(op, context_name, inputs, outputs): ...@@ -937,7 +923,7 @@ def local_gpua_subtensor_graph(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
@register_opt2([tensor.IncSubtensor], 'fast_compile') @register_opt2([tensor.IncSubtensor], 'fast_compile')
def local_gpua_incsubtensor(op, context_name, inputs): def local_gpua_incsubtensor(op, context_name, inputs, outputs):
op = GpuIncSubtensor(op.idx_list, op.inplace, op = GpuIncSubtensor(op.idx_list, op.inplace,
op.set_instead_of_inc, op.set_instead_of_inc,
op.destroyhandler_tolerate_aliased) op.destroyhandler_tolerate_aliased)
...@@ -950,14 +936,14 @@ def local_gpua_incsubtensor(op, context_name, inputs): ...@@ -950,14 +936,14 @@ def local_gpua_incsubtensor(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs): def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs): def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if context.kind != b'cuda': if context.kind != b'cuda':
...@@ -1082,7 +1068,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1082,7 +1068,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile') @register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(op, context_name, inputs): def local_gpua_gemv(op, context_name, inputs, outputs):
if op.inplace: if op.inplace:
return gpugemv_inplace return gpugemv_inplace
else: else:
...@@ -1092,7 +1078,7 @@ def local_gpua_gemv(op, context_name, inputs): ...@@ -1092,7 +1078,7 @@ def local_gpua_gemv(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
@register_opt2([tensor.blas.Gemm], 'fast_compile') @register_opt2([tensor.blas.Gemm], 'fast_compile')
def local_gpua_gemm(op, context_name, inputs): def local_gpua_gemm(op, context_name, inputs, outputs):
if op.inplace: if op.inplace:
return gpugemm_inplace return gpugemm_inplace
else: else:
...@@ -1102,7 +1088,7 @@ def local_gpua_gemm(op, context_name, inputs): ...@@ -1102,7 +1088,7 @@ def local_gpua_gemm(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot]) @op_lifter([tensor.blas.BatchedDot])
@register_opt2([tensor.blas.BatchedDot], 'fast_compile') @register_opt2([tensor.blas.BatchedDot], 'fast_compile')
def local_gpua_gemmbatch(op, context_name, inputs): def local_gpua_gemmbatch(op, context_name, inputs, outputs):
a, b = inputs a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2]) c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
...@@ -1111,7 +1097,7 @@ def local_gpua_gemmbatch(op, context_name, inputs): ...@@ -1111,7 +1097,7 @@ def local_gpua_gemmbatch(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Dot]) @op_lifter([tensor.basic.Dot])
@register_opt2([tensor.basic.Dot], 'fast_compile') @register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(op, context_name, inputs): def local_gpua_hgemm(op, context_name, inputs, outputs):
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since " _logger.warning("Not performing dot of float16 on the GPU since "
...@@ -1155,20 +1141,20 @@ def local_gpuagemmbatch_output_merge(node, *inputs): ...@@ -1155,20 +1141,20 @@ def local_gpuagemmbatch_output_merge(node, *inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(op, context_name, inputs): def local_gpua_ger(op, context_name, inputs, outputs):
return GpuGer(inplace=op.destructive) return GpuGer(inplace=op.destructive)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(op, context_name, inputs): def local_gpua_dot22(op, context_name, inputs, outputs):
return gpu_dot22 return gpu_dot22
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22Scalar]) @op_lifter([tensor.blas.Dot22Scalar])
@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile') @register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
def local_gpua_dot22scalar(op, context_name, inputs): def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
...@@ -1179,42 +1165,42 @@ def local_gpua_dot22scalar(op, context_name, inputs): ...@@ -1179,42 +1165,42 @@ def local_gpua_dot22scalar(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
@register_opt2([tensor.basic.Eye], 'fast_compile') @register_opt2([tensor.basic.Eye], 'fast_compile')
def local_gpua_eye(op, context_name, inputs): def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name) return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile') @register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs): def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_argmax_1hot_with_bias return gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile') @register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile')
def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs): def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_1hot_with_bias_dx return gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True) @op_lifter([tensor.nnet.Softmax], cuda_only=True)
@register_opt2([tensor.nnet.Softmax], 'fast_compile') @register_opt2([tensor.nnet.Softmax], 'fast_compile')
def local_gpua_softmax(op, context_name, inputs): def local_gpua_softmax(op, context_name, inputs, outputs):
return gpu_softmax return gpu_softmax
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True) @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
@register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile') @register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile')
def local_gpua_softmaxwithbias(node, context_name): def local_gpua_softmaxwithbias(node, context_name, inputs, outputs):
return gpu_softmax_with_bias return gpu_softmax_with_bias
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert]) @op_lifter([theano.tensor.opt.Assert])
@register_opt2([theano.tensor.opt.Assert], 'fast_compile') @register_opt2([theano.tensor.opt.Assert], 'fast_compile')
def local_assert(op, context_name, inputs): def local_assert(op, context_name, inputs, outputs):
# Check if input nodes are already on the GPU # Check if input nodes are already on the GPU
if isinstance(inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
...@@ -1224,7 +1210,7 @@ def local_assert(op, context_name, inputs): ...@@ -1224,7 +1210,7 @@ def local_assert(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
def local_error_convop(op, context_name, inputs): def local_error_convop(op, context_name, inputs, outputs):
assert False, """ assert False, """
ConvOp does not work with the gpuarray backend. ConvOp does not work with the gpuarray backend.
...@@ -1236,7 +1222,7 @@ theano.tensor.nnet.conv2d() ...@@ -1236,7 +1222,7 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockGemv]) @op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile') @register_opt2([SparseBlockGemv], 'fast_compile')
def local_lift_sparseblockgemv(op, context_name, inputs): def local_lift_sparseblockgemv(op, context_name, inputs, outputs):
if op.inplace: if op.inplace:
return gpu_sparse_block_gemv_inplace return gpu_sparse_block_gemv_inplace
else: else:
...@@ -1246,7 +1232,7 @@ def local_lift_sparseblockgemv(op, context_name, inputs): ...@@ -1246,7 +1232,7 @@ def local_lift_sparseblockgemv(op, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockOuter]) @op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile') @register_opt2([SparseBlockOuter], 'fast_compile')
def local_lift_sparseblockouter(op, context_name, inputs): def local_lift_sparseblockouter(op, context_name, inputs, outputs):
if op.inplace: if op.inplace:
return gpu_sparse_block_outer_inplace return gpu_sparse_block_outer_inplace
else: else:
...@@ -1275,7 +1261,7 @@ def local_inplace_sparseblockouter(node): ...@@ -1275,7 +1261,7 @@ def local_inplace_sparseblockouter(node):
@register_opt2([AbstractConv2d, @register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile') AbstractConv2d_gradInputs], 'fast_compile')
def local_lift_abstractconv2d(op, context_name, inputs): def local_lift_abstractconv2d(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU. # Don't handle this node here, it's already on the GPU.
return return
...@@ -1392,7 +1378,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -1392,7 +1378,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
@register_opt2([scan_op.Scan], 'fast_compile') @register_opt2([scan_op.Scan], 'fast_compile')
def local_scan_to_gpua(op, context_name, inputs): def local_scan_to_gpua(op, context_name, inputs, outputs):
info = copy.deepcopy(op.info) info = copy.deepcopy(op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论