提交 e45b6cd6 authored 作者: Frederic Bastien's avatar Frederic Bastien 提交者: sentient07

Advance GraphToGPU. Make register_opt2 to register to it to bypass op_lifter

上级 d9d4fc8b
...@@ -55,6 +55,7 @@ _logger = logging.getLogger("theano.gpuarray.opt") ...@@ -55,6 +55,7 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_optimizer2 = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -66,19 +67,17 @@ class GraphToGPUDB(DB): ...@@ -66,19 +67,17 @@ class GraphToGPUDB(DB):
""" """
def query(self, *tags, **kwtags): def query(self, *tags, **kwtags):
opt = gpu_optimizer.query(*tags, **kwtags) opt = gpu_optimizer2.query(*tags, **kwtags)
return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map) return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
graph_optimizer = GraphToGPUDB()
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
# Don't register this right now # Don't register this right now
conv_groupopt = LocalGroupDB() conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts" conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpu_graph_optimization', graph_optimizer, -0.5, gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
...@@ -100,6 +99,15 @@ def register_opt(*tags, **kwargs): ...@@ -100,6 +99,15 @@ def register_opt(*tags, **kwargs):
return f return f
def register_opt2(tracks, *tags, **kwargs):
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
opt = theano.gof.local_optimizer(tracks)(local_opt)
gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
return local_opt
return f
def register_inplace(*tags, **kwargs): def register_inplace(*tags, **kwargs):
def f(local_opt): def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__ name = (kwargs and kwargs.pop('name')) or local_opt.__name__
...@@ -176,7 +184,10 @@ def op_lifter(OP, cuda_only=False): ...@@ -176,7 +184,10 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs # the context was derived from the outputs
for i in node.inputs: for i in node.inputs:
i.tag.context_name = context_name i.tag.context_name = context_name
new_op = maker(node, context_name) try:
new_op = maker(node, context_name, node.inputs)
except TypeError:
new_op = maker(node, context_name)
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
...@@ -253,7 +264,7 @@ class GraphToGPU(Optimizer): ...@@ -253,7 +264,7 @@ class GraphToGPU(Optimizer):
# Iterating through inputs of graph # Iterating through inputs of graph
for i in fgraph.inputs: for i in fgraph.inputs:
if isinstance(i.type, tensor.TensorType): if isinstance(i.type, tensor.TensorType):
mapping[i] = GpuFromHost(None)(i) mapping[i] = as_gpuarray_variable(i, None) # TODO context
else: else:
mapping[i] = i mapping[i] = i
for i in fgraph.variables: for i in fgraph.variables:
...@@ -262,39 +273,60 @@ class GraphToGPU(Optimizer): ...@@ -262,39 +273,60 @@ class GraphToGPU(Optimizer):
for node in fgraph.toposort(): for node in fgraph.toposort():
# The Extra condition if isinstance(node.op, HostFromGpu):
if any([isinstance(i, GpuArrayVariable) or mapping[node.outputs[0]] = node.inputs[0]
isinstance(i, GpuArraySharedVariable) continue
for i in node.inputs + node.outputs]):
move_to_GPU = False # Move only if any of the inputs are on the GPU.
move_to_GPU = False
if any([isinstance(i, GpuArrayVariable) or
isinstance(i, GpuArraySharedVariable)
for i in [mapping[v] for v in node.inputs] +
node.outputs]):
# Oplifter's condition move_to_GPU = True
# Will return a list of OP
# If None, means can't be moved.
new_ops = None new_ops = None
# Apply the lifter
# Selecting the best optimizer
# TODO : the tag should be updated to the one user provides
# currently using fast_run and fast_compile tag
for lopt in (self.local_optimizers_all + for lopt in (self.local_optimizers_all +
self.local_optimizers_map.get(type(node.op), []) + self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_map.get(node.op, [])): self.local_optimizers_map.get(node.op, [])):
replace = False
new_ops = lopt.transform(node) or lopt(node) for i in [mapping[i] for i in node.inputs]:
break if isinstance(i.type, GpuArrayType):
context_name = i.type.context_name
if not new_ops or not isinstance(new_ops, theano.Op): replace = True
move_to_GPU = False break
if replace:
if move_to_GPU: try:
newnode = new_ops(*[mapping.get(i) for i in node.inputs]) new_ops = lopt.transform(
for new_o, old_o in zip(newnode.outputs, node.outputs): node, context_name,
mapping[old_o] = new_o [mapping[i] for i in node.inputs])
except TypeError:
new_ops = lopt.transform(node, context_name)
if new_ops:
break
if not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i)
for i in node.inputs])
outputs = newnode.outputs
elif isinstance(new_ops, (tuple, list)):
outputs = []
for o in new_ops:
if o.owner and isinstance(o.owner.op, HostFromGpu):
outputs.append(o.owner.inputs[0])
else:
outputs.append(o)
elif isinstance(new_ops, theano.Variable):
if new_ops.owner and isinstance(new_ops.owner.op, HostFromGpu):
outputs = new_ops.owner.inputs
else:
outputs = [new_ops]
else: else:
for o in node.outputs: outputs = new_ops(*[mapping[i] for i in node.inputs],
mapping[o] = o return_list=True)
for new_o, old_o in zip(outputs, node.outputs):
mapping[old_o] = new_o
new_nodes = [] new_nodes = []
for o in fgraph.outputs: for o in fgraph.outputs:
...@@ -389,12 +421,14 @@ def local_gpuaalloc2(node): ...@@ -389,12 +421,14 @@ def local_gpuaalloc2(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
@register_opt2([tensor.Alloc], 'fast_compile')
def local_gpuaalloc(node, context_name): def local_gpuaalloc(node, context_name):
return GpuAlloc(context_name)(*node.inputs) return GpuAlloc(context_name)(*node.inputs)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty]) @op_lifter([tensor.AllocEmpty])
@register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpuaallocempty(node, context_name): def local_gpuaallocempty(node, context_name):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
...@@ -444,12 +478,14 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -444,12 +478,14 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous]) @op_lifter([tensor.extra_ops.CpuContiguous])
@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpu_contiguous(node, context_name): def local_gpu_contiguous(node, context_name):
return gpu_contiguous return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
@register_opt2([tensor.Reshape], 'fast_compile')
def local_gpureshape(node, context_name): def local_gpureshape(node, context_name):
op = node.op op = node.op
name = op.name name = op.name
...@@ -461,26 +497,29 @@ def local_gpureshape(node, context_name): ...@@ -461,26 +497,29 @@ def local_gpureshape(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node, context_name): @register_opt2([tensor.Rebroadcast], 'fast_compile')
return node.op(as_gpuarray_variable(node.inputs[0], context_name)) def local_gpu_rebroadcast(node, context_name, inputs):
return node.op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node, context_name): @register_opt2([tensor.Flatten], 'fast_compile')
def local_gpuflatten(node, context_name, inputs):
op = node.op op = node.op
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
shp = [node.inputs[0].shape[i] for i in range(op.outdim - 1)] shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
shp += [-1] shp += [-1]
res = GpuReshape(op.outdim, None) res = GpuReshape(op.outdim, None)
o = res(node.inputs[0], theano.tensor.as_tensor_variable(shp)) o = res(inputs[0], theano.tensor.as_tensor_variable(shp))
return o return o
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node, context_name): @register_opt2([tensor.Elemwise], 'fast_compile')
def local_gpu_elemwise(node, context_name, inputs):
op = node.op op = node.op
scal_op = op.scalar_op scal_op = op.scalar_op
name = op.name name = op.name
...@@ -505,7 +544,7 @@ def local_gpu_elemwise(node, context_name): ...@@ -505,7 +544,7 @@ def local_gpu_elemwise(node, context_name):
# Transfer the inputs on the GPU and cast them to the right dtype. # Transfer the inputs on the GPU and cast them to the right dtype.
new_inputs = [] new_inputs = []
for inp in node.inputs: for inp in inputs:
if inp.dtype != out_dtype: if inp.dtype != out_dtype:
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype))) gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name))) new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name)))
...@@ -553,6 +592,7 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -553,6 +592,7 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
@register_opt2([tensor.DimShuffle], 'fast_compile')
def local_gpua_dimshuffle(node, context_name): def local_gpua_dimshuffle(node, context_name):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
...@@ -560,22 +600,24 @@ def local_gpua_dimshuffle(node, context_name): ...@@ -560,22 +600,24 @@ def local_gpua_dimshuffle(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node, context_name): @register_opt2([tensor.SpecifyShape], 'fast_compile')
if isinstance(node.inputs[0].type, GpuArrayType): def local_gpua_specifyShape(node, context_name, inputs):
if isinstance(inputs[0].type, GpuArrayType):
return return
inp = [as_gpuarray_variable(node.inputs[0], context_name)] inp = [as_gpuarray_variable(inputs[0], context_name)]
inp += node.inputs[1:] inp += inputs[1:]
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape]) @op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node, context_name): @register_opt2([tensor.compile.ops.Shape], 'fast_compile')
def local_gpua_shape(node, context_name, inputs):
# op_lifter will call this opt too frequently as the output is # op_lifter will call this opt too frequently as the output is
# always on the CPU. # always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
return [as_gpuarray_variable(node.inputs[0], context_name).shape] return [as_gpuarray_variable(inputs[0], context_name).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
...@@ -584,8 +626,9 @@ def gpu_print_wrapper(op, cnda): ...@@ -584,8 +626,9 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
def local_gpu_print_op(node, context_name): @register_opt2([tensor.printing.Print], 'fast_compile')
x, = node.inputs def local_gpu_print_op(node, context_name, inputs):
x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name) gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = node.op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op new_op.old_op = node.op
...@@ -662,12 +705,13 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -662,12 +705,13 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([IfElse]) @op_lifter([IfElse])
def local_gpua_lazy_ifelse(node, context_name): @register_opt2([IfElse], 'fast_compile')
def local_gpua_lazy_ifelse(node, context_name, inputs):
if node.op.gpu: if node.op.gpu:
return return
c = node.inputs[0] c = inputs[0]
inps = [] inps = []
for v in node.inputs[1:]: for v in inputs[1:]:
if isinstance(v.type, (tensor.TensorType, GpuArrayType)): if isinstance(v.type, (tensor.TensorType, GpuArrayType)):
inps.append(as_gpuarray_variable(v, context_name)) inps.append(as_gpuarray_variable(v, context_name))
else: else:
...@@ -677,6 +721,7 @@ def local_gpua_lazy_ifelse(node, context_name): ...@@ -677,6 +721,7 @@ def local_gpua_lazy_ifelse(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
@register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(node, context_name): def local_gpua_join(node, context_name):
return gpu_join return gpu_join
...@@ -692,12 +737,15 @@ def local_gpuajoin_1(node): ...@@ -692,12 +737,15 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
@register_opt2([tensor.Split], 'fast_compile')
def local_gpua_split(node, context_name): def local_gpua_split(node, context_name):
#TODO use props
return GpuSplit(node.op.len_splits) return GpuSplit(node.op.len_splits)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
@register_opt2([tensor.Subtensor], 'fast_compile')
def local_gpua_subtensor(node, context_name): def local_gpua_subtensor(node, context_name):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
...@@ -719,11 +767,12 @@ def local_gpua_subtensor(node, context_name): ...@@ -719,11 +767,12 @@ def local_gpua_subtensor(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node, context_name): @register_opt2([tensor.IncSubtensor], 'fast_compile')
def local_gpua_incsubtensor(node, context_name, inputs):
op = GpuIncSubtensor(node.op.idx_list, node.op.inplace, op = GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc, node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
ret = op(*node.inputs) ret = op(*inputs)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True) val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val ret.tag.nan_guard_mode_check = val
return ret return ret
...@@ -731,12 +780,14 @@ def local_gpua_incsubtensor(node, context_name): ...@@ -731,12 +780,14 @@ def local_gpua_incsubtensor(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(node, context_name): def local_gpua_advanced_subtensor(node, context_name):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(node, context_name): def local_gpua_advanced_incsubtensor(node, context_name):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
...@@ -776,6 +827,7 @@ def local_advincsub1_gpua_inplace(node): ...@@ -776,6 +827,7 @@ def local_advincsub1_gpua_inplace(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
@register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
def local_gpua_careduce(node, context_name): def local_gpua_careduce(node, context_name):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
...@@ -859,6 +911,7 @@ def local_gpua_careduce(node, context_name): ...@@ -859,6 +911,7 @@ def local_gpua_careduce(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(node, context_name): def local_gpua_gemv(node, context_name):
if node.op.inplace: if node.op.inplace:
return gpugemv_inplace return gpugemv_inplace
...@@ -868,6 +921,7 @@ def local_gpua_gemv(node, context_name): ...@@ -868,6 +921,7 @@ def local_gpua_gemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
@register_opt2([tensor.blas.Gemm], 'fast_compile')
def local_gpua_gemm(node, context_name): def local_gpua_gemm(node, context_name):
if node.op.inplace: if node.op.inplace:
return gpugemm_inplace return gpugemm_inplace
...@@ -877,26 +931,28 @@ def local_gpua_gemm(node, context_name): ...@@ -877,26 +931,28 @@ def local_gpua_gemm(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot]) @op_lifter([tensor.blas.BatchedDot])
def local_gpua_gemmbatch(node, context_name): @register_opt2([tensor.blas.BatchedDot], 'fast_compile')
a, b = node.inputs def local_gpua_gemmbatch(node, context_name, inputs):
a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2]) c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Dot]) @op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node, context_name): @register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(node, context_name, inputs):
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since " _logger.warning("Not performing dot of float16 on the GPU since "
"cuda 7.5 is not available. Updating could speed up " "cuda 7.5 is not available. Updating could speed up "
"your code.") "your code.")
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16', context_name=context_name)( C = GpuAllocEmpty(dtype='float16', context_name=context_name)(
shape_i(A, 0, fgraph), shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph)) shape_i(B, 1, fgraph))
...@@ -941,8 +997,9 @@ def local_gpua_dot22(node, context_name): ...@@ -941,8 +997,9 @@ def local_gpua_dot22(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22Scalar]) @op_lifter([tensor.blas.Dot22Scalar])
def local_gpua_dot22scalar(node, context_name): @register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
x, y, a = node.inputs def local_gpua_dot22scalar(node, context_name, inputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1]) z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1])
...@@ -951,30 +1008,35 @@ def local_gpua_dot22scalar(node, context_name): ...@@ -951,30 +1008,35 @@ def local_gpua_dot22scalar(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
@register_opt2([tensor.basic.Eye], 'fast_compile')
def local_gpua_eye(node, context_name): def local_gpua_eye(node, context_name):
return GpuEye(dtype=node.op.dtype, context_name=context_name) return GpuEye(dtype=node.op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name): def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name):
return gpu_crossentropy_softmax_argmax_1hot_with_bias return gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile')
def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name): def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name):
return gpu_crossentropy_softmax_1hot_with_bias_dx return gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True) @op_lifter([tensor.nnet.Softmax], cuda_only=True)
@register_opt2([tensor.nnet.Softmax], 'fast_compile')
def local_gpua_softmax(node, context_name): def local_gpua_softmax(node, context_name):
return gpu_softmax return gpu_softmax
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True) @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
@register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile')
def local_gpua_softmaxwithbias(node, context_name): def local_gpua_softmaxwithbias(node, context_name):
return gpu_softmax_with_bias return gpu_softmax_with_bias
...@@ -987,8 +1049,6 @@ def local_assert(node, context_name): ...@@ -987,8 +1049,6 @@ def local_assert(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
return [host_from_gpu(node.op(as_gpuarray_variable(node.inputs[0], return [host_from_gpu(node.op(as_gpuarray_variable(node.inputs[0],
context_name),
*node.inputs[1:]))]
@register_opt('fast_compile') @register_opt('fast_compile')
...@@ -1004,6 +1064,7 @@ theano.tensor.nnet.conv2d() ...@@ -1004,6 +1064,7 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockGemv]) @op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile')
def local_lift_sparseblockgemv(node, context_name): def local_lift_sparseblockgemv(node, context_name):
if node.op.inplace: if node.op.inplace:
return gpu_sparse_block_gemv_inplace return gpu_sparse_block_gemv_inplace
...@@ -1013,6 +1074,7 @@ def local_lift_sparseblockgemv(node, context_name): ...@@ -1013,6 +1074,7 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockOuter]) @op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile')
def local_lift_sparseblockouter(node, context_name): def local_lift_sparseblockouter(node, context_name):
if node.op.inplace: if node.op.inplace:
return gpu_sparse_block_outer_inplace return gpu_sparse_block_outer_inplace
...@@ -1039,14 +1101,17 @@ def local_inplace_sparseblockouter(node): ...@@ -1039,14 +1101,17 @@ def local_inplace_sparseblockouter(node):
@op_lifter([AbstractConv2d, @op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name): @register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile')
def local_lift_abstractconv2d(node, context_name, inputs):
if isinstance(node.outputs[0].type, GpuArrayType): if isinstance(node.outputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU. # Don't handle this node here, it's already on the GPU.
return return
inps = list(node.inputs) inps = list(inputs)
inps[0] = as_gpuarray_variable(node.inputs[0], inps[0] = as_gpuarray_variable(inputs[0],
context_name=context_name) context_name=context_name)
inps[1] = as_gpuarray_variable(node.inputs[1], inps[1] = as_gpuarray_variable(inputs[1],
context_name=context_name) context_name=context_name)
return [node.op(*inps)] return [node.op(*inps)]
...@@ -1155,6 +1220,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -1155,6 +1220,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
#@register_opt2([scan_op.Scan], 'fast_compile')
def local_scan_to_gpua(node, context_name): def local_scan_to_gpua(node, context_name):
info = copy.deepcopy(node.op.info) info = copy.deepcopy(node.op.info)
if info.get('gpua', False): if info.get('gpua', False):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论