提交 c022347b authored 作者: abergeron's avatar abergeron

Merge pull request #1967 from nouiz/fast_compile_gpu

[WIP] Fast compile gpu
...@@ -1837,8 +1837,7 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1837,8 +1837,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count[process] += count process_count[process] += count
else: else:
process_count[process] = count process_count[process] = count
for i in range(len(loop_process_count), len(prof2[2])): loop_process_count.extend(prof2[2][len(loop_process_count):])
loop_process_count.append(list(prof2[2]))
max_nb_nodes = max(prof1[3], prof2[3]) max_nb_nodes = max(prof1[3], prof2[3])
......
...@@ -415,6 +415,7 @@ def use(device, ...@@ -415,6 +415,7 @@ def use(device,
if default_to_move_computation_to_gpu: if default_to_move_computation_to_gpu:
optdb.add_tags('gpu_opt', optdb.add_tags('gpu_opt',
'fast_compile',
'fast_run', 'fast_run',
'inplace') 'inplace')
optdb.add_tags('gpu_after_fusion', optdb.add_tags('gpu_after_fusion',
......
...@@ -55,10 +55,10 @@ gpu_optimizer = EquilibriumDB(ignore_newtrees=False) ...@@ -55,10 +55,10 @@ gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1, gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
'fast_run', 'inplace', 'gpu') 'fast_run', 'fast_compile', 'inplace', 'gpu')
gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
'fast_run', 'gpu') 'fast_run', 'fast_compile', 'gpu')
# DO NOT PUT fast_run in gpu_opt! This will ALWAYS enable the GPU! # DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS enable the GPU!
optdb.register('gpu_opt', optdb.register('gpu_opt',
gpu_seqopt, gpu_seqopt,
optdb.__position__.get('add_destroy_handler', 49.5) - 1, optdb.__position__.get('add_destroy_handler', 49.5) - 1,
...@@ -72,13 +72,15 @@ optdb.register('gpu_after_fusion', ...@@ -72,13 +72,15 @@ optdb.register('gpu_after_fusion',
'gpu') 'gpu')
## Register merge_optimizer as a global opt ## Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer, 'fast_run') gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer,
'fast_run', 'fast_compile')
def register_opt(*tags, **kwargs): def register_opt(*tags, **kwargs):
def f(local_opt): def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__ name = (kwargs and kwargs.pop('name')) or local_opt.__name__
gpu_optimizer.register(name, local_opt, 'fast_run', 'gpu', *tags) gpu_optimizer.register(name, local_opt, 'fast_run', 'fast_compile',
'gpu', *tags)
return local_opt return local_opt
return f return f
...@@ -163,14 +165,15 @@ def local_cut_gpu_host_gpu(node): ...@@ -163,14 +165,15 @@ def local_cut_gpu_host_gpu(node):
return [node.inputs[0].owner.inputs[0]] return [node.inputs[0].owner.inputs[0]]
return False return False
gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu, gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu,
'fast_run', 'gpu') 'fast_run', 'fast_compile', 'gpu')
gpu_cut_copies.register('cut_gpu_constant_transfers', gpu_cut_copies.register('cut_gpu_constant_transfers',
tensor.opt.constant_folding, tensor.opt.constant_folding,
'fast_run', 'gpu') 'fast_run', 'fast_compile', 'gpu')
#register it into canonicalize to allow other optimization to work without #register it into canonicalize to allow other optimization to work without
#bothering with this useless pattern. #bothering with this useless pattern.
optdb['canonicalize'].register('local_cut_gpu_host_gpu', optdb['canonicalize'].register('local_cut_gpu_host_gpu',
local_cut_gpu_host_gpu, 'fast_run', 'gpu') local_cut_gpu_host_gpu,
'fast_run', 'fast_compile', 'gpu')
# 'float64', 'complex128' and 'complex64' are not supported in elemwise # 'float64', 'complex128' and 'complex64' are not supported in elemwise
# on the gpu. # on the gpu.
...@@ -347,7 +350,7 @@ def local_gpu_specifyShape_0(node): ...@@ -347,7 +350,7 @@ def local_gpu_specifyShape_0(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host]) # XXX: broken: tensor.basic.dot is not an op @local_optimizer([gpu_from_host, tensor.basic.Dot])
def local_gpu_dot_to_dot22(node): def local_gpu_dot_to_dot22(node):
""" """
gpu_from_host(dot) -> gpudot(gpu_from_host) gpu_from_host(dot) -> gpudot(gpu_from_host)
...@@ -358,6 +361,8 @@ def local_gpu_dot_to_dot22(node): ...@@ -358,6 +361,8 @@ def local_gpu_dot_to_dot22(node):
the output. the output.
A more suitable solution would be to use the right cublas call A more suitable solution would be to use the right cublas call
This is needed in fast_compile
""" """
# In case the op does input upcast, we must check that we can # In case the op does input upcast, we must check that we can
...@@ -366,17 +371,18 @@ def local_gpu_dot_to_dot22(node): ...@@ -366,17 +371,18 @@ def local_gpu_dot_to_dot22(node):
if node.outputs[0].type.dtype != 'float32': if node.outputs[0].type.dtype != 'float32':
return False return False
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op == tensor.basic.dot: if host_input.owner and isinstance(host_input.owner.op,
tensor.basic.Dot):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
# case one: vector X matrix # case one: vector X matrix
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ['x', 0]) new_op = GpuDimShuffle((False,), ('x', 0))
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(gpu_from_host(x))
gpu_y = gpu_from_host(y) gpu_y = gpu_from_host(y)
# case two: matrix X vector # case two: matrix X vector
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), [0, 'x']) new_op = GpuDimShuffle((False,), (0, 'x'))
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(gpu_from_host(y))
...@@ -384,20 +390,20 @@ def local_gpu_dot_to_dot22(node): ...@@ -384,20 +390,20 @@ def local_gpu_dot_to_dot22(node):
return False return False
return [GpuReshape(1)(gpu_dot22(gpu_x, gpu_y), shape_out)] return [GpuReshape(1)(gpu_dot22(gpu_x, gpu_y), shape_out)]
if node.op == tensor.basic.dot: if isinstance(node.op, tensor.basic.Dot):
if node.outputs[0].type.dtype != 'float32': if node.outputs[0].type.dtype != 'float32':
return False return False
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]): for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ['x', 0]) new_op = GpuDimShuffle((False,), ('x', 0))
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(gpu_from_host(x))
gpu_y = gpu_from_host(y) gpu_y = gpu_from_host(y)
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), [0, 'x']) new_op = GpuDimShuffle((False,), (0, 'x'))
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(gpu_from_host(y))
...@@ -1629,8 +1635,10 @@ else: ...@@ -1629,8 +1635,10 @@ else:
#GpuElemwise inplace #GpuElemwise inplace
gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op( gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
GpuElemwise) GpuElemwise)
# DO NOT add a 'gpu' tag here! This would enable it in fast_compile.
# It still will be run in fast_run with device=gpu with the current tag.
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75, optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
'fast_run', 'inplace', 'gpu_inplace', 'gpu') 'fast_run', 'inplace', 'gpu_inplace')
@register_opt() @register_opt()
......
...@@ -404,6 +404,32 @@ def test_erfinvgpu(): ...@@ -404,6 +404,32 @@ def test_erfinvgpu():
assert numpy.allclose(f(xv),f2(xv)) assert numpy.allclose(f(xv),f2(xv))
def test_local_gpu_dot_to_dot22dot():
    """Check that tensor.dot is lifted to the GPU (GpuDot22 path) for
    both vector-matrix and matrix-vector cases, and that the computed
    value matches numpy.dot.
    """
    def cmp(a_shp, b_shp):
        a0 = numpy.random.rand(*a_shp).astype('float32')
        a = cuda.shared_constructor(a0, 'a')
        b0 = numpy.random.rand(*b_shp).astype('float32')
        # Fixed: this shared variable was mistakenly also named 'a'.
        b = cuda.shared_constructor(b0, 'b')
        f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
        # The local optimizer itself must accept this apply node.
        assert cuda.opt.local_gpu_dot_to_dot22.transform(
            tensor.dot(a, b).owner)
        out = f()
        assert numpy.allclose(numpy.dot(a0, b0), out)
        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(
            a.get_value(borrow=True,
                        return_internal_type=True)[::-1],
            borrow=True)
        f()
    cmp((4,), (4, 5))
    cmp((3, 4), (4,))
class test_diag(theano.tensor.tests.test_nlinalg.test_diag): class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
mode = mode_with_gpu mode = mode_with_gpu
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
......
...@@ -44,9 +44,9 @@ gpu_cut_copies = EquilibriumDB() ...@@ -44,9 +44,9 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_run', 'inplace', 'gpuarray') 'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
# do not add 'fast_run' to these two as this would always enable gpuarray mode # do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt, optdb.register('gpuarray_opt', gpu_seqopt,
...@@ -61,7 +61,7 @@ def register_opt(*tags, **kwargs): ...@@ -61,7 +61,7 @@ def register_opt(*tags, **kwargs):
return local_opt return local_opt
return f return f
register_opt()(theano.tensor.opt.local_track_shape_i) register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
def safe_to_gpu(x): def safe_to_gpu(x):
...@@ -145,19 +145,20 @@ def local_cut_gpu_host_gpu(node): ...@@ -145,19 +145,20 @@ def local_cut_gpu_host_gpu(node):
return [node.inputs[0].owner.inputs[0]] return [node.inputs[0].owner.inputs[0]]
return False return False
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu, gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu,
'fast_run', 'inplace', 'gpuarray') 'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers', gpu_cut_copies.register('cut_gpua_constant_transfers',
tensor.opt.constant_folding, tensor.opt.constant_folding,
'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
optdb['canonicalize'].register('local_cut_gpua_host_gpua', optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu, 'fast_run', 'gpuarray') local_cut_gpu_host_gpu,
'fast_compile', 'fast_run', 'gpuarray')
@register_opt() @register_opt('fast_compile')
@local_optimizer([tensor.Alloc]) @local_optimizer([tensor.Alloc])
def local_gpuaalloc2(node): def local_gpuaalloc2(node):
""" """
Join(axis, Alloc, Alloc, ...) -> Join(axis, GpuAlloc, Alloc, ...) Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Moves an alloc that is an input to join to the gpu. Moves an alloc that is an input to join to the gpu.
""" """
...@@ -171,7 +172,7 @@ def local_gpuaalloc2(node): ...@@ -171,7 +172,7 @@ def local_gpuaalloc2(node):
return [host_from_gpu(gpu_alloc(*node.inputs))] return [host_from_gpu(gpu_alloc(*node.inputs))]
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpuaalloc(node): def local_gpuaalloc(node):
new_out = gpu_alloc(*node.inputs) new_out = gpu_alloc(*node.inputs)
...@@ -199,7 +200,7 @@ def local_gpualloc_memset_0(node): ...@@ -199,7 +200,7 @@ def local_gpualloc_memset_0(node):
return [new_out] return [new_out]
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node): def local_gpureshape(node):
op = node.op op = node.op
...@@ -210,14 +211,14 @@ def local_gpureshape(node): ...@@ -210,14 +211,14 @@ def local_gpureshape(node):
return res return res
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node): def local_gpu_rebroadcast(node):
if isinstance(node.inputs[0].owner.op, HostFromGpu): if isinstance(node.inputs[0].owner.op, HostFromGpu):
return node.op(node.inputs[0].owner.inputs[0]) return node.op(node.inputs[0].owner.inputs[0])
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node): def local_gpuflatten(node):
op = node.op op = node.op
...@@ -230,7 +231,7 @@ def local_gpuflatten(node): ...@@ -230,7 +231,7 @@ def local_gpuflatten(node):
return o return o
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
...@@ -273,14 +274,14 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -273,14 +274,14 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray') 'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node): def local_gpua_dimshuffle(node):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node): def local_gpua_specifyShape(node):
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
...@@ -289,11 +290,21 @@ def local_gpua_specifyShape(node): ...@@ -289,11 +290,21 @@ def local_gpua_specifyShape(node):
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
@register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node):
    """Read the shape through a GPU transfer of the input.

    NOTE: op_lifter calls this opt too frequently, as the output of
    Shape always lives on the CPU.
    """
    inp = node.inputs[0]
    if not isinstance(inp.type, GpuArrayType):
        return [gpu_from_host(inp).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
op.old_op.global_fn(op.old_op, numpy.asarray(cnda)) op.old_op.global_fn(op.old_op, numpy.asarray(cnda))
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
def local_gpu_print_op(node): def local_gpu_print_op(node):
x, = node.inputs x, = node.inputs
...@@ -303,13 +314,13 @@ def local_gpu_print_op(node): ...@@ -303,13 +314,13 @@ def local_gpu_print_op(node):
return new_op(gpu_x) return new_op(gpu_x)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node): def local_gpua_join(node):
return gpu_join return gpu_join
@register_opt() @register_opt('fast_compile')
@local_optimizer([GpuJoin]) @local_optimizer([GpuJoin])
def local_gpuajoin_1(node): def local_gpuajoin_1(node):
# join of a single element # join of a single element
...@@ -318,19 +329,19 @@ def local_gpuajoin_1(node): ...@@ -318,19 +329,19 @@ def local_gpuajoin_1(node):
return [node.inputs[1]] return [node.inputs[1]]
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
def local_gpua_split(node): def local_gpua_split(node):
return GpuSplit(node.op.len_splits) return GpuSplit(node.op.len_splits)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node): def local_gpua_subtensor(node):
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node): def local_gpua_incsubtensor(node):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace, return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
...@@ -338,7 +349,7 @@ def local_gpua_incsubtensor(node): ...@@ -338,7 +349,7 @@ def local_gpua_incsubtensor(node):
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node): def local_gpua_advanced_incsubtensor(node):
...@@ -362,7 +373,7 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -362,7 +373,7 @@ def local_gpua_advanced_incsubtensor(node):
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
def local_gpua_careduce(node): def local_gpua_careduce(node):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
...@@ -442,71 +453,67 @@ def local_gpua_careduce(node): ...@@ -442,71 +453,67 @@ def local_gpua_careduce(node):
return [unreshaped_reduce] return [unreshaped_reduce]
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node): def local_gpua_gemv(node):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node): def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node): def local_gpua_ger(node):
return GpuGer(destructive=node.op.destructive) return GpuGer(destructive=node.op.destructive)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node): def local_gpua_dot22(node):
return gpu_dot22 return gpu_dot22
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
def local_gpua_eye(node): def local_gpua_eye(node):
return GpuEye(dtype=node.op.dtype) return GpuEye(dtype=node.op.dtype)
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias]) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node): def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
return GpuCrossentropySoftmaxArgmax1HotWithBias() return GpuCrossentropySoftmaxArgmax1HotWithBias()
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx]) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpua_crossentropysoftmax1hotwithbiasdx(node): def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx() return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax]) @op_lifter([tensor.nnet.Softmax])
def local_gpua_softmax(node): def local_gpua_softmax(node):
return GpuSoftmax() return GpuSoftmax()
@register_opt() @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias]) @op_lifter([tensor.nnet.SoftmaxWithBias])
def local_gpua_softmaxwithbias(node): def local_gpua_softmaxwithbias(node):
return GpuSoftmaxWithBias() return GpuSoftmaxWithBias()
@register_opt() @register_opt('fast_compile')
@local_optimizer([theano.tensor.opt.Assert]) @op_lifter([theano.tensor.opt.Assert])
def local_assert(node): def local_assert(node):
if (isinstance(node.op, theano.tensor.opt.Assert) and
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op,
HostFromGpu)):
return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0]))] return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0]))]
@register_opt() @register_opt('fast_compile')
@op_lifter([gpu_from_host, ConvOp]) @op_lifter([gpu_from_host, ConvOp])
def local_gpu_conv(node): def local_gpu_conv(node):
""" """
...@@ -654,7 +661,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -654,7 +661,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
return (nw_inputs, nw_outputs) return (nw_inputs, nw_outputs)
@register_opt('scan') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
def local_scan_to_gpua(node): def local_scan_to_gpua(node):
info = copy.deepcopy(node.op.info) info = copy.deepcopy(node.op.info)
......
...@@ -4,7 +4,8 @@ import theano ...@@ -4,7 +4,8 @@ import theano
from theano import tensor from theano import tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import (
GpuArrayType, gpuarray_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import ( from theano.sandbox.gpuarray.basic_ops import (
GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu) GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
from theano.sandbox.gpuarray.elemwise import ( from theano.sandbox.gpuarray.elemwise import (
......
...@@ -1815,13 +1815,14 @@ def local_dot22_to_ger_or_gemv(node): ...@@ -1815,13 +1815,14 @@ def local_dot22_to_ger_or_gemv(node):
blas_optdb = SequenceDB() blas_optdb = SequenceDB()
# run after numerical stability optimizations (1.5) # run after numerical stability optimizations (1.5)
optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run') optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run', 'fast_compile')
# run before specialize (2.0) because specialize is basically a # run before specialize (2.0) because specialize is basically a
# free-for-all that makes the graph crazy. # free-for-all that makes the graph crazy.
#fast_compile is needed to have GpuDot22 created.
blas_optdb.register('local_dot_to_dot22', blas_optdb.register('local_dot_to_dot22',
in2out(local_dot_to_dot22), in2out(local_dot_to_dot22),
0, 'fast_run') 0, 'fast_run', 'fast_compile')
blas_optdb.register('gemm_optimizer', blas_optdb.register('gemm_optimizer',
GemmOptimizer(), GemmOptimizer(),
10, 'fast_run') 10, 'fast_run')
......
"""Provides neural-network specific Ops. """Provides neural-network specific Ops.
:note: TODO: factor this out into a neural-network toolbox. :note: TODO: factor this out into a neural-network toolbox.
:note: We register all optimizations with the gpu tag as we don't
implement all the intermediate cases on the GPU (in particular
AdvancedSubtensor). So to make sure it runs well on the gpu with
fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate parts are on the GPU.
""" """
import logging import logging
import numpy import numpy
...@@ -570,7 +577,7 @@ class Softmax(gof.Op): ...@@ -570,7 +577,7 @@ class Softmax(gof.Op):
softmax = Softmax() softmax = Softmax()
@opt.register_specialize @opt.register_specialize('gpu')
@gof.local_optimizer([softmax]) @gof.local_optimizer([softmax])
def local_softmax_with_bias(node): def local_softmax_with_bias(node):
"""Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias) """Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias)
...@@ -1323,8 +1330,8 @@ class CrossentropyCategorical1Hot(gof.Op): ...@@ -1323,8 +1330,8 @@ class CrossentropyCategorical1Hot(gof.Op):
crossentropy_categorical_1hot = CrossentropyCategorical1Hot() crossentropy_categorical_1hot = CrossentropyCategorical1Hot()
@opt.register_stabilize @opt.register_stabilize('gpu')
@opt.register_specialize @opt.register_specialize('gpu')
@gof.optimizer @gof.optimizer
def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph): def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
"""This is a stabilization optimization """This is a stabilization optimization
...@@ -1397,9 +1404,10 @@ def crossentropy_to_crossentropy_with_softmax(fgraph): ...@@ -1397,9 +1404,10 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
optdb.register('crossentropy_to_crossentropy_with_softmax', optdb.register('crossentropy_to_crossentropy_with_softmax',
crossentropy_to_crossentropy_with_softmax, 2.01, crossentropy_to_crossentropy_with_softmax, 2.01,
'fast_run', 'xent') 'fast_run', 'xent', 'gpu')
@opt.register_specialize('gpu')
@gof.local_optimizer([softmax_grad]) @gof.local_optimizer([softmax_grad])
def local_crossentropy_to_crossentropy_with_softmax_grad(node): def local_crossentropy_to_crossentropy_with_softmax_grad(node):
if node.op == softmax_grad: if node.op == softmax_grad:
...@@ -1410,10 +1418,9 @@ def local_crossentropy_to_crossentropy_with_softmax_grad(node): ...@@ -1410,10 +1418,9 @@ def local_crossentropy_to_crossentropy_with_softmax_grad(node):
dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, dx = crossentropy_softmax_1hot_with_bias_dx(g_nll,
coding_dist, true_one_of_n) coding_dist, true_one_of_n)
return [dx] return [dx]
opt.register_specialize(local_crossentropy_to_crossentropy_with_softmax_grad)
@opt.register_specialize @opt.register_specialize('gpu')
@gof.local_optimizer([tensor._max_and_argmax]) @gof.local_optimizer([tensor._max_and_argmax])
def local_argmax_pushdown(node): def local_argmax_pushdown(node):
if node.op == tensor._max_and_argmax and node.inputs[0].owner and \ if node.op == tensor._max_and_argmax and node.inputs[0].owner and \
...@@ -1499,7 +1506,7 @@ def _is_const(z, val, approx=False): ...@@ -1499,7 +1506,7 @@ def _is_const(z, val, approx=False):
return numpy.all(maybe == val) return numpy.all(maybe == val)
@opt.register_specialize @opt.register_specialize('gpu')
@gof.local_optimizer([subtensor.AdvancedSubtensor, tensor.log]) @gof.local_optimizer([subtensor.AdvancedSubtensor, tensor.log])
def local_advanced_indexing_crossentropy_onehot(node): def local_advanced_indexing_crossentropy_onehot(node):
log = None log = None
...@@ -1540,7 +1547,7 @@ def local_advanced_indexing_crossentropy_onehot(node): ...@@ -1540,7 +1547,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
labels)[0]] labels)[0]]
@opt.register_specialize @opt.register_specialize('gpu')
@gof.local_optimizer([softmax_grad]) @gof.local_optimizer([softmax_grad])
def local_advanced_indexing_crossentropy_onehot_grad(node): def local_advanced_indexing_crossentropy_onehot_grad(node):
if not (node.op == softmax_grad): if not (node.op == softmax_grad):
...@@ -1763,7 +1770,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1763,7 +1770,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
return return
@opt.register_specialize @opt.register_specialize('gpu')
@gof.local_optimizer([softmax_with_bias]) @gof.local_optimizer([softmax_with_bias])
def graph_merge_softmax_with_crossentropy_softmax(node): def graph_merge_softmax_with_crossentropy_softmax(node):
if node.op == softmax_with_bias: if node.op == softmax_with_bias:
...@@ -1969,4 +1976,4 @@ local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax, 'x')), ...@@ -1969,4 +1976,4 @@ local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax, 'x')),
#don't do register_stabilize, this is to make local_log_softmax run #don't do register_stabilize, this is to make local_log_softmax run
#only after another more specific optimization that stabilizes cross entropy #only after another more specific optimization that stabilizes cross entropy
#opt.register_stabilize(local_log_softmax, name = 'local_log_softmax') #opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
opt.register_specialize(local_log_softmax, name='local_log_softmax') opt.register_specialize(local_log_softmax, 'gpu', name='local_log_softmax')
...@@ -310,18 +310,33 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75, ...@@ -310,18 +310,33 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
def register_canonicalize(lopt, *tags, **kwargs): def register_canonicalize(lopt, *tags, **kwargs):
if type(lopt) == str:
def register(inner_lopt):
return register_canonicalize(inner_lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['canonicalize'].register(name, lopt, 'fast_run', *tags) compile.optdb['canonicalize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
def register_stabilize(lopt, *tags, **kwargs): def register_stabilize(lopt, *tags, **kwargs):
if type(lopt) == str:
def register(inner_lopt):
return register_stabilize(inner_lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags) compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
def register_specialize(lopt, *tags, **kwargs): def register_specialize(lopt, *tags, **kwargs):
if type(lopt) == str:
def register(inner_lopt):
return register_specialize(inner_lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags) compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
...@@ -1304,7 +1319,7 @@ def local_track_shape_i(node): ...@@ -1304,7 +1319,7 @@ def local_track_shape_i(node):
@register_specialize @register_specialize
@register_canonicalize @register_canonicalize('gpu')
@gof.local_optimizer([Subtensor]) @gof.local_optimizer([Subtensor])
def local_subtensor_make_vector(node): def local_subtensor_make_vector(node):
# replace all subtensor(make_vector) like: # replace all subtensor(make_vector) like:
...@@ -1354,8 +1369,7 @@ def local_subtensor_make_vector(node): ...@@ -1354,8 +1369,7 @@ def local_subtensor_make_vector(node):
#TODO: the other optimization for and, or, xor, le and ge see ticket #496. #TODO: the other optimization for and, or, xor, le and ge see ticket #496.
@register_canonicalize('fast_compile')
@register_canonicalize
@register_specialize @register_specialize
@gof.local_optimizer([T.Elemwise]) @gof.local_optimizer([T.Elemwise])
def local_useless_elemwise(node): def local_useless_elemwise(node):
...@@ -3508,7 +3522,7 @@ def local_reduce_join(node): ...@@ -3508,7 +3522,7 @@ def local_reduce_join(node):
#else the reduction do something about the dtype. #else the reduction do something about the dtype.
@register_canonicalize @register_canonicalize('fast_compile')
@gof.local_optimizer(ALL_REDUCE) @gof.local_optimizer(ALL_REDUCE)
def local_cut_useless_reduce(node): def local_cut_useless_reduce(node):
"""Sum(a, axis=[]) -> a """ """Sum(a, axis=[]) -> a """
...@@ -4152,6 +4166,8 @@ def attempt_distribution(factor, num, denum, out_type): ...@@ -4152,6 +4166,8 @@ def attempt_distribution(factor, num, denum, out_type):
neg_pairs))), num, denum neg_pairs))), num, denum
@register_canonicalize
@register_stabilize
@gof.local_optimizer([T.mul, T.true_div, T.inv]) @gof.local_optimizer([T.mul, T.true_div, T.inv])
def local_greedy_distributor(node): def local_greedy_distributor(node):
""" """
...@@ -4216,10 +4232,10 @@ def local_greedy_distributor(node): ...@@ -4216,10 +4232,10 @@ def local_greedy_distributor(node):
return [rval] return [rval]
register_canonicalize(local_greedy_distributor)
register_stabilize(local_greedy_distributor)
@register_canonicalize('fast_compile')
@register_stabilize('fast_compile')
@register_specialize('fast_compile')
@gof.local_optimizer(None) @gof.local_optimizer(None)
def constant_folding(node): def constant_folding(node):
for input in node.inputs: for input in node.inputs:
...@@ -4253,10 +4269,6 @@ def constant_folding(node): ...@@ -4253,10 +4269,6 @@ def constant_folding(node):
rval.append(constant(output.type, storage_map[output][0])) rval.append(constant(output.type, storage_map[output][0]))
return rval return rval
register_canonicalize(constant_folding, 'fast_compile')
register_stabilize(constant_folding, 'fast_compile')
register_specialize(constant_folding, 'fast_compile')
def _is_1(expr): def _is_1(expr):
"""rtype bool. True iff expr is a constant close to 1 """rtype bool. True iff expr is a constant close to 1
...@@ -5145,4 +5157,4 @@ else: ...@@ -5145,4 +5157,4 @@ else:
# Although the op just returns its input, it should be removed from # Although the op just returns its input, it should be removed from
# the graph to make sure all possible optimizations can be applied. # the graph to make sure all possible optimizations can be applied.
register_canonicalize(gof.OpRemove(theano.gradient.consider_constant_), register_canonicalize(gof.OpRemove(theano.gradient.consider_constant_),
'fast_compile', name='remove_consider_constant') 'fast_compile', 'fast_run', name='remove_consider_constant')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论