提交 00e40907 作者: Razvan Pascanu

Removed trailing spaces

上级 bdf1394f
...@@ -58,12 +58,12 @@ class InputToGpuOptimizer(Optimizer): ...@@ -58,12 +58,12 @@ class InputToGpuOptimizer(Optimizer):
if new_input.type==input.type: if new_input.type==input.type:
env.replace_validate(input, new_input, "To allow further optimisation to move Ops to gpu") env.replace_validate(input, new_input, "To allow further optimisation to move Ops to gpu")
except Exception, e: except Exception, e:
#as we currently only support float32, this can fail. #as we currently only support float32, this can fail.
#Using try except make that we won't need #Using try except make that we won't need
pass pass
#we register it before all other gpu optimizer to be sure that the input are on the gpu. #we register it before all other gpu optimizer to be sure that the input are on the gpu.
gpu_seqopt.register('InputToGpuOptimizer', InputToGpuOptimizer(), gpu_seqopt.register('InputToGpuOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')#TODO: how to make it mandatory for gpu_seqopt? 0, 'fast_run', 'fast_compile', 'merge')#TODO: how to make it mandatory for gpu_seqopt?
@local_optimizer([]) @local_optimizer([])
...@@ -73,9 +73,9 @@ def local_cut_gpu_host_gpu(node): ...@@ -73,9 +73,9 @@ def local_cut_gpu_host_gpu(node):
if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host): if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host):
return [node.inputs[0].owner.inputs[0]] return [node.inputs[0].owner.inputs[0]]
return False return False
gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu, gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu,
'fast_run', 'inplace', 'gpu') 'fast_run', 'inplace', 'gpu')
gpu_cut_copies.register('cut_gpu_constant_transfers', tensor.opt.constant_folding, gpu_cut_copies.register('cut_gpu_constant_transfers', tensor.opt.constant_folding,
'fast_run', 'gpu') 'fast_run', 'gpu')
#register it into canonicalize to allow other optimization to work without #register it into canonicalize to allow other optimization to work without
#botering with this useless pattern. #botering with this useless pattern.
...@@ -84,7 +84,7 @@ compile.optdb['canonicalize'].register('local_cut_gpu_host_gpu', local_cut_gpu_h ...@@ -84,7 +84,7 @@ compile.optdb['canonicalize'].register('local_cut_gpu_host_gpu', local_cut_gpu_h
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_elemwise_0(node): def local_gpu_elemwise_0(node):
"""elemwise(..., host_from_gpu, ...) """elemwise(..., host_from_gpu, ...)
-> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host) -> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host)
""" """
if isinstance(node.op, tensor.Elemwise): if isinstance(node.op, tensor.Elemwise):
...@@ -139,14 +139,14 @@ def local_gpu_dimshuffle_0(node): ...@@ -139,14 +139,14 @@ def local_gpu_dimshuffle_0(node):
input, = node.inputs input, = node.inputs
if input.owner and isinstance(input.owner.op, HostFromGpu): if input.owner and isinstance(input.owner.op, HostFromGpu):
# move the add to a GpuAdd # move the add to a GpuAdd
new_op = GpuDimShuffle(node.op.input_broadcastable, new_op = GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
return [host_from_gpu(new_op(gpu_from_host(input)))] return [host_from_gpu(new_op(gpu_from_host(input)))]
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, tensor.DimShuffle): if host_input.owner and isinstance(host_input.owner.op, tensor.DimShuffle):
dimshuffle_node = host_input.owner dimshuffle_node = host_input.owner
new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable, new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
dimshuffle_node.op.new_order) dimshuffle_node.op.new_order)
return [new_op(gpu_from_host(dimshuffle_node.inputs[0]))] return [new_op(gpu_from_host(dimshuffle_node.inputs[0]))]
return False return False
...@@ -516,7 +516,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -516,7 +516,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
x,b,y = node.inputs x,b,y = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and x.owner.op == host_from_gpu:
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
# if y is a cast to integers, we can go to the underlying thing if we want, # if y is a cast to integers, we can go to the underlying thing if we want,
# since this gpu op will cast to integers internally anyway # since this gpu op will cast to integers internally anyway
int_cast_ops = ( int_cast_ops = (
tensor.basic._convert_to_int32, tensor.basic._convert_to_int32,
...@@ -531,8 +531,8 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -531,8 +531,8 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
gpu_from_host(b), gpu_from_host(b),
gpu_from_host(cast(y, 'float32'))) gpu_from_host(cast(y, 'float32')))
am_dtype = node.outputs[2].type.dtype am_dtype = node.outputs[2].type.dtype
return [host_from_gpu(gpu_nll), return [host_from_gpu(gpu_nll),
host_from_gpu(gpu_sm), host_from_gpu(gpu_sm),
cast(host_from_gpu(gpu_am), am_dtype)] cast(host_from_gpu(gpu_am), am_dtype)]
return False return False
...@@ -728,7 +728,7 @@ else: ...@@ -728,7 +728,7 @@ else:
#GpuElemwise inplace #GpuElemwise inplace
gpu_insert_inplace_optimizer = tensor.opt.insert_inplace_optimizer_op(GpuElemwise) gpu_insert_inplace_optimizer = tensor.opt.insert_inplace_optimizer_op(GpuElemwise)
compile.optdb.register('gpu_inplace_opt', gpu_insert_inplace_optimizer, 75, 'fast_run', 'inplace','gpu_inplace') compile.optdb.register('gpu_inplace_opt', gpu_insert_inplace_optimizer, 75, 'fast_run', 'inplace','gpu_inplace')
@register_opt() @register_opt()
@local_optimizer([tensor.Alloc]) @local_optimizer([tensor.Alloc])
...@@ -749,7 +749,7 @@ def local_gpualloc(node): ...@@ -749,7 +749,7 @@ def local_gpualloc(node):
new_out = host_from_gpu(gpu_alloc(val2, *shp)) new_out = host_from_gpu(gpu_alloc(val2, *shp))
# Sigh. it's an annoying thing about theano # Sigh. it's an annoying thing about theano
# that you can't add information to the graph. # that you can't add information to the graph.
# If for some reason it has come to light that # If for some reason it has come to light that
# one of the dimensions is broadcastable, we have to hide that # one of the dimensions is broadcastable, we have to hide that
# or the optimization won't go through. # or the optimization won't go through.
if new_out.type != old_out.type: if new_out.type != old_out.type:
...@@ -763,7 +763,7 @@ def local_gpualloc(node): ...@@ -763,7 +763,7 @@ def local_gpualloc(node):
#if old_out.type != new_out.type: #if old_out.type != new_out.type:
#import pdb; pdb.set_trace() #import pdb; pdb.set_trace()
return [new_out] return [new_out]
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_huge_add_or_mul(node): def local_gpu_huge_add_or_mul(node):
...@@ -774,7 +774,7 @@ def local_gpu_huge_add_or_mul(node): ...@@ -774,7 +774,7 @@ def local_gpu_huge_add_or_mul(node):
The CUDA c compiler limits the number of arguments to 256 bytes' worth or something. The CUDA c compiler limits the number of arguments to 256 bytes' worth or something.
""" """
if isinstance(node.op, GpuElemwise) and node.op.scalar_op in (scal.add, scal.mul): if isinstance(node.op, GpuElemwise) and node.op.scalar_op in (scal.add, scal.mul):
if len(node.inputs)>10: if len(node.inputs)>10:
# TODO: look up how arguments are passed to the GpuElemwise function # TODO: look up how arguments are passed to the GpuElemwise function
# and figure out how many arguments can fit in 256 bytes. # and figure out how many arguments can fit in 256 bytes.
# this will depend on the number of dimensions in each argument. # this will depend on the number of dimensions in each argument.
......
...@@ -49,6 +49,9 @@ def test_int_pow(): ...@@ -49,6 +49,9 @@ def test_int_pow():
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
def test_softmax(): def test_softmax():
x = tensor.fmatrix() x = tensor.fmatrix()
...@@ -78,7 +81,7 @@ def test_opt_gpujoin_onlyajoin(): ...@@ -78,7 +81,7 @@ def test_opt_gpujoin_onlyajoin():
b = cuda.shared_constructor(_b) b = cuda.shared_constructor(_b)
c = tensor.join(1,a,b) c = tensor.join(1,a,b)
f = theano.function([], c, mode=mode_with_gpu) f = theano.function([], c, mode=mode_with_gpu)
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
...@@ -105,7 +108,7 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -105,7 +108,7 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
b_prime = tensor.sin(b) b_prime = tensor.sin(b)
c = tensor.join(0,a_prime,b_prime) c = tensor.join(0,a_prime,b_prime)
d = c[:-1] d = c[:-1]
f = theano.function([], d, mode=mode_with_gpu) f = theano.function([], d, mode=mode_with_gpu)
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论