提交 9810017d authored 作者: Frederic Bastien's avatar Frederic Bastien

Fix and add a test for the GPU scan optimization when only the output is moved to the GPU.

上级 21f930ce
...@@ -624,23 +624,22 @@ if cuda.cuda_available: ...@@ -624,23 +624,22 @@ if cuda.cuda_available:
@local_optimizer([]) @local_optimizer([])
def gpuScanOptimization(node): def gpuScanOptimization(node):
""" """
gpu_from_host(scan) -> GPUscan(gpu_from_host)
scan(host_from_gpu) -> host_from_gpu(GPUscan) scan(host_from_gpu) -> host_from_gpu(GPUscan)
gpu_from_host(scan) -> GPUscan(gpu_from_host)
""" """
#gpu_from_host(scan) -> GPUscan(gpu_from_host)
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if ( host_input.owner if (host_input.owner and
and host_input.owner.op == scan_op.Scan isinstance(host_input.owner.op, scan_op.Scan) and
and not host_input.owner.op.info['gpu']): not host_input.owner.op.info['gpu']):
# NOT TESTED!!!!
thescan = host_input.owner.op thescan = host_input.owner.op
inputs = host_input.owner.inputs
# I need to cast thescan.inputs to gpuhost stuff
info = thescan.info.copy() info = thescan.info.copy()
info['gpu'] = True info['gpu'] = True
inputs = host_input.owner.inputs
nw_ins = [ inputs[0]] nw_ins = [ inputs[0]]
e = ( thescan.n_seqs e = ( 1+ thescan.n_seqs
+ thescan.n_mit_mot + thescan.n_mit_mot
+ thescan.n_mit_sot + thescan.n_mit_sot
+ thescan.n_sit_sot + thescan.n_sit_sot
...@@ -649,20 +648,20 @@ if cuda.cuda_available: ...@@ -649,20 +648,20 @@ if cuda.cuda_available:
b = e b = e
e = e + thescan.n_nit_sot + thescan.n_other_ignore e = e + thescan.n_nit_sot + thescan.n_other_ignore
nw_ins += inputs[b:e] nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inptus[e:] ] nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs] scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs]
scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ] scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs scan_outs
, replace = zip(thescan.inputs , replace = zip(thescan.inputs,
,[safe_to_cpu(x) for x in scan_ins])) [safe_to_cpu(x) for x in scan_ins]))
nw_op = scan_op.Scan( scan_ins nw_op = scan_op.Scan( scan_ins
, scan_outs , scan_outs
, info).make_node(*nw_ins) , info).make_node(*nw_ins)
_outputs = nw_op.outputs _outputs = nw_op.outputs
outputs = [safe_to_cpu(x) for x in _outputs] return _outputs
return outputs
#scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan if (type(node.op) == scan_op.Scan
and not node.op.info['gpu']): and not node.op.info['gpu']):
if numpy.any([(i.owner and i.owner.op == host_from_gpu) if numpy.any([(i.owner and i.owner.op == host_from_gpu)
...@@ -689,7 +688,6 @@ if cuda.cuda_available: ...@@ -689,7 +688,6 @@ if cuda.cuda_available:
scan_outs scan_outs
, replace = zip(thescan.inputs , replace = zip(thescan.inputs
,[safe_to_cpu(x) for x in scan_ins])) ,[safe_to_cpu(x) for x in scan_ins]))
info['gpu'] = True
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins scan_ins
, scan_outs , scan_outs
......
...@@ -227,7 +227,8 @@ class T_Scan(unittest.TestCase): ...@@ -227,7 +227,8 @@ class T_Scan(unittest.TestCase):
assert numpy.allclose(theano_values, v_out) assert numpy.allclose(theano_values, v_out)
# as test_one_sequence_one_output_weights, but on the gpu # as test_one_sequence_one_output_weights, but on the gpu
def test_one_sequence_one_output_weights_gpu(self): # This first version test the first case in the optimizer to the gpu.
def test_one_sequence_one_output_weights_gpu1(self):
def f_rnn(u_t,x_tm1,W_in, W): def f_rnn(u_t,x_tm1,W_in, W):
return u_t*W_in+x_tm1*W return u_t*W_in+x_tm1*W
...@@ -235,13 +236,18 @@ class T_Scan(unittest.TestCase): ...@@ -235,13 +236,18 @@ class T_Scan(unittest.TestCase):
x0 = theano.tensor.fscalar('x0') x0 = theano.tensor.fscalar('x0')
W_in = theano.tensor.fscalar('win') W_in = theano.tensor.fscalar('win')
W = theano.tensor.fscalar('w') W = theano.tensor.fscalar('w')
mode = theano.compile.mode.get_default_mode().including('gpu') mode = theano.compile.mode.get_default_mode().including('gpu')
# The following line is needed so that the first case is the one exercised;
# otherwise, it is the second case that is tested.
mode = mode.excluding('InputToGpuOptimizer')
output, updates = theano.scan(f_rnn, u,x0,[W_in,W] output, updates = theano.scan(f_rnn, u,x0,[W_in,W]
, n_steps = None , n_steps = None
, truncate_gradient = -1 , truncate_gradient = -1
, go_backwards = False , go_backwards = False
, mode = mode) , mode = mode)
output = theano.sandbox.cuda.gpu_from_host(output)
f2 = theano.function([u,x0,W_in,W], output, updates = updates, f2 = theano.function([u,x0,W_in,W], output, updates = updates,
allow_input_downcast = True, allow_input_downcast = True,
mode = mode) mode = mode)
...@@ -253,6 +259,11 @@ class T_Scan(unittest.TestCase): ...@@ -253,6 +259,11 @@ class T_Scan(unittest.TestCase):
W = rng.uniform() W = rng.uniform()
W_in = rng.uniform() W_in = rng.uniform()
v_u = numpy.asarray(v_u, dtype='float32')
v_x0 = numpy.asarray(v_x0, dtype='float32')
W = numpy.asarray(W, dtype='float32')
W_in = numpy.asarray(W_in, dtype='float32')
# compute the output in numpy # compute the output in numpy
v_out = numpy.zeros((4,)) v_out = numpy.zeros((4,))
v_out[0] = v_u[0]*W_in + v_x0 * W v_out[0] = v_u[0]*W_in + v_x0 * W
...@@ -261,8 +272,17 @@ class T_Scan(unittest.TestCase): ...@@ -261,8 +272,17 @@ class T_Scan(unittest.TestCase):
theano_values = f2(v_u,v_x0, W_in, W) theano_values = f2(v_u,v_x0, W_in, W)
assert numpy.allclose(theano_values, v_out) assert numpy.allclose(theano_values, v_out)
# TO DEL
topo = f2.maker.env.toposort() topo = f2.maker.env.toposort()
assert sum([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in topo]) == 1 scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)]
assert len(scan_node) == 1
scan_node = scan_node[0]
#theano.printing.pydotprint(f2, outfile='out1.png', high_contrast=True)
#theano.printing.pydotprint(scan_node.op.fn,
# outfile='inner1.png', high_contrast=True)
topo = f2.maker.env.toposort()
assert sum([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in topo]) == 0
assert sum([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in topo]) == 4 assert sum([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in topo]) == 4
scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)] scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)]
...@@ -270,11 +290,59 @@ class T_Scan(unittest.TestCase): ...@@ -270,11 +290,59 @@ class T_Scan(unittest.TestCase):
scan_node = scan_node[0] scan_node = scan_node[0]
scan_node_topo = scan_node.op.fn.maker.env.toposort() scan_node_topo = scan_node.op.fn.maker.env.toposort()
theano.printing.pydotprint(f2, outfile='out.png', high_contrast=True) # check that there is no gpu transfer in the inner loop.
theano.printing.pydotprint(scan_node.op.fn, assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in scan_node_topo])
outfile='inner.png', high_contrast=True) assert not any([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in scan_node_topo])
assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in scan_node_topo])
# This second version tests the second case of the optimizer that moves scan to the GPU.
def test_one_sequence_one_output_weights_gpu2(self):
def f_rnn(u_t,x_tm1,W_in, W):
return u_t*W_in+x_tm1*W
u = theano.tensor.fvector('u')
x0 = theano.tensor.fscalar('x0')
W_in = theano.tensor.fscalar('win')
W = theano.tensor.fscalar('w')
mode = theano.compile.mode.get_default_mode().including('gpu')
output, updates = theano.scan(f_rnn, u,x0,[W_in,W]
, n_steps = None
, truncate_gradient = -1
, go_backwards = False
, mode = mode)
f2 = theano.function([u,x0,W_in,W], output, updates = updates,
allow_input_downcast = True,
mode = mode)
# get random initial values
rng = numpy.random.RandomState(utt.fetch_seed())
v_u = rng.uniform( size = (4,), low = -5., high = 5.)
v_x0 = rng.uniform()
W = rng.uniform()
W_in = rng.uniform()
# compute the output in numpy
v_out = numpy.zeros((4,))
v_out[0] = v_u[0]*W_in + v_x0 * W
for step in xrange(1,4):
v_out[step] = v_u[step]*W_in + v_out[step-1] * W
theano_values = f2(v_u,v_x0, W_in, W)
assert numpy.allclose(theano_values, v_out)
topo = f2.maker.env.toposort()
assert sum([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in topo]) == 1
assert sum([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in topo]) == 4
scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)]
assert len(scan_node) == 1
scan_node = scan_node[0]
scan_node_topo = scan_node.op.fn.maker.env.toposort()
#theano.printing.pydotprint(f2, outfile='out2.png', high_contrast=True)
#theano.printing.pydotprint(scan_node.op.fn,
# outfile='inner2.png', high_contrast=True)
#check that there is less gpu transfer # check that there is no gpu transfer in the inner loop.
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in scan_node_topo]) assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in scan_node_topo])
assert not any([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in scan_node_topo]) assert not any([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in scan_node_topo])
assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in scan_node_topo]) assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) for node in scan_node_topo])
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请先 注册 或者 登录 后发表评论