提交 e836fef3 authored 作者: Razvan Pascanu's avatar Razvan Pascanu

merge; no conflicts

...@@ -4,6 +4,7 @@ _logger = logging.getLogger('theano.sandbox.cuda.opt') ...@@ -4,6 +4,7 @@ _logger = logging.getLogger('theano.sandbox.cuda.opt')
import sys import sys
import theano import theano
import numpy import numpy
from theano.scan_module import scan_utils, scan_op
from theano import scalar as scal from theano import scalar as scal
from theano import tensor, compile, gof from theano import tensor, compile, gof
...@@ -1030,3 +1031,226 @@ def local_gpualloc(node): ...@@ -1030,3 +1031,226 @@ def local_gpualloc(node):
#if old_out.type != new_out.type: #if old_out.type != new_out.type:
#import pdb; pdb.set_trace() #import pdb; pdb.set_trace()
return [new_out] return [new_out]
def safe_to_gpu(x):
    """Move ``x`` to the GPU when it is a float32 tensor; any other
    variable is returned untouched."""
    is_f32_tensor = (isinstance(x.type, tensor.TensorType)
                     and x.type.dtype == 'float32')
    return gpu_from_host(x) if is_f32_tensor else x
def safe_to_cpu(x):
    """Bring ``x`` back to the host when it lives on the GPU; any other
    variable is returned untouched."""
    if not isinstance(x.type, CudaNdarrayType):
        return x
    return host_from_gpu(x)
def gpu_safe_new(x, tag=''):
    """
    Build a fresh variable of the same type as ``x``, named
    ``x.name + tag`` (or unnamed when ``x`` has no name).  Constants are
    simply cloned.  Used by gradient / R-op code to create new inputs
    for the inner graph so the freshly built graph does not interfere
    with the original one.
    """
    new_name = None
    if getattr(x, 'name', None) is not None:
        new_name = x.name + tag
    if isinstance(x, theano.Constant):
        return x.clone()
    fresh = x.type()
    fresh.name = new_name
    return fresh
def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """
    Clone ``outputs`` while replacing every variable of ``inputs`` by a
    brand new variable of the same type (name suffixed with ``tag``).
    Returns the new inputs, in the same order as the originals, together
    with the cloned outputs.
    """
    if tag is None:
        tag = ''
    fresh_inputs = [gpu_safe_new(inp, tag) for inp in inputs]
    replacements = dict(zip(inputs, fresh_inputs))
    fresh_outputs = scan_utils.clone(outputs, replace=replacements)
    return (fresh_inputs, fresh_outputs)
def tensor_to_cuda(x):
    """
    For a float32 tensor, return a fresh CudaNdarray variable with the
    same broadcastable pattern, named ``<name>[cuda]`` when ``x`` has a
    name.  Anything else is returned unchanged.
    """
    if not (isinstance(x.type, tensor.TensorType)
            and x.type.dtype == 'float32'):
        return x
    cuda_var = CudaNdarrayType(broadcastable=x.type.broadcastable)()
    if x.name:
        cuda_var.name = x.name + '[cuda]'
    return cuda_var
def _gpu_scan_outputs(thescan, inputs):
    """Build a GPU version of the scan op ``thescan`` applied to the
    outer ``inputs`` and return the outputs of the new node.

    Float32 outer inputs are moved to the GPU (except the nit_sot
    entries, which are shapes and stay on the host), the inner graph is
    rebuilt on CudaNdarray variables, and the hash of the new inner
    graph is computed here because scan's ``__init__`` does not know
    about cuda ndarrays and can not handle graphs whose inputs are
    Cuda Ndarrays.
    """
    info = thescan.info.copy()
    info['gpu'] = True
    # inputs[0] is n_steps; it stays on the host.
    nw_ins = [inputs[0]]
    e = (1 + thescan.n_seqs
         + thescan.n_mit_mot
         + thescan.n_mit_sot
         + thescan.n_sit_sot
         + thescan.n_shared_outs)
    # Sequences, tap initial states and shared outs go to the GPU ...
    nw_ins += [safe_to_gpu(x) for x in inputs[1:e]]
    b = e
    e = e + thescan.n_nit_sot
    # ... the nit_sot inputs (requested output lengths) stay as is ...
    nw_ins += inputs[b:e]
    # ... and the remaining non-sequences go to the GPU as well.
    nw_ins += [safe_to_gpu(x) for x in inputs[e:]]
    scan_ins = [tensor_to_cuda(x) for x in thescan.inputs]
    scan_outs = [safe_to_gpu(x) for x in thescan.outputs]
    scan_outs = scan_utils.clone(
        scan_outs
        , replace=zip(thescan.inputs,
                      [safe_to_cpu(x) for x in scan_ins]))
    # We need to construct the hash here, because scan
    # __init__ does not know about cuda ndarray and can not
    # handle graphs with inputs being Cuda Ndarrays
    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
    local_env = gof.Env(tmp_in, tmp_out)
    _cmodule_key = gof.CLinker.cmodule_key_(local_env, [])
    info['gpu_hash'] = hash(_cmodule_key)
    typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
        broadcastable=broadcastable)
    return scan_op.Scan(scan_ins
                        , scan_outs
                        , info
                        , typeConstructor=typeConstructor
                        ).make_node(*nw_ins).outputs


@register_opt('scan')
@local_optimizer([])
def gpuScanOptimization(node):
    """
    scan(host_from_gpu) -> host_from_gpu(GPUscan)
    gpu_from_host(scan) -> GPUscan(gpu_from_host)
    """
    # gpu_from_host(scan) -> GPUscan(gpu_from_host)
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, scan_op.Scan) and
            not host_input.owner.op.info['gpu'] and
            len(host_input.owner.outputs) == 1):
            # Note that we are not doing the right thing here !!
            # This is because the local optimizer expects only one
            # output that corresponds to the input of ``node``
            # If we do this for each output separately we will have
            # multiple scan ops in the graph ( as many as outputs )
            # and I'm not sure they will get merged into one again
            # So for now I will just cover a limited case when there
            # is only one output and the local optimizer can be used
            # TODO (fix) : either make sure the different scans get
            #              merged or implement this optimization as a
            #              global optimization
            return _gpu_scan_outputs(host_input.owner.op,
                                     host_input.owner.inputs)
    # scan(host_from_gpu) -> host_from_gpu(GPUscan)
    if (type(node.op) == scan_op.Scan
            and not node.op.info['gpu']):
        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
                      for i in node.inputs]):
            # Move the scan to the GPU and bring its outputs back to
            # the host so the callers of ``node`` still see host data.
            gpu_outs = _gpu_scan_outputs(node.op, node.inputs)
            return [safe_to_cpu(x) for x in gpu_outs]
    return False
@gof.local_optimizer([None])
def gpu_scan_make_inplace(node):
    """Replace a GPU scan node that is not yet inplace by its inplace
    version, duplicating any output buffer that is used more than once
    so the inplace operation stays safe."""
    op = node.op
    if not (isinstance(op, scan_op.Scan)
            and op.info['gpu']
            and not op.info['inplace']):
        return False
    info = op.info.copy()
    info['inplace'] = True
    # Inputs corresponding to n_steps and the sequences.
    head = node.inputs[:1 + op.n_seqs]
    # Outputs whose storage can be reused inplace.
    middle = op.outer_mitmot(node)
    middle += op.outer_mitsot(node)
    middle += op.outer_sitsot(node)
    # Remaining inputs: shared, nit_sot and non-sequences.
    tail = op.outer_shared(node)
    tail += op.outer_nitsot(node)
    tail += op.outer_non_seqs(node)
    # A buffer appearing several times must be copied before being
    # destroyed inplace.
    for pos in xrange(len(middle)):
        if middle[pos] in middle[:pos]:
            middle[pos] = deep_copy_op(middle[pos])
    typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
        broadcastable=broadcastable)
    new_op = scan_op.Scan(op.inputs
                          , op.outputs
                          , info
                          , typeConstructor=typeConstructor)
    return new_op.make_node(*(head + middle + tail)).outputs
# Register the inplace rewrite at priority 75, tagged so it runs under
# the 'gpu', 'fast_run', 'inplace' and 'scan' optimization modes;
# ignore_newtrees avoids re-visiting nodes this pass itself creates.
optdb.register( 'gpu_scanOp_make_inplace'
               , theano.tensor.opt.in2out(gpu_scan_make_inplace,ignore_newtrees=True)
               , 75
               , 'gpu'
               , 'fast_run'
               , 'inplace'
               , 'scan')
...@@ -28,7 +28,7 @@ from theano import gof ...@@ -28,7 +28,7 @@ from theano import gof
from theano.tensor import TensorType from theano.tensor import TensorType
from theano import tensor from theano import tensor
from theano.tensor.opt import Shape_i from theano.tensor.opt import Shape_i
from theano.sandbox import cuda #from theano.sandbox import cuda
from theano.compile.profiling import ScanProfileStats from theano.compile.profiling import ScanProfileStats
import scan_utils import scan_utils
...@@ -46,7 +46,9 @@ class Scan(Op): ...@@ -46,7 +46,9 @@ class Scan(Op):
def __init__( self def __init__( self
, inputs , inputs
, outputs , outputs
, info ): , info
, typeConstructor = None
):
""" """
:param inputs: inputs of the inner function of scan :param inputs: inputs of the inner function of scan
:param outputs: outputs of the inner function of scan :param outputs: outputs of the inner function of scan
...@@ -66,60 +68,31 @@ class Scan(Op): ...@@ -66,60 +68,31 @@ class Scan(Op):
self.output_types = [] self.output_types = []
idx = 0 idx = 0
jdx = 0 jdx = 0
if self.gpu: if typeConstructor is None:
# mit_mot typeConstructor = lambda broadcastable, dtype: TensorType(
while idx < self.n_mit_mot_outs: broadcastable = broadcastable, dtype = dtype)
# Not that for mit_mot there are several output slices per
# output sequence while idx < self.n_mit_mot_outs:
o = outputs[idx] # Not that for mit_mot there are several output slices per
self.output_types.append( # output sequence
cuda.CudaNdarrayType( o = outputs[idx]
broadcastable = (False,) + o.type.broadcastable)) self.output_types.append(
idx += len(self.mit_mot_out_slices[jdx]) typeConstructor( broadcastable = (False,) + o.type.broadcastable
jdx += 1 , dtype = o.type.dtype)
)
# mit_sot / sit_sot / nit_sot idx += len(self.mit_mot_out_slices[jdx])
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot jdx += 1
for o in outputs[idx:end]:
self.output_types.append( # mit_sot / sit_sot / nit_sot
cuda.CudaNdarrayType( broadcastable = (False,) + end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
o.type.broadcastable)) for o in outputs[idx:end]:
# shared outputs self.output_types.append(
for o in outputs[end:]: typeConstructor(
if isinstance(o.type, TensorType): broadcastable = (False,) + o.type.broadcastable
self.output_types.append(cuda.CudaNdarrayType( , dtype = o.type.dtype ))
broadcastable = o.type.broadcastable)) # shared outputs + possibly the ending condition
else: for o in outputs[end:]:
self.output_types.append( o.type ) self.output_types.append( o.type )
else:
while idx < self.n_mit_mot_outs:
# Not that for mit_mot there are several output slices per
# output sequence
o = outputs[idx]
self.output_types.append(
TensorType(
broadcastable = (False,) + o.type.broadcastable
, dtype = o.type.dtype)
)
idx += len(self.mit_mot_out_slices[jdx])
jdx += 1
# mit_sot / sit_sot / nit_sot
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
for o in outputs[idx:end]:
self.output_types.append(
TensorType(
broadcastable = (False,) + o.type.broadcastable
, dtype = o.type.dtype ))
# shared outputs + possibly the ending condition
for o in outputs[end:]:
if cuda.cuda_available and isinstance(o.type,
cuda.CudaNdarrayType):
self.output_types.append( TensorType(
broadcastable = o.type.broadcastable
, dtype = theano.config.floatX) )
else:
self.output_types.append( o.type )
if self.as_while: if self.as_while:
self.output_types = self.output_types[:-1] self.output_types = self.output_types[:-1]
...@@ -168,11 +141,14 @@ class Scan(Op): ...@@ -168,11 +141,14 @@ class Scan(Op):
self.n_shared_outs ) self.n_shared_outs )
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs, if not self.info['gpu']:
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
self.outputs) self.outputs)
local_env = gof.Env(tmp_in, tmp_out) local_env = gof.Env(tmp_in, tmp_out)
self._cmodule_key = gof.CLinker.cmodule_key_(local_env,[]) self._cmodule_key = gof.CLinker.cmodule_key_(local_env,[])
self._hash_inner_graph = hash(self._cmodule_key) self._hash_inner_graph = hash(self._cmodule_key)
else:
self._hash_inner_graph = self.info['gpu_hash']
def make_node(self, *inputs): def make_node(self, *inputs):
...@@ -419,9 +395,9 @@ class Scan(Op): ...@@ -419,9 +395,9 @@ class Scan(Op):
cython_mit_mot_out_slices[_d0,_d1] = \ cython_mit_mot_out_slices[_d0,_d1] = \
self.mit_mot_out_slices[_d0][_d1] self.mit_mot_out_slices[_d0][_d1]
vector_seqs = [ seq.ndim == 1 for seq in vector_seqs = [ seq.ndim == 1 for seq in
self.inputs[1:1+self.n_seqs ] ] node.inputs[1:1+self.n_seqs ] ]
vector_outs = [ arg.ndim ==1 for arg in vector_outs = [ arg.ndim ==1 for arg in
self.inputs[1+self.n_seqs: (1+self.n_seqs + node.inputs[1+self.n_seqs: (1+self.n_seqs +
self.n_outs)] ] self.n_outs)] ]
vector_outs += [ False]*self.n_nit_sot vector_outs += [ False]*self.n_nit_sot
...@@ -610,6 +586,8 @@ class Scan(Op): ...@@ -610,6 +586,8 @@ class Scan(Op):
Y sequence outputs y_1, y_2, ... y_<self.n_outs> Y sequence outputs y_1, y_2, ... y_<self.n_outs>
""" """
# In order to be able to allocate cuda ndarrays if needed
from theano.sandbox import cuda
# 1. Unzip the number of steps and sequences. If number of steps is # 1. Unzip the number of steps and sequences. If number of steps is
# negative flip sequences around, and make n_steps positive # negative flip sequences around, and make n_steps positive
t0_call = time.time() t0_call = time.time()
......
...@@ -289,7 +289,8 @@ optdb.register('scanOp_pushout_nonseqs_ops', ...@@ -289,7 +289,8 @@ optdb.register('scanOp_pushout_nonseqs_ops',
def scan_make_inplace(node): def scan_make_inplace(node):
op = node.op op = node.op
if ( isinstance(op, scan_op.Scan) and if ( isinstance(op, scan_op.Scan) and
(not op.info['inplace']) ): (not op.info['inplace']) and
(not op.info['gpu'])):
info = op.info.copy() info = op.info.copy()
info['inplace'] = True info['inplace'] = True
# inputs corresponding to sequences and n_steps # inputs corresponding to sequences and n_steps
...@@ -1122,122 +1123,4 @@ optdb.register('scanOp_merge_inouts' ...@@ -1122,122 +1123,4 @@ optdb.register('scanOp_merge_inouts'
, 'fast_run' , 'fast_run'
, 'scan') , 'scan')
from theano.sandbox import cuda
if cuda.cuda_available:
from theano.sandbox.cuda.basic_ops import gpu_from_host, host_from_gpu
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.opt import register_opt, local_optimizer
def safe_to_gpu(x):
if (isinstance(x.type, TensorType) and
x.type.dtype == 'float32'):
return gpu_from_host(x)
else:
return x
def safe_to_cpu(x):
if isinstance(x.type, CudaNdarrayType):
return host_from_gpu(x)
else:
return x
def tensor_to_cuda(x):
if (isinstance(x.type, TensorType) and
x.type.dtype == 'float32'):
y = CudaNdarrayType( broadcastable = x.type.broadcastable)()
if x.name :
y.name = x.name +'[cuda]'
return y
else:
return x
@register_opt('scan')
@local_optimizer([])
def gpuScanOptimization(node):
"""
scan(host_from_gpu) -> host_from_gpu(GPUscan)
gpu_from_host(scan) -> GPUscan(gpu_from_host)
"""
#gpu_from_host(scan) -> GPUscan(gpu_from_host)
if node.op == gpu_from_host:
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1 ):
# Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one
# output that corresponds to the input of ``node``
# If we do this for each output seperately we will have
# multiple scan ops in the graph ( as many as outputs )
# and I'm not sure they will get merged into one again
# So for now I will just cover a limited case when there
# is only one output and the local optimizer can be used
# TODO (fix) : either make sure the different scans get
# merged or implement this optimization as a global
# optimization
thescan = host_input.owner.op
info = thescan.info.copy()
info['gpu'] = True
inputs = host_input.owner.inputs
nw_ins = [ inputs[0]]
e = ( 1+ thescan.n_seqs
+ thescan.n_mit_mot
+ thescan.n_mit_sot
+ thescan.n_sit_sot
+ thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ]
b = e
e = e + thescan.n_nit_sot
nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs]
scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ]
scan_outs = scan_utils.clone(
scan_outs
, replace = zip(thescan.inputs,
[safe_to_cpu(x) for x in scan_ins]))
nw_op = scan_op.Scan( scan_ins
, scan_outs
, info).make_node(*nw_ins)
_outputs = nw_op.outputs
return _outputs
#scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan
and not node.op.info['gpu']):
if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
thescan = node.op
info = thescan.info.copy()
info['gpu'] = True
inputs = node.inputs
nw_ins = [ inputs[0]]
e = ( 1+ thescan.n_seqs
+ thescan.n_mit_mot
+ thescan.n_mit_sot
+ thescan.n_sit_sot
+ thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ]
b = e
e = e + thescan.n_nit_sot
nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs]
scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ]
scan_outs = scan_utils.clone(
scan_outs
, replace = zip(thescan.inputs
,[safe_to_cpu(x) for x in scan_ins]))
_outputs = scan_op.Scan(
scan_ins
, scan_outs
, info).make_node(*nw_ins).outputs
outputs = [safe_to_cpu(x) for x in _outputs]
return outputs
return False
...@@ -22,7 +22,6 @@ from theano import gof ...@@ -22,7 +22,6 @@ from theano import gof
from theano import tensor, scalar from theano import tensor, scalar
from theano.tensor.basic import get_constant_value from theano.tensor.basic import get_constant_value
from theano.sandbox import cuda
import theano import theano
...@@ -43,8 +42,7 @@ def safe_new(x, tag = ''): ...@@ -43,8 +42,7 @@ def safe_new(x, tag = ''):
nw_name = x.name + tag nw_name = x.name + tag
else: else:
nw_name = None nw_name = None
# Should it be theano.Constant? What is the difference between the two? if isinstance(x, theano.Constant):
if isinstance(x, tensor.Constant):
return x.clone() return x.clone()
# Note, as_tensor_variable will convert the Scalar into a # Note, as_tensor_variable will convert the Scalar into a
# TensorScalar that will require a ScalarFromTensor op, # TensorScalar that will require a ScalarFromTensor op,
...@@ -93,14 +91,11 @@ def traverse(out, x,x_copy, d): ...@@ -93,14 +91,11 @@ def traverse(out, x,x_copy, d):
fine for the main computational graph but confuses things a bit for the fine for the main computational graph but confuses things a bit for the
inner graph of scan ''' inner graph of scan '''
if out == x: if out == x:
d[out] = cuda.gpu_from_host(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
elif out.owner is None: elif out.owner is None:
return d return d
elif (out.owner.op == cuda.host_from_gpu
and out.owner.inputs == [x] ):
d[out] = x_copy
return d
else: else:
for inp in out.owner.inputs: for inp in out.owner.inputs:
d = traverse(inp, x, x_copy, d) d = traverse(inp, x, x_copy, d)
......
...@@ -2282,7 +2282,7 @@ class T_Scan(unittest.TestCase): ...@@ -2282,7 +2282,7 @@ class T_Scan(unittest.TestCase):
return x_t+1, theano.scan_module.until( x_t > 3) return x_t+1, theano.scan_module.until( x_t > 3)
o, _ = theano.scan(lambda_fn, x) o, _ = theano.scan(lambda_fn, x)
f = theano.function([x], o) f = theano.function([x], o)
vx = numpy.zeros((50,)) vx = numpy.zeros((50,), dtype = theano.config.floatX)
vx[23] = 4 vx[23] = 4
out = f(vx) out = f(vx)
assert numpy.sum(out[24:]) == 0 assert numpy.sum(out[24:]) == 0
...@@ -2296,7 +2296,7 @@ class T_Scan(unittest.TestCase): ...@@ -2296,7 +2296,7 @@ class T_Scan(unittest.TestCase):
x) x)
f = theano.function([x], [o,o2]) f = theano.function([x], [o,o2])
vx = numpy.zeros((50,)) vx = numpy.zeros((50,), dtype = theano.config.floatX)
vx[23] = 4 vx[23] = 4
out, out2 = f(vx) out, out2 = f(vx)
assert numpy.sum(out[24:]) == 0 assert numpy.sum(out[24:]) == 0
...@@ -2315,7 +2315,7 @@ class T_Scan(unittest.TestCase): ...@@ -2315,7 +2315,7 @@ class T_Scan(unittest.TestCase):
x) x)
f = theano.function([x], [o,o2]) f = theano.function([x], [o,o2])
vx = numpy.zeros((50,)) vx = numpy.zeros((50,), dtype = theano.config.floatX)
vx[23] = 4 vx[23] = 4
out, out2 = f(vx) out, out2 = f(vx)
assert numpy.sum(out[24:]) == 0 assert numpy.sum(out[24:]) == 0
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论