提交 edfc726f authored 作者: nouiz's avatar nouiz

Merge pull request #906 from pascanur/gpu_opt_scan_outputs_different_dtypes_rebased

Gpu opt scan outputs different dtypes rebased
...@@ -1525,8 +1525,9 @@ def gpuScanOptimization(node): ...@@ -1525,8 +1525,9 @@ def gpuScanOptimization(node):
local_fgraph = gof.FunctionGraph(tmp_in, tmp_out) local_fgraph = gof.FunctionGraph(tmp_in, tmp_out)
_cmodule_key = gof.CLinker.cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker.cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType( def typeConstructor(broadcastable, dtype):
broadcastable=broadcastable) assert dtype == 'float32'
return CudaNdarrayType(broadcastable=broadcastable)
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins, scan_ins,
scan_outs, scan_outs,
......
...@@ -31,7 +31,6 @@ from theano import tensor ...@@ -31,7 +31,6 @@ from theano import tensor
from theano.tensor.opt import Shape_i from theano.tensor.opt import Shape_i
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
#from theano.sandbox import cuda
from theano.compile.profiling import ScanProfileStats from theano.compile.profiling import ScanProfileStats
import scan_utils import scan_utils
...@@ -51,8 +50,26 @@ class Scan(PureOp): ...@@ -51,8 +50,26 @@ class Scan(PureOp):
""" """
:param inputs: inputs of the inner function of scan :param inputs: inputs of the inner function of scan
:param outputs: outputs of the inner function of scan :param outputs: outputs of the inner function of scan
:param properties: dictionary containing different properties of :param info: dictionary containing different properties of
the scan op. the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or
not, etc.)
:param typeConstructor: function that constructs a Theano TensorType
able to represent a float32 ndarray.
Note: ``typeConstructor`` was added as part of refactoring how Theano
deals with the GPU. When it runs on the GPU, scan needs to construct
certain outputs (those that reside in GPU memory) as CudaNdarray.
However, we cannot import cuda in this file (it lives in the sandbox
and is not available on every machine), so the workaround is that the
GPU optimization (which is aware of cuda types) passes to the
constructor of this class a function that is able to construct
CudaNdarray. This way the class Scan does not need to be aware of
CudaNdarray; it just constructs any float32 tensor using this
function (which by default constructs normal tensors). Note that a
further assumption in this code is that any float32 output or input
will be moved to the GPU if the optimization gets applied (following
Theano's philosophy of moving as much as possible to the GPU).
""" """
# adding properties into self # adding properties into self
self.inputs = inputs self.inputs = inputs
...@@ -67,29 +84,57 @@ class Scan(PureOp): ...@@ -67,29 +84,57 @@ class Scan(PureOp):
self.output_types = [] self.output_types = []
idx = 0 idx = 0
jdx = 0 jdx = 0
tensorConstructor = lambda broadcastable, dtype: TensorType(
broadcastable=broadcastable, dtype=dtype)
if typeConstructor is None: if typeConstructor is None:
typeConstructor = lambda broadcastable, dtype: TensorType( typeConstructor = tensorConstructor
broadcastable=broadcastable, dtype=dtype)
while idx < self.n_mit_mot_outs: while idx < self.n_mit_mot_outs:
# Note that for mit_mot there are several output slices per # Note that for mit_mot there are several output slices per
# output sequence # output sequence
o = outputs[idx] o = outputs[idx]
self.output_types.append( # Scan assumes that only variables of dtype float32 might need a
typeConstructor( # special constructor (i.e. CudaNdarray constructor) when the
broadcastable=(False,) + o.type.broadcastable, # code is running on GPU, as it is the only type supported by
dtype=o.type.dtype) # Theano yet. Therefore only for dtype float32 we use the passed
) # type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
idx += len(self.mit_mot_out_slices[jdx]) idx += len(self.mit_mot_out_slices[jdx])
jdx += 1 jdx += 1
# mit_sot / sit_sot / nit_sot # mit_sot / sit_sot / nit_sot
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
for o in outputs[idx:end]: for o in outputs[idx:end]:
self.output_types.append( # Scan assumes that only variables of dtype float32 might need a
typeConstructor( # special constructor (i.e. CudaNdarray constructor) when the
broadcastable=(False,) + o.type.broadcastable, # code is running on GPU, as it is the only type supported by
dtype=o.type.dtype)) # Theano yet. Therefore only for dtype float32 we use the passed
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
# shared outputs + possibly the ending condition # shared outputs + possibly the ending condition
for o in outputs[end:]: for o in outputs[end:]:
self.output_types.append(o.type) self.output_types.append(o.type)
...@@ -572,7 +617,7 @@ class Scan(PureOp): ...@@ -572,7 +617,7 @@ class Scan(PureOp):
cython_destroy_map, cython_destroy_map,
args, args,
outs, outs,
self) self, node)
except (ImportError, theano.gof.cmodule.MissingGXX): except (ImportError, theano.gof.cmodule.MissingGXX):
p = self.execute p = self.execute
# default arguments are stored in the closure of `rval` # default arguments are stored in the closure of `rval`
...@@ -757,8 +802,6 @@ class Scan(PureOp): ...@@ -757,8 +802,6 @@ class Scan(PureOp):
Y sequence outputs y_1, y_2, ... y_<self.n_outs> Y sequence outputs y_1, y_2, ... y_<self.n_outs>
""" """
# In order to be able to allocate cuda ndarrays if needed
from theano.sandbox import cuda
# 1. Unzip the number of steps and sequences. If number of steps is # 1. Unzip the number of steps and sequences. If number of steps is
# negative flip sequences around, and make n_steps positive # negative flip sequences around, and make n_steps positive
t0_call = time.time() t0_call = time.time()
...@@ -949,14 +992,10 @@ class Scan(PureOp): ...@@ -949,14 +992,10 @@ class Scan(PureOp):
self.vector_outs[j] = True self.vector_outs[j] = True
dtype = output_storage[jout].storage[0].dtype dtype = output_storage[jout].storage[0].dtype
if (outs[j][0] is None or if (outs[j][0] is None or
outs[j][0].shape[0] < store_steps[j] or outs[j][0].shape[0] < store_steps[j] or
outs[j][0].shape[1:] != shape[1:] or outs[j][0].shape[1:] != shape[1:] or
outs[j][0].dtype != dtype): outs[j][0].dtype != dtype):
if self.gpu: outs[j][0] = node.outputs[j].type.value_zeros(shape)
_cuda = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray
outs[j][0] = _cuda.zeros(shape)
else:
outs[j][0] = numpy.zeros(shape, dtype)
elif outs[j][0].shape[0] != store_steps[j]: elif outs[j][0].shape[0] != store_steps[j]:
outs[j][0] = outs[j][0][:store_steps[j]] outs[j][0] = outs[j][0][:store_steps[j]]
outs[j][0][pos[j]] = output_storage[jout].storage[0] outs[j][0][pos[j]] = output_storage[jout].storage[0]
...@@ -994,24 +1033,14 @@ class Scan(PureOp): ...@@ -994,24 +1033,14 @@ class Scan(PureOp):
# This way, there will be no information overwritten # This way, there will be no information overwritten
# before it is read (as it used to happen). # before it is read (as it used to happen).
shape = (pdx,) + outs[idx][0].shape[1:] shape = (pdx,) + outs[idx][0].shape[1:]
if cuda.cuda_available and isinstance(outs[idx][0], tmp = node.outputs[idx].type.value_zeros(shape)
cuda.CudaNdarray):
_cuda = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray
tmp = _cuda.zeros(shape)
else:
tmp = numpy.empty(shape)
tmp[:] = outs[idx][0][:pdx] tmp[:] = outs[idx][0][:pdx]
outs[idx][0][:store_steps[idx] - pdx] = outs[idx][0][pdx:] outs[idx][0][:store_steps[idx] - pdx] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx] - pdx:] = tmp outs[idx][0][store_steps[idx] - pdx:] = tmp
del tmp del tmp
else: else:
shape = (store_steps[idx] - pdx,) + outs[idx][0].shape[1:] shape = (store_steps[idx] - pdx,) + outs[idx][0].shape[1:]
if cuda.cuda_available and isinstance(outs[idx][0], tmp = node.outputs[idx].type.value_zeros(shape)
cuda.CudaNdarray):
_cuda = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray
tmp = _cuda.zeros(shape)
else:
tmp = numpy.empty(shape)
tmp[:] = outs[idx][0][pdx:] tmp[:] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx] - pdx:] = outs[idx][0][:pdx] outs[idx][0][store_steps[idx] - pdx:] = outs[idx][0][:pdx]
outs[idx][0][:store_steps[idx] - pdx] = tmp outs[idx][0][:store_steps[idx] - pdx] = tmp
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -59,11 +59,10 @@ cimport numpy ...@@ -59,11 +59,10 @@ cimport numpy
from theano import gof from theano import gof
import time import time
import copy import copy
from theano.sandbox import cuda
def get_version(): def get_version():
return 0.276 return 0.278
@cython.boundscheck(False) @cython.boundscheck(False)
def perform( def perform(
...@@ -88,7 +87,8 @@ def perform( ...@@ -88,7 +87,8 @@ def perform(
numpy.ndarray[numpy.int32_t,ndim=1] destroy_map, numpy.ndarray[numpy.int32_t,ndim=1] destroy_map,
args, args,
outs, outs,
self): self,
node):
""" """
Parameters Parameters
---------- ----------
...@@ -383,10 +383,7 @@ def perform( ...@@ -383,10 +383,7 @@ def perform(
outs[j][0].shape[0] < store_steps[j] or outs[j][0].shape[0] < store_steps[j] or
outs[j][0].shape[1:] != shape[1:] or outs[j][0].shape[1:] != shape[1:] or
outs[j][0].dtype != dtype ): outs[j][0].dtype != dtype ):
if self.gpu: outs[j][0] = node.outputs[j].type.value_zeros(shape)
outs[j][0] = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
else:
outs[j][0] = numpy.zeros(shape, dtype)
elif outs[j][0].shape[0] != store_steps[j]: elif outs[j][0].shape[0] != store_steps[j]:
outs[j][0] = outs[j][0][:store_steps[j]] outs[j][0] = outs[j][0][:store_steps[j]]
outs[j][0][pos[j]] = output_storage[jout].storage[0] outs[j][0][pos[j]] = output_storage[jout].storage[0]
...@@ -426,22 +423,13 @@ def perform( ...@@ -426,22 +423,13 @@ def perform(
# before it is read (as it used to happen). # before it is read (as it used to happen).
shape = (pdx,)+ outs[idx][0].shape[1:] shape = (pdx,)+ outs[idx][0].shape[1:]
if cuda.cuda_available and isinstance( outs[idx][0], tmp = node.outputs[idx].type.value_zeros(shape)
cuda.CudaNdarray):
tmp = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
else:
tmp = numpy.empty(shape, outs[idx][0].dtype)
tmp[:] = outs[idx][0][:pdx] tmp[:] = outs[idx][0][:pdx]
outs[idx][0][:store_steps[idx]-pdx] = outs[idx][0][pdx:] outs[idx][0][:store_steps[idx]-pdx] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx]-pdx:] = tmp outs[idx][0][store_steps[idx]-pdx:] = tmp
else: else:
shape = (store_steps[idx]-pdx,) + outs[idx][0].shape[1:] shape = (store_steps[idx]-pdx,) + outs[idx][0].shape[1:]
tmp = node.outputs[idx].type.value_zeros(shape)
if cuda.cuda_available and isinstance( outs[idx][0],
cuda.CudaNdarray):
tmp = cuda.cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(shape)
else:
tmp = numpy.empty(shape, outs[idx][0].dtype)
tmp[:] = outs[idx][0][pdx:] tmp[:] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx]-pdx:] = outs[idx][0][:pdx] outs[idx][0][store_steps[idx]-pdx:] = outs[idx][0][:pdx]
outs[idx][0][:store_steps[idx]-pdx] = tmp outs[idx][0][:store_steps[idx]-pdx] = tmp
......
...@@ -14,7 +14,7 @@ logging.basicConfig(level=logging.DEBUG) ...@@ -14,7 +14,7 @@ logging.basicConfig(level=logging.DEBUG)
if config.compiledir not in sys.path: if config.compiledir not in sys.path:
sys.path.append(config.compiledir) sys.path.append(config.compiledir)
version = 0.276 # must match constant returned in function get_version() version = 0.278 # must match constant returned in function get_version()
need_reload = False need_reload = False
try: try:
......
...@@ -504,6 +504,64 @@ class T_Scan(unittest.TestCase): ...@@ -504,6 +504,64 @@ class T_Scan(unittest.TestCase):
assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost) assert not any([isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
for node in scan_node_topo]) for node in scan_node_topo])
# This third test checks that scan can deal with a mixture of dtypes as
# outputs when it is running on the GPU
def test_gpu3_mixture_dtype_outputs(self):
    """Check that scan on the GPU copes with outputs of mixed dtypes.

    The inner function returns two outputs per step: a recurrent state
    built from ``fvector``/``fscalar`` inputs and a second output cast
    to ``int64``. The compiled function is compared against a NumPy
    reference, and the compiled graph must contain exactly one Scan
    node whose op has its ``gpu`` flag set.

    Raises ``SkipTest`` when cuda is not available.
    """
    # Imported locally so that the skip below can handle machines on
    # which cuda is not available.
    from theano.sandbox import cuda
    if not cuda.cuda_available:
        raise SkipTest('Optional package cuda disabled')

    def f_rnn(u_t, x_tm1, W_in, W):
        # First output is the recurrent state, second one is an
        # int64 output with no recurrence.
        return (u_t * W_in + x_tm1 * W,
                tensor.cast(u_t + x_tm1, 'int64'))

    u = theano.tensor.fvector('u')
    x0 = theano.tensor.fscalar('x0')
    W_in = theano.tensor.fscalar('win')
    W = theano.tensor.fscalar('w')
    output, updates = theano.scan(f_rnn,
                                  u,
                                  [x0, None],
                                  [W_in, W],
                                  n_steps=None,
                                  truncate_gradient=-1,
                                  go_backwards=False,
                                  mode=mode_with_gpu)

    f2 = theano.function([u, x0, W_in, W],
                         output,
                         updates=updates,
                         allow_input_downcast=True,
                         mode=mode_with_gpu)

    # get random initial values; the numeric values get their own names
    # so that they do not shadow the symbolic variables above. The order
    # of the draws is kept (u, x0, W, W_in) so the seeded values match.
    rng = numpy.random.RandomState(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5., high=5.)
    v_x0 = rng.uniform()
    v_W = rng.uniform()
    v_W_in = rng.uniform()

    # compute the output in numpy
    v_out1 = numpy.zeros((4,))
    v_out2 = numpy.zeros((4,), dtype='int64')
    v_out1[0] = v_u[0] * v_W_in + v_x0 * v_W
    v_out2[0] = v_u[0] + v_x0
    for step in xrange(1, 4):
        v_out1[step] = v_u[step] * v_W_in + v_out1[step - 1] * v_W
        v_out2[step] = numpy.int64(v_u[step] + v_out1[step - 1])

    theano_out1, theano_out2 = f2(v_u, v_x0, v_W_in, v_W)
    assert numpy.allclose(theano_out1, v_out1)
    assert numpy.allclose(theano_out2, v_out2)

    # The compiled graph must hold a single Scan node flagged as GPU.
    topo = f2.maker.fgraph.toposort()
    scan_nodes = [node for node in topo
                  if isinstance(node.op, theano.scan_module.scan_op.Scan)]
    assert len(scan_nodes) == 1
    scan_node = scan_nodes[0]
    assert scan_node.op.gpu
# simple rnn, one input, one state, weights for each; input/state # simple rnn, one input, one state, weights for each; input/state
# are vectors, weights are scalars; using shared variables # are vectors, weights are scalars; using shared variables
def test_one_sequence_one_output_weights_shared(self): def test_one_sequence_one_output_weights_shared(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论