提交 500601a2 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1807 from abergeron/gpuarray_scan

Make scan work with new backend.
...@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node): ...@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node):
gpu_from_host(diag_node.inputs[0]))] gpu_from_host(diag_node.inputs[0]))]
return False return False
def typeConstructor(broadcastable, dtype):
    """Build the output type for scan: CudaNdarrayType for float32,
    a plain TensorType for every other dtype."""
    if dtype != 'float32':
        return tensor.TensorType(broadcastable=broadcastable, dtype=dtype)
    return CudaNdarrayType(broadcastable=broadcastable)
@register_opt('scan') @register_opt('scan')
@local_optimizer([gpu_from_host, scan_op.Scan]) @local_optimizer([gpu_from_host, scan_op.Scan])
...@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node): ...@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable=broadcastable)
nw_op = scan_op.Scan(scan_ins, nw_op = scan_op.Scan(scan_ins,
scan_outs, scan_outs,
info, info,
...@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node): ...@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
def typeConstructor(broadcastable, dtype):
assert dtype == 'float32'
return CudaNdarrayType(broadcastable=broadcastable)
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins, scan_ins,
scan_outs, scan_outs,
...@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node): ...@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node):
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType, scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
gpu_flag=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
......
...@@ -161,7 +161,7 @@ class HostFromGpu(Op): ...@@ -161,7 +161,7 @@ class HostFromGpu(Op):
raise TypeError(x) raise TypeError(x)
return Apply(self, [x], return Apply(self, [x],
[tensor.TensorType(dtype=x.dtype, [tensor.TensorType(dtype=x.dtype,
broadcastable=x.broadcastable,)()]) broadcastable=x.broadcastable)()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, = inp x, = inp
......
import copy import copy
import theano import theano
import numpy import numpy
from theano import tensor, scalar from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB, SequenceDB, ProxyDB,
Optimizer, toolbox, Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, from theano.sandbox.gpuarray.basic_ops import (
gpu_from_host, host_from_gpu, gpu_from_host, HostFromGpu,
gpu_alloc, gpu_alloc, GpuAlloc, GpuReshape, GpuEye
GpuAlloc, )
GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv from theano.sandbox.gpuarray.conv import GpuConv
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from theano.sandbox.gpuarray.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuSoftmaxWithBias, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax
)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda) GpuDimShuffle, GpuCAReduceCuda)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
...@@ -54,6 +55,20 @@ def register_opt(*tags, **kwargs): ...@@ -54,6 +55,20 @@ def register_opt(*tags, **kwargs):
register_opt()(theano.tensor.opt.local_track_shape_i) register_opt()(theano.tensor.opt.local_track_shape_i)
def safe_to_gpu(x):
    """Transfer x to the GPU when it is a host TensorType; otherwise
    return it unchanged."""
    if not isinstance(x.type, tensor.TensorType):
        return x
    return gpu_from_host(x)
def safe_to_cpu(x):
    """Transfer x back to the host when it is a GpuArrayType; otherwise
    return it unchanged."""
    if not isinstance(x.type, GpuArrayType):
        return x
    return host_from_gpu(x)
def op_lifter(OP): def op_lifter(OP):
""" """
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...)) OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
...@@ -73,10 +88,10 @@ def op_lifter(OP): ...@@ -73,10 +88,10 @@ def op_lifter(OP):
# This is needed as sometimes new_op inherit from OP. # This is needed as sometimes new_op inherit from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
return [host_from_gpu(o) for o in return [safe_to_cpu(o) for o in
new_op(*node.inputs, return_list=True)] new_op(*node.inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)): elif isinstance(new_op, (tuple, list)):
return [host_from_gpu(o) for o in new_op] return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)] return [host_from_gpu(new_op)]
return False return False
...@@ -132,7 +147,17 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -132,7 +147,17 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt() @register_opt()
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpualloc(node): def local_gpualloc(node):
return gpu_alloc new_out = gpu_alloc(*node.inputs)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if new_out.broadcastable != node.outputs[0].broadcastable:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for b_old, b_new in zip(node.outputs[0].broadcastable,
new_out.broadcastable):
assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out,
node.outputs[0].broadcastable)
return (new_out,)
@register_opt() @register_opt()
...@@ -158,6 +183,13 @@ def local_gpureshape(node): ...@@ -158,6 +183,13 @@ def local_gpureshape(node):
return res return res
@register_opt()
@op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node):
    """Apply Rebroadcast directly on the GPU variable when its input is a
    host transfer of a GPU value."""
    host_input = node.inputs[0]
    if not isinstance(host_input.owner.op, HostFromGpu):
        return
    return node.op(host_input.owner.inputs[0])
@register_opt() @register_opt()
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node): def local_gpuflatten(node):
...@@ -176,8 +208,6 @@ def local_gpuflatten(node): ...@@ -176,8 +208,6 @@ def local_gpuflatten(node):
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
name = op.name name = op.name
if node.outputs[0].ndim == 0:
return
if name: if name:
name = 'Gpu'+name name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name, res = GpuElemwise(op.scalar_op, name=name,
...@@ -432,3 +462,97 @@ def local_gpu_conv(node): ...@@ -432,3 +462,97 @@ def local_gpu_conv(node):
out = gpu_from_host(out) out = gpu_from_host(out)
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
def tensor_to_gpu(x):
    """Return a fresh GpuArrayType variable mirroring the host tensor x
    (same dtype and broadcastable pattern); non-tensor variables pass
    through untouched."""
    if not isinstance(x.type, tensor.TensorType):
        return x
    gpu_var = GpuArrayType(broadcastable=x.type.broadcastable,
                           dtype=x.type.dtype)()
    if x.name:
        gpu_var.name = x.name + '[Gpua]'
    return gpu_var
def gpu_safe_new(x, tag=''):
    """Construct a new variable with the same type as *x* but a tagged
    name (old name + tag).

    Used when rebuilding the inner scan graph so the clone does not
    interfere with the original graph. Constants are cloned as-is.
    """
    new_name = None
    if getattr(x, 'name', None) is not None:
        new_name = x.name + tag
    if isinstance(x, theano.Constant):
        # Constants carry their value; a clone is enough.
        return x.clone()
    new_var = x.type()
    new_var.name = new_name
    return new_var
def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """Clone (inputs, outputs) after replacing every input by a fresh
    variable of the same type (name suffixed with *tag*).

    Returns the new inputs (same order as the originals) and the cloned
    outputs expressed on them.
    """
    tag = tag or ''
    new_inputs = [gpu_safe_new(x, tag) for x in inputs]
    replacements = dict(zip(inputs, new_inputs))
    new_outputs = scan_utils.clone(outputs, replace=replacements)
    return (new_inputs, new_outputs)
@register_opt('scan')
@op_lifter([scan_op.Scan])
def local_scan_to_gpua(node):
    # Lift a CPU Scan onto the new gpuarray back-end: transfer the outer
    # inputs, rebuild the inner graph on GPU types, and re-create the op
    # with GpuArrayType as the output-type constructor.
    info = copy.deepcopy(node.op.info)
    info['gpua'] = True
    # node.inputs[0] (presumably the step count) is kept on the host.
    nw_ins = [node.inputs[0]]
    e = (1 +
         node.op.n_seqs +
         node.op.n_mit_mot +
         node.op.n_mit_sot +
         node.op.n_sit_sot +
         node.op.n_shared_outs)
    # Sequences, tap initial states and shared outputs go to the GPU.
    nw_ins += [safe_to_gpu(x) for x in node.inputs[1:e]]
    b = e
    e = e + node.op.n_nit_sot
    # nit_sot inputs are left on the host (they are not transferred).
    nw_ins += node.inputs[b:e]
    nw_ins += [safe_to_gpu(x) for x in node.inputs[e:]]
    # Rebuild the inner graph: GPU-typed inner inputs, and inner outputs
    # re-expressed on host views of those inputs.
    scan_ins = [tensor_to_gpu(x) for x in node.op.inputs]
    scan_outs = [safe_to_gpu(x) for x in node.op.outputs]
    scan_outs = scan_utils.clone(
        scan_outs,
        replace=zip(node.op.inputs,
                    [safe_to_cpu(x) for x in scan_ins]))
    # We need to construct the hash here, because scan
    # __init__ does not know about the gpu and can not
    # handle graphs with inputs being on the gpu
    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
    local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
    info['gpu_hash'] = hash(_cmodule_key)
    nw_op = scan_op.Scan(scan_ins, scan_outs, info,
                         typeConstructor=GpuArrayType).make_node(*nw_ins)
    return nw_op.outputs
# Register the inplace optimization for gpuarray scans. It runs at
# priority 75 and is tagged so it is picked up in fast_run mode for the
# 'gpua' back-end only (gpua_flag selects the matching scan nodes).
optdb.register('gpua_scanOp_make_inplace',
               scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType,
                                             gpua_flag=True),
               75,
               'gpua',
               'fast_run',
               'inplace',
               'scan')
...@@ -7,6 +7,7 @@ import theano ...@@ -7,6 +7,7 @@ import theano
from theano import tensor, gof from theano import tensor, gof
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
......
import numpy import numpy
import theano import theano
from theano import tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda
import theano.sandbox.gpuarray from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu
)
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
# Skip the whole test module when pygpu is not installed.
if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")

# If the old cuda back-end is available but pygpu was not activated,
# initialize the device through the old back-end first, then hand the
# cuda context over to the gpuarray back-end.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
    if not cuda_ndarray.use.device_number:
        cuda_ndarray.use('gpu')
    theano.sandbox.gpuarray.init_dev('cuda')

if not theano.sandbox.gpuarray.pygpu_activated:
    raise SkipTest("pygpu disabled")

# Compilation modes used by the tests: with the gpuarray optimizations
# enabled (and the old 'gpu' tag excluded), and without them.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def test_flatten(): def test_flatten():
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], m.flatten(), mode=mode_with_gpu) f = theano.function([m], m.flatten(), mode=mode_with_gpu)
...@@ -104,3 +89,20 @@ def test_local_gpualloc_memset_0(): ...@@ -104,3 +89,20 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0 assert not topo[0].op.memset_0
assert (numpy.asarray(f(2)) == 1).all() assert (numpy.asarray(f(2)) == 1).all()
def test_rebroadcast():
    """Rebroadcast should operate directly on GPU-typed variables."""
    data = numpy.random.rand(10, 10).astype('float32')
    v = theano.tensor.fmatrix()
    up = tensor.unbroadcast(v.sum().dimshuffle('x', 'x'), 0, 1)
    f = theano.function([v], [up], mode=mode_with_gpu)
    f(data)
    nodes = f.maker.fgraph.toposort()
    rebroadcasts = [n for n in nodes
                    if isinstance(n.op, tensor.Rebroadcast)]
    assert len(rebroadcasts) == 1
    node = rebroadcasts[0]
    # Both sides of the Rebroadcast must live on the GPU.
    assert isinstance(node.inputs[0].type, GpuArrayType)
    assert isinstance(node.outputs[0].type, GpuArrayType)
from unittest import TestCase
import numpy
import theano
from theano.tests import unittest_tools as utt
import theano.sandbox.rng_mrg
from theano.sandbox.gpuarray.basic_ops import (
gpu_from_host, GpuFromHost, HostFromGpu
)
from theano.sandbox.gpuarray.elemwise import GpuElemwise
from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
class T_Scan(TestCase):
    """Tests that theano.scan is lifted to the new gpuarray back-end and
    that no host<->GPU transfers remain inside the inner loop."""

    def setUp(self):
        utt.seed_rng()

    def test_one_sequence_one_output_weights_gpu1(self):
        # Simple RNN whose outer output is explicitly moved to the GPU;
        # after optimization no HostFromGpu should remain in the graph.
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        # InputToGpuOptimizer is excluded so inputs start on the host.
        mode = mode_with_gpu.excluding('InputToGpuOptimizer')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      x0,
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode)

        output = gpu_from_host(output)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode)

        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        v_u = numpy.asarray(v_u, dtype='float32')
        v_x0 = numpy.asarray(v_x0, dtype='float32')
        W = numpy.asarray(W, dtype='float32')
        W_in = numpy.asarray(W_in, dtype='float32')

        # compute the output in numpy
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in xrange(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        theano_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(theano_values, v_out)

        # TO DEL
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]

        topo = f2.maker.fgraph.toposort()
        # Output stays on the GPU, so no transfer back to the host; the
        # four inputs are each transferred once.
        assert sum([isinstance(node.op, HostFromGpu)
                    for node in topo]) == 0
        assert sum([isinstance(node.op, GpuFromHost)
                    for node in topo]) == 4

        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any([isinstance(node.op, GpuElemwise)
                    for node in scan_node_topo])
        assert not any([isinstance(node.op, HostFromGpu)
                        for node in scan_node_topo])
        assert not any([isinstance(node.op, GpuFromHost)
                        for node in scan_node_topo])

    # This second version test the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      x0,
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode_with_gpu)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode_with_gpu)

        # get random initial values
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in xrange(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        theano_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(theano_values, v_out)

        topo = f2.maker.fgraph.toposort()
        # Here the output returns to the host, so exactly one HostFromGpu.
        assert sum([isinstance(node.op, HostFromGpu)
                    for node in topo]) == 1
        assert sum([isinstance(node.op, GpuFromHost)
                    for node in topo]) == 4

        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any([isinstance(node.op, GpuElemwise)
                    for node in scan_node_topo])
        assert not any([isinstance(node.op, HostFromGpu)
                        for node in scan_node_topo])
        assert not any([isinstance(node.op, GpuFromHost)
                        for node in scan_node_topo])

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W,
                    theano.tensor.cast(u_t + x_tm1, 'int64'))

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      [x0, None],
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode_with_gpu)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode_with_gpu)

        # get random initial values
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out1 = numpy.zeros((4,))
        v_out2 = numpy.zeros((4,), dtype='int64')
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in xrange(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = numpy.int64(v_u[step] + v_out1[step - 1])

        theano_out1, theano_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(theano_out1, v_out1)
        utt.assert_allclose(theano_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo
                     if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        # The scan must have been lifted to the gpuarray back-end.
        assert scan_node.op.gpua

        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert not any([isinstance(node.op, HostFromGpu)
                        for node in scan_node_topo])
        assert not any([isinstance(node.op, GpuFromHost)
                        for node in scan_node_topo])

    def test_gpu4_gibbs_chain(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_vsample = numpy.array(rng.binomial(1, .5, size=(3, 20),),
                                dtype='float32')
        vsample = theano.shared(v_vsample)
        trng = theano.sandbox.rng_mrg.MRG_RandomStreams(
            utt.fetch_seed())

        def f(vsample_tm1):
            return trng.binomial(vsample_tm1.shape, n=1, p=0.3,
                                 dtype='float32') * vsample_tm1

        theano_vsamples, updates = theano.scan(f,
                                               [],
                                               vsample,
                                               [],
                                               n_steps=10,
                                               truncate_gradient=-1,
                                               go_backwards=False,
                                               mode=mode_with_gpu)
        my_f = theano.function([],
                               theano_vsamples[-1],
                               updates=updates,
                               allow_input_downcast=True,
                               mode=mode_with_gpu)

        # I leave this to tested by debugmode, this test was anyway
        # more of does the graph compile kind of test
        t_result = my_f()
...@@ -56,23 +56,24 @@ class Scan(PureOp): ...@@ -56,23 +56,24 @@ class Scan(PureOp):
the scan op (like number of different types of the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or arguments, name, mode, if it should run on GPU or
not, etc.) not, etc.)
:param typeConstructor: function that constructs a Theano TensorType :param typeConstructor: function that constructs an equivalent
able to represent a float32 ndarray. to Theano TensorType
Note: ``typeConstructor`` had been added to refactor how Theano
deals with the GPU. If it runs on the GPU, scan needs to construct Note: ``typeConstructor`` had been added to refactor how
certain outputs (those who reside in the GPU memory) as CudaNdarray. Theano deals with the GPU. If it runs on the GPU, scan needs
However we can not import cuda in this file (as it is in sandbox, to construct certain outputs (those who reside in the GPU
and not available on each machine) so the workaround is that the GPU memory) as the GPU-specific type. However we can not import
optimization (which is aware of cuda types) passes to the gpu code in this file (as it is in sandbox, and not available
constructor of this class a function that is able to construct on each machine) so the workaround is that the GPU
CudaNdarray. This way the class Scan does not need to be aware of optimization passes to the constructor of this class a
CudaNdarray, it just constructs any float32 tensor using this function that is able to construct a GPU type. This way the
function (which by default constructs normal tensors). Note that the class Scan does not need to be aware of the details for the
second assumption in this code is that any float32 output or input GPU, it just constructs any tensor using this function (which
will be moved on the GPU if the optimization gets applied (following by default constructs normal tensors).
Theano's philosophy of moving as much as possible on gpu).
""" """
if 'gpua' not in info:
info['gpua'] = False
# adding properties into self # adding properties into self
self.inputs = inputs self.inputs = inputs
self.outputs = outputs self.outputs = outputs
...@@ -95,23 +96,10 @@ class Scan(PureOp): ...@@ -95,23 +96,10 @@ class Scan(PureOp):
# Not that for mit_mot there are several output slices per # Not that for mit_mot there are several output slices per
# output sequence # output sequence
o = outputs[idx] o = outputs[idx]
# Scan assumes that only variables of dtype float32 might need a self.output_types.append(
# special constructor (i.e. CudaNdarray constructor) when the typeConstructor(
# code is running on GPU, as it is the only type supported by broadcastable=(False,) + o.type.broadcastable,
# Theano yet. Therefore only for dtype float32 we use the passed dtype=o.type.dtype))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
idx += len(self.mit_mot_out_slices[jdx]) idx += len(self.mit_mot_out_slices[jdx])
jdx += 1 jdx += 1
...@@ -120,23 +108,11 @@ class Scan(PureOp): ...@@ -120,23 +108,11 @@ class Scan(PureOp):
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
for o in outputs[idx:end]: for o in outputs[idx:end]:
# Scan assumes that only variables of dtype float32 might need a self.output_types.append(
# special constructor (i.e. CudaNdarray constructor) when the typeConstructor(
# code is running on GPU, as it is the only type supported by broadcastable=(False,) + o.type.broadcastable,
# Theano yet. Therefore only for dtype float32 we use the passed dtype=o.type.dtype))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
# shared outputs + possibly the ending condition # shared outputs + possibly the ending condition
for o in outputs[end:]: for o in outputs[end:]:
self.output_types.append(o.type) self.output_types.append(o.type)
...@@ -182,14 +158,14 @@ class Scan(PureOp): ...@@ -182,14 +158,14 @@ class Scan(PureOp):
self.n_shared_outs) self.n_shared_outs)
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
if not self.info['gpu']: if self.info['gpu'] or self.info['gpua']:
self._hash_inner_graph = self.info['gpu_hash']
else:
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs, tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
self.outputs) self.outputs)
local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False) local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
self._hash_inner_graph = hash(self._cmodule_key) self._hash_inner_graph = hash(self._cmodule_key)
else:
self._hash_inner_graph = self.info['gpu_hash']
def make_node(self, *inputs): def make_node(self, *inputs):
""" """
......
...@@ -537,10 +537,11 @@ class PushOutSeqScan(gof.Optimizer): ...@@ -537,10 +537,11 @@ class PushOutSeqScan(gof.Optimizer):
class ScanInplaceOptimizer(Optimizer): class ScanInplaceOptimizer(Optimizer):
"""Graph optimizer for Scan(makes it run inplace)""" """Graph optimizer for Scan(makes it run inplace)"""
def __init__(self, typeConstructor=None, gpu_flag=False): def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False):
Optimizer.__init__(self) Optimizer.__init__(self)
self.typeConstructor = typeConstructor self.typeConstructor = typeConstructor
self.gpu_flag = gpu_flag self.gpu_flag = gpu_flag
self.gpua_flag = gpua_flag
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
...@@ -551,7 +552,8 @@ class ScanInplaceOptimizer(Optimizer): ...@@ -551,7 +552,8 @@ class ScanInplaceOptimizer(Optimizer):
nodes = fgraph.toposort() nodes = fgraph.toposort()
scan_nodes = [x for x in nodes scan_nodes = [x for x in nodes
if (isinstance(x.op, scan_op.Scan) and if (isinstance(x.op, scan_op.Scan) and
x.op.info['gpu'] == self.gpu_flag)] x.op.info['gpu'] == self.gpu_flag and
x.op.info['gpua'] == self.gpua_flag)]
for scan_idx in xrange(len(scan_nodes)): for scan_idx in xrange(len(scan_nodes)):
node = scan_nodes[scan_idx] node = scan_nodes[scan_idx]
op = node.op op = node.op
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论