Commit d8cc425e authored by Arnaud Bergeron

Add some optimizations and start work on the ops needed for a simple NN.

Parent cd18dd6b
...@@ -43,6 +43,8 @@ if pygpu: ...@@ -43,6 +43,8 @@ if pygpu:
# TODO add optimization tags here (when we will have some) # TODO add optimization tags here (when we will have some)
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'inplace')
optdb.add_tags('gpuarray_after_fusion', 'fast_run', 'inplace')
elif config.gpuarray.init_device != '': elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device) init_dev(config.gpuarray.init_device)
else: else:
......
...@@ -31,6 +31,9 @@ class HostFromGpu(Op): ...@@ -31,6 +31,9 @@ class HostFromGpu(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
    def __str__(self):
        # Short, stable name used when printing the graph; the '(gpuarray)'
        # suffix distinguishes this op from the cuda backend's HostFromGpu.
        return 'HostFromGpu(gpuarray)'
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, GpuArrayType): if not isinstance(x.type, GpuArrayType):
raise TypeError(x) raise TypeError(x)
...@@ -110,6 +113,9 @@ class GpuFromHost(Op): ...@@ -110,6 +113,9 @@ class GpuFromHost(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
    def __str__(self):
        # Short, stable name used when printing the graph; the '(gpuarray)'
        # suffix distinguishes this op from the cuda backend's GpuFromHost.
        return 'GpuFromHost(gpuarray)'
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, tensor.TensorType): if not isinstance(x.type, tensor.TensorType):
raise TypeError(x) raise TypeError(x)
...@@ -193,6 +199,9 @@ class GpuFromCuda(Op): ...@@ -193,6 +199,9 @@ class GpuFromCuda(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
    def __str__(self):
        # Short, stable name used when printing the graph.
        return 'GpuFromCuda'
def make_node(self, x): def make_node(self, x):
from theano.sandbox.cuda import CudaNdArrayType from theano.sandbox.cuda import CudaNdArrayType
if not isinstance(x.type, CudaNdArrayType): if not isinstance(x.type, CudaNdArrayType):
...@@ -328,6 +337,9 @@ class CudaFromGpu(Op): ...@@ -328,6 +337,9 @@ class CudaFromGpu(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self):
return 'GpuFromCuda'
def make_node(self, x): def make_node(self, x):
from theano.sandbox.cuda import CudaNdArrayType from theano.sandbox.cuda import CudaNdArrayType
if not isinstance(x.type, GpuArrayType): if not isinstance(x.type, GpuArrayType):
...@@ -412,3 +424,44 @@ class CudaFromGpu(Op): ...@@ -412,3 +424,44 @@ class CudaFromGpu(Op):
cuda_from_gpu = CudaFromGpu() cuda_from_gpu = CudaFromGpu()
class GpuAlloc(Op):
    """Allocate a GpuArray of the given shape and fill it with `value`.

    `value` must have the same number of dimensions as there are shape
    arguments; it is broadcast into the freshly allocated output buffer.
    """

    # All instances are interchangeable: compare and hash by type, matching
    # the convention of the other ops in this file (HostFromGpu, etc.).
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return 'GpuAlloc'

    def make_node(self, value, *shape):
        """Build the Apply node; each element of `shape` is a scalar int.

        Raises TypeError if `value`'s rank does not match len(shape) or if
        a shape argument is not of an integer dtype.
        """
        v = as_gpuarray_variable(value)
        sh = [tensor.as_tensor_variable(s) for s in shape]
        # Bug fix: this line used `=` (a syntax error) instead of `!=`, and
        # the exception arguments were missing a comma.
        if v.ndim != len(shape):
            raise TypeError(
                'GpuAlloc requires value of same dimensions as shape',
                value, len(shape))
        bcast = []
        for s in sh:
            # Bug fix: the original tested `s.type.dtype[:3] not in
            # ('int', 'uint')`, but a 3-char slice can never equal the
            # 4-char string 'uint', so unsigned shapes were always rejected.
            if not s.type.dtype.startswith(('int', 'uint')):
                raise TypeError('Shape arguments must be integers', s)
            try:
                const_shp = tensor.get_constant_value(s)
            except TypeError:
                const_shp = None
            # A dimension is broadcastable iff its length is known to be 1.
            bcast.append(numpy.all(1 == const_shp))
        otype = GpuArrayType(dtype=v.dtype, broadcastable=bcast)
        return Apply(self, [v] + sh, [otype()])

    def perform(self, node, inputs, outs):
        out, = outs
        v = inputs[0]
        sh = tuple(map(int, inputs[1:]))
        # Reuse the previously allocated output buffer when the shape matches.
        if out[0] is None or out[0].shape != sh:
            out[0] = gpuarray.empty(sh, dtype=v.dtype)
        out[0][...] = v

    def infer_shape(self, node, input_shapes):
        # The output shape is exactly the symbolic shape inputs.
        return [node.inputs[1:]]

    def grad(self, inputs, grads):
        # Bug fix: the parameter was named `input` but the body referenced
        # `inputs`, a NameError.  No gradient is implemented yet.
        return [None for i in inputs]


gpu_alloc = GpuAlloc()
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
# gpu_optimizer collects the local optimizations that move computation onto
# the gpu; gpu_cut_copies removes redundant host<->gpu transfer pairs left
# behind by them.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
# Run the local optimizations to a fixpoint first (position 1), then cut
# the leftover transfers (position 2).
gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')
# Register the sequence in the global optimizer database twice: once just
# before the destroy handler, and again (through a ProxyDB so the same DB
# object is not registered twice) right after elemwise fusion, so transfers
# introduced by fusion can also be optimized.  The numeric fallbacks
# (49.5, 71) mirror the usual positions of those passes — TODO confirm they
# stay in sync with theano.compile.optdb.
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpu')
optdb.register('gpuarray_after_fusion', ProxyDB(gpu_seqopt),
               optdb.__position__.get('elemwise_fusion', 71) + 1,
               'gpu')
def register_opt(*tags, **kwargs):
    """Decorator that registers a local optimizer in `gpu_optimizer`.

    The optimizer is registered under ``kwargs['name']`` when provided,
    otherwise under the decorated function's own name, always with the
    'fast_run' and 'gpu' tags plus any extra `tags`.
    """
    def deco(local_opt):
        opt_name = (kwargs and kwargs.pop('name')) or local_opt.__name__
        gpu_optimizer.register(opt_name, local_opt,
                               'fast_run', 'gpu', *tags)
        return local_opt
    return deco


# Shape-tracking works unchanged on gpu variables, so reuse the tensor opt.
register_opt()(theano.tensor.opt.local_track_shape_i)
class InputToGpuOptimizer(Optimizer):
    "Transfer the input to the gpu to start the rolling wave."

    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
        fgraph.attach_feature(DestroyHandler())

    def apply(self, fgraph):
        for input in fgraph.inputs:
            # Bug fix: both guards below used `return`, which aborted the
            # whole pass at the first input that was already on the gpu (or
            # that only fed an output/transfer).  `continue` skips just
            # that input so the remaining inputs are still processed.
            if isinstance(input.type, GpuArrayType):
                continue
            # Skip inputs whose single client is the output node or an
            # existing gpu_from_host: wrapping them would be a no-op.
            if (len(input.clients) == 1 and
                (input.clients[0][0] == 'output' or
                 input.clients[0][0].op == gpu_from_host)):
                continue
            try:
                new_input = host_from_gpu(gpu_from_host(input))
                fgraph.replace_validate(input, new_input,
                                        "InputToGpuOptimizer")
            except TypeError:
                # gpu_from_host refused the input (not a TensorType);
                # leave it on the host.
                pass


gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([])
def local_cut_gpu_host_gpu(node):
    """Collapse gpu_from_host(host_from_gpu(x)) — and the reverse pair —
    into plain x, removing the round-trip transfer."""
    chk = tensor.opt.opt.check_chain
    if (chk(node, gpu_from_host, host_from_gpu) or
            chk(node, host_from_gpu, gpu_from_host)):
        # node is op(other_op(x)); return the inner x directly.
        return [node.inputs[0].owner.inputs[0]]
    return False
# Cut redundant host<->gpu transfer pairs, and constant-fold transfers of
# constants, inside the gpuarray optimization sequence.
gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu,
                        'fast_run', 'inplace', 'gpu')
gpu_cut_copies.register('cut_gpu_constant_transfers',
                        tensor.opt.constant_folding,
                        'fast_run', 'gpu')
# Also run the cut during canonicalization so that later optimizations see
# the simplified graph.
optdb['canonicalize'].register('local_cut_gpu_host_gpu',
                               local_cut_gpu_host_gpu, 'fast_run', 'gpu')
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment