Commit 44f9d0f7 authored by Frédéric Bastien

Merge pull request #1001 from abergeron/compyte

Support for a new type based on compyte in theano
......@@ -91,6 +91,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \
config.gpuarray.init_device != '':
import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
import numpy
......
......@@ -2,9 +2,8 @@ import os
import logging
import subprocess
# The merged diff left both the pre- and post-merge forms of this import;
# keep only the post-merge form, which adds StrParam (needed by the
# gpuarray.init_device option defined below).
from theano.configparser import (AddConfigVar, BoolParam, ConfigParam, EnumStr,
                                 IntParam, StrParam, TheanoConfigParser)
from theano.misc.cpucount import cpuCount
from theano.misc.windows import call_subprocess_Popen
......@@ -44,20 +43,42 @@ AddConfigVar('int_division',
# gpu means let the driver select the gpu. Needed in case of gpu in
# exclusive mode.
# gpuX mean use the gpu number X.
class DeviceParam(ConfigParam):
    """Configuration parameter for the compute device.

    Accepts any string beginning with 'cpu', 'gpu', 'opencl' or 'cuda'
    (e.g. 'gpu0', 'opencl0:1'); anything else raises ValueError.
    """

    def __init__(self, default, *options, **kwargs):
        self.default = default

        def filter(val):
            # Validate by prefix only; the suffix (device number) is
            # interpreted later by the backend.
            for prefix in ('cpu', 'gpu', 'opencl', 'cuda'):
                if val.startswith(prefix):
                    return val
            raise ValueError(('Invalid value ("%s") for configuration '
                              'variable "%s". Valid options start with '
                              'one of "cpu", "gpu", "opencl", "cuda"'
                              % (val, self.fullname)))

        over = kwargs.get("allow_override", True)
        super(DeviceParam, self).__init__(default, filter, over)

    def __str__(self):
        # Shown in config help listings.
        return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
# BUG FIX: the merged diff left both the removed EnumStr value and the
# added DeviceParam value inside this call, making it invalid.  Keep the
# post-merge state: 'device' is validated by DeviceParam, so any
# 'gpuN'/'openclN'/'cudaN' string is accepted without enumerating them.
AddConfigVar('device',
             ("Default device for computations. If gpu*, change the default to try "
              "to move computation to it and to put shared variable of float32 "
              "on it. Do not use upper case letters, only lower case even if "
              "NVIDIA use capital letters."),
             DeviceParam('cpu', allow_override=False),
             in_c_key=False,
             )
# Device to bring up for pygpu without making it the default compute
# target (unlike 'device', computations are NOT moved automatically).
AddConfigVar('gpuarray.init_device',
             """
Device to initialize for gpuarray use without moving
computations automatically.
""",
             StrParam(''),
             in_c_key=False)
AddConfigVar('init_gpu_device',
("Initialize the gpu device to use, works only if device=cpu. "
"Unlike 'device', setting this option will NOT move computations, "
......
import logging
import theano
from theano.configparser import config
from theano.compile import optdb
# Module logger; gpuarray messages are emitted at WARNING level by default.
_logger_name = 'theano.sandbox.gpuarray'
_logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.WARNING)
error = _logger.error
info = _logger.info

# Flipped to True by init_dev() once a pygpu context is active.
pygpu_activated = False
try:
    import pygpu
    import pygpu.gpuarray
except ImportError:
    # pygpu is optional: keep the name bound so the checks below can
    # test for its availability.
    pygpu = None

# This is for documentation not to depend on the availability of pygpu
from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                  GpuArraySharedVariable, gpuarray_shared_constructor)
import opt
def init_dev(dev):
    """Create a pygpu context on *dev*, make it the default context,
    and record that gpuarray support is active."""
    global pygpu_activated
    ctx = pygpu.init(dev)
    pygpu.set_default_context(ctx)
    pygpu_activated = True
# Import-time initialization: honour the 'device' / 'gpuarray.init_device'
# config options.  Failures are logged, never raised, so importing theano
# still works without a usable GPU.
if pygpu:
    try:
        # device=cuda*/opencl*: initialize AND move computations/shared
        # variables to the GPU by default.
        if (config.device.startswith('cuda') or
                config.device.startswith('opencl')):
            init_dev(config.device)
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'inplace')
        elif config.gpuarray.init_device != '':
            # Initialize the device only; do not move computations to it.
            init_dev(config.gpuarray.init_device)
    except Exception:
        error("Could not initialize pygpu, support disabled", exc_info=True)
else:
    # pygpu could not be imported: only complain if the user actually
    # asked for gpuarray support.
    if (config.gpuarray.init_device != '' or
            config.device.startswith('opencl') or
            config.device.startswith('cuda')):
        error("pygpu was configured but could not be imported", exc_info=True)
Diff is collapsed.
import numpy
from theano import Op, Apply, scalar
try:
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
except ImportError:
pass
from basic_ops import as_gpuarray_variable
from type import GpuArrayType
from theano.gof.utils import MethodNotDefined
def _is_scalar(v):
False
def make_argument(v, name):
    """Build the pygpu kernel-argument descriptor for variable *v*,
    scalar or array, named *name* in the kernel source."""
    arg_cls = ScalarArg if _is_scalar(v) else ArrayArg
    return arg_cls(numpy.dtype(v.type.dtype), name)
def ensure_out(o, ref):
    """Return *o* unless it is None; in that case allocate a fresh
    output shaped like *ref* via its ``_empty_like_me`` method."""
    return ref._empty_like_me() if o is None else o
class GpuElemwise(Op):
    """Elementwise op executed on the GPU through a pygpu ElemwiseKernel.

    Wraps a theano scalar op (``scalar_op``); the kernel source and the
    compiled kernel are built once in ``make_node`` and stashed on the
    Apply node's tag for use by ``perform``.
    """
    # nin/nout mirror the wrapped scalar op's arity.
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

    def __init__(self, scalar_op):
        self.scalar_op = scalar_op
        self.destroy_map = {}

    def __getstate__(self):
        # NOTE(review): 'copy' is not among this module's visible imports,
        # and '_hashval' is never assigned in this class, so the
        # defaultless pop would raise KeyError.  This looks copied from
        # theano's Elemwise -- confirm pickling actually works.
        d = copy.copy(self.__dict__)
        d.pop('__epydoc_asRoutine', None)
        d.pop('_hashval')
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        # NOTE(review): _rehash is not defined in this class -- verify.
        self._rehash()

    def __eq__(self, other):
        # Two GpuElemwise are equal iff they wrap equal scalar ops.
        return (type(self) == type(other) and
                self.scalar_op == other.scalar_op)

    def __hash__(self):
        # Kept consistent with __eq__ above.
        return hash(type(self)) ^ hash(self.scalar_op)

    def __str__(self):
        return "GpuElemwise{%s}(gpuarray)" % (self.scalar_op,)

    def make_node(self, *inputs):
        """Build the Apply node and pre-compile the pygpu kernel.

        Inputs are converted to GpuArray variables; the generated kernel
        source and compiled kernel are stored as ``tag.kcode`` and
        ``tag.kernel`` on the returned Apply.

        Raises TypeError on wrong arity or mismatched input ranks, and
        SupportCodeError when the scalar op needs C support code this
        backend cannot honour.
        """
        _inputs = [as_gpuarray_variable(i) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError("Wrong argument count", (self.nin, len(_inputs)))
        # All inputs must have the same rank.
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('mismatched rank amongst inputs')
        # An output dimension is broadcastable only when it is
        # broadcastable in every input.
        broadcastable = []
        for d in xrange(_inputs[0].type.ndim):
            bcast_d = True
            for i in _inputs:
                if not i.type.broadcastable[d]:
                    bcast_d = False
                    break
            broadcastable.append(bcast_d)
        assert len(broadcastable) == _inputs[0].type.ndim
        assert self.nout > 0
        # Kernel argument descriptors plus scalar stand-ins used to ask
        # the scalar op for its C code.
        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
                enumerate(inputs)]
        scal_ins = [scalar.Scalar(i.dtype) for i in inputs]
        res = Apply(self, _inputs,
                    [GpuArrayType(o.dtype, broadcastable)()
                     for o in self.scalar_op.output_types(scal_ins)])
        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
                enumerate(res.outputs)]
        scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]
        # Fake scalar Apply node: exists only so c_code can be rendered.
        fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                          [o() for o in scal_out])
        kcode = self.scalar_op.c_code(fake_node, 'kcode',
                                      [i.expr() for i in inps],
                                      [o.expr() for o in outs],
                                      sub=dict(fail='return;'))
        res.tag.kcode = kcode
        # Per-apply support code cannot be embedded in the kernel.
        try:
            code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
            if code:
                raise SupportCodeError()
        except MethodNotDefined:
            pass
        support_code = ""
        try:
            support_code += self.scalar_op.c_support_code()
        except MethodNotDefined:
            pass
        # NOTE(review): this comparison also rejects an empty
        # support_code -- confirm whether "" should be accepted too.
        if support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)":
            # Avoid the C++ complex struct
            raise SupportCodeError()
        k = ElemwiseKernel(None, inps+outs, kcode, preamble=support_code)
        res.tag.kernel = k
        return res

    def perform(self, node, inps, out):
        """Run the pre-compiled kernel, allocating any missing output
        like the first input, and write results into the out cells."""
        k = node.tag.kernel
        outs = [ensure_out(o[0], inps[0]) for o in out]
        # the dict call is there to avoid syntax error in python <= 2.5
        k(*(inps+outs), **dict(broadcast=True))
        for o, og in zip(out, outs):
            o[0] = og
class SupportCodeError(Exception):
    """Raised when a scalar op requires C support code that the gpuarray
    backend cannot handle (for example the C++ complex struct)."""
import theano, numpy
from theano import tensor
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
from elemwise import GpuElemwise, _is_scalar
# Optimizer databases for the gpuarray backend: local rewrites, then
# transfer-cutting, sequenced into optdb just before add_destroy_handler.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
# NOTE(review): 'optimiziations' is a typo, but it is a registration key
# and renaming it could break lookups elsewhere -- left as-is.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')
def register_opt(*tags, **kwargs):
    """Decorator factory registering a local optimizer in gpu_optimizer.

    Parameters
    ----------
    *tags : str
        Extra tags to register under (in addition to 'fast_run' and
        'gpuarray').
    name : str, optional (keyword)
        Registration name; defaults to the decorated function's __name__.
    """
    def f(local_opt):
        # Bug fix: the previous expression
        #   (kwargs and kwargs.pop('name')) or local_opt.__name__
        # raised KeyError whenever kwargs contained any key other than
        # 'name'.  pop with a default handles both cases.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f

# Shape tracking works unchanged on GPU variables.
register_opt()(theano.tensor.opt.local_track_shape_i)
class InputToGpuOptimizer(Optimizer):
    """Insert host->gpu transfers on graph inputs to seed the rolling
    wave of GPU rewrites."""

    def add_requirements(self, fgraph):
        # Needed for replace_validate and for tracking destructive ops.
        fgraph.attach_feature(toolbox.ReplaceValidate())
        fgraph.attach_feature(DestroyHandler())

    def apply(self, fgraph):
        for inp in fgraph.inputs:
            # Already a GPU variable: nothing to do.
            if isinstance(inp.type, GpuArrayType):
                continue
            # Sole client is the graph output or already a gpu_from_host:
            # inserting a transfer would be useless churn.
            if (len(inp.clients) == 1 and
                    (inp.clients[0][0] == 'output' or
                     inp.clients[0][0].op == gpu_from_host)):
                continue
            try:
                fgraph.replace_validate(inp,
                                        host_from_gpu(gpu_from_host(inp)),
                                        "InputToGpuOptimizer")
            except TypeError:
                # This could fail if the inputs are not TensorTypes
                pass
# Runs first (position 0) so inputs are on the GPU before other rewrites.
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([])
def local_cut_gpu_host_gpu(node):
    """Collapse gpu_from_host(host_from_gpu(x)) round trips (and the
    symmetric host_from_gpu(gpu_from_host(x))) to plain x."""
    for outer, inner in ((gpu_from_host, host_from_gpu),
                         (host_from_gpu, gpu_from_host)):
        if tensor.opt.opt.check_chain(node, outer, inner):
            return [node.inputs[0].owner.inputs[0]]
    return False
# Remove gpu<->host round trips and fold constants across transfers.
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu,
                        'fast_run', 'inplace', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers',
                        tensor.opt.constant_folding,
                        'fast_run', 'gpuarray')
# Also cut transfers during canonicalization so later passes see the
# simplified graph.
optdb['canonicalize'].register('local_cut_gpua_host_gpua',
                               local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc(node):
    """Replace tensor.alloc by gpu_alloc when the value comes from, or
    the result is headed to, the GPU.

    Returns [new_output] on replacement, None otherwise (the implicit
    convention for local optimizers).
    """
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
            # The value to allocate is already on the GPU.
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
                  for c, idx in node.outputs[0].clients]):
            # Every client moves the result to the GPU anyway.
            replace = True
        elif all([c != 'output' and c.op == tensor.join and
                  all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
                       for i in c.inputs[1:]])
                  for c, idx in node.outputs[0].clients]):
            # Joined only with other GPU-friendly values.
            replace = True
    if replace:
        val = node.inputs[0]
        shp = node.inputs[1:]
        old_out = node.outputs[0]
        # Pad the value to the output rank.  Bug fix: val2 was computed
        # but gpu_alloc was called with the unpadded val (val2 was dead),
        # unlike the matching cuda-backend optimization.
        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
        new_out = host_from_gpu(gpu_alloc(val2, *shp))
        if new_out.type != old_out.type:
            assert new_out.type.ndim == old_out.type.ndim
            assert new_out.type.dtype == old_out.type.dtype
            # The replacement may only be MORE broadcastable than the
            # original; rebroadcast to the old pattern below.
            for b_old, b_new in zip(old_out.type.broadcastable,
                                    new_out.type.broadcastable):
                assert b_new or (not b_old)
            # Bug fix: was tensor.patternbroadcast(new_out. old_out.broadcastable)
            # -- a dot instead of a comma, i.e. a single-argument call on a
            # nonexistent attribute, failing at runtime.
            new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
        return [new_out]
@register_opt()
@local_optimizer([])
def local_gpu_elemwise(node):
    """Move Elemwise computations to the GPU as GpuElemwise.

    Handles two patterns: gpu_from_host(Elemwise(...)) -- rewrite the
    inner Elemwise and return the GPU variable directly -- and
    Elemwise(..., host_from_gpu(x), ...) -- rewrite and transfer back.
    """
    replace = False
    wants_gpu_output = False
    # Pattern 1: gpu_from_host(Elemwise) with a single client.
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if (host_input.owner and
                isinstance(host_input.owner.op, tensor.Elemwise) and
                len(host_input.clients) == 1):
            node = host_input.owner
            replace = True
            wants_gpu_output = True
    # Pattern 2: some input already lives on the GPU.
    if isinstance(node.op, tensor.Elemwise):
        if numpy.any([i.owner and
                      i.owner.op == host_from_gpu
                      for i in node.inputs]):
            replace = True
        if numpy.all([_is_scalar(i)
                      for i in node.inputs]):
            # All-scalar computations stay on the CPU.
            replace = False
    if not replace:
        return False
    new_op = GpuElemwise(node.op.scalar_op)
    result = new_op(*(gpu_from_host(i) for i in node.inputs))
    if wants_gpu_output:
        return [result]
    return [host_from_gpu(result)]
Diff is collapsed.
Markdown formatting supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment