提交 3a190f98 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3533 from abergeron/multi_gpu_followup

Multi gpu followup
......@@ -24,8 +24,8 @@ before_install:
- conda update --yes conda
install:
- if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then conda create --yes -q -n pyenv mkl python=2.6 numpy=1.7.1 scipy=0.11 nose=1.3.0 pyparsing=1.5 pip flake8==2.3 six==1.9.0 pep8==1.6.2 pyflakes==0.8.1; fi
- if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda create --yes -q -n pyenv mkl python=3.3 numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pyparsing=1.5 pip flake8==2.3 six==1.9.0 pep8==1.6.2 pyflakes==0.8.1; fi
- if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then conda create --yes -q -n pyenv python=2.6 numpy=1.7.1 scipy=0.11 nose=1.3.0 pyparsing=1.5 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1; fi
- if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda create --yes -q -n pyenv python=3.3 numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pyparsing=1.5 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1; fi
- source activate pyenv
- if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install pydot; fi
- pip install . --no-deps
......
......@@ -167,6 +167,25 @@ overridden.
For more details you can go see the documentation for :ref:`type`.
Additional definitions
----------------------
For certain mechanisms, you can register functions and other such
things to plug your type into Theano's mechanisms. These are optional
but will allow people to use your type with familiar interfaces.
`transfer()`
~~~~~~~~~~~~
To plug in additional options for the transfer target, define a
function which takes a theano variable and a target argument and
returns either a new transferred variable (which can be the same as
the input if no transfer is necessary) or returns None if the transfer
can't be done.
Then register that function by calling :func:`register_transfer()`
with it as argument.
Defining double
===============
......
......@@ -427,7 +427,8 @@ TensorVariable
you'll want to call.
.. class:: _tensor_py_operators(object)
.. autoclass:: _tensor_py_operators
:members:
This mix-in class adds convenient attributes, methods, and support
to TensorVariable, TensorConstant and TensorSharedVariable for
......
......@@ -121,6 +121,9 @@ class ContextsParam(ConfigParam):
s = v.split('->')
if len(s) != 2:
raise ValueError("Malformed context map: %s" % (v,))
if (s[0] == 'cpu' or s[0].startswith('cuda') or
s[0].startswith('opencl')):
raise ValueError("Cannot use %s as context name" % (s[0],))
return val
ConfigParam.__init__(self, '', filter, False)
......@@ -132,6 +135,8 @@ AddConfigVar(
'name->dev_name' format. An example that would map name 'test' to
device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
"test->cuda0;test2->opencl0:0".
Invalid context names are 'cpu', 'cuda*' and 'opencl*'
""", ContextsParam(), in_c_key=False)
AddConfigVar(
......@@ -150,7 +155,7 @@ def default_cuda_root():
return ''
for dir in s.split(os.path.pathsep):
if os.path.exists(os.path.join(dir, "nvcc")):
return os.path.split(dir)[0]
return os.path.dirname(os.path.abspath(dir))
return ''
AddConfigVar(
......
......@@ -276,7 +276,18 @@ def struct_gen(args, struct_builders, blocks, sub):
%(storage_decl)s
%(struct_decl)s
%(name)s() {}
%(name)s() {
// This is only somewhat safe because we:
// 1) Are not a virtual class
// 2) Do not use any virtual classes in the members
// 3) Deal with mostly POD and pointers
// If this changes, we would have to revise this, but for
// now I am tired of chasing segfaults because
// initialization code had an error and some pointer has
// a junk value.
memset(this, 0, sizeof(*this));
}
~%(name)s(void) {
cleanup();
}
......
......@@ -294,7 +294,7 @@ def raise_with_op(node, thunk=None, exc_info=None, storage_map=None):
detailed_err_msg += "\n"
detailed_err_msg += " TotalSize: %s Byte(s) %.3f GB\n" % (
total_size, total_size / 1024. / 1024 / 1024)
detailed_err_msg += " TotalSize inputs: %s Byte(s) %.3f BG\n" % (
detailed_err_msg += " TotalSize inputs: %s Byte(s) %.3f GB\n" % (
total_size_inputs, total_size_inputs / 1024. / 1024 / 1024)
else:
......
......@@ -17,6 +17,8 @@ from theano.configparser import (
config, AddConfigVar, BoolParam, FloatParam, StrParam)
from . import nvcc_compiler
from theano.tensor.basic import register_transfer
# ignore_newtrees is to speed the optimization as this is the pattern
# we use for optimization. Otherwise, we can iterate 100s of time on
# the graph and apply only a few optimizations each time.
......@@ -327,6 +329,12 @@ if cuda_available:
from . import opt, dnn
from .rng_curand import CURAND_RandomStreams
def transfer(x, target):
    """
    Transfer handler for the old CUDA backend.

    Moves `x` onto the GPU when `target` is 'gpu'; otherwise returns
    None so that other registered handlers may claim the target.
    """
    if target != 'gpu':
        return None
    return as_cuda_ndarray_variable(x)

register_transfer(transfer)
def use(device,
force=False,
......
......@@ -162,11 +162,15 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def cuda_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None):
allow_downcast=None, borrow=False,
broadcastable=None, target='gpu'):
"""
SharedVariable Constructor for CudaNdarrayType.
"""
if target != 'gpu':
raise TypeError('not for gpu')
# THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD
# SO INT shared vars, float64 shared vars, etc. all end up on the card.
# THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT.
......@@ -196,12 +200,15 @@ def cuda_shared_constructor(value, name=None, strict=False,
def float32_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None):
allow_downcast=None, borrow=False,
broadcastable=None, target='gpu'):
"""
SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or
CudaNdarray.
"""
if target != 'gpu':
raise TypeError('not for gpu')
if theano.sandbox.cuda.use.device_number is None:
theano.sandbox.cuda.use("gpu",
force=True,
......
......@@ -6,6 +6,8 @@ import theano
from theano.configparser import config, AddConfigVar, BoolParam
from theano.compile import optdb
from theano.tensor.basic import register_transfer
_logger_name = 'theano.sandbox.gpuarray'
_logger = logging.getLogger(_logger_name)
......@@ -22,9 +24,19 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context)
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import opt, nerv
def transfer(x, target):
    # Transfer handler for the gpuarray backend: if `target` names a
    # registered context (get_context raises ContextNotDefined for
    # unknown names), move `x` to that context.
    try:
        get_context(target)
        return as_gpuarray_variable(x, target)
    except ContextNotDefined:
        # Unknown context name: fall through and return None so the
        # dispatcher can try other registered handlers.
        pass

register_transfer(transfer)
def init_dev(dev, name=None):
if pygpu.gpuarray.api_version() != (-10000, 0):
......
......@@ -21,7 +21,8 @@ try:
except ImportError:
pass
from .type import GpuArrayType, GpuArrayConstant, gpu_context_type, get_context
from .type import (GpuArrayType, GpuArrayConstant, gpu_context_type,
get_context, ContextNotDefined)
from .fp16_help import write_w
......@@ -96,8 +97,12 @@ def infer_context_name(*vars):
return v.owner.inputs[0].type.context_name
if len(v.owner.inputs) == 1:
todo.extendleft(v.owner.inputs)
# If we can't find a context we infer None, which is the default
return None
# If we can't find a context try None if it exists
try:
get_context(None)
return None
except ContextNotDefined:
raise ValueError("Could not infer context from inputs")
class Kernel(object):
......@@ -386,29 +391,49 @@ class GpuFromHost(Op):
    def infer_shape(self, node, xshp):
        # Host->GPU transfer preserves the shape of its single input.
        return xshp
    def c_headers(self):
        # Header providing theano_size_check used by the generated
        # c_code below.
        return ["gpuarray_helper.h"]
    def c_header_dirs(self):
        # gpuarray_helper.h lives next to this module, so add this
        # module's directory to the include path.
        return [os.path.dirname(__file__)]
def c_code(self, node, name, inputs, outputs, sub):
return """
PyArrayObject *%(name)s_tmp;
%(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
if (%(name)s_tmp == NULL)
%(fail)s
Py_XDECREF(%(out)s);
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(name)s_tmp),
(ssize_t *)PyArray_STRIDES(%(name)s_tmp),
%(ctx)s,
Py_None);
Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL) {
%(fail)s
if (%(out)s != NULL && GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) &&
theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
int err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
PyArray_NBYTES(%(name)s_tmp));
Py_DECREF(%(name)s_tmp);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
%(fail)s;
}
} else {
Py_XDECREF(%(out)s);
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(name)s_tmp),
(ssize_t *)PyArray_STRIDES(%(name)s_tmp),
%(ctx)s,
Py_None);
Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL) {
%(fail)s
}
}
""" % {'name': name, 'inp': inputs[0], 'ctx': sub['context'],
'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self):
return (7,)
return (8,)
class GpuToGpu(Op):
......
......@@ -17,7 +17,8 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant, get_context
from .type import (GpuArrayType, GpuArrayConstant, get_context,
ContextNotDefined)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
host_from_gpu, GpuToGpu,
HostFromGpu, GpuFromHost,
......@@ -164,9 +165,9 @@ class InputToGpuOptimizer(Optimizer):
if isinstance(input.type, GpuArrayType):
continue
if (len(input.clients) == 1 and
(input.clients[0][0] == 'output' or
isinstance(input.clients[0][0].op, GpuFromHost))):
# If all clients are outputs or transfers don't do anything.
if (all(cl[0] == 'output' or isinstance(cl[0].op, GpuFromHost)
for cl in input.clients)):
continue
ctx_name = getattr(input.tag, 'context_name', None)
......@@ -177,11 +178,11 @@ class InputToGpuOptimizer(Optimizer):
except TypeError:
# This could fail if the inputs are not TensorTypes
pass
except ValueError:
except ContextNotDefined:
if hasattr(input.tag, 'context_name'):
raise
# If there is no context tag and no default context
# then it stays on the CPU
if not hasattr(input.tag, 'context_name'):
raise
pass
......@@ -194,7 +195,7 @@ def local_cut_gpu_transfers(node):
# gpu[ab] -> host -> gpub
if (isinstance(node.op, GpuFromHost) and
node.inputs[0].owner and
node.inputs[0].owner.op == host_from_gpu):
isinstance(node.inputs[0].owner.op, HostFromGpu)):
other = node.inputs[0].owner.inputs[0]
if node.op.context_name == other.type.context_name:
return [other]
......@@ -202,7 +203,7 @@ def local_cut_gpu_transfers(node):
return [GpuToGpu(node.op.context_name)(other)]
# ? -> gpua -> host
elif (node.op == host_from_gpu and
elif (isinstance(node.op, HostFromGpu) and
node.inputs[0].owner):
n2 = node.inputs[0].owner
......@@ -255,7 +256,7 @@ def local_gpuaalloc2(node):
"""
try:
get_context(None)
except ValueError:
except ContextNotDefined:
# If there is no default context then we do not perform the move here.
return
if (isinstance(node.op, tensor.Alloc) and
......@@ -620,6 +621,7 @@ def local_gpua_careduce(node, context_name):
node.op.scalar_op, axis=node.op.axis,
dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None))
x.tag.context_name = context_name
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
# be None
......
......@@ -17,6 +17,10 @@ except ImportError:
_context_reg = {}
class ContextNotDefined(ValueError):
    """
    Raised when a requested context name has not been registered.

    Subclasses ValueError for backward compatibility: older callers
    that catch ValueError around `get_context` keep working.
    """
def reg_context(name, ctx):
"""
Register a context by mapping it to a name.
......@@ -56,7 +60,7 @@ def get_context(name):
"""
if name not in _context_reg:
raise ValueError("context name %s not defined" % (name,))
raise ContextNotDefined("context name %s not defined" % (name,))
return _context_reg[name]
......@@ -72,7 +76,7 @@ def _name_for_ctx(ctx):
for k, v in _context_reg:
if v == ctx:
return k
raise ValueError('context is not registered')
raise ContextNotDefined('context is not registered')
# This is a private method for use by the tests only
......@@ -88,6 +92,8 @@ class GpuArrayType(Type):
self.ndim = len(self.broadcastable)
self.name = name
self.context_name = context_name
# This will check that the passed context name is valid and registered.
get_context(self.context_name)
try:
self.typecode = gpuarray.dtype_to_typecode(self.dtype)
except gpuarray.GpuArrayException:
......@@ -468,27 +474,29 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
def gpuarray_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False,
broadcastable=None,
context_name=None):
broadcastable=None, target=None):
"""
SharedVariable constructor for GpuArrayType.
"""
if target == 'gpu' or target == 'cpu':
raise TypeError('not for me')
if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
raise TypeError('ndarray or GpuArray required')
try:
get_context(context_name)
except ValueError:
get_context(target)
except ContextNotDefined:
# Don't make this a hard error if we attempt to make a shared
# variable while there is no default context.
if context_name is None:
if target is None:
raise TypeError('No default context and no context specified')
raise
if broadcastable is None:
broadcastable = (False,) * value.ndim
type = GpuArrayType(value.dtype, broadcastable, context_name=context_name)
type = GpuArrayType(value.dtype, broadcastable, context_name=target)
deviceval = pygpu.gpuarray.array(value, copy=(not borrow),
context=type.context)
return GpuArraySharedVariable(type=type, value=deviceval, name=name,
......
......@@ -2851,11 +2851,46 @@ class Alloc(gof.Op):
return False
return True
alloc = Alloc()
pprint.assign(alloc, printing.FunctionPrinter('alloc'))
def transfer(var, target):
    """
    Return a version of `var` transferred to `target`.

    'cpu' means a TensorType (on the CPU). Other types may define
    additional targets by registering a handler with
    :func:`register_transfer`.

    Parameters
    ----------
    var : variable
        A theano variable.
    target : str
        The target of the transfer.

    Raises
    ------
    ValueError
        If no registered handler accepts `target`.
    """
    if target == 'cpu':
        return as_tensor_variable(var)
    else:
        # Ask each registered handler in registration order; a handler
        # returns None to signal "not my target".
        for trans in transfer._others:
            res = trans(var, target)
            if res is not None:
                return res
    raise ValueError("Can't transfer to target %s" % (target,))

# Registry of alternative transfer handlers (populated via
# register_transfer by backends such as cuda and gpuarray).
transfer._others = []


def register_transfer(fn):
    """
    Register a transfer function for alternative targets.

    Parameters
    ----------
    fn : callable
        Called as ``fn(var, target)``; must return the transferred
        variable, or None if it does not handle `target`.
    """
    transfer._others.append(fn)
"""Create a duplicate of `a` (with duplicated storage)"""
tensor_copy = elemwise.Elemwise(scal.identity)
pprint.assign(tensor_copy, printing.IgnorePrinter())
......
......@@ -24,7 +24,7 @@ class TensorSharedVariable(_tensor_py_operators, SharedVariable):
@shared_constructor
def tensor_constructor(value, name=None, strict=False, allow_downcast=None,
borrow=False, broadcastable=None):
borrow=False, broadcastable=None, target='cpu'):
"""
SharedVariable Constructor for TensorType.
......@@ -36,6 +36,9 @@ def tensor_constructor(value, name=None, strict=False, allow_downcast=None,
The optional `broadcastable` argument will override this default.
"""
if target != 'cpu':
raise TypeError('not for cpu')
if not isinstance(value, numpy.ndarray):
raise TypeError()
......@@ -65,7 +68,7 @@ class ScalarSharedVariable(_tensor_py_operators, SharedVariable):
@shared_constructor
def scalar_constructor(value, name=None, strict=False, allow_downcast=None,
borrow=False):
borrow=False, target='cpu'):
"""
SharedVariable constructor for scalar values. Default: int64 or float64.
......@@ -78,6 +81,9 @@ def scalar_constructor(value, name=None, strict=False, allow_downcast=None,
borrow, as it is a hint to Theano that we can reuse it.
"""
if target != 'cpu':
raise TypeError('not for cpu')
if not isinstance(value, (numpy.number, float, int, complex)):
raise TypeError()
try:
......
......@@ -29,7 +29,7 @@ class AsTensorError(TypeError):
pass
class _tensor_py_operators:
class _tensor_py_operators(object):
# UNARY
def __abs__(self):
return theano.tensor.basic.abs_(self)
......@@ -369,6 +369,19 @@ class _tensor_py_operators:
    def diagonal(self, offset=0, axis1=0, axis2=1):
        # Thin delegation to the module-level implementation; mirrors
        # numpy.ndarray.diagonal's signature.
        return theano.tensor.basic.diagonal(self, offset, axis1, axis2)
# Transfer the data to another device
def transfer(self, target):
"""
If `target` is `'cpu'` this will transfer to a TensorType (if
not already one). Other types may define additional targets.
Paramters
---------
target : str
The desired location of the output variable
"""
return theano.tensor.transfer(self, target)
# Elemwise
def arccos(self):
return theano.tensor.arccos(self)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论