Merge pull request #3533 from abergeron/multi_gpu_followup

Multi gpu followup

Merge pull request #3533 from abergeron/multi_gpu_followup
3a190f98 · Frédéric Bastien · 7415e2f0 · 21b2f50a · 3a190f98 · 3a190f98
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,8 +24,8 @@ before_install:
  - conda update --yes conda
 install:
-  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then conda create --yes -q -n pyenv mkl python=2.6 numpy=1.7.1 scipy=0.11 nose=1.3.0 pyparsing=1.5 pip flake8==2.3 six==1.9.0 pep8==1.6.2 pyflakes==0.8.1; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then conda create --yes -q -n pyenv python=2.6 numpy=1.7.1 scipy=0.11 nose=1.3.0 pyparsing=1.5 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1; fi
-  - if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda create --yes -q -n pyenv mkl python=3.3 numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pyparsing=1.5 pip flake8==2.3 six==1.9.0 pep8==1.6.2 pyflakes==0.8.1; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda create --yes -q -n pyenv python=3.3 numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pyparsing=1.5 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1; fi
  - source activate pyenv
  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install pydot; fi
  - pip install . --no-deps

--- a/doc/extending/type.txt
+++ b/doc/extending/type.txt
@@ -167,6 +167,25 @@ overridden.
 For more details you can go see the documentation for :ref:`type`.
+Additional definitions
+----------------------
+For certain mechanisms, you can register functions and other such
+things to plug your type into theano's mechanisms.  These are optional
+but will allow people to use your type with familiar interfaces.
+`transfer()`
+~~~~~~~~~~~~
+To plug in additional options for the transfer target, define a
+function which takes a theano variable and a target argument and
+returns either a new transferred variable (which can be the same as
+the input if no transfer is necessary) or returns None if the transfer
+can't be done.
+Then register that function by calling :func:`register_transfer()`
+with it as argument.
 Defining double
 ===============

--- a/doc/library/tensor/basic.txt
+++ b/doc/library/tensor/basic.txt
@@ -427,7 +427,8 @@ TensorVariable
    you'll want to call.
-.. class:: _tensor_py_operators(object)
+.. autoclass:: _tensor_py_operators
+   :members:
    This mix-in class adds convenient attributes, methods, and support
    to TensorVariable, TensorConstant and TensorSharedVariable for

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -121,6 +121,9 @@ class ContextsParam(ConfigParam):
                s = v.split('->')
                if len(s) != 2:
                    raise ValueError("Malformed context map: %s" % (v,))
+                if (s[0] == 'cpu' or s[0].startswith('cuda') or
+                        s[0].startswith('opencl')):
+                    raise ValueError("Cannot use %s as context name" % (s[0],))
            return val
        ConfigParam.__init__(self, '', filter, False)
@@ -132,6 +135,8 @@ AddConfigVar(
    'name->dev_name' format. An example that would map name 'test' to
    device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
    "test->cuda0;test2->opencl0:0".
+    Invalid context names are 'cpu', 'cuda*' and 'opencl*'
    """, ContextsParam(), in_c_key=False)
 AddConfigVar(
@@ -150,7 +155,7 @@ def default_cuda_root():
        return ''
    for dir in s.split(os.path.pathsep):
        if os.path.exists(os.path.join(dir, "nvcc")):
-            return os.path.split(dir)[0]
+            return os.path.dirname(os.path.abspath(dir))
    return ''
 AddConfigVar(

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -276,7 +276,18 @@ def struct_gen(args, struct_builders, blocks, sub):
        %(storage_decl)s
        %(struct_decl)s
-        %(name)s() {}
+        %(name)s() {
+            // This is only somewhat safe because we:
+            //  1) Are not a virtual class
+            //  2) Do not use any virtual classes in the members
+            //  3) Deal with mostly POD and pointers
+            // If this changes, we would have to revise this, but for
+            // now I am tired of chasing segfaults because
+            // initialization code had an error and some pointer has
+            // a junk value.
+            memset(this, 0, sizeof(*this));
+        }
        ~%(name)s(void) {
            cleanup();
        }

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -294,7 +294,7 @@ def raise_with_op(node, thunk=None, exc_info=None, storage_map=None):
                detailed_err_msg += "\n"
        detailed_err_msg += " TotalSize: %s Byte(s) %.3f GB\n" % (
            total_size, total_size / 1024. / 1024 / 1024)
-        detailed_err_msg += " TotalSize inputs: %s Byte(s) %.3f BG\n" % (
+        detailed_err_msg += " TotalSize inputs: %s Byte(s) %.3f GB\n" % (
            total_size_inputs, total_size_inputs / 1024. / 1024 / 1024)
    else:

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -17,6 +17,8 @@ from theano.configparser import (
    config, AddConfigVar, BoolParam, FloatParam, StrParam)
 from . import nvcc_compiler
+from theano.tensor.basic import register_transfer
 # ignore_newtrees is to speed the optimization as this is the pattern
 # we use for optimization. Otherwise, we can iterate 100s of time on
 # the graph and apply only a few optimizations each time.
@@ -327,6 +329,12 @@ if cuda_available:
    from . import opt, dnn
    from .rng_curand import CURAND_RandomStreams
+    def transfer(x, target):
+        if target == 'gpu':
+            return as_cuda_ndarray_variable(x)
+    register_transfer(transfer)
 def use(device,
        force=False,

--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -162,11 +162,15 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
 def cuda_shared_constructor(value, name=None, strict=False,
-        allow_downcast=None, borrow=False, broadcastable=None):
+                            allow_downcast=None, borrow=False,
+                            broadcastable=None, target='gpu'):
    """
    SharedVariable Constructor for CudaNdarrayType.
    """
+    if target != 'gpu':
+        raise TypeError('not for gpu')
    # THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD
    # SO INT shared vars, float64 shared vars, etc. all end up on the card.
    # THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT.
@@ -196,12 +200,15 @@ def cuda_shared_constructor(value, name=None, strict=False,
 def float32_shared_constructor(value, name=None, strict=False,
-        allow_downcast=None, borrow=False, broadcastable=None):
+                               allow_downcast=None, borrow=False,
+                               broadcastable=None, target='gpu'):
    """
    SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or
    CudaNdarray.
    """
+    if target != 'gpu':
+        raise TypeError('not for gpu')
    if theano.sandbox.cuda.use.device_number is None:
        theano.sandbox.cuda.use("gpu",
                                force=True,

--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -6,6 +6,8 @@ import theano
 from theano.configparser import config, AddConfigVar, BoolParam
 from theano.compile import optdb
+from theano.tensor.basic import register_transfer
 _logger_name = 'theano.sandbox.gpuarray'
 _logger = logging.getLogger(_logger_name)
@@ -22,9 +24,19 @@ except ImportError:
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
-                   reg_context)
+                   reg_context, get_context, ContextNotDefined)
+from .basic_ops import as_gpuarray_variable
 from . import opt, nerv
+def transfer(x, target):
+    try:
+        get_context(target)
+        return as_gpuarray_variable(x, target)
+    except ContextNotDefined:
+        pass
+register_transfer(transfer)
 def init_dev(dev, name=None):
    if pygpu.gpuarray.api_version() != (-10000, 0):

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -21,7 +21,8 @@ try:
 except ImportError:
    pass
-from .type import GpuArrayType, GpuArrayConstant, gpu_context_type, get_context
+from .type import (GpuArrayType, GpuArrayConstant, gpu_context_type,
+                   get_context, ContextNotDefined)
 from .fp16_help import write_w
@@ -96,8 +97,12 @@ def infer_context_name(*vars):
                return v.owner.inputs[0].type.context_name
            if len(v.owner.inputs) == 1:
                todo.extendleft(v.owner.inputs)
-    # If we can't find a context we infer None, which is the default
+    # If we can't find a context try None if it exists
-    return None
+    try:
+        get_context(None)
+        return None
+    except ContextNotDefined:
+        raise ValueError("Could not infer context from inputs")
 class Kernel(object):
@@ -386,29 +391,49 @@ class GpuFromHost(Op):
    def infer_shape(self, node, xshp):
        return xshp
+    def c_headers(self):
+        return ["gpuarray_helper.h"]
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
    def c_code(self, node, name, inputs, outputs, sub):
        return """
        PyArrayObject *%(name)s_tmp;
        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
        if (%(name)s_tmp == NULL)
          %(fail)s
-        Py_XDECREF(%(out)s);
-        %(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
+        if (%(out)s != NULL && GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) &&
-                                     get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
+            theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
-                                     PyArray_NDIM(%(name)s_tmp),
+                              (size_t *)PyArray_DIMS(%(name)s_tmp),
-                                     (size_t *)PyArray_DIMS(%(name)s_tmp),
+                              get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
-                                     (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
+          int err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
-                                     %(ctx)s,
+                                   PyArray_NBYTES(%(name)s_tmp));
-                                     Py_None);
+          Py_DECREF(%(name)s_tmp);
-        Py_DECREF(%(name)s_tmp);
+          if (err != GA_NO_ERROR) {
-        if (%(out)s == NULL) {
+            PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
-            %(fail)s
+            %(fail)s;
+          }
+        } else {
+          Py_XDECREF(%(out)s);
+          %(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
+                                       get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
+                                       PyArray_NDIM(%(name)s_tmp),
+                                       (size_t *)PyArray_DIMS(%(name)s_tmp),
+                                       (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
+                                       %(ctx)s,
+                                       Py_None);
+          Py_DECREF(%(name)s_tmp);
+          if (%(out)s == NULL) {
+              %(fail)s
+          }
        }
        """ % {'name': name, 'inp': inputs[0], 'ctx': sub['context'],
               'out': outputs[0], 'fail': sub['fail']}
    def c_code_cache_version(self):
-        return (7,)
+        return (8,)
 class GpuToGpu(Op):

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -17,7 +17,8 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.nnet.conv import ConvOp
 from theano.tests.breakpoint import PdbBreakpoint
-from .type import GpuArrayType, GpuArrayConstant, get_context
+from .type import (GpuArrayType, GpuArrayConstant, get_context,
+                   ContextNotDefined)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        host_from_gpu, GpuToGpu,
                        HostFromGpu, GpuFromHost,
@@ -164,9 +165,9 @@ class InputToGpuOptimizer(Optimizer):
            if isinstance(input.type, GpuArrayType):
                continue
-            if (len(input.clients) == 1 and
+            # If all clients are outputs or transfers don't do anything.
-                (input.clients[0][0] == 'output' or
+            if (all(cl[0] == 'output' or isinstance(cl[0].op, GpuFromHost)
-                 isinstance(input.clients[0][0].op, GpuFromHost))):
+                    for cl in input.clients)):
                continue
            ctx_name = getattr(input.tag, 'context_name', None)
@@ -177,11 +178,11 @@ class InputToGpuOptimizer(Optimizer):
            except TypeError:
                # This could fail if the inputs are not TensorTypes
                pass
-            except ValueError:
+            except ContextNotDefined:
+                if hasattr(input.tag, 'context_name'):
+                    raise
                # If there is no context tag and no default context
                # then it stays on the CPU
-                if not hasattr(input.tag, 'context_name'):
-                    raise
                pass
@@ -194,7 +195,7 @@ def local_cut_gpu_transfers(node):
    # gpu[ab] -> host -> gpub
    if (isinstance(node.op, GpuFromHost) and
            node.inputs[0].owner and
-            node.inputs[0].owner.op == host_from_gpu):
+            isinstance(node.inputs[0].owner.op, HostFromGpu)):
        other = node.inputs[0].owner.inputs[0]
        if node.op.context_name == other.type.context_name:
            return [other]
@@ -202,7 +203,7 @@ def local_cut_gpu_transfers(node):
            return [GpuToGpu(node.op.context_name)(other)]
    # ? -> gpua -> host
-    elif (node.op == host_from_gpu and
+    elif (isinstance(node.op, HostFromGpu) and
          node.inputs[0].owner):
        n2 = node.inputs[0].owner
@@ -255,7 +256,7 @@ def local_gpuaalloc2(node):
    """
    try:
        get_context(None)
-    except ValueError:
+    except ContextNotDefined:
        # If there is no default context then we do not perform the move here.
        return
    if (isinstance(node.op, tensor.Alloc) and
@@ -620,6 +621,7 @@ def local_gpua_careduce(node, context_name):
            node.op.scalar_op, axis=node.op.axis,
            dtype=getattr(node.op, 'dtype', None),
            acc_dtype=getattr(node.op, 'acc_dtype', None))
+        x.tag.context_name = context_name
        gvar = greduce(x)
        # We need to have the make node called, otherwise the mask can
        # be None

--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -17,6 +17,10 @@ except ImportError:
 _context_reg = {}
+class ContextNotDefined(ValueError):
+    pass
 def reg_context(name, ctx):
    """
    Register a context by mapping it to a name.
@@ -56,7 +60,7 @@ def get_context(name):
    """
    if name not in _context_reg:
-        raise ValueError("context name %s not defined" % (name,))
+        raise ContextNotDefined("context name %s not defined" % (name,))
    return _context_reg[name]
@@ -72,7 +76,7 @@ def _name_for_ctx(ctx):
    for k, v in _context_reg:
        if v == ctx:
            return k
-        raise ValueError('context is not registered')
+        raise ContextNotDefined('context is not registered')
 # This is a private method for use by the tests only
@@ -88,6 +92,8 @@ class GpuArrayType(Type):
        self.ndim = len(self.broadcastable)
        self.name = name
        self.context_name = context_name
+        # This will check that the passed context name is valid and registered.
+        get_context(self.context_name)
        try:
            self.typecode = gpuarray.dtype_to_typecode(self.dtype)
        except gpuarray.GpuArrayException:
@@ -468,27 +474,29 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
 def gpuarray_shared_constructor(value, name=None, strict=False,
                                allow_downcast=None, borrow=False,
-                                broadcastable=None,
+                                broadcastable=None, target=None):
-                                context_name=None):
    """
    SharedVariable constructor for GpuArrayType.
    """
+    if target == 'gpu' or target == 'cpu':
+        raise TypeError('not for me')
    if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
        raise TypeError('ndarray or GpuArray required')
    try:
-        get_context(context_name)
+        get_context(target)
-    except ValueError:
+    except ContextNotDefined:
        # Don't make this a hard error if we attempt to make a shared
        # variable while there is no default context.
-        if context_name is None:
+        if target is None:
            raise TypeError('No default context and no context specified')
        raise
    if broadcastable is None:
        broadcastable = (False,) * value.ndim
-    type = GpuArrayType(value.dtype, broadcastable, context_name=context_name)
+    type = GpuArrayType(value.dtype, broadcastable, context_name=target)
    deviceval = pygpu.gpuarray.array(value, copy=(not borrow),
                                     context=type.context)
    return GpuArraySharedVariable(type=type, value=deviceval, name=name,

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -2851,11 +2851,46 @@ class Alloc(gof.Op):
                return False
        return True
 alloc = Alloc()
 pprint.assign(alloc, printing.FunctionPrinter('alloc'))
+def transfer(var, target):
+    """
+    Return a version of `var` transferred to `target`.
+    `cpu` means a TensorType (on the CPU).  Other types may define
+    additional targets.
+    Parameters
+    ----------
+    var : variable
+        A theano variable
+    target : str
+        The target of the transfer
+    """
+    if target == 'cpu':
+        return as_tensor_variable(var)
+    else:
+        for trans in transfer._others:
+            res = trans(var, target)
+            if res is not None:
+                return res
+    raise ValueError("Can't transfer to target %s" % (target,))
+transfer._others = []
+def register_transfer(fn):
+    """
+    Register a transfer function for alternative targets.
+    Parameters
+    ----------
+    fn : callable
+    """
+    transfer._others.append(fn)
 """Create a duplicate of `a` (with duplicated storage)"""
 tensor_copy = elemwise.Elemwise(scal.identity)
 pprint.assign(tensor_copy, printing.IgnorePrinter())

--- a/theano/tensor/sharedvar.py
+++ b/theano/tensor/sharedvar.py
@@ -24,7 +24,7 @@ class TensorSharedVariable(_tensor_py_operators, SharedVariable):
 @shared_constructor
 def tensor_constructor(value, name=None, strict=False, allow_downcast=None,
-                       borrow=False, broadcastable=None):
+                       borrow=False, broadcastable=None, target='cpu'):
    """
    SharedVariable Constructor for TensorType.
@@ -36,6 +36,9 @@ def tensor_constructor(value, name=None, strict=False, allow_downcast=None,
    The optional `broadcastable` argument will override this default.
    """
+    if target != 'cpu':
+        raise TypeError('not for cpu')
    if not isinstance(value, numpy.ndarray):
        raise TypeError()
@@ -65,7 +68,7 @@ class ScalarSharedVariable(_tensor_py_operators, SharedVariable):
 @shared_constructor
 def scalar_constructor(value, name=None, strict=False, allow_downcast=None,
-                       borrow=False):
+                       borrow=False, target='cpu'):
    """
    SharedVariable constructor for scalar values. Default: int64 or float64.
@@ -78,6 +81,9 @@ def scalar_constructor(value, name=None, strict=False, allow_downcast=None,
    borrow, as it is a hint to Theano that we can reuse it.
    """
+    if target != 'cpu':
+        raise TypeError('not for cpu')
    if not isinstance(value, (numpy.number, float, int, complex)):
        raise TypeError()
    try:

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -29,7 +29,7 @@ class AsTensorError(TypeError):
    pass
-class _tensor_py_operators:
+class _tensor_py_operators(object):
    # UNARY
    def __abs__(self):
        return theano.tensor.basic.abs_(self)
@@ -369,6 +369,19 @@ class _tensor_py_operators:
    def diagonal(self, offset=0, axis1=0, axis2=1):
        return theano.tensor.basic.diagonal(self, offset, axis1, axis2)
+    # Transfer the data to another device
+    def transfer(self, target):
+        """
+        If `target` is `'cpu'` this will transfer to a TensorType (if
+        not already one).  Other types may define additional targets.
+        Parameters
+        ----------
+        target : str
+            The desired location of the output variable
+        """
+        return theano.tensor.transfer(self, target)
    # Elemwise
    def arccos(self):
        return theano.tensor.arccos(self)