Commit 44f9d0f7 authored by Frédéric Bastien

Merge pull request #1001 from abergeron/compyte

Support for a new type based on compyte in theano
......@@ -91,6 +91,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \
config.gpuarray.init_device != '':
import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
import numpy
......
......@@ -2,9 +2,8 @@ import os
import logging
import subprocess
# The merged diff left both the pre- and post-merge forms of this import;
# keep only the post-merge form, which adds StrParam (needed by the
# gpuarray.init_device option defined below).
from theano.configparser import (AddConfigVar, BoolParam, ConfigParam, EnumStr,
                                 IntParam, StrParam, TheanoConfigParser)
from theano.misc.cpucount import cpuCount
from theano.misc.windows import call_subprocess_Popen
......@@ -44,20 +43,42 @@ AddConfigVar('int_division',
# gpu means let the driver select the gpu. Needed in case of gpu in
# exclusive mode.
# gpuX mean use the gpu number X.
class DeviceParam(ConfigParam):
    """Configuration parameter for the compute device.

    Accepts any string beginning with 'cpu', 'gpu', 'opencl' or 'cuda'
    (e.g. 'gpu0', 'opencl0:1'); anything else raises ValueError.
    """

    def __init__(self, default, *options, **kwargs):
        self.default = default

        def filter(val):
            # Validate by prefix only; the suffix (device number) is
            # interpreted later by the backend.
            for prefix in ('cpu', 'gpu', 'opencl', 'cuda'):
                if val.startswith(prefix):
                    return val
            raise ValueError(('Invalid value ("%s") for configuration '
                              'variable "%s". Valid options start with '
                              'one of "cpu", "gpu", "opencl", "cuda"'
                              % (val, self.fullname)))

        over = kwargs.get("allow_override", True)
        super(DeviceParam, self).__init__(default, filter, over)

    def __str__(self):
        # Shown in config help listings.
        return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
# BUG FIX: the merged diff left both the removed EnumStr value and the
# added DeviceParam value inside this call, making it invalid.  Keep the
# post-merge state: 'device' is validated by DeviceParam, so any
# 'gpuN'/'openclN'/'cudaN' string is accepted without enumerating them.
AddConfigVar('device',
             ("Default device for computations. If gpu*, change the default to try "
              "to move computation to it and to put shared variable of float32 "
              "on it. Do not use upper case letters, only lower case even if "
              "NVIDIA use capital letters."),
             DeviceParam('cpu', allow_override=False),
             in_c_key=False,
             )
# Device to bring up for pygpu without making it the default compute
# target (unlike 'device', computations are NOT moved automatically).
AddConfigVar('gpuarray.init_device',
             """
Device to initialize for gpuarray use without moving
computations automatically.
""",
             StrParam(''),
             in_c_key=False)
AddConfigVar('init_gpu_device',
("Initialize the gpu device to use, works only if device=cpu. "
"Unlike 'device', setting this option will NOT move computations, "
......
import logging
import theano
from theano.configparser import config
from theano.compile import optdb
# Module logger; gpuarray messages are emitted at WARNING level by default.
_logger_name = 'theano.sandbox.gpuarray'
_logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.WARNING)
error = _logger.error
info = _logger.info

# Flipped to True by init_dev() once a pygpu context is active.
pygpu_activated = False
try:
    import pygpu
    import pygpu.gpuarray
except ImportError:
    # pygpu is optional: keep the name bound so the checks below can
    # test for its availability.
    pygpu = None

# This is for documentation not to depend on the availability of pygpu
from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                  GpuArraySharedVariable, gpuarray_shared_constructor)
import opt
def init_dev(dev):
    """Create a pygpu context on *dev*, make it the default context,
    and record that gpuarray support is active."""
    global pygpu_activated
    ctx = pygpu.init(dev)
    pygpu.set_default_context(ctx)
    pygpu_activated = True
# Import-time initialization: honour the 'device' / 'gpuarray.init_device'
# config options.  Failures are logged, never raised, so importing theano
# still works without a usable GPU.
if pygpu:
    try:
        # device=cuda*/opencl*: initialize AND move computations/shared
        # variables to the GPU by default.
        if (config.device.startswith('cuda') or
                config.device.startswith('opencl')):
            init_dev(config.device)
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'inplace')
        elif config.gpuarray.init_device != '':
            # Initialize the device only; do not move computations to it.
            init_dev(config.gpuarray.init_device)
    except Exception:
        error("Could not initialize pygpu, support disabled", exc_info=True)
else:
    # pygpu could not be imported: only complain if the user actually
    # asked for gpuarray support.
    if (config.gpuarray.init_device != '' or
            config.device.startswith('opencl') or
            config.device.startswith('cuda')):
        error("pygpu was configured but could not be imported", exc_info=True)
Diff is collapsed.
import numpy
from theano import Op, Apply, scalar
try:
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
except ImportError:
pass
from basic_ops import as_gpuarray_variable
from type import GpuArrayType
from theano.gof.utils import MethodNotDefined
def _is_scalar(v):
False
def make_argument(v, name):
    """Build the pygpu kernel-argument descriptor for variable *v*,
    scalar or array, named *name* in the kernel source."""
    arg_cls = ScalarArg if _is_scalar(v) else ArrayArg
    return arg_cls(numpy.dtype(v.type.dtype), name)
def ensure_out(o, ref):
    """Return *o* unless it is None; in that case allocate a fresh
    output shaped like *ref* via its ``_empty_like_me`` method."""
    return ref._empty_like_me() if o is None else o
class GpuElemwise(Op):
    """Elementwise op executed on the GPU through a pygpu ElemwiseKernel.

    Wraps a theano scalar op (``scalar_op``); the kernel source and the
    compiled kernel are built once in ``make_node`` and stashed on the
    Apply node's tag for use by ``perform``.
    """
    # nin/nout mirror the wrapped scalar op's arity.
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

    def __init__(self, scalar_op):
        self.scalar_op = scalar_op
        self.destroy_map = {}

    def __getstate__(self):
        # NOTE(review): 'copy' is not among this module's visible imports,
        # and '_hashval' is never assigned in this class, so the
        # defaultless pop would raise KeyError.  This looks copied from
        # theano's Elemwise -- confirm pickling actually works.
        d = copy.copy(self.__dict__)
        d.pop('__epydoc_asRoutine', None)
        d.pop('_hashval')
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        # NOTE(review): _rehash is not defined in this class -- verify.
        self._rehash()

    def __eq__(self, other):
        # Two GpuElemwise are equal iff they wrap equal scalar ops.
        return (type(self) == type(other) and
                self.scalar_op == other.scalar_op)

    def __hash__(self):
        # Kept consistent with __eq__ above.
        return hash(type(self)) ^ hash(self.scalar_op)

    def __str__(self):
        return "GpuElemwise{%s}(gpuarray)" % (self.scalar_op,)

    def make_node(self, *inputs):
        """Build the Apply node and pre-compile the pygpu kernel.

        Inputs are converted to GpuArray variables; the generated kernel
        source and compiled kernel are stored as ``tag.kcode`` and
        ``tag.kernel`` on the returned Apply.

        Raises TypeError on wrong arity or mismatched input ranks, and
        SupportCodeError when the scalar op needs C support code this
        backend cannot honour.
        """
        _inputs = [as_gpuarray_variable(i) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError("Wrong argument count", (self.nin, len(_inputs)))
        # All inputs must have the same rank.
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('mismatched rank amongst inputs')
        # An output dimension is broadcastable only when it is
        # broadcastable in every input.
        broadcastable = []
        for d in xrange(_inputs[0].type.ndim):
            bcast_d = True
            for i in _inputs:
                if not i.type.broadcastable[d]:
                    bcast_d = False
                    break
            broadcastable.append(bcast_d)
        assert len(broadcastable) == _inputs[0].type.ndim
        assert self.nout > 0
        # Kernel argument descriptors plus scalar stand-ins used to ask
        # the scalar op for its C code.
        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
                enumerate(inputs)]
        scal_ins = [scalar.Scalar(i.dtype) for i in inputs]
        res = Apply(self, _inputs,
                    [GpuArrayType(o.dtype, broadcastable)()
                     for o in self.scalar_op.output_types(scal_ins)])
        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
                enumerate(res.outputs)]
        scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]
        # Fake scalar Apply node: exists only so c_code can be rendered.
        fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                          [o() for o in scal_out])
        kcode = self.scalar_op.c_code(fake_node, 'kcode',
                                      [i.expr() for i in inps],
                                      [o.expr() for o in outs],
                                      sub=dict(fail='return;'))
        res.tag.kcode = kcode
        # Per-apply support code cannot be embedded in the kernel.
        try:
            code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
            if code:
                raise SupportCodeError()
        except MethodNotDefined:
            pass
        support_code = ""
        try:
            support_code += self.scalar_op.c_support_code()
        except MethodNotDefined:
            pass
        # NOTE(review): this comparison also rejects an empty
        # support_code -- confirm whether "" should be accepted too.
        if support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)":
            # Avoid the C++ complex struct
            raise SupportCodeError()
        k = ElemwiseKernel(None, inps+outs, kcode, preamble=support_code)
        res.tag.kernel = k
        return res

    def perform(self, node, inps, out):
        """Run the pre-compiled kernel, allocating any missing output
        like the first input, and write results into the out cells."""
        k = node.tag.kernel
        outs = [ensure_out(o[0], inps[0]) for o in out]
        # the dict call is there to avoid syntax error in python <= 2.5
        k(*(inps+outs), **dict(broadcast=True))
        for o, og in zip(out, outs):
            o[0] = og
class SupportCodeError(Exception):
    """Raised when a scalar op requires C support code that the gpuarray
    backend cannot handle (for example the C++ complex struct)."""
import theano, numpy
from theano import tensor
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
from elemwise import GpuElemwise, _is_scalar
# Optimizer databases for the gpuarray backend: local rewrites, then
# transfer-cutting, sequenced into optdb just before add_destroy_handler.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
# NOTE(review): 'optimiziations' is a typo, but it is a registration key
# and renaming it could break lookups elsewhere -- left as-is.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')
def register_opt(*tags, **kwargs):
    """Decorator factory registering a local optimizer in gpu_optimizer.

    Parameters
    ----------
    *tags : str
        Extra tags to register under (in addition to 'fast_run' and
        'gpuarray').
    name : str, optional (keyword)
        Registration name; defaults to the decorated function's __name__.
    """
    def f(local_opt):
        # Bug fix: the previous expression
        #   (kwargs and kwargs.pop('name')) or local_opt.__name__
        # raised KeyError whenever kwargs contained any key other than
        # 'name'.  pop with a default handles both cases.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f

# Shape tracking works unchanged on GPU variables.
register_opt()(theano.tensor.opt.local_track_shape_i)
class InputToGpuOptimizer(Optimizer):
    """Insert host->gpu transfers on graph inputs to seed the rolling
    wave of GPU rewrites."""

    def add_requirements(self, fgraph):
        # Needed for replace_validate and for tracking destructive ops.
        fgraph.attach_feature(toolbox.ReplaceValidate())
        fgraph.attach_feature(DestroyHandler())

    def apply(self, fgraph):
        for inp in fgraph.inputs:
            # Already a GPU variable: nothing to do.
            if isinstance(inp.type, GpuArrayType):
                continue
            # Sole client is the graph output or already a gpu_from_host:
            # inserting a transfer would be useless churn.
            if (len(inp.clients) == 1 and
                    (inp.clients[0][0] == 'output' or
                     inp.clients[0][0].op == gpu_from_host)):
                continue
            try:
                fgraph.replace_validate(inp,
                                        host_from_gpu(gpu_from_host(inp)),
                                        "InputToGpuOptimizer")
            except TypeError:
                # This could fail if the inputs are not TensorTypes
                pass
# Runs first (position 0) so inputs are on the GPU before other rewrites.
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([])
def local_cut_gpu_host_gpu(node):
    """Collapse gpu_from_host(host_from_gpu(x)) round trips (and the
    symmetric host_from_gpu(gpu_from_host(x))) to plain x."""
    for outer, inner in ((gpu_from_host, host_from_gpu),
                         (host_from_gpu, gpu_from_host)):
        if tensor.opt.opt.check_chain(node, outer, inner):
            return [node.inputs[0].owner.inputs[0]]
    return False
# Remove gpu<->host round trips and fold constants across transfers.
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu,
                        'fast_run', 'inplace', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers',
                        tensor.opt.constant_folding,
                        'fast_run', 'gpuarray')
# Also cut transfers during canonicalization so later passes see the
# simplified graph.
optdb['canonicalize'].register('local_cut_gpua_host_gpua',
                               local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc(node):
    """Replace tensor.alloc by gpu_alloc when the value comes from, or
    the result is headed to, the GPU.

    Returns [new_output] on replacement, None otherwise (the implicit
    convention for local optimizers).
    """
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
            # The value to allocate is already on the GPU.
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
                  for c, idx in node.outputs[0].clients]):
            # Every client moves the result to the GPU anyway.
            replace = True
        elif all([c != 'output' and c.op == tensor.join and
                  all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
                       for i in c.inputs[1:]])
                  for c, idx in node.outputs[0].clients]):
            # Joined only with other GPU-friendly values.
            replace = True
    if replace:
        val = node.inputs[0]
        shp = node.inputs[1:]
        old_out = node.outputs[0]
        # Pad the value to the output rank.  Bug fix: val2 was computed
        # but gpu_alloc was called with the unpadded val (val2 was dead),
        # unlike the matching cuda-backend optimization.
        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
        new_out = host_from_gpu(gpu_alloc(val2, *shp))
        if new_out.type != old_out.type:
            assert new_out.type.ndim == old_out.type.ndim
            assert new_out.type.dtype == old_out.type.dtype
            # The replacement may only be MORE broadcastable than the
            # original; rebroadcast to the old pattern below.
            for b_old, b_new in zip(old_out.type.broadcastable,
                                    new_out.type.broadcastable):
                assert b_new or (not b_old)
            # Bug fix: was tensor.patternbroadcast(new_out. old_out.broadcastable)
            # -- a dot instead of a comma, i.e. a single-argument call on a
            # nonexistent attribute, failing at runtime.
            new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
        return [new_out]
@register_opt()
@local_optimizer([])
def local_gpu_elemwise(node):
    """Move Elemwise computations to the GPU as GpuElemwise.

    Handles two patterns: gpu_from_host(Elemwise(...)) -- rewrite the
    inner Elemwise and return the GPU variable directly -- and
    Elemwise(..., host_from_gpu(x), ...) -- rewrite and transfer back.
    """
    replace = False
    wants_gpu_output = False
    # Pattern 1: gpu_from_host(Elemwise) with a single client.
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if (host_input.owner and
                isinstance(host_input.owner.op, tensor.Elemwise) and
                len(host_input.clients) == 1):
            node = host_input.owner
            replace = True
            wants_gpu_output = True
    # Pattern 2: some input already lives on the GPU.
    if isinstance(node.op, tensor.Elemwise):
        if numpy.any([i.owner and
                      i.owner.op == host_from_gpu
                      for i in node.inputs]):
            replace = True
        if numpy.all([_is_scalar(i)
                      for i in node.inputs]):
            # All-scalar computations stay on the CPU.
            replace = False
    if not replace:
        return False
    new_op = GpuElemwise(node.op.scalar_op)
    result = new_op(*(gpu_from_host(i) for i in node.inputs))
    if wants_gpu_output:
        return [result]
    return [host_from_gpu(result)]
Diff is collapsed.
Markdown formatting supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment