提交 69338f33 authored 作者: abergeron's avatar abergeron

Merge pull request #4454 from slefrancois/gpu_out_sandbox

Move new GPU backend out of sandbox
.. _libdoc_gpuarray_dnn:
===========================================
:mod:`theano.sandbox.gpuarray.dnn` -- cuDNN
:mod:`gpuarray.dnn` -- cuDNN
===========================================
.. moduleauthor:: LISA
......@@ -135,27 +135,27 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
Functions
=========
.. automodule:: theano.sandbox.gpuarray.dnn
.. automodule:: theano.gpuarray.dnn
:noindex:
:members: dnn_conv, dnn_pool
Convolution Ops
===============
.. automodule:: theano.sandbox.gpuarray.dnn
.. automodule:: theano.gpuarray.dnn
:noindex:
:members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI
Pooling Ops
===========
.. automodule:: theano.sandbox.gpuarray.dnn
.. automodule:: theano.gpuarray.dnn
:noindex:
:members: GpuDnnPoolDesc, GpuDnnPool, GpuDnnPoolGrad
Softmax Ops
===========
.. automodule:: theano.sandbox.gpuarray.dnn
.. automodule:: theano.gpuarray.dnn
:noindex:
:members: GpuDnnSoftmax, GpuDnnSoftmaxGrad
......@@ -7,11 +7,11 @@ Utility functions
Optimisation
------------
.. automodule:: theano.sandbox.gpuarray.opt_util
.. automodule:: theano.gpuarray.opt_util
:members:
Kernel generation
-----------------
.. automodule:: theano.sandbox.gpuarray.kernel_codegen
.. automodule:: theano.gpuarray.kernel_codegen
:members:
......@@ -2,10 +2,10 @@
.. _libdoc_gpuarray:
=======================================================
:mod:`theano.sandbox.gpuarray` -- The (new) GPU backend
:mod:`gpuarray` -- The (new) GPU backend
=======================================================
.. module:: theano.sandbox.gpuarray
.. module:: theano.gpuarray
:platform: Unix, Windows
:synopsis: Code for GPU programming (new)
.. moduleauthor:: MILA
......
......@@ -13,35 +13,35 @@ is just useful to let people know what is implemented on the gpu.
Basic Op
========
.. automodule:: theano.sandbox.gpuarray.basic_ops
.. automodule:: theano.gpuarray.basic_ops
:members:
Blas Op
=======
.. automodule:: theano.sandbox.gpuarray.blas
.. automodule:: theano.gpuarray.blas
:members:
.. automodule:: theano.sandbox.gpuarray.nerv
.. automodule:: theano.gpuarray.nerv
:members:
Elemwise Op
===========
.. automodule:: theano.sandbox.gpuarray.elemwise
.. automodule:: theano.gpuarray.elemwise
:members:
Subtensor Op
============
.. automodule:: theano.sandbox.gpuarray.subtensor
.. automodule:: theano.gpuarray.subtensor
:members:
Nnet Op
=======
.. automodule:: theano.sandbox.gpuarray.nnet
.. automodule:: theano.gpuarray.nnet
:members:
.. automodule:: theano.sandbox.gpuarray.neighbours
.. automodule:: theano.gpuarray.neighbours
:members:
.. _libdoc_gpuarray_type:
===================================================
:mod:`theano.sandbox.gpuarray.type` -- Type classes
:mod:`gpuarray.type` -- Type classes
===================================================
.. automodule:: theano.sandbox.gpuarray.type
.. automodule:: theano.gpuarray.type
:members:
......@@ -17,6 +17,7 @@ Types and Ops that you can use to build and compile expression graphs.
printing
d3viz/index
compile/index
gpuarray/index
sparse/index
sparse/sandbox
scalar/index
......
......@@ -14,7 +14,6 @@
:maxdepth: 1
cuda/index
gpuarray/index
linalg
neighbours
rng_mrg
......@@ -393,7 +393,7 @@ into a file and run it.
.. testcode::
from theano import function, config, shared, tensor, sandbox
from theano import function, config, shared, tensor
import numpy
import time
......@@ -461,7 +461,7 @@ the GPU object directly. The following code is modified to do just that.
.. testcode::
from theano import function, config, shared, tensor, sandbox
from theano import function, config, shared, tensor, gpuarray
import numpy
import time
......@@ -470,7 +470,7 @@ the GPU object directly. The following code is modified to do just that.
rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], sandbox.gpuarray.basic_ops.gpu_from_host(tensor.exp(x)))
f = function([], gpuarray.basic_ops.GpuFromHost(None)(tensor.exp(x)))
print(f.maker.fgraph.toposort())
t0 = time.time()
for i in range(iters):
......@@ -485,9 +485,10 @@ the GPU object directly. The following code is modifed to do just that.
else:
print('Used the gpu')
Here the :func:`theano.sandbox.gpuarray.basic.gpu_from_host` call
means "copy input to the GPU". However during the optimization phase,
since the result will already be on the gpu, it will be removed. It is
Here the :func:`theano.gpuarray.basic_ops.GpuFromHost(None)` call
means "copy input to the GPU", with ``None`` the default GPU context when not
explicitly given. However during the optimization phase,
since the result will already be on the gpu, it will be removed. It is
used here to tell theano that we want the result on the GPU.
The output is
......
......@@ -116,7 +116,7 @@ if (config.device.startswith('cuda') or
config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl') or
config.contexts != ''):
import theano.sandbox.gpuarray
import theano.gpuarray
# Use config.numpy to call numpy.seterr
import numpy
......
from __future__ import absolute_import, print_function, division
import sys
import logging
import sys
import warnings
import theano
from theano import config
from theano.compile import optdb
from theano.tensor.basic import register_transfer
_logger_name = 'theano.gpuarray'
_logger = logging.getLogger(_logger_name)
error = _logger.error
info = _logger.info
pygpu_activated = False
try:
import pygpu
import pygpu.gpuarray
except ImportError:
pygpu = None
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import dnn, opt, nerv, extra_ops
def transfer(x, target):
    """Transfer hook registered with theano's transfer machinery.

    Returns a gpuarray variable for `x` on context `target` when that
    name is a registered gpuarray context; returns None otherwise to
    signal that this backend cannot handle the requested target.
    """
    try:
        get_context(target)
    except ContextNotDefined:
        # `target` is not one of our contexts; decline the transfer.
        return None
    return as_gpuarray_variable(x, target)
register_transfer(transfer)
def init_dev(dev, name=None):
    """Initialize GPU device `dev` (e.g. 'cuda0', 'opencl0:0') and register
    its context under `name` (None registers the default context).

    Contexts are cached per device string in ``init_dev.devmap``, so a
    second call with the same `dev` reuses the existing context instead
    of creating a new one on the card.
    """
    v = pygpu.gpuarray.api_version()
    # -9998 is the exact (development) major API version this Theano
    # build expects from libgpuarray; anything else means the two are
    # out of sync.
    if v[0] != -9998:
        raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                           "Make sure Theano and libgpuarray/pygpu "
                           "are in sync.")
    if v[1] < 0:
        raise RuntimeError("Wrong minor API version for gpuarray:", v[1],
                           "Please update libgpuarray/pygpu.")
    global pygpu_activated
    if dev not in init_dev.devmap:
        ctx = pygpu.init(dev)
        init_dev.devmap[dev] = ctx
        if config.gpuarray.preallocate != 0:
            if config.gpuarray.preallocate < 1:
                # Values in (0, 1) are a fraction of total GPU memory,
                # capped at 98% to leave some room for the driver.
                gmem = min(config.gpuarray.preallocate, 0.98) * ctx.total_gmem
            else:
                # Values >= 1 are a size in megabytes.
                gmem = config.gpuarray.preallocate * (1024*1024)
            # This will allocate and immediately free an object of size gmem
            # which will reserve that amount of memory on the GPU.
            pygpu.empty((gmem,), dtype='int8', context=ctx)
    context = init_dev.devmap[dev]
    # This will map the context name to the real context object.
    reg_context(name, context)
    pygpu_activated = True
    if config.print_active_device:
        warn = None
        cudnn_version = ""
        if dev.startswith('cuda'):
            # Default message, overwritten below if dnn.version() succeeds.
            cudnn_version = " (cuDNN not available)"
            try:
                cudnn_version = dnn.version()
                # 5100 should not print warning with cudnn 5 final.
                if cudnn_version > 5100:
                    warn = ("Your cuDNN version is more recent than Theano."
                            " If you see problems, try updating Theano or"
                            " downgrading cuDNN to version 5.")
                cudnn_version = " (cuDNN version %s)" % cudnn_version
            except Exception:
                # dnn.version() failed: report the reason recorded by the
                # cuDNN availability check instead.
                cudnn_version = dnn.dnn_present.msg
        print("Mapped name %s to device %s: %s%s" % (
            name, dev, context.devname, cudnn_version),
            file=sys.stderr)
        if warn:
            warnings.warn(warn)
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}

if pygpu:
    try:
        # device=cuda*/opencl*: full GPU mode.  Initialize the device,
        # make float32 shared variables live on the GPU by default and
        # enable the GPU optimization passes.
        if (config.device.startswith('cuda') or
                config.device.startswith('opencl')):
            init_dev(config.device)
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
            optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')
        elif (config.init_gpu_device.startswith('cuda') or
              config.init_gpu_device.startswith('opencl')):
            # init_gpu_device initializes the GPU without moving
            # computations there by default; it requires device=cpu.
            if config.device != 'cpu':
                raise ValueError('you must set device=cpu to use init_gpu_device.')
            if config.contexts != '':
                print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
            init_dev(config.init_gpu_device)
        if config.contexts != '':
            # config.contexts has the form "name->dev;name->dev;...":
            # register each named context on its device.
            for n, d in (c.split('->') for c in config.contexts.split(';')):
                init_dev(d.strip(), n.strip())
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
            optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')

        # Re-export the common Ops at package level for convenience.
        # NOTE(review): GpuFromHost is imported twice (also below) —
        # harmless but redundant.
        from .basic_ops import (GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuEye,
                                GpuFromHost, GpuJoin, GpuReshape, GpuSplit,
                                HostFromGpu)
        from .basic_ops import host_from_gpu, GpuFromHost
        from .elemwise import GpuElemwise
        from .subtensor import (GpuSubtensor, GpuIncSubtensor,
                                GpuAdvancedIncSubtensor1)
    except Exception:
        # Any failure during device init disables GPU support instead of
        # crashing the theano import.
        error("Could not initialize pygpu, support disabled", exc_info=True)
else:
    # pygpu could not be imported: only complain if the user's config
    # actually asked for a GPU.
    if (config.init_gpu_device.startswith('cuda') or
            config.init_gpu_device.startswith('opencl') or
            config.device.startswith('opencl') or
            config.device.startswith('cuda') or
            config.contexts != ''):
        error("pygpu was configured but could not be imported", exc_info=True)
......@@ -12,7 +12,7 @@ from theano.gradient import grad_undefined
from .type import gpu_context_type
from .basic_ops import as_gpuarray_variable, infer_context_name
_logger = logging.getLogger('theano.sandbox.gpuarray.blocksparse')
_logger = logging.getLogger('theano.gpuarray.blocksparse')
class GpuSparseBlockGemv(COp):
......
......@@ -12,7 +12,7 @@ import theano.sandbox.multinomial
from theano import Apply, config
from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.sandbox import gpuarray
from theano import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter
from .type import GpuArrayType
......
......@@ -46,7 +46,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge
_logger = logging.getLogger("theano.sandbox.gpuarray.opt")
_logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
......
ctheano.sandbox.gpuarray.type
ctheano.gpuarray.type
GpuArray_unpickler
p0
(cnumpy.core.multiarray
......
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
import theano.sandbox.gpuarray
import theano.gpuarray
if theano.sandbox.gpuarray.pygpu is None:
if theano.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
if (not theano.sandbox.gpuarray.pygpu_activated and
if (not theano.gpuarray.pygpu_activated and
not theano.config.init_gpu_device.startswith('gpu')):
theano.sandbox.gpuarray.init_dev('cuda')
theano.gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
if not theano.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
test_ctx_name = None
......
......@@ -302,7 +302,7 @@ class G_reshape(test_basic.T_reshape):
mode=mode_with_gpu,
ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp,
theano.sandbox.gpuarray.elemwise.GpuElemwise,
theano.gpuarray.elemwise.GpuElemwise,
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
assert self.op == GpuReshape
......@@ -405,7 +405,7 @@ def test_hostfromgpu_shape_i():
'local_dot22_to_dot22scalar',
'specialize')
a = T.fmatrix('a')
ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
ca = theano.gpuarray.type.GpuArrayType('float32', (False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32',
......
......@@ -7,7 +7,7 @@ from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt, test_ifelse
from theano.tensor.tests import test_basic
import theano.sandbox.gpuarray
import theano.gpuarray
from .. import basic_ops
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import (
......
......@@ -14,24 +14,15 @@ from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import numpy
import theano.sandbox.gpuarray
from theano.compat import PY3
from theano import config
from theano.misc.pkl_utils import CompatUnpickler
if not theano.sandbox.gpuarray.pygpu_activated:
try:
import pygpu
except ImportError:
pygpu = None
import theano.sandbox.cuda as cuda_ndarray
if pygpu and cuda_ndarray.cuda_available:
cuda_ndarray.use('gpu', default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
theano.sandbox.gpuarray.init_dev('cuda')
from .. import pygpu_activated # noqa
try:
from . import config # noqa
have_pygpu = True
except SkipTest:
have_pygpu = False
def test_unpickle_gpuarray_as_numpy_ndarray_flag1():
......@@ -40,8 +31,8 @@ def test_unpickle_gpuarray_as_numpy_ndarray_flag1():
test_type.py test it when pygpu is there.
"""
if pygpu_activated:
raise SkipTest("pygpu disabled")
if have_pygpu:
raise SkipTest("pygpu active")
oldflag = config.experimental.unpickle_gpu_on_cpu
config.experimental.unpickle_gpu_on_cpu = False
......
......@@ -233,7 +233,7 @@ class GpuArrayType(Type):
return data
def filter_variable(self, other, allow_convert=True):
from theano.sandbox.gpuarray import GpuFromHost
from theano.gpuarray import GpuFromHost
if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable(self.context_name)
......
......@@ -12,8 +12,8 @@ import time
import numpy
import theano
from theano.sandbox.gpuarray import init_dev
from theano.sandbox.gpuarray.blas import gpu_dot22
from theano.gpuarray import init_dev
from theano.gpuarray.blas import gpu_dot22
def main(dev1, dev2):
......
......@@ -19,7 +19,7 @@ except ImportError:
return False
from theano.sandbox import cuda
from theano.sandbox import gpuarray
from theano import gpuarray
if cuda.cuda_available:
from theano.sandbox.cuda.type import CudaNdarrayType
......
from __future__ import absolute_import, print_function, division
import sys
import logging
import sys
import warnings
import theano
from theano import config
from theano.compile import optdb
from theano.tensor.basic import register_transfer
_logger_name = 'theano.sandbox.gpuarray'
_logger = logging.getLogger(_logger_name)
error = _logger.error
info = _logger.info
pygpu_activated = False
try:
import pygpu
import pygpu.gpuarray
except ImportError:
pygpu = None
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import dnn, opt, nerv, extra_ops
"""Placeholder for new gpuarray backend in sandbox. Supports old pickles
which referred to theano.sandbox.gpuarray."""


def transfer(x, target):
    """Transfer hook: return `x` as a gpuarray variable on context
    `target`, or None when `target` is not a registered context name
    (meaning this backend declines the transfer).
    """
    try:
        # Raises ContextNotDefined for unknown context names.
        get_context(target)
        return as_gpuarray_variable(x, target)
    except ContextNotDefined:
        pass
register_transfer(transfer)
def init_dev(dev, name=None):
    """Initialize GPU device `dev` (e.g. 'cuda0', 'opencl0:0') and register
    its context under `name` (None registers the default context).

    Contexts are cached per device string in ``init_dev.devmap``, so a
    second call with the same `dev` reuses the existing context instead
    of creating a new one on the card.
    """
    v = pygpu.gpuarray.api_version()
    # -9998 is the exact (development) major API version this Theano
    # build expects from libgpuarray; anything else means the two are
    # out of sync.
    if v[0] != -9998:
        raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                           "Make sure Theano and libgpuarray/pygpu "
                           "are in sync.")
    if v[1] < 0:
        raise RuntimeError("Wrong minor API version for gpuarray:", v[1],
                           "Please update libgpuarray/pygpu.")
    global pygpu_activated
    if dev not in init_dev.devmap:
        ctx = pygpu.init(dev)
        init_dev.devmap[dev] = ctx
        if config.gpuarray.preallocate != 0:
            if config.gpuarray.preallocate < 1:
                # Values in (0, 1) are a fraction of total GPU memory,
                # capped at 98% to leave some room for the driver.
                gmem = min(config.gpuarray.preallocate, 0.98) * ctx.total_gmem
            else:
                # Values >= 1 are a size in megabytes.
                gmem = config.gpuarray.preallocate * (1024*1024)
            # This will allocate and immediately free an object of size gmem
            # which will reserve that amount of memory on the GPU.
            pygpu.empty((gmem,), dtype='int8', context=ctx)
    context = init_dev.devmap[dev]
    # This will map the context name to the real context object.
    reg_context(name, context)
    pygpu_activated = True
    if config.print_active_device:
        warn = None
        cudnn_version = ""
        if dev.startswith('cuda'):
            # Default message, overwritten below if dnn.version() succeeds.
            cudnn_version = " (cuDNN not available)"
            try:
                cudnn_version = dnn.version()
                # 5100 should not print warning with cudnn 5 final.
                if cudnn_version > 5100:
                    warn = ("Your cuDNN version is more recent than Theano."
                            " If you see problems, try updating Theano or"
                            " downgrading cuDNN to version 5.")
                cudnn_version = " (cuDNN version %s)" % cudnn_version
            except Exception:
                # dnn.version() failed: report the reason recorded by the
                # cuDNN availability check instead.
                cudnn_version = dnn.dnn_present.msg
        print("Mapped name %s to device %s: %s%s" % (
            name, dev, context.devname, cudnn_version),
            file=sys.stderr)
        if warn:
            warnings.warn(warn)
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}

if pygpu:
    try:
        # device=cuda*/opencl*: full GPU mode.  Initialize the device,
        # make float32 shared variables live on the GPU by default and
        # enable the GPU optimization passes.
        if (config.device.startswith('cuda') or
                config.device.startswith('opencl')):
            init_dev(config.device)
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
            optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')
        elif (config.init_gpu_device.startswith('cuda') or
              config.init_gpu_device.startswith('opencl')):
            # init_gpu_device initializes the GPU without moving
            # computations there by default; it requires device=cpu.
            if config.device != 'cpu':
                raise ValueError('you must set device=cpu to use init_gpu_device.')
            if config.contexts != '':
                print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
            init_dev(config.init_gpu_device)
        if config.contexts != '':
            # config.contexts has the form "name->dev;name->dev;...":
            # register each named context on its device.
            for n, d in (c.split('->') for c in config.contexts.split(';')):
                init_dev(d.strip(), n.strip())
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
            optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')

        # Re-export the common Ops at package level for convenience.
        # NOTE(review): GpuFromHost is imported twice (also below) —
        # harmless but redundant.
        from .basic_ops import (GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuEye,
                                GpuFromHost, GpuJoin, GpuReshape, GpuSplit,
                                HostFromGpu)
        from .basic_ops import host_from_gpu, GpuFromHost
        from .elemwise import GpuElemwise
        from .subtensor import (GpuSubtensor, GpuIncSubtensor,
                                GpuAdvancedIncSubtensor1)
        import warnings
        # Make everything from the new location visible under the old
        # sandbox name, so old code and pickles keep working.
        from theano.gpuarray import *
    except Exception:
        # Any failure during device init disables GPU support instead of
        # crashing the theano import.
        error("Could not initialize pygpu, support disabled", exc_info=True)
else:
    # pygpu could not be imported: only complain if the user's config
    # actually asked for a GPU.
    if (config.init_gpu_device.startswith('cuda') or
            config.init_gpu_device.startswith('opencl') or
            config.device.startswith('opencl') or
            config.device.startswith('cuda') or
            config.contexts != ''):
        error("pygpu was configured but could not be imported", exc_info=True)

# Always tell users this module has moved, whether or not pygpu loaded.
message = "theano.sandbox.gpuarray has been moved to theano.gpuarray." + \
          " Please update your code and pickles."
warnings.warn(message)
from __future__ import absolute_import, print_function, division
import os
import numpy
import theano
from theano import config
# This is a big hack to avoid creating a second context on the card.
from theano.sandbox.cuda.nvcc_compiler import (NVCC_compiler as NVCC_base,
hash_from_file)
class NVCC_compiler(NVCC_base):
    """NVCC compiler for the gpuarray backend.

    Subclasses the old CUDA backend's compiler but recomputes the
    compile flags so that no second CUDA context is created on the card.
    """

    @staticmethod
    def compile_args():
        """
        Re-implementation of compile_args that does not create an
        additional context on the GPU.

        Returns the list of nvcc command-line flags built from the
        Theano configuration (config.nvcc.*) plus macro and
        architecture flags.
        """
        # Start from the user-provided nvcc flags, dropping empty tokens.
        flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
        if config.nvcc.fastmath:
            flags.append('-use_fast_math')
        # Embed a hash of cuda_ndarray.cuh so cached compilations are
        # invalidated when that header changes.
        cuda_ndarray_cuh_hash = hash_from_file(
            os.path.join(os.path.split(theano.sandbox.cuda.__file__)[0],
                         'cuda_ndarray.cuh'))
        flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)
        # numpy 1.7 deprecated the following macros but they didn't
        # exist in the past
        numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
        if bool(numpy_ver < [1, 7]):
            # Map the new NPY_ARRAY_* names onto the old pre-1.7 ones.
            flags.append("-DNPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
            flags.append("-DNPY_ARRAY_ALIGNED=NPY_ALIGNED")
            flags.append("-DNPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
            flags.append("-DNPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
            flags.append("-DNPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
            flags.append("-DNPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
        # If the user didn't specify architecture flags add them
        if not any(['-arch=sm_' in f for f in flags]):
            # NOTE(review): init_dev.device is assumed to be set during
            # device initialization elsewhere — not visible in this file;
            # confirm before relying on it.
            dev = theano.sandbox.gpuarray.init_dev.device
            if dev is None:
                raise Exception("Trying to compile GPU code without a context")
            if dev.startswith("opencl"):
                raise Exception("Trying to call nvcc with an OpenCL context")
            assert dev.startswith('cuda')
            if dev == 'cuda':
                # Bare 'cuda' device: ask the old backend which card is in use.
                n = theano.sandbox.cuda.use.device_number
            else:
                # 'cudaN': the device number follows the 'cuda' prefix.
                n = int(dev[4:])
            p = theano.sandbox.cuda.device_properties(n)
            flags.append('-arch=sm_' + str(p['major']) + str(p['minor']))
        return flags
......@@ -24,11 +24,11 @@ from . import multinomial
import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.fp16_help import write_w
from theano.sandbox.gpuarray.opt import (register_opt as register_gpua,
host_from_gpu as host_from_gpua)
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
host_from_gpu as host_from_gpua)
if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
......
......@@ -366,9 +366,9 @@ def test_consistency_GPUA_serial():
are the same as the reference (Java) implementation by L'Ecuyer et al.
"""
from theano.sandbox.gpuarray.tests.test_basic_ops import \
from theano.gpuarray.tests.test_basic_ops import \
mode_with_gpu as mode
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
from theano.gpuarray.type import gpuarray_shared_constructor
seed = 12345
n_samples = 5
......@@ -421,9 +421,9 @@ def test_consistency_GPUA_parallel():
L'Ecuyer et al.
"""
from theano.sandbox.gpuarray.tests.test_basic_ops import \
from theano.gpuarray.tests.test_basic_ops import \
mode_with_gpu as mode
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
from theano.gpuarray.type import gpuarray_shared_constructor
seed = 12345
n_samples = 5
......@@ -1107,9 +1107,9 @@ def test_overflow_gpu_old_backend():
def test_overflow_gpu_new_backend():
# run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
from theano.sandbox.gpuarray.tests.test_basic_ops import \
from theano.gpuarray.tests.test_basic_ops import \
mode_with_gpu as mode
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
from theano.gpuarray.type import gpuarray_shared_constructor
seed = 12345
n_substreams = 7
curr_rstate = numpy.array([seed] * 6, dtype='int32')
......
......@@ -982,7 +982,8 @@ def scan(fn,
# the file because that would force on the user some dependencies that we
# might do not want to. Currently we are working on removing the
# dependencies on sandbox code completeley.
from theano.sandbox import cuda, gpuarray
from theano.sandbox import cuda
from theano import gpuarray
if cuda.cuda_available or gpuarray.pygpu_activated:
# very often we end up in this situation when we want to
# replace w with w_copy, where w is a GPU variable
......
......@@ -272,7 +272,7 @@ class Scan(PureOp):
# If scan has the flag 'gpua' set to false (meaning that is shouldn't
# use the gpuarray gpu backend ), ensure that is has no input and no
# output with type GpuArrayType
from theano.sandbox.gpuarray import GpuArrayType
from theano.gpuarray import GpuArrayType
if not self.info.get("gpua", False):
for inp in self.inputs:
if isinstance(inp.type, GpuArrayType):
......
......@@ -1008,8 +1008,8 @@ class ScanInplaceOptimizer(Optimizer):
# gpuarray might be imported but not its GpuAlloc and
# GpuAllopEmpty ops.
try:
alloc_ops += (theano.sandbox.gpuarray.GpuAlloc,
theano.sandbox.gpuarray.GpuAllocEmpty)
alloc_ops += (theano.gpuarray.GpuAlloc,
theano.gpuarray.GpuAllocEmpty)
except:
pass
......
......@@ -151,7 +151,8 @@ def traverse(out, x, x_copy, d, visited=None):
if out in visited:
return d
visited.add(out)
from theano.sandbox import cuda, gpuarray
from theano.sandbox import cuda
from theano import gpuarray
if out == x:
if isinstance(x.type, cuda.CudaNdarrayType):
d[out] = cuda.gpu_from_host(x_copy)
......
......@@ -4939,7 +4939,7 @@ class T_Scan_Gpuarray(unittest.TestCase, ScanGpuTests):
"""
def __init__(self, *args, **kwargs):
from theano.sandbox import gpuarray
from theano import gpuarray
self.gpu_backend = gpuarray
# This is unfortunate, but required
......
......@@ -39,6 +39,8 @@ whitelist_flake8 = [
"compile/profiling.py",
"compile/sandbox/__init__.py",
"compile/tests/__init__.py",
"gpuarray/__init__.py",
"gpuarray/tests/__init__.py",
"typed_list/__init__.py",
"typed_list/tests/__init__.py",
"tensor/__init__.py",
......@@ -89,7 +91,7 @@ whitelist_flake8 = [
"sandbox/tests/__init__.py",
"sandbox/cuda/__init__.py",
"sandbox/cuda/tests/__init__.py",
"sandbox/gpuarray/tests/__init__.py",
"sandbox/gpuarray/__init__.py",
"sandbox/scan_module/scan_utils.py",
"sandbox/scan_module/scan.py",
"sandbox/scan_module/scan_op.py",
......@@ -100,7 +102,6 @@ whitelist_flake8 = [
"sandbox/linalg/__init__.py",
"sandbox/linalg/tests/__init__.py",
"sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/__init__.py",
"scan_module/scan_utils.py",
"scan_module/scan_views.py",
"scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论