Remove tentacles in compile.

cf7d0688 · Arnaud Bergeron · df95d9a9 · cf7d0688 · cf7d0688 · cf7d0688
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -126,17 +126,6 @@ else:
        raise ImportError("The nose module is not installed."
                          " It is needed for Theano tests.")
-if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
-    import theano.sandbox.cuda
-    # We can't test the driver during import of theano.sandbox.cuda as
-    # this cause circular import dependency. So we also test it manually
-    # after the import
-    if theano.sandbox.cuda.cuda_available:
-        import theano.sandbox.cuda.tests.test_driver
-        if config.enable_initial_driver_test:
-            theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
 if (config.device.startswith('cuda') or
        config.device.startswith('opencl') or
        config.init_gpu_device.startswith('cuda') or

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1198,10 +1198,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
    # To avoid circular imports
    from theano.tensor import TensorType
-    from theano.sandbox.cuda import cuda_available, CudaNdarrayType
+    from theano.gpuarray import GpuArrayType
-    if cuda_available:
+    try:
-        from theano.sandbox.cuda import CudaNdarray
+        import pygpu
-        from theano.sandbox.cuda import dimshuffle as cuda_dimshuffle
+    except ImportError:
+        pass
    # TODO: Sparse? Scalar does not really make sense.
@@ -1240,7 +1241,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
        for r in considered_outputs:
            # There is no risk to overwrite inputs, since r does not work
            # inplace.
-            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+            if isinstance(r.type, (TensorType, GpuArrayType)):
                reuse_outputs[r][...] = np.asarray(
                    def_val).astype(r.type.dtype)
@@ -1250,15 +1251,14 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
        del reuse_outputs
    # c_cont_output: use a c-continuous array
-    # (for TensorType and CudaNdarray, else None)
+    # (for TensorType and GpuArrayType, else None)
    if 'c_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
        c_cont_outputs = {}
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+            if isinstance(r.type, (TensorType, GpuArrayType)):
                # Build a C-contiguous buffer
                new_buf = r.type.value_zeros(r_vals[r].shape)
-                # CudaNdarray don't have flags field
+                assert new_buf.flags["C_CONTIGUOUS"]
-                # assert new_buf.flags["C_CONTIGUOUS"]
                new_buf[...] = np.asarray(def_val).astype(r.type.dtype)
                c_cont_outputs[r] = new_buf
@@ -1272,18 +1272,14 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
    if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
        f_cont_outputs = {}
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+            if isinstance(r.type, (TensorType, GpuArrayType)):
                new_buf = np.zeros(
                    shape=r_vals[r].shape,
                    dtype=r_vals[r].dtype,
                    order='F')
                new_buf[...] = def_val
-                if isinstance(r.type, CudaNdarrayType):
+                if isinstance(r.type, GpuArrayType):
-                    # When the CudaNdarray is built, the underlying memory
+                    new_buf = pygpu.array(new_buf)
-                    # is c-contiguous, so we transpose it before and after.
-                    new_buf = CudaNdarray(new_buf.T)
-                    new_buf = cuda_dimshuffle(
-                        new_buf, reversed(list(range(new_buf.ndim))))
                f_cont_outputs[r] = new_buf
@@ -1305,7 +1301,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
        max_ndim = 0
        rev_out_broadcastable = []
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+            if isinstance(r.type, (TensorType, GpuArrayType)):
                if max_ndim < r.ndim:
                    rev_out_broadcastable += [True] * (r.ndim - max_ndim)
                    max_ndim = r.ndim
@@ -1320,7 +1316,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
        # Initial allocation
        init_strided = {}
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+            if isinstance(r.type, (TensorType, GpuArrayType)):
                # Create a buffer twice as large in every dimension,
                # except if broadcastable, or for dimensions above
                # config.DebugMode.check_preallocated_output_ndim
@@ -1399,7 +1395,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
                name = 'wrong_size%s' % str(tuple(shape_diff))
                for r in considered_outputs:
-                    if isinstance(r.type, (TensorType, CudaNdarrayType)):
+                    if isinstance(r.type, (TensorType, GpuArrayType)):
                        r_shape_diff = shape_diff[:r.ndim]
                        out_shape = [max((s + sd), 0)
                                     for s, sd in zip(r_vals[r].shape,
@@ -1741,7 +1737,6 @@ class _VariableEquivalenceTracker(object):
 # List of default version of make thunk.
 # This is needed to know if the user overrided it.
-# The GpuOp will be added here when theano.sandbox.cuda is imported.
 default_make_thunk = [get_unbound_function(theano.gof.Op.make_thunk)]

--- a/theano/compile/nanguardmode.py
+++ b/theano/compile/nanguardmode.py
@@ -8,7 +8,6 @@ import numpy as np
 import theano
 from theano.configparser import config
 import theano.tensor as T
-import theano.sandbox.cuda as cuda
 from theano.compile import Mode
 from .mode import get_mode
@@ -107,16 +106,6 @@ def contains_nan(arr, node=None, var=None):
    """
    if not _is_numeric_value(arr, var):
        return False
-    elif cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
-        if (node and hasattr(theano.sandbox, 'rng_mrg') and
-            isinstance(
-                node.op,
-                # It store ints in float container
-                theano.sandbox.rng_mrg.GPU_mrg_uniform)):
-            return False
-        else:
-            compile_gpu_func(True, False, False)
-            return np.isnan(f_gpumin(arr.reshape(arr.size)))
    elif pygpu_available and isinstance(arr, GpuArray):
        return np.isnan(f_gpua_min(arr.reshape(arr.size)))
@@ -150,70 +139,12 @@ def contains_inf(arr, node=None, var=None):
    """
    if not _is_numeric_value(arr, var):
        return False
-    elif cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
-        if (node and hasattr(theano.sandbox, 'rng_mrg') and
-            isinstance(
-                node.op,
-                # It store ints in float container
-                theano.sandbox.rng_mrg.GPU_mrg_uniform)):
-            return False
-        else:
-            compile_gpu_func(False, True, False)
-            return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
-                    np.isinf(f_gpumax(arr.reshape(arr.size))))
    elif pygpu_available and isinstance(arr, GpuArray):
        return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or
                np.isinf(f_gpua_max(arr.reshape(arr.size))))
    return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
-f_gpumin = None
-f_gpumax = None
-f_gpuabsmax = None
-def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
-    """ compile utility function used by contains_nan and contains_inf
-    """
-    global f_gpumin, f_gpumax, f_gpuabsmax
-    if not cuda.cuda_available:
-        return
-    guard_input = cuda.fvector('nan_guard')
-    cuda_compile_failed = False
-    if (nan_is_error or inf_is_error) and f_gpumin is None:
-        try:
-            f_gpumin = theano.function(
-                [guard_input], T.min(guard_input),
-                mode='FAST_RUN'
-            )
-        except RuntimeError:
-            # This can happen if cuda is available, but the
-            # device is in exclusive mode and used by another
-            # process.
-            cuda_compile_failed = True
-    if inf_is_error and not cuda_compile_failed and f_gpumax is None:
-        try:
-            f_gpumax = theano.function(
-                [guard_input], T.max(guard_input),
-                mode='FAST_RUN'
-            )
-        except RuntimeError:
-            # This can happen if cuda is available, but the
-            # device is in exclusive mode and used by another
-            # process.
-            cuda_compile_failed = True
-    if big_is_error and not cuda_compile_failed and f_gpuabsmax is None:
-        try:
-            f_gpuabsmax = theano.function(
-                [guard_input], T.max(T.abs_(guard_input)),
-                mode='FAST_RUN'
-                )
-        except RuntimeError:
-            # This can happen if cuda is available, but the
-            # device is in exclusive mode and used by another
-            # process.
-            cuda_compile_failed = True
 def f_compute(op):
    def result(inp):
@@ -270,9 +201,6 @@ class NanGuardMode(Mode):
        assert nan_is_error or inf_is_error or big_is_error
-        if cuda.cuda_enabled:
-            compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
        def do_check_on(value, nd, var=None):
            """
            Checks `value` for NaNs / Infs. If detected, raises an exception
@@ -304,9 +232,6 @@ class NanGuardMode(Mode):
                err = False
                if not _is_numeric_value(value, var):
                    err = False
-                elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
-                    compile_gpu_func(False, False, True)
-                    err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
                elif pygpu_available and isinstance(value, GpuArray):
                    err = (f_gpua_absmax(value.reshape(value.size)) > 1e10)
                else:

--- a/theano/compile/ops.py
+++ b/theano/compile/ops.py
@@ -810,7 +810,7 @@ class SpecifyShape(gof.Op):
    We currently don't support specifying partial shape information.
-    TODO : test this op with sparse and cuda ndarray. Do C code for them too.
+    TODO : test this op with sparse variables. Do C code for them too.
    """

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -262,11 +262,8 @@ class ProfileStats(object):
    def __init__(self, atexit_print=True, flag_time_thunks=None,
                 gpu_checks=True, **kwargs):
        if (gpu_checks and
-                ((hasattr(theano, 'sandbox') and
+            (hasattr(theano, 'gpuarray') and
-                  hasattr(theano.sandbox, 'cuda') and
+             theano.gpuarray.pygpu_activated) and
-                  theano.sandbox.cuda.cuda_enabled) or (
-                      hasattr(theano, 'gpuarray') and
-                      theano.gpuarray.pygpu_activated)) and
                os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1'):
            msg = (
                "You are running the Theano profiler with CUDA enabled."
@@ -285,9 +282,9 @@ class ProfileStats(object):
                theano.gpuarray.pygpu_activated and
                not config.profiling.ignore_first_call):
            warnings.warn(
-                "Theano flag profiling.ignore_first_call is False."
+                "Theano flag profiling.ignore_first_call is False. "
-                " This cause bad profiling result in the new gpu"
+                "This cause bad profiling result in the gpu "
-                " back-end, as sometimes we compile at the first call.")
+                "back-end, as sometimes we compile at the first call.")
        self.apply_callcount = {}
        self.output_size = {}
@@ -508,8 +505,8 @@ class ProfileStats(object):
            tot += t
            ftot = tot * 100 / local_time
            # Remove the useless start and end of the class name:
-            # "<class 'theano.sandbox.cuda.blas.GpuDot22'>" ->
+            # "<class 'theano.gpuarray.blas.GpuDot22'>" ->
-            #  "theano.sandbox.cuda.blas.GpuDot22"
+            #  "theano.gpuarray.blas.GpuDot22"
            class_name = str(a)[8:-2][:maxlen]
            print(format_str % (f, ftot, t, t / nb_call,
                                impl, nb_call,
@@ -820,7 +817,8 @@ class ProfileStats(object):
                new allocation.
            """
-            from theano.sandbox.cuda import CudaNdarrayType
+            from theano.gpuarray import GpuArrayType
            # Initial Mem info values [CPU, GPU]
            node_memory_size = [0, 0]
            running_memory_size = [0, 0]
@@ -870,7 +868,7 @@ class ProfileStats(object):
                # allocated by the node
                idx2 = 0
                for out in node.outputs:
-                    if isinstance(out.type, CudaNdarrayType):
+                    if isinstance(out.type, GpuArrayType):
                        cg = 1
                    else:
                        cg = 0
@@ -912,7 +910,7 @@ class ProfileStats(object):
                for ins in set(node.inputs):
                    assert not (ins in view_of and viewed_by[ins])
                    # we trac the original var, so this shouldn't happen
-                    if isinstance(ins.type, CudaNdarrayType):
+                    if isinstance(ins.type, GpuArrayType):
                        cg = 1
                    else:
                        cg = 0
@@ -1245,16 +1243,6 @@ class ProfileStats(object):
            print("---", file=file)
-        if (hasattr(theano, 'sandbox') and
-            hasattr(theano.sandbox, 'cuda') and
-            hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
-            hasattr(theano.sandbox.cuda.cuda_ndarray.cuda_ndarray,
-                    'theano_allocated')):
-            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
-            _, gpu_max = cuda_ndarray.theano_allocated()
-            print("    Max Memory allocated on the GPU (for all functions): "
-                  "%dKB" % int(round(gpu_max / 1024.)), file=file)
        print("", file=file)
        if len(fct_memory) > 1:
            print("    This list is based on all functions in the profile",
@@ -1457,7 +1445,6 @@ class ProfileStats(object):
                printed_tip = True
        # tip 7
-        import theano.sandbox.cuda as cuda
        from theano.tensor.nnet import LogSoftmax
        import theano.tensor.signal.pool as pool
        import theano.gpuarray
@@ -1465,12 +1452,12 @@ class ProfileStats(object):
        for a in self.apply_time:
            node = a
            if (isinstance(node.op, pool.Pool)):
-                if (not cuda.dnn.dnn_available() and not theano.gpuarray.dnn.dnn_present()):
+                if not theano.gpuarray.dnn.dnn_present():
                    print("Install CuDNN to do pooling faster"
                          "this allows the operation to run on GPU")
                    printed_tip = True
            if (isinstance(node.op, LogSoftmax)):
-                if (not cuda.dnn.dnn_available() and not theano.gpuarray.dnn.dnn_present()):
+                if not theano.gpuarray.dnn.dnn_present():
                    print("Install CuDNN to do LogSoftmax faster"
                          "this allows the operation to run on GPU")
                    printed_tip = True

--- a/theano/compile/tests/test_debugmode.py
+++ b/theano/compile/tests/test_debugmode.py
@@ -713,7 +713,6 @@ class VecAsRowAndCol(gof.Op):
        if (c[0] is None) or (c[0].shape != (lv, 1)):
            c[0] = node.outputs[1].type.value_zeros((lv, 1))
-        # Python loop because CudaNdarrays do not support newaxis
        for i in range(lv):
            r[0][0, i] = v[i]
            c[0][i, 0] = v[i]
@@ -794,24 +793,3 @@ class Test_preallocated_output(unittest.TestCase):
        v_val = self.rng.randn(5).astype('float32')
        f(v_val)
-    def test_output_broadcast_cuda(self):
-        from theano.sandbox import cuda
-        if not cuda.cuda_available:
-            raise SkipTest("Optional package Cuda disabled")
-        if cuda.use.device_number is None:
-            # We should normally set VecAsRowAndCol as a GPUOp But we
-            # don't want to do this here as this will disable others
-            # tests in this file.  So we manually init the GPU if
-            # needed to remove warning.
-            cuda.use("gpu",
-                     force=True,
-                     default_to_move_computation_to_gpu=False,
-                     move_shared_float32_to_gpu=False,
-                     enable_cuda=False)
-        v = cuda.fvector('v')
-        c, r = VecAsRowAndCol()(v)
-        f = theano.function([v], [c, r])
-        v_val = cuda.CudaNdarray(self.rng.randn(5).astype('float32'))
-        f(v_val)