Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
9cde027a
提交
9cde027a
authored
3月 29, 2017
作者:
Arnaud Bergeron
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Remove tentacles in misc.
上级
80a1e8e0
全部展开
显示空白字符变更
内嵌
并排
正在显示
17 个修改的文件
包含
87 行增加
和
947 行删除
+87
-947
test_others.py
theano/gpuarray/tests/test_others.py
+34
-0
ifelse.py
theano/ifelse.py
+2
-2
check_blas.py
theano/misc/check_blas.py
+19
-83
cudamat_utils.py
theano/misc/cudamat_utils.py
+0
-110
gnumpy_utils.py
theano/misc/gnumpy_utils.py
+0
-124
may_share_memory.py
theano/misc/may_share_memory.py
+8
-23
pkl_utils.py
theano/misc/pkl_utils.py
+14
-16
pycuda_example.py
theano/misc/pycuda_example.py
+0
-0
pycuda_init.py
theano/misc/pycuda_init.py
+0
-66
pycuda_utils.py
theano/misc/pycuda_utils.py
+0
-66
test_cudamat_utils.py
theano/misc/tests/test_cudamat_utils.py
+0
-37
test_gnumpy_utils.py
theano/misc/tests/test_gnumpy_utils.py
+0
-78
test_may_share_memory.py
theano/misc/tests/test_may_share_memory.py
+9
-5
test_pkl_utils.py
theano/misc/tests/test_pkl_utils.py
+1
-21
test_pycuda_example.py
theano/misc/tests/test_pycuda_example.py
+0
-106
test_pycuda_theano_simple.py
theano/misc/tests/test_pycuda_theano_simple.py
+0
-131
test_pycuda_utils.py
theano/misc/tests/test_pycuda_utils.py
+0
-79
没有找到文件。
theano/gpuarray/tests/test_others.py
0 → 100644
浏览文件 @
9cde027a
from
.config
import
test_ctx_name
from
..type
import
get_context
,
GpuArrayType
,
GpuArraySharedVariable
import
pygpu
import
numpy
as
np
from
theano.misc.tests.test_may_share_memory
import
may_share_memory_core
from
theano.misc.pkl_utils
import
dump
,
load
def
test_may_share_memory
():
ctx
=
get_context
(
test_ctx_name
)
a
=
pygpu
.
empty
((
5
,
4
),
context
=
ctx
)
b
=
pygpu
.
empty
((
5
,
4
),
context
=
ctx
)
may_share_memory_core
(
a
,
b
)
def
test_dump_load
():
x
=
GpuArraySharedVariable
(
'x'
,
GpuArrayType
(
'float32'
,
(
1
,
1
),
name
=
'x'
,
context_name
=
test_ctx_name
),
[[
1
]],
False
)
with
open
(
'test'
,
'wb'
)
as
f
:
dump
(
x
,
f
)
with
open
(
'test'
,
'rb'
)
as
f
:
x
=
load
(
f
)
assert
x
.
name
==
'x'
np
.
testing
.
assert_allclose
(
x
.
get_value
(),
[[
1
]])
theano/ifelse.py
浏览文件 @
9cde027a
...
...
@@ -168,8 +168,8 @@ class IfElse(Op):
)
c
=
theano
.
tensor
.
as_tensor_variable
(
c
)
if
not
self
.
gpu
:
# When gpu is true, we are given only
cuda nd
arrays, and we want
# to keep them
be cuda nd
arrays
# When gpu is true, we are given only
gpu
arrays, and we want
# to keep them
as gpu
arrays
nw_args
=
[]
for
x
in
args
:
if
hasattr
(
x
,
'_as_TensorVariable'
):
...
...
theano/misc/check_blas.py
浏览文件 @
9cde027a
...
...
@@ -11,7 +11,6 @@ import os
import
sys
import
time
from
optparse
import
OptionParser
import
subprocess
import
numpy
as
np
import
theano
...
...
@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
print
(
'Numpy dot module:'
,
np
.
dot
.
__module__
)
print
(
'Numpy location:'
,
np
.
__file__
)
print
(
'Numpy version:'
,
np
.
__version__
)
if
(
theano
.
config
.
device
.
startswith
(
"gpu"
)
or
theano
.
config
.
init_gpu_device
.
startswith
(
"gpu"
)):
print
(
'nvcc version:'
)
subprocess
.
call
((
theano
.
sandbox
.
cuda
.
nvcc_compiler
.
nvcc_path
,
"--version"
))
print
()
a
=
theano
.
shared
(
np
.
ones
((
M
,
N
),
dtype
=
theano
.
config
.
floatX
,
order
=
order
))
...
...
@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
f
()
# Ignore first function call to get representative time.
if
execute
:
sync
=
(
hasattr
(
theano
,
"sandbox"
)
and
hasattr
(
theano
.
sandbox
,
"cuda"
)
and
isinstance
(
c
,
theano
.
sandbox
.
cuda
.
CudaNdarraySharedVariable
))
sync2
=
(
hasattr
(
theano
,
"gpuarray"
)
and
sync
=
(
hasattr
(
theano
,
"gpuarray"
)
and
isinstance
(
c
,
theano
.
gpuarray
.
GpuArraySharedVariable
))
if
sync
:
# Make sure we don't include the time from the first call
c
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
sync
()
t0
=
time
.
time
()
for
i
in
range
(
iters
):
f
()
if
sync
:
theano
.
sandbox
.
cuda
.
synchronize
()
if
sync2
:
c
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
sync
()
t1
=
time
.
time
()
return
t1
-
t0
,
impl
...
...
@@ -199,85 +190,30 @@ if __name__ == "__main__":
goto2 1.13/8 1.94s
goto2 1.13/16 3.16s
Test time in float32
cuda version 6.5 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s 0.06s
K40 0.07s
K20m/ECC 0.08s 0.08s 0.07s
K20/NOECC 0.07s
M2090 0.19s
C2075 0.25s
M2075 0.25s
M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s
C1060 0.46s
K600 1.04s
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 980 0.06s
GTX 970 0.08s
GTX 680 0.11s 0.12s 0.154s 0.218s
GRID K520 0.14s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 750 Ti 0.20s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 765M 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GT 610 2.38s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
GT 520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
Results for larger matrices.
There were 10 executions of gemm in float32
with matrices of shape 5000x5000 (M=N=K=5000).
Test time in float32. There were 10 executions of gemm in
float32 with matrices of shape 5000x5000 (M=N=K=5000)
All memory layout was in C order.
cuda version 7.5 7.0 6.5
cuda version 8.0 7.5 7.0
gpu
M40 0.47s
k80 0.96s
K6000/NOECC 0.69s
K40 0.88s
K20m/ECC
K20/NOECC
M2090
C2075
M2075
M2070
M2070-Q
M2050(Amazon)
C1060
K600
GTX Titan X 0.45s 0.47s
GTX Titan Black 0.64s 0.64s
GTX Titan(D15U-50)
GTX 780
M40 0.45s 0.47s
k80 0.92s 0.96s
K6000/NOECC 0.71s 0.69s
P6000/NOECC 0.25s
Titan X (Pascal) 0.28s
GTX Titan X 0.45s 0.45s 0.47s
GTX Titan Black 0.66s 0.64s 0.64s
GTX 1080 0.35s
GTX 980 Ti 0.41s
GTX 980
GTX 970 0.66s
GTX 680 1.57s
GRID K520
GTX 750 Ti 2.01s 2.01s
GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX 580
2.42s 2.47
s
GTX 480
2.87s 2.88
s
GTX 580
2.42
s
GTX 480
2.87
s
TX1 7.6s (float32 storage and computation)
GT 610 33.5s
"""
)
...
...
theano/misc/cudamat_utils.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
"""
This code can only work if cudamat and theano are initialized on the
same gpu as theano.
WARNING: In the test of this file there is a transpose that is used...
So there can be problem with shape and stride order...
"""
from
__future__
import
absolute_import
,
print_function
,
division
import
six
try
:
import
cudamat
cudamat_available
=
True
import
theano.sandbox.cuda
as
cuda
if
cuda
.
cuda_available
is
False
:
raise
ImportError
(
'Optional theano package cuda disabled'
)
if
six
.
PY3
:
long
=
int
def
cudandarray_to_cudamat
(
x
,
copyif
=
False
):
""" take a CudaNdarray and return a cudamat.CUDAMatrix object.
:type x: CudaNdarray
:param x: The array to transform to cudamat.CUDAMatrix.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if
not
isinstance
(
x
,
cuda
.
CudaNdarray
):
raise
ValueError
(
"We can transfer only CudaNdarray to cudamat.CUDAMatrix"
)
elif
x
.
ndim
!=
2
:
raise
TypeError
(
"cudandarray_to_cudamat: input must be 2-d (has
%
s dims). That's "
"because cudamat arrays are always 2-dimensional"
)
else
:
# Check if it is c contiguous
size
=
1
c_contiguous
=
True
for
i
in
range
(
x
.
ndim
-
1
,
-
1
,
-
1
):
if
x
.
shape
[
i
]
==
1
:
continue
if
x
.
_strides
[
i
]
!=
size
:
c_contiguous
=
False
break
size
*=
x
.
shape
[
i
]
if
not
c_contiguous
:
if
copyif
:
x
=
x
.
copy
()
else
:
raise
ValueError
(
"We where asked to don't copy memory, but the memory is not c contiguous."
)
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat
=
cudamat
.
cudamat
()
cm_mat
.
size
[
0
]
=
x
.
shape
[
0
]
cm_mat
.
size
[
1
]
=
x
.
shape
[
1
]
cm_mat
.
on_host
=
0
cm_mat
.
on_device
=
1
cm_mat
.
is_trans
=
0
cm_mat
.
owns_data
=
0
# <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import
ctypes
cm_mat
.
data_device
=
ctypes
.
cast
(
x
.
gpudata
,
ctypes
.
POINTER
(
ctypes
.
c_float
))
px
=
cudamat
.
CUDAMatrix
(
cm_mat
)
px
.
_base
=
x
# x won't be __del__'ed as long as px is around.
# let cudamat know that we don't have a numpy array attached.
px
.
mat_on_host
=
False
return
px
def
cudamat_to_cudandarray
(
x
):
""" take a cudamat.CUDAMatrix and make a CudaNdarray that point to its memory
"""
if
not
isinstance
(
x
,
cudamat
.
CUDAMatrix
):
raise
ValueError
(
"We can transfer only cudamat.CUDAMatrix to CudaNdarray"
)
# elif x.dtype != "float32":
# raise ValueError("CudaNdarray support only float32")
# We don't need this, because cudamat is always float32.
else
:
strides
=
[
1
]
for
i
in
x
.
shape
[::
-
1
][:
-
1
]:
strides
.
append
(
strides
[
-
1
]
*
i
)
strides
=
tuple
(
strides
[::
-
1
])
import
ctypes
ptr_long
=
long
(
ctypes
.
cast
(
x
.
mat
.
data_device
,
ctypes
.
c_void_p
)
.
value
)
# seems legit.
z
=
cuda
.
from_gpu_pointer
(
ptr_long
,
x
.
shape
,
strides
,
x
)
return
z
except
(
ImportError
,
OSError
):
cudamat_available
=
False
theano/misc/gnumpy_utils.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
"""
This code can only work if gnumpy and theano are initialized on the
same gpu as theano.
"""
from
__future__
import
absolute_import
,
print_function
,
division
import
six
from
six.moves
import
reduce
try
:
import
gnumpy
import
cudamat
gnumpy_available
=
True
___const_garray
=
gnumpy
.
rand
(
1
)
import
theano.sandbox.cuda
as
cuda
if
cuda
.
cuda_available
is
False
:
raise
ImportError
(
'Optional theano package cuda disabled'
)
if
six
.
PY3
:
long
=
int
def
cudandarray_to_garray
(
x
,
copyif
=
False
):
""" take a CudaNdarray and return a gnumpy.garray object.
:type x: CudaNdarray
:param x: The array to transform to gnumpy.garray.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if
not
isinstance
(
x
,
cuda
.
CudaNdarray
):
raise
ValueError
(
"We can transfer only CudaNdarray to cudamat.CUDAMatrix"
)
else
:
# Check if it is c contiguous
size
=
1
c_contiguous
=
True
for
i
in
range
(
x
.
ndim
-
1
,
-
1
,
-
1
):
if
x
.
shape
[
i
]
==
1
:
continue
if
x
.
_strides
[
i
]
!=
size
:
c_contiguous
=
False
break
size
*=
x
.
shape
[
i
]
if
not
c_contiguous
:
if
copyif
:
x
=
x
.
copy
()
else
:
raise
ValueError
(
"We where asked to don't copy memory, but the memory is not c contiguous."
)
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat
=
cudamat
.
cudamat
()
cm_mat
.
size
[
0
]
=
reduce
(
lambda
x
,
y
:
x
*
y
,
x
.
shape
,
1
)
cm_mat
.
size
[
1
]
=
1
cm_mat
.
on_host
=
0
cm_mat
.
on_device
=
1
cm_mat
.
is_trans
=
0
cm_mat
.
owns_data
=
0
# <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import
ctypes
cm_mat
.
data_device
=
ctypes
.
cast
(
x
.
gpudata
,
ctypes
.
POINTER
(
ctypes
.
c_float
))
px
=
cudamat
.
CUDAMatrix
(
cm_mat
)
px
.
_base
=
x
# x won't be freed if the cudamat object isn't freed.
# let cudamat know that we don't have a numpy array attached.
px
.
mat_on_host
=
False
# Note how gnumpy tracks its cudamat objects: it moves things to the
# _cmsReuseCache when the gnumpy array is deleted, thus the arrays
# returned by theano will never be deleted.
# However, if the garray thinks that the object is a view, then it won't
# move the _base to the _cmsResueCache; so the cudamat object will be deleted,
# and we won't overpump the world with memory.
_is_alias_of
=
___const_garray
ans
=
gnumpy
.
garray
(
px
,
x
.
shape
,
_is_alias_of
)
return
ans
def
garray_to_cudandarray
(
x
):
""" take a gnumpy.garray and make a CudaNdarray that point to its memory
"""
if
not
isinstance
(
x
,
gnumpy
.
garray
):
raise
ValueError
(
"We can transfer only gnumpy.garray to CudaNdarray"
)
# elif x.dtype != "float32":
# raise ValueError("CudaNdarray support only float32")
# We don't need this, because cudamat is always float32.
else
:
strides
=
[
1
]
for
i
in
x
.
shape
[::
-
1
][:
-
1
]:
strides
.
append
(
strides
[
-
1
]
*
i
)
strides
=
strides
[::
-
1
]
for
i
in
range
(
len
(
strides
)):
if
x
.
shape
[
i
]
==
1
:
strides
[
i
]
=
0
strides
=
tuple
(
strides
)
import
ctypes
ptr_long
=
long
(
ctypes
.
cast
(
x
.
_base
.
mat
.
data_device
,
ctypes
.
c_void_p
)
.
value
)
# seems legit.
z
=
cuda
.
from_gpu_pointer
(
ptr_long
,
x
.
shape
,
strides
,
x
.
_base
)
return
z
except
(
ImportError
,
OSError
):
gnumpy_available
=
False
theano/misc/may_share_memory.py
浏览文件 @
9cde027a
"""
Function to detect memory sharing for ndarray AND sparse type AND
CudaNda
rray.
Function to detect memory sharing for ndarray AND sparse type AND
GpuA
rray.
numpy version support only ndarray.
"""
from
__future__
import
absolute_import
,
print_function
,
division
...
...
@@ -14,25 +14,12 @@ try:
def
_is_sparse
(
a
):
return
scipy
.
sparse
.
issparse
(
a
)
except
ImportError
:
# scipy not imported, their can be only ndarray and
cudand
array
# scipy not imported, their can be only ndarray and
gpu
array
def
_is_sparse
(
a
):
return
False
from
theano.sandbox
import
cuda
from
theano
import
gpuarray
if
cuda
.
cuda_available
:
from
theano.sandbox.cuda.type
import
CudaNdarrayType
def
_is_cuda
(
a
):
return
isinstance
(
a
,
cuda
.
CudaNdarray
)
else
:
def
_is_cuda
(
a
):
return
False
__docformat__
=
"restructuredtext en"
if
gpuarray
.
pygpu
:
def
_is_gpua
(
a
):
return
isinstance
(
a
,
gpuarray
.
pygpu
.
gpuarray
.
GpuArray
)
...
...
@@ -40,16 +27,14 @@ else:
def
_is_gpua
(
a
):
return
False
__docformat__
=
"restructuredtext en"
def
may_share_memory
(
a
,
b
,
raise_other_type
=
True
):
a_ndarray
=
isinstance
(
a
,
np
.
ndarray
)
b_ndarray
=
isinstance
(
b
,
np
.
ndarray
)
if
a_ndarray
and
b_ndarray
:
return
TensorType
.
may_share_memory
(
a
,
b
)
a_cuda
=
_is_cuda
(
a
)
b_cuda
=
_is_cuda
(
b
)
if
a_cuda
and
b_cuda
:
return
CudaNdarrayType
.
may_share_memory
(
a
,
b
)
a_gpua
=
_is_gpua
(
a
)
b_gpua
=
_is_gpua
(
b
)
if
a_gpua
and
b_gpua
:
...
...
@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True):
a_sparse
=
_is_sparse
(
a
)
b_sparse
=
_is_sparse
(
b
)
if
(
not
(
a_ndarray
or
a_sparse
or
a_
cuda
or
a_
gpua
)
or
not
(
b_ndarray
or
b_sparse
or
b_
cuda
or
b_
gpua
)):
if
(
not
(
a_ndarray
or
a_sparse
or
a_gpua
)
or
not
(
b_ndarray
or
b_sparse
or
b_gpua
)):
if
raise_other_type
:
raise
TypeError
(
"may_share_memory support only ndarray"
" and scipy.sparse
, CudaNdarray
or GpuArray type"
)
" and scipy.sparse or GpuArray type"
)
return
False
if
a_
cuda
or
b_cuda
or
a_
gpua
or
b_gpua
:
if
a_gpua
or
b_gpua
:
return
False
return
SparseType
.
may_share_memory
(
a
,
b
)
theano/misc/pkl_utils.py
浏览文件 @
9cde027a
...
...
@@ -26,11 +26,11 @@ from theano import config
from
theano.compat
import
PY3
from
six
import
string_types
from
theano.compile.sharedvalue
import
SharedVariable
try
:
from
theano.sandbox.cuda
import
cuda_ndarray
import
pygpu
except
ImportError
:
cuda_ndarray
=
None
pygpu
=
None
__docformat__
=
"restructuredtext en"
__authors__
=
"Pascal Lamblin"
...
...
@@ -202,21 +202,21 @@ class PersistentNdarrayID(object):
return
self
.
seen
[
id
(
obj
)]
class
Persistent
CudaNda
rrayID
(
PersistentNdarrayID
):
class
Persistent
GpuA
rrayID
(
PersistentNdarrayID
):
def
__call__
(
self
,
obj
):
if
(
cuda_ndarray
is
not
None
and
type
(
obj
)
is
cuda_ndarray
.
cuda_ndarray
.
CudaNdarray
):
if
(
pygpu
and
isinstance
(
obj
,
pygpu
.
gpuarray
.
GpuArray
)
):
if
id
(
obj
)
not
in
self
.
seen
:
def
write_array
(
f
):
np
.
lib
.
format
.
write_array
(
f
,
np
.
asarray
(
obj
))
name
=
self
.
_resolve_name
(
obj
)
zipadd
(
write_array
,
self
.
zip_file
,
name
)
self
.
seen
[
id
(
obj
)]
=
'
cuda_nd
array.{0}'
.
format
(
name
)
self
.
seen
[
id
(
obj
)]
=
'
gpu
array.{0}'
.
format
(
name
)
return
self
.
seen
[
id
(
obj
)]
return
super
(
Persistent
CudaNda
rrayID
,
self
)
.
__call__
(
obj
)
return
super
(
Persistent
GpuA
rrayID
,
self
)
.
__call__
(
obj
)
class
PersistentSharedVariableID
(
Persistent
CudaNda
rrayID
):
class
PersistentSharedVariableID
(
Persistent
GpuA
rrayID
):
"""Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the
...
...
@@ -288,18 +288,16 @@ class PersistentNdarrayLoad(object):
return
self
.
cache
[
name
]
ret
=
None
array
=
np
.
lib
.
format
.
read_array
(
self
.
zip_file
.
open
(
name
))
if
array_type
==
'
cuda_nd
array'
:
if
array_type
==
'
gpu
array'
:
if
config
.
experimental
.
unpickle_gpu_on_cpu
:
# directly return numpy array
warnings
.
warn
(
"config.experimental.unpickle_gpu_on_cpu is set "
"to True. Unpickling CudaNdarray as "
"numpy.ndarray"
)
"to True. Unpickling GpuArray as numpy.ndarray"
)
ret
=
array
elif
cuda_ndarray
:
ret
=
cuda_ndarray
.
cuda_ndarray
.
CudaNd
array
(
array
)
elif
pygpu
:
ret
=
pygpu
.
array
(
array
)
else
:
raise
ImportError
(
"Cuda not found. Cannot unpickle "
"CudaNdarray"
)
raise
ImportError
(
"pygpu not found. Cannot unpickle GpuArray"
)
else
:
ret
=
array
self
.
cache
[
name
]
=
ret
...
...
theano/misc/pycuda_example.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
差异被折叠。
点击展开。
theano/misc/pycuda_init.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
from
__future__
import
absolute_import
,
print_function
,
division
import
os
import
warnings
import
theano
import
theano.sandbox.cuda
from
theano
import
config
def
set_gpu_from_theano
():
"""
This set the GPU used by PyCUDA to the same as the one used by Theano.
"""
# Transfer the theano gpu binding to pycuda, for consistency
if
config
.
device
.
startswith
(
"gpu"
)
and
len
(
config
.
device
)
>
3
:
os
.
environ
[
"CUDA_DEVICE"
]
=
theano
.
config
.
device
[
3
:]
elif
(
config
.
init_gpu_device
.
startswith
(
"gpu"
)
and
len
(
config
.
init_gpu_device
)
>
3
):
os
.
environ
[
"CUDA_DEVICE"
]
=
theano
.
config
.
init_gpu_device
[
3
:]
set_gpu_from_theano
()
pycuda_available
=
False
# If theano.sandbox.cuda don't exist, it is because we are importing
# it and it try to import this file! This mean we must init the device.
if
(
not
hasattr
(
theano
.
sandbox
,
'cuda'
)
or
theano
.
sandbox
.
cuda
.
use
.
device_number
is
None
):
try
:
import
pycuda
import
pycuda.autoinit
pycuda_available
=
True
except
(
ImportError
,
RuntimeError
):
# presumably, the user wanted to use pycuda, else they wouldn't have
# imported this module, so issue a warning that the import failed.
warnings
.
warn
(
"PyCUDA import failed in theano.misc.pycuda_init"
)
except
pycuda
.
_driver
.
LogicError
:
if
theano
.
config
.
force_device
:
raise
else
:
if
"CUDA_DEVICE"
in
os
.
environ
:
del
os
.
environ
[
"CUDA_DEVICE"
]
import
pycuda.autoinit
pycuda_available
=
True
else
:
try
:
import
pycuda.driver
pycuda_available
=
True
except
ImportError
:
pass
if
pycuda_available
:
if
hasattr
(
pycuda
.
driver
.
Context
,
"attach"
):
pycuda
.
driver
.
Context
.
attach
()
import
atexit
atexit
.
register
(
pycuda
.
driver
.
Context
.
pop
)
else
:
# Now we always import this file when we call
# theano.sandbox.cuda.use. So this should not happen
# normally.
# TODO: make this an error.
warnings
.
warn
(
"For some unknow reason, theano.misc.pycuda_init was"
" not imported before Theano initialized the GPU and"
" your PyCUDA version is 2011.2.2 or earlier."
" To fix the problem, import theano.misc.pycuda_init"
" manually before using/initializing the GPU, use the"
" Theano flag pycuda.init=True or use a"
" more recent version of PyCUDA."
)
theano/misc/pycuda_utils.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
from
__future__
import
absolute_import
,
print_function
,
division
import
pycuda.gpuarray
from
theano.sandbox
import
cuda
if
cuda
.
cuda_available
is
False
:
raise
ImportError
(
'Optional theano package cuda disabled'
)
def
to_gpuarray
(
x
,
copyif
=
False
):
""" take a CudaNdarray and return a pycuda.gpuarray.GPUArray
:type x: CudaNdarray
:param x: The array to transform to pycuda.gpuarray.GPUArray.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: pycuda.gpuarray.GPUArray
"""
if
not
isinstance
(
x
,
cuda
.
CudaNdarray
):
raise
ValueError
(
"We can transfer only CudaNdarray to pycuda.gpuarray.GPUArray"
)
else
:
# Check if it is c contiguous
size
=
1
c_contiguous
=
True
for
i
in
range
(
x
.
ndim
-
1
,
-
1
,
-
1
):
if
x
.
shape
[
i
]
==
1
:
continue
if
x
.
_strides
[
i
]
!=
size
:
c_contiguous
=
False
break
size
*=
x
.
shape
[
i
]
if
not
c_contiguous
:
if
copyif
:
x
=
x
.
copy
()
else
:
raise
ValueError
(
"We were asked to not copy memory, but the memory is not c contiguous."
)
# Now x is always c contiguous
px
=
pycuda
.
gpuarray
.
GPUArray
(
x
.
shape
,
x
.
dtype
,
base
=
x
,
gpudata
=
x
.
gpudata
)
return
px
def
to_cudandarray
(
x
):
""" take a pycuda.gpuarray.GPUArray and make a CudaNdarray that point to its memory
:note: CudaNdarray support only float32, so only float32 GPUArray are accepted
"""
if
not
isinstance
(
x
,
pycuda
.
gpuarray
.
GPUArray
):
raise
ValueError
(
"We can transfer only pycuda.gpuarray.GPUArray to CudaNdarray"
)
elif
x
.
dtype
!=
"float32"
:
raise
ValueError
(
"CudaNdarray support only float32"
)
else
:
strides
=
[
1
]
for
i
in
x
.
shape
[::
-
1
][:
-
1
]:
strides
.
append
(
strides
[
-
1
]
*
i
)
strides
=
tuple
(
strides
[::
-
1
])
ptr
=
int
(
x
.
gpudata
)
# in pycuda trunk, y.ptr also works, which is a little cleaner
z
=
cuda
.
from_gpu_pointer
(
ptr
,
x
.
shape
,
strides
,
x
)
return
z
theano/misc/tests/test_cudamat_utils.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
as
np
import
theano
from
theano.misc.cudamat_utils
import
cudamat_available
if
not
cudamat_available
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
"gnumpy not installed. Skip test of theano op with pycuda "
"code."
)
from
theano.misc.cudamat_utils
import
(
cudandarray_to_cudamat
,
cudamat_to_cudandarray
)
def
test
(
shape
=
(
3
,
4
)):
"""
Make sure that the cudamat conversion is exact.
"""
gpu
=
theano
.
sandbox
.
cuda
.
basic_ops
.
gpu_from_host
U
=
gpu
(
theano
.
tensor
.
fmatrix
(
'U'
))
ii
=
theano
.
function
([
U
],
gpu
(
U
+
1
))
A_cpu
=
np
.
asarray
(
np
.
random
.
rand
(
*
shape
),
dtype
=
"float32"
)
A_cnd
=
theano
.
sandbox
.
cuda
.
CudaNdarray
(
A_cpu
)
A_cmat
=
cudandarray_to_cudamat
(
A_cnd
)
B_cnd
=
cudamat_to_cudandarray
(
A_cmat
)
B_cnd
=
ii
(
A_cnd
)
u
=
A_cnd
.
copy
()
u
+=
theano
.
sandbox
.
cuda
.
CudaNdarray
(
np
.
asarray
([[
1
]],
dtype
=
'float32'
))
u
=
np
.
asarray
(
u
)
v
=
np
.
asarray
(
B_cnd
)
w
=
A_cmat
.
add
(
1
)
.
asarray
()
assert
abs
(
u
-
v
)
.
max
()
==
0
assert
abs
(
u
-
w
.
T
.
reshape
(
u
.
shape
))
.
max
()
==
0
theano/misc/tests/test_gnumpy_utils.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
as
np
import
theano
from
theano.misc.gnumpy_utils
import
gnumpy_available
if
not
gnumpy_available
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
"gnumpy not installed. Skip test related to it."
)
from
theano.misc.gnumpy_utils
import
(
garray_to_cudandarray
,
cudandarray_to_garray
)
import
gnumpy
def
test
(
shape
=
(
3
,
4
,
5
)):
"""
Make sure that the gnumpy conversion is exact from garray to
CudaNdarray back to garray.
"""
gpu
=
theano
.
sandbox
.
cuda
.
basic_ops
.
gpu_from_host
U
=
gpu
(
theano
.
tensor
.
ftensor3
(
'U'
))
ii
=
theano
.
function
([
U
],
gpu
(
U
+
1
))
A
=
gnumpy
.
rand
(
*
shape
)
A_cnd
=
garray_to_cudandarray
(
A
)
assert
A_cnd
.
shape
==
A
.
shape
# dtype always float32
# garray don't have strides
B_cnd
=
ii
(
A_cnd
)
B
=
cudandarray_to_garray
(
B_cnd
)
assert
A_cnd
.
shape
==
A
.
shape
u
=
(
A
+
1
)
.
asarray
()
v
=
B
.
asarray
()
w
=
np
.
array
(
B_cnd
)
assert
(
u
==
v
)
.
all
()
assert
(
u
==
w
)
.
all
()
def
test2
(
shape
=
(
3
,
4
,
5
)):
"""
Make sure that the gnumpy conversion is exact from CudaNdarray to
garray back to CudaNdarray.
"""
gpu
=
theano
.
sandbox
.
cuda
.
basic_ops
.
gpu_from_host
U
=
gpu
(
theano
.
tensor
.
ftensor3
(
'U'
))
theano
.
function
([
U
],
gpu
(
U
+
1
))
A
=
np
.
random
.
rand
(
*
shape
)
.
astype
(
'float32'
)
A_cnd
=
theano
.
sandbox
.
cuda
.
CudaNdarray
(
A
)
A_gar
=
cudandarray_to_garray
(
A_cnd
)
assert
A_cnd
.
shape
==
A_gar
.
shape
# dtype always float32
# garray don't have strides
B
=
garray_to_cudandarray
(
A_gar
)
assert
A_cnd
.
shape
==
B
.
shape
# dtype always float32
assert
A_cnd
.
_strides
==
B
.
_strides
assert
A_cnd
.
gpudata
==
B
.
gpudata
v
=
np
.
asarray
(
B
)
assert
(
v
==
A
)
.
all
()
def
test_broadcast_dims
():
"""
Test with some dimensions being 1.
CudaNdarray use 0 for strides for those dimensions.
"""
test
((
1
,
2
,
3
))
test
((
2
,
1
,
3
))
test
((
2
,
3
,
1
))
test2
((
1
,
2
,
3
))
test2
((
2
,
1
,
3
))
test2
((
2
,
3
,
1
))
theano/misc/tests/test_may_share_memory.py
浏览文件 @
9cde027a
"""
test the tensor and sparse type. The CudaNdarray type is tested in
sandbox/cuda/tests/test_tensor_op.py.test_may_share_memory_cuda
test the tensor and sparse type. (gpuarray is tested in the gpuarray folder).
"""
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
as
np
...
...
@@ -15,9 +14,7 @@ except ImportError:
from
theano.misc.may_share_memory
import
may_share_memory
def
test_may_share_memory
():
a
=
np
.
random
.
rand
(
5
,
4
)
b
=
np
.
random
.
rand
(
5
,
4
)
def
may_share_memory_core
(
a
,
b
):
va
=
a
.
view
()
vb
=
b
.
view
()
ra
=
a
.
reshape
((
4
,
5
))
...
...
@@ -51,6 +48,13 @@ def test_may_share_memory():
except
TypeError
:
pass
def
test_may_share_memory
():
a
=
np
.
random
.
rand
(
5
,
4
)
b
=
np
.
random
.
rand
(
5
,
4
)
may_share_memory_core
(
a
,
b
)
if
scipy_imported
:
def
test_may_share_memory_scipy
():
a
=
scipy
.
sparse
.
csc_matrix
(
scipy
.
sparse
.
eye
(
5
,
3
))
...
...
theano/misc/tests/test_pkl_utils.py
浏览文件 @
9cde027a
...
...
@@ -5,13 +5,9 @@ import unittest
from
tempfile
import
mkdtemp
import
numpy
as
np
from
nose.plugins.skip
import
SkipTest
import
theano
import
theano.sandbox.cuda
as
cuda_ndarray
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.var
import
CudaNdarraySharedVariable
from
theano.sandbox.rng_mrg
import
MRG_RandomStreams
from
theano.misc.pkl_utils
import
dump
,
load
,
StripPickler
...
...
@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase):
if
self
.
tmpdir
is
not
None
:
shutil
.
rmtree
(
self
.
tmpdir
)
def
test_dump_load
(
self
):
if
not
cuda_ndarray
.
cuda_enabled
:
raise
SkipTest
(
'Optional package cuda disabled'
)
x
=
CudaNdarraySharedVariable
(
'x'
,
CudaNdarrayType
((
1
,
1
),
name
=
'x'
),
[[
1
]],
False
)
with
open
(
'test'
,
'wb'
)
as
f
:
dump
(
x
,
f
)
with
open
(
'test'
,
'rb'
)
as
f
:
x
=
load
(
f
)
assert
x
.
name
==
'x'
np
.
testing
.
assert_allclose
(
x
.
get_value
(),
[[
1
]])
def
test_dump_load_mrg
(
self
):
rng
=
MRG_RandomStreams
(
use_cuda
=
cuda_ndarray
.
cuda_enabled
)
rng
=
MRG_RandomStreams
()
with
open
(
'test'
,
'wb'
)
as
f
:
dump
(
rng
,
f
)
...
...
theano/misc/tests/test_pycuda_example.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
as
np
import
theano
import
theano.misc.pycuda_init
if
not
theano
.
misc
.
pycuda_init
.
pycuda_available
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
"Pycuda not installed. Skip test of theano op"
" with pycuda code."
)
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor
as
T
from
theano.misc.pycuda_example
import
(
PycudaElemwiseSourceModuleOp
,
PycudaElemwiseSourceModuleMakeThunkOp
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpu'
)
def
test_pycuda_elemwise_source_module
():
for
shape
in
[(
5
,
5
),
(
10
,
49
),
(
50
,
49
),
(
500
,
501
)]:
for
op
in
[
theano
.
scalar
.
basic
.
mul
,
theano
.
scalar
.
basic
.
add
]:
x
=
T
.
fmatrix
(
'x'
)
y
=
T
.
fmatrix
(
'y'
)
elemwise_op
=
theano
.
tensor
.
Elemwise
(
op
)
pycuda_op
=
PycudaElemwiseSourceModuleOp
(
op
)
pycuda_op_thunk
=
PycudaElemwiseSourceModuleMakeThunkOp
(
op
)
f
=
theano
.
function
([
x
,
y
],
elemwise_op
(
x
,
y
),
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
x
,
y
],
theano
.
sandbox
.
cuda
.
host_from_gpu
(
pycuda_op
(
x
,
y
)),
mode
=
mode_with_gpu
)
mode_pycuda
=
mode_with_gpu
.
including
(
"local_pycuda_gpu_elemwise"
)
f3
=
theano
.
function
([
x
,
y
],
elemwise_op
(
x
,
y
),
mode
=
mode_pycuda
)
f4
=
theano
.
function
([
x
,
y
],
theano
.
sandbox
.
cuda
.
host_from_gpu
(
pycuda_op_thunk
(
x
,
y
)),
mode
=
mode_with_gpu
)
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuElemwise
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleOp
)
for
node
in
f2
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleOp
)
for
node
in
f3
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleMakeThunkOp
)
for
node
in
f4
.
maker
.
fgraph
.
toposort
()])
val1
=
np
.
asarray
(
np
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
val2
=
np
.
asarray
(
np
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
assert
np
.
allclose
(
f
(
val1
,
val2
),
f2
(
val1
,
val2
))
assert
np
.
allclose
(
f
(
val1
,
val2
),
f3
(
val1
,
val2
))
assert
np
.
allclose
(
f
(
val1
,
val2
),
f4
(
val1
,
val2
))
# print f(val1,val2)
# print f2(val1,val2)
"""
#commented as it work only with old pycuda version.
def test_pycuda_elemwise_kernel():
x = T.fmatrix('x')
y = T.fmatrix('y')
f = theano.function([x, y], x + y, mode=mode_with_gpu)
print(f.maker.fgraph.toposort())
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print(f2.maker.fgraph.toposort())
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
for node in f.maker.fgraph.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.fgraph.toposort()])
val1 = np.asarray(np.random.rand(5, 5), dtype='float32')
val2 = np.asarray(np.random.rand(5, 5), dtype='float32')
#val1 = np.ones((5,5))
#val2 = np.arange(25).reshape(5,5)
assert (f(val1, val2) == f2(val1, val2)).all()
print(f(val1, val2))
print(f2(val1, val2))
x3 = T.ftensor3('x')
y3 = T.ftensor3('y')
z3 = T.ftensor3('y')
f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
print(f4.maker.fgraph.toposort())
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.fgraph.toposort()])
val1 = np.random.rand(2, 2, 2)
print(val1)
print(f4(val1, val1, val1))
assert np.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
theano/misc/tests/test_pycuda_theano_simple.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
"""
This file is an example of view the memory allocated by pycuda in a GpuArray
in a CudaNdarray to be able to use it in Theano.
This also serve as a test for the function: cuda_ndarray.from_gpu_pointer
"""
from
__future__
import
absolute_import
,
print_function
,
division
import
sys
import
numpy
as
np
import
theano
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.misc.pycuda_init
if
not
theano
.
misc
.
pycuda_init
.
pycuda_available
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
"Pycuda not installed."
" We skip tests of Theano Ops with pycuda code."
)
if
cuda_ndarray
.
cuda_available
is
False
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
'Optional theano package cuda disabled'
)
import
pycuda
import
pycuda.driver
as
drv
import
pycuda.gpuarray
def test_pycuda_only():
    """Run a pycuda-only example to test that pycuda works.

    Compiles a trivial elementwise-multiply CUDA kernel with pycuda's
    SourceModule and launches it on two random float32 vectors held in
    numpy arrays; ``drv.In``/``drv.Out`` handle the host<->device copies.
    No Theano objects are involved here.
    """
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    dest = np.zeros_like(a)
    # BUG FIX: the launch previously used block=(400, 1, 1) while the
    # buffers hold only 100 elements, so threads 100..399 read and wrote
    # past the end of the device allocations (undefined behavior).
    # Launch exactly one thread per element instead.
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(int(a.size), 1, 1), grid=(1, 1))

    assert (dest == a * b).all()
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object.

    Same elementwise-multiply kernel as ``test_pycuda_only``, but the
    device buffers are Theano ``CudaNdarray`` objects passed straight to
    the pycuda kernel, checking the two libraries can share GPU memory.
    """
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    # BUG FIX: the launch previously used block=(400, 1, 1) while the
    # CudaNdarrays hold only 100 elements, so threads 100..399 accessed
    # memory past the end of the device allocations (undefined behavior).
    # Launch exactly one thread per element instead.
    multiply_them(dest, ga, gb,
                  block=(int(a.size), 1, 1), grid=(1, 1))
    assert (np.asarray(dest) == a * b).all()
def test_pycuda_memory_to_theano():
    # Test that we can use the GpuArray memory space in pycuda in a CudaNdarray.
    #
    # The test wraps the raw device pointer of a pycuda GPUArray in a
    # CudaNdarray via cuda_ndarray.from_gpu_pointer and checks:
    #   * the wrapper keeps exactly one extra reference to the GPUArray,
    #   * views of the wrapper add/drop references correctly,
    #   * in-place arithmetic on the wrapper writes into pycuda's memory.
    # NOTE(review): the exact sys.getrefcount assertions make this function
    # sensitive to any extra local binding of `y` — do not restructure.
    y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    print(sys.getrefcount(y))
    # This increases the ref count with newer pycuda.  Does pycuda also
    # cache the ndarray?  (left from the original author)
    # print y.get()
    initial_refcount = sys.getrefcount(y)
    print("gpuarray ref count before creating a CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
    rand = np.random.randn(*y.shape).astype(np.float32)
    cuda_rand = cuda_ndarray.CudaNdarray(rand)

    # Build C-contiguous strides (in elements, innermost stride == 1)
    # for y's shape; must match what CudaNdarray computes internally.
    strides = [1]
    for i in y.shape[::-1][:-1]:
        strides.append(strides[-1] * i)
    strides = tuple(strides[::-1])
    print('strides', strides)
    assert cuda_rand._strides == strides, (cuda_rand._strides, strides)

    # in pycuda trunk, y.ptr also works, which is a little cleaner
    y_ptr = int(y.gpudata)
    # Passing `y` as the base keeps the pycuda allocation alive as long
    # as the CudaNdarray view exists.
    z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
    print("gpuarray ref count after creating a CudaNdarray",
          sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount + 1
    assert (np.asarray(z) == 0).all()
    assert z.base is y

    # Test that we can take a view from this cuda view on pycuda memory.
    zz = z.view()
    assert sys.getrefcount(y) == initial_refcount + 2
    assert zz.base is y
    del zz
    assert sys.getrefcount(y) == initial_refcount + 1

    # In-place add (broadcasted ones) must write through to y's memory.
    cuda_ones = cuda_ndarray.CudaNdarray(np.asarray([[[1]]],
                                                    dtype='float32'))
    z += cuda_ones
    assert (np.asarray(z) == np.ones(y.shape)).all()
    assert (np.asarray(z) == 1).all()

    assert cuda_rand.shape == z.shape
    assert cuda_rand._strides == z._strides, (cuda_rand._strides,
                                              z._strides)
    assert (np.asarray(cuda_rand) == rand).all()
    z += cuda_rand
    assert (np.asarray(z) == (rand + 1)).all()

    # Check that the ref count to the gpuarray is right.
    del z
    print("gpuarray ref count after deleting the CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
theano/misc/tests/test_pycuda_utils.py
deleted
100644 → 0
浏览文件 @
80a1e8e0
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
as
np
import
theano.sandbox.cuda
as
cuda
import
theano.misc.pycuda_init
if
not
theano
.
misc
.
pycuda_init
.
pycuda_available
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
"Pycuda not installed. Skip test of theano op with pycuda "
"code."
)
if
cuda
.
cuda_available
is
False
:
# noqa
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
'Optional theano package cuda disabled'
)
from
theano.misc.pycuda_utils
import
to_gpuarray
,
to_cudandarray
import
pycuda.gpuarray
def test_to_gpuarray():
    """Check that ``to_gpuarray`` wraps a CudaNdarray as a pycuda GPUArray.

    Contiguous input: the two objects must alias the same device memory.
    Strided input with ``copyif=True``: a copy is made; without it a
    ValueError is raised.
    """
    c_arr = cuda.CudaNdarray.zeros((5, 4))
    p_arr = to_gpuarray(c_arr)
    assert isinstance(p_arr, pycuda.gpuarray.GPUArray)

    c_arr[0, 0] = np.asarray(1, dtype="float32")
    # Check that they share the same memory space: the write above must
    # be visible through the pycuda wrapper.
    assert p_arr.gpudata == c_arr.gpudata
    assert np.asarray(c_arr[0, 0]) == 1
    assert np.allclose(np.asarray(c_arr), p_arr.get())
    assert p_arr.dtype == c_arr.dtype
    assert p_arr.shape == c_arr.shape
    # CudaNdarray strides are in elements, pycuda's in bytes (x4 float32).
    assert all(np.asarray(c_arr._strides) * 4 == p_arr.strides)

    # Test when the CudaNdarray is strided.
    c_arr = c_arr[::2, ::]
    p_arr = to_gpuarray(c_arr, copyif=True)
    assert isinstance(p_arr, pycuda.gpuarray.GPUArray)

    c_arr[0, 0] = np.asarray(2, dtype="float32")
    # Check that they do not share the same memory space: the copy made
    # by copyif=True must be independent of later writes.
    assert p_arr.gpudata != c_arr.gpudata
    assert np.asarray(c_arr[0, 0]) == 2
    assert not np.allclose(np.asarray(c_arr), p_arr.get())
    assert p_arr.dtype == c_arr.dtype
    assert p_arr.shape == c_arr.shape
    assert not all(np.asarray(c_arr._strides) * 4 == p_arr.strides)

    # Without copyif, a strided input must raise ValueError.
    raised = False
    try:
        p_arr = to_gpuarray(c_arr)
    except ValueError:
        raised = True
    assert raised
def test_to_cudandarray():
    """Check that ``to_cudandarray`` wraps a pycuda GPUArray as a CudaNdarray.

    A float32 GPUArray must convert with matching data, dtype, shape and
    strides; float64 arrays and plain numpy arrays must raise ValueError.
    """
    p_arr = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    c_arr = to_cudandarray(p_arr)
    assert isinstance(c_arr, cuda.CudaNdarray)
    assert np.allclose(p_arr.get(), np.asarray(c_arr))
    assert p_arr.dtype == c_arr.dtype
    assert p_arr.shape == c_arr.shape
    # CudaNdarray strides are in elements, pycuda's in bytes (x4 float32).
    assert all(np.asarray(c_arr._strides) * 4 == p_arr.strides)

    # Only float32 is supported: float64 input must raise ValueError.
    raised = False
    try:
        to_cudandarray(pycuda.gpuarray.zeros((3, 4, 5), 'float64'))
    except ValueError:
        raised = True
    assert raised

    # A host numpy array is not a GPUArray: must raise ValueError too.
    raised = False
    try:
        to_cudandarray(np.zeros(4))
    except ValueError:
        raised = True
    assert raised
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论