提交 9cde027a authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Remove tentacles in misc.

上级 80a1e8e0
from .config import test_ctx_name
from ..type import get_context, GpuArrayType, GpuArraySharedVariable
import pygpu
import numpy as np
from theano.misc.tests.test_may_share_memory import may_share_memory_core
from theano.misc.pkl_utils import dump, load
def test_may_share_memory():
    """Smoke-test the shared memory-overlap core check on two GPU arrays."""
    context = get_context(test_ctx_name)
    first = pygpu.empty((5, 4), context=context)
    second = pygpu.empty((5, 4), context=context)
    # may_share_memory_core raises/asserts internally on failure.
    may_share_memory_core(first, second)
def test_dump_load():
    """Round-trip a GpuArraySharedVariable through dump()/load().

    Verifies that the variable's name and value survive pickling.
    Fix: the original left the pickle file ``test`` behind in the working
    directory; it is now removed in a ``finally`` block.
    """
    import os

    x = GpuArraySharedVariable('x',
                               GpuArrayType('float32', (1, 1), name='x',
                                            context_name=test_ctx_name),
                               [[1]], False)
    fname = 'test'
    try:
        with open(fname, 'wb') as f:
            dump(x, f)
        with open(fname, 'rb') as f:
            x = load(f)
        assert x.name == 'x'
        np.testing.assert_allclose(x.get_value(), [[1]])
    finally:
        # Don't leave the pickle artifact behind.
        if os.path.exists(fname):
            os.remove(fname)
...@@ -168,8 +168,8 @@ class IfElse(Op): ...@@ -168,8 +168,8 @@ class IfElse(Op):
) )
c = theano.tensor.as_tensor_variable(c) c = theano.tensor.as_tensor_variable(c)
if not self.gpu: if not self.gpu:
# When gpu is true, we are given only cuda ndarrays, and we want # When gpu is true, we are given only gpuarrays, and we want
# to keep them be cuda ndarrays # to keep them as gpuarrays
nw_args = [] nw_args = []
for x in args: for x in args:
if hasattr(x, '_as_TensorVariable'): if hasattr(x, '_as_TensorVariable'):
......
...@@ -11,7 +11,6 @@ import os ...@@ -11,7 +11,6 @@ import os
import sys import sys
import time import time
from optparse import OptionParser from optparse import OptionParser
import subprocess
import numpy as np import numpy as np
import theano import theano
...@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
print('Numpy dot module:', np.dot.__module__) print('Numpy dot module:', np.dot.__module__)
print('Numpy location:', np.__file__) print('Numpy location:', np.__file__)
print('Numpy version:', np.__version__) print('Numpy version:', np.__version__)
if (theano.config.device.startswith("gpu") or
theano.config.init_gpu_device.startswith("gpu")):
print('nvcc version:')
subprocess.call((theano.sandbox.cuda.nvcc_compiler.nvcc_path,
"--version"))
print()
a = theano.shared(np.ones((M, N), dtype=theano.config.floatX, a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
order=order)) order=order))
...@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
f() # Ignore first function call to get representative time. f() # Ignore first function call to get representative time.
if execute: if execute:
sync = (hasattr(theano, "sandbox") and sync = (hasattr(theano, "gpuarray") and
hasattr(theano.sandbox, "cuda") and isinstance(c, theano.gpuarray.GpuArraySharedVariable))
isinstance(c, theano.sandbox.cuda.CudaNdarraySharedVariable)) if sync:
sync2 = (hasattr(theano, "gpuarray") and # Make sure we don't include the time from the first call
isinstance(c, theano.gpuarray.GpuArraySharedVariable)) c.get_value(borrow=True, return_internal_type=True).sync()
t0 = time.time() t0 = time.time()
for i in range(iters): for i in range(iters):
f() f()
if sync: if sync:
theano.sandbox.cuda.synchronize()
if sync2:
c.get_value(borrow=True, return_internal_type=True).sync() c.get_value(borrow=True, return_internal_type=True).sync()
t1 = time.time() t1 = time.time()
return t1 - t0, impl return t1 - t0, impl
...@@ -199,87 +190,32 @@ if __name__ == "__main__": ...@@ -199,87 +190,32 @@ if __name__ == "__main__":
goto2 1.13/8 1.94s goto2 1.13/8 1.94s
goto2 1.13/16 3.16s goto2 1.13/16 3.16s
Test time in float32 Test time in float32. There were 10 executions of gemm in
float32 with matrices of shape 5000x5000 (M=N=K=5000)
cuda version 6.5 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s 0.06s
K40 0.07s
K20m/ECC 0.08s 0.08s 0.07s
K20/NOECC 0.07s
M2090 0.19s
C2075 0.25s
M2075 0.25s
M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s
C1060 0.46s
K600 1.04s
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 980 0.06s
GTX 970 0.08s
GTX 680 0.11s 0.12s 0.154s 0.218s
GRID K520 0.14s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 750 Ti 0.20s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 765M 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GT 610 2.38s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
GT 520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
Results for larger matrices.
There were 10 executions of gemm in float32
with matrices of shape 5000x5000 (M=N=K=5000).
All memory layout was in C order. All memory layout was in C order.
cuda version 7.5 7.0 6.5
cuda version 8.0 7.5 7.0
gpu gpu
M40 0.47s M40 0.45s 0.47s
k80 0.96s k80 0.92s 0.96s
K6000/NOECC 0.69s K6000/NOECC 0.71s 0.69s
K40 0.88s P6000/NOECC 0.25s
K20m/ECC
K20/NOECC Titan X (Pascal) 0.28s
M2090 GTX Titan X 0.45s 0.45s 0.47s
C2075 GTX Titan Black 0.66s 0.64s 0.64s
M2075 GTX 1080 0.35s
M2070 GTX 980 Ti 0.41s
M2070-Q GTX 970 0.66s
M2050(Amazon) GTX 680 1.57s
C1060 GTX 750 Ti 2.01s 2.01s
K600 GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX Titan X 0.45s 0.47s GTX 580 2.42s
GTX Titan Black 0.64s 0.64s GTX 480 2.87s
GTX Titan(D15U-50) TX1 7.6s (float32 storage and computation)
GTX 780 GT 610 33.5s
GTX 980 Ti 0.41s
GTX 980
GTX 970 0.66s
GTX 680 1.57s
GRID K520
GTX 750 Ti 2.01s 2.01s
GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX 580 2.42s 2.47s
GTX 480 2.87s 2.88s
TX1 7.6s (float32 storage and computation)
GT 610 33.5s
""") """)
if options.M == 0: if options.M == 0:
......
"""
This code can only work if cudamat and theano are initialized on the
same gpu as theano.
WARNING: In the test of this file there is a transpose that is used...
So there can be problem with shape and stride order...
"""
from __future__ import absolute_import, print_function, division
import six
try:
import cudamat
cudamat_available = True
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_cudamat(x, copyif=False):
""" take a CudaNdarray and return a cudamat.CUDAMatrix object.
:type x: CudaNdarray
:param x: The array to transform to cudamat.CUDAMatrix.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if not isinstance(x, cuda.CudaNdarray):
raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")
elif x.ndim != 2:
raise TypeError("cudandarray_to_cudamat: input must be 2-d (has %s dims). That's "
"because cudamat arrays are always 2-dimensional")
else:
# Check if it is c contiguous
size = 1
c_contiguous = True
for i in range(x.ndim - 1, -1, -1):
if x.shape[i] == 1:
continue
if x._strides[i] != size:
c_contiguous = False
break
size *= x.shape[i]
if not c_contiguous:
if copyif:
x = x.copy()
else:
raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat = cudamat.cudamat()
cm_mat.size[0] = x.shape[0]
cm_mat.size[1] = x.shape[1]
cm_mat.on_host = 0
cm_mat.on_device = 1
cm_mat.is_trans = 0
cm_mat.owns_data = 0 # <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import ctypes
cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))
px = cudamat.CUDAMatrix(cm_mat)
px._base = x # x won't be __del__'ed as long as px is around.
# let cudamat know that we don't have a numpy array attached.
px.mat_on_host = False
return px
def cudamat_to_cudandarray(x):
    """Take a cudamat.CUDAMatrix and make a CudaNdarray that points to its memory."""
    import ctypes

    if not isinstance(x, cudamat.CUDAMatrix):
        raise ValueError("We can transfer only cudamat.CUDAMatrix to CudaNdarray")
    # No dtype check needed: cudamat matrices are always float32.

    # Build c-contiguous strides (in elements) from the shape.
    dim_strides = [1]
    for dim in x.shape[:0:-1]:
        dim_strides.append(dim_strides[-1] * dim)
    dim_strides = tuple(reversed(dim_strides))

    # data_device is a ctypes float pointer; recover the raw address.
    raw_ptr = long(ctypes.cast(x.mat.data_device, ctypes.c_void_p).value)
    # seems legit.
    return cuda.from_gpu_pointer(raw_ptr, x.shape, dim_strides, x)
except (ImportError, OSError):
cudamat_available = False
"""
This code can only work if gnumpy and theano are initialized on the
same gpu as theano.
"""
from __future__ import absolute_import, print_function, division
import six
from six.moves import reduce
try:
import gnumpy
import cudamat
gnumpy_available = True
___const_garray = gnumpy.rand(1)
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_garray(x, copyif=False):
""" take a CudaNdarray and return a gnumpy.garray object.
:type x: CudaNdarray
:param x: The array to transform to gnumpy.garray.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if not isinstance(x, cuda.CudaNdarray):
raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")
else:
# Check if it is c contiguous
size = 1
c_contiguous = True
for i in range(x.ndim - 1, -1, -1):
if x.shape[i] == 1:
continue
if x._strides[i] != size:
c_contiguous = False
break
size *= x.shape[i]
if not c_contiguous:
if copyif:
x = x.copy()
else:
raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat = cudamat.cudamat()
cm_mat.size[0] = reduce(lambda x, y: x * y, x.shape, 1)
cm_mat.size[1] = 1
cm_mat.on_host = 0
cm_mat.on_device = 1
cm_mat.is_trans = 0
cm_mat.owns_data = 0 # <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import ctypes
cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))
px = cudamat.CUDAMatrix(cm_mat)
px._base = x # x won't be freed if the cudamat object isn't freed.
# let cudamat know that we don't have a numpy array attached.
px.mat_on_host = False
# Note how gnumpy tracks its cudamat objects: it moves things to the
# _cmsReuseCache when the gnumpy array is deleted, thus the arrays
# returned by theano will never be deleted.
# However, if the garray thinks that the object is a view, then it won't
# move the _base to the _cmsResueCache; so the cudamat object will be deleted,
# and we won't overpump the world with memory.
_is_alias_of = ___const_garray
ans = gnumpy.garray(px,
x.shape,
_is_alias_of)
return ans
def garray_to_cudandarray(x):
    """Take a gnumpy.garray and make a CudaNdarray that points to its memory."""
    import ctypes

    if not isinstance(x, gnumpy.garray):
        raise ValueError("We can transfer only gnumpy.garray to CudaNdarray")
    # No dtype check needed: cudamat (gnumpy's backend) is always float32.

    # Build c-contiguous strides (in elements) from the shape.
    dim_strides = [1]
    for dim in x.shape[:0:-1]:
        dim_strides.append(dim_strides[-1] * dim)
    dim_strides = list(reversed(dim_strides))
    # Length-1 axes are broadcastable: give them stride 0.
    for axis, dim in enumerate(x.shape):
        if dim == 1:
            dim_strides[axis] = 0
    dim_strides = tuple(dim_strides)

    # The garray's storage lives in its _base cudamat matrix.
    raw_ptr = long(ctypes.cast(x._base.mat.data_device, ctypes.c_void_p).value)
    # seems legit.
    return cuda.from_gpu_pointer(raw_ptr, x.shape, dim_strides, x._base)
except (ImportError, OSError):
gnumpy_available = False
""" """
Function to detect memory sharing for ndarray AND sparse type AND CudaNdarray. Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
numpy version support only ndarray. numpy version support only ndarray.
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
...@@ -14,25 +14,12 @@ try: ...@@ -14,25 +14,12 @@ try:
def _is_sparse(a): def _is_sparse(a):
return scipy.sparse.issparse(a) return scipy.sparse.issparse(a)
except ImportError: except ImportError:
# scipy not imported, their can be only ndarray and cudandarray # scipy not imported, their can be only ndarray and gpuarray
def _is_sparse(a): def _is_sparse(a):
return False return False
from theano.sandbox import cuda
from theano import gpuarray from theano import gpuarray
if cuda.cuda_available:
from theano.sandbox.cuda.type import CudaNdarrayType
def _is_cuda(a):
return isinstance(a, cuda.CudaNdarray)
else:
def _is_cuda(a):
return False
__docformat__ = "restructuredtext en"
if gpuarray.pygpu: if gpuarray.pygpu:
def _is_gpua(a): def _is_gpua(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray) return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
...@@ -40,16 +27,14 @@ else: ...@@ -40,16 +27,14 @@ else:
def _is_gpua(a): def _is_gpua(a):
return False return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True): def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray) a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray) b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray: if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b) return TensorType.may_share_memory(a, b)
a_cuda = _is_cuda(a)
b_cuda = _is_cuda(b)
if a_cuda and b_cuda:
return CudaNdarrayType.may_share_memory(a, b)
a_gpua = _is_gpua(a) a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b) b_gpua = _is_gpua(b)
if a_gpua and b_gpua: if a_gpua and b_gpua:
...@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True): ...@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True):
a_sparse = _is_sparse(a) a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b) b_sparse = _is_sparse(b)
if (not(a_ndarray or a_sparse or a_cuda or a_gpua) or if (not(a_ndarray or a_sparse or a_gpua) or
not(b_ndarray or b_sparse or b_cuda or b_gpua)): not(b_ndarray or b_sparse or b_gpua)):
if raise_other_type: if raise_other_type:
raise TypeError("may_share_memory support only ndarray" raise TypeError("may_share_memory support only ndarray"
" and scipy.sparse, CudaNdarray or GpuArray type") " and scipy.sparse or GpuArray type")
return False return False
if a_cuda or b_cuda or a_gpua or b_gpua: if a_gpua or b_gpua:
return False return False
return SparseType.may_share_memory(a, b) return SparseType.may_share_memory(a, b)
...@@ -26,11 +26,11 @@ from theano import config ...@@ -26,11 +26,11 @@ from theano import config
from theano.compat import PY3 from theano.compat import PY3
from six import string_types from six import string_types
from theano.compile.sharedvalue import SharedVariable from theano.compile.sharedvalue import SharedVariable
try: try:
from theano.sandbox.cuda import cuda_ndarray import pygpu
except ImportError: except ImportError:
cuda_ndarray = None pygpu = None
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
__authors__ = "Pascal Lamblin" __authors__ = "Pascal Lamblin"
...@@ -202,21 +202,21 @@ class PersistentNdarrayID(object): ...@@ -202,21 +202,21 @@ class PersistentNdarrayID(object):
return self.seen[id(obj)] return self.seen[id(obj)]
class PersistentCudaNdarrayID(PersistentNdarrayID): class PersistentGpuArrayID(PersistentNdarrayID):
def __call__(self, obj): def __call__(self, obj):
if (cuda_ndarray is not None and if (pygpu and
type(obj) is cuda_ndarray.cuda_ndarray.CudaNdarray): isinstance(obj, pygpu.gpuarray.GpuArray)):
if id(obj) not in self.seen: if id(obj) not in self.seen:
def write_array(f): def write_array(f):
np.lib.format.write_array(f, np.asarray(obj)) np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj) name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name) zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = 'cuda_ndarray.{0}'.format(name) self.seen[id(obj)] = 'gpuarray.{0}'.format(name)
return self.seen[id(obj)] return self.seen[id(obj)]
return super(PersistentCudaNdarrayID, self).__call__(obj) return super(PersistentGpuArrayID, self).__call__(obj)
class PersistentSharedVariableID(PersistentCudaNdarrayID): class PersistentSharedVariableID(PersistentGpuArrayID):
"""Uses shared variable names when persisting to zip file. """Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the If a shared variable has a name, this name is used as the name of the
...@@ -288,18 +288,16 @@ class PersistentNdarrayLoad(object): ...@@ -288,18 +288,16 @@ class PersistentNdarrayLoad(object):
return self.cache[name] return self.cache[name]
ret = None ret = None
array = np.lib.format.read_array(self.zip_file.open(name)) array = np.lib.format.read_array(self.zip_file.open(name))
if array_type == 'cuda_ndarray': if array_type == 'gpuarray':
if config.experimental.unpickle_gpu_on_cpu: if config.experimental.unpickle_gpu_on_cpu:
# directly return numpy array # directly return numpy array
warnings.warn("config.experimental.unpickle_gpu_on_cpu is set " warnings.warn("config.experimental.unpickle_gpu_on_cpu is set "
"to True. Unpickling CudaNdarray as " "to True. Unpickling GpuArray as numpy.ndarray")
"numpy.ndarray")
ret = array ret = array
elif cuda_ndarray: elif pygpu:
ret = cuda_ndarray.cuda_ndarray.CudaNdarray(array) ret = pygpu.array(array)
else: else:
raise ImportError("Cuda not found. Cannot unpickle " raise ImportError("pygpu not found. Cannot unpickle GpuArray")
"CudaNdarray")
else: else:
ret = array ret = array
self.cache[name] = ret self.cache[name] = ret
......
"""This file show how we can use Pycuda compiled fct in a Theano
Op. Do no use those op in production code. See the TODO.
You can use them as a guide to use your pycuda code into a Theano op.
The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code
generated with pycuda.compiler.SourceModule
Their is a test in test_pycuda.py.
This don't work with broadcast and non-contiguous memory as pycuda
don't support that, but we make sure we don't introduce problem.
If the memory is non-contiguous, we create a new copy that is contiguous.
If their is broadcasted dimensions, we raise an error.
#The following is commented as it work only with old pycuda version
The PycudaElemwiseKernelOp op use pycuda code generated with
pycuda.elementwise.ElementwiseKernel. It must be wrapper by
TheanoElementwiseKernel.
"""
from __future__ import absolute_import, print_function, division
from itertools import chain
import numpy as np
import theano
from six.moves import xrange
from theano.compat import izip
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.opt import gpu_seqopt
from theano.misc.frozendict import frozendict
import pycuda
from pycuda.compiler import SourceModule
import pycuda.gpuarray
from . import pycuda_init
# Guard at import time: every op in this module needs a working PyCUDA,
# so fail loudly if pycuda_init could not set one up.
if not pycuda_init.pycuda_available:
raise Exception("No pycuda available. You can't load pycuda_example.py")
def _replace_npy_types(c_arg):
c_arg = c_arg.replace('npy_float32', 'float')
c_arg = c_arg.replace('npy_float64', 'double')
c_arg = c_arg.replace('npy_int32', 'int')
c_arg = c_arg.replace('npy_int8', 'char')
c_arg = c_arg.replace('npy_ucs4', 'unsigned int')
c_arg = c_arg.replace('npy_uint32', 'unsigned int')
c_arg = c_arg.replace('npy_uint16', 'unsigned short')
c_arg = c_arg.replace('npy_uint8', 'unsigned char')
return c_arg
def theano_parse_c_arg(c_arg):
    """Parse a C argument declaration with pycuda, after mapping numpy
    typedef names to plain C type names."""
    return pycuda.tools.parse_c_arg(_replace_npy_types(c_arg))
"""
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
def __init__(self, arguments, operation,
name="kernel", keep=False, options=None, **kwargs):
if options is None:
options = []
if isinstance(arguments, string_types):
arguments = [theano_parse_c_arg(arg)
for arg in arguments.split(",")]
pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
operation, name, keep,
options, **kwargs)
def __call__(self, *args):
vectors = []
invocation_args = []
for arg, arg_descr in zip(args, self.gen_kwargs["arguments"]):
if isinstance(arg_descr, VectorArg):
vectors.append(arg)
invocation_args.append(arg.gpudata)
else:
invocation_args.append(arg)
repr_vec = vectors[0]
invocation_args.append(repr_vec.mem_size)
if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
self.func.set_block_shape(*repr_vec._block)
self.func.prepared_call(repr_vec._grid, *invocation_args)
else:
_grid, _block = pycuda.gpuarray.splay(repr_vec.mem_size)
self.func.set_block_shape(*_block)
self.func.prepared_call(_grid, *invocation_args)
class PycudaElemwiseKernelOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern=None, name=None):
if inplace_pattern is None:
inplace_pattern = {}
self.name = name
self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern
def __str__(self):
if self.name is None:
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
str(items))
else:
return self.__class__.__name__ + "{%s}" % (self.scalar_op)
else:
return self.name
def __eq__(self, other):
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern)
def __hash__(self):
return (hash(type(self)) ^ hash(self.scalar_op) ^
hash_from_dict(self.inplace_pattern))
def make_node(self, *inputs):
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs) == 2 # TODO remove
# output is broadcastable only along dimensions where all inputs are
# broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout == 1
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]"for n in in_name]),
tuple(n + "[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs, out_name))]),
c_code,
"pycuda_elemwise_kernel_%s" % str(self.scalar_op),
preamble=("#include<Python.h>\n"
"#include <numpy/arrayobject.h>"))
return out_node
def perform(self, node, inputs, out):
#TODO assert all input have the same shape
z, = out
if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
i = inputs + z
self.pycuda_fct(*i)
"""
class PycudaElemwiseSourceModuleOp(GpuOp):
    """Elemwise GPU op whose CUDA kernel is built with pycuda SourceModule.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source; statement grouping follows the obvious Python reading.
    """

    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)
    __props__ = ("scalar_op", "inplace_pattern")

    def __init__(self, scalar_op, inplace_pattern=None, name=None):
        if inplace_pattern is None:
            inplace_pattern = frozendict({})
        self.name = name
        self.scalar_op = scalar_op
        self.inplace_pattern = frozendict(inplace_pattern)

    def __str__(self):
        # An explicit name wins over the generated description.
        if self.name is not None:
            return self.name
        if not self.inplace_pattern:
            return self.__class__.__name__ + "{%s}" % (self.scalar_op)
        pattern_items = list(self.inplace_pattern.items())
        pattern_items.sort()
        return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
                                                     str(pattern_items))

    def make_node(self, *inputs):
        # Inputs must be contiguous CudaNdarray variables of equal rank,
        # with no broadcastable dimensions (pycuda limitation).
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('different ranks among inputs')
        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
        assert len(inputs) == 2  # TODO remove

        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        assert self.nout == 1
        kernel_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])

        # Generate the scalar C code and wrap it in a flat elemwise kernel.
        input_names = ["i" + str(id) for id in range(len(inputs))]
        output_names = ["o" + str(id) for id in range(self.nout)]
        c_code = self.scalar_op.c_code(out_node, "some_name",
                                       tuple([n + "[i]" for n in input_names]),
                                       tuple(n + "[i]" for n in output_names), {})
        params = [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
                  for var, name in chain(izip(inputs, input_names),
                                         izip(out_node.outputs, output_names))]
        c_code_param = ", ".join(params + ["int size"])
        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (kernel_name, c_code_param, c_code))
        self.pycuda_fct = mod.get_function(kernel_name)
        return out_node

    def perform(self, node, inputs, out):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        z, = out
        # (Re)allocate the output when missing, mis-shaped or strided.
        if (z[0] is None or
                z[0].shape != inputs[0].shape or
                not z[0].is_c_contiguous()):
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
        if inputs[0].shape != inputs[1].shape:
            raise TypeError("PycudaElemwiseSourceModuleOp:"
                            " inputs don't have the same shape!")
        # Launch configuration: 512-thread blocks for big inputs, otherwise
        # one block shaped like the (2-d) input.
        if inputs[0].size > 512:
            grid = (int(np.ceil(inputs[0].size / 512.)), 1)
            block = (512, 1, 1)
        else:
            grid = (1, 1)
            block = (inputs[0].shape[0], inputs[0].shape[1], 1)
        self.pycuda_fct(inputs[0], inputs[1], z[0],
                        np.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseSourceModuleMakeThunkOp(Op):
    """Elemwise op that compiles its pycuda kernel in make_thunk.

    Unlike PycudaElemwiseSourceModuleOp, the kernel is built once per thunk
    and captured in a closure instead of being stored on the op instance.
    NOTE(review): indentation reconstructed from a whitespace-mangled source.
    """

    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)
    __props__ = ("scalar_op", "inplace_pattern")

    def __init__(self, scalar_op, inplace_pattern=None, name=None):
        if inplace_pattern is None:
            inplace_pattern = {}
        self.name = name
        self.scalar_op = scalar_op
        self.inplace_pattern = frozendict(inplace_pattern)

    def __str__(self):
        # An explicit name wins over the generated description.
        if self.name is not None:
            return self.name
        if not self.inplace_pattern:
            return self.__class__.__name__ + "{%s}" % (self.scalar_op)
        pattern_items = list(self.inplace_pattern.items())
        pattern_items.sort()
        return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
                                                     str(pattern_items))

    def make_node(self, *inputs):
        assert self.nout == 1
        assert len(inputs) == 2  # TODO remove
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('different ranks among inputs')
        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        return Apply(self, _inputs, [otype() for o in xrange(self.nout)])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        kernel_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        input_names = ["i" + str(id) for id in range(len(node.inputs))]
        output_names = ["o" + str(id) for id in range(self.nout)]
        c_code = self.scalar_op.c_code(node, "some_name",
                                       tuple([n + "[i]" for n in input_names]),
                                       tuple(n + "[i]" for n in output_names), {})
        params = [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
                  for var, name in chain(izip(node.inputs, input_names),
                                         izip(node.outputs, output_names))]
        c_code_param = ", ".join(params + ["int size"])
        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (kernel_name, c_code_param, c_code))
        pycuda_fct = mod.get_function(kernel_name)
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            z = outputs[0]
            # (Re)allocate the output when missing, mis-shaped or strided.
            if (z[0] is None or
                    z[0].shape != inputs[0][0].shape or
                    not z[0].is_c_contiguous()):
                z[0] = theano.sandbox.cuda.CudaNdarray.zeros(
                    inputs[0][0].shape)
            if inputs[0][0].shape != inputs[1][0].shape:
                raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                                " inputs don't have the same shape!")
            if inputs[0][0].size > 512:
                grid = (int(np.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       np.intc(inputs[1][0].size), block=block,
                       grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp
    """
    # Only rewrite non-broadcasted GpuElemwise nodes of rank <= 2;
    # anything else is left untouched (implicit None).
    if not isinstance(node.op, GpuElemwise):
        return
    if any(any(i.type.broadcastable) for i in node.inputs):
        return
    if not all(i.ndim <= 2 for i in node.inputs):
        return
    replacement = PycudaElemwiseSourceModuleOp(
        node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
    return [replacement]


pycuda_optimizer.register("local_pycuda_gpu_elemwise",
                          local_pycuda_gpu_elemwise)
"""
@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise_kernel(node):
""
GpuElemwise -> PycudaElemwiseKernelOp
""
if isinstance(node.op, GpuElemwise):
if not any([any(i.type.broadcastable) for i in node.inputs]):
new_op = PycudaElemwiseKernelOp(node.op.scalar_op,
node.op.inplace_pattern)(
*node.inputs)
return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
local_pycuda_gpu_elemwise_kernel, 1.5)
"""
from __future__ import absolute_import, print_function, division
import os
import warnings
import theano
import theano.sandbox.cuda
from theano import config
def set_gpu_from_theano():
    """
    This set the GPU used by PyCUDA to the same as the one used by Theano.
    """
    # PyCUDA's autoinit reads CUDA_DEVICE; mirror Theano's device choice
    # there so both libraries bind the same GPU.
    if config.device.startswith("gpu") and len(config.device) > 3:
        os.environ["CUDA_DEVICE"] = theano.config.device[3:]
    elif (config.init_gpu_device.startswith("gpu") and
            len(config.init_gpu_device) > 3):
        os.environ["CUDA_DEVICE"] = theano.config.init_gpu_device[3:]
set_gpu_from_theano()

pycuda_available = False

# If theano.sandbox.cuda don't exist, it is because we are importing
# it and it try to import this file! This mean we must init the device.
# NOTE(review): indentation reconstructed from a whitespace-mangled source;
# grouping of the retry-after-LogicError branch should be confirmed
# against upstream theano/misc/pycuda_init.py.
if (not hasattr(theano.sandbox, 'cuda') or
        theano.sandbox.cuda.use.device_number is None):
    try:
        import pycuda
        import pycuda.autoinit
        pycuda_available = True
    except (ImportError, RuntimeError):
        # presumably, the user wanted to use pycuda, else they wouldn't have
        # imported this module, so issue a warning that the import failed.
        warnings.warn("PyCUDA import failed in theano.misc.pycuda_init")
    except pycuda._driver.LogicError:
        if theano.config.force_device:
            raise
        else:
            # Drop the stale device binding and retry autoinit.
            if "CUDA_DEVICE" in os.environ:
                del os.environ["CUDA_DEVICE"]
            import pycuda.autoinit
            pycuda_available = True
else:
    # The GPU is already initialized by Theano: attach to the existing
    # context instead of creating a new one.
    try:
        import pycuda.driver
        pycuda_available = True
    except ImportError:
        pass
    if pycuda_available:
        if hasattr(pycuda.driver.Context, "attach"):
            pycuda.driver.Context.attach()
            import atexit
            atexit.register(pycuda.driver.Context.pop)
        else:
            # Now we always import this file when we call
            # theano.sandbox.cuda.use. So this should not happen normally.
            # TODO: make this an error.
            warnings.warn("For some unknow reason, theano.misc.pycuda_init was"
                          " not imported before Theano initialized the GPU and"
                          " your PyCUDA version is 2011.2.2 or earlier."
                          " To fix the problem, import theano.misc.pycuda_init"
                          " manually before using/initializing the GPU, use the"
                          " Theano flag pycuda.init=True or use a"
                          " more recent version of PyCUDA.")
from __future__ import absolute_import, print_function, division
import pycuda.gpuarray
from theano.sandbox import cuda
# Fail fast at import time: every helper in this module needs the
# (optional) Theano CUDA backend.
if cuda.cuda_available is False:
    raise ImportError('Optional theano package cuda disabled')
def to_gpuarray(x, copyif=False):
    """Convert a CudaNdarray into a ``pycuda.gpuarray.GPUArray``.

    :type x: CudaNdarray
    :param x: The array to convert.
    :type copyif: bool
    :param copyif: When False, a non-c-contiguous `x` raises ValueError;
        a c-contiguous `x` yields a GPUArray sharing its memory.
        When True, a non-c-contiguous `x` is copied first (so the result
        owns separate memory); a c-contiguous `x` still shares memory.
        This is needed because GPUArray does not fully support strided
        memory.
    :return type: pycuda.gpuarray.GPUArray

    :raises ValueError: if `x` is not a CudaNdarray, or is not
        c contiguous while ``copyif`` is False.
    """
    if not isinstance(x, cuda.CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray to pycuda.gpuarray.GPUArray")
    # Decide whether x is c contiguous: walk the dims from the fastest
    # (last) axis, skipping extent-1 dims, and compare each stride with
    # the expected element count.
    expected = 1
    contiguous = True
    for axis in reversed(range(x.ndim)):
        if x.shape[axis] == 1:
            continue
        if x._strides[axis] != expected:
            contiguous = False
            break
        expected *= x.shape[axis]
    if not contiguous:
        if not copyif:
            raise ValueError("We were asked to not copy memory, but the memory is not c contiguous.")
        x = x.copy()  # the copy is c contiguous
    # x is guaranteed c contiguous here; wrap its memory without copying.
    return pycuda.gpuarray.GPUArray(x.shape, x.dtype, base=x,
                                    gpudata=x.gpudata)
def to_cudandarray(x):
    """Wrap a ``pycuda.gpuarray.GPUArray`` in a CudaNdarray view.

    The returned CudaNdarray points at the same device memory as `x`.

    :note: CudaNdarray supports only float32, so only float32 GPUArray
        inputs are accepted.
    :raises ValueError: if `x` is not a GPUArray or not float32.
    """
    if not isinstance(x, pycuda.gpuarray.GPUArray):
        raise ValueError("We can transfer only pycuda.gpuarray.GPUArray to CudaNdarray")
    elif x.dtype != "float32":
        raise ValueError("CudaNdarray support only float32")
    else:
        # Build C-contiguous strides (counted in elements) for x's shape.
        strides = [1]
        for extent in reversed(x.shape[1:]):
            strides.insert(0, strides[0] * extent)
        strides = tuple(strides)
        # In pycuda trunk, x.ptr also works, which is a little cleaner.
        ptr = int(x.gpudata)
        return cuda.from_gpu_pointer(ptr, x.shape, strides, x)
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.misc.cudamat_utils import cudamat_available
# Skip the whole module when cudamat is absent.  The message now names
# cudamat: it previously said "gnumpy"/"pycuda", which was copy-pasted
# from a different test module and misleading.
if not cudamat_available:  # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("cudamat not installed. Skip test of theano op with "
                   "cudamat code.")

from theano.misc.cudamat_utils import (cudandarray_to_cudamat,
                                       cudamat_to_cudandarray)
def test(shape=(3, 4)):
    """
    Make sure that the cudamat conversion is exact.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    sym_in = gpu(theano.tensor.fmatrix('U'))
    add_one = theano.function([sym_in], gpu(sym_in + 1))

    host_a = np.asarray(np.random.rand(*shape), dtype="float32")
    cnd_a = theano.sandbox.cuda.CudaNdarray(host_a)
    cmat_a = cudandarray_to_cudamat(cnd_a)

    # The first binding is immediately overwritten, but the call still
    # exercises the cudamat -> CudaNdarray conversion path.
    cnd_b = cudamat_to_cudandarray(cmat_a)
    cnd_b = add_one(cnd_a)

    u = cnd_a.copy()
    u += theano.sandbox.cuda.CudaNdarray(np.asarray([[1]], dtype='float32'))
    u = np.asarray(u)
    v = np.asarray(cnd_b)
    w = cmat_a.add(1).asarray()

    # All three "+1" paths (CudaNdarray, theano function, cudamat) must
    # agree exactly; cudamat returns a transposed layout, hence the .T.
    assert abs(u - v).max() == 0
    assert abs(u - w.T.reshape(u.shape)).max() == 0
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.misc.gnumpy_utils import gnumpy_available
# Skip the whole module when gnumpy is absent.
if not gnumpy_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("gnumpy not installed. Skip test related to it.")
from theano.misc.gnumpy_utils import (garray_to_cudandarray,
cudandarray_to_garray)
import gnumpy
def test(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from garray to
    CudaNdarray back to garray.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.ftensor3('U'))
    ii = theano.function([U], gpu(U + 1))

    A = gnumpy.rand(*shape)
    A_cnd = garray_to_cudandarray(A)
    assert A_cnd.shape == A.shape
    # dtype always float32
    # garray don't have strides
    B_cnd = ii(A_cnd)
    B = cudandarray_to_garray(B_cnd)
    # Check the round-tripped result's shape.  This previously
    # re-asserted `A_cnd.shape == A.shape` (a copy-paste of the line
    # above), so B's shape was never verified.
    assert B.shape == A.shape

    u = (A + 1).asarray()
    v = B.asarray()
    w = np.array(B_cnd)
    # The gnumpy result and the raw CudaNdarray result must both match
    # the host-side computation exactly.
    assert (u == v).all()
    assert (u == w).all()
def test2(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from CudaNdarray to
    garray back to CudaNdarray.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    sym = gpu(theano.tensor.ftensor3('U'))
    theano.function([sym], gpu(sym + 1))

    host = np.random.rand(*shape).astype('float32')
    cnd = theano.sandbox.cuda.CudaNdarray(host)
    gar = cudandarray_to_garray(cnd)
    assert cnd.shape == gar.shape
    # dtype is always float32 and garray carries no stride information.
    back = garray_to_cudandarray(gar)
    assert cnd.shape == back.shape
    # dtype always float32.  The round trip must preserve strides and
    # point at the very same device memory.
    assert cnd._strides == back._strides
    assert cnd.gpudata == back.gpudata
    v = np.asarray(back)
    assert (v == host).all()
def test_broadcast_dims():
    """
    Exercise shapes where some dimension is 1.
    CudaNdarray uses a stride of 0 for those dimensions.
    """
    shapes = [(1, 2, 3), (2, 1, 3), (2, 3, 1)]
    for shp in shapes:
        test(shp)
    for shp in shapes:
        test2(shp)
""" """
test the tensor and sparse type. The CudaNdarray type is tested in test the tensor and sparse type. (gpuarray is tested in the gpuarray folder).
sandbox/cuda/tests/test_tensor_op.py.test_may_share_memory_cuda
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy as np import numpy as np
...@@ -15,9 +14,7 @@ except ImportError: ...@@ -15,9 +14,7 @@ except ImportError:
from theano.misc.may_share_memory import may_share_memory from theano.misc.may_share_memory import may_share_memory
def test_may_share_memory(): def may_share_memory_core(a, b):
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
va = a.view() va = a.view()
vb = b.view() vb = b.view()
ra = a.reshape((4, 5)) ra = a.reshape((4, 5))
...@@ -51,6 +48,13 @@ def test_may_share_memory(): ...@@ -51,6 +48,13 @@ def test_may_share_memory():
except TypeError: except TypeError:
pass pass
def test_may_share_memory():
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
may_share_memory_core(a, b)
if scipy_imported: if scipy_imported:
def test_may_share_memory_scipy(): def test_may_share_memory_scipy():
a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3)) a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3))
......
...@@ -5,13 +5,9 @@ import unittest ...@@ -5,13 +5,9 @@ import unittest
from tempfile import mkdtemp from tempfile import mkdtemp
import numpy as np import numpy as np
from nose.plugins.skip import SkipTest
import theano import theano
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.var import CudaNdarraySharedVariable
from theano.sandbox.rng_mrg import MRG_RandomStreams from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.misc.pkl_utils import dump, load, StripPickler from theano.misc.pkl_utils import dump, load, StripPickler
...@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase): ...@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase):
if self.tmpdir is not None: if self.tmpdir is not None:
shutil.rmtree(self.tmpdir) shutil.rmtree(self.tmpdir)
def test_dump_load(self):
if not cuda_ndarray.cuda_enabled:
raise SkipTest('Optional package cuda disabled')
x = CudaNdarraySharedVariable('x', CudaNdarrayType((1, 1), name='x'),
[[1]], False)
with open('test', 'wb') as f:
dump(x, f)
with open('test', 'rb') as f:
x = load(f)
assert x.name == 'x'
np.testing.assert_allclose(x.get_value(), [[1]])
def test_dump_load_mrg(self): def test_dump_load_mrg(self):
rng = MRG_RandomStreams(use_cuda=cuda_ndarray.cuda_enabled) rng = MRG_RandomStreams()
with open('test', 'wb') as f: with open('test', 'wb') as f:
dump(rng, f) dump(rng, f)
......
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
import theano.misc.pycuda_init
# Skip this whole test module unless both PyCUDA and the Theano CUDA
# backend are usable.
if not theano.misc.pycuda_init.pycuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("Pycuda not installed. Skip test of theano op"
                   " with pycuda code.")
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional package cuda disabled')
import theano.tensor as T
from theano.misc.pycuda_example import (PycudaElemwiseSourceModuleOp,
PycudaElemwiseSourceModuleMakeThunkOp)
# Build the compilation modes used by the tests below.  FAST_COMPILE is
# upgraded to FAST_RUN first -- presumably because FAST_COMPILE skips
# the graph optimizations these tests rely on (verify if changing).
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode(
        'FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module():
    """Compare the pycuda elemwise ops against the stock GpuElemwise.

    For each shape/scalar-op pair, four equivalent functions are
    compiled (reference GPU, direct pycuda op, optimizer-rewritten,
    and make_thunk variant); their graphs must contain the expected op
    and their outputs must agree numerically.
    """
    def uses_op(fn, op_class):
        # True when the compiled graph contains a node of op_class.
        return any(isinstance(node.op, op_class)
                   for node in fn.maker.fgraph.toposort())

    for shape in [(5, 5), (10, 49), (50, 49), (500, 501)]:
        for scalar_op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
            x = T.fmatrix('x')
            y = T.fmatrix('y')
            elemwise_op = theano.tensor.Elemwise(scalar_op)
            pycuda_op = PycudaElemwiseSourceModuleOp(scalar_op)
            pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(scalar_op)

            f_ref = theano.function([x, y], elemwise_op(x, y),
                                    mode=mode_with_gpu)
            f_direct = theano.function(
                [x, y],
                theano.sandbox.cuda.host_from_gpu(pycuda_op(x, y)),
                mode=mode_with_gpu)
            mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise")
            f_opt = theano.function([x, y], elemwise_op(x, y),
                                    mode=mode_pycuda)
            f_thunk = theano.function(
                [x, y],
                theano.sandbox.cuda.host_from_gpu(pycuda_op_thunk(x, y)),
                mode=mode_with_gpu)

            assert uses_op(f_ref, theano.sandbox.cuda.GpuElemwise)
            assert uses_op(f_direct, PycudaElemwiseSourceModuleOp)
            assert uses_op(f_opt, PycudaElemwiseSourceModuleOp)
            assert uses_op(f_thunk, PycudaElemwiseSourceModuleMakeThunkOp)

            a = np.asarray(np.random.rand(*shape), dtype='float32')
            b = np.asarray(np.random.rand(*shape), dtype='float32')
            assert np.allclose(f_ref(a, b), f_direct(a, b))
            assert np.allclose(f_ref(a, b), f_opt(a, b))
            assert np.allclose(f_ref(a, b), f_thunk(a, b))
"""
#commented as it work only with old pycuda version.
def test_pycuda_elemwise_kernel():
x = T.fmatrix('x')
y = T.fmatrix('y')
f = theano.function([x, y], x + y, mode=mode_with_gpu)
print(f.maker.fgraph.toposort())
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print(f2.maker.fgraph.toposort())
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
for node in f.maker.fgraph.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.fgraph.toposort()])
val1 = np.asarray(np.random.rand(5, 5), dtype='float32')
val2 = np.asarray(np.random.rand(5, 5), dtype='float32')
#val1 = np.ones((5,5))
#val2 = np.arange(25).reshape(5,5)
assert (f(val1, val2) == f2(val1, val2)).all()
print(f(val1, val2))
print(f2(val1, val2))
x3 = T.ftensor3('x')
y3 = T.ftensor3('y')
z3 = T.ftensor3('y')
f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
print(f4.maker.fgraph.toposort())
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.fgraph.toposort()])
val1 = np.random.rand(2, 2, 2)
print(val1)
print(f4(val1, val1, val1))
assert np.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
"""
This file is an example of view the memory allocated by pycuda in a GpuArray
in a CudaNdarray to be able to use it in Theano.
This also serve as a test for the function: cuda_ndarray.from_gpu_pointer
"""
from __future__ import absolute_import, print_function, division
import sys
import numpy as np
import theano
import theano.sandbox.cuda as cuda_ndarray
import theano.misc.pycuda_init
# Skip this whole test module unless both PyCUDA and the Theano CUDA
# backend are usable.
if not theano.misc.pycuda_init.pycuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("Pycuda not installed."
                   " We skip tests of Theano Ops with pycuda code.")
if cuda_ndarray.cuda_available is False: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional theano package cuda disabled')
import pycuda
import pycuda.driver as drv
import pycuda.gpuarray
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    dest = np.zeros_like(a)
    # Launch exactly one thread per element.  The previous hard-coded
    # block=(400, 1, 1) made threads 100..399 read and write past the
    # end of the 100-element arrays (out-of-bounds device access).
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(len(a), 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    # One thread per element.  The previous hard-coded block=(400, 1, 1)
    # ran 400 threads over 100-element arrays, accessing device memory
    # out of bounds.
    multiply_them(dest, ga, gb,
                  block=(len(a), 1, 1), grid=(1, 1))
    assert (np.asarray(dest) == a * b).all()
def test_pycuda_memory_to_theano():
    # Wrap memory allocated by pycuda (a GPUArray) in a CudaNdarray view
    # via cuda_ndarray.from_gpu_pointer, then check refcounts, strides
    # and in-place updates.  Statement order matters: the asserts track
    # sys.getrefcount(y) after each view creation/deletion.
    y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    print(sys.getrefcount(y))
    # NOTE: y.get() increases the ref count with newer pycuda versions
    # (caching?), so it is left commented out.
    # print y.get()
    initial_refcount = sys.getrefcount(y)
    print("gpuarray ref count before creating a CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
    rand = np.random.randn(*y.shape).astype(np.float32)
    cuda_rand = cuda_ndarray.CudaNdarray(rand)
    # Expected C-contiguous strides (counted in elements) for y's shape.
    strides = [1]
    for i in y.shape[::-1][:-1]:
        strides.append(strides[-1] * i)
    strides = tuple(strides[::-1])
    print('strides', strides)
    assert cuda_rand._strides == strides, (cuda_rand._strides, strides)
    # in pycuda trunk, y.ptr also works, which is a little cleaner
    y_ptr = int(y.gpudata)
    z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
    # The CudaNdarray view must keep the gpuarray alive: one extra ref.
    print("gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount + 1
    assert (np.asarray(z) == 0).all()
    assert z.base is y
    # Test that we can take a view from this cuda view on pycuda memory;
    # each view adds one more reference to y.
    zz = z.view()
    assert sys.getrefcount(y) == initial_refcount + 2
    assert zz.base is y
    del zz
    assert sys.getrefcount(y) == initial_refcount + 1
    # In-place updates through the view must hit the pycuda memory.
    cuda_ones = cuda_ndarray.CudaNdarray(np.asarray([[[1]]],
                                                    dtype='float32'))
    z += cuda_ones
    assert (np.asarray(z) == np.ones(y.shape)).all()
    assert (np.asarray(z) == 1).all()
    assert cuda_rand.shape == z.shape
    assert cuda_rand._strides == z._strides, (cuda_rand._strides, z._strides)
    assert (np.asarray(cuda_rand) == rand).all()
    z += cuda_rand
    assert (np.asarray(z) == (rand + 1)).all()
    # Check that the ref count to the gpuarray is right.
    del z
    print("gpuarray ref count after deleting the CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
from __future__ import absolute_import, print_function, division
import numpy as np
import theano.sandbox.cuda as cuda
import theano.misc.pycuda_init
# Skip this whole test module unless both PyCUDA and the Theano CUDA
# backend are usable.
if not theano.misc.pycuda_init.pycuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda "
                   "code.")
if cuda.cuda_available is False: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional theano package cuda disabled')
from theano.misc.pycuda_utils import to_gpuarray, to_cudandarray
import pycuda.gpuarray
def test_to_gpuarray():
    """Check to_gpuarray for contiguous and strided CudaNdarrays."""
    cnd = cuda.CudaNdarray.zeros((5, 4))
    gar = to_gpuarray(cnd)
    assert isinstance(gar, pycuda.gpuarray.GPUArray)

    # Contiguous case: both objects alias the same device memory, so a
    # write through one must be visible through the other.
    cnd[0, 0] = np.asarray(1, dtype="float32")
    assert gar.gpudata == cnd.gpudata
    assert np.asarray(cnd[0, 0]) == 1
    assert np.allclose(np.asarray(cnd), gar.get())
    assert gar.dtype == cnd.dtype
    assert gar.shape == cnd.shape
    # CudaNdarray strides are in elements; GPUArray strides in bytes.
    assert all(np.asarray(cnd._strides) * 4 == gar.strides)

    # Strided case with copyif=True: a copy is made, so the two no
    # longer share memory and later writes diverge.
    cnd = cnd[::2, ::]
    gar = to_gpuarray(cnd, copyif=True)
    assert isinstance(gar, pycuda.gpuarray.GPUArray)
    cnd[0, 0] = np.asarray(2, dtype="float32")
    assert gar.gpudata != cnd.gpudata
    assert np.asarray(cnd[0, 0]) == 2
    assert not np.allclose(np.asarray(cnd), gar.get())
    assert gar.dtype == cnd.dtype
    assert gar.shape == cnd.shape
    assert not all(np.asarray(cnd._strides) * 4 == gar.strides)

    # Without copyif, a non-contiguous input must be rejected.
    try:
        gar = to_gpuarray(cnd)
        assert False
    except ValueError:
        pass
def test_to_cudandarray():
    """Check to_cudandarray's round trip and its input validation."""
    gar = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    cnd = to_cudandarray(gar)
    assert isinstance(cnd, cuda.CudaNdarray)
    # Same contents, dtype and shape; strides match once converted from
    # elements (CudaNdarray) to bytes (GPUArray, float32 => *4).
    assert np.allclose(gar.get(),
                       np.asarray(cnd))
    assert gar.dtype == cnd.dtype
    assert gar.shape == cnd.shape
    assert all(np.asarray(cnd._strides) * 4 == gar.strides)

    # float64 input must be rejected: CudaNdarray is float32-only.
    try:
        gar = pycuda.gpuarray.zeros((3, 4, 5), 'float64')
        to_cudandarray(gar)
        assert False
    except ValueError:
        pass

    # Anything that is not a GPUArray must be rejected too.
    try:
        to_cudandarray(np.zeros(4))
        assert False
    except ValueError:
        pass
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论