testgroup / pytensor · Commit a388d94d

Remove tentacles in sandbox

Authored Mar 28, 2017 by Arnaud Bergeron
Parent: 9dcf3f4c

Showing 4 changed files with 120 additions and 1029 deletions (+120 −1029)
multinomial.py       theano/sandbox/multinomial.py             +0    −232
rng_mrg.py           theano/sandbox/rng_mrg.py                 +9    −304
test_multinomial.py  theano/sandbox/tests/test_multinomial.py  +38   −109
test_rng_mrg.py      theano/sandbox/tests/test_rng_mrg.py      +73   −384
theano/sandbox/multinomial.py
@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.scalar import as_scalar
import copy
from theano.sandbox.cuda import cuda_available, GpuOp, register_opt
if cuda_available:
    from theano.sandbox.cuda import CudaNdarrayType
    from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host


class MultinomialFromUniform(Op):
    # TODO : need description for parameter 'odtype'
    """
@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform):
                    break


class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
    """
    The output is transposed compared to MultinomialFromUniform.
    We must insert a Transpose op after it.

    The optimization that moves it to the gpu does it.

    """

    def make_node(self, pvals, unis):
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        if not isinstance(pvals.type, CudaNdarrayType):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
        if self.odtype == 'auto':
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        if odtype != pvals.dtype:
            raise NotImplementedError(
                'GpuMultinomialFromUniform works only if '
                'self.odtype == pvals.dtype', odtype, pvals.dtype)
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = CudaNdarrayType(broadcastable=br)()
        return Apply(self, [pvals, unis], [out])

    def perform(self, node, ins, outs):
        # The perform from parent don't work with CudaNdarray.  We
        # don't need it as DebugMode will test again it as an
        # optimization insert the GPU op.
        return Op.perform(self, node, ins, outs)

    def c_code_cache_version(self):
        return (9,)

    def c_support_code_apply(self, node, nodename):
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            float * global_pvals,
            const int pvals_row_stride,
            const int pvals_col_stride,
            float * global_unis,
            const int unis_stride,
            float * global_outs,
            const int outs_row_stride,
            const int outs_col_stride
        )
        {
            // each thread takes care of one multinomial draw
            int n = blockDim.x*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                const float unis_n = global_unis[n*unis_stride];
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    float current_out = 0.;
                    if (!done)
                    {
                        cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
                        if (unis_n < cummul)
                        {
                            current_out = 1.;
                            done = true;
                        }
                    }
                    //write out transposed for speed.
                    global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
                }
            }
        }
        """ % locals()

    def c_code(self, node, name, ins, outs, sub):
        (pvals, unis) = ins
        (z,) = outs
        fail = sub['fail']
        return """
        if (CudaNdarray_NDIM(%(pvals)s) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(unis)s) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s;
        }

        //N.B. that the output is TRANSPOSED compared with pvals
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE
            int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
            int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
            //TODO : change this for a beautiful constant
            int max_nb_blocks = 2<<15 - 1;
            int nb_blocks = max_nb_blocks + 1;
            int nb_threads=16; // so it really starts at 32, because of the *2
            do
            {
                nb_threads*=2;
                if (nb_multi %% nb_threads == 0)
                    nb_blocks = nb_multi/nb_threads;
                else
                    nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
            } while (nb_blocks > max_nb_blocks);

            //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);

            // TODO : next line is a bit hardcoded...
            if (nb_threads > 512)
            {
                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
                %(fail)s;
            }
            dim3 n_blocks(nb_blocks,1,1);
            dim3 n_threads(nb_threads,1,1);
            int n_shared = 0;
            assert(nb_blocks*nb_threads >= nb_multi);

            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                CudaNdarray_HOST_DIMS(%(z)s)[1],
                CudaNdarray_HOST_DIMS(%(z)s)[0],
                CudaNdarray_DEV_DATA(%(pvals)s),
                CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
                CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
                CudaNdarray_DEV_DATA(%(unis)s),
                CudaNdarray_HOST_STRIDES(%(unis)s)[0],
                CudaNdarray_DEV_DATA(%(z)s),
                CudaNdarray_HOST_STRIDES(%(z)s)[0],
                CudaNdarray_HOST_STRIDES(%(z)s)[1]
            );
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                    "k_multi_warp_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z,
                    n_shared);
                %(fail)s;
            }
        } // END NESTED SCOPE
        """ % locals()


@register_opt()
@local_optimizer([MultinomialFromUniform])
def local_gpu_multinomial(node):
    # TODO : need description for function
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
                any([i.owner and
                     isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                     for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]


class MultinomialWOReplacementFromUniform(ChoiceFromUniform):
    def __init__(self, *args, **kwargs):
        warnings.warn("MultinomialWOReplacementFromUniform is deprecated, "
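For reference, the per-row draw that both the CPU op and the removed CUDA kernel above implement is a cumulative-sum threshold against one uniform sample per row. The sketch below is an illustration only (not part of this commit), written in plain NumPy and returning the non-transposed layout of the CPU op:

```python
import numpy as np

def multinomial_from_uniform(pvals, unis):
    # One-hot draw per row: mark the first outcome whose cumulative
    # probability exceeds that row's uniform sample, as the kernel does.
    out = np.zeros_like(pvals)
    for n, (row, u) in enumerate(zip(pvals, unis)):
        cummul = 0.0
        for m, p in enumerate(row):
            cummul += p
            if u < cummul:
                out[n, m] = 1.0
                break
    return out

# With pvals=[[.2, .8], [.3, .7]] and unis=[.31, .31] both rows select
# outcome 1, which is what the tests below check (times 2, via m * 2).
```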
theano/sandbox/rng_mrg.py
@@ -12,6 +12,7 @@ import numpy as np
from six import integer_types
from six.moves import xrange

import theano
from theano import Op, Apply, shared, config, Variable
from theano import gradient, function
from theano import tensor
@@ -22,17 +23,11 @@ from theano.compile import optdb
from theano.gof import local_optimizer

from . import multinomial

import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name, as_gpuarray_variable
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
                                 register_opt2)

if theano.sandbox.cuda.cuda_available:
    from theano.sandbox.cuda import (CudaNdarrayType,
                                     float32_shared_constructor)


def matVecModM(A, s, m):
@@ -562,264 +557,6 @@ class mrg_uniform(mrg_uniform_base):
        return (8, )


class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
    # GPU VERSION

    def make_node(self, rstate, size):
        # error checking slightly redundant here, since
        # this op should not be called directly.
        #
        # call through MRG_RandomStreams instead.
        broad = []
        for i in range(self.output_type.ndim):
            broad.append(tensor.extract_constant(size[i]) == 1)
        output_type = self.output_type.clone(broadcastable=broad)()
        rstate = as_cuda_ndarray_variable(rstate)
        return Apply(self,
                     [rstate, size],
                     [rstate.type(), output_type])

    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        v_size = as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(CudaNdarrayType((False,) * ndim))
        return op(rstate, v_size)

    def c_support_code_apply(self, node, nodename):
        if self.output_type.dtype == 'float32':
            otype = 'float'
            NORM = '4.6566126e-10f'  # np.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # np.float32(number * M1) < 1.0
        else:
            otype = 'double'
            NORM = '4.656612873077392578125e-10'
        return """
        // FB: I disable the printing of the warning, as we
        //receive too much email about this and this don't help
        //people. I'm not even sure if the "fix" to give the info about
        //the shape statically give a speed up. So I consider this
        //warning as useless until proved it can speed the user code.
        static int %(nodename)s_printed_warning = 1;

        static __global__ void %(nodename)s_mrg_uniform(
                %(otype)s*sample_data,
                npy_int32*state_data,
                const int Nsamples,
                const int Nstreams_used)
        {
            const npy_int32 i0 = 0;
            const npy_int32 i7 = 7;
            const npy_int32 i9 = 9;
            const npy_int32 i15 = 15;
            const npy_int32 i16 = 16;
            const npy_int32 i22 = 22;
            const npy_int32 i24 = 24;

            const npy_int32 M1 = 2147483647;      //2^31 - 1
            const npy_int32 M2 = 2147462579;      //2^31 - 21069
            const npy_int32 MASK12 = 511;         //2^9 - 1
            const npy_int32 MASK13 = 16777215;    //2^24 - 1
            const npy_int32 MASK2 = 65535;        //2^16 - 1
            const npy_int32 MULT2 = 21069;

            const unsigned int numThreads = blockDim.x * gridDim.x;
            const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
            npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;

            if (idx < Nstreams_used)
            {
                x11 = state_data[idx*6+0];
                x12 = state_data[idx*6+1];
                x13 = state_data[idx*6+2];
                x21 = state_data[idx*6+3];
                x22 = state_data[idx*6+4];
                x23 = state_data[idx*6+5];

                for (int i = idx; i < Nsamples; i += Nstreams_used)
                {
                    y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    y1 += x13;
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    x13 = x12;
                    x12 = x11;
                    x11 = y1;

                    y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
                    y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
                    y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += x23;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += y1;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    x23 = x22;
                    x22 = x21;
                    x21 = y2;

                    if (x11 <= x21) {
                        sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
                    }
                    else
                    {
                        sample_data[i] = (x11 - x21) * %(NORM)s;
                    }
                }
                state_data[idx*6+0]= x11;
                state_data[idx*6+1]= x12;
                state_data[idx*6+2]= x13;
                state_data[idx*6+3]= x21;
                state_data[idx*6+4]= x22;
                state_data[idx*6+5]= x23;
            }
        }
        """ % locals()

    def c_code(self, node, nodename, inp, out, sub):
        rstate, size = inp
        o_rstate, o_sample = out
        inplace = int(self.inplace)
        ndim = self.output_type.ndim
        o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']

        if self.output_type.dtype == 'float32':
            otype = 'float'
        else:
            otype = 'double'

        SYNC = "CNDA_THREAD_SYNC"

        return """
        //////// <code generated by mrg_uniform>

        npy_int64 M1 = 2147483647;      //2^31 - 1

        // The +1 is to avoid odims[0] which fails on windows
        npy_int64 odims[%(ndim)s+1];
        npy_int64 n_elements = 1;
        int n_streams, n_streams_used_in_this_call;

        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
                || !CudaNdarray_is_c_contiguous(%(o_sample)s)
                || (CudaNdarray_NDIM(%(o_sample)s) != %(ndim)s));

        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }

        for (int i = 0; i < %(ndim)s; ++i)
        {
            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (n_elements > M1)
        {
            PyErr_SetString(
                PyExc_ValueError,
                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
            %(fail)s
        }

        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!CudaNdarray_Check((PyObject*)%(rstate)s))
        {
            PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
            %(fail)s;
        }

        Py_XDECREF(%(o_rstate)s);
        if (%(inplace)s)
        {
            Py_INCREF(%(rstate)s);
            %(o_rstate)s = %(rstate)s;
        }
        else
        {
            %(o_rstate)s = (CudaNdarray*)CudaNdarray_Copy(%(rstate)s);
            if (!%(o_rstate)s) {
                PyErr_SetString(PyExc_RuntimeError, "GPU_mrg_uniform: "
                                "could not copy rstate");
                %(fail)s
            }
        }

        if (CudaNdarray_NDIM(%(o_rstate)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rstate must be vector");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(o_rstate)s)[0] %% 6)
        {
            PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
            %(fail)s;
        }
        n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
        n_streams_used_in_this_call = std::min(n_streams, (int)n_elements);

        {
            unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
            if (n_streams > (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS)
            {
                PyErr_Format(PyExc_ValueError, "On GPU, n_streams should be at most %%u",
                    (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS);
                %(fail)s;
            }
            if (threads_per_block * n_blocks < n_streams)
            {
                if (!%(nodename)s_printed_warning)
                    fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
                %(nodename)s_printed_warning = 1;
            }

            %(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
                CudaNdarray_DEV_DATA(%(o_sample)s),
                (npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
                n_elements, n_streams_used_in_this_call);
        }

        %(SYNC)s;

        {
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
                %(fail)s;
            }
        }

        //////// </ code generated by mrg_uniform>
        """ % locals()

    def c_code_cache_version(self):
        return (12,)


class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
    # GpuArray version
    _f16_ok = True
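The NORM constants above come with their own rationale in the source comments; the float32 case can be checked directly in NumPy. The snippet below is a quick illustration of that claim, not part of the diff:

```python
import numpy as np

M1 = 2 ** 31 - 1                         # modulus used by the generator, as in the kernel
NORM = np.float32(1.0 / (2 ** 31 + 65))  # the float32 constant written as 4.6566126e-10f above

# The comment's claim: scaling a state value (at most M1) by NORM keeps
# the float32 sample strictly below 1.0, so uniforms stay in [0, 1).
assert np.float32(NORM * M1) < np.float32(1.0)
```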
@@ -1131,7 +868,6 @@ def guess_n_streams(size, warn=False):

class MRG_RandomStreams(object):
    # TODO : need description for parameter 'use_cuda'
    """
    Module component with similar interface to numpy.random
    (numpy.random.RandomState).
@@ -1151,7 +887,7 @@ class MRG_RandomStreams(object):
        # TODO : need description for method and return
        return list(self.state_updates)

    def __init__(self, seed=12345, use_cuda=None):
    def __init__(self, seed=12345):
        # A list of pairs of the form (input_r, output_r), representing the
        # update rules of all the random states generated
        # by this RandomStreams.
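The user-facing effect of this hunk is that MRG_RandomStreams no longer accepts use_cuda. A hypothetical before/after sketch (illustration only; the argument values are taken from the tests in this commit):

```python
from theano.sandbox.rng_mrg import MRG_RandomStreams

# Before this commit (old sandbox CUDA backend):
#     rng = MRG_RandomStreams(seed=12345, use_cuda=True)
# After: the keyword is gone; device placement is left to the graph optimizations.
rng = MRG_RandomStreams(seed=12345)
u = rng.uniform(size=(10, 20), nstreams=30 * 256)  # nstreams given to avoid a warning, as in the tests
```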
@@ -1164,11 +900,6 @@ class MRG_RandomStreams(object):
        self.set_rstate(seed)

        if use_cuda is None:
            self.use_cuda = theano.sandbox.cuda.cuda_enabled
        else:
            self.use_cuda = use_cuda

    def set_rstate(self, seed):
        # TODO : need description for method, parameter
        if isinstance(seed, integer_types):
@@ -1271,15 +1002,6 @@ class MRG_RandomStreams(object):
        if inc_rstate:
            self.inc_rstate()

        if self.use_cuda and dtype == 'float32':
            rval = rval.flatten()
            # HACK - we use fact that int32 and float32 have same size to
            # sneak ints into the CudaNdarray type.
            # these *SHOULD NEVER BE USED AS FLOATS*
            tmp_float_buf = np.frombuffer(rval.data, dtype='float32')
            assert tmp_float_buf.shape == rval.shape
            assert (tmp_float_buf.view('int32') == rval).all()
            rval = tmp_float_buf

        return rval
@@ -1352,25 +1074,11 @@ class MRG_RandomStreams(object):
        nstreams = self.n_streams(size)
        rstates = self.get_substream_rstates(nstreams, dtype)

        if self.use_cuda and dtype == 'float32':
            node_rstate = float32_shared_constructor(rstates)
            assert isinstance(node_rstate.type, CudaNdarrayType)

            # we can't use the normal mrg_uniform constructor + later
            # optimization
            # because of the tmp_float_buf hack above.  There is
            # currently no Theano node that will do a frombuffer
            # reinterpretation.
            u = self.pretty_return(node_rstate,
                                   *GPU_mrg_uniform.new(node_rstate,
                                                        ndim, dtype, size),
                                   size=size, nstreams=orig_nstreams)
        else:
            node_rstate = shared(rstates)
            u = self.pretty_return(node_rstate,
                                   *mrg_uniform.new(node_rstate,
                                                    ndim, dtype, size),
                                   size=size, nstreams=orig_nstreams)
        node_rstate = shared(rstates)
        u = self.pretty_return(node_rstate,
                               *mrg_uniform.new(node_rstate, ndim, dtype, size),
                               size=size, nstreams=orig_nstreams)
        # Add a reference to distinguish from other shared variables
        node_rstate.tag.is_rng = True
        r = u * (high - low) + low
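The last line above rescales the unit-uniform draw to the requested interval; a one-line numeric illustration (not part of the diff):

```python
u = 0.25
low, high = -1.0, 1.0
r = u * (high - low) + low   # -0.5, a sample in [low, high), as in `r = u * (high - low) + low`
```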
@@ -1387,10 +1095,7 @@
                 nstreams=None):
        # TODO : need description for method, parameter and return
        if n == 1:
            if dtype == 'float32' and self.use_cuda:
                x = self.uniform(size=size, dtype=dtype, nstreams=nstreams)
            else:
                x = self.uniform(size=size, nstreams=nstreams)
            x = self.uniform(size=size, nstreams=nstreams)
            return cast(x < p, dtype)
        else:
            raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
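After the change, binomial sampling with n == 1 is simply a thresholded uniform draw (`cast(x < p, dtype)`). A minimal NumPy sketch of the same idea (illustration only, not Theano's symbolic code):

```python
import numpy as np

def binomial_n1(p, size, rng=None):
    # Draw uniforms and threshold at p, mirroring `cast(x < p, dtype)` above.
    rng = rng or np.random.RandomState(12345)
    x = rng.uniform(size=size)
    return (x < p).astype('float32')

# binomial_n1(0.7, (1000,)) returns a float32 vector of 0s and 1s with mean ~0.7.
```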
@@ -1630,7 +1335,7 @@ def local_gpua_mrg(node):
    return local_gpua_mrg_graph(node.op, context_name, node.inputs,
                                node.outputs)

MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
MRG_RNGs = (mrg_uniform, GPUA_mrg_uniform)


@local_optimizer(MRG_RNGs)
theano/sandbox/tests/test_multinomial.py
@@ -10,28 +10,11 @@ import theano
from theano import config, function, tensor
from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode
import theano.sandbox.cuda as cuda
import theano.tests.unittest_tools as utt
from theano.compat import PY3
from theano.misc.pkl_utils import CompatUnpickler


def get_mode(gpu):
    mode = get_default_mode()
    if theano.config.mode == 'FAST_COMPILE':
        mode = theano.compile.get_mode('FAST_RUN')
    if gpu:
        mode = mode.including('gpu', 'gpu_local_optimizations',
                              'local_cut_gpu_host_gpu',
                              'local_gpu_multinomial')
    return mode


def run_with_c(f, gpu=False):
    mode = get_mode(gpu)
    f(mode, gpu)


def test_n_samples_1():
    p = tensor.fmatrix()
    u = tensor.fvector()
@@ -117,69 +100,52 @@ def test_multinomial_0():
    m = multinomial.MultinomialFromUniform('auto')(p, u)

    def body(mode, gpu):
        # the m*2 allows the multinomial to reuse output
        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
    # the m*2 allows the multinomial to reuse output
    f = function([p, u], m * 2, allow_input_downcast=True)

        if gpu:
            assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                        for node in f.maker.fgraph.toposort()])

        # test that both first and second samples can be drawn
        utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
                            [[2, 0], [0, 2]])
    # test that both first and second samples can be drawn
    utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
                        [[2, 0], [0, 2]])

        # test that both second labels can be drawn
        r = f([[.2, .8], [.3, .7]], [.31, .31])
        utt.assert_allclose(r, [[0, 2], [0, 2]])
    # test that both second labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.31, .31])
    utt.assert_allclose(r, [[0, 2], [0, 2]])

        # test that both first labels can be drawn
        r = f([[.2, .8], [.3, .7]], [.21, .21])
        utt.assert_allclose(r, [[0, 2], [2, 0]])
    # test that both first labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.21, .21])
    utt.assert_allclose(r, [[0, 2], [2, 0]])

        # change the size to make sure output gets reallocated ok
        # and also make sure that the GPU version doesn't screw up the
        # transposed-ness
        r = f([[.2, .8]], [.25])
        utt.assert_allclose(r, [[0, 2]])
    run_with_c(body)
    if cuda.cuda_available:
        run_with_c(body, True)
    # change the size to make sure output gets reallocated ok
    # and also make sure that the GPU version doesn't screw up the
    # transposed-ness
    r = f([[.2, .8]], [.25])
    utt.assert_allclose(r, [[0, 2]])


# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
    # DEBUG_MODE will test this on GPU
    def body(mode, gpu):
        p = tensor.fmatrix()
        u = tensor.fvector()
        m = multinomial.MultinomialFromUniform('auto')(p, u)
        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
        if gpu:
            assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                        for node in f.maker.fgraph.toposort()])

        pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        mval = f(pval, uval)

        assert mval.shape == pval.shape
        if config.cast_policy == 'custom':
            assert mval.dtype == pval.dtype
        elif config.cast_policy == 'numpy+floatX':
            assert mval.dtype == config.floatX
        elif config.cast_policy == 'numpy':
            assert mval.dtype == 'float64'
        else:
            raise NotImplementedError(config.cast_policy)
        utt.assert_allclose(mval.sum(axis=1), 2)
        asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
        utt.assert_allclose(mval, asdf)  # broadcast over all rows
    run_with_c(body)
    if cuda.cuda_available:
        run_with_c(body, True)
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)

    pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    assert mval.shape == pval.shape
    if config.cast_policy == 'custom':
        assert mval.dtype == pval.dtype
    elif config.cast_policy == 'numpy+floatX':
        assert mval.dtype == config.floatX
    elif config.cast_policy == 'numpy':
        assert mval.dtype == 'float64'
    else:
        raise NotImplementedError(config.cast_policy)
    utt.assert_allclose(mval.sum(axis=1), 2)
    asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
    utt.assert_allclose(mval, asdf)  # broadcast over all rows


def test_multinomial_dtypes():
@@ -197,40 +163,3 @@ def test_multinomial_dtypes():
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('float64')(p, u)
    assert m.dtype == 'float64', m.dtype


def test_gpu_opt():
    if not cuda.cuda_available:
        # Skip test if cuda_ndarray is not available.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    assert m.dtype == 'float32', m.dtype

    m_gpu = cuda.gpu_from_host(m)

    f = function([p, u], m_gpu, allow_input_downcast=True, mode=get_mode(True))
    assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                for node in f.maker.fgraph.toposort()])
    pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)

    # Test with a row, it was failing in the past.
    r = tensor.frow()
    m = multinomial.MultinomialFromUniform('auto')(r, u)
    assert m.dtype == 'float32', m.dtype

    m_gpu = cuda.gpu_from_host(m)

    f = function([r, u], m_gpu, allow_input_downcast=True, mode=get_mode(True))
    assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                for node in f.maker.fgraph.toposort()])
    pval = np.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)
theano/sandbox/tests/test_rng_mrg.py
@@ -15,28 +15,15 @@ import theano
from theano import tensor, config
from theano.sandbox import rng_mrg
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.sandbox.cuda import cuda_available
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import attr

import theano.gpuarray.tests.config

if cuda_available:
    from theano.sandbox.cuda import float32_shared_constructor

# TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel}
# TODO: test MRG_RandomStreams
# Partly done in test_consistency_randomstreams
# TODO: test optimizer mrg_random_make_inplace
# TODO: make tests work when no flags gived.  Now need:
#       THEANO_FLAGS=device=gpu0,floatX=float32
# Partly done, in test_consistency_GPU_{serial,parallel}

mode = config.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')

utt.seed_rng()

# Results generated by Java code using L'Ecuyer et al.'s code, with:
@@ -53,61 +40,46 @@ def test_deterministic():
    seed = utt.fetch_seed()
    sample_size = (10, 20)

    test_use_cuda = [False]
    if cuda_available:
        test_use_cuda.append(True)

    for use_cuda in test_use_cuda:
        # print 'use_cuda =', use_cuda
        R = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
        u = R.uniform(size=sample_size)
        f = theano.function([], u)
    R = MRG_RandomStreams(seed=seed)
    u = R.uniform(size=sample_size)
    f = theano.function([], u)

        fsample1 = f()
        fsample2 = f()
        assert not np.allclose(fsample1, fsample2)
    fsample1 = f()
    fsample2 = f()
    assert not np.allclose(fsample1, fsample2)

        R2 = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
        u2 = R2.uniform(size=sample_size)
        g = theano.function([], u2)
        gsample1 = g()
        gsample2 = g()
        assert np.allclose(fsample1, gsample1)
        assert np.allclose(fsample2, gsample2)
    R2 = MRG_RandomStreams(seed=seed)
    u2 = R2.uniform(size=sample_size)
    g = theano.function([], u2)
    gsample1 = g()
    gsample2 = g()
    assert np.allclose(fsample1, gsample1)
    assert np.allclose(fsample2, gsample2)


def test_consistency_randomstreams():
    """
    Verify that the random numbers generated by MRG_RandomStreams
    are the same as the reference (Java) implementation by L'Ecuyer et al.

    """
    # Verify that the random numbers generated by MRG_RandomStreams
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    test_use_cuda = [False]
    if cuda_available:
        test_use_cuda.append(True)

    for use_cuda in test_use_cuda:
        # print 'use_cuda =', use_cuda
        samples = []
        rng = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
        for i in range(n_streams):
            stream_samples = []
            u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
            f = theano.function([], u)
            for j in range(n_samples):
                s = f()
                stream_samples.append(s)
            stream_samples = np.array(stream_samples)
            stream_samples = stream_samples.T.flatten()
            samples.append(stream_samples)
    samples = []
    rng = MRG_RandomStreams(seed=seed)
    for i in range(n_streams):
        stream_samples = []
        u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
        f = theano.function([], u)
        for j in range(n_samples):
            s = f()
            stream_samples.append(s)
        stream_samples = np.array(stream_samples)
        stream_samples = stream_samples.T.flatten()
        samples.append(stream_samples)

        samples = np.array(samples).flatten()
        assert(np.allclose(samples, java_samples))
    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))


def test_get_substream_rstates():
@@ -214,153 +186,6 @@ def test_consistency_cpu_parallel():
    assert(np.allclose(samples, java_samples))


def test_consistency_GPU_serial():
    """
    Verify that the random numbers generated by GPU_mrg_uniform, serially,
    are the same as the reference (Java) implementation by L'Ecuyer et al.

    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    if config.mode == 'FAST_COMPILE':
        mode = 'FAST_RUN'
    else:
        mode = config.mode

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            substream_rstate = np.array(stream_rstate.copy(), dtype='int32')
            # HACK - we transfer these int32 to the GPU memory as float32
            # (reinterpret_cast)
            tmp_float_buf = np.frombuffer(substream_rstate.data, dtype='float32')
            # Transfer to device
            rstate = float32_shared_constructor(tmp_float_buf)

            new_rstate, sample = rng_mrg.GPU_mrg_uniform.new(rstate, ndim=None,
                                                             dtype='float32',
                                                             size=(1,))
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], cpu_sample, mode=mode)

            for k in range(n_samples):
                s = f()
                samples.append(s)

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))


def test_consistency_GPU_parallel():
    """
    Verify that the random numbers generated by GPU_mrg_uniform, in
    parallel, are the same as the reference (Java) implementation by
    L'Ecuyer et al.

    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    if config.mode == 'FAST_COMPILE':
        mode = 'FAST_RUN'
    else:
        mode = config.mode

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_samples = []
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = np.asarray(rstate).flatten()
        # HACK - transfer these int32 to the GPU memory as float32
        # (reinterpret_cast)
        tmp_float_buf = np.frombuffer(rstate.data, dtype='float32')
        # Transfer to device
        rstate = float32_shared_constructor(tmp_float_buf)

        new_rstate, sample = rng_mrg.GPU_mrg_uniform.new(rstate, ndim=None,
                                                         dtype='float32',
                                                         size=(n_substreams,))
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStreams' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], cpu_sample, mode=mode)

        for k in range(n_samples):
            s = f()
            stream_samples.append(s)
        samples.append(np.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))


def test_GPU_nstreams_limit():
    """
    Verify that a ValueError is raised when n_streams
    is greater than 2**20 on GPU. This is the value of
    (NUM_VECTOR_OP_THREADS_PER_BLOCK * NUM_VECTOR_OP_BLOCKS).

    """
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')

    seed = 12345
    R = MRG_RandomStreams(seed=seed, use_cuda=True)

    def eval_uniform(size, nstreams):
        if theano.config.mode == "FAST_COMPILE":
            mode = "FAST_RUN"
        else:
            mode = copy.copy(theano.compile.get_default_mode())
            mode.check_py_code = False
        out = R.uniform(size=size, nstreams=nstreams, dtype='float32')
        f = theano.function([], out, mode=mode)
        return f()

    eval_uniform((10,), 2**20)
    assert_raises(ValueError, eval_uniform, (10,), 2**20 + 1)


def test_consistency_GPUA_serial():
    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
@@ -470,7 +295,7 @@ def test_GPUA_full_fill():
    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)

    R = MRG_RandomStreams(234, use_cuda=False)
    R = MRG_RandomStreams(234)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = theano.function([], uni)
@@ -568,7 +393,7 @@ def test_uniform():
        # print ''
        # print 'ON CPU with size=(%s):' % str(size)
        x = tensor.matrix()
        R = MRG_RandomStreams(234, use_cuda=False)
        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        # TODO Look for all occurrences of `guess_n_streams` and `30 * 256`
        # for such situations: it would be better to instead filter the
@@ -592,31 +417,6 @@ def test_uniform():
            steps_ = steps
        basictest(f, steps_, const_size, prefix='mrg cpu', inputs=input)

        if mode != 'FAST_COMPILE' and cuda_available:
            # print ''
            # print 'ON GPU with size=(%s):' % str(size)
            R = MRG_RandomStreams(234, use_cuda=True)
            u = R.uniform(size=size, dtype='float32',
                          nstreams=rng_mrg.guess_n_streams(size, warn=False))
            # well, it's really that this test w GPU doesn't make sense otw
            assert u.dtype == 'float32'
            f = theano.function(
                var_input,
                theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u),
                           borrow=True),
                mode=mode_with_gpu)
            assert any([isinstance(node.op, theano.sandbox.rng_mrg.GPU_mrg_uniform)
                        for node in f.maker.fgraph.toposort()])
            # theano.printing.debugprint(f)
            gpu_out = np.asarray(f(*input))
            # print 'GPU: random?[:10], random?[-10:]'
            # print gpu_out[0, 0:10]
            # print gpu_out[-1, -10:]
            basictest(f, steps_, const_size, prefix='mrg gpu', inputs=input)
            np.testing.assert_array_almost_equal(cpu_out, gpu_out, decimal=6)

        # print ''
        # print 'ON CPU w Numpy with size=(%s):' % str(size)
        RR = theano.tensor.shared_randomstreams.RandomStreams(234)
@@ -629,7 +429,7 @@
def test_broadcastable():
    R = MRG_RandomStreams(234, use_cuda=False)
    R = MRG_RandomStreams(234)
    x = tensor.matrix()
    size1 = (10, 1)
    size2 = (x.shape[0], 1)
@@ -695,7 +495,7 @@ def test_binomial():
def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
    R = MRG_RandomStreams(234, use_cuda=False)
    R = MRG_RandomStreams(234)
    u = R.binomial(size=size, p=mean)
    f = theano.function(var_input, u, mode=mode)
    out = f(*input)
@@ -709,22 +509,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
              inputs=input, allow_01=True, target_avg=mean, mean_rtol=rtol)

    if mode != 'FAST_COMPILE' and cuda_available:
        R = MRG_RandomStreams(234, use_cuda=True)
        u = R.binomial(size=size, p=mean, dtype='float32')
        # well, it's really that this test w GPU doesn't make sense otw
        assert u.dtype == 'float32'
        f = theano.function(
            var_input,
            theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u),
                       borrow=True),
            mode=mode_with_gpu)
        gpu_out = np.asarray(f(*input))

        basictest(f, steps_, const_size, prefix='mrg gpu', inputs=input,
                  allow_01=True, target_avg=mean, mean_rtol=rtol)
        np.testing.assert_array_almost_equal(out, gpu_out, decimal=6)

    RR = theano.tensor.shared_randomstreams.RandomStreams(234)
    uu = RR.binomial(size=size, p=mean)
@@ -778,7 +562,7 @@ def test_normal0():
        # print ''
        # print 'ON CPU:'

        R = MRG_RandomStreams(234, use_cuda=False)
        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size, avg=avg, std=std,
                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
@@ -798,31 +582,6 @@ def test_normal0():
        sys.stdout.flush()

        if mode != 'FAST_COMPILE' and cuda_available:
            # print ''
            # print 'ON GPU:'
            R = MRG_RandomStreams(234, use_cuda=True)
            n = R.normal(size=size, avg=avg, std=std, dtype='float32',
                         nstreams=rng_mrg.guess_n_streams(size, warn=False))
            # well, it's really that this test w GPU doesn't make sense otw
            assert n.dtype == 'float32'
            f = theano.function(
                var_input,
                theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                           borrow=True),
                mode=mode_with_gpu)

            # theano.printing.debugprint(f)
            sys.stdout.flush()
            gpu_out = np.asarray(f(*input))
            # print 'random?[:10]\n', gpu_out[0, 0:10]
            # print '----'
            sys.stdout.flush()
            basictest(f, steps_, const_size, target_avg=avg, target_std=std,
                      prefix='gpu mrg ', allow_01=True, inputs=input,
                      mean_rtol=rtol, std_tol=std_tol)
            # Need to allow some rounding error as their is float
            # computation that are done on the gpu vs cpu
            assert np.allclose(out, gpu_out, rtol=5e-6, atol=5e-6)

        # print ''
        # print 'ON CPU w NUMPY:'
        RR = theano.tensor.shared_randomstreams.RandomStreams(234)
@@ -877,7 +636,7 @@ def test_multinomial():
    pvals = np.asarray(np.random.uniform(size=sample_size))
    pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)

    R = MRG_RandomStreams(234, use_cuda=False)
    R = MRG_RandomStreams(234)
    # Note: we specify `nstreams` to avoid a warning.
    m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
    f = theano.function([], m, mode=mode_)
@@ -886,29 +645,6 @@ def test_multinomial():
    basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
                          prefix='mrg ')
    sys.stdout.flush()

    if mode != 'FAST_COMPILE' and cuda_available:
        # print ''
        # print 'ON GPU:'
        R = MRG_RandomStreams(234, use_cuda=True)
        pvals = np.asarray(pvals, dtype='float32')
        # We give the number of streams to avoid a warning.
        n = R.multinomial(pvals=pvals, dtype='float32', nstreams=30 * 256)
        # well, it's really that this test w GPU doesn't make sense otw
        assert n.dtype == 'float32'
        f = theano.function(
            [],
            theano.sandbox.cuda.basic_ops.gpu_from_host(n),
            mode=mode_.including('gpu'))
        # theano.printing.debugprint(f)
        gpu_out = f()
        sys.stdout.flush()
        basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
                              prefix='gpu mrg ')
        np.testing.assert_array_almost_equal(out, gpu_out, decimal=6)


def test_multinomial_n_samples():
    mode_ = mode
@@ -924,7 +660,7 @@ def test_multinomial_n_samples():
    pvals = np.asarray(np.random.uniform(size=sample_size))
    pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)

    R = MRG_RandomStreams(234, use_cuda=False)
    R = MRG_RandomStreams(234)
    for n_samples, steps in zip([5, 10, 100, 1000], [20, 10, 1, 1]):
        m = R.multinomial(pvals=pvals, n=n_samples,
@@ -934,26 +670,11 @@
                              n_samples, prefix='mrg ')
        sys.stdout.flush()

        if mode != 'FAST_COMPILE' and cuda_available:
            R = MRG_RandomStreams(234, use_cuda=True)
            pvals = np.asarray(pvals, dtype='float32')
            n = R.multinomial(pvals=pvals, n=n_samples, dtype='float32',
                              nstreams=30 * 256)
            assert n.dtype == 'float32'
            f = theano.function(
                [],
                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                mode=mode_.including('gpu'))
            sys.stdout.flush()
            basic_multinomialtest(f, steps, sample_size, pvals,
                                  n_samples, prefix='gpu mrg ')


class T_MRG(unittest.TestCase):
    def test_bad_size(self):
        R = MRG_RandomStreams(234, use_cuda=False)
        R = MRG_RandomStreams(234)
        for size in [(0, 100),
@@ -1055,54 +776,43 @@ def test_multMatVect():

def test_seed_fn():
    test_use_cuda = [False]
    if cuda_available:
        test_use_cuda.append(True)
    idx = tensor.ivector()

    for use_cuda in test_use_cuda:
        if config.mode == 'FAST_COMPILE' and use_cuda:
            mode = 'FAST_RUN'
        else:
            mode = config.mode
        for new_seed, same in [(234, True), (None, True), (23, False)]:
            random = MRG_RandomStreams(234, use_cuda=use_cuda)
            fn1 = theano.function([], random.uniform((2, 2), dtype='float32'),
                                  mode=mode)
            fn2 = theano.function([], random.uniform((3, 3), nstreams=2,
                                                     dtype='float32'),
                                  mode=mode)
            fn3 = theano.function([idx], random.uniform(idx, nstreams=3, ndim=1,
                                                        dtype='float32'),
                                  mode=mode)

            fn1_val0 = fn1()
            fn1_val1 = fn1()
            assert not np.allclose(fn1_val0, fn1_val1)
            fn2_val0 = fn2()
            fn2_val1 = fn2()
            assert not np.allclose(fn2_val0, fn2_val1)
            fn3_val0 = fn3([4])
            fn3_val1 = fn3([4])
            assert not np.allclose(fn3_val0, fn3_val1)
            assert fn1_val0.size == 4
            assert fn2_val0.size == 9

            random.seed(new_seed)

            fn1_val2 = fn1()
            fn1_val3 = fn1()
            fn2_val2 = fn2()
            fn2_val3 = fn2()
            fn3_val2 = fn3([4])
            fn3_val3 = fn3([4])
            assert np.allclose(fn1_val0, fn1_val2) == same
            assert np.allclose(fn1_val1, fn1_val3) == same
            assert np.allclose(fn2_val0, fn2_val2) == same
            assert np.allclose(fn2_val1, fn2_val3) == same
            assert np.allclose(fn3_val0, fn3_val2) == same
            assert np.allclose(fn3_val1, fn3_val3) == same
    for new_seed, same in [(234, True), (None, True), (23, False)]:
        random = MRG_RandomStreams(234)
        fn1 = theano.function([], random.uniform((2, 2), dtype='float32'))
        fn2 = theano.function([], random.uniform((3, 3), nstreams=2,
                                                 dtype='float32'))
        fn3 = theano.function([idx], random.uniform(idx, nstreams=3, ndim=1,
                                                    dtype='float32'))

        fn1_val0 = fn1()
        fn1_val1 = fn1()
        assert not np.allclose(fn1_val0, fn1_val1)
        fn2_val0 = fn2()
        fn2_val1 = fn2()
        assert not np.allclose(fn2_val0, fn2_val1)
        fn3_val0 = fn3([4])
        fn3_val1 = fn3([4])
        assert not np.allclose(fn3_val0, fn3_val1)
        assert fn1_val0.size == 4
        assert fn2_val0.size == 9

        random.seed(new_seed)

        fn1_val2 = fn1()
        fn1_val3 = fn1()
        fn2_val2 = fn2()
        fn2_val3 = fn2()
        fn3_val2 = fn3([4])
        fn3_val3 = fn3([4])
        assert np.allclose(fn1_val0, fn1_val2) == same
        assert np.allclose(fn1_val1, fn1_val3) == same
        assert np.allclose(fn2_val0, fn2_val2) == same
        assert np.allclose(fn2_val1, fn2_val3) == same
        assert np.allclose(fn3_val0, fn3_val2) == same
        assert np.allclose(fn3_val1, fn3_val3) == same


def rng_mrg_overflow(sizes, fct, mode, should_raise_error):
@@ -1132,28 +842,7 @@ def test_overflow_cpu():
    rng_mrg_overflow(sizes, fct, config.mode, should_raise_error=False)


def test_overflow_gpu_old_backend():
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=gpu1,device=cpu
    if not cuda_available:
        raise SkipTest('Optional package cuda not available')
    mode = mode_with_gpu
    seed = 12345
    rng = MRG_RandomStreams(seed=seed, use_cuda=True)
    fct = rng.uniform
    # should raise error as the size overflows
    sizes = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=True)
    # should not raise error
    sizes = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
    # should support int32 sizes
    sizes = [(np.int32(2**10), ),
             (np.int32(2), np.int32(2**10), np.int32(2**10))]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)


def test_overflow_gpu_new_backend():
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
    from theano.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor
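The "overflow" sizes in the removed test line up with the (2**31 - 1) sample limit enforced in the old GPU c_code; a quick check of that arithmetic (illustration only):

```python
# Element counts of the sizes expected to raise, versus the documented limit.
LIMIT = 2 ** 31 - 1
assert 2 ** 31 > LIMIT                            # (2**31,)
assert 2 ** 15 * 2 ** 16 == 2 ** 31 > LIMIT       # (2**15, 2**16)
assert 2 * 2 ** 15 * 2 ** 15 == 2 ** 31 > LIMIT   # (2, 2**15, 2**15)
assert 2 ** 5 * 2 ** 5 * 2 ** 5 <= LIMIT          # small sizes should not raise
```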