Move the gpuarray ops and tests under the gpuarray directory.

3c4ac6d0 · Arnaud Bergeron · ae9139ac · 3c4ac6d0 · 3c4ac6d0 · 3c4ac6d0
--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
+"""
+GPU implementation of MRG31k3p random number generator for Theano.
+
+Generator code in SSJ package (L'Ecuyer & Simard).
+http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
+
+"""
+from __future__ import absolute_import, print_function, division
+
+import numpy
+
+from theano.gof import local_optimizer
+from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform
+from theano.tensor import as_tensor_variable, get_vector_length
+
+from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
+from theano.gpuarray.type import GpuArrayType
+from theano.gpuarray.fp16_help import write_w
+from theano.gpuarray.opt import (register_opt as register_gpua,
+                                 register_opt2,
+                                 host_from_gpu as host_from_gpua)
+
+
+class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
+    # GpuArray version
+    _f16_ok = True
+
+    def get_params(self, node):
+        return node.inputs[0].type.context
+
+    @classmethod
+    def new(cls, rstate, ndim, dtype, size):
+        v_size = as_tensor_variable(size)
+        if ndim is None:
+            ndim = get_vector_length(v_size)
+        op = cls(GpuArrayType(dtype, (False,) * ndim))
+        return op(rstate, v_size)
+
+    def c_headers(self):
+        return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
+
+    def gpu_kernels(self, node, name):
+        write = write_w(self.output_type.dtype)
+        if self.output_type.dtype == 'float16':
+            otype = 'ga_half'
+            # limit the values of the state that we use.
+            mask = '& 0x7fff'
+            NORM = '3.0518e-05f'  # numpy.float16(1.0/(2**15+8))
+            # this was determined by finding the biggest number such that
+            # numpy.float16(number * (M1 & 0x7fff)) < 1.0
+        elif self.output_type.dtype == 'float32':
+            otype = 'float'
+            mask = ''
+            NORM = '4.6566126e-10f'  # numpy.float32(1.0/(2**31+65))
+            # this was determined by finding the biggest number such that
+            # numpy.float32(number * M1) < 1.0
+        elif self.output_type.dtype == 'float64':
+            otype = 'double'
+            mask = ''
+            NORM = '4.656612873077392578125e-10'
+        else:
+            raise ValueError('Unsupported data type for output',
+                             self.output_type.dtype)
+        code = """
+        KERNEL void mrg_uniform(
+                GLOBAL_MEM %(otype)s *sample_data,
+                GLOBAL_MEM ga_int *state_data,
+                const ga_uint Nsamples,
+                const ga_uint Nstreams_used)
+        {
+            /*
+             * The cluda backend makes sure that ga_int corresponds to
+             * a 32 bit signed type on the target device.  It is not a
+             * variable width type.
+             */
+            const ga_int i7 = 7;
+            const ga_int i9 = 9;
+            const ga_int i15 = 15;
+            const ga_int i16 = 16;
+            const ga_int i22 = 22;
+            const ga_int i24 = 24;
+
+            const ga_int M1 = 2147483647;      //2^31 - 1
+            const ga_int M2 = 2147462579;      //2^31 - 21069
+            const ga_int MASK12 = 511;       //2^9 - 1
+            const ga_int MASK13 = 16777215;  //2^24 - 1
+            const ga_int MASK2 = 65535;      //2^16 - 1
+            const ga_int MULT2 = 21069;
+
+            const ga_uint idx = GID_0 * LDIM_0 + LID_0;
+            ga_int y1, y2, x11, x12, x13, x21, x22, x23;
+
+            if (idx < Nstreams_used)
+            {
+            x11 = state_data[idx*6+0];
+            x12 = state_data[idx*6+1];
+            x13 = state_data[idx*6+2];
+            x21 = state_data[idx*6+3];
+            x22 = state_data[idx*6+4];
+            x23 = state_data[idx*6+5];
+
+            for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
+            {
+                y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
+                y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
+                y1 += x13;
+                y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
+                x13 = x12;
+                x12 = x11;
+                x11 = y1;
+
+                y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
+                y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
+                y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
+                y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
+                y2 += x23;
+                y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
+                y2 += y1;
+                y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
+
+                x23 = x22;
+                x22 = x21;
+                x21 = y2;
+
+                if (x11 <= x21) {
+                    sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
+                }
+                else
+                {
+                    sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
+                }
+            }
+
+            state_data[idx*6+0]= x11;
+            state_data[idx*6+1]= x12;
+            state_data[idx*6+2]= x13;
+            state_data[idx*6+3]= x21;
+            state_data[idx*6+4]= x22;
+            state_data[idx*6+5]= x23;
+            }
+        }
+
+        """ % locals()
+
+        # we shouldn't get to this line if it's about to fail
+        from pygpu import gpuarray
+
+        return [Kernel(code=code, name="mrg_uniform",
+                       params=[gpuarray.GpuArray, gpuarray.GpuArray,
+                               'uint32', 'uint32'],
+                       flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
+                ]
+
+    def c_code(self, node, nodename, inp, out, sub):
+        rstate, size = inp
+        o_rstate, o_sample = out
+        inplace = int(self.inplace)
+        ndim = self.output_type.ndim
+        o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
+        fail = sub['fail']
+        ctx = sub['params']
+        kname = self.gpu_kernels(node, nodename)[0].objvar
+        otypecode = str(self.output_type.typecode)
+
+        return """
+        npy_int64 M1 = 2147483647;      //2^31 - 1
+        // The +1 is to avoid odims[0] which fails on windows
+        size_t odims[%(ndim)s+1];
+        size_t n_elements = 1;
+        unsigned int n_streams;
+        int must_alloc_sample = ((NULL == %(o_sample)s)
+                || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
+                || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
+                || (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s));
+
+        if (PyArray_NDIM(%(size)s) != 1)
+        {
+            PyErr_SetString(PyExc_ValueError, "size must be vector");
+            %(fail)s
+        }
+        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
+        {
+            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
+                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
+            %(fail)s
+        }
+
+        for (int i = 0; i < %(ndim)s; ++i)
+        {
+            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
+            n_elements *= odims[i];
+            must_alloc_sample = (must_alloc_sample
+                    || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
+        }
+
+        if (n_elements > M1)
+        {
+            PyErr_SetString(
+                PyExc_ValueError,
+                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
+            %(fail)s
+        }
+        if (must_alloc_sample)
+        {
+            Py_XDECREF(%(o_sample)s);
+            %(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
+                                       %(ctx)s, Py_None);
+            if(!%(o_sample)s)
+            {
+                %(fail)s;
+            }
+        }
+        if (!pygpu_GpuArray_Check((PyObject*)%(rstate)s))
+        {
+            PyErr_Format(PyExc_ValueError, "rstate must be gpuarray");
+            %(fail)s;
+        }
+
+        Py_XDECREF(%(o_rstate)s);
+        if (%(inplace)s)
+        {
+            Py_INCREF(%(rstate)s);
+            %(o_rstate)s = %(rstate)s;
+        }
+        else
+        {
+            %(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
+            if (!%(o_rstate)s) {
+                %(fail)s
+            }
+        }
+
+        if (PyGpuArray_NDIM(%(o_rstate)s) != 2)
+        {
+            PyErr_SetString(PyExc_ValueError, "rstate must be a matrix");
+            %(fail)s
+        }
+        if (PyGpuArray_DIMS(%(o_rstate)s)[1] != 6)
+        {
+            PyErr_Format(PyExc_ValueError, "rstate must have 6 columns");
+            %(fail)s
+        }
+        if (%(o_rstate)s->ga.typecode != GA_INT) {
+            PyErr_Format(PyExc_ValueError, "rstate must be int32");
+            %(fail)s
+        }
+        if (!GpuArray_CHKFLAGS(&%(o_rstate)s->ga, GA_C_CONTIGUOUS)) {
+            PyErr_Format(PyExc_ValueError, "rstate must be C contiguous");
+            %(fail)s
+        }
+        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0];
+        if (n_streams > n_elements)
+          n_streams = n_elements;
+
+        {
+          size_t ls = 0, gs = 0;
+          int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
+          if (err != GA_NO_ERROR) {
+              PyErr_Format(PyExc_RuntimeError, "GpuKernel_sched: %%s\\n",
+                           GpuKernel_error(&%(kname)s, err));
+              %(fail)s
+          }
+          // Make sure we run as many blocks as we need to cover the whole n_streams
+          gs = (n_streams + ls - 1)/ls;
+          err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_rstate)s->ga.data, n_elements, n_streams);
+          if (err != GA_NO_ERROR) {
+              PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
+                           GpuKernel_error(&%(kname)s, err));
+              %(fail)s
+          }
+        }
+        """ % locals()
+
+    def c_code_cache_version(self):
+        return (12,)
+
+
+@register_opt2([mrg_uniform], 'fast_compile')
+def local_gpua_mrg_graph(op, context_name, inputs, outputs):
+    if (type(op) == mrg_uniform and
+            isinstance(inputs[0].type, GpuArrayType)):
+        outs = GPUA_mrg_uniform.new(inputs[0],
+                                    op.output_type.ndim,
+                                    op.output_type.dtype,
+                                    inputs[1])
+        return [outs[0], host_from_gpua(outs[1])]
+
+
+@register_gpua('fast_compile')
+@local_optimizer([mrg_uniform])
+def local_gpua_mrg(node):
+    context_name = infer_context_name(*node.inputs)
+    return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
--- a/theano/gpuarray/tests/test_others.py
+++ b/theano/gpuarray/tests/test_others.py
+from __future__ import absolute_import, print_function, division
 from .config import test_ctx_name, mode_with_gpu

 from ..type import (get_context, GpuArrayType, GpuArraySharedVariable,
@@ -11,6 +12,7 @@ from theano.misc.pkl_utils import dump, load

 from theano.tensor.tests.test_opt import test_fusion as t_fusion

+
 class test_fusion(t_fusion):
    mode = mode_with_gpu
    shared = gpuarray_shared_constructor

--- a/theano/gpuarray/tests/test_rng_mrg.py
+++ b/theano/gpuarray/tests/test_rng_mrg.py
+from __future__ import absolute_import, print_function, division
+import functools
+
+import numpy
+
+import theano
+from theano import tensor
+from theano.sandbox import rng_mrg
+from theano.sandbox.rng_mrg import MRG_RandomStreams
+from theano.sandbox.tests.test_rng_mrg import java_samples, rng_mrg_overflow
+from theano.tests import unittest_tools as utt
+
+from theano.gpuarray.tests.config import mode_with_gpu as mode
+from theano.gpuarray.type import gpuarray_shared_constructor
+
+utt.seed_rng()
+
+
+def test_consistency_GPUA_serial():
+    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
+    # are the same as the reference (Java) implementation by L'Ecuyer et al.
+
+    seed = 12345
+    n_samples = 5
+    n_streams = 12
+    n_substreams = 7
+
+    samples = []
+    curr_rstate = numpy.array([seed] * 6, dtype='int32')
+
+    for i in range(n_streams):
+        stream_rstate = curr_rstate.copy()
+        for j in range(n_substreams):
+            substream_rstate = numpy.array([stream_rstate.copy()],
+                                           dtype='int32')
+            # Transfer to device
+            rstate = gpuarray_shared_constructor(substream_rstate)
+
+            new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate,
+                                                              ndim=None,
+                                                              dtype='float32',
+                                                              size=(1,))
+            rstate.default_update = new_rstate
+
+            # Not really necessary, just mimicking
+            # rng_mrg.MRG_RandomStreams' behavior
+            sample.rstate = rstate
+            sample.update = (rstate, new_rstate)
+
+            # We need the sample back in the main memory
+            cpu_sample = tensor.as_tensor_variable(sample)
+            f = theano.function([], cpu_sample, mode=mode)
+            for k in range(n_samples):
+                s = f()
+                samples.append(s)
+
+            # next substream
+            stream_rstate = rng_mrg.ff_2p72(stream_rstate)
+
+        # next stream
+        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
+
+    samples = numpy.array(samples).flatten()
+    assert(numpy.allclose(samples, java_samples))
+
+
+def test_consistency_GPUA_parallel():
+    # Verify that the random numbers generated by GPUA_mrg_uniform, in
+    # parallel, are the same as the reference (Java) implementation by
+    # L'Ecuyer et al.
+    seed = 12345
+    n_samples = 5
+    n_streams = 12
+    n_substreams = 7  # 7 samples will be drawn in parallel
+
+    samples = []
+    curr_rstate = numpy.array([seed] * 6, dtype='int32')
+
+    for i in range(n_streams):
+        stream_samples = []
+        rstate = [curr_rstate.copy()]
+        for j in range(1, n_substreams):
+            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
+        rstate = numpy.asarray(rstate)
+        rstate = gpuarray_shared_constructor(rstate)
+
+        new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate, ndim=None,
+                                                          dtype='float32',
+                                                          size=(n_substreams,))
+        rstate.default_update = new_rstate
+
+        # Not really necessary, just mimicking
+        # rng_mrg.MRG_RandomStreams' behavior
+        sample.rstate = rstate
+        sample.update = (rstate, new_rstate)
+
+        # We need the sample back in the main memory
+        cpu_sample = tensor.as_tensor_variable(sample)
+        f = theano.function([], cpu_sample, mode=mode)
+
+        for k in range(n_samples):
+            s = f()
+            stream_samples.append(s)
+
+        samples.append(numpy.array(stream_samples).T.flatten())
+
+        # next stream
+        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
+
+    samples = numpy.array(samples).flatten()
+    assert(numpy.allclose(samples, java_samples))
+
+
+def test_GPUA_full_fill():
+    # Make sure the whole sample buffer is filled.  Also make sure
+    # large samples are consistent with CPU results.
+    import theano.gpuarray.tests.config
+    from theano.gpuarray.type import gpuarray_shared_constructor
+
+    # This needs to be large to trigger the problem on GPU
+    size = (10, 1000)
+
+    R = MRG_RandomStreams(234)
+    uni = R.uniform(size, nstreams=60 * 256)
+    f_cpu = theano.function([], uni)
+
+    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
+    new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate_gpu, ndim=None,
+                                                      dtype='float32',
+                                                      size=size)
+    rstate_gpu.default_update = new_rstate
+    f_gpu = theano.function([], sample)
+
+    utt.assert_allclose(f_cpu(), f_gpu())
+
+
+def test_overflow_gpu_new_backend():
+    from theano.gpuarray.tests.test_basic_ops import \
+        mode_with_gpu as mode
+    from theano.gpuarray.type import gpuarray_shared_constructor
+    seed = 12345
+    n_substreams = 7
+    curr_rstate = numpy.array([seed] * 6, dtype='int32')
+    rstate = [curr_rstate.copy()]
+    for j in range(1, n_substreams):
+        rstate.append(rng_mrg.ff_2p72(rstate[-1]))
+    rstate = numpy.asarray(rstate)
+    rstate = gpuarray_shared_constructor(rstate)
+    fct = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
+                            ndim=None, dtype='float32')
+    # should raise error as the size overflows
+    sizes = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
+    rng_mrg_overflow(sizes, fct, mode, should_raise_error=True)
+    # should not raise error
+    sizes = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
+    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
+    # should support int32 sizes
+    sizes = [(numpy.int32(2**10), ),
+             (numpy.int32(2), numpy.int32(2**10), numpy.int32(2**10))]
+    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
+
+
+def test_validate_input_types_gpuarray_backend():
+    from theano.sandbox.rng_mrg import mrg_uniform
+    from theano.gpuarray.type import gpuarray_shared_constructor
+    from theano.configparser import change_flags
+
+    with change_flags(compute_test_value="raise"):
+        rstate = numpy.zeros((7, 6), dtype="int32")
+        rstate = gpuarray_shared_constructor(rstate)
+        mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3,))
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -23,12 +23,6 @@ from theano.compile import optdb
 from theano.gof import local_optimizer
 from . import multinomial

-from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name, as_gpuarray_variable
-from theano.gpuarray.type import GpuArrayType
-from theano.gpuarray.fp16_help import write_w
-from theano.gpuarray.opt import (register_opt as register_gpua,
-                                 register_opt2)
-

 def matVecModM(A, s, m):
    # TODO : need description for method, parameter and return
@@ -557,274 +551,6 @@ class mrg_uniform(mrg_uniform_base):
        return (8, )


-class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
-    # GpuArray version
-    _f16_ok = True
-
-    def make_node(self, rstate, size):
-        # error checking slightly redundant here, since
-        # this op should not be called directly.
-        #
-        # call through MRG_RandomStreams instead.
-        broad = []
-        for i in range(self.output_type.ndim):
-                broad.append(tensor.extract_constant(size[i]) == 1)
-        output_type = self.output_type.clone(broadcastable=broad)()
-        rstate = as_gpuarray_variable(rstate, infer_context_name(rstate))
-        return Apply(self,
-                     [rstate, size],
-                     [rstate.type(), output_type])
-
-    def get_params(self, node):
-        return node.inputs[0].type.context
-
-    @classmethod
-    def new(cls, rstate, ndim, dtype, size):
-        v_size = as_tensor_variable(size)
-        if ndim is None:
-            ndim = get_vector_length(v_size)
-        op = cls(GpuArrayType(dtype, (False,) * ndim))
-        return op(rstate, v_size)
-
-    def c_headers(self):
-        return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
-
-    def gpu_kernels(self, node, name):
-        write = write_w(self.output_type.dtype)
-        if self.output_type.dtype == 'float16':
-            otype = 'ga_half'
-            # limit the values of the state that we use.
-            mask = '& 0x7fff'
-            NORM = '3.0518e-05f'  # np.float16(1.0/(2**15+8))
-            # this was determined by finding the biggest number such that
-            # np.float16(number * (M1 & 0x7fff)) < 1.0
-        elif self.output_type.dtype == 'float32':
-            otype = 'float'
-            mask = ''
-            NORM = '4.6566126e-10f'  # np.float32(1.0/(2**31+65))
-            # this was determined by finding the biggest number such that
-            # np.float32(number * M1) < 1.0
-        elif self.output_type.dtype == 'float64':
-            otype = 'double'
-            mask = ''
-            NORM = '4.656612873077392578125e-10'
-        else:
-            raise ValueError('Unsupported data type for output',
-                             self.output_type.dtype)
-        code = """
-        KERNEL void mrg_uniform(
-                GLOBAL_MEM %(otype)s *sample_data,
-                GLOBAL_MEM ga_int *state_data,
-                const ga_uint Nsamples,
-                const ga_uint Nstreams_used)
-        {
-            /*
-             * The cluda backend makes sure that ga_int corresponds to
-             * a 32 bit signed type on the target device.  It is not a
-             * variable width type.
-             */
-            const ga_int i7 = 7;
-            const ga_int i9 = 9;
-            const ga_int i15 = 15;
-            const ga_int i16 = 16;
-            const ga_int i22 = 22;
-            const ga_int i24 = 24;
-
-            const ga_int M1 = 2147483647;      //2^31 - 1
-            const ga_int M2 = 2147462579;      //2^31 - 21069
-            const ga_int MASK12 = 511;       //2^9 - 1
-            const ga_int MASK13 = 16777215;  //2^24 - 1
-            const ga_int MASK2 = 65535;      //2^16 - 1
-            const ga_int MULT2 = 21069;
-
-            const ga_uint idx = GID_0 * LDIM_0 + LID_0;
-            ga_int y1, y2, x11, x12, x13, x21, x22, x23;
-
-            if (idx < Nstreams_used)
-            {
-            x11 = state_data[idx*6+0];
-            x12 = state_data[idx*6+1];
-            x13 = state_data[idx*6+2];
-            x21 = state_data[idx*6+3];
-            x22 = state_data[idx*6+4];
-            x23 = state_data[idx*6+5];
-
-            for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
-            {
-                y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
-                y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
-                y1 += x13;
-                y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
-                x13 = x12;
-                x12 = x11;
-                x11 = y1;
-
-                y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
-                y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
-                y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
-                y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
-                y2 += x23;
-                y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
-                y2 += y1;
-                y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
-
-                x23 = x22;
-                x22 = x21;
-                x21 = y2;
-
-                if (x11 <= x21) {
-                    sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
-                }
-                else
-                {
-                    sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
-                }
-            }
-
-            state_data[idx*6+0]= x11;
-            state_data[idx*6+1]= x12;
-            state_data[idx*6+2]= x13;
-            state_data[idx*6+3]= x21;
-            state_data[idx*6+4]= x22;
-            state_data[idx*6+5]= x23;
-            }
-        }
-
-        """ % locals()
-
-        # we shouldn't get to this line if it's about to fail
-        from pygpu import gpuarray
-
-        return [Kernel(code=code, name="mrg_uniform",
-                       params=[gpuarray.GpuArray, gpuarray.GpuArray,
-                               'uint32', 'uint32'],
-                       flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
-                ]
-
-    def c_code(self, node, nodename, inp, out, sub):
-        rstate, size = inp
-        o_rstate, o_sample = out
-        inplace = int(self.inplace)
-        ndim = self.output_type.ndim
-        o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
-        fail = sub['fail']
-        ctx = sub['params']
-        kname = self.gpu_kernels(node, nodename)[0].objvar
-        otypecode = str(self.output_type.typecode)
-
-        return """
-        npy_int64 M1 = 2147483647;      //2^31 - 1
-        // The +1 is to avoid odims[0] which fails on windows
-        size_t odims[%(ndim)s+1];
-        size_t n_elements = 1;
-        unsigned int n_streams;
-        int must_alloc_sample = ((NULL == %(o_sample)s)
-                || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
-                || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
-                || (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s));
-
-        if (PyArray_NDIM(%(size)s) != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "size must be vector");
-            %(fail)s
-        }
-        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
-        {
-            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
-                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
-            %(fail)s
-        }
-
-        for (int i = 0; i < %(ndim)s; ++i)
-        {
-            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
-            n_elements *= odims[i];
-            must_alloc_sample = (must_alloc_sample
-                    || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
-        }
-
-        if (n_elements > M1)
-        {
-            PyErr_SetString(
-                PyExc_ValueError,
-                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
-            %(fail)s
-        }
-        if (must_alloc_sample)
-        {
-            Py_XDECREF(%(o_sample)s);
-            %(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
-                                       %(ctx)s, Py_None);
-            if(!%(o_sample)s)
-            {
-                %(fail)s;
-            }
-        }
-        if (!pygpu_GpuArray_Check((PyObject*)%(rstate)s))
-        {
-            PyErr_Format(PyExc_ValueError, "rstate must be gpuarray");
-            %(fail)s;
-        }
-
-        Py_XDECREF(%(o_rstate)s);
-        if (%(inplace)s)
-        {
-            Py_INCREF(%(rstate)s);
-            %(o_rstate)s = %(rstate)s;
-        }
-        else
-        {
-            %(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
-            if (!%(o_rstate)s) {
-                %(fail)s
-            }
-        }
-
-        if (PyGpuArray_NDIM(%(o_rstate)s) != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "rstate must be a matrix");
-            %(fail)s
-        }
-        if (PyGpuArray_DIMS(%(o_rstate)s)[1] != 6)
-        {
-            PyErr_Format(PyExc_ValueError, "rstate must have 6 columns");
-            %(fail)s
-        }
-        if (%(o_rstate)s->ga.typecode != GA_INT) {
-            PyErr_Format(PyExc_ValueError, "rstate must be int32");
-            %(fail)s
-        }
-        if (!GpuArray_CHKFLAGS(&%(o_rstate)s->ga, GA_C_CONTIGUOUS)) {
-            PyErr_Format(PyExc_ValueError, "rstate must be C contiguous");
-            %(fail)s
-        }
-        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0];
-        if (n_streams > n_elements)
-          n_streams = n_elements;
-
-        {
-          size_t ls = 0, gs = 0;
-          int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
-          if (err != GA_NO_ERROR) {
-              PyErr_Format(PyExc_RuntimeError, "GpuKernel_sched: %%s\\n",
-                           GpuKernel_error(&%(kname)s, err));
-              %(fail)s
-          }
-          // Make sure we run as many blocks as we need to cover the whole n_streams
-          gs = (n_streams + ls - 1)/ls;
-          err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_rstate)s->ga.data, n_elements, n_streams);
-          if (err != GA_NO_ERROR) {
-              PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
-                           GpuKernel_error(&%(kname)s, err));
-              %(fail)s
-          }
-        }
-        """ % locals()
-
-    def c_code_cache_version(self):
-        return (12,)
-
-
 def guess_n_streams(size, warn=False):
    # TODO : need description for parameter 'size'
    """
@@ -1317,36 +1043,16 @@ class MRG_RandomStreams(object):
        return final_samples


-@register_opt2([mrg_uniform], 'fast_compile')
-def local_gpua_mrg_graph(op, context_name, inputs, outputs):
-    if (type(op) == mrg_uniform and
-            isinstance(inputs[0].type, GpuArrayType)):
-        outs = GPUA_mrg_uniform.new(inputs[0],
-                                    op.output_type.ndim,
-                                    op.output_type.dtype,
-                                    inputs[1])
-        return [outs[0], outs[1].transfer('cpu')]
-
-
-@register_gpua('fast_compile')
-@local_optimizer([mrg_uniform])
-def local_gpua_mrg(node):
-    context_name = infer_context_name(*node.inputs)
-    return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
-
-
-MRG_RNGs = (mrg_uniform, GPUA_mrg_uniform)
-
-
-@local_optimizer(MRG_RNGs)
+@local_optimizer((mrg_uniform_base,))
 def mrg_random_make_inplace(node):

    op = node.op
-    if isinstance(op, MRG_RNGs) and not op.inplace:
+    if isinstance(op, mrg_uniform_base) and not op.inplace:
        # op might be gpu version
        new_op = op.__class__(op.output_type, inplace=True)
        return new_op.make_node(*node.inputs).outputs
    return False
+
 optdb.register('random_make_inplace_mrg',
               opt.in2out(mrg_random_make_inplace, ignore_newtrees=True),
               99, 'fast_run', 'inplace')
--- a/theano/sandbox/tests/test_rng_mrg.py
+++ b/theano/sandbox/tests/test_rng_mrg.py
@@ -17,7 +17,6 @@ from theano.sandbox import rng_mrg
 from theano.sandbox.rng_mrg import MRG_RandomStreams
 from theano.tests import unittest_tools as utt
 from theano.tests.unittest_tools import attr
-import theano.gpuarray.tests.config

 # TODO: test MRG_RandomStreams
 # Partly done in test_consistency_randomstreams
@@ -186,129 +185,6 @@ def test_consistency_cpu_parallel():
    assert(np.allclose(samples, java_samples))


-def test_consistency_GPUA_serial():
-    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
-    # are the same as the reference (Java) implementation by L'Ecuyer et al.
-    from theano.gpuarray.tests.config import mode_with_gpu as mode
-    from theano.gpuarray.type import gpuarray_shared_constructor
-
-    seed = 12345
-    n_samples = 5
-    n_streams = 12
-    n_substreams = 7
-
-    samples = []
-    curr_rstate = np.array([seed] * 6, dtype='int32')
-
-    for i in range(n_streams):
-        stream_rstate = curr_rstate.copy()
-        for j in range(n_substreams):
-            substream_rstate = np.array([stream_rstate.copy()],
-                                        dtype='int32')
-            # Transfer to device
-            rstate = gpuarray_shared_constructor(substream_rstate)
-
-            new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate,
-                                                              ndim=None,
-                                                              dtype='float32',
-                                                              size=(1,))
-            rstate.default_update = new_rstate
-
-            # Not really necessary, just mimicking
-            # rng_mrg.MRG_RandomStreams' behavior
-            sample.rstate = rstate
-            sample.update = (rstate, new_rstate)
-
-            # We need the sample back in the main memory
-            cpu_sample = tensor.as_tensor_variable(sample)
-            f = theano.function([], cpu_sample, mode=mode)
-            for k in range(n_samples):
-                s = f()
-                samples.append(s)
-
-            # next substream
-            stream_rstate = rng_mrg.ff_2p72(stream_rstate)
-
-        # next stream
-        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
-
-    samples = np.array(samples).flatten()
-    assert(np.allclose(samples, java_samples))
-
-
-def test_consistency_GPUA_parallel():
-    # Verify that the random numbers generated by GPUA_mrg_uniform, in
-    # parallel, are the same as the reference (Java) implementation by
-    # L'Ecuyer et al.
-    from theano.gpuarray.tests.config import mode_with_gpu as mode
-    from theano.gpuarray.type import gpuarray_shared_constructor
-
-    seed = 12345
-    n_samples = 5
-    n_streams = 12
-    n_substreams = 7  # 7 samples will be drawn in parallel
-
-    samples = []
-    curr_rstate = np.array([seed] * 6, dtype='int32')
-
-    for i in range(n_streams):
-        stream_samples = []
-        rstate = [curr_rstate.copy()]
-        for j in range(1, n_substreams):
-            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
-        rstate = np.asarray(rstate)
-        rstate = gpuarray_shared_constructor(rstate)
-
-        new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate, ndim=None,
-                                                          dtype='float32',
-                                                          size=(n_substreams,))
-        rstate.default_update = new_rstate
-
-        # Not really necessary, just mimicking
-        # rng_mrg.MRG_RandomStreams' behavior
-        sample.rstate = rstate
-        sample.update = (rstate, new_rstate)
-
-        # We need the sample back in the main memory
-        cpu_sample = tensor.as_tensor_variable(sample)
-        f = theano.function([], cpu_sample, mode=mode)
-
-        for k in range(n_samples):
-            s = f()
-            stream_samples.append(s)
-
-        samples.append(np.array(stream_samples).T.flatten())
-
-        # next stream
-        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
-
-    samples = np.array(samples).flatten()
-    assert(np.allclose(samples, java_samples))
-
-
-def test_GPUA_full_fill():
-    # Make sure the whole sample buffer is filled.  Also make sure
-    # large samples are consistent with CPU results.
-    import theano.gpuarray.tests.config
-    from theano.gpuarray.type import gpuarray_shared_constructor
-
-    # This needs to be large to trigger the problem on GPU
-    size = (10, 1000)
-
-    R = MRG_RandomStreams(234)
-    uni = R.uniform(size, nstreams=60 * 256)
-    f_cpu = theano.function([], uni)
-
-    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
-    new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate_gpu, ndim=None,
-                                                      dtype='float32',
-                                                      size=size)
-    rstate_gpu.default_update = new_rstate
-    f_gpu = theano.function([], sample)
-
-    utt.assert_allclose(f_cpu(), f_gpu())
-
-
 def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=None,
              target_avg=0.5, target_std=None, mean_rtol=0.01, std_tol=0.01):
    if inputs is None:
@@ -842,42 +718,6 @@ def test_overflow_cpu():
    rng_mrg_overflow(sizes, fct, config.mode, should_raise_error=False)


-def test_overflow_gpu_new_backend():
-    from theano.gpuarray.tests.test_basic_ops import \
-        mode_with_gpu as mode
-    from theano.gpuarray.type import gpuarray_shared_constructor
-    seed = 12345
-    n_substreams = 7
-    curr_rstate = np.array([seed] * 6, dtype='int32')
-    rstate = [curr_rstate.copy()]
-    for j in range(1, n_substreams):
-        rstate.append(rng_mrg.ff_2p72(rstate[-1]))
-    rstate = np.asarray(rstate)
-    rstate = gpuarray_shared_constructor(rstate)
-    fct = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
-                            ndim=None, dtype='float32')
-    # should raise error as the size overflows
-    sizes = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
-    rng_mrg_overflow(sizes, fct, mode, should_raise_error=True)
-    # should not raise error
-    sizes = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
-    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
-    # should support int32 sizes
-    sizes = [(np.int32(2**10), ),
-             (np.int32(2), np.int32(2**10), np.int32(2**10))]
-    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
-
-
-def test_validate_input_types_gpuarray_backend():
-    from theano.sandbox.rng_mrg import mrg_uniform
-    from theano.gpuarray.type import gpuarray_shared_constructor
-    from theano.configparser import change_flags
-
-    with change_flags(compute_test_value="raise"):
-        rstate = np.zeros((7, 6), dtype="int32")
-        rstate = gpuarray_shared_constructor(rstate)
-        mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3,))
-
 if __name__ == "__main__":
    rng = MRG_RandomStreams(np.random.randint(2147462579))
    print(theano.__file__)