Merge pull request #1799 from ynd/fast_rng_mrg

Fast rng mrg

Merge pull request #1799 from ynd/fast_rng_mrg
c6c37e1d · Frédéric Bastien · 90b5a114 · 9fdbadc1 · c6c37e1d · c6c37e1d
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -9,8 +9,8 @@ import warnings
 import numpy
-from theano import Op, Apply, shared, config, Variable
+from theano import Op, Apply, shared, config, Variable, Out
-from theano import gradient
+from theano import gradient, function
 from theano import tensor
 from theano.tensor import (raw_random, TensorType, as_tensor_variable,
                           get_vector_length, cast, opt, scal)
@@ -35,27 +35,165 @@ def matVecModM(A, s, m):
 def multMatVect(v, A, m1, B, m2):
-    #multiply the first half of v by A with a modulo of m1
+    """
-    #and the second half by B with a modulo of m2
+    multiply the first half of v by A with a modulo of m1
-    err_orig = numpy.seterr(over='ignore')
+    and the second half by B with a modulo of m2
-    try:
-        r = numpy.zeros_like(v)
+    Note: The parameters of dot_modulo are passed implicitly because passing
-        r[:3] = matVecModM(A, v[:3], m1)
+    them explicitly takes more time than running the function's C-code.
-        r[3:] = matVecModM(B, v[3:], m2)
+    """
-    finally:
+    if multMatVect.dot_modulo is None:
-        numpy.seterr(**err_orig)
+        A_sym = tensor.lmatrix('A')
+        s_sym = tensor.ivector('s')
+        m_sym = tensor.iscalar('m')
+        A2_sym = tensor.lmatrix('A2')
+        s2_sym = tensor.ivector('s2')
+        m2_sym = tensor.iscalar('m2')
+        o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
+        multMatVect.dot_modulo = function(
+            [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o)
+    # Calling the Theano function through its storage cells bypasses Theano's call overhead.
+    f = multMatVect.dot_modulo
+    f.input_storage[0].storage[0] = A
+    f.input_storage[1].storage[0] = v[:3]
+    f.input_storage[2].storage[0] = m1
+    f.input_storage[3].storage[0] = B
+    f.input_storage[4].storage[0] = v[3:]
+    f.input_storage[5].storage[0] = m2
+    f.fn()
+    r = f.output_storage[0].storage[0]
    return r
+multMatVect.dot_modulo = None
+class DotModulo(Op):
+    """
+    Efficient and numerically stable implementation of a dot product followed
+    by a modulo operation. This performs the same function as matVecModM.
+    We do this 2 times on 2 triple inputs and concatenate the output.
+    """
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __hash__(self):
+        return hash(type(self))
+    def make_node(self, A, s, m, A2, s2, m2):
+        return Apply(self, [A, s, m, A2, s2, m2], [s.type()])
+    def perform(self, node, (A, s, m, A2, s2, m2), (out, )):
+        o1 = matVecModM(A, s, m)
+        o2 = matVecModM(A2, s2, m2)
+        out[0] = numpy.concatenate((o1, o2))
+    def c_code_cache_version(self):
+        return
+        return (5,)
+    def c_code(self, node, name, (_A, _s, _m, _A2, _s2, _m2), (_z, ), sub):
+        return """
+        int osize = -1;
+        if (PyArray_NDIM(%(_A)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(A) != 2"); %(fail)s;}
+        if (PyArray_NDIM(%(_s)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(v) != 1"); %(fail)s;}
+        if (PyArray_NDIM(%(_m)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(m) != 0"); %(fail)s;}
+        if (PyArray_NDIM(%(_A2)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(A2) != 2"); %(fail)s;}
+        if (PyArray_NDIM(%(_s2)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(v2) != 1"); %(fail)s;}
+        if (PyArray_NDIM(%(_m2)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(m2) != 0"); %(fail)s;}
+        if( PyArray_DIMS(%(_A)s)[1] != PyArray_DIMS(%(_s)s)[0])
+        {PyErr_SetString(PyExc_NotImplementedError, "A and s shapes don't agree."); %(fail)s;}
+        if( PyArray_DIMS(%(_A2)s)[1] != PyArray_DIMS(%(_s2)s)[0])
+        {PyErr_SetString(PyExc_NotImplementedError, "A2 and s2 shapes don't agree."); %(fail)s;}
+        osize = PyArray_DIMS(%(_A)s)[0] + PyArray_DIMS(%(_A2)s)[0];
+        if (!%(_z)s
+            || (PyArray_DIMS(%(_z)s)[0] != osize))
+        {
+            {Py_XDECREF(%(_z)s);}
+            npy_intp dims[] = {0,};
+            dims[0] = osize;
+            %(_z)s = (PyArrayObject*) PyArray_SimpleNew(1, dims, PyArray_TYPE(%(_s)s));
+        }
+        if(!%(_z)s){%(fail)s;}
+        {   //makes it compile even though labels jump over variable definitions.
+            // A has size MxN, s has N, output M
+            npy_intp M = PyArray_DIMS(%(_A)s)[0];
+            npy_intp N = PyArray_DIMS(%(_A)s)[1];
+            const dtype_%(_A)s* __restrict__ DA = (dtype_%(_A)s*)PyArray_DATA(%(_A)s);
+            dtype_%(_s)s* __restrict__ Ds = (dtype_%(_s)s*)PyArray_DATA(%(_s)s);
+            dtype_%(_z)s* __restrict__ Dz = (dtype_%(_z)s*)PyArray_DATA(%(_z)s);
+            const dtype_%(_m)s m = ((dtype_%(_m)s*)PyArray_DATA(%(_m)s))[0];
+            npy_intp SA = PyArray_STRIDES(%(_A)s)[1] / PyArray_DESCR(%(_A)s)->elsize;
+            npy_intp Ss = PyArray_STRIDES(%(_s)s)[0] / PyArray_DESCR(%(_s)s)->elsize;
+            npy_intp Sz = PyArray_STRIDES(%(_z)s)[0] / PyArray_DESCR(%(_z)s)->elsize;
+            for (npy_int32 i = 0; i < M; ++i)
+            {
+                const dtype_%(_A)s* __restrict__ Ak = (dtype_%(_A)s*)(PyArray_BYTES(%(_A)s) + PyArray_STRIDES(%(_A)s)[0] * i);
+                npy_int64 r = 0;
+                for (npy_int32 j = 0; j < N; ++j)
+                {
+                    r += (npy_int64)(Ds[j * Ss] * (npy_int64)(Ak[j * SA])) %% m;
+                }
+                Dz[i * Sz] = r %% m;
+            }
+        }
+        //redo it with the second triple of inputs
+        {
+            // A has size MxN, s has N, output M
+            npy_intp M = PyArray_DIMS(%(_A2)s)[0];
+            npy_intp N = PyArray_DIMS(%(_A2)s)[1];
+            const dtype_%(_A2)s* __restrict__ DA = (dtype_%(_A2)s*)PyArray_DATA(%(_A2)s);
+            dtype_%(_s2)s* __restrict__ Ds = (dtype_%(_s2)s*)PyArray_DATA(%(_s2)s);
+            const dtype_%(_m2)s m = ((dtype_%(_m2)s*)PyArray_DATA(%(_m2)s))[0];
+            npy_intp SA = PyArray_STRIDES(%(_A2)s)[1] / PyArray_DESCR(%(_A2)s)->elsize;
+            npy_intp Ss = PyArray_STRIDES(%(_s2)s)[0] / PyArray_DESCR(%(_s2)s)->elsize;
+            npy_intp Sz = PyArray_STRIDES(%(_z)s)[0] / PyArray_DESCR(%(_z)s)->elsize;
+            dtype_%(_z)s* __restrict__ Dz = (dtype_%(_z)s*)PyArray_DATA(%(_z)s) + PyArray_DIMS(%(_A)s)[0] * Sz;
+            for (npy_int32 i = 0; i < M; ++i)
+            {
+                const dtype_%(_A2)s* __restrict__ Ak = (dtype_%(_A2)s*)(PyArray_BYTES(%(_A2)s) + PyArray_STRIDES(%(_A2)s)[0] * i);
+                npy_int64 r = 0;
+                for (npy_int32 j = 0; j < N; ++j)
+                {
+                    r += (npy_int64)(Ds[j * Ss] * (npy_int64)(Ak[j * SA])) %% m;
+                }
+                Dz[i * Sz] = r %% m;
+            }
+        }
+        """ % dict(locals(), **sub)
 #MRG31k3p
 #generator constants :
-M1 = numpy.int32(2147483647)    #2^31 - 1
+M1 = numpy.asarray(numpy.int32(2147483647))    #2^31 - 1
-M2 = numpy.int32(2147462579)    #2^31 - 21069
+M2 = numpy.asarray(numpy.int32(2147462579))    #2^31 - 21069
-MASK12 = numpy.int32(511)       #2^9 - 1
+MASK12 = numpy.int32(511)                      #2^9 - 1
-MASK13 = numpy.int32(16777215)  #2^24 - 1
+MASK13 = numpy.int32(16777215)                 #2^24 - 1
-MASK2 = numpy.int32(65535)      #2^16 - 1
+MASK2 = numpy.int32(65535)                     #2^16 - 1
 MULT2 = numpy.int32(21069)
-NORM = 4.656612873077392578125e-10; #1./2^31
+NORM = 4.656612873077392578125e-10;            #1./2^31
 #A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]],
 #                      dtype='int64')
@@ -96,42 +234,41 @@ def mrg_next_value(rstate, new_rstate):
    x11, x12, x13, x21, x22, x23 = rstate
    assert type(x11) == numpy.int32
-    #i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
    i0, i7, i9, i15, i16, i22, i24 = np_int32_vals
    #first component
    y1 = (((x12 & MASK12) << i22) + (x12 >> i9) +
          ((x13 & MASK13) << i7) + (x13 >> i24))
    assert type(y1) == numpy.int32
-    if (y1 < 0 or y1 >= M1):     #must also check overflow
+    if (y1 < 0 or y1 >= M1):  # must also check overflow
-        y1 -= M1;
+        y1 -= M1
-    y1 += x13;
+    y1 += x13
    if (y1 < 0 or y1 >= M1):
-        y1 -= M1;
+        y1 -= M1
-    x13 = x12;
+    x13 = x12
-    x12 = x11;
+    x12 = x11
-    x11 = y1;
+    x11 = y1
    #second component
-    y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
+    y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16))
    assert type(y1) == numpy.int32
    if (y1 < 0 or y1 >= M2):
-        y1 -= M2;
+        y1 -= M2
-    y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
+    y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16))
    assert type(y2) == numpy.int32
    if (y2 < 0 or y2 >= M2):
-        y2 -= M2;
+        y2 -= M2
-    y2 += x23;
+    y2 += x23
    if (y2 < 0 or y2 >= M2):
-        y2 -= M2;
+        y2 -= M2
-    y2 += y1;
+    y2 += y1
    if (y2 < 0 or y2 >= M2):
-        y2 -= M2;
+        y2 -= M2
-    x23 = x22;
+    x23 = x22
-    x22 = x21;
+    x22 = x21
-    x21 = y2;
+    x21 = y2
    # Must never return either 0 or M1+1
    new_rstate[...] = [x11, x12, x13, x21, x22, x23]
@@ -146,9 +283,9 @@ class mrg_uniform_base(Op):
    def __init__(self, output_type, inplace=False):
        Op.__init__(self)
        self.output_type = output_type
-        self.inplace=inplace
+        self.inplace = inplace
        if inplace:
-            self.destroy_map = {0:[0]}
+            self.destroy_map = {0: [0]}
        self.warned_numpy_version = False
    def __eq__(self, other):
@@ -200,8 +337,12 @@ class mrg_uniform(mrg_uniform_base):
        rstate, size = inp
        o_rstate, o_sample = out
        numpy_version = numpy.__version__.split('.')
-        if not self.warned_numpy_version and int(numpy_version[0]) <= 1 and int(numpy_version[1]) <3 :
-            print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory."
+        if (not self.warned_numpy_version and
+            int(numpy_version[0]) <= 1 and
+            int(numpy_version[1]) < 3):
+            print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
            self.warned_numpy_version = True
        n_elements = 1
@@ -226,8 +367,9 @@ class mrg_uniform(mrg_uniform_base):
        finally:
            numpy.seterr(**err_orig)
-        o_rstate[0] = node.outputs[0].type.filter(rstate)  # send to GPU if necessary
+        # send to GPU if necessary
-        o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))  # send to GPU if necessary
+        o_rstate[0] = node.outputs[0].type.filter(rstate)
+        o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))
    def c_code(self, node, name, inp, out, sub):
        rstate, size = inp
@@ -862,7 +1004,7 @@ def guess_n_streams(size, warn=True):
        for s in size:
            r *= s
        if r > 6:
-            r = r // 6 # chosen as fastest for rbm_benchmark
+            r = r // 6  # chosen as fastest for rbm_benchmark
        # The purpose of sampling from many streams is to be able to use
        # the GPU to its full capacity.  It just wastes RAM and stream-initialization time to
@@ -875,8 +1017,8 @@ def guess_n_streams(size, warn=True):
    else:
        if warn:
            warnings.warn((
-                    "MRG_RandomStreams Can't determine #streams from "
+                "MRG_RandomStreams Can't determine #streams from "
-                    "size (%s), guessing 60*256") % str(size),
+                "size (%s), guessing 60*256") % str(size),
                    stacklevel=3)
        return 60 * 256
@@ -928,7 +1070,8 @@ class MRG_RandomStreams(object):
    def inc_rstate(self):
        """Update self.rstate to be skipped 2^134 steps forward to the next stream start"""
-        self.rstate = ff_2p134(self.rstate)
+        #self.rstate = ff_2p134(self.rstate)
+        self.rstate = multMatVect(self.rstate, A1p134, M1, A2p134, M2)
        assert self.rstate.dtype == numpy.int32
    def get_substream_rstates(self, n_streams, inc_rstate=True):
@@ -939,8 +1082,26 @@ class MRG_RandomStreams(object):
        assert n_streams > 0
        rval = numpy.zeros((n_streams, 6), dtype='int32')
        rval[0] = self.rstate
+        # If multMatVect.dot_modulo isn't compiled, compile it.
+        if multMatVect.dot_modulo is None:
+            multMatVect(rval[0], A1p72, M1, A2p72, M2)
+        # Calling the Theano function through its storage cells bypasses Theano's call overhead.
+        f = multMatVect.dot_modulo
+        f.input_storage[0].storage[0] = A1p72
+        f.input_storage[2].storage[0] = M1
+        f.input_storage[3].storage[0] = A2p72
+        f.input_storage[5].storage[0] = M2
        for i in xrange(1, n_streams):
-            rval[i] = ff_2p72(rval[i - 1])
+            # Inline the following call to bypass Python overhead
+            #rval[i] = ff_2p72(rval[i - 1])
+            v = rval[i - 1]
+            f.input_storage[1].storage[0] = v[:3]
+            f.input_storage[4].storage[0] = v[3:]
+            f.fn()
+            rval[i] = f.output_storage[0].storage[0]
        if inc_rstate:
            self.inc_rstate()
        return rval
@@ -992,7 +1153,8 @@ class MRG_RandomStreams(object):
            msg = "size must be a tuple of int or a Theano variable"
            assert all([isinstance(i, (numpy.integer, int, Variable))
                        for i in size]), msg
-            if any([isinstance(i, (numpy.integer, int)) and i <= 0 for i in size]):
+            if any([isinstance(i, (numpy.integer, int)) and i <= 0
+                    for i in size]):
                raise ValueError(
                    "The specified size contains a dimension with value <= 0",
                    size)

--- a/theano/sandbox/test_rng_mrg.py
+++ b/theano/sandbox/test_rng_mrg.py
@@ -874,3 +874,47 @@ def test_gradient_scan():
    gw = theano.grad(tensor.sum(values[-1]), w)
    f = theano.function([x], gw)
    f(numpy.arange(1, dtype='float32'))
+def test_multMatVect():
+    A1 = tensor.lmatrix('A1')
+    s1 = tensor.ivector('s1')
+    m1 = tensor.iscalar('m1')
+    A2 = tensor.lmatrix('A2')
+    s2 = tensor.ivector('s2')
+    m2 = tensor.iscalar('m2')
+    g0 = rng_mrg.DotModulo()(A1, s1, m1, A2, s2, m2)
+    f0 = theano.function([A1, s1, m1, A2, s2, m2], g0)
+    A1 = numpy.random.randint(0, numpy.iinfo(numpy.int32).max, (3, 3)).astype('int64')
+    s1 = numpy.random.randint(0, numpy.iinfo(numpy.int32).max, 3).astype('int32')
+    m1 = numpy.asarray(numpy.random.randint(numpy.iinfo(numpy.int32).max), dtype="int32")
+    A2 = numpy.random.randint(0, numpy.iinfo(numpy.int32).max, (3, 3)).astype('int64')
+    s2 = numpy.random.randint(0, numpy.iinfo(numpy.int32).max, 3).astype('int32')
+    m2 = numpy.asarray(numpy.random.randint(numpy.iinfo(numpy.int32).max), dtype="int32")
+    f0.input_storage[0].storage[0] = A1
+    f0.input_storage[1].storage[0] = s1
+    f0.input_storage[2].storage[0] = m1
+    f0.input_storage[3].storage[0] = A2
+    f0.input_storage[4].storage[0] = s2
+    f0.input_storage[5].storage[0] = m2
+    r_a1 = rng_mrg.matVecModM(A1, s1, m1)
+    r_a2 = rng_mrg.matVecModM(A2, s2, m2)
+    r_b = f0.fn()[0]
+    assert numpy.allclose(r_a1, r_b[:3])
+    assert numpy.allclose(r_a2, r_b[3:])
+if __name__ == "__main__":
+    rng = MRG_RandomStreams(numpy.random.randint(2147462579))
+    import time
+    print theano.__file__
+    pvals = theano.tensor.fmatrix()
+    for i in range(10):
+        t0 = time.time()
+        multinomial = rng.multinomial(pvals=pvals)
+        print time.time() - t0