提交 5a8017a9 作者: Yann N. Dauphin

Merge pull request #2 from nouiz/ynd-fast_rng_mrg

Other rng mrg state init speed up.
...@@ -9,7 +9,7 @@ import warnings ...@@ -9,7 +9,7 @@ import warnings
import numpy import numpy
from theano import Op, Apply, shared, config, Variable from theano import Op, Apply, shared, config, Variable, Out
from theano import gradient, function from theano import gradient, function
from theano import tensor from theano import tensor
from theano.tensor import (raw_random, TensorType, as_tensor_variable, from theano.tensor import (raw_random, TensorType, as_tensor_variable,
...@@ -36,29 +36,34 @@ def multMatVect(v, A, m1, B, m2): ...@@ -36,29 +36,34 @@ def multMatVect(v, A, m1, B, m2):
""" """
multiply the first half of v by A with a modulo of m1 multiply the first half of v by A with a modulo of m1
and the second half by B with a modulo of m2 and the second half by B with a modulo of m2
Note: The parameters of dot_modulo are passed implicitly because passing Note: The parameters of dot_modulo are passed implicitly because passing
them explicitly takes more time than running the function's C-code. them explicitly takes more time than running the function's C-code.
""" """
if multMatVect.dot_modulo == None: if multMatVect.dot_modulo is None:
A_sym = tensor.lmatrix('A') A_sym = tensor.lmatrix('A')
s_sym = tensor.ivector('s') s_sym = tensor.ivector('s')
m_sym = tensor.iscalar('m') m_sym = tensor.iscalar('m')
A2_sym = tensor.lmatrix('A2')
multMatVect.dot_modulo = function([A_sym, s_sym, m_sym], s2_sym = tensor.ivector('s2')
DotModulo()(A_sym, s_sym, m_sym)) m2_sym = tensor.iscalar('m2')
# We borrow the output as we will copy the answer elsewhere
r = numpy.zeros_like(v) o = Out(DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym),
multMatVect.dot_modulo.input_storage[0].storage[0] = A borrow=True)
multMatVect.dot_modulo.input_storage[1].storage[0] = v[:3] multMatVect.dot_modulo = function(
multMatVect.dot_modulo.input_storage[2].storage[0] = m1 [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o)
r[:3] = multMatVect.dot_modulo.fn()[0]
# This way of calling the Theano fct is done to bypass Theano overhead.
multMatVect.dot_modulo.input_storage[0].storage[0] = B f = multMatVect.dot_modulo
multMatVect.dot_modulo.input_storage[1].storage[0] = v[3:] f.input_storage[0].storage[0] = A
multMatVect.dot_modulo.input_storage[2].storage[0] = m2 f.input_storage[1].storage[0] = v[:3]
r[3:] = multMatVect.dot_modulo.fn()[0] f.input_storage[2].storage[0] = m1
f.input_storage[3].storage[0] = B
f.input_storage[4].storage[0] = v[3:]
f.input_storage[5].storage[0] = m2
f.fn()
r = f.output_storage[0].storage[0]
return r return r
multMatVect.dot_modulo = None multMatVect.dot_modulo = None
...@@ -67,53 +72,65 @@ class DotModulo(Op): ...@@ -67,53 +72,65 @@ class DotModulo(Op):
""" """
Efficient and numerically stable implementation of a dot product followed Efficient and numerically stable implementation of a dot product followed
by a modulo operation. This performs the same function as matVecModM. by a modulo operation. This performs the same function as matVecModM.
We do this twice, on two triples of inputs, and concatenate the outputs.
""" """
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, A, s, m):
return Apply(self, [A, s, m], [s.type()])
def perform(self, node, (A, s, m), (out, )): def make_node(self, A, s, m, A2, s2, m2):
out[0] = matVecModM(A, s, m) return Apply(self, [A, s, m, A2, s2, m2], [s.type()])
def perform(self, node, (A, s, m, A2, s2, m2), (out, )):
o1 = matVecModM(A, s, m)
o2 = matVecModM(A2, s2, m2)
out[0] = numpy.concatenate((o1, o2))
def c_code_cache_version(self): def c_code_cache_version(self):
return
return (5,) return (5,)
def c_code(self, node, name, (_A, _s, _m), (_z, ), sub): def c_code(self, node, name, (_A, _s, _m, _A2, _s2, _m2), (_z, ), sub):
return """ return """
int osize = -1;
if (PyArray_NDIM(%(_A)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(A) != 2"); %(fail)s;} if (PyArray_NDIM(%(_A)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(A) != 2"); %(fail)s;}
if (PyArray_NDIM(%(_s)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(v) != 1"); %(fail)s;} if (PyArray_NDIM(%(_s)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(v) != 1"); %(fail)s;}
if (PyArray_NDIM(%(_m)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(m) != 0"); %(fail)s;} if (PyArray_NDIM(%(_m)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(m) != 0"); %(fail)s;}
if (PyArray_NDIM(%(_A2)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(A2) != 2"); %(fail)s;}
if (PyArray_NDIM(%(_s2)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(v2) != 1"); %(fail)s;}
if (PyArray_NDIM(%(_m2)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(m2) != 0"); %(fail)s;}
if( PyArray_DIMS(%(_A)s)[1] != PyArray_DIMS(%(_s)s)[0]) if( PyArray_DIMS(%(_A)s)[1] != PyArray_DIMS(%(_s)s)[0])
{PyErr_SetString(PyExc_NotImplementedError, "A and s shapes don't agree."); %(fail)s;} {PyErr_SetString(PyExc_NotImplementedError, "A and s shapes don't agree."); %(fail)s;}
if( PyArray_DIMS(%(_A2)s)[1] != PyArray_DIMS(%(_s2)s)[0])
{PyErr_SetString(PyExc_NotImplementedError, "A2 and s2 shapes don't agree."); %(fail)s;}
osize = PyArray_DIMS(%(_A)s)[0] + PyArray_DIMS(%(_A2)s)[0];
if (!%(_z)s if (!%(_z)s
|| (PyArray_DIMS(%(_z)s)[0] != PyArray_DIMS(%(_A)s)[0])) || (PyArray_DIMS(%(_z)s)[0] != osize))
{ {
{Py_XDECREF(%(_z)s);} {Py_XDECREF(%(_z)s);}
npy_intp dims[] = {0,}; npy_intp dims[] = {0,};
dims[0] = PyArray_DIMS(%(_A)s)[0]; dims[0] = osize;
%(_z)s = (PyArrayObject*) PyArray_SimpleNew(1, dims, PyArray_TYPE(%(_s)s)); %(_z)s = (PyArrayObject*) PyArray_SimpleNew(1, dims, PyArray_TYPE(%(_s)s));
} }
if(!%(_z)s){%(fail)s;} if(!%(_z)s){%(fail)s;}
{ //makes it compile even though labels jump over variable definitions. { //makes it compile even though labels jump over variable definitions.
// A has size MxN, s has N, output M // A has size MxN, s has N, output M
npy_intp M = PyArray_DIMS(%(_A)s)[0]; npy_intp M = PyArray_DIMS(%(_A)s)[0];
npy_intp N = PyArray_DIMS(%(_A)s)[1]; npy_intp N = PyArray_DIMS(%(_A)s)[1];
const dtype_%(_A)s* __restrict__ DA = (dtype_%(_A)s*)PyArray_DATA(%(_A)s); const dtype_%(_A)s* __restrict__ DA = (dtype_%(_A)s*)PyArray_DATA(%(_A)s);
dtype_%(_s)s* __restrict__ Ds = (dtype_%(_s)s*)PyArray_DATA(%(_s)s); dtype_%(_s)s* __restrict__ Ds = (dtype_%(_s)s*)PyArray_DATA(%(_s)s);
dtype_%(_z)s* __restrict__ Dz = (dtype_%(_z)s*)PyArray_DATA(%(_z)s); dtype_%(_z)s* __restrict__ Dz = (dtype_%(_z)s*)PyArray_DATA(%(_z)s);
const dtype_%(_m)s m = ((dtype_%(_m)s*)PyArray_DATA(%(_m)s))[0]; const dtype_%(_m)s m = ((dtype_%(_m)s*)PyArray_DATA(%(_m)s))[0];
npy_intp SA = PyArray_STRIDES(%(_A)s)[1] / PyArray_DESCR(%(_A)s)->elsize; npy_intp SA = PyArray_STRIDES(%(_A)s)[1] / PyArray_DESCR(%(_A)s)->elsize;
npy_intp Ss = PyArray_STRIDES(%(_s)s)[0] / PyArray_DESCR(%(_s)s)->elsize; npy_intp Ss = PyArray_STRIDES(%(_s)s)[0] / PyArray_DESCR(%(_s)s)->elsize;
npy_intp Sz = PyArray_STRIDES(%(_z)s)[0] / PyArray_DESCR(%(_z)s)->elsize; npy_intp Sz = PyArray_STRIDES(%(_z)s)[0] / PyArray_DESCR(%(_z)s)->elsize;
...@@ -121,18 +138,50 @@ class DotModulo(Op): ...@@ -121,18 +138,50 @@ class DotModulo(Op):
for (npy_int32 i = 0; i < M; ++i) for (npy_int32 i = 0; i < M; ++i)
{ {
const dtype_%(_A)s* __restrict__ Ak = (dtype_%(_A)s*)(PyArray_BYTES(%(_A)s) + PyArray_STRIDES(%(_A)s)[0] * i); const dtype_%(_A)s* __restrict__ Ak = (dtype_%(_A)s*)(PyArray_BYTES(%(_A)s) + PyArray_STRIDES(%(_A)s)[0] * i);
npy_int64 r = 0; npy_int64 r = 0;
for (npy_int32 j = 0; j < N; ++j) for (npy_int32 j = 0; j < N; ++j)
{ {
r += (npy_int64)(Ds[j * Ss] * (npy_int64)(Ak[j * SA])) %% m; r += (npy_int64)(Ds[j * Ss] * (npy_int64)(Ak[j * SA])) %% m;
} }
Dz[i * Sz] = r %% m; Dz[i * Sz] = r %% m;
} }
} }
//redo it with the second triple of inputs
{
// A has size MxN, s has N, output M
npy_intp M = PyArray_DIMS(%(_A2)s)[0];
npy_intp N = PyArray_DIMS(%(_A2)s)[1];
const dtype_%(_A2)s* __restrict__ DA = (dtype_%(_A2)s*)PyArray_DATA(%(_A2)s);
dtype_%(_s2)s* __restrict__ Ds = (dtype_%(_s2)s*)PyArray_DATA(%(_s2)s);
const dtype_%(_m2)s m = ((dtype_%(_m2)s*)PyArray_DATA(%(_m2)s))[0];
npy_intp SA = PyArray_STRIDES(%(_A2)s)[1] / PyArray_DESCR(%(_A2)s)->elsize;
npy_intp Ss = PyArray_STRIDES(%(_s2)s)[0] / PyArray_DESCR(%(_s2)s)->elsize;
npy_intp Sz = PyArray_STRIDES(%(_z)s)[0] / PyArray_DESCR(%(_z)s)->elsize;
dtype_%(_z)s* __restrict__ Dz = (dtype_%(_z)s*)PyArray_DATA(%(_z)s) + PyArray_DIMS(%(_A)s)[0] * Sz;
for (npy_int32 i = 0; i < M; ++i)
{
const dtype_%(_A2)s* __restrict__ Ak = (dtype_%(_A2)s*)(PyArray_BYTES(%(_A2)s) + PyArray_STRIDES(%(_A2)s)[0] * i);
npy_int64 r = 0;
for (npy_int32 j = 0; j < N; ++j)
{
r += (npy_int64)(Ds[j * Ss] * (npy_int64)(Ak[j * SA])) %% m;
}
Dz[i * Sz] = r %% m;
}
}
""" % dict(locals(), **sub) """ % dict(locals(), **sub)
...@@ -185,42 +234,41 @@ def mrg_next_value(rstate, new_rstate): ...@@ -185,42 +234,41 @@ def mrg_next_value(rstate, new_rstate):
x11, x12, x13, x21, x22, x23 = rstate x11, x12, x13, x21, x22, x23 = rstate
assert type(x11) == numpy.int32 assert type(x11) == numpy.int32
#i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
i0, i7, i9, i15, i16, i22, i24 = np_int32_vals i0, i7, i9, i15, i16, i22, i24 = np_int32_vals
#first component #first component
y1 = (((x12 & MASK12) << i22) + (x12 >> i9) + y1 = (((x12 & MASK12) << i22) + (x12 >> i9) +
((x13 & MASK13) << i7) + (x13 >> i24)) ((x13 & MASK13) << i7) + (x13 >> i24))
assert type(y1) == numpy.int32 assert type(y1) == numpy.int32
if (y1 < 0 or y1 >= M1): #must also check overflow if (y1 < 0 or y1 >= M1): # must also check overflow
y1 -= M1; y1 -= M1
y1 += x13; y1 += x13
if (y1 < 0 or y1 >= M1): if (y1 < 0 or y1 >= M1):
y1 -= M1; y1 -= M1
x13 = x12; x13 = x12
x12 = x11; x12 = x11
x11 = y1; x11 = y1
#second component #second component
y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16)); y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16))
assert type(y1) == numpy.int32 assert type(y1) == numpy.int32
if (y1 < 0 or y1 >= M2): if (y1 < 0 or y1 >= M2):
y1 -= M2; y1 -= M2
y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16)); y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16))
assert type(y2) == numpy.int32 assert type(y2) == numpy.int32
if (y2 < 0 or y2 >= M2): if (y2 < 0 or y2 >= M2):
y2 -= M2; y2 -= M2
y2 += x23; y2 += x23
if (y2 < 0 or y2 >= M2): if (y2 < 0 or y2 >= M2):
y2 -= M2; y2 -= M2
y2 += y1; y2 += y1
if (y2 < 0 or y2 >= M2): if (y2 < 0 or y2 >= M2):
y2 -= M2; y2 -= M2
x23 = x22; x23 = x22
x22 = x21; x22 = x21
x21 = y2; x21 = y2
# Must never return either 0 or M1+1 # Must never return either 0 or M1+1
new_rstate[...] = [x11, x12, x13, x21, x22, x23] new_rstate[...] = [x11, x12, x13, x21, x22, x23]
...@@ -235,9 +283,9 @@ class mrg_uniform_base(Op): ...@@ -235,9 +283,9 @@ class mrg_uniform_base(Op):
def __init__(self, output_type, inplace=False): def __init__(self, output_type, inplace=False):
Op.__init__(self) Op.__init__(self)
self.output_type = output_type self.output_type = output_type
self.inplace=inplace self.inplace = inplace
if inplace: if inplace:
self.destroy_map = {0:[0]} self.destroy_map = {0: [0]}
self.warned_numpy_version = False self.warned_numpy_version = False
def __eq__(self, other): def __eq__(self, other):
...@@ -289,7 +337,10 @@ class mrg_uniform(mrg_uniform_base): ...@@ -289,7 +337,10 @@ class mrg_uniform(mrg_uniform_base):
rstate, size = inp rstate, size = inp
o_rstate, o_sample = out o_rstate, o_sample = out
numpy_version = numpy.__version__.split('.') numpy_version = numpy.__version__.split('.')
if not self.warned_numpy_version and int(numpy_version[0]) <= 1 and int(numpy_version[1]) <3 : if (not self.warned_numpy_version and
int(numpy_version[0]) <= 1 and
int(numpy_version[1]) < 3):
print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy" print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
self.warned_numpy_version = True self.warned_numpy_version = True
...@@ -315,8 +366,9 @@ class mrg_uniform(mrg_uniform_base): ...@@ -315,8 +366,9 @@ class mrg_uniform(mrg_uniform_base):
finally: finally:
numpy.seterr(**err_orig) numpy.seterr(**err_orig)
o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary # send to GPU if necessary
o_sample[0] = node.outputs[1].type.filter(rval.reshape(size)) # send to GPU if necessary o_rstate[0] = node.outputs[0].type.filter(rstate)
o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
rstate, size = inp rstate, size = inp
...@@ -718,7 +770,7 @@ def guess_n_streams(size, warn=True): ...@@ -718,7 +770,7 @@ def guess_n_streams(size, warn=True):
for s in size: for s in size:
r *= s r *= s
if r > 6: if r > 6:
r = r // 6 # chosen as fastest for rbm_benchmark r = r // 6 # chosen as fastest for rbm_benchmark
# The purpose of sampling from many streams is to be able to use # The purpose of sampling from many streams is to be able to use
# the GPU to its full capacity. It just wastes RAM and stream-initialization time to # the GPU to its full capacity. It just wastes RAM and stream-initialization time to
...@@ -731,8 +783,8 @@ def guess_n_streams(size, warn=True): ...@@ -731,8 +783,8 @@ def guess_n_streams(size, warn=True):
else: else:
if warn: if warn:
warnings.warn(( warnings.warn((
"MRG_RandomStreams Can't determine #streams from " "MRG_RandomStreams Can't determine #streams from "
"size (%s), guessing 60*256") % str(size), "size (%s), guessing 60*256") % str(size),
stacklevel=3) stacklevel=3)
return 60 * 256 return 60 * 256
...@@ -784,7 +836,8 @@ class MRG_RandomStreams(object): ...@@ -784,7 +836,8 @@ class MRG_RandomStreams(object):
def inc_rstate(self): def inc_rstate(self):
"""Update self.rstate to be skipped 2^134 steps forward to the next stream start""" """Update self.rstate to be skipped 2^134 steps forward to the next stream start"""
self.rstate = ff_2p134(self.rstate) #self.rstate = ff_2p134(self.rstate)
self.rstate = multMatVect(self.rstate, A1p134, M1, A2p134, M2)
assert self.rstate.dtype == numpy.int32 assert self.rstate.dtype == numpy.int32
def get_substream_rstates(self, n_streams, inc_rstate=True): def get_substream_rstates(self, n_streams, inc_rstate=True):
...@@ -795,8 +848,26 @@ class MRG_RandomStreams(object): ...@@ -795,8 +848,26 @@ class MRG_RandomStreams(object):
assert n_streams > 0 assert n_streams > 0
rval = numpy.zeros((n_streams, 6), dtype='int32') rval = numpy.zeros((n_streams, 6), dtype='int32')
rval[0] = self.rstate rval[0] = self.rstate
# If multMatVect.dot_modulo isn't compiled, compile it.
if multMatVect.dot_modulo is None:
multMatVect(rval[0], A1p72, M1, A2p72, M2)
# This way of calling the Theano fct is done to bypass Theano overhead.
f = multMatVect.dot_modulo
f.input_storage[0].storage[0] = A1p72
f.input_storage[2].storage[0] = M1
f.input_storage[3].storage[0] = A2p72
f.input_storage[5].storage[0] = M2
for i in xrange(1, n_streams): for i in xrange(1, n_streams):
rval[i] = ff_2p72(rval[i - 1]) # Inline the following call to bypass Python overhead
#rval[i] = ff_2p72(rval[i - 1])
v = rval[i - 1]
f.input_storage[1].storage[0] = v[:3]
f.input_storage[4].storage[0] = v[3:]
f.fn()
rval[i] = f.output_storage[0].storage[0]
if inc_rstate: if inc_rstate:
self.inc_rstate() self.inc_rstate()
return rval return rval
...@@ -848,7 +919,8 @@ class MRG_RandomStreams(object): ...@@ -848,7 +919,8 @@ class MRG_RandomStreams(object):
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, (numpy.integer, int, Variable)) assert all([isinstance(i, (numpy.integer, int, Variable))
for i in size]), msg for i in size]), msg
if any([isinstance(i, (numpy.integer, int)) and i <= 0 for i in size]): if any([isinstance(i, (numpy.integer, int)) and i <= 0
for i in size]):
raise ValueError( raise ValueError(
"The specified size contains a dimension with value <= 0", "The specified size contains a dimension with value <= 0",
size) size)
......
...@@ -793,3 +793,14 @@ def test_multMatVect(): ...@@ -793,3 +793,14 @@ def test_multMatVect():
r_b = f0.fn() r_b = f0.fn()
assert numpy.allclose(r_a, r_b) assert numpy.allclose(r_a, r_b)
if __name__ == "__main__":
rng = MRG_RandomStreams(numpy.random.randint(2147462579))
import time
print theano.__file__
pvals = theano.tensor.fmatrix()
for i in range(10):
t0 = time.time()
multinomial = rng.multinomial(pvals=pvals)
print time.time() - t0
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论