提交 5a8017a9 authored 作者: Yann N. Dauphin's avatar Yann N. Dauphin

Merge pull request #2 from nouiz/ynd-fast_rng_mrg

Other rng mrg state init speed up.
......@@ -9,7 +9,7 @@ import warnings
import numpy
from theano import Op, Apply, shared, config, Variable
from theano import Op, Apply, shared, config, Variable, Out
from theano import gradient, function
from theano import tensor
from theano.tensor import (raw_random, TensorType, as_tensor_variable,
......@@ -36,29 +36,34 @@ def multMatVect(v, A, m1, B, m2):
"""
multiply the first half of v by A with a modulo of m1
and the second half by B with a modulo of m2
Note: The parameters of dot_modulo are passed implicitly because passing
them explicitly takes more time then running the function's C-code.
"""
if multMatVect.dot_modulo == None:
if multMatVect.dot_modulo is None:
A_sym = tensor.lmatrix('A')
s_sym = tensor.ivector('s')
m_sym = tensor.iscalar('m')
multMatVect.dot_modulo = function([A_sym, s_sym, m_sym],
DotModulo()(A_sym, s_sym, m_sym))
r = numpy.zeros_like(v)
multMatVect.dot_modulo.input_storage[0].storage[0] = A
multMatVect.dot_modulo.input_storage[1].storage[0] = v[:3]
multMatVect.dot_modulo.input_storage[2].storage[0] = m1
r[:3] = multMatVect.dot_modulo.fn()[0]
multMatVect.dot_modulo.input_storage[0].storage[0] = B
multMatVect.dot_modulo.input_storage[1].storage[0] = v[3:]
multMatVect.dot_modulo.input_storage[2].storage[0] = m2
r[3:] = multMatVect.dot_modulo.fn()[0]
A2_sym = tensor.lmatrix('A2')
s2_sym = tensor.ivector('s2')
m2_sym = tensor.iscalar('m2')
# We borrow the output as we will copy the answer elsewhere
o = Out(DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym),
borrow=True)
multMatVect.dot_modulo = function(
[A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o)
# This way of calling the Theano fct is done to bypass Theano overhead.
f = multMatVect.dot_modulo
f.input_storage[0].storage[0] = A
f.input_storage[1].storage[0] = v[:3]
f.input_storage[2].storage[0] = m1
f.input_storage[3].storage[0] = B
f.input_storage[4].storage[0] = v[3:]
f.input_storage[5].storage[0] = m2
f.fn()
r = f.output_storage[0].storage[0]
return r
multMatVect.dot_modulo = None
......@@ -67,53 +72,65 @@ class DotModulo(Op):
"""
Efficient and numerically stable implementation of a dot product followed
by a modulo operation. This performs the same function as matVecModM.
We do this 2 times on 2 triple inputs and concatenating the output
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, A, s, m):
return Apply(self, [A, s, m], [s.type()])
def perform(self, node, (A, s, m), (out, )):
out[0] = matVecModM(A, s, m)
def make_node(self, A, s, m, A2, s2, m2):
return Apply(self, [A, s, m, A2, s2, m2], [s.type()])
def perform(self, node, (A, s, m, A2, s2, m2), (out, )):
o1 = matVecModM(A, s, m)
o2 = matVecModM(A2, s2, m2)
out[0] = numpy.concatenate((o1, o2))
def c_code_cache_version(self):
return
return (5,)
def c_code(self, node, name, (_A, _s, _m), (_z, ), sub):
def c_code(self, node, name, (_A, _s, _m, _A2, _s2, _m2), (_z, ), sub):
return """
int osize = -1;
if (PyArray_NDIM(%(_A)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(A) != 2"); %(fail)s;}
if (PyArray_NDIM(%(_s)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(v) != 1"); %(fail)s;}
if (PyArray_NDIM(%(_m)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(m) != 0"); %(fail)s;}
if (PyArray_NDIM(%(_A2)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(A2) != 2"); %(fail)s;}
if (PyArray_NDIM(%(_s2)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(v2) != 1"); %(fail)s;}
if (PyArray_NDIM(%(_m2)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(m2) != 0"); %(fail)s;}
if( PyArray_DIMS(%(_A)s)[1] != PyArray_DIMS(%(_s)s)[0])
{PyErr_SetString(PyExc_NotImplementedError, "A and s shapes don't agree."); %(fail)s;}
if( PyArray_DIMS(%(_A2)s)[1] != PyArray_DIMS(%(_s2)s)[0])
{PyErr_SetString(PyExc_NotImplementedError, "A2 and s2 shapes don't agree."); %(fail)s;}
osize = PyArray_DIMS(%(_A)s)[0] + PyArray_DIMS(%(_A2)s)[0];
if (!%(_z)s
|| (PyArray_DIMS(%(_z)s)[0] != PyArray_DIMS(%(_A)s)[0]))
|| (PyArray_DIMS(%(_z)s)[0] != osize))
{
{Py_XDECREF(%(_z)s);}
npy_intp dims[] = {0,};
dims[0] = PyArray_DIMS(%(_A)s)[0];
dims[0] = osize;
%(_z)s = (PyArrayObject*) PyArray_SimpleNew(1, dims, PyArray_TYPE(%(_s)s));
}
if(!%(_z)s){%(fail)s;}
{ //makes it compile even though labels jump over variable definitions.
// A has size MxN, s has N, output M
npy_intp M = PyArray_DIMS(%(_A)s)[0];
npy_intp N = PyArray_DIMS(%(_A)s)[1];
const dtype_%(_A)s* __restrict__ DA = (dtype_%(_A)s*)PyArray_DATA(%(_A)s);
dtype_%(_s)s* __restrict__ Ds = (dtype_%(_s)s*)PyArray_DATA(%(_s)s);
dtype_%(_z)s* __restrict__ Dz = (dtype_%(_z)s*)PyArray_DATA(%(_z)s);
const dtype_%(_m)s m = ((dtype_%(_m)s*)PyArray_DATA(%(_m)s))[0];
npy_intp SA = PyArray_STRIDES(%(_A)s)[1] / PyArray_DESCR(%(_A)s)->elsize;
npy_intp Ss = PyArray_STRIDES(%(_s)s)[0] / PyArray_DESCR(%(_s)s)->elsize;
npy_intp Sz = PyArray_STRIDES(%(_z)s)[0] / PyArray_DESCR(%(_z)s)->elsize;
......@@ -121,18 +138,50 @@ class DotModulo(Op):
for (npy_int32 i = 0; i < M; ++i)
{
const dtype_%(_A)s* __restrict__ Ak = (dtype_%(_A)s*)(PyArray_BYTES(%(_A)s) + PyArray_STRIDES(%(_A)s)[0] * i);
npy_int64 r = 0;
for (npy_int32 j = 0; j < N; ++j)
{
r += (npy_int64)(Ds[j * Ss] * (npy_int64)(Ak[j * SA])) %% m;
}
Dz[i * Sz] = r %% m;
}
}
//redo it with the second triple of inputs
{
// A has size MxN, s has N, output M
npy_intp M = PyArray_DIMS(%(_A2)s)[0];
npy_intp N = PyArray_DIMS(%(_A2)s)[1];
const dtype_%(_A2)s* __restrict__ DA = (dtype_%(_A2)s*)PyArray_DATA(%(_A2)s);
dtype_%(_s2)s* __restrict__ Ds = (dtype_%(_s2)s*)PyArray_DATA(%(_s2)s);
const dtype_%(_m2)s m = ((dtype_%(_m2)s*)PyArray_DATA(%(_m2)s))[0];
npy_intp SA = PyArray_STRIDES(%(_A2)s)[1] / PyArray_DESCR(%(_A2)s)->elsize;
npy_intp Ss = PyArray_STRIDES(%(_s2)s)[0] / PyArray_DESCR(%(_s2)s)->elsize;
npy_intp Sz = PyArray_STRIDES(%(_z)s)[0] / PyArray_DESCR(%(_z)s)->elsize;
dtype_%(_z)s* __restrict__ Dz = (dtype_%(_z)s*)PyArray_DATA(%(_z)s) + PyArray_DIMS(%(_A)s)[0] * Sz;
for (npy_int32 i = 0; i < M; ++i)
{
const dtype_%(_A2)s* __restrict__ Ak = (dtype_%(_A2)s*)(PyArray_BYTES(%(_A2)s) + PyArray_STRIDES(%(_A2)s)[0] * i);
npy_int64 r = 0;
for (npy_int32 j = 0; j < N; ++j)
{
r += (npy_int64)(Ds[j * Ss] * (npy_int64)(Ak[j * SA])) %% m;
}
Dz[i * Sz] = r %% m;
}
}
""" % dict(locals(), **sub)
......@@ -185,42 +234,41 @@ def mrg_next_value(rstate, new_rstate):
x11, x12, x13, x21, x22, x23 = rstate
assert type(x11) == numpy.int32
#i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
i0, i7, i9, i15, i16, i22, i24 = np_int32_vals
#first component
y1 = (((x12 & MASK12) << i22) + (x12 >> i9) +
((x13 & MASK13) << i7) + (x13 >> i24))
assert type(y1) == numpy.int32
if (y1 < 0 or y1 >= M1): #must also check overflow
y1 -= M1;
y1 += x13;
if (y1 < 0 or y1 >= M1): # must also check overflow
y1 -= M1
y1 += x13
if (y1 < 0 or y1 >= M1):
y1 -= M1;
y1 -= M1
x13 = x12;
x12 = x11;
x11 = y1;
x13 = x12
x12 = x11
x11 = y1
#second component
y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16))
assert type(y1) == numpy.int32
if (y1 < 0 or y1 >= M2):
y1 -= M2;
y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
y1 -= M2
y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16))
assert type(y2) == numpy.int32
if (y2 < 0 or y2 >= M2):
y2 -= M2;
y2 += x23;
y2 -= M2
y2 += x23
if (y2 < 0 or y2 >= M2):
y2 -= M2;
y2 += y1;
y2 -= M2
y2 += y1
if (y2 < 0 or y2 >= M2):
y2 -= M2;
y2 -= M2
x23 = x22;
x22 = x21;
x21 = y2;
x23 = x22
x22 = x21
x21 = y2
# Must never return either 0 or M1+1
new_rstate[...] = [x11, x12, x13, x21, x22, x23]
......@@ -235,9 +283,9 @@ class mrg_uniform_base(Op):
def __init__(self, output_type, inplace=False):
Op.__init__(self)
self.output_type = output_type
self.inplace=inplace
self.inplace = inplace
if inplace:
self.destroy_map = {0:[0]}
self.destroy_map = {0: [0]}
self.warned_numpy_version = False
def __eq__(self, other):
......@@ -289,7 +337,10 @@ class mrg_uniform(mrg_uniform_base):
rstate, size = inp
o_rstate, o_sample = out
numpy_version = numpy.__version__.split('.')
if not self.warned_numpy_version and int(numpy_version[0]) <= 1 and int(numpy_version[1]) <3 :
if (not self.warned_numpy_version and
int(numpy_version[0]) <= 1 and
int(numpy_version[1]) < 3):
print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
self.warned_numpy_version = True
......@@ -315,8 +366,9 @@ class mrg_uniform(mrg_uniform_base):
finally:
numpy.seterr(**err_orig)
o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary
o_sample[0] = node.outputs[1].type.filter(rval.reshape(size)) # send to GPU if necessary
# send to GPU if necessary
o_rstate[0] = node.outputs[0].type.filter(rstate)
o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))
def c_code(self, node, name, inp, out, sub):
rstate, size = inp
......@@ -718,7 +770,7 @@ def guess_n_streams(size, warn=True):
for s in size:
r *= s
if r > 6:
r = r // 6 # chosen as fastest for rbm_benchmark
r = r // 6 # chosen as fastest for rbm_benchmark
# The purpose of sampling from many streams is to be able to use
# the GPU to its full capacity. It just wastes RAM and stream-initialization time to
......@@ -731,8 +783,8 @@ def guess_n_streams(size, warn=True):
else:
if warn:
warnings.warn((
"MRG_RandomStreams Can't determine #streams from "
"size (%s), guessing 60*256") % str(size),
"MRG_RandomStreams Can't determine #streams from "
"size (%s), guessing 60*256") % str(size),
stacklevel=3)
return 60 * 256
......@@ -784,7 +836,8 @@ class MRG_RandomStreams(object):
def inc_rstate(self):
"""Update self.rstate to be skipped 2^134 steps forward to the next stream start"""
self.rstate = ff_2p134(self.rstate)
#self.rstate = ff_2p134(self.rstate)
self.rstate = multMatVect(self.rstate, A1p134, M1, A2p134, M2)
assert self.rstate.dtype == numpy.int32
def get_substream_rstates(self, n_streams, inc_rstate=True):
......@@ -795,8 +848,26 @@ class MRG_RandomStreams(object):
assert n_streams > 0
rval = numpy.zeros((n_streams, 6), dtype='int32')
rval[0] = self.rstate
# If multMatVect.dot_modulo isn't compiled, compile it.
if multMatVect.dot_modulo is None:
multMatVect(rval[0], A1p72, M1, A2p72, M2)
# This way of calling the Theano fct is done to bypass Theano overhead.
f = multMatVect.dot_modulo
f.input_storage[0].storage[0] = A1p72
f.input_storage[2].storage[0] = M1
f.input_storage[3].storage[0] = A2p72
f.input_storage[5].storage[0] = M2
for i in xrange(1, n_streams):
rval[i] = ff_2p72(rval[i - 1])
# Inline the following call to bypass Python overhead
#rval[i] = ff_2p72(rval[i - 1])
v = rval[i - 1]
f.input_storage[1].storage[0] = v[:3]
f.input_storage[4].storage[0] = v[3:]
f.fn()
rval[i] = f.output_storage[0].storage[0]
if inc_rstate:
self.inc_rstate()
return rval
......@@ -848,7 +919,8 @@ class MRG_RandomStreams(object):
msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, (numpy.integer, int, Variable))
for i in size]), msg
if any([isinstance(i, (numpy.integer, int)) and i <= 0 for i in size]):
if any([isinstance(i, (numpy.integer, int)) and i <= 0
for i in size]):
raise ValueError(
"The specified size contains a dimension with value <= 0",
size)
......
......@@ -793,3 +793,14 @@ def test_multMatVect():
r_b = f0.fn()
assert numpy.allclose(r_a, r_b)
if __name__ == "__main__":
rng = MRG_RandomStreams(numpy.random.randint(2147462579))
import time
print theano.__file__
pvals = theano.tensor.fmatrix()
for i in range(10):
t0 = time.time()
multinomial = rng.multinomial(pvals=pvals)
print time.time() - t0
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论