提交 6736be29,作者:Olivier Delalleau

Merged

...@@ -131,7 +131,7 @@ optdb.register('merge1', gof.MergeOptimizer(), ...@@ -131,7 +131,7 @@ optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile') 0, 'fast_run', 'fast_compile')
optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwise expressions optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwise expressions
1, 'fast_run') 1, 'fast_run')
optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=True), optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False),
1.2, 'fast_run', 'fast_compile') 1.2, 'fast_run', 'fast_compile')
optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs
1.5, 'fast_run') 1.5, 'fast_run')
......
...@@ -956,21 +956,26 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -956,21 +956,26 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
CudaNdarray * self = (CudaNdarray*) py_self; CudaNdarray * self = (CudaNdarray*) py_self;
PyObject * py_rval = NULL; PyObject * py_rval = NULL;
CudaNdarray * rval = NULL; CudaNdarray * rval = NULL;
PyObject * intobj = NULL;
//PyObject_Print(key, stderr, 0);
if (key == Py_Ellipsis) if (key == Py_Ellipsis)
{ {
Py_INCREF(py_self); Py_INCREF(py_self);
return py_self; return py_self;
} }
else if (PyInt_Check(key)) //INDEXING BY INTEGER if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
//else if (PyInt_Check(key)) //INDEXING BY INTEGER
{ {
int d_idx = PyInt_AsLong(intobj);
Py_DECREF(intobj); intobj=NULL;
//int d_idx = PyInt_AsLong(key);
if (self->nd == 0) if (self->nd == 0)
{ {
PyErr_SetString(PyExc_NotImplementedError, "index into 0-d array"); PyErr_SetString(PyExc_NotImplementedError, "index into 0-d array");
return NULL; return NULL;
} }
int d_idx = PyInt_AsLong(key);
int d_dim = CudaNdarray_HOST_DIMS(self)[0]; int d_dim = CudaNdarray_HOST_DIMS(self)[0];
int offset = 0; int offset = 0;
...@@ -1009,7 +1014,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1009,7 +1014,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]); CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
} }
} }
else if (PySlice_Check(key)) //INDEXING BY SLICE else
{
PyErr_Clear();
}
if (PySlice_Check(key)) //INDEXING BY SLICE
{ {
if (self->nd == 0) if (self->nd == 0)
{ {
...@@ -1057,7 +1066,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1057,7 +1066,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]); CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
} }
} }
else if (PyTuple_Check(key)) //INDEXING BY TUPLE if (PyTuple_Check(key)) //INDEXING BY TUPLE
{ {
//elements of the tuple can be either integers or slices //elements of the tuple can be either integers or slices
//the dimensionality of the view we will return is diminished for each slice in the tuple //the dimensionality of the view we will return is diminished for each slice in the tuple
...@@ -1127,9 +1136,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1127,9 +1136,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
} }
++rval_d; ++rval_d;
} }
else if (PyInt_Check(key_d)) else if ((intobj=PyNumber_Int(key_d)))
{ {
int d_idx = PyInt_AsLong(key_d); int d_idx = PyInt_AsLong(intobj);
Py_DECREF(intobj);
intobj = NULL;
int d_dim = CudaNdarray_HOST_DIMS(self)[d]; int d_dim = CudaNdarray_HOST_DIMS(self)[d];
if ((d_idx >= 0) && (d_idx < d_dim)) if ((d_idx >= 0) && (d_idx < d_dim))
...@@ -1151,6 +1162,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1151,6 +1162,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
} }
else else
{ {
PyErr_Clear(); // clear the error set by PyNumber_Int
PyErr_SetString(PyExc_IndexError, "index must be either int or slice"); PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
Py_DECREF(rval); Py_DECREF(rval);
return NULL; return NULL;
...@@ -1158,16 +1170,16 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1158,16 +1170,16 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
} }
} }
} }
else
{
PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
return NULL;
}
if (py_rval) if (py_rval)
{ {
if (verbose) fprint_CudaNdarray(stderr, self); if (verbose) fprint_CudaNdarray(stderr, self);
if (verbose) fprint_CudaNdarray(stderr, rval); if (verbose) fprint_CudaNdarray(stderr, rval);
} }
else
{
PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
return NULL;
}
return py_rval; return py_rval;
} }
...@@ -1776,6 +1788,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other) ...@@ -1776,6 +1788,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other)
} }
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i]; size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
} }
if (0 == size)
{
return 0; //nothing to copy, we're done.
}
if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other)) if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other))
{ {
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1); cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
......
...@@ -9,7 +9,12 @@ import sys ...@@ -9,7 +9,12 @@ import sys
import numpy import numpy
from theano import Op, Apply, shared, config from theano import Op, Apply, shared, config
from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
from theano.compile import optdb
from theano.gof import local_optimizer
from theano.sandbox.cuda.opt import register_opt as gpu_register_opt
from theano.sandbox.cuda import cuda_enabled, CudaNdarrayType #, gpu_from_host, host_from_gpu, CudaNdarrayType
def mulmod(a, b, c, m): def mulmod(a, b, c, m):
r = numpy.int32(numpy.int64(a*b + c) % m) r = numpy.int32(numpy.int64(a*b + c) % m)
...@@ -114,8 +119,9 @@ def mrg_next_value(rstate, new_rstate): ...@@ -114,8 +119,9 @@ def mrg_next_value(rstate, new_rstate):
else: else:
return (x11 - x21) * NORM return (x11 - x21) * NORM
class mrg_uniform(Op): class mrg_uniform_base(Op):
def __init__(self, output_type, inplace=False): def __init__(self, output_type, inplace=False):
Op.__init__(self)
self.output_type = output_type self.output_type = output_type
self.inplace=inplace self.inplace=inplace
if inplace: if inplace:
...@@ -129,6 +135,18 @@ class mrg_uniform(Op): ...@@ -129,6 +135,18 @@ class mrg_uniform(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace)
def make_node(self, rstate, size):
# error checking slightly redundant here, since
# this op should not be called directly.
#
# call through MRG_RandomStreams instead.
return Apply(self,
[rstate, size],
[rstate.type(), self.output_type()])
class mrg_uniform(mrg_uniform_base):
#CPU VERSION
@classmethod @classmethod
def new(cls, rstate, ndim, dtype, size): def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
...@@ -137,12 +155,10 @@ class mrg_uniform(Op): ...@@ -137,12 +155,10 @@ class mrg_uniform(Op):
op = cls(TensorType(dtype, (False,)*ndim)) op = cls(TensorType(dtype, (False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, cast(v_size, 'int32'))
def make_node(self, rstate, size):
return Apply(self,
[rstate, size],
[rstate.type(), self.output_type()])
def perform(self, node, (rstate, size), (o_rstate, o_sample)): def perform(self, node, (rstate, size), (o_rstate, o_sample)):
n_elements = 1 n_elements = 1
rstate = numpy.asarray(rstate) # bring state from GPU if necessary
if not self.inplace: if not self.inplace:
rstate = rstate.copy() rstate = rstate.copy()
...@@ -157,8 +173,8 @@ class mrg_uniform(Op): ...@@ -157,8 +173,8 @@ class mrg_uniform(Op):
sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams]) sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams])
rval[i] = sample rval[i] = sample
o_rstate[0] = rstate.copy() o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary
o_sample[0] = rval.reshape(size) o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))# send to GPU if necessary
def c_code_cache_version(self): def c_code_cache_version(self):
return () return ()
...@@ -317,10 +333,223 @@ class mrg_uniform(Op): ...@@ -317,10 +333,223 @@ class mrg_uniform(Op):
//////// </ code generated by mrg_uniform> //////// </ code generated by mrg_uniform>
""" %locals() """ %locals()
class GPU_mrg_uniform(mrg_uniform_base):
#GPU VERSION
@classmethod
def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size)
if ndim is None:
ndim = get_vector_length(v_size)
op = cls(CudaNdarrayType((False,)*ndim))
return op(rstate, cast(v_size, 'int32'))
def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
else:
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
static __global__ void %(nodename)s_mrg_uniform(
%(otype)s*sample_data,
npy_int32*state_data,
const int Nsamples)
{
const npy_int32 i0 = 0;
const npy_int32 i7 = 7;
const npy_int32 i9 = 9;
const npy_int32 i15 = 15;
const npy_int32 i16 = 16;
const npy_int32 i22 = 22;
const npy_int32 i24 = 24;
const npy_int32 M1 = 2147483647; //2^31 - 1
const npy_int32 M2 = 2147462579; //2^31 - 21069
const npy_int32 MASK12 = 511; //2^9 - 1
const npy_int32 MASK13 = 16777215; //2^24 - 1
const npy_int32 MASK2 = 65535; //2^16 - 1
const npy_int32 MULT2 = 21069;
const unsigned int numThreads = blockDim.x * gridDim.x;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;
x11 = state_data[idx*6+0];
x12 = state_data[idx*6+1];
x13 = state_data[idx*6+2];
x21 = state_data[idx*6+3];
x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += numThreads)
{
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
if ((y1 < 0 || y1 >= M1)) //must also check overflow
y1 -= M1;
y1 += x13;
if ((y1 < 0 or y1 >= M1))
y1 -= M1;
x13 = x12;
x12 = x11;
x11 = y1;
y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
if (y1 < 0 || y1 >= M2)
y1 -= M2;
y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
if (y2 < 0 || y2 >= M2)
y2 -= M2;
y2 += x23;
if (y2 < 0 || y2 >= M2)
y2 -= M2;
y2 += y1;
if (y2 < 0 or y2 >= M2)
y2 -= M2;
x23 = x22;
x22 = x21;
x21 = y2;
if (x11 <= x21) {
sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
}
else
{
sample_data[i] = (x11 - x21) * %(NORM)s;
}
}
state_data[idx*6+0]= x11;
state_data[idx*6+1]= x12;
state_data[idx*6+2]= x13;
state_data[idx*6+3]= x21;
state_data[idx*6+4]= x22;
state_data[idx*6+5]= x23;
}
""" %locals()
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (rstate, size), (o_rstate, o_sample), sub):
inplace = int(self.inplace)
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
if self.output_type.dtype == 'float32':
otype = 'float'
else:
otype = 'double'
SYNC="CNDA_THREAD_SYNC";
return """
//////// <code generated by mrg_uniform>
int odims[%(ndim)s];
int n_elements = 1;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check(py_%(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s));
if (%(size)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)s
}
if (%(size)s->dimensions[0] != %(ndim)s)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i", %(ndim)s);
%(fail)s
}
if (%(size)s->descr->type_num != PyArray_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
}
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
%(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
if(!%(o_sample)s)
{
%(fail)s;
}
}
if (!CudaNdarray_Check(py_%(rstate)s))
{
PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
%(fail)s;
}
Py_XDECREF(%(o_rstate)s);
if (%(inplace)s)
{
Py_INCREF(%(rstate)s);
%(o_rstate)s = %(rstate)s;
}
else
{
%(o_rstate)s = (CudaNdarray*)CudaNdarray_Copy(%(rstate)s);
}
if (%(o_rstate)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "rstate must be vector");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(o_rstate)s)[0] %% 6)
{
PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
%(fail)s;
}
n_streams = std::min(CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6, n_elements);
{
unsigned int threads_per_block = std::min(n_streams, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(n_streams, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
if (threads_per_block * n_blocks < n_streams)
{
fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
}
%(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
CudaNdarray_DEV_DATA(%(o_sample)s),
(npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
n_elements);
}
%(SYNC)s;
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
%(fail)s;
}
}
//////// </ code generated by mrg_uniform>
""" %locals()
class MRG_RandomStreams(object): class MRG_RandomStreams(object):
"""Module component with similar interface to numpy.random (numpy.random.RandomState)""" """Module component with similar interface to numpy.random (numpy.random.RandomState)"""
def __init__(self, seed=None): def __init__(self, seed=12345, use_cuda=None):
""" """
:type seed: None or int :type seed: None or int
...@@ -328,7 +557,16 @@ class MRG_RandomStreams(object): ...@@ -328,7 +557,16 @@ class MRG_RandomStreams(object):
`RandomStreamsInstance.__init__` for more details. `RandomStreamsInstance.__init__` for more details.
""" """
super(MRG_RandomStreams, self).__init__() super(MRG_RandomStreams, self).__init__()
self.rstate = numpy.asarray([12345]*6, dtype='int32') if isinstance(seed, int):
self.rstate = numpy.asarray([seed]*6, dtype='int32')
elif len(seed)==6:
self.rstate = numpy.asarray(seed, dtype='int32')
else:
raise TypeError("seed should be 1 integer or 6 integers")
if use_cuda is None:
self.use_cuda = cuda_enabled
else:
self.use_cuda = use_cuda
def inc_rstate(self): def inc_rstate(self):
"""Update self.rstate to be skipped 2^134 steps forward to the next stream start""" """Update self.rstate to be skipped 2^134 steps forward to the next stream start"""
...@@ -350,10 +588,19 @@ class MRG_RandomStreams(object): ...@@ -350,10 +588,19 @@ class MRG_RandomStreams(object):
return rval return rval
def n_streams(self, size): def n_streams(self, size):
r = 1 if isinstance(size, (tuple, list)):
for s in size: r = 1
r *= s for s in size:
return r r *= s
return r
try:
rval = int(size)
assert rval > 0
return rval
except:
pass
print >> sys.stderr, "MRG_RandomStreams Can't determine #streams from size (%s), guessing 30*256"%str(size)
return 30*256
def pretty_return(self, node_rstate, new_rstate, sample): def pretty_return(self, node_rstate, new_rstate, sample):
sample.rstate = node_rstate sample.rstate = node_rstate
...@@ -361,7 +608,6 @@ class MRG_RandomStreams(object): ...@@ -361,7 +608,6 @@ class MRG_RandomStreams(object):
node_rstate.default_update = new_rstate node_rstate.default_update = new_rstate
return sample return sample
def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX): def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX):
""" """
Sample a tensor of given size whose element from a uniform Sample a tensor of given size whose element from a uniform
...@@ -371,15 +617,50 @@ class MRG_RandomStreams(object): ...@@ -371,15 +617,50 @@ class MRG_RandomStreams(object):
ndim may be a plain integer to supplement the missing ndim may be a plain integer to supplement the missing
information. information.
""" """
node_rstate = shared(self.get_substream_rstates(self.n_streams(size))) if self.use_cuda and dtype=='float32':
u = self.pretty_return(node_rstate, rstates = self.get_substream_rstates(self.n_streams(size))
*mrg_uniform.new(node_rstate, ndim, dtype, size)) rstates = rstates.flatten()
# HACK - we use fact that int32 and float32 have same size to
# sneak ints into the CudaNdarray type.
# these *SHOULD NEVER BE USED AS FLOATS*
tmp_float_buf = numpy.frombuffer(rstates.data, dtype='float32')
assert tmp_float_buf.shape == rstates.shape
assert tmp_float_buf.data[:24] == rstates.data[:24]
node_rstate = shared(tmp_float_buf) # transfer to device
assert isinstance(node_rstate.type, CudaNdarrayType)
# we can't use the normal mrg_uniform constructor + later optimization
# because of the tmp_float_buf hack above. There is
# currently no Theano node that will do a frombuffer reinterpretation.
u = self.pretty_return(node_rstate,
*GPU_mrg_uniform.new(node_rstate, ndim, dtype, size))
else:
node_rstate = shared(self.get_substream_rstates(self.n_streams(size)))
u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate, ndim, dtype, size))
r = u * (high-low) + low r = u * (high-low) + low
if u.type.broadcastable != r.type.broadcastable: if u.type.broadcastable != r.type.broadcastable:
raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments') raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
return r return r
def binomial(self, size=None, n=1, prob=0.5, ndim=None, dtype='int64'):
    """Sample a binomial variable; only the single-trial case is supported.

    For ``n == 1`` the sample is built by drawing from ``self.uniform`` and
    comparing the draw against ``prob``; the boolean result is cast to
    ``dtype``.

    :param size: shape of the sample, forwarded to ``self.uniform``.
    :param n: number of trials; anything other than 1 is rejected.
    :param prob: threshold the uniform draw is compared against.
    :param ndim: forwarded to ``self.uniform`` (it used to be accepted and
        silently dropped); the default None keeps the previous behaviour.
    :param dtype: dtype the comparison result is cast to.
    :raises NotImplementedError: when ``n`` is not 1.
    """
    if n == 1:
        draw = self.uniform(size=size, ndim=ndim)
        return cast(draw < prob, dtype)
    raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
@local_optimizer([None])
def mrg_random_make_inplace(node):
    """Replace a non-inplace MRG uniform op by its inplace counterpart.

    The check is made against ``mrg_uniform_base`` so that both the CPU op
    and the GPU op are covered; testing against ``mrg_uniform`` alone never
    matched the GPU op, which derives from the base class directly.

    :param node: the Apply node being examined.
    :returns: the outputs of the rebuilt node, or False when the node is
        left unchanged.
    """
    op = node.op
    if isinstance(op, mrg_uniform_base) and not op.inplace:
        # op.__class__ keeps the CPU/GPU flavour of the original op.
        new_op = op.__class__(op.output_type, inplace=True)
        return new_op.make_node(*node.inputs).outputs
    return False
optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
# #
# #
# #
...@@ -391,37 +672,61 @@ import theano ...@@ -391,37 +672,61 @@ import theano
def test_rng0(): def test_rng0():
def basictest(f, steps, prefix=""): def basictest(f, steps, prefix=""):
t0 = time.time() dt = 0.0
l = [f() for i in xrange(steps)] for i in xrange(steps):
tt = time.time() t0 = time.time()
ival = f()
mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l) dt += time.time() - t0
ival = numpy.asarray(ival)
print prefix, 'mean', mean if i == 0:
print prefix, 'std', std mean = numpy.array(ival, copy=True)
print prefix, 'min', repr(min) else:
print prefix, 'max', repr(max) alpha = 1.0 / (1+i)
print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / (tt-t0) mean = alpha * ival + (1-alpha)*mean
assert max < 1.0 print prefix, 'mean', numpy.mean(mean)
assert min >= 0.0 assert abs(numpy.mean(mean) - 0.5) < .01, 'bad mean?'
assert abs(mean - 0.5) < .01, 'bad mean?' print prefix, 'time', dt
print prefix, 'elements', steps*sample_size[0]*sample_size[1]
print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / dt
R = MRG_RandomStreams(234) if 0:
mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l)
sample_size = (200,20)
print prefix, 'mean', mean
print prefix, 'std', std
print prefix, 'min', repr(min)
print prefix, 'max', repr(max)
assert max < 1.0
assert min >= 0.0
assert abs(mean - 0.5) < .01, 'bad mean?'
sample_size = (1000,100)
print ''
print 'ON CPU:'
R = MRG_RandomStreams(234, use_cuda=False)
u = R.uniform(size=sample_size) u = R.uniform(size=sample_size)
print "U dtype", u.dtype
f = theano.function([], u) f = theano.function([], u)
theano.printing.debugprint(f)
print 'random?[:10]\n', f()[0,0:10]
basictest(f, 1000, prefix='mrg ')
print 'random?', f()[0] print ''
print 'random?', f()[0] print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
u = R.uniform(size=sample_size)
assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True))
theano.printing.debugprint(f)
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
basictest(f, 1000, prefix='mrg ') basictest(f, 1000, prefix='mrg ')
print ''
print 'ON CPU w NUMPY:'
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.uniform(size=sample_size) uu = RR.uniform(size=sample_size)
......
...@@ -1257,7 +1257,7 @@ class Prepend_scalar_constant_to_each_row(gof.Op): ...@@ -1257,7 +1257,7 @@ class Prepend_scalar_constant_to_each_row(gof.Op):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) and (self.val == other.val) return (type(self) == type(other)) and (self.val == other.val)
def __hash__(self): def __hash__(self):
return tensor.hashtype(self) ^ hash(self.val.value) return tensor.hashtype(self) ^ hash(self.val.data)
def __str__(self): def __str__(self):
return '%s{%s}'%(self.__class__.__name__,self.val) return '%s{%s}'%(self.__class__.__name__,self.val)
......
...@@ -610,6 +610,43 @@ def local_alloc_unary(node): ...@@ -610,6 +610,43 @@ def local_alloc_unary(node):
return [T.alloc(T.cast(v, node.outputs[0].dtype), *shp)] return [T.alloc(T.cast(v, node.outputs[0].dtype), *shp)]
############################
# Constant Canonicalization
############################
@register_canonicalize
@gof.local_optimizer([])
def local_upcast_elemwise_constant_inputs(node):
    """This explicitly upcasts constant inputs to elemwise Ops, when those Ops do implicit upcasting anyway.

    Rationale: it helps merge things like (1-x) and (1.0 - x).

    Returns a one-element list holding the rebuilt output when at least one
    input was replaced; otherwise falls through and returns None.
    """
    if isinstance(node.op, T.Elemwise):
        scalar_op = node.op.scalar_op
        if scalar_op.output_types_preference in (T.scal.upgrade_to_float, T.scal.upcast_out):
            # this is the kind of op that we can screw with the input dtypes by upcasting
            # explicitly
            output_dtype = node.outputs[0].type.dtype
            new_inputs = []
            for i in node.inputs:
                if i.type.dtype == output_dtype:
                    new_inputs.append(i)
                else:
                    try:
                        cval_i = get_constant_value(i)    # works only for scalars I think
                        # NOTE(review): the cast constant may not keep the
                        # dimensions of `i` -- confirm the rebuilt node has
                        # the same output type.
                        new_inputs.append(T.cast(cval_i, output_dtype))
                    except Exception:
                        # `i` could not be turned into a constant value.
                        # Exception (not a bare except) lets
                        # KeyboardInterrupt and SystemExit propagate.
                        if isinstance(i, T.TensorConstant): #for the case of a non-scalar
                            new_inputs.append(T.cast(i, output_dtype))
                        else:
                            new_inputs.append(i)

            if new_inputs != node.inputs:
                return [node.op(*new_inputs)]
################## ##################
# Subtensor opts # # Subtensor opts #
################## ##################
...@@ -1717,6 +1754,7 @@ def local_greedy_distributor(node): ...@@ -1717,6 +1754,7 @@ def local_greedy_distributor(node):
return [rval] return [rval]
register_canonicalize(local_greedy_distributor) register_canonicalize(local_greedy_distributor)
register_stabilize(local_greedy_distributor)
...@@ -1748,6 +1786,7 @@ def constant_folding(node): ...@@ -1748,6 +1786,7 @@ def constant_folding(node):
return msg return msg
register_canonicalize(constant_folding) register_canonicalize(constant_folding)
register_stabilize(constant_folding) # because
register_specialize(constant_folding) register_specialize(constant_folding)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论