提交 6736be29 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged

...@@ -131,7 +131,7 @@ optdb.register('merge1', gof.MergeOptimizer(), ...@@ -131,7 +131,7 @@ optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile') 0, 'fast_run', 'fast_compile')
optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwise expressions optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwise expressions
1, 'fast_run') 1, 'fast_run')
optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=True), optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False),
1.2, 'fast_run', 'fast_compile') 1.2, 'fast_run', 'fast_compile')
optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs
1.5, 'fast_run') 1.5, 'fast_run')
......
...@@ -956,21 +956,26 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -956,21 +956,26 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
CudaNdarray * self = (CudaNdarray*) py_self; CudaNdarray * self = (CudaNdarray*) py_self;
PyObject * py_rval = NULL; PyObject * py_rval = NULL;
CudaNdarray * rval = NULL; CudaNdarray * rval = NULL;
PyObject * intobj = NULL;
//PyObject_Print(key, stderr, 0);
if (key == Py_Ellipsis) if (key == Py_Ellipsis)
{ {
Py_INCREF(py_self); Py_INCREF(py_self);
return py_self; return py_self;
} }
else if (PyInt_Check(key)) //INDEXING BY INTEGER if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
//else if (PyInt_Check(key)) //INDEXING BY INTEGER
{ {
int d_idx = PyInt_AsLong(intobj);
Py_DECREF(intobj); intobj=NULL;
//int d_idx = PyInt_AsLong(key);
if (self->nd == 0) if (self->nd == 0)
{ {
PyErr_SetString(PyExc_NotImplementedError, "index into 0-d array"); PyErr_SetString(PyExc_NotImplementedError, "index into 0-d array");
return NULL; return NULL;
} }
int d_idx = PyInt_AsLong(key);
int d_dim = CudaNdarray_HOST_DIMS(self)[0]; int d_dim = CudaNdarray_HOST_DIMS(self)[0];
int offset = 0; int offset = 0;
...@@ -1009,7 +1014,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1009,7 +1014,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]); CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
} }
} }
else if (PySlice_Check(key)) //INDEXING BY SLICE else
{
PyErr_Clear();
}
if (PySlice_Check(key)) //INDEXING BY SLICE
{ {
if (self->nd == 0) if (self->nd == 0)
{ {
...@@ -1057,7 +1066,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1057,7 +1066,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]); CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
} }
} }
else if (PyTuple_Check(key)) //INDEXING BY TUPLE if (PyTuple_Check(key)) //INDEXING BY TUPLE
{ {
//elements of the tuple can be either integers or slices //elements of the tuple can be either integers or slices
//the dimensionality of the view we will return is diminished for each slice in the tuple //the dimensionality of the view we will return is diminished for each slice in the tuple
...@@ -1127,9 +1136,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1127,9 +1136,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
} }
++rval_d; ++rval_d;
} }
else if (PyInt_Check(key_d)) else if ((intobj=PyNumber_Int(key_d)))
{ {
int d_idx = PyInt_AsLong(key_d); int d_idx = PyInt_AsLong(intobj);
Py_DECREF(intobj);
intobj = NULL;
int d_dim = CudaNdarray_HOST_DIMS(self)[d]; int d_dim = CudaNdarray_HOST_DIMS(self)[d];
if ((d_idx >= 0) && (d_idx < d_dim)) if ((d_idx >= 0) && (d_idx < d_dim))
...@@ -1151,6 +1162,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1151,6 +1162,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
} }
else else
{ {
PyErr_Clear(); // clear the error set by PyNumber_Int
PyErr_SetString(PyExc_IndexError, "index must be either int or slice"); PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
Py_DECREF(rval); Py_DECREF(rval);
return NULL; return NULL;
...@@ -1158,16 +1170,16 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1158,16 +1170,16 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
} }
} }
} }
else
{
PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
return NULL;
}
if (py_rval) if (py_rval)
{ {
if (verbose) fprint_CudaNdarray(stderr, self); if (verbose) fprint_CudaNdarray(stderr, self);
if (verbose) fprint_CudaNdarray(stderr, rval); if (verbose) fprint_CudaNdarray(stderr, rval);
} }
else
{
PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
return NULL;
}
return py_rval; return py_rval;
} }
...@@ -1776,6 +1788,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other) ...@@ -1776,6 +1788,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other)
} }
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i]; size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
} }
if (0 == size)
{
return 0; //nothing to copy, we're done.
}
if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other)) if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other))
{ {
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1); cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
......
...@@ -9,7 +9,12 @@ import sys ...@@ -9,7 +9,12 @@ import sys
import numpy import numpy
from theano import Op, Apply, shared, config from theano import Op, Apply, shared, config
from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
from theano.compile import optdb
from theano.gof import local_optimizer
from theano.sandbox.cuda.opt import register_opt as gpu_register_opt
from theano.sandbox.cuda import cuda_enabled, CudaNdarrayType #, gpu_from_host, host_from_gpu, CudaNdarrayType
def mulmod(a, b, c, m): def mulmod(a, b, c, m):
r = numpy.int32(numpy.int64(a*b + c) % m) r = numpy.int32(numpy.int64(a*b + c) % m)
...@@ -114,8 +119,9 @@ def mrg_next_value(rstate, new_rstate): ...@@ -114,8 +119,9 @@ def mrg_next_value(rstate, new_rstate):
else: else:
return (x11 - x21) * NORM return (x11 - x21) * NORM
class mrg_uniform(Op): class mrg_uniform_base(Op):
def __init__(self, output_type, inplace=False): def __init__(self, output_type, inplace=False):
Op.__init__(self)
self.output_type = output_type self.output_type = output_type
self.inplace=inplace self.inplace=inplace
if inplace: if inplace:
...@@ -129,6 +135,18 @@ class mrg_uniform(Op): ...@@ -129,6 +135,18 @@ class mrg_uniform(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace)
def make_node(self, rstate, size):
# error checking slightly redundant here, since
# this op should not be called directly.
#
# call through MRG_RandomStreams instead.
return Apply(self,
[rstate, size],
[rstate.type(), self.output_type()])
class mrg_uniform(mrg_uniform_base):
#CPU VERSION
@classmethod @classmethod
def new(cls, rstate, ndim, dtype, size): def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
...@@ -137,12 +155,10 @@ class mrg_uniform(Op): ...@@ -137,12 +155,10 @@ class mrg_uniform(Op):
op = cls(TensorType(dtype, (False,)*ndim)) op = cls(TensorType(dtype, (False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, cast(v_size, 'int32'))
def make_node(self, rstate, size):
return Apply(self,
[rstate, size],
[rstate.type(), self.output_type()])
def perform(self, node, (rstate, size), (o_rstate, o_sample)): def perform(self, node, (rstate, size), (o_rstate, o_sample)):
n_elements = 1 n_elements = 1
rstate = numpy.asarray(rstate) # bring state from GPU if necessary
if not self.inplace: if not self.inplace:
rstate = rstate.copy() rstate = rstate.copy()
...@@ -157,8 +173,8 @@ class mrg_uniform(Op): ...@@ -157,8 +173,8 @@ class mrg_uniform(Op):
sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams]) sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams])
rval[i] = sample rval[i] = sample
o_rstate[0] = rstate.copy() o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary
o_sample[0] = rval.reshape(size) o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))# send to GPU if necessary
def c_code_cache_version(self): def c_code_cache_version(self):
return () return ()
...@@ -317,10 +333,223 @@ class mrg_uniform(Op): ...@@ -317,10 +333,223 @@ class mrg_uniform(Op):
//////// </ code generated by mrg_uniform> //////// </ code generated by mrg_uniform>
""" %locals() """ %locals()
class GPU_mrg_uniform(mrg_uniform_base):
#GPU VERSION
@classmethod
def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size)
if ndim is None:
ndim = get_vector_length(v_size)
op = cls(CudaNdarrayType((False,)*ndim))
return op(rstate, cast(v_size, 'int32'))
def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
else:
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
static __global__ void %(nodename)s_mrg_uniform(
%(otype)s*sample_data,
npy_int32*state_data,
const int Nsamples)
{
const npy_int32 i0 = 0;
const npy_int32 i7 = 7;
const npy_int32 i9 = 9;
const npy_int32 i15 = 15;
const npy_int32 i16 = 16;
const npy_int32 i22 = 22;
const npy_int32 i24 = 24;
const npy_int32 M1 = 2147483647; //2^31 - 1
const npy_int32 M2 = 2147462579; //2^31 - 21069
const npy_int32 MASK12 = 511; //2^9 - 1
const npy_int32 MASK13 = 16777215; //2^24 - 1
const npy_int32 MASK2 = 65535; //2^16 - 1
const npy_int32 MULT2 = 21069;
const unsigned int numThreads = blockDim.x * gridDim.x;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;
x11 = state_data[idx*6+0];
x12 = state_data[idx*6+1];
x13 = state_data[idx*6+2];
x21 = state_data[idx*6+3];
x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += numThreads)
{
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
if ((y1 < 0 || y1 >= M1)) //must also check overflow
y1 -= M1;
y1 += x13;
if ((y1 < 0 or y1 >= M1))
y1 -= M1;
x13 = x12;
x12 = x11;
x11 = y1;
y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
if (y1 < 0 || y1 >= M2)
y1 -= M2;
y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
if (y2 < 0 || y2 >= M2)
y2 -= M2;
y2 += x23;
if (y2 < 0 || y2 >= M2)
y2 -= M2;
y2 += y1;
if (y2 < 0 or y2 >= M2)
y2 -= M2;
x23 = x22;
x22 = x21;
x21 = y2;
if (x11 <= x21) {
sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
}
else
{
sample_data[i] = (x11 - x21) * %(NORM)s;
}
}
state_data[idx*6+0]= x11;
state_data[idx*6+1]= x12;
state_data[idx*6+2]= x13;
state_data[idx*6+3]= x21;
state_data[idx*6+4]= x22;
state_data[idx*6+5]= x23;
}
""" %locals()
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (rstate, size), (o_rstate, o_sample), sub):
inplace = int(self.inplace)
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
if self.output_type.dtype == 'float32':
otype = 'float'
else:
otype = 'double'
SYNC="CNDA_THREAD_SYNC";
return """
//////// <code generated by mrg_uniform>
int odims[%(ndim)s];
int n_elements = 1;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check(py_%(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s));
if (%(size)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)s
}
if (%(size)s->dimensions[0] != %(ndim)s)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i", %(ndim)s);
%(fail)s
}
if (%(size)s->descr->type_num != PyArray_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
}
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
%(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
if(!%(o_sample)s)
{
%(fail)s;
}
}
if (!CudaNdarray_Check(py_%(rstate)s))
{
PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
%(fail)s;
}
Py_XDECREF(%(o_rstate)s);
if (%(inplace)s)
{
Py_INCREF(%(rstate)s);
%(o_rstate)s = %(rstate)s;
}
else
{
%(o_rstate)s = (CudaNdarray*)CudaNdarray_Copy(%(rstate)s);
}
if (%(o_rstate)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "rstate must be vector");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(o_rstate)s)[0] %% 6)
{
PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
%(fail)s;
}
n_streams = std::min(CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6, n_elements);
{
unsigned int threads_per_block = std::min(n_streams, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(n_streams, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
if (threads_per_block * n_blocks < n_streams)
{
fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
}
%(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
CudaNdarray_DEV_DATA(%(o_sample)s),
(npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
n_elements);
}
%(SYNC)s;
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
%(fail)s;
}
}
//////// </ code generated by mrg_uniform>
""" %locals()
class MRG_RandomStreams(object): class MRG_RandomStreams(object):
"""Module component with similar interface to numpy.random (numpy.random.RandomState)""" """Module component with similar interface to numpy.random (numpy.random.RandomState)"""
def __init__(self, seed=None): def __init__(self, seed=12345, use_cuda=None):
""" """
:type seed: None or int :type seed: None or int
...@@ -328,7 +557,16 @@ class MRG_RandomStreams(object): ...@@ -328,7 +557,16 @@ class MRG_RandomStreams(object):
`RandomStreamsInstance.__init__` for more details. `RandomStreamsInstance.__init__` for more details.
""" """
super(MRG_RandomStreams, self).__init__() super(MRG_RandomStreams, self).__init__()
self.rstate = numpy.asarray([12345]*6, dtype='int32') if isinstance(seed, int):
self.rstate = numpy.asarray([seed]*6, dtype='int32')
elif len(seed)==6:
self.rstate = numpy.asarray(seed, dtype='int32')
else:
raise TypeError("seed should be 1 integer or 6 integers")
if use_cuda is None:
self.use_cuda = cuda_enabled
else:
self.use_cuda = use_cuda
def inc_rstate(self): def inc_rstate(self):
"""Update self.rstate to be skipped 2^134 steps forward to the next stream start""" """Update self.rstate to be skipped 2^134 steps forward to the next stream start"""
...@@ -350,10 +588,19 @@ class MRG_RandomStreams(object): ...@@ -350,10 +588,19 @@ class MRG_RandomStreams(object):
return rval return rval
def n_streams(self, size): def n_streams(self, size):
if isinstance(size, (tuple, list)):
r = 1 r = 1
for s in size: for s in size:
r *= s r *= s
return r return r
try:
rval = int(size)
assert rval > 0
return rval
except:
pass
print >> sys.stderr, "MRG_RandomStreams Can't determine #streams from size (%s), guessing 30*256"%str(size)
return 30*256
def pretty_return(self, node_rstate, new_rstate, sample): def pretty_return(self, node_rstate, new_rstate, sample):
sample.rstate = node_rstate sample.rstate = node_rstate
...@@ -361,7 +608,6 @@ class MRG_RandomStreams(object): ...@@ -361,7 +608,6 @@ class MRG_RandomStreams(object):
node_rstate.default_update = new_rstate node_rstate.default_update = new_rstate
return sample return sample
def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX): def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX):
""" """
Sample a tensor of given size whose element from a uniform Sample a tensor of given size whose element from a uniform
...@@ -371,6 +617,24 @@ class MRG_RandomStreams(object): ...@@ -371,6 +617,24 @@ class MRG_RandomStreams(object):
ndim may be a plain integer to supplement the missing ndim may be a plain integer to supplement the missing
information. information.
""" """
if self.use_cuda and dtype=='float32':
rstates = self.get_substream_rstates(self.n_streams(size))
rstates = rstates.flatten()
# HACK - we use fact that int32 and float32 have same size to
# sneak ints into the CudaNdarray type.
# these *SHOULD NEVER BE USED AS FLOATS*
tmp_float_buf = numpy.frombuffer(rstates.data, dtype='float32')
assert tmp_float_buf.shape == rstates.shape
assert tmp_float_buf.data[:24] == rstates.data[:24]
node_rstate = shared(tmp_float_buf) # transfer to device
assert isinstance(node_rstate.type, CudaNdarrayType)
# we can't use the normal mrg_uniform constructor + later optimization
# because of the tmp_float_buf hack above. There is
# currently no Theano node that will do a frombuffer reinterpretation.
u = self.pretty_return(node_rstate,
*GPU_mrg_uniform.new(node_rstate, ndim, dtype, size))
else:
node_rstate = shared(self.get_substream_rstates(self.n_streams(size))) node_rstate = shared(self.get_substream_rstates(self.n_streams(size)))
u = self.pretty_return(node_rstate, u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate, ndim, dtype, size)) *mrg_uniform.new(node_rstate, ndim, dtype, size))
...@@ -380,6 +644,23 @@ class MRG_RandomStreams(object): ...@@ -380,6 +644,23 @@ class MRG_RandomStreams(object):
raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments') raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
return r return r
def binomial(self, size=None, n=1, prob=0.5, ndim=None, dtype='int64'):
if n == 1:
return cast(self.uniform(size=size) < prob, dtype)
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
@local_optimizer([None])
def mrg_random_make_inplace(node):
op = node.op
if isinstance(op, mrg_uniform) and not op.inplace:
# op might be gpu version
new_op = op.__class__(op.output_type, inplace=True)
return new_op.make_node(*node.inputs).outputs
return False
optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
# #
# #
# #
...@@ -391,37 +672,61 @@ import theano ...@@ -391,37 +672,61 @@ import theano
def test_rng0(): def test_rng0():
def basictest(f, steps, prefix=""): def basictest(f, steps, prefix=""):
dt = 0.0
for i in xrange(steps):
t0 = time.time() t0 = time.time()
l = [f() for i in xrange(steps)] ival = f()
tt = time.time() dt += time.time() - t0
ival = numpy.asarray(ival)
if i == 0:
mean = numpy.array(ival, copy=True)
else:
alpha = 1.0 / (1+i)
mean = alpha * ival + (1-alpha)*mean
print prefix, 'mean', numpy.mean(mean)
assert abs(numpy.mean(mean) - 0.5) < .01, 'bad mean?'
print prefix, 'time', dt
print prefix, 'elements', steps*sample_size[0]*sample_size[1]
print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / dt
if 0:
mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l) mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l)
print prefix, 'mean', mean print prefix, 'mean', mean
print prefix, 'std', std print prefix, 'std', std
print prefix, 'min', repr(min) print prefix, 'min', repr(min)
print prefix, 'max', repr(max) print prefix, 'max', repr(max)
print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / (tt-t0)
assert max < 1.0 assert max < 1.0
assert min >= 0.0 assert min >= 0.0
assert abs(mean - 0.5) < .01, 'bad mean?' assert abs(mean - 0.5) < .01, 'bad mean?'
sample_size = (1000,100)
R = MRG_RandomStreams(234) print ''
print 'ON CPU:'
sample_size = (200,20)
R = MRG_RandomStreams(234, use_cuda=False)
u = R.uniform(size=sample_size) u = R.uniform(size=sample_size)
print "U dtype", u.dtype
f = theano.function([], u) f = theano.function([], u)
theano.printing.debugprint(f)
print 'random?[:10]\n', f()[0,0:10]
basictest(f, 1000, prefix='mrg ')
print 'random?', f()[0] print ''
print 'random?', f()[0] print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
u = R.uniform(size=sample_size)
assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True))
theano.printing.debugprint(f)
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
basictest(f, 1000, prefix='mrg ') basictest(f, 1000, prefix='mrg ')
print ''
print 'ON CPU w NUMPY:'
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.uniform(size=sample_size) uu = RR.uniform(size=sample_size)
......
...@@ -1257,7 +1257,7 @@ class Prepend_scalar_constant_to_each_row(gof.Op): ...@@ -1257,7 +1257,7 @@ class Prepend_scalar_constant_to_each_row(gof.Op):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) and (self.val == other.val) return (type(self) == type(other)) and (self.val == other.val)
def __hash__(self): def __hash__(self):
return tensor.hashtype(self) ^ hash(self.val.value) return tensor.hashtype(self) ^ hash(self.val.data)
def __str__(self): def __str__(self):
return '%s{%s}'%(self.__class__.__name__,self.val) return '%s{%s}'%(self.__class__.__name__,self.val)
......
...@@ -610,6 +610,43 @@ def local_alloc_unary(node): ...@@ -610,6 +610,43 @@ def local_alloc_unary(node):
return [T.alloc(T.cast(v, node.outputs[0].dtype), *shp)] return [T.alloc(T.cast(v, node.outputs[0].dtype), *shp)]
############################
# Constant Canonicalization
############################
@register_canonicalize
@gof.local_optimizer([])
def local_upcast_elemwise_constant_inputs(node):
"""This explicitly upcasts constant inputs to elemwise Ops, when those Ops do implicit upcasting anyway.
Rationale: it helps merge things like (1-x) and (1.0 - x).
"""
if isinstance(node.op, T.Elemwise):
scalar_op = node.op.scalar_op
#print "aa", scalar_op.output_types_preference
if scalar_op.output_types_preference in (T.scal.upgrade_to_float, T.scal.upcast_out):
# this is the kind of op that we can screw with the input dtypes by upcasting
# explicitly
#print "HELLO??"
output_dtype = node.outputs[0].type.dtype
new_inputs = []
for i in node.inputs:
if i.type.dtype == output_dtype:
new_inputs.append(i)
else:
try:
cval_i = get_constant_value(i) # works only for scalars I think
new_inputs.append(T.cast(cval_i, output_dtype))
except:
if isinstance(i, T.TensorConstant): #for the case of a non-scalar
new_inputs.append(T.cast(i, output_dtype))
else:
new_inputs.append(i)
if new_inputs != node.inputs:
return [node.op(*new_inputs)]
################## ##################
# Subtensor opts # # Subtensor opts #
################## ##################
...@@ -1717,6 +1754,7 @@ def local_greedy_distributor(node): ...@@ -1717,6 +1754,7 @@ def local_greedy_distributor(node):
return [rval] return [rval]
register_canonicalize(local_greedy_distributor) register_canonicalize(local_greedy_distributor)
register_stabilize(local_greedy_distributor)
...@@ -1748,6 +1786,7 @@ def constant_folding(node): ...@@ -1748,6 +1786,7 @@ def constant_folding(node):
return msg return msg
register_canonicalize(constant_folding) register_canonicalize(constant_folding)
register_stabilize(constant_folding) # because
register_specialize(constant_folding) register_specialize(constant_folding)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论