Commit 5aac104a authored by David Warde-Farley

Merge.

......@@ -75,6 +75,8 @@ Community
* Register and post to `theano-buildbot`_ if you want to receive our daily buildbot email.
* Ask/view questions/answers at `metaoptimize/qa/tags/theano/`_ (it's like stack overflow for machine learning)
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
......@@ -104,6 +106,8 @@ Community
.. _theano-buildbot: http://groups.google.com/group/theano-buildbot
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _metaoptimize/qa/tags/theano: http://metaoptimize.com/qa/tags/theano/
.. _LISA: http://www.iro.umontreal.ca/~lisa
.. _University of Montreal: http://www.umontreal.ca
......@@ -344,6 +344,41 @@ def MergeOptMerge(opt):
return SeqOptimizer([merger, opt, merger])
def pre_constant_merge(vars):
    """
    Merge constants in the subgraph used to compute nodes in `vars`.

    `vars` is a list of variables, and we want to merge together the
    constant inputs used to compute nodes in that list.

    :param vars: list of variables whose ancestor constants get merged.
    :returns: a list with the (possibly replaced) variables of `vars`.

    :note: This function will ignore nodes that are in an env.
           It is used to pre-merge nodes generated inside an optimization,
           before it is inserted in the env.
           It is useful if there are many such replacements to make,
           so that DebugMode will not check each of them.
    """
    seen_var = set()
    # signature -> first constant seen with that signature
    const_sig_inv = {}

    def recursive_merge(var):
        if var in seen_var:
            return var
        if var.owner and hasattr(var.owner, "env"):
            # Node already belongs to an env: leave it alone.
            return var
        seen_var.add(var)
        if isinstance(var, graph.Constant):
            sig = var.signature()
            if sig in const_sig_inv:
                # Reuse the first constant seen with this signature.
                return const_sig_inv[sig]
            const_sig_inv[sig] = var
            return var
        if var.owner:
            for idx, inp in enumerate(var.owner.inputs):
                # Replace each input in place by its merged version.
                var.owner.inputs[idx] = recursive_merge(inp)
        return var

    # A list comprehension (instead of `map`) guarantees a list is
    # returned under both Python 2 and 3.  The unused `const_sig`
    # dict of the original was removed.
    return [recursive_merge(v) for v in vars]
########################
### Local Optimizers ###
......@@ -1111,6 +1146,66 @@ def check_chain(r, *chain):
return _check_chain(r, reduce(list.__iadd__, ([x, 0] for x in chain)))
def pre_greedy_local_optimizer(list_optimizations, out):
    '''
    Apply local optimizations, greedily, to the graph that computes `out`.

    This function traverses the computation graph of all ``node``s in the
    graph before the variable `out` that are not in an env, and applies
    each of the local optimizations on the traversed graph.

    Its main use is to apply locally constant folding when generating
    the graph of the indices of a subtensor.

    :param list_optimizations: list of local optimizers; each must expose
        a ``transform(node)`` method returning False/None (no change) or a
        list of replacement variables, one per output of ``node``.
    :param out: the output variable whose ancestor graph is optimized.
    :returns: the (possibly replaced) variable computing `out`.

    We should not apply optimizations on nodes that are in an env,
    so we do not optimize nodes that have an attribute ``env``.

    :note: This does not reach an equilibrium.  So if there is an
        optimization like local_upcast_elemwise_constant_inputs in the
        list, that adds additional nodes to the inputs of the node, it
        can be needed to call this function multiple times.
    '''
    def local_recursive_function(list_opt, out, optimized_vars, depth):
        # Returns (replacement outputs of out.owner, updated cache).
        if not out.owner:
            return [out], optimized_vars
        node = out.owner
        if hasattr(node, 'env'):
            # Node already in an env: do not touch it.
            return node.outputs, optimized_vars
        # First, recursively optimize the inputs of this node.
        for idx, inp in enumerate(node.inputs):
            if inp in optimized_vars:
                nw_in = optimized_vars[inp]
            else:
                if inp.owner:
                    outs, optimized_vars = local_recursive_function(
                        list_opt, inp, optimized_vars, depth + 1)
                    for k, v in zip(inp.owner.outputs, outs):
                        optimized_vars[k] = v
                    nw_in = outs[inp.owner.outputs.index(inp)]
                else:
                    nw_in = inp
                    optimized_vars[inp] = inp
            node.inputs[idx] = nw_in
        # Then apply each optimization, in order, to the node itself.
        results = node.outputs
        for opt in list_opt:
            ret = opt.transform(node)
            if ret is not False and ret is not None:
                assert len(ret) == len(node.outputs)
                for k, v in zip(node.outputs, ret):
                    optimized_vars[k] = v
                results = ret
                if ret[0].owner:
                    # Keep optimizing the node that now produces the
                    # result.  (Bug fix: this used to be ``out.owner``,
                    # which re-used the stale, already-replaced node.)
                    node = ret[0].owner
                else:
                    # The replacement has no owner (e.g. a constant):
                    # nothing left to optimize.
                    break
        return results, optimized_vars

    final_outs, optimized_nodes = local_recursive_function(
        list_optimizations, out, {}, 0)
    return final_outs[0]
......
......@@ -69,3 +69,27 @@ else:
partial = functools.partial
defaultdict = collections.defaultdict
__all__ = ['all', 'any']
if sys.version_info[:2] < (2,6):
    # itertools.combinations was added in Python 2.6; on older versions,
    # fall back to the pure-Python equivalent from the itertools docs.
    # Borrowed from Python docs
    def combinations(iterable, r):
        # combinations('ABCD', 2) --> AB AC AD BC BD CD
        # combinations(range(4), 3) --> 012 013 023 123
        pool = tuple(iterable)
        n = len(pool)
        if r > n:
            # Not enough elements: no combinations at all.
            return
        # NOTE: `indices` must be a mutable list; this relies on Python 2's
        # range() returning a list (guarded by the version check above).
        indices = range(r)
        yield tuple(pool[i] for i in indices)
        while True:
            # Find the rightmost index that can still be advanced.
            for i in reversed(range(r)):
                if indices[i] != i + n - r:
                    break
            else:
                # Every index is at its maximum position: we are done.
                return
            indices[i] += 1
            # Reset all indices to the right of i to consecutive values.
            for j in range(i+1, r):
                indices[j] = indices[j-1] + 1
            yield tuple(pool[i] for i in indices)
else:
    from itertools import combinations
# Script (Python 2): scan one or more Theano compile directories and report
# how many key.pkl / mod.{cpp,cu} cache entries are duplicated.
# Usage: python <script> [compiledir ...]
# Without arguments, theano.config.compiledir is scanned.
import cPickle
import os, sys
import theano

# Set to True to print the unpickled key of every duplicated key.pkl.
DISPLAY_DUPLICATE_KEYS = False
# Set to True to print the keys of the most-duplicated mod.{cpp,cu}.
DISPLAY_MOST_FREQUENT_DUPLICATE_CCODE = False

# Collect the per-entry cache directories to inspect.
dirs = []
if len(sys.argv)>1:
    for compiledir in sys.argv[1:]:
        dirs.extend([os.path.join(compiledir,d) for d in os.listdir(compiledir)])
else:
    dirs = os.listdir(theano.config.compiledir)
    dirs = [os.path.join(theano.config.compiledir,d) for d in dirs]
keys = {} # key -> nb seen
mods = {}
for dir in dirs:
    key = None
    # Read the pickled key as raw bytes (only used for equality counting).
    try:
        f = open(os.path.join(dir, "key.pkl"))
        key = f.read()
        f.close()
        keys.setdefault(key, 0)
        keys[key]+=1
        del f
    except IOError:
        #print dir, "don't have a key.pkl file"
        pass
    # Read the generated C/CUDA module source and record which keys map to it.
    try:
        path = os.path.join(dir, "mod.cpp")
        if not os.path.exists(path):
            path = os.path.join(dir, "mod.cu")
        f = open(path)
        mod = f.read()
        f.close()
        mods.setdefault(mod, ())
        mods[mod]+=(key,)
        del mod
        del f
        del path
    except IOError:
        print dir, "don't have a mod.{cpp,cu} file"
        pass
if DISPLAY_DUPLICATE_KEYS:
    for k, v in keys.iteritems():
        if v > 1:
            print "Duplicate key (%i copies): %s" % (v, cPickle.loads(k))
# Histogram: how many keys were seen exactly N times.
nbs_keys = {} # nb seen -> how many keys
for val in keys.values():
    nbs_keys.setdefault(val, 0)
    nbs_keys[val]+=1
# Histogram for module sources, and the keys of duplicated modules.
nbs_mod = {} # nb seen -> how many keys
nbs_mod_to_key = {} #nb seen -> keys
more_then_one = 0
for mod,kk in mods.iteritems():
    val = len(kk)
    nbs_mod.setdefault(val, 0)
    nbs_mod[val]+=1
    if val>1:
        more_then_one += 1
    nbs_mod_to_key[val] = kk
if DISPLAY_MOST_FREQUENT_DUPLICATE_CCODE:
    m = max(nbs_mod.keys())
    print "The keys associated to the mod.{cpp,cu} with the most number of copy:"
    for kk in nbs_mod_to_key[m]:
        kk = cPickle.loads(kk)
        print kk
print "key.pkl histograph"
l = nbs_keys.items()
l.sort()
print l
print "mod.{cpp,cu} histogram"
l = nbs_mod.items()
l.sort()
print l
# Summary: total/unique module files, and how many are redundant copies.
total = sum([len(k) for k in mods.values()])
uniq = len(mods)
useless = total - uniq
print "mod.{cpp,cu} total:", total
print "mod.{cpp,cu} uniq:", uniq
print "mod.{cpp,cu} with more then 1 copy:", more_then_one
print "mod.{cpp,cu} useless:", useless, float(useless)/total*100,"%"
print "nb directory", len(dirs)
......@@ -17,19 +17,20 @@ This don't work with broadcast and non-contiguous memory as pycuda don't support
import numpy
import theano
import theano.tensor as T
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, CudaNdarray
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous, host_from_gpu
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous
from theano.sandbox.cuda.opt import gpu_seqopt
import pycuda_init
if not pycuda_init.pycuda_available:
raise Exception("No pycuda available. You can't load pycuda_example.py")
import pycuda
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from pycuda.gpuarray import splay
from pycuda.tools import VectorArg
import pycuda.autoinit
def theano_parse_c_arg(c_arg):
c_arg = c_arg.replace('npy_float32','float')
c_arg = c_arg.replace('npy_float64','double')
......
import os
import theano
import theano.sandbox.cuda as cuda
def select_gpu_from_theano():
    # Transfer the theano gpu binding to pycuda, for consistency
    device_map = {"cpu": "0",
                  "gpu0": "0",
                  "gpu1": "1",
                  "gpu2": "2",
                  "gpu3": "3"}
    if theano.config.device == 'gpu':
        # Plain 'gpu' means "whichever device Theano actually picked":
        # ask the cuda_ndarray module for the active device number.
        dev = str(cuda.cuda_ndarray.cuda_ndarray.active_device_number())
    else:
        # Otherwise map the explicit device name; default to "0".
        dev = device_map.get(theano.config.device, "0")
    os.environ["CUDA_DEVICE"] = dev
# Bind pycuda to the same device Theano uses.  This must run before the
# `import pycuda.autoinit` below, which reads the CUDA_DEVICE variable.
select_gpu_from_theano()

# Flag checked by modules that optionally depend on pycuda.
pycuda_available = False
try:
    import pycuda
    import pycuda.autoinit
    pycuda_available = True
except ImportError:
    # presumably, the user wanted to use pycuda, else they wouldn't have
    # imported this module, so issue a warning that the import failed.
    import warnings
    warnings.warn("PyCUDA import failed in theano.misc.pycuda_init")
import numpy
try:
import pycuda
except ImportError:
import theano
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available:
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
......@@ -14,10 +15,6 @@ if cuda_ndarray.cuda_available == False:
import theano
import theano.tensor as T
from theano.misc.pycuda_example import PycudaElemwiseSourceModuleOp, PycudaElemwiseKernelOp
from theano.sandbox.cuda import GpuContiguous
import theano.misc.pycuda_example
import theano.sandbox.cuda as cuda_ndarray
if theano.config.mode=='FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -37,8 +34,8 @@ def test_pycuda_elemwise_source_module():
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()])
val1 = numpy.random.rand(5,5)
val2 = numpy.random.rand(5,5)
val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
#val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5)
assert (f(val1,val2) == f2(val1,val2)).all()
......
"""
This file is an example of view the memory allocated by pycuda in a GpuArray
in a CudaNdarray to be able to use it in Theano.
This also serve as a test for the function: cuda_ndarray.from_gpu_pointer
"""
import sys
import numpy
import theano
import theano.sandbox.cuda as cuda_ndarray
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available:
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
if cuda_ndarray.cuda_available == False:
from nose.plugins.skip import SkipTest
raise SkipTest('Optional package cuda disabled')
import pycuda
import pycuda.driver as drv
import pycuda.gpuarray
def test_pycuda_simple():
    """Sanity-check that pycuda works: elementwise multiply on the GPU."""
    # NOTE(review): this call looks unused, but it presumably forces
    # Theano's CUDA context to initialize before pycuda is used — kept.
    x = cuda_ndarray.CudaNdarray.zeros((5,5))
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    n = 100
    a = numpy.random.randn(n).astype(numpy.float32)
    b = numpy.random.randn(n).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    # The kernel indexes only by threadIdx.x, so the block size must equal
    # the array length.  (Bug fix: the original launched block=(400,1,1)
    # over 100-element arrays, making threads 100-399 read and write out
    # of bounds.)
    multiply_them(
            drv.Out(dest), drv.In(a), drv.In(b),
            block=(n,1,1), grid=(1,1))
    assert (dest==a*b).all()
def test_pycuda_memory_to_theano():
    #Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
    y = pycuda.gpuarray.zeros((3,4,5), 'float32')
    print numpy.asarray(y)
    print "gpuarray ref count before creating a CudaNdarray", sys.getrefcount(y)
    # Only `y` and getrefcount's own argument reference the gpuarray so far.
    assert sys.getrefcount(y)==2
    rand = numpy.random.randn(*y.shape).astype(numpy.float32)
    cuda_rand = cuda_ndarray.CudaNdarray(rand)
    # Compute C-contiguous strides (in elements) for y's shape.
    strides = [1]
    for i in y.shape[::-1][:-1]:
        strides.append(strides[-1]*i)
    strides = tuple(strides[::-1])
    print 'strides', strides
    assert cuda_rand._strides == strides, (cuda_rand._strides, strides)
    # Wrap the pycuda-allocated memory in a CudaNdarray view; `y` is passed
    # as the base object so the memory is kept alive by the view.
    y_ptr = int(y.gpudata) # in pycuda trunk, y.ptr also works, which is a little cleaner
    z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
    print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y)
    # The CudaNdarray view must hold exactly one extra reference to `y`.
    assert sys.getrefcount(y)==3
    assert (numpy.asarray(z) == 0).all()
    # In-place updates through the view must be visible in the shared memory.
    cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32'))
    z += cuda_ones
    assert (numpy.asarray(z) == numpy.ones(y.shape)).all()
    assert (numpy.asarray(z) == 1).all()
    assert cuda_rand.shape == z.shape
    assert cuda_rand._strides == z._strides, (cuda_rand._strides, z._strides)
    assert (numpy.asarray(cuda_rand) == rand).all()
    z += cuda_rand
    assert (numpy.asarray(z)==(rand+1)).all()
    # Check that the ref count to the gpuarray is right.
    del z
    print "gpuarray ref count after deleting the CudaNdarray", sys.getrefcount(y)
    assert sys.getrefcount(y)==2
......@@ -57,6 +57,9 @@ def debugprint(obj, depth=-1, print_type=False, file=None):
order = obj.maker.env.toposort()
elif isinstance(obj, (list, tuple)):
results_to_print.extend(obj)
elif isinstance(obj, gof.Env):
results_to_print.extend(obj.outputs)
order = obj.toposort()
else:
raise TypeError("debugprint cannot print an object of this type", obj)
for r in results_to_print:
......@@ -611,11 +614,18 @@ def pydotprint(fct, outfile=None,
def pydotprint_variables(vars,
outfile=os.path.join(config.compiledir,'theano.pydotprint.png'),
outfile=None,
format='png',
depth = -1,
high_contrast = True):
high_contrast = True, colorCodes = None):
''' Identical to pydotprint just that it starts from a variable instead
of a compiled function. Could be useful ? '''
if colorCodes is None:
colorCodes = default_colorCodes
if outfile is None:
outfile = os.path.join(config.compiledir,'theano.pydotprint.' +
config.device + '.' + format)
try:
import pydot as pd
except:
......
......@@ -156,10 +156,12 @@ def use(device, force=False, default_to_move_computation_to_gpu = True,
raise EnvironmentError("You forced use of device %s, but CUDA initialization failed "
"with error:\n%s" % (device, cuda_initialization_error_message))
if not cuda_available:
if cuda_initialization_error_message:
error_addendum = " (error: %s)" % cuda_initialization_error_message
else:
error_addendum = ""
error_addendum = ""
try:
if cuda_initialization_error_message:
error_addendum = " (error: %s)" % cuda_initialization_error_message
except NameError: # cuda_initialization_error_message is not available b/c compilation failed
pass
warning('CUDA is installed, but device %s is not available%s' % (device, error_addendum))
return
......
......@@ -1767,15 +1767,9 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()])
def perform_(self, node, inp, out_):
# This don't work as CudaNdarray_Subscript() don't support it.
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp
out, = out_
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),)+x.shape[1:])
for (j,i) in enumerate(idx):
o[j] = x[i]
out[0] = o
#def perform(self, node, inp, out_):
# CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices.
class GpuIncSubtensor(tensor.IncSubtensor):
def make_node(self, x, y, *inputs):
......
......@@ -630,7 +630,7 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
// calculate new size, assert same as old size
if (rval_size != CudaNdarray_SIZE(self))
{
PyErr_SetString(PyExc_ValueError, "size must remain unchanged");
PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %i to %i", CudaNdarray_SIZE(self), rval_size);
free(rval_dims);
return NULL;
}
......@@ -2010,6 +2010,100 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
return Py_None;
}
/*
 * Create a CudaNdarray that is a view of GPU memory allocated by another
 * library (e.g. pycuda).  `base` is stored in the returned object so the
 * underlying memory stays alive as long as the view exists.
 *
 * This function is tested in theano/misc/test_pycuda_theano_simple.py
 */
PyObject *
CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
{
    PyObject *gpu_ptr = NULL;
    PyObject *shapes = NULL;
    PyObject *strides = NULL;
    PyObject *base = NULL;
    PyObject *rval = NULL;

    //args should consist of 4 python objects
    //The first is the gpu ptr
    //The second is the shape
    //The third is the strides
    //The fourth is the base object that owns the memory
    if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
        return NULL;

    if (!PyLong_Check(gpu_ptr))
    {
        PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointer is not a long");
        return NULL;
    }

    Py_ssize_t nd = PyObject_Length(shapes);
    if (nd < 0)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
        return NULL;
    }
    Py_ssize_t nd_stride = PyObject_Length(strides);
    if (nd_stride < 0)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
        return NULL;
    }
    if (nd != nd_stride)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
        return NULL;
    }

    rval = CudaNdarray_new_null();
    if (!rval)
        // CudaNdarray_new_null set the error msg
        return NULL;
    if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
    {
        //CudaNdarray_set_nd set the error msg
        Py_DECREF(rval);
        return NULL;
    }

    // Set gpu pointer; `base` is INCREF'd by set_device_data and keeps
    // the memory alive.
    assert(((CudaNdarray *)rval)->data_allocated == 0);
    if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointer");
        Py_DECREF(rval);
        return NULL;
    }

    // Set dims and strides.  All temporaries are released on every path
    // (the original leaked idx/dim_/strd_/rval on error).
    for (int i = nd-1; i >= 0; --i)
    {
        PyObject * idx = PyLong_FromLong(i);
        if (idx == NULL)
        {
            PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
            Py_DECREF(rval);
            return NULL;
        }
        PyObject* dim_ = PyObject_GetItem(shapes, idx);
        PyObject* strd_ = PyObject_GetItem(strides, idx);
        Py_DECREF(idx);
        if (dim_ == NULL || !PyInt_Check(dim_))
        {
            PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
            Py_XDECREF(dim_);
            Py_XDECREF(strd_);
            Py_DECREF(rval);
            return NULL;
        }
        if (strd_ == NULL || !PyInt_Check(strd_))
        {
            PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
            Py_DECREF(dim_);
            Py_XDECREF(strd_);
            Py_DECREF(rval);
            return NULL;
        }
        int dim = PyInt_AsLong(dim_);
        int strd = PyInt_AsLong(strd_);
        CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
        CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
        Py_DECREF(dim_);
        Py_DECREF(strd_);
    }
    return rval;
}
PyObject *
CudaNdarray_Dot(PyObject* _unused, PyObject* args)
{
......@@ -2175,6 +2269,7 @@ static PyMethodDef module_methods[] = {
{"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
{"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from already allocated memory on the gpu.(example by pycuda)"},
{NULL, NULL, NULL, NULL} /* Sentinel */
};
......@@ -2367,7 +2462,7 @@ CudaNdarray_new_nd(int nd)
return (PyObject *) rval;
}
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
{
if (self->data_allocated)
{
......@@ -2380,10 +2475,10 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray *
}
}
//N.B. XDECREF and XINCREF are no-ops for NULL pointers
if (self->base != (PyObject*)base)
if (self->base != base)
{
Py_XDECREF(self->base);
self->base = (PyObject*)base;
self->base = base;
Py_XINCREF(self->base);
}
self->data_allocated = 0;
......@@ -2982,18 +3077,20 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
}
else if(dims_taken[pattern[i]])
{
PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: The same input dimension may not appear twice in the list of output dimensions");
PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You used the dimensions %d multiple time",
pattern[i]);
free(newdims);
return -1;
}
else
{
if ((dims_taken[pattern[i]]) || (pattern[i]>= self->nd))
{
PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle");
free(newdims);
return -1;
}
else if (pattern[i]>= self->nd)
{
PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You asked for a dimensions that don't exist %d for a %d dims CudaNdarray",
pattern[i], self->nd);
free(newdims);
return -1;
}
else
{
newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
dims_taken[pattern[i]] = 1;
......
......@@ -438,7 +438,11 @@ CudaNdarray_NewDims(int nd, const inttype * dims)
*
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
}
/**
* Return an independent copy of self
......
......@@ -778,8 +778,8 @@ switch = Switch()
class UnaryBitOp(UnaryScalarOp):
    """Base class for elementwise unary bitwise operations on signed ints."""

    def output_types(self, *input_types):
        for i in input_types[0]:
            # Accept every signed integer scalar type, including int16.
            # (A leftover duplicate of this check without int16 used to run
            # first and wrongly reject int16 inputs; it was removed.)
            if i not in (int8, int16, int32, int64):
                raise TypeError('input to a BitOp must have type int8, int16, int32 or int64... not %s' % i)
        return upcast_out(*input_types[0])

    def grad(self, inputs, output_gradients):
        # Bitwise operations are not differentiable.
        return [None]
......@@ -788,8 +788,8 @@ class BinaryBitOp(BinaryScalarOp):
def output_types(self, *input_types):
    """Validate the two input scalar types and return the upcast output type."""
    # The unpacking also asserts there are exactly two input types.
    t0, t1 = input_types[0]
    for i in input_types[0]:
        # Accept every signed integer scalar type, including int16.
        # (A leftover duplicate of this check without int16 used to run
        # first and wrongly reject int16 inputs; it was removed.)
        if i not in (int8, int16, int32, int64):
            raise TypeError('input to a BitOp must have type int8, int16, int32 or int64... not %s' % i)
    return upcast_out(*input_types[0])

def grad(self, inputs, output_gradients):
    # Bitwise operations are not differentiable.
    return [None, None]
......@@ -800,6 +800,8 @@ class OR(BinaryBitOp):
associative = False
def impl(self, x, y):
return x | y
def c_code(self, node, name, (x, y), (z, ), sub):
return "%(z)s = (%(x)s | %(y)s);" % locals()
or_ = OR()
class XOR(BinaryBitOp):
......@@ -808,6 +810,8 @@ class XOR(BinaryBitOp):
associative = False
def impl(self, x, y):
return x ^ y
def c_code(self, node, name, (x, y), (z, ), sub):
return "%(z)s = (%(x)s ^ %(y)s);" % locals()
xor = XOR()
class AND(BinaryBitOp):
......@@ -816,12 +820,16 @@ class AND(BinaryBitOp):
associative = False
def impl(self, x, y):
return x & y
def c_code(self, node, name, (x, y), (z, ), sub):
return "%(z)s = (%(x)s & %(y)s);" % locals()
and_ = AND()
class Invert(UnaryBitOp):
    # Elementwise bitwise NOT (~x).
    identity = False
    def impl(self, x):
        # Python-side implementation.
        return ~x
    def c_code(self, node, name, (x,), (z, ), sub):
        # C implementation.  Uses Python 2 tuple-parameter syntax,
        # matching the other BitOp c_code methods in this file.
        return "%(z)s = (~%(x)s);" % locals()
invert = Invert()
......
......@@ -650,7 +650,9 @@ class AddSD(gof.op.Op):
broadcastable = y.type.broadcastable).make_variable()])
def perform(self, node, (x, y), (out, )):
    # Add a sparse matrix (x) and a dense array (y); result is dense.
    assert _is_sparse(x) and _is_dense(y)
    # The asarray is needed as in some cases this returns a
    # numpy.matrixlib.defmatrix.matrix object and not an ndarray.
    out[0] = theano._asarray(x + y, dtype=node.outputs[0].type.dtype)
def grad(self, (x, y), (gz,)):
assert _is_sparse_variable(x) and _is_dense_variable(y)
assert _is_dense_variable(gz)
......
......@@ -1103,14 +1103,6 @@ def local_argmax_pushdown(node):
def _check_rows_is_arange_len_labels(rows, labels):
'''Check that 'rows' is the same node as T.arange(labels.shape[0])'''
# this is admittedly a pretty random thing to have here... but it's not wrong (I think)
# and it has the effect of making the advanced_indexing -> crossentropy optimization work
# in the case where the labels are float32s casted to integers. "Why would anyone do that?"
# you ask... it is a handy trick for storing labels on a pre-FERMI GPU device so that
# logistic regression goes faster.
if labels.owner and labels.owner.op == tensor._convert_to_int32:
labels = labels.owner.inputs[0]
if rows.owner and isinstance(rows.owner.op, tensor.ARange):
start, stop, step = rows.owner.inputs
if getattr(start, 'data', None) != 0: #constants will have data
......@@ -1119,11 +1111,12 @@ def _check_rows_is_arange_len_labels(rows, labels):
return False
if not stop.owner:
return False
# Not sure if that case happens any more after the introduction
# of ShapeOptimizer
# Not sure if that case happens any more after the introduction of
# ShapeOptimizer, but we keep it if ShapeOptimizer is not present
if isinstance(stop.owner.op, tensor.Subtensor):
shape_subtensor = stop.owner
if shape_subtensor.op.idx_list == [0]:
if list(shape_subtensor.op.idx_list) == [0]:
shape_var, = shape_subtensor.inputs
if shape_var.owner and shape_var.owner.op == tensor._shape:
return shape_var.owner.inputs[0] is labels
......
......@@ -25,7 +25,7 @@ import basic as T
from theano import compile #to register the optimizer built by this file
from theano.gof.python25 import any, all
from theano.gof.opt import Optimizer
from theano.gof.opt import Optimizer, pre_constant_merge, pre_greedy_local_optimizer
from theano.gof import toolbox, DestroyHandler
from basic import get_constant_value
......@@ -602,13 +602,66 @@ class ShapeFeature(object):
s_i, type(s_i), getattr(s_i, 'type', None))
def set_shape(self, r, s):
    '''Register the shape of variable `r` in the shape_of dictionary.

    `s` is either None (shape unknown) or an iterable of symbolic shape
    entries; each entry is normalized through `self.unpack`.
    `r` must not already have a registered shape.
    '''
    # The merge artifact that duplicated this assertion was removed;
    # the version carrying an error message is kept.
    assert r not in self.shape_of, 'r already in shape_of'
    if s is None:
        self.shape_of[r] = s
    else:
        self.shape_of[r] = tuple([self.unpack(s_i) for s_i in s])
def init_r(self,r):
def update_shape(self, r, other_r):
    '''Replace shape of r by shape of other_r.

    If, on some dimensions, the shape of other_r is not informative,
    keep the shape of r on those dimensions.
    '''
    # other_r should already have a shape
    assert other_r in self.shape_of, ('other_r not in shape_of', other_r)
    other_shape = self.shape_of[other_r]

    if r not in self.shape_of:
        # If no info is known on r's shape, use other_shape
        self.shape_of[r] = other_shape
        return
    r_shape = self.shape_of[r]

    if other_shape is None:
        # If other_shape has no information, use r_shape
        self.shape_of[r] = r_shape
        return

    def informative(ps, i):
        # A shape entry is uninformative when it is just Shape_i(i)
        # applied to r or other_r itself.
        return not (ps.owner and
                    isinstance(getattr(ps.owner, 'op', None), Shape_i) and
                    ps.owner.op.i == i and
                    ps.owner.inputs[0] in (r, other_r))

    # Merge other_shape with r_shape, giving the priority to other_shape
    self.shape_of[r] = tuple(
        ps if informative(ps, i) else r_shape[i]
        for i, ps in enumerate(other_shape))
def set_shape_i(self, r, i, s_i):
    '''Replace element i of shape_of[r] by s_i'''
    assert r in self.shape_of
    prev_shape = self.shape_of[r]
    # The stored shape is a tuple (immutable), so rebuild it with the
    # i-th entry replaced by the unpacked new value.
    self.shape_of[r] = tuple(
        self.unpack(s_i) if j == i else s_j
        for j, s_j in enumerate(prev_shape))
def init_r(self, r):
'''Register r's shape in the shape_of dictionary.'''
if r not in self.shape_of:
try:
self.set_shape(r, self.shape_tuple(r))
......@@ -619,7 +672,7 @@ class ShapeFeature(object):
return make_vector(*self.shape_of[r])
#
#
# Feature inteface
# Feature interface
#
#
def on_attach(self, env):
......@@ -669,10 +722,10 @@ class ShapeFeature(object):
self.set_shape(r, s)
def on_change_input(self, env, node, i, r, new_r):
# TODO:
# This tells us that r and new_r must have the same shape
# if we didn't know that the shapes are related, now we do.
self.init_r(new_r)
self.update_shape(new_r, r)
# change_input happens in two cases:
# 1) we are trying to get rid of r, or
# 2) we are putting things back after a failed transaction.
......@@ -690,6 +743,15 @@ class ShapeFeature(object):
if v == r:
del self.scheduled[k]
# In either case, r could be in shape_of.values(), that is, r itself
# is the shape of something. In that case, we want to update
# the value in shape_of, to keep it up-to-date.
for k,v in self.shape_of.iteritems():
if v is not None:
for ii, vi in enumerate(v):
if vi == r:
self.set_shape_i(k, ii, new_r)
class ShapeOptimizer(Optimizer):
"""Optimizer that serves to add ShapeFeature as an env feature.
"""
......@@ -1125,8 +1187,6 @@ def local_useless_subtensor(node):
node_input_idx += sum([isinstance(idx.start, theano.scalar.Scalar),
isinstance(idx.stop, theano.scalar.Scalar),
isinstance(idx.step, theano.scalar.Scalar)])
if isinstance(idx, theano.scalar.Scalar):
node_input_idx += 1
return [node.inputs[0]]
......@@ -1171,6 +1231,7 @@ def local_subtensor_lift(node):
new_inputs.append(i.dimshuffle(['x']*node.outputs[0].ndim))
return [u.owner.op(*new_inputs)]
def merge_two_slices(slice1, len1, slice2, len2):
'''
This function merges two slices into a single slice. The code works on
......@@ -1186,18 +1247,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
``len1`` is the length of the tensor **before** applying the first slice,
while ``len2`` is the length **after** applying the first slice.
'''
def const_fold(n):
while True:
ret = constant_folding.transform(n)
if ret is not False and ret is not None:
#print n,ret
assert len(ret)==len(n.outputs)
assert len(ret)==1
n = ret[0].owner
else: break
return n.outputs
list_opt = [ local_abs_merge, local_mul_switch_sink, local_upcast_elemwise_constant_inputs, local_remove_switch_const_cond, constant_folding ]
if type(slice1) is not slice:
......@@ -1250,38 +1300,65 @@ def merge_two_slices(slice1, len1, slice2, len2):
# according to the two steps we have 4 different combinations of
# positive/negative. I will denote the case I'm looking at by
# suffixes to the variables (nn,np,pn,pp):
pp_start = sl1.start + sl2.start * sl1.step
pp_stop = sl1.start + sl2.stop * sl1.step
pp_step = sl1.step * sl2.step
flen = sl2.stop - sl2.start
p_step = sl1.step * sl2.step
n_step = sl1.step * sl2.step * -1
pp_start = T.minimum(sl1.start + sl2.start * sl1.step, sl1.stop)
pp_stop = T.minimum(sl1.start + sl2.stop * sl1.step, sl1.stop)
pn_stop = sl1.start + (sl2.start -1) * sl1.step
pn_stop = T.switch(T.and_(T.lt(pn_stop,0)
, T.gt(flen,0))
, -len1 -1
, T.minimum(pn_stop, sl1.stop))
pn_start = sl1.start + (sl2.stop -1) * sl1.step
pn_start = T.minimum( pn_start, sl1.stop )
pn_start = T.maximum( pn_start, 0 )
pn_stop = sl1.start + sl2.start * sl1.step
pn_start = sl1.start + sl2.stop * sl1.step
pn_step = sl1.step * sl2.step * -1
pn_stop = T.switch(T.eq(pn_stop,-1), -len1 -1, pn_stop)
np_stop = sl1.stop - sl2.stop * sl1.step -1
np_start = sl1.stop - sl2.start * sl1.step -1
np_step = sl1.step * sl2.step * -1
np_stop = T.switch(T.eq(np_stop,-1), -len1 -1, np_stop)
np_stop = T.switch(T.and_(T.lt(np_stop,0)
, T.gt(flen,0))
,-len1-1
, T.maximum(sl1.start-1, np_stop))
np_start = T.maximum(sl1.start,sl1.stop - sl2.start * sl1.step -1)
nn_start = T.maximum(sl1.start,(sl1.stop -1)- (sl2.stop-1) * sl1.step)
nn_stop = T.maximum(sl1.start,sl1.stop - sl2.start * sl1.step)
nn_start = sl1.stop - sl2.start * sl1.step
nn_stop = sl1.stop - sl2.stop * sl1.step
nn_step = sl1.step * sl2.step
start = const_fold(T.switch(T.lt(reverse2*reverse1,0),
start = T.switch(T.lt(reverse2*reverse1,0),
T.switch(T.lt(reverse1,0), np_start, pn_start),
T.switch(T.lt(reverse1,0), nn_start,
pp_start)).owner)[0]
pp_start))
stop = const_fold(T.switch(T.lt(reverse2*reverse1,0),
stop = T.switch(T.lt(reverse2*reverse1,0),
T.switch(T.lt(reverse1,0), np_stop , pn_stop ),
T.switch(T.lt(reverse1,0), nn_stop , pp_stop
)).owner)[0]
))
step = T.switch( T.lt(reverse2*reverse1,0),n_step, p_step)
start = T.switch(T.le(flen,0), 0, start)
stop = T.switch(T.le(flen,0), 0, stop)
# The canonical form of the slice is pretty complicated
# and is not simplified. We simplify it in advance here
# as otherwise this create too many useless optimization that
# DebugMode must check.
start = pre_greedy_local_optimizer( list_opt, start)
stop = pre_greedy_local_optimizer( list_opt, stop)
step = pre_greedy_local_optimizer( list_opt, step)
start = pre_greedy_local_optimizer( list_opt, start)
stop = pre_greedy_local_optimizer( list_opt, stop)
step = pre_greedy_local_optimizer( list_opt, step)
#Pre merge constant for the same reason.
start, stop, step = pre_constant_merge([start, stop, step])
step = const_fold( T.switch(T.lt(reverse2*reverse1,0),
T.switch(T.lt(reverse1,0), np_step , pn_step ),
T.switch(T.lt(reverse1,0), nn_step , pp_step
)).owner)[0]
return slice(start, stop, step)
@register_canonicalize
......
......@@ -302,10 +302,11 @@ def test_mlp():
x:train_set_x[index*batch_size:(index+1)*batch_size],
y:train_set_y[index*batch_size:(index+1)*batch_size]},
mode=mode)
for i in train_model.maker.env.toposort(): print i
#theano.printing.pydotprint(train_model)
print 'MODEL 1'
theano.printing.debugprint(train_model, print_type=True)
assert any([isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
assert any( [isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
# Now, this case works, too!
train_model =theano.function( inputs = [index],
updates = updates2,
mode=mode.excluding('local_track_shape_i'),
......@@ -313,9 +314,21 @@ def test_mlp():
x:train_set_x[index*batch_size:(index+1)*batch_size],
y:train_set_y[index*batch_size:(index+1)*batch_size]})
print
for i in train_model.maker.env.toposort(): print i
print 'MODEL 2'
theano.printing.debugprint(train_model, print_type=True)
assert any([isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
assert not any( [isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
# Even without FeatureShape
train_model =theano.function( inputs = [index],
updates = updates2,
mode=mode.excluding('local_shape_to_shape_i'),
givens={
x:train_set_x[index*batch_size:(index+1)*batch_size],
y:train_set_y[index*batch_size:(index+1)*batch_size]})
print
print 'MODEL 3'
theano.printing.debugprint(train_model, print_type=True)
assert any([isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
if __name__ == '__main__':
test_mlp()
......@@ -765,17 +765,26 @@ class TestGemv(TestCase):
def test_gemv_dimensions(self):
    """Gemv graph: matching shapes run; mismatched vector lengths raise.

    Builds ``z = beta * y + alpha * dot(A, x)`` and checks that the
    compiled function accepts a consistent (5, 3) matrix / length-3 /
    length-5 input triple and raises ValueError for every inconsistent
    vector-length combination.
    """
    A = T.matrix('A')
    x, y = T.vectors('x', 'y')
    # Shared scalars must be floatX: a plain Python 1.0 would be
    # float64 and upcast the whole graph when floatX is float32.
    alpha = theano.shared(theano._asarray(1.0, dtype=config.floatX),
                          name='alpha')
    beta = theano.shared(theano._asarray(1.0, dtype=config.floatX),
                         name='beta')
    z = beta * y + alpha * T.dot(A, x)
    f = theano.function([A, x, y], z)

    # Matrix value
    A_val = numpy.ones((5, 3), dtype=config.floatX)
    # Vector values: only the (3, 5) pair is consistent with A_val.
    ones_3 = numpy.ones(3, dtype=config.floatX)
    ones_4 = numpy.ones(4, dtype=config.floatX)
    ones_5 = numpy.ones(5, dtype=config.floatX)
    ones_6 = numpy.ones(6, dtype=config.floatX)
    f(A_val, ones_3, ones_5)
    # Different vector length
    self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
    self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
    self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
# The following gemv tests were added in March 2011 by Ian Goodfellow
......
......@@ -1526,6 +1526,41 @@ class test_local_subtensor_merge(unittest.TestCase):
def test_scalar5(self):
    """Chained slicing with every bound/step symbolic: x[b1:e1:s1][b2:e2:s2].

    Verifies the two Subtensor ops are merged into a single one by the
    optimizer, then exercises the compiled function over a sample of
    positive and negative bounds and (nonzero) steps.
    """
    # var[int1:][:int2]
    x = TT.matrix('x')
    begin1 = TT.iscalar('b1')
    end1 = TT.iscalar('e1')
    step1 = TT.iscalar('s1')
    begin2 = TT.iscalar('b2')
    end2 = TT.iscalar('e2')
    step2 = TT.iscalar('s2')
    f = function([x, begin1, end1, step1, begin2, end2, step2],
                 x[begin1:end1:step1][begin2:end2:step2],
                 mode=mode_opt)
    #theano.printing.debugprint(f, print_type=True)
    topo = f.maker.env.toposort()
    # The two symbolic slices must have been fused into one Subtensor.
    subtensor_nodes = [node for node in topo
                       if isinstance(node.op, TT.Subtensor)]
    assert len(subtensor_nodes) == 1
    assert isinstance(topo[-1].op,
                      theano.compile.function_module.DeepCopyOp)

    # Two random values per slice parameter; steps exclude zero.
    begins1 = self.rng.permutation(range(-8, 8))[:2]
    ends1 = self.rng.permutation(range(-8, 8))[:2]
    begins2 = self.rng.permutation(range(-8, 8))[:2]
    ends2 = self.rng.permutation(range(-8, 8))[:2]
    steps1 = self.rng.permutation(
        [-7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7])[:2]
    steps2 = self.rng.permutation(
        [-7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7])[:2]
    for x_shape in self.x_shapes:
        x_val = self.rng.uniform(size=x_shape).astype(config.floatX)
        for vb1 in begins1:
            for ve1 in ends1:
                for vs1 in steps1:
                    for vb2 in begins2:
                        for ve2 in ends2:
                            for vs2 in steps2:
                                f(x_val, vb1, ve1, vs1,
                                  vb2, ve2, vs2)
def test_local_fill_useless():
......@@ -1635,7 +1670,7 @@ class test_shapeoptimizer(unittest.TestCase):
register_specialize(local_identity_noshape_to_identity_shape)
# With the optimization
# The identity_shape op is should not be needed anymore to compute
# The identity_shape op should not be needed anymore to compute
# the shape
g = theano.function([x], ins_x.shape, mode=mode)
xval = rng.randn(6,1,2).astype(config.floatX)
......@@ -1995,7 +2030,7 @@ class T_useless_elemwise(unittest.TestCase):
# tensor_copy, and view
x = T.matrix()
f = theano.function([x], T.tensor_copy(x), mode=self.mode)
vx = numpy.random.rand(5,4)
vx = numpy.random.rand(5,4).astype(config.floatX)
f(vx)
topo = f.maker.env.toposort()
assert len(topo) == 1
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论