提交 9adfd678 authored 作者: James Bergstra's avatar James Bergstra

merge

from .sharedvalue import shared, shared_constructor
from .pfunc import pfunc
from theano.compile.sandbox.sharedvalue import shared, shared_constructor
from theano.compile.sandbox.pfunc import pfunc
......@@ -792,14 +792,24 @@ class CLinker(link.Linker):
function raises a KeyError exception.
"""
order = list(self.env.toposort())
env_inputs_dict = dict((i, [-1, pos]) for pos, i in enumerate(self.env.inputs))
return self.cmodule_key_(self.env, self.no_recycling,
compile_args=self.compile_args(),
libraries=self.libraries()
)
@staticmethod
def cmodule_key_(env, no_recycling, compile_args=None, libraries=None):
"""
Do the actual computation of cmodule_key in a static method
to allow it to be reused in scalar.Composite.__eq__
"""
order = list(env.toposort())
env_computed_set = set()
env_inputs_dict = dict((i, [-1, pos]) for pos, i in enumerate(env.inputs))
constant_ids = dict()
op_pos = {} # Apply -> topological position
rval = ['CLinker.cmodule_key'] # will be cast to tuple on return
rval.append(tuple(self.compile_args()))
rval.append(tuple(self.libraries()))
if compile_args is not None: rval.append(tuple(compile_args))
if libraries is not None: rval.append(tuple(libraries))
version = []
# assert that every input to every node is one of'
......@@ -822,16 +832,16 @@ class CLinker(link.Linker):
else:
if i.owner is None:
assert all( all(out is not None for out in o.outputs) for o in order)
assert all( input.owner is None for input in self.env.inputs)
raise Exception('what is this?', (i, type(i), i.clients, self.env))
if i in self.env.outputs:
assert all( input.owner is None for input in env.inputs)
raise Exception('what is this?', (i, type(i), i.clients, env))
if i in env.outputs:
rval += [op_pos[i.owner], # outputs
i.owner.outputs.index(i),
self.env.outputs.index(i)]
env.outputs.index(i)]
else:
rval += [op_pos[i.owner], i.owner.outputs.index(i)] # temps
assert rval
rval.append(i in self.no_recycling)
rval.append(i in no_recycling)
return tuple(rval)
for node_pos, node in enumerate(order):
......
......@@ -386,7 +386,7 @@ class ModuleCache(object):
try:
module = fn(location=location) # WILL FAIL FOR BAD C CODE
except Exception, e:
shutil.rmtree(location)
_rmtree(location)
#try:
#except Exception, ee:
#error('failed to cleanup location', location, ee)
......@@ -515,7 +515,8 @@ class ModuleCache(object):
def _rmtree(parent):
try:
shutil.rmtree(parent)
if not os.getenv('THEANO_NOCLEANUP',0):
shutil.rmtree(parent)
except Exception, e:
try:
# mark this directory for deletion by a future refresh()
......
from .. import gof
from theano import gof
import sys
......
......@@ -348,6 +348,9 @@ def int_out(*types):
def float_out(*types):
return float64,
def upgrade_to_float(*types):
"""
Upgrade the given types to float32 or float64 so that no precision is lost.
"""
conv = {int8: float32,
int16: float32,
int32: float64,
......@@ -370,8 +373,8 @@ class ScalarOp(Op):
def make_node(self, *inputs):
if self.nin >= 0:
if len(inputs) != self.nin:
raise TypeError("Wrong number of inputs for %s.make_node (got %i, expected %i)" \
% (self, len(inputs), self.nin))
raise TypeError("Wrong number of inputs for %s.make_node (got %i(%s), expected %i)" \
% (self, len(inputs), str(inputs), self.nin))
inputs = [as_scalar(input) for input in inputs]
outputs = [t() for t in self.output_types([input.type for input in inputs])]
if len(outputs) != self.nout:
......@@ -977,6 +980,7 @@ class Inv(UnaryScalarOp):
inv = Inv(upgrade_to_float, name = 'inv')
class Log(UnaryScalarOp):
""" log base e """
def impl(self, x):
return math.log(x)
def grad(self, (x, ), (gz, )):
......@@ -994,6 +998,7 @@ class Log(UnaryScalarOp):
log = Log(upgrade_to_float, name = 'log')
class Log2(UnaryScalarOp):
""" log base 2 """
def impl(self, x):
return numpy.log2(x)
def grad(self, (x, ), (gz, )):
......@@ -1009,6 +1014,7 @@ class Log2(UnaryScalarOp):
log2 = Log2(upgrade_to_float, name = 'log2')
class Log10(UnaryScalarOp):
""" log base 10 """
def impl(self, x):
return numpy.log10(x)
def grad(self, (x, ), (gz, )):
......@@ -1170,6 +1176,14 @@ class Composite(ScalarOp):
implement the loop fusion optimizer (which I have yet to do
someday...)
"""
def __str__(self):
    """Return the op's name when it has a non-empty one, else a summary.

    The summary has the form ``ClassName{attr=value, ...}`` and lists every
    entry of the instance ``__dict__`` except ``name``, ``env`` and
    ``_c_code``.
    """
    if not (hasattr(self, 'name') and self.name):
        skipped = ["name","env","_c_code"]
        shown = ["%s=%s" % (attr, val)
                 for attr, val in self.__dict__.items()
                 if attr not in skipped]
        return "%s{%s}" % (self.__class__.__name__, ", ".join(shown))
    return self.name
def __init__(self, inputs, outputs):
env = Env(*gof.graph.clone(inputs, outputs))
gof.MergeOptimizer().optimize(env)
......@@ -1233,12 +1247,15 @@ class Composite(ScalarOp):
self.nin = len(inputs)
self.nout = len(outputs)
self.env = env
self.inputs_type = tuple([input.type for input in self.env.inputs])
self.outputs_type = tuple([output.type for output in self.env.outputs])
self._rehash()
def output_types(self, input_types):
if tuple(input_types) != tuple([input.type for input in self.env.inputs]):
if tuple(input_types) != self.inputs_type:
raise TypeError("Wrong types for Composite. Expected %s, got %s."
% (tuple([input.type for input in self.env.inputs]), tuple(input_types)))
return [output.type for output in self.env.outputs]
% (self.inputs_type, tuple(input_types)))
return self.outputs_type
def perform(self, node, inputs, output_storage):
for storage, impl in zip(output_storage, self._impls):
......@@ -1259,10 +1276,36 @@ class Composite(ScalarOp):
onames),
**sub)
d['name'] = name
if not sub.has_key('id'):
#The use of a dummy id is safe as the code is in a separate block.
#It won't generate conflicting variable name.
d['id']='_DUMMY_ID_'
return self._c_code % d
def __eq__(self, other):
return self is other
if self is other: return True
if not isinstance(other, self.__class__): return False
if self.nin!=other.nin or self.nout != other.nout: return False
return self._hashval == other._hashval
return self._cmodule_key == other._cmodule_key
def _rehash(self):
    """Recompute the cached key of ``self.env`` and the hash derived from it.

    The key returned by ``gof.CLinker.cmodule_key_`` is stored in
    ``self._cmodule_key`` and its ``hash()`` in ``self._hashval``.
    """
    # TODO: what is no_recycling used for, and what should be passed here?
    # An empty list is passed for now.
    no_recycling = []
    key = gof.CLinker.cmodule_key_(self.env, no_recycling)
    self._cmodule_key = key
    self._hashval = hash(key)
def __hash__(self):
return id(self)
return self._hashval
# def __getstate__(self):
# d = copy(self.__dict__)
# d.pop('env')
# d.pop('_impls')
# #TODO: the self._impls must be restored to allow the perform to work.(c version continue to work.
# return d
# def __setstate__(self, d):
# self.__dict__.update(d)
# #TODO: how to restore the _impls?
......@@ -1227,6 +1227,68 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot')
# # Loop fusion #
# ###############
@gof.local_optimizer([T.Elemwise, T.Elemwise])
def local_elemwise_fusion(node):
    """Fuse an Elemwise node with the Elemwise nodes that produce its inputs.

    As part of specialisation, consecutive elemwise ops of the same shape are
    merged into a single Elemwise whose scalar op is a Composite of the
    original scalar ops.  For mixed dtypes the Composite op does the cast,
    i.e. the cast is left to the C compiler.  The number of dimensions is
    validated at call time by theano itself.

    :param node: the Apply node this local optimizer is tried on.
    :return: ``False`` when nothing is fused (``node`` is not an Elemwise,
        none of its inputs is produced by an Elemwise, or an Elemwise input
        has more than one client); otherwise the list of outputs of the new
        fused Elemwise node.

    TODO: the broadcast flag?
    TODO: implement Composite.__eq__ by using CLinker.cmodule_key() to
        compare the graph.
    TODO: merge when nb_clients > 1?  When could this optimisation introduce
        duplicated computation?  When would it be faster?
    """
    if not isinstance(node.op, T.Elemwise):
        return False

    nb_elemwise = 0
    inputs = []  # inputs of the new Elemwise op.
    s_inputs = []  # inputs of the new scalar op.
    s_g = []  # scalar graph: what will be done in the inner loop.
    for i in node.inputs:
        if i.owner and isinstance(i.owner.op, T.Elemwise):
            if len(i.clients) > 1:
                # An elemwise input with more than one client is an
                # intermediate result needed elsewhere: give up on this node.
                print("local_elemwise_fusion: inputs have more then 1 client. Don't fuse it for now.!")
                return False
            # Inline the producer: its inputs become inputs of the fused op
            # and its scalar op is applied to fresh scalar variables.
            nb_elemwise += 1
            inputs.extend(i.owner.inputs)
            s_input = [scalar.Scalar(x.dtype).make_variable()
                       for x in i.owner.inputs]
            s_inputs.extend(s_input)
            s_g.append(i.owner.op.scalar_op(*s_input))
        else:
            # Not produced by an Elemwise: pass it through unchanged as a
            # plain scalar input of the composite.
            inputs.append(i)
            s = scalar.Scalar(i.dtype).make_variable()
            s_inputs.append(s)
            s_g.append(s)

    # If no input is produced by an elemwise, there is nothing to fuse.
    if nb_elemwise == 0:
        return False

    s_new_out = node.op.scalar_op(*s_g)

    # Create the composite op.
    C = scalar.Composite(s_inputs, [s_new_out])

    # Create the new node.
    n = T.Elemwise(C).make_node(*inputs)
    assert len(n.outputs) == 1
    assert node.outputs[0].dtype == n.outputs[0].dtype
    return n.outputs
#register_specialize(local_elemwise_fusion)
# def make_composite(inputs, outputs):
# scalar_inputs = [scalar.Scalar(dtype = i.type.dtype)() for i in inputs]
# def transform(r):
......
......@@ -14,6 +14,7 @@ import numpy
#import scalar_opt
from theano import function, compile
from nose.plugins.skip import SkipTest
def inputs(xbc = (0, 0), ybc = (0, 0), zbc = (0, 0)):
......@@ -183,9 +184,87 @@ class test_canonize(unittest.TestCase):
def test_elemwise_multiple_inputs_optimisation(self):
"""
verify that the Canonizer merge sequential Elemwise({mul,add})
verify that the Canonizer merge sequential Elemwise({mul,add}) part 1
This part covers the cases that are already handled; cases that are supposed to work but are not implemented yet are not included here.
Test with and without DimShuffle
"""
shp=(5,5)
fx, fy, fz = fmatrices('xyz')
dx, dy, dz = dmatrices('xyz')
fv = fvector('r').dimshuffle('x',0)
dv = dvector('s').dimshuffle('x',0)
fxv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fyv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fzv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
dxv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dyv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dzv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
cases = [
(fx+fy,(fx,fy),(fxv,fyv),1,'float32'),
(fx*fy,(fx,fy),(fxv,fyv),1,'float32'),
# (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (dx+dy+dz,(dx,dy,dz),(dxv,dyv,dzv),1,'float64'),
# (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (dx*dy*dz,(dx,dy,dz),(dxv,dyv,dzv),1,'float64'),
# (fx*fy*(fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
# (dx*dy*(dx+dy+dz),(dx,dy,dz),(dxv,dyv,dzv),2,'float64'),
# (fx*fy*(fx+fy+dz),(fx,fy,dz),(dxv,dyv,dzv),2,'float64'),#check mixed type add
# (dz*fy*(fx+fy),(fx,fy,dz),(dxv,dyv,dzv),2,'float64'),#check mixed type mul
#check with dimshuffle of constant
(fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
(fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (2+fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (2*fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
(2+fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
(2*fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (fx*fy*2*(fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
# (fx*fy*(2+fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
(fx*fy*2*(fx+fy+fz+2),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
#check with broadcast of row
# (fx+fy+fz+fv,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fx*fy*fz*fv,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fv+fx+fy+fz,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fv*fx*fy*fz,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fx*fy*fv*(fx+fy+fz),(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),2,'float32'),
# (fx*fy*(fv+fx+fy+fz),(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),2,'float32'),
# (fx*fy*fv*(fv+fx+fy+fz),(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),2,'float32'),
# (dx+dy+dz+dv,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dx*dy*dz*dv,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dv+dx+dy+dz,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dv*dx*dy*dz,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dx*dy*dv*(dx+dy+dz),(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),2,'float64'),
# (dx*dy*(dv+dx+dy+dz),(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),2,'float64'),
# (dx*dy*dv*(dv+dx+dy+dz),(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),2,'float64'),
]#[10:11]
# print cases
#We must make sure that the Canonizer itself is exercised, without any other
# optimisation (such as local_elemwise_fusion) that could hide a bug in the Canonizer
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
for id, [g, sym_inputs, val_inputs, nb_elemwise, out_dtype] in enumerate(cases):
f = compile.function(list(sym_inputs), g,
#we need the optimisation enabled, debug do this.
mode=mode)
out = f(*val_inputs)
assert(len(f.maker.env.toposort())==nb_elemwise)
assert(out_dtype==out.dtype)
def test_elemwise_multiple_inputs_optimisation2(self):
"""
verify that the Canonizer merge sequential Elemwise({mul,add}) part 2.
This part covers the cases that should be handled but are not implemented yet.
Test with and without DimShuffle
"""
raise SkipTest("Current implementation of Canonizer don't implement all case. Skip the corresponding test")
shp=(5,5)
fx, fy, fz = fmatrices('xyz')
dx, dy, dz = dmatrices('xyz')
......@@ -240,13 +319,20 @@ class test_canonize(unittest.TestCase):
]#[10:11]
# print cases
for id, [g, sym_inputs, val_inputs, expected_out_nb_elemwise, out_dtype] in enumerate(cases):
#We must make sure that the Canonizer itself is exercised, without any other
# optimisation (such as local_elemwise_fusion) that could hide a bug in the Canonizer
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
for id, [g, sym_inputs, val_inputs, nb_elemwise, out_dtype] in enumerate(cases):
f = compile.function(list(sym_inputs), g,
#we need the optimisation enabled, debug do this.
mode=compile.mode.predefined_modes['DEBUG_MODE'])
mode=mode)
out = f(*val_inputs)
assert(len(f.maker.env.toposort())==expected_out_nb_elemwise)
assert(len(f.maker.env.toposort())==nb_elemwise)
assert(out_dtype==out.dtype)
def test_multiple_case(self):
""" test those case take from the comment in Canonizer
x / x -> 1
......@@ -278,8 +364,11 @@ class test_canonize(unittest.TestCase):
dwv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
#we need the optimisation enabled, debug do this.
mode=compile.mode.predefined_modes['DEBUG_MODE']
#We must make sure that the Canonizer itself is exercised, without any other
# optimisation (such as local_elemwise_fusion) that could hide a bug in the Canonizer
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test x / x -> 1
for id, (g, sym_inputs, val_inputs, out_dtype) in enumerate([(fx/fx,[fx],[fxv],'float32'),
......@@ -338,8 +427,7 @@ class test_canonize(unittest.TestCase):
topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Inv)
assert len(topo[0].inputs)==1
assert isinstance(topo[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
assert(out_dtype==out.dtype)
#test (a / b) * (b / c) * (c / d) -> a / d
......@@ -407,6 +495,7 @@ class test_canonize(unittest.TestCase):
def test_multiple_case_that_fail(self):
import theano.tensor, theano.compile
raise SkipTest("Current implementation of Canonizer don't implement all case. Skip the corresponding test")
shp=(4,4)
fx, fy, fz = fmatrices('xyz')
......@@ -418,7 +507,11 @@ class test_canonize(unittest.TestCase):
dyv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
dzv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
mode=compile.mode.predefined_modes['DEBUG_MODE']
#We must make sure that the Canonizer itself is exercised, without any other
# optimisation (such as local_elemwise_fusion) that could hide a bug in the Canonizer
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test fail!
#test x / y / z -> x / (y * z)
......@@ -455,6 +548,11 @@ class test_canonize(unittest.TestCase):
assert len(topo[0].inputs)==1
assert(out_dtype==out.dtype)
def test_dont_merge_if_multiple_client(self):
""" Placeholder for cases taken from the comment in Canonizer.

Always skipped: the test body is not implemented yet.
"""
raise SkipTest("Not implemented")
def test_mixeddiv():
"""Test that int division is preserved"""
i = iscalar()
......@@ -692,8 +790,220 @@ def test_const_type_in_mul_canonizer():
f2(ival, wval, visbval, hidbval, betaval, aval),
f1(ival, wval, visbval, hidbval, betaval, aval))
from theano.compile.sandbox.pfunc import pfunc
from theano.compile.sandbox.sharedvalue import shared
import theano
class test_fusion(unittest.TestCase):
# Correctness and timing checks for fused elemwise graphs.
# NOTE(review): indentation is not visible in this view of the file, so the
# code lines below are reproduced exactly as shown; only comments were added.
def do(self, mode, shared_fn, shp, gpu=False, nb_repeat=1, assert_len_topo=True, slice=None):
"""
Verify that the elemwise fusion works.
Test with and without DimShuffle.

param mode: compilation mode given to compile.function or pfunc.
param shared_fn: constructor of the shared output variable; if None, compile.function is used instead of pfunc.
param shp: shape of the arrays used as input values.
param gpu: if True, GpuFromHost nodes are left out of the node count and there must be one per symbolic input.
param nb_repeat: number of times each compiled function is called.
param assert_len_topo: if True, assert on the number of nodes of the compiled graph.
param slice: optional slice selecting the cases to run.
return: numpy array holding, for each case, the time spent in the calls.
"""
#TODO: disable the canonizer?
# Build a constant-filled array so that the expected answers are deterministic.
def my_init(shp,dtype, num=0):
#ret = numpy.asarray(numpy.random.rand(*shp),dtype=dtype)
ret = numpy.zeros(shp, dtype=dtype)+num
return ret
fw, fx, fy, fz = fmatrices('wxyz')
dw, dx, dy, dz = dmatrices('wxyz')
fv = fvector('r').dimshuffle('x',0)
dv = dvector('s').dimshuffle('x',0)
fwv = my_init(shp,'float32',1)
fxv = my_init(shp,'float32',2)
fyv = my_init(shp,'float32',3)
fzv = my_init(shp,'float32',4)
fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
dwv = my_init(shp,'float64',5)
# dxv = my_init(shp,'float64',6)
# dyv = my_init(shp,'float64',7)
# dzv = my_init(shp,'float64',8)
# dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
fwx=fw+fx
# Each case is: (graph, symbolic inputs, input values, expected number of
# nodes in the compiled graph, expected numerical answer, expected output dtype).
cases = [
(fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#1
(fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),
(fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),
(fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),
(fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#5
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#10
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'),
(fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'),
(fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'),
(fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),#15
(fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),
#test with constant
((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20
(fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
#mix float32 and float64
(2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'),
(2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'),
(2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),#25
(2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'),
#test when their is other op then elemwise.
#the good output for the next test.
# (Pdb) p f.maker.env.toposort()
#[Elemwise{add,no_inplace}(w, x), Sum(Elemwise{add,no_inplace}.0), InplaceDimShuffle{x,x}(Sum.0), Elemwise{Composite{_impls=[<function <lambda> at 0x2c5c8c0>], nin=4, _c_code={
#npy_float32 V%(id)s_tmp1;
#V%(id)s_tmp1 = %(i2)s + %(i3)s;
#npy_float32 V%(id)s_tmp2;
#V%(id)s_tmp2 = %(i0)s + %(i1)s;
#%(o0)s = V%(id)s_tmp2 + V%(id)s_tmp1;
#}
#, nout=1, env=[add(add(<float32>, <float32>), add(<float32>, <float32>))]}}(InplaceDimShuffle{x,x}.0, Elemwise{add,no_inplace}.0, y, z)]
((fwx.sum())+(fwx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),4,(fwv+fxv).sum()+fwv+fxv+fyv+fzv,'float32'),
#test other elemwise op
(fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'),
(fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'),
(fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),#30
(fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'),
(fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'),
(fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'),
(fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow
(fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'),#35
(fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'),
(fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
# (fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'),#TODO: c_code not implemented for %
(fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'),
(fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),
(fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'),
(fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),
# (fx-(fy==fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),#TODO: bugged
(fx-(fy!=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'),
(fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),
(fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'),
(fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'),
(fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'),
(fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'),
(fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),
(fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'),
(fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'),
# (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this?
#TODO: BIT OP only with ints, xor, or, and, invert
# (fx-theano.tensor.or_(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy|fz),'float32'),
# (fx-theano.tensor.xor(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy^fz),'float32'),
]
# Optionally restrict the run to a subset of the cases.
if slice:
cases = cases[slice]
import time
times=numpy.zeros(len(cases))
for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
print "new cases", id
# Without a shared constructor the graph is compiled with compile.function
# and its return value is checked directly.
if shared_fn == None:
assert gpu==False
f = compile.function(list(sym_inputs), g,mode=mode)
#pre-call to have the data in cache if it fit to don't penalise the first iteration
# if id==0:
# out=f(*val_inputs)
t0=time.time()
for x in range(nb_repeat):
out=f(*val_inputs)
t1=time.time()
# The returned value is not accumulated across calls, so the expected
# answer below must not be scaled by the number of repetitions.
nb_repeat=1
else:
# With a shared constructor the result is accumulated into a shared
# variable through an update, then read back after the calls.
out=shared_fn(numpy.zeros(shp, dtype=out_dtype),'out')
f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
#pre-call to have the data in cache if it fit to don't penalise the first iteration
# if id==0:
# f(*val_inputs)
t0=time.time()
for x in range(nb_repeat):
f(*val_inputs)
t1=time.time()
out=out.value
# if id==0:
# nb_repeat+=1
times[id]=t1-t0
assert numpy.allclose(out,answer*nb_repeat,atol=1e-6 if out_dtype=='float32' else 1e-8)
topo=f.maker.env.toposort()
if gpu:
import theano_cuda_ndarray as tcn
# Leave the GpuFromHost nodes out of the count; one is expected per input.
topo_ = [x for x in topo if not isinstance(x.op,tcn.basic_ops.GpuFromHost)]
gpu_ = [x for x in topo if isinstance(x.op,tcn.basic_ops.GpuFromHost)]
assert len(gpu_)==len(sym_inputs)
else: topo_=topo
if assert_len_topo:
assert(len(topo_)==nb_elemwise)
assert(out_dtype==out.dtype)
print "Executed",len(cases),"cases"
return times
def test_elemwise_fusion(self):
# Skipped unconditionally: the statements after the raise are not executed.
raise SkipTest("Current implementation of test_fusion is not enabled. So we skip the corresponding test")
shp=(5,5)
#we need the optimisation enabled, debug do this.
# Of the three assignments below only the last one (DEBUG_MODE) takes effect.
mode=compile.mode.predefined_modes['FAST_COMPILE']
mode=compile.mode.predefined_modes['FAST_RUN']
mode=compile.mode.predefined_modes['DEBUG_MODE']
self.do(mode, shared, shp)
# The name does not start with "test"; presumably meant to be run by hand
# because it needs theano_cuda_ndarray -- TODO confirm.
def gpu_fusion(self):
shp=(5,5)
#we need the optimisation enabled, debug do this.
# Of the three assignments below only the last one (DEBUG_MODE) takes effect.
mode=compile.mode.predefined_modes['FAST_COMPILE']
mode=compile.mode.predefined_modes['FAST_RUN']
mode=compile.mode.predefined_modes['DEBUG_MODE']
import theano_cuda_ndarray as tcn
self.do(mode, tcn.shared_constructor, shp, gpu=True)
def speed_fusion(self, shared_fn = shared, gpu = False, s=None):
"""
Time the cases of do() with and without local_elemwise_fusion and print the timings.

param shared_fn: shared variable constructor forwarded to do().
param gpu: forwarded to do().
param type s: a slice object
param s: a slice to apply to the case to execute. If None, exec all case.
"""
import copy
shp=(3000,3000)
#mode1=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
# Only the second assignment (OpWiseCLinker) takes effect.
linker=gof.CLinker
linker=gof.OpWiseCLinker
mode1=compile.Mode(linker(), copy.copy(compile.mode.OPT_FAST_RUN))
#TODO:clinker is much faster... but use to much memory
#Possible cause: as their is do deletion of intermediate value when we don't keep the fct.
#More plausible cause: we keep a link to the output data?
#Follow up. Clinker do the same... second cause?
mode2=compile.Mode(linker(), copy.copy(compile.mode.OPT_FAST_RUN))
# mode2=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
# mode2 is the same as mode1 except that local_elemwise_fusion is excluded.
mode2._optimizer=mode2._optimizer.excluding('local_elemwise_fusion')
# mode2=compile.Mode(gof.OpWiseCLinker(allow_gc=True), compile.mode.OPT_FAST_COMPILE)
if s is None:
s=slice(0,49)
#s=slice(49,59)
nb_repeat=10
print "test with linker", str(linker)
times1=self.do(mode1, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
times2=self.do(mode2, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
print "times1 FAST_RUN optimisation"
print times1, times1.min(), times1.max(), times1.sum()
print "times2 FAST_RUN optimisation without local_elemwise_fusion"
print times2, times2.min(), times2.max(), times2.sum()
# Per-case ratio of the time without fusion to the time with fusion.
d=times2/times1
# d.sort()
print "times2/times1",d,d.min(), d.max(), d.mean(), d.std()
# Same timing run with the theano_cuda_ndarray shared constructor, on the first 15 cases.
def speed_fusion_gpu(self):
import theano_cuda_ndarray as tcn
self.speed_fusion(shared_fn=tcn.shared_constructor, gpu=True, s=slice(0,15))
if __name__ == '__main__':
unittest.main()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论