Commit 9adfd678 authored by James Bergstra

merge

from .sharedvalue import shared, shared_constructor from theano.compile.sandbox.sharedvalue import shared, shared_constructor
from .pfunc import pfunc from theano.compile.sandbox.pfunc import pfunc
...@@ -792,14 +792,24 @@ class CLinker(link.Linker): ...@@ -792,14 +792,24 @@ class CLinker(link.Linker):
function raises a KeyError exception. function raises a KeyError exception.
""" """
order = list(self.env.toposort()) return self.cmodule_key_(self.env, self.no_recycling,
env_inputs_dict = dict((i, [-1, pos]) for pos, i in enumerate(self.env.inputs)) compile_args=self.compile_args(),
libraries=self.libraries()
)
@staticmethod
def cmodule_key_(env, no_recycling, compile_args=None, libraries=None):
"""
Do the actual computation of cmodule_key in a static method
to allow it to be reused in scalar.Composite.__eq__
"""
order = list(env.toposort())
env_computed_set = set() env_computed_set = set()
env_inputs_dict = dict((i, [-1, pos]) for pos, i in enumerate(env.inputs))
constant_ids = dict() constant_ids = dict()
op_pos = {} # Apply -> topological position op_pos = {} # Apply -> topological position
rval = ['CLinker.cmodule_key'] # will be cast to tuple on return rval = ['CLinker.cmodule_key'] # will be cast to tuple on return
rval.append(tuple(self.compile_args())) if compile_args is not None: rval.append(tuple(compile_args))
rval.append(tuple(self.libraries())) if libraries is not None: rval.append(tuple(libraries))
version = [] version = []
# assert that every input to every node is one of' # assert that every input to every node is one of'
...@@ -822,16 +832,16 @@ class CLinker(link.Linker): ...@@ -822,16 +832,16 @@ class CLinker(link.Linker):
else: else:
if i.owner is None: if i.owner is None:
assert all( all(out is not None for out in o.outputs) for o in order) assert all( all(out is not None for out in o.outputs) for o in order)
assert all( input.owner is None for input in self.env.inputs) assert all( input.owner is None for input in env.inputs)
raise Exception('what is this?', (i, type(i), i.clients, self.env)) raise Exception('what is this?', (i, type(i), i.clients, env))
if i in self.env.outputs: if i in env.outputs:
rval += [op_pos[i.owner], # outputs rval += [op_pos[i.owner], # outputs
i.owner.outputs.index(i), i.owner.outputs.index(i),
self.env.outputs.index(i)] env.outputs.index(i)]
else: else:
rval += [op_pos[i.owner], i.owner.outputs.index(i)] # temps rval += [op_pos[i.owner], i.owner.outputs.index(i)] # temps
assert rval assert rval
rval.append(i in self.no_recycling) rval.append(i in no_recycling)
return tuple(rval) return tuple(rval)
for node_pos, node in enumerate(order): for node_pos, node in enumerate(order):
......
...@@ -386,7 +386,7 @@ class ModuleCache(object): ...@@ -386,7 +386,7 @@ class ModuleCache(object):
try: try:
module = fn(location=location) # WILL FAIL FOR BAD C CODE module = fn(location=location) # WILL FAIL FOR BAD C CODE
except Exception, e: except Exception, e:
shutil.rmtree(location) _rmtree(location)
#try: #try:
#except Exception, ee: #except Exception, ee:
#error('failed to cleanup location', location, ee) #error('failed to cleanup location', location, ee)
...@@ -515,7 +515,8 @@ class ModuleCache(object): ...@@ -515,7 +515,8 @@ class ModuleCache(object):
def _rmtree(parent): def _rmtree(parent):
try: try:
shutil.rmtree(parent) if not os.getenv('THEANO_NOCLEANUP',0):
shutil.rmtree(parent)
except Exception, e: except Exception, e:
try: try:
# mark this directory for deletion by a future refresh() # mark this directory for deletion by a future refresh()
......
from .. import gof from theano import gof
import sys import sys
......
...@@ -348,6 +348,9 @@ def int_out(*types): ...@@ -348,6 +348,9 @@ def int_out(*types):
def float_out(*types): def float_out(*types):
return float64, return float64,
def upgrade_to_float(*types): def upgrade_to_float(*types):
"""
This upgrade the types to float32 or float64 to don't loose any precision.
"""
conv = {int8: float32, conv = {int8: float32,
int16: float32, int16: float32,
int32: float64, int32: float64,
...@@ -370,8 +373,8 @@ class ScalarOp(Op): ...@@ -370,8 +373,8 @@ class ScalarOp(Op):
def make_node(self, *inputs): def make_node(self, *inputs):
if self.nin >= 0: if self.nin >= 0:
if len(inputs) != self.nin: if len(inputs) != self.nin:
raise TypeError("Wrong number of inputs for %s.make_node (got %i, expected %i)" \ raise TypeError("Wrong number of inputs for %s.make_node (got %i(%s), expected %i)" \
% (self, len(inputs), self.nin)) % (self, len(inputs), str(inputs), self.nin))
inputs = [as_scalar(input) for input in inputs] inputs = [as_scalar(input) for input in inputs]
outputs = [t() for t in self.output_types([input.type for input in inputs])] outputs = [t() for t in self.output_types([input.type for input in inputs])]
if len(outputs) != self.nout: if len(outputs) != self.nout:
...@@ -977,6 +980,7 @@ class Inv(UnaryScalarOp): ...@@ -977,6 +980,7 @@ class Inv(UnaryScalarOp):
inv = Inv(upgrade_to_float, name = 'inv') inv = Inv(upgrade_to_float, name = 'inv')
class Log(UnaryScalarOp): class Log(UnaryScalarOp):
""" log base e """
def impl(self, x): def impl(self, x):
return math.log(x) return math.log(x)
def grad(self, (x, ), (gz, )): def grad(self, (x, ), (gz, )):
...@@ -994,6 +998,7 @@ class Log(UnaryScalarOp): ...@@ -994,6 +998,7 @@ class Log(UnaryScalarOp):
log = Log(upgrade_to_float, name = 'log') log = Log(upgrade_to_float, name = 'log')
class Log2(UnaryScalarOp): class Log2(UnaryScalarOp):
""" log base 2 """
def impl(self, x): def impl(self, x):
return numpy.log2(x) return numpy.log2(x)
def grad(self, (x, ), (gz, )): def grad(self, (x, ), (gz, )):
...@@ -1009,6 +1014,7 @@ class Log2(UnaryScalarOp): ...@@ -1009,6 +1014,7 @@ class Log2(UnaryScalarOp):
log2 = Log2(upgrade_to_float, name = 'log2') log2 = Log2(upgrade_to_float, name = 'log2')
class Log10(UnaryScalarOp): class Log10(UnaryScalarOp):
""" log base 10 """
def impl(self, x): def impl(self, x):
return numpy.log10(x) return numpy.log10(x)
def grad(self, (x, ), (gz, )): def grad(self, (x, ), (gz, )):
...@@ -1170,6 +1176,14 @@ class Composite(ScalarOp): ...@@ -1170,6 +1176,14 @@ class Composite(ScalarOp):
implement the loop fusion optimizer (which I have yet to do implement the loop fusion optimizer (which I have yet to do
someday...) someday...)
""" """
def __str__(self):
if hasattr(self, 'name') and self.name:
return self.name
else:
return "%s{%s}" % (self.__class__.__name__, ", ".join(
"%s=%s" % (k, v) for k, v in self.__dict__.items()
if k not in ["name","env","_c_code"] ))
def __init__(self, inputs, outputs): def __init__(self, inputs, outputs):
env = Env(*gof.graph.clone(inputs, outputs)) env = Env(*gof.graph.clone(inputs, outputs))
gof.MergeOptimizer().optimize(env) gof.MergeOptimizer().optimize(env)
...@@ -1233,12 +1247,15 @@ class Composite(ScalarOp): ...@@ -1233,12 +1247,15 @@ class Composite(ScalarOp):
self.nin = len(inputs) self.nin = len(inputs)
self.nout = len(outputs) self.nout = len(outputs)
self.env = env self.env = env
self.inputs_type = tuple([input.type for input in self.env.inputs])
self.outputs_type = tuple([output.type for output in self.env.outputs])
self._rehash()
def output_types(self, input_types): def output_types(self, input_types):
if tuple(input_types) != tuple([input.type for input in self.env.inputs]): if tuple(input_types) != self.inputs_type:
raise TypeError("Wrong types for Composite. Expected %s, got %s." raise TypeError("Wrong types for Composite. Expected %s, got %s."
% (tuple([input.type for input in self.env.inputs]), tuple(input_types))) % (self.inputs_type, tuple(input_types)))
return [output.type for output in self.env.outputs] return self.outputs_type
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
for storage, impl in zip(output_storage, self._impls): for storage, impl in zip(output_storage, self._impls):
...@@ -1259,10 +1276,36 @@ class Composite(ScalarOp): ...@@ -1259,10 +1276,36 @@ class Composite(ScalarOp):
onames), onames),
**sub) **sub)
d['name'] = name d['name'] = name
if not sub.has_key('id'):
#The use of a dummy id is safe as the code is in a separate block.
#It won't generate conflicting variable name.
d['id']='_DUMMY_ID_'
return self._c_code % d return self._c_code % d
def __eq__(self, other): def __eq__(self, other):
return self is other if self is other: return True
if not isinstance(other, self.__class__): return False
if self.nin!=other.nin or self.nout != other.nout: return False
return self._hashval == other._hashval
return self._cmodule_key == other._cmodule_key
def _rehash(self):
#TODO: What no_recycling is used for? What I need to put their?
# no_recycling = []
self._cmodule_key = gof.CLinker.cmodule_key_(self.env, [])
self._hashval = hash(self._cmodule_key)
def __hash__(self): def __hash__(self):
return id(self) return self._hashval
# def __getstate__(self):
# d = copy(self.__dict__)
# d.pop('env')
# d.pop('_impls')
# #TODO: the self._impls must be restored to allow the perform to work.(c version continue to work.
# return d
# def __setstate__(self, d):
# self.__dict__.update(d)
# #TODO: how to restore the _impls?
...@@ -1227,6 +1227,68 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot') ...@@ -1227,6 +1227,68 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot')
# # Loop fusion # # # Loop fusion #
# ############### # ###############
@gof.local_optimizer([T.Elemwise, T.Elemwise])  # NOTE(review): duplicated tracker entry looks like a typo -- confirm intended pattern
def local_elemwise_fusion(node):
    """
    Fuse consecutive Elemwise ops of the same shape into a single Elemwise
    whose scalar op is a scalar.Composite of the fused scalar graphs.

    For mixed dtypes, the Composite op (i.e. the C compiler) performs the
    cast.  The number of dimensions is validated at call time by Theano
    itself.

    TODO: handle the broadcast flag?
    """
    # TODO: merge when nb_clients > 1?  Fusing then could duplicate
    # computation; when would it still be faster?
    if not isinstance(node.op, T.Elemwise):
        return False
    nb_elemwise = 0  # number of Elemwise inputs fused in
    inputs = []      # inputs of the new Elemwise op
    s_inputs = []    # inputs of the new scalar op
    s_g = []         # scalar graph: what will be done in the inner loop
    for i in node.inputs:
        if i.owner and isinstance(i.owner.op, T.Elemwise) and len(i.clients) <= 1:
            # Intermediate Elemwise with a single client: absorb it.
            # (A dead `if len(i.clients) > 1` branch that could never fire
            # under this guard was removed.)
            nb_elemwise += 1
            inputs.extend(i.owner.inputs)
            s_input = [scalar.Scalar(x.dtype).make_variable()
                       for x in i.owner.inputs]
            s_inputs.extend(s_input)
            s_g.append(i.owner.op.scalar_op(*s_input))
        else:
            if i.owner and isinstance(i.owner.op, T.Elemwise) and len(i.clients) > 1:
                # An intermediate result with several clients: fusing would
                # duplicate its computation for the other clients.
                print("local_elemwise_fusion: inputs have more then 1 client. Don't fuse it for now.!")
                return False
            inputs.append(i)
            s = scalar.Scalar(i.dtype).make_variable()
            s_inputs.append(s)
            s_g.append(s)
    # If no input is an Elemwise, there is nothing to fuse.
    if nb_elemwise == 0:
        return False
    s_new_out = node.op.scalar_op(*s_g)
    # Create the Composite scalar op, then the replacement Elemwise node.
    C = scalar.Composite(s_inputs, [s_new_out])
    n = T.Elemwise(C).make_node(*inputs)
    assert len(n.outputs) == 1
    assert node.outputs[0].dtype == n.outputs[0].dtype
    return n.outputs
# def make_composite(inputs, outputs): # def make_composite(inputs, outputs):
# scalar_inputs = [scalar.Scalar(dtype = i.type.dtype)() for i in inputs] # scalar_inputs = [scalar.Scalar(dtype = i.type.dtype)() for i in inputs]
# def transform(r): # def transform(r):
......
...@@ -14,6 +14,7 @@ import numpy ...@@ -14,6 +14,7 @@ import numpy
#import scalar_opt #import scalar_opt
from theano import function, compile from theano import function, compile
from nose.plugins.skip import SkipTest
def inputs(xbc = (0, 0), ybc = (0, 0), zbc = (0, 0)): def inputs(xbc = (0, 0), ybc = (0, 0), zbc = (0, 0)):
...@@ -183,9 +184,87 @@ class test_canonize(unittest.TestCase): ...@@ -183,9 +184,87 @@ class test_canonize(unittest.TestCase):
def test_elemwise_multiple_inputs_optimisation(self): def test_elemwise_multiple_inputs_optimisation(self):
""" """
verify that the Canonizer merge sequential Elemwise({mul,add}) verify that the Canonizer merge sequential Elemwise({mul,add}) part 1
This part are that case that is done, but don't include case that are not implemented but are suposed to be.
Test with and without DimShuffle Test with and without DimShuffle
""" """
shp=(5,5)
fx, fy, fz = fmatrices('xyz')
dx, dy, dz = dmatrices('xyz')
fv = fvector('r').dimshuffle('x',0)
dv = dvector('s').dimshuffle('x',0)
fxv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fyv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fzv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
dxv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dyv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dzv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
cases = [
(fx+fy,(fx,fy),(fxv,fyv),1,'float32'),
(fx*fy,(fx,fy),(fxv,fyv),1,'float32'),
# (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (dx+dy+dz,(dx,dy,dz),(dxv,dyv,dzv),1,'float64'),
# (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (dx*dy*dz,(dx,dy,dz),(dxv,dyv,dzv),1,'float64'),
# (fx*fy*(fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
# (dx*dy*(dx+dy+dz),(dx,dy,dz),(dxv,dyv,dzv),2,'float64'),
# (fx*fy*(fx+fy+dz),(fx,fy,dz),(dxv,dyv,dzv),2,'float64'),#check mixed type add
# (dz*fy*(fx+fy),(fx,fy,dz),(dxv,dyv,dzv),2,'float64'),#check mixed type mul
#check with dimshuffle of constant
(fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
(fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (2+fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (2*fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
(2+fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
(2*fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
# (fx*fy*2*(fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
# (fx*fy*(2+fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
(fx*fy*2*(fx+fy+fz+2),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
#check with broadcast of row
# (fx+fy+fz+fv,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fx*fy*fz*fv,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fv+fx+fy+fz,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fv*fx*fy*fz,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
# (fx*fy*fv*(fx+fy+fz),(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),2,'float32'),
# (fx*fy*(fv+fx+fy+fz),(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),2,'float32'),
# (fx*fy*fv*(fv+fx+fy+fz),(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),2,'float32'),
# (dx+dy+dz+dv,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dx*dy*dz*dv,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dv+dx+dy+dz,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dv*dx*dy*dz,(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),1,'float64'),
# (dx*dy*dv*(dx+dy+dz),(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),2,'float64'),
# (dx*dy*(dv+dx+dy+dz),(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),2,'float64'),
# (dx*dy*dv*(dv+dx+dy+dz),(dx,dy,dz,dv),(dxv,dyv,dzv,dvv),2,'float64'),
]#[10:11]
# print cases
#We must be sure that the Canonizer is working, but that we don't have other
# optimisation that could hide bug in the Canonizer as local_elemwise_fusion
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
for id, [g, sym_inputs, val_inputs, nb_elemwise, out_dtype] in enumerate(cases):
f = compile.function(list(sym_inputs), g,
#we need the optimisation enabled, debug do this.
mode=mode)
out = f(*val_inputs)
assert(len(f.maker.env.toposort())==nb_elemwise)
assert(out_dtype==out.dtype)
def test_elemwise_multiple_inputs_optimisation2(self):
"""
verify that the Canonizer merge sequential Elemwise({mul,add}) part 2.
This part are that case that should have been done, but that are not implemented.
Test with and without DimShuffle
"""
raise SkipTest("Current implementation of Canonizer don't implement all case. Skip the corresponding test")
shp=(5,5) shp=(5,5)
fx, fy, fz = fmatrices('xyz') fx, fy, fz = fmatrices('xyz')
dx, dy, dz = dmatrices('xyz') dx, dy, dz = dmatrices('xyz')
...@@ -240,13 +319,20 @@ class test_canonize(unittest.TestCase): ...@@ -240,13 +319,20 @@ class test_canonize(unittest.TestCase):
]#[10:11] ]#[10:11]
# print cases # print cases
for id, [g, sym_inputs, val_inputs, expected_out_nb_elemwise, out_dtype] in enumerate(cases): #We must be sure that the Canonizer is working, but that we don't have other
# optimisation that could hide bug in the Canonizer as local_elemwise_fusion
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
for id, [g, sym_inputs, val_inputs, nb_elemwise, out_dtype] in enumerate(cases):
f = compile.function(list(sym_inputs), g, f = compile.function(list(sym_inputs), g,
#we need the optimisation enabled, debug do this. #we need the optimisation enabled, debug do this.
mode=compile.mode.predefined_modes['DEBUG_MODE']) mode=mode)
out = f(*val_inputs) out = f(*val_inputs)
assert(len(f.maker.env.toposort())==expected_out_nb_elemwise) assert(len(f.maker.env.toposort())==nb_elemwise)
assert(out_dtype==out.dtype) assert(out_dtype==out.dtype)
def test_multiple_case(self): def test_multiple_case(self):
""" test those case take from the comment in Canonizer """ test those case take from the comment in Canonizer
x / x -> 1 x / x -> 1
...@@ -278,8 +364,11 @@ class test_canonize(unittest.TestCase): ...@@ -278,8 +364,11 @@ class test_canonize(unittest.TestCase):
dwv = numpy.asarray(numpy.random.rand(*shp),dtype='float64') dwv = numpy.asarray(numpy.random.rand(*shp),dtype='float64')
dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0]) dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
#we need the optimisation enabled, debug do this. #We must be sure that the Canonizer is working, but that we don't have other
mode=compile.mode.predefined_modes['DEBUG_MODE'] # optimisation that could hide bug in the Canonizer as local_elemwise_fusion
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test x / x -> 1 #test x / x -> 1
for id, (g, sym_inputs, val_inputs, out_dtype) in enumerate([(fx/fx,[fx],[fxv],'float32'), for id, (g, sym_inputs, val_inputs, out_dtype) in enumerate([(fx/fx,[fx],[fxv],'float32'),
...@@ -338,8 +427,7 @@ class test_canonize(unittest.TestCase): ...@@ -338,8 +427,7 @@ class test_canonize(unittest.TestCase):
topo=f.maker.env.toposort() topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise assert len(topo)==nb_elemwise
assert isinstance(topo[0].op,(T.Elemwise,)) assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Inv) assert isinstance(topo[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
assert len(topo[0].inputs)==1
assert(out_dtype==out.dtype) assert(out_dtype==out.dtype)
#test (a / b) * (b / c) * (c / d) -> a / d #test (a / b) * (b / c) * (c / d) -> a / d
...@@ -407,6 +495,7 @@ class test_canonize(unittest.TestCase): ...@@ -407,6 +495,7 @@ class test_canonize(unittest.TestCase):
def test_multiple_case_that_fail(self): def test_multiple_case_that_fail(self):
import theano.tensor, theano.compile import theano.tensor, theano.compile
raise SkipTest("Current implementation of Canonizer don't implement all case. Skip the corresponding test")
shp=(4,4) shp=(4,4)
fx, fy, fz = fmatrices('xyz') fx, fy, fz = fmatrices('xyz')
...@@ -418,7 +507,11 @@ class test_canonize(unittest.TestCase): ...@@ -418,7 +507,11 @@ class test_canonize(unittest.TestCase):
dyv = numpy.asarray(numpy.random.rand(*shp),dtype='float32') dyv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
dzv = numpy.asarray(numpy.random.rand(*shp),dtype='float32') dzv = numpy.asarray(numpy.random.rand(*shp),dtype='float32')
fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0]) fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
mode=compile.mode.predefined_modes['DEBUG_MODE'] #We must be sure that the Canonizer is working, but that we don't have other
# optimisation that could hide bug in the Canonizer as local_elemwise_fusion
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test fail! #test fail!
#test x / y / z -> x / (y * z) #test x / y / z -> x / (y * z)
...@@ -455,6 +548,11 @@ class test_canonize(unittest.TestCase): ...@@ -455,6 +548,11 @@ class test_canonize(unittest.TestCase):
assert len(topo[0].inputs)==1 assert len(topo[0].inputs)==1
assert(out_dtype==out.dtype) assert(out_dtype==out.dtype)
    def test_dont_merge_if_multiple_client(self):
        """Cases taken from the comment in Canonizer: an intermediate result
        with multiple clients must not be merged.
        """
        raise SkipTest("Not implemented")
def test_mixeddiv(): def test_mixeddiv():
"""Test that int division is preserved""" """Test that int division is preserved"""
i = iscalar() i = iscalar()
...@@ -692,8 +790,220 @@ def test_const_type_in_mul_canonizer(): ...@@ -692,8 +790,220 @@ def test_const_type_in_mul_canonizer():
f2(ival, wval, visbval, hidbval, betaval, aval), f2(ival, wval, visbval, hidbval, betaval, aval),
f1(ival, wval, visbval, hidbval, betaval, aval)) f1(ival, wval, visbval, hidbval, betaval, aval))
from theano.compile.sandbox.pfunc import pfunc
from theano.compile.sandbox.sharedvalue import shared
import theano
class test_fusion(unittest.TestCase):
    # Tests for the local_elemwise_fusion optimisation: consecutive Elemwise
    # ops should be collapsed into one Elemwise with a Composite scalar op.
    def do(self, mode, shared_fn, shp, gpu=False, nb_repeat=1, assert_len_topo=True, slice=None):
        """
        Verify that the elemwise fusion works; tested with and without
        DimShuffle.

        :param mode: compilation mode used for every case.
        :param shared_fn: if None, will use compile.function; otherwise a
            shared-variable constructor, and the result is accumulated via
            an update (out = out + g) through pfunc.
        :param shp: shape of the matrix test values.
        :param gpu: if True, expect one GpuFromHost node per symbolic input
            in the compiled graph.
        :param nb_repeat: number of calls per compiled function (for the
            speed tests; forced to 1 on the compile.function path).
        :param assert_len_topo: if True, check the node count of the
            optimised graph against each case's expected count.
        :param slice: a slice applied to the case list; None runs all cases.
        :return: numpy array with the execution time of each case.
        """
        #TODO: disable the canonizer?
        def my_init(shp, dtype, num=0):
            # Deterministic fill (zeros + num) so expected answers are exact.
            #ret = numpy.asarray(numpy.random.rand(*shp),dtype=dtype)
            ret = numpy.zeros(shp, dtype=dtype)+num
            return ret
        fw, fx, fy, fz = fmatrices('wxyz')
        dw, dx, dy, dz = dmatrices('wxyz')
        fv = fvector('r').dimshuffle('x',0)
        dv = dvector('s').dimshuffle('x',0)
        fwv = my_init(shp,'float32',1)
        fxv = my_init(shp,'float32',2)
        fyv = my_init(shp,'float32',3)
        fzv = my_init(shp,'float32',4)
        fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
        dwv = my_init(shp,'float64',5)
#        dxv = my_init(shp,'float64',6)
#        dyv = my_init(shp,'float64',7)
#        dzv = my_init(shp,'float64',8)
#        dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
        fwx = fw+fx
        # Each case: (graph, symbolic inputs, input values, expected node
        # count after fusion, expected numeric answer, expected out dtype).
        cases = [
            (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#1
            (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),
            (fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),
            (fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),
            (fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#5
            ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
            (((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
            ((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
            ((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
            (fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#10
            ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
            (fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'),
            (fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'),
            (fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'),
            (fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),#15
            (fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),
            #test with constant
            ((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
            (((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
            ((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
            ((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20
            (fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
            (2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
            #mix float32 and float64
            (2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'),
            (2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'),
            (2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),#25
            (2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'),
            #test when their is other op then elemwise.
            #the good output for the next test.
            # (Pdb) p f.maker.env.toposort()
            #[Elemwise{add,no_inplace}(w, x), Sum(Elemwise{add,no_inplace}.0), InplaceDimShuffle{x,x}(Sum.0), Elemwise{Composite{_impls=[<function <lambda> at 0x2c5c8c0>], nin=4, _c_code={
            #npy_float32 V%(id)s_tmp1;
            #V%(id)s_tmp1 = %(i2)s + %(i3)s;
            #npy_float32 V%(id)s_tmp2;
            #V%(id)s_tmp2 = %(i0)s + %(i1)s;
            #%(o0)s = V%(id)s_tmp2 + V%(id)s_tmp1;
            #}
            #, nout=1, env=[add(add(<float32>, <float32>), add(<float32>, <float32>))]}}(InplaceDimShuffle{x,x}.0, Elemwise{add,no_inplace}.0, y, z)]
            ((fwx.sum())+(fwx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),4,(fwv+fxv).sum()+fwv+fxv+fyv+fzv,'float32'),
            #test other elemwise op
            (fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'),
            (fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'),
            (fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),#30
            (fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'),
            (fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'),
            (fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'),
            (fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow
            (fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'),#35
            (fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'),
            (fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
#            (fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'),#TODO: c_code not implemented for %
            (fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'),
            (fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),
            (fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'),
            (fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),
#            (fx-(fy==fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),#TODO: bugged
            (fx-(fy!=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'),
            (fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),
            (fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'),
            (fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'),
            (fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'),
            (fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'),
            (fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),
            (fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'),
            (fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'),
#            (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this?
            #TODO: BIT OP only with ints, xor, or, and, invert
#            (fx-theano.tensor.or_(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy|fz),'float32'),
#            (fx-theano.tensor.xor(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy^fz),'float32'),
            ]
        if slice:
            cases = cases[slice]
        import time
        times = numpy.zeros(len(cases))
        for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
            print "new cases", id
            if shared_fn == None:
                # Plain function path: compute g directly; no GPU expected.
                assert gpu==False
                f = compile.function(list(sym_inputs), g,mode=mode)
                #pre-call to have the data in cache if it fit to don't penalise the first iteration
#                if id==0:
#                    out=f(*val_inputs)
                t0=time.time()
                for x in range(nb_repeat):
                    out=f(*val_inputs)
                t1=time.time()
                nb_repeat=1
            else:
                # Shared-variable path: accumulate g into `out` via updates,
                # so after nb_repeat calls out == answer * nb_repeat.
                out=shared_fn(numpy.zeros(shp, dtype=out_dtype),'out')
                f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
                #pre-call to have the data in cache if it fit to don't penalise the first iteration
#                if id==0:
#                    f(*val_inputs)
                t0=time.time()
                for x in range(nb_repeat):
                    f(*val_inputs)
                t1=time.time()
                out=out.value
#                if id==0:
#                    nb_repeat+=1
            times[id]=t1-t0
            # Loose tolerance for float32 since fused/unfused rounding differs.
            assert numpy.allclose(out,answer*nb_repeat,atol=1e-6 if out_dtype=='float32' else 1e-8)
            topo=f.maker.env.toposort()
            if gpu:
                # Ignore the host->GPU transfer nodes when counting, but
                # check there is exactly one per symbolic input.
                import theano_cuda_ndarray as tcn
                topo_ = [x for x in topo if not isinstance(x.op,tcn.basic_ops.GpuFromHost)]
                gpu_ = [x for x in topo if isinstance(x.op,tcn.basic_ops.GpuFromHost)]
                assert len(gpu_)==len(sym_inputs)
            else: topo_=topo
            if assert_len_topo:
                assert(len(topo_)==nb_elemwise)
            assert(out_dtype==out.dtype)
        print "Executed",len(cases),"cases"
        return times

    def test_elemwise_fusion(self):
        # Runs do() in DEBUG_MODE (the last assignment wins below).
        raise SkipTest("Current implementation of test_fusion is not enabled. So we skip the corresponding test")
        shp=(5,5)
        #we need the optimisation enabled, debug do this.
        mode=compile.mode.predefined_modes['FAST_COMPILE']
        mode=compile.mode.predefined_modes['FAST_RUN']
        mode=compile.mode.predefined_modes['DEBUG_MODE']
        self.do(mode, shared, shp)

    def gpu_fusion(self):
        # Not named test_*: run manually with a CUDA-enabled install.
        shp=(5,5)
        #we need the optimisation enabled, debug do this.
        mode=compile.mode.predefined_modes['FAST_COMPILE']
        mode=compile.mode.predefined_modes['FAST_RUN']
        mode=compile.mode.predefined_modes['DEBUG_MODE']
        import theano_cuda_ndarray as tcn
        self.do(mode, tcn.shared_constructor, shp, gpu=True)

    def speed_fusion(self, shared_fn = shared, gpu = False, s=None):
        """
        Benchmark FAST_RUN with and without local_elemwise_fusion.

        :type s: a slice object
        :param s: a slice to apply to the case to execute. If None, exec all case.
        """
        import copy
        shp=(3000,3000)
        #mode1=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
        linker=gof.CLinker
        linker=gof.OpWiseCLinker
        mode1=compile.Mode(linker(), copy.copy(compile.mode.OPT_FAST_RUN))
        #TODO:clinker is much faster... but use to much memory
        #Possible cause: as their is do deletion of intermediate value when we don't keep the fct.
        #More plausible cause: we keep a link to the output data?
        #Follow up. Clinker do the same... second cause?
        mode2=compile.Mode(linker(), copy.copy(compile.mode.OPT_FAST_RUN))
#        mode2=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
        mode2._optimizer=mode2._optimizer.excluding('local_elemwise_fusion')
#        mode2=compile.Mode(gof.OpWiseCLinker(allow_gc=True), compile.mode.OPT_FAST_COMPILE)
        if s is None:
            s=slice(0,49)
        #s=slice(49,59)
        nb_repeat=10
        print "test with linker", str(linker)
        times1=self.do(mode1, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
        times2=self.do(mode2, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
        print "times1 FAST_RUN optimisation"
        print times1, times1.min(), times1.max(), times1.sum()
        print "times2 FAST_RUN optimisation without local_elemwise_fusion"
        print times2, times2.min(), times2.max(), times2.sum()
        d=times2/times1
#        d.sort()
        print "times2/times1",d,d.min(), d.max(), d.mean(), d.std()

    def speed_fusion_gpu(self):
        # GPU variant of speed_fusion; run manually with CUDA available.
        import theano_cuda_ndarray as tcn
        self.speed_fusion(shared_fn=tcn.shared_constructor, gpu=True, s=slice(0,15))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment