提交 56267692 authored 作者: Simon Lemieux's avatar Simon Lemieux

merge

...@@ -150,38 +150,35 @@ def execs_timeit_2vector(exprs, fname=None): ...@@ -150,38 +150,35 @@ def execs_timeit_2vector(exprs, fname=None):
assert len(colors)>=len(times) assert len(colors)>=len(times)
fig = pylab.figure() fig = pylab.figure()
for idx,(time,expr) in enumerate(zip(times,str_expr)): for idx,(time,expr) in enumerate(zip(times,str_expr)):
###
###
###
# Creating each subplot
###
###
###
###
pylab.subplot(220+idx+1) pylab.subplot(220+idx+1)
pylab.subplots_adjust(wspace=0.25, hspace=0.25) pylab.subplots_adjust(wspace=0.25, hspace=0.25)
#legend=[] #legend=[]
#plot = fig.add_subplot(1,len(exprs),idx) #plot = fig.add_subplot(1,len(exprs),idx)
speedup = [t[0].min()/t[1].min() for t in time] speedup = [t[0].min()/t[1].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r') pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r')
speedup = [t[0].min()/t[2].min() for t in time] speedup = [t[0].min()/t[2].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b') pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b')
pylab.grid(True) pylab.grid(True)
if (idx == 2) or (idx == 3): if (idx == 2) or (idx == 3):
pylab.xlabel('Dimension of vectors a and b') pylab.xlabel('Dimension of vectors a and b', fontsize = 15)
if (idx == 0) or (idx == 2): if (idx == 0) or (idx == 2):
pylab.ylabel('Speed up vs NumPy') pylab.ylabel('Speed up vs NumPy', fontsize = 15)
pylab.axhline(y=1, linewidth=1.0, color='black') pylab.axhline(y=1, linewidth=1.0, color='black')
pylab.xlim(1e3,1e7) pylab.xlim(1e3,1e7)
pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7']) pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7'])
pylab.title(expr) pylab.title(expr)
#for time,expr,color in zip(times,str_expr,colors):
# speedup = [t[0].min()/t[1].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, linestyle='--', color=color)
# speedup = [t[0].min()/t[2].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, color=color)
#legend += ["Numexpr "+expr,"Theano "+expr]
#pylab.title('Speed up Numexpr and Theano vs NumPy')
#pylab.grid(True)
#pylab.xlabel('Nb element')
#pylab.ylabel('Speed up vs NumPy')
#pylab.legend(legend,loc='upper left')
# fig.legend(legend,loc='upper left')
if fname: if fname:
fig.savefig(fname) fig.savefig(fname)
......
...@@ -126,6 +126,18 @@ class AddDestroyHandler(gof.Optimizer): ...@@ -126,6 +126,18 @@ class AddDestroyHandler(gof.Optimizer):
super(AddDestroyHandler, self).add_requirements(env) super(AddDestroyHandler, self).add_requirements(env)
env.extend(gof.DestroyHandler()) env.extend(gof.DestroyHandler())
class PrintCurrentEnv(gof.Optimizer):
"""This optimizer is for debugging.
Toss it into the optimization pipeline to see the state of things at any given point.
"""
def __init__(self, header):
self.header =header
def apply(self, env):
import theano.printing
print "PrintCurrentEnv:", self.header
theano.printing.debugprint(env.outputs)
optdb = gof.SequenceDB() optdb = gof.SequenceDB()
optdb.register('merge1', gof.MergeOptimizer(), optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile') 0, 'fast_run', 'fast_compile')
...@@ -133,10 +145,19 @@ optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwis ...@@ -133,10 +145,19 @@ optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwis
1, 'fast_run', 'fast_compile') 1, 'fast_run', 'fast_compile')
optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False), optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False),
1.2, 'fast_run', 'fast_compile') 1.2, 'fast_run', 'fast_compile')
optdb.register('Print1.21', PrintCurrentEnv('Post-canonicalize'),
1.21,)# 'fast_run', 'fast_compile')
optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs
1.5, 'fast_run') 1.5, 'fast_run')
optdb.register('Print1.51', PrintCurrentEnv('Post-stabilize'),
1.51,) #'fast_run', 'fast_compile')
optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed
2, 'fast_run') 2, 'fast_run')
optdb.register('Print2.01', PrintCurrentEnv('Post-specialize'),
2.01, )#'fast_run', 'fast_compile')
optdb.register('specialize_device', gof.EquilibriumDB(), # misc special cases for speed that are dependent on the device.
48.6, 'fast_run')#must be after gpu stuff at 48.5
optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge
49, 'fast_run') 49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(), optdb.register('add_destroy_handler', AddDestroyHandler(),
......
...@@ -341,6 +341,8 @@ class Value(Variable): ...@@ -341,6 +341,8 @@ class Value(Variable):
if value is not None: if value is not None:
raise ValueError("Value instances cannot have an owner.") raise ValueError("Value instances cannot have an owner.")
owner = property(lambda self: None, __set_owner) owner = property(lambda self: None, __set_owner)
value = property(lambda self: self.data,
doc='read-only data access method')
# index is not defined, because the `owner` attribute must necessarily be None # index is not defined, because the `owner` attribute must necessarily be None
......
...@@ -525,7 +525,8 @@ class PatternSub(LocalOptimizer): ...@@ -525,7 +525,8 @@ class PatternSub(LocalOptimizer):
(scrabble, 'x')) (scrabble, 'x'))
""" """
def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False): def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False,
skip_identities_fn=None):
""" """
Creates a PatternSub that replaces occurrences of Creates a PatternSub that replaces occurrences of
in_pattern by occurrences of out_pattern. in_pattern by occurrences of out_pattern.
...@@ -543,7 +544,12 @@ class PatternSub(LocalOptimizer): ...@@ -543,7 +544,12 @@ class PatternSub(LocalOptimizer):
raise TypeError("The pattern to search for must start with a specific Op instance.") raise TypeError("The pattern to search for must start with a specific Op instance.")
self.__doc__ = self.__class__.__doc__ + "\n\nThis instance does: " + str(self) + "\n" self.__doc__ = self.__class__.__doc__ + "\n\nThis instance does: " + str(self) + "\n"
self.allow_multiple_clients = allow_multiple_clients self.allow_multiple_clients = allow_multiple_clients
self.skip_identities_fn = skip_identities_fn
def skip_identities(self, expr):
if self.skip_identities_fn:
return self.skip_identities_fn(expr)
def op_key(self): def op_key(self):
return self.op return self.op
...@@ -568,13 +574,22 @@ class PatternSub(LocalOptimizer): ...@@ -568,13 +574,22 @@ class PatternSub(LocalOptimizer):
if node.op != self.op: if node.op != self.op:
return False return False
def match(pattern, expr, u, allow_multiple_clients = False): def match(pattern, expr, u, allow_multiple_clients = False):
def retry_with_equiv():
expr_equiv = self.skip_identities(expr)
if expr_equiv is None:
return False
#TODO: Not sure how to handle multiple_clients flag
###print 'retrying match', pattern, expr_equiv
return match(pattern, expr_equiv, u,
allow_multiple_clients=allow_multiple_clients)
if isinstance(pattern, (list, tuple)): if isinstance(pattern, (list, tuple)):
if expr.owner is None: if expr.owner is None:
return False return False
if not (expr.owner.op == pattern[0]) or (not allow_multiple_clients and len(expr.clients) > 1): if not (expr.owner.op == pattern[0]) or (not allow_multiple_clients and len(expr.clients) > 1):
return False return retry_with_equiv()
if len(pattern) - 1 != len(expr.owner.inputs): if len(pattern) - 1 != len(expr.owner.inputs):
return False return retry_with_equiv()
for p, v in zip(pattern[1:], expr.owner.inputs): for p, v in zip(pattern[1:], expr.owner.inputs):
u = match(p, v, u, self.allow_multiple_clients) u = match(p, v, u, self.allow_multiple_clients)
if not u: if not u:
...@@ -588,17 +603,17 @@ class PatternSub(LocalOptimizer): ...@@ -588,17 +603,17 @@ class PatternSub(LocalOptimizer):
if constraint(expr): if constraint(expr):
return match(real_pattern, expr, u, pattern.get('allow_multiple_clients', False)) return match(real_pattern, expr, u, pattern.get('allow_multiple_clients', False))
else: else:
return False return retry_with_equiv()
elif isinstance(pattern, str): elif isinstance(pattern, str):
v = unify.Var(pattern) v = unify.Var(pattern)
if u[v] is not v and u[v] is not expr: if u[v] is not v and u[v] is not expr:
return False return retry_with_equiv()
else: else:
u = u.merge(expr, v) u = u.merge(expr, v)
elif isinstance(pattern, graph.Constant) and isinstance(expr, graph.Constant) and pattern.equals(expr): elif isinstance(pattern, graph.Constant) and isinstance(expr, graph.Constant) and pattern.equals(expr):
return u return u
else: else:
return False return retry_with_equiv()
return u return u
def build(pattern, u): def build(pattern, u):
...@@ -614,6 +629,7 @@ class PatternSub(LocalOptimizer): ...@@ -614,6 +629,7 @@ class PatternSub(LocalOptimizer):
if u: if u:
p = self.out_pattern p = self.out_pattern
new = build(p, u) new = build(p, u)
####print "PatternSub matched:", new
return [new] return [new]
else: else:
return False return False
......
...@@ -359,7 +359,8 @@ pprint.assign(lambda pstate, r: hasattr(pstate, 'target') and pstate.target is n ...@@ -359,7 +359,8 @@ pprint.assign(lambda pstate, r: hasattr(pstate, 'target') and pstate.target is n
pp = pprint pp = pprint
def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'), compact=True, mode=None, format='png'): def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'),
compact=True, mode=None, format='png', with_ids=False):
""" """
print to a file in png format the graph of op of a compile theano fct. print to a file in png format the graph of op of a compile theano fct.
...@@ -390,14 +391,15 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -390,14 +391,15 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
g=pd.Dot() g=pd.Dot()
var_str={} var_str={}
all_strings = set()
def var_name(var): def var_name(var):
if var in var_str: if var in var_str:
return var_str[var] return var_str[var]
if var.name is not None: if var.name is not None:
varstr = var.name+" "+str(var.type) varstr = 'name='+var.name+" "+str(var.type)
elif isinstance(var,gof.Constant): elif isinstance(var,gof.Constant):
dstr = str(var.data) dstr = 'val='+str(var.data)
if '\n' in dstr: if '\n' in dstr:
dstr = dstr[:dstr.index('\n')] dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30: if len(dstr) > 30:
...@@ -408,12 +410,17 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -408,12 +410,17 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
else: else:
#a var id is needed as otherwise var with the same type will be merged in the graph. #a var id is needed as otherwise var with the same type will be merged in the graph.
varstr = str(var.type) varstr = str(var.type)
varstr += ' ' + str(len(var_str)) if (varstr in all_strings) or with_ids:
varstr += ' id=' + str(len(var_str))
var_str[var]=varstr var_str[var]=varstr
all_strings.add(varstr)
return varstr return varstr
topo = fct.maker.env.toposort() topo = fct.maker.env.toposort()
apply_name_cache = {}
def apply_name(node): def apply_name(node):
if node in apply_name_cache:
return apply_name_cache[node]
prof_str='' prof_str=''
if mode: if mode:
time = mode.apply_time.get((topo.index(node),node),0) time = mode.apply_time.get((topo.index(node),node),0)
...@@ -425,7 +432,12 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -425,7 +432,12 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
pf=0 pf=0
else: pf = time*100/mode.fct_call_time[fct] else: pf = time*100/mode.fct_call_time[fct]
prof_str=' (%.3fs,%.3f%%,%.3f%%)'%(time,pt,pf) prof_str=' (%.3fs,%.3f%%,%.3f%%)'%(time,pt,pf)
return str(node.op).replace(':','_')+' '+str(topo.index(node))+prof_str applystr = str(node.op).replace(':','_')
if (applystr in all_strings) or with_ids:
applystr = applystr+' id='+str(topo.index(node))+prof_str
all_strings.add(applystr)
apply_name_cache[node] = applystr
return applystr
# Update the inputs that have an update function # Update the inputs that have an update function
input_update={} input_update={}
...@@ -434,16 +446,18 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -434,16 +446,18 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
if i.update is not None: if i.update is not None:
input_update[outputs.pop()] = i input_update[outputs.pop()] = i
apply_shape='ellipse'
var_shape='box'
for node_idx,node in enumerate(topo): for node_idx,node in enumerate(topo):
astr=apply_name(node) astr=apply_name(node)
g.add_node(pd.Node(astr,shape='box')) g.add_node(pd.Node(astr,shape=apply_shape))
for id,var in enumerate(node.inputs): for id,var in enumerate(node.inputs):
varstr=var_name(var) varstr=var_name(var)
label='' label=''
if len(node.inputs)>1: if len(node.inputs)>1:
label=str(id) label=str(id)
if var.owner is None: if var.owner is None:
g.add_node(pd.Node(varstr,color='green')) g.add_node(pd.Node(varstr,color='green',shape=var_shape))
g.add_edge(pd.Edge(varstr,astr, label=label)) g.add_edge(pd.Edge(varstr,astr, label=label))
elif var.name or not compact: elif var.name or not compact:
g.add_edge(pd.Edge(varstr,astr, label=label)) g.add_edge(pd.Edge(varstr,astr, label=label))
...@@ -460,10 +474,10 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -460,10 +474,10 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
label=str(id) label=str(id)
if out: if out:
g.add_edge(pd.Edge(astr, varstr, label=label)) g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='blue')) g.add_node(pd.Node(varstr,color='blue',shape=var_shape))
elif len(var.clients)==0: elif len(var.clients)==0:
g.add_edge(pd.Edge(astr, varstr, label=label)) g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='grey')) g.add_node(pd.Node(varstr,color='grey',shape=var_shape))
elif var.name or not compact: elif var.name or not compact:
g.add_edge(pd.Edge(astr, varstr, label=label)) g.add_edge(pd.Edge(astr, varstr, label=label))
# else: # else:
...@@ -495,9 +509,9 @@ def pydot_var(vars, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -495,9 +509,9 @@ def pydot_var(vars, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
return var_str[var] return var_str[var]
if var.name is not None: if var.name is not None:
varstr = var.name varstr = 'name='+var.name
elif isinstance(var,gof.Constant): elif isinstance(var,gof.Constant):
dstr = str(var.data) dstr = 'val='+str(var.data)
if '\n' in dstr: if '\n' in dstr:
dstr = dstr[:dstr.index('\n')] dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30: if len(dstr) > 30:
......
...@@ -21,8 +21,8 @@ def debug(*msg): ...@@ -21,8 +21,8 @@ def debug(*msg):
# printed and this module will not be working properly (we set `cuda_available` # printed and this module will not be working properly (we set `cuda_available`
# to False). # to False).
# This variable is True by default, and set to False if something goes wrong # This variable is True by default, and set to False if nvcc is not available or
# when trying to initialize cuda. # their is no cuda card or something goes wrong when trying to initialize cuda.
cuda_available = True cuda_available = True
# Global variable to avoid displaying the same warning multiple times. # Global variable to avoid displaying the same warning multiple times.
...@@ -89,6 +89,9 @@ except Exception, e: ...@@ -89,6 +89,9 @@ except Exception, e:
error( "Failed to compile cuda_ndarray.cu: %s" % str(e)) error( "Failed to compile cuda_ndarray.cu: %s" % str(e))
set_cuda_disabled() set_cuda_disabled()
if cuda_available:
cuda_available=device_available()
if cuda_available: if cuda_available:
#check if their is an old cuda_ndarray that was loading instead of the one we compiled! #check if their is an old cuda_ndarray that was loading instead of the one we compiled!
import cuda_ndarray.cuda_ndarray import cuda_ndarray.cuda_ndarray
......
...@@ -1715,14 +1715,21 @@ class GpuSubtensor(tensor.Subtensor): ...@@ -1715,14 +1715,21 @@ class GpuSubtensor(tensor.Subtensor):
cdata = tuple(map(convert, self.idx_list)) cdata = tuple(map(convert, self.idx_list))
if len(cdata) == 1: if len(cdata) == 1:
cdata = cdata[0] cdata = cdata[0]
out[0] = x.__getitem__(cdata)
# some numpy installations don't expose the __index__() methods for if 0:
# numpy.int8/16/32/64. Casting to python's int instead # JSB: commenting this out because it breaks code and does not look right
start = int(cdata.start) if cdata.start!=None else None # Dumi could you try to run the examples in the DeepLearningBenchmarks
stop = int(cdata.stop) if cdata.stop!=None else None # for example? This logic doesn't seem right -- we just
step = int(cdata.step) if cdata.step!=None else None # cast cdata to a tuple, so it doesn't have a .start field.
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice) # some numpy installations don't expose the __index__() methods for
# numpy.int8/16/32/64. Casting to python's int instead
start = int(cdata.start) if cdata.start!=None else None
stop = int(cdata.stop) if cdata.stop!=None else None
step = int(cdata.step) if cdata.step!=None else None
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice)
class GpuIncSubtensor(tensor.IncSubtensor): class GpuIncSubtensor(tensor.IncSubtensor):
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
......
...@@ -722,7 +722,9 @@ conv_rows_stack2( float* img, float* kern, float* out, ...@@ -722,7 +722,9 @@ conv_rows_stack2( float* img, float* kern, float* out,
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col]; const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission.
} }
} }
} }
......
...@@ -1604,6 +1604,35 @@ static PyTypeObject CudaNdarrayType = ...@@ -1604,6 +1604,35 @@ static PyTypeObject CudaNdarrayType =
CudaNdarray_new, /* tp_new */ CudaNdarray_new, /* tp_new */
}; };
//This fct return True it is able to find a cuda card and query its properti
//Otherwise we return False
PyObject *
device_available(PyObject* _unsed, PyObject * args)
{
int deviceCount;
cudaError err = cudaGetDeviceCount(&deviceCount);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if (deviceCount <= 0) {
Py_RETURN_FALSE;
}
cudaDeviceProp deviceProp;
err=cudaGetDeviceProperties(&deviceProp, 0);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
Py_RETURN_FALSE;
}
Py_RETURN_TRUE;
}
PyObject * PyObject *
CudaNdarray_gpu_init(PyObject* _unsed, PyObject * args) CudaNdarray_gpu_init(PyObject* _unsed, PyObject * args)
{ {
...@@ -1810,6 +1839,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s ...@@ -1810,6 +1839,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
static PyMethodDef module_methods[] = { static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."}, {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"device_available", device_available, METH_VARARGS, "Return Py_True if a cuda card is available."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."}, {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."}, {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
......
...@@ -165,7 +165,9 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on ...@@ -165,7 +165,9 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on
ret = _params_allgood(ishape, kshape, mode, ret = _params_allgood(ishape, kshape, mode,
subsample=subshape, img_stride=istride, kern_stride=kstride, subsample=subshape, img_stride=istride, kern_stride=kstride,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones) version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones)
except: except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride)
print e
pass pass
if not ret: if not ret:
failed_version.add(ver) failed_version.add(ver)
......
...@@ -11,6 +11,7 @@ if cuda_available: ...@@ -11,6 +11,7 @@ if cuda_available:
import unittest import unittest
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from nose.plugins.skip import SkipTest
#TODO: test gpu #TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel} # Done in test_consistency_GPU_{serial,parallel}
...@@ -22,7 +23,6 @@ from theano.tests import unittest_tools as utt ...@@ -22,7 +23,6 @@ from theano.tests import unittest_tools as utt
#TODO: make tests work when no flags gived. Now need: THEANO_FLAGS=device=gpu0,floatX=float32 #TODO: make tests work when no flags gived. Now need: THEANO_FLAGS=device=gpu0,floatX=float32
# Partly done, in test_consistency_GPU_{serial,parallel} # Partly done, in test_consistency_GPU_{serial,parallel}
#TODO: bug fix test_normal0, in normal() fct, n_samples currently need to be numpy.prod(size) not self.n_streams(size)
mode = config.mode mode = config.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
...@@ -287,6 +287,7 @@ def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=[], ...@@ -287,6 +287,7 @@ def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=[],
for i in xrange(steps): for i in xrange(steps):
t0 = time.time() t0 = time.time()
ival = f(*inputs) ival = f(*inputs)
assert ival.shape==sample_size
dt += time.time() - t0 dt += time.time() - t0
ival = numpy.asarray(ival) ival = numpy.asarray(ival)
if i == 0: if i == 0:
...@@ -324,7 +325,7 @@ def test_uniform(): ...@@ -324,7 +325,7 @@ def test_uniform():
sample_size = (10,100) sample_size = (10,100)
steps = 50 steps = 50
else: else:
sample_size = (500,100) sample_size = (500,50)
steps = int(1e3) steps = int(1e3)
x = tensor.matrix() x = tensor.matrix()
...@@ -381,9 +382,9 @@ def test_binomial(): ...@@ -381,9 +382,9 @@ def test_binomial():
if mode in ['DEBUG_MODE','FAST_COMPILE']: if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (10,50) sample_size = (10,50)
steps = 70 steps = 50
else: else:
sample_size = (500,100) sample_size = (500,50)
steps = int(1e3) steps = int(1e3)
x = tensor.matrix() x = tensor.matrix()
...@@ -430,9 +431,9 @@ def test_normal0(): ...@@ -430,9 +431,9 @@ def test_normal0():
steps = 50 steps = 50
if mode in ['DEBUG_MODE','FAST_COMPILE']: if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (99,100) sample_size = (99,30)
else: else:
sample_size = (999,100) sample_size = (999,50)
print '' print ''
print 'ON CPU:' print 'ON CPU:'
...@@ -464,8 +465,8 @@ def test_normal0(): ...@@ -464,8 +465,8 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10] print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----' print '----'
sys.stdout.flush() sys.stdout.flush()
basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True) basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print '' print ''
print 'ON CPU w NUMPY:' print 'ON CPU w NUMPY:'
...@@ -476,7 +477,7 @@ def test_normal0(): ...@@ -476,7 +477,7 @@ def test_normal0():
basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True) basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True)
def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04): def basic_multinomialtest(f, steps, sample_size, target_pvals, prefix="", mean_rtol=0.04):
dt = 0.0 dt = 0.0
avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX) avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX)
...@@ -484,6 +485,7 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04): ...@@ -484,6 +485,7 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
for i in xrange(steps): for i in xrange(steps):
t0 = time.time() t0 = time.time()
ival = f() ival = f()
assert ival.shape==sample_size
dt += time.time() - t0 dt += time.time() - t0
#ival = numpy.asarray(ival) #ival = numpy.asarray(ival)
avg_pvals += ival avg_pvals += ival
...@@ -518,7 +520,7 @@ def test_multinomial(): ...@@ -518,7 +520,7 @@ def test_multinomial():
f = theano.function([], m, mode=mode_) f = theano.function([], m, mode=mode_)
theano.printing.debugprint(f) theano.printing.debugprint(f)
basic_multinomialtest(f, steps, pvals, prefix='mrg ') basic_multinomialtest(f, steps, sample_size, pvals, prefix='mrg ')
sys.stdout.flush() sys.stdout.flush()
...@@ -535,4 +537,4 @@ def test_multinomial(): ...@@ -535,4 +537,4 @@ def test_multinomial():
theano.printing.debugprint(f) theano.printing.debugprint(f)
sys.stdout.flush() sys.stdout.flush()
basic_multinomialtest(f, steps, pvals, prefix='gpu mrg ') basic_multinomialtest(f, steps, sample_size, pvals, prefix='gpu mrg ')
...@@ -302,7 +302,7 @@ class Scalar(Type): ...@@ -302,7 +302,7 @@ class Scalar(Type):
return "" return ""
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) # put const around operators and added unary '-' operator return (8, numpy.__version__) # put const around operators and added unary '-' operator
# no need to put lib.amdlibm here as c_compile_args() are put in the key. # no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (7,) # make complex c code optional return (7,) # make complex c code optional
return (6,) # added implemeentations of operators that work with scalar arguments return (6,) # added implemeentations of operators that work with scalar arguments
...@@ -932,6 +932,7 @@ class IntDiv(BinaryScalarOp): ...@@ -932,6 +932,7 @@ class IntDiv(BinaryScalarOp):
return [None] * len(inputs) return [None] * len(inputs)
int_div = IntDiv(upcast_out, name = 'int_div') int_div = IntDiv(upcast_out, name = 'int_div')
floor_div = int_div
class Mod(BinaryScalarOp): class Mod(BinaryScalarOp):
def impl(self, x, y): def impl(self, x, y):
......
...@@ -887,6 +887,11 @@ class _tensor_py_operators: ...@@ -887,6 +887,11 @@ class _tensor_py_operators:
except Exception, e: except Exception, e:
return NotImplemented return NotImplemented
def __truediv__(self,other): return true_div(self, other)
def __floordiv__(self,other): return floor_div(self, other)
def __rtruediv__(self,other): return true_div(other, self)
def __rfloordiv__(self,other): return floor_div(other, self)
# ##### DON"T USE THESE BECAUSE INPLACE OPS SHOULD BE INSERTED BY OPTIMIZATION ONLY # ##### DON"T USE THESE BECAUSE INPLACE OPS SHOULD BE INSERTED BY OPTIMIZATION ONLY
# #ARITHMETIC - INPLACE # #ARITHMETIC - INPLACE
# def __iadd__(self,other): return _add_inplace(self,other) # def __iadd__(self,other): return _add_inplace(self,other)
...@@ -2066,6 +2071,11 @@ def true_div(a, b): ...@@ -2066,6 +2071,11 @@ def true_div(a, b):
"""elementwise [true] division (inverse of multiplication)""" """elementwise [true] division (inverse of multiplication)"""
# see decorator for function body # see decorator for function body
@_scal_elemwise
def floor_div(a, b):
"""elementwise [floor] division (inverse of multiplication)"""
# see decorator for function body
@_scal_elemwise @_scal_elemwise
def int_div(a, b): def int_div(a, b):
"""elementwise integer-division""" """elementwise integer-division"""
...@@ -3607,8 +3617,12 @@ class Dot(Op): ...@@ -3607,8 +3617,12 @@ class Dot(Op):
nx = x.type.ndim nx = x.type.ndim
ny = y.type.ndim ny = y.type.ndim
if nx not in (1,2): raise TypeError('not matrix or vector', x) if nx not in (1,2):
if ny not in (1,2): raise TypeError('not matrix or vector', y) raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), x)
if ny not in (1,2):
raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), y)
if nx == 2 and ny == 2: if nx == 2 and ny == 2:
bz = [x.type.broadcastable[0], y.type.broadcastable[1]] bz = [x.type.broadcastable[0], y.type.broadcastable[1]]
......
...@@ -7,12 +7,13 @@ from theano.configparser import config, AddConfigVar, StrParam ...@@ -7,12 +7,13 @@ from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler, from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler,
SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer, SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer,
InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer) InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer)
from theano.printing import pprint, FunctionPrinter from theano.printing import pprint, FunctionPrinter, debugprint
from theano.compile.mode import optdb from theano.compile.mode import optdb
from theano.gof.python25 import any from theano.gof.python25 import any
import theano.scalar import theano.scalar
import basic as T import basic as T
from theano.tensor.tsor_apply import Apply from theano.tensor.tsor_apply import Apply
#NB: this clobbers the builtin 'compile' symbol #NB: this clobbers the builtin 'compile' symbol
...@@ -28,6 +29,74 @@ def warn(*msg): _logger.warn(' '.join(str(m) for m in msg)) ...@@ -28,6 +29,74 @@ def warn(*msg): _logger.warn(' '.join(str(m) for m in msg))
def warning(*msg): _logger.warning(' '.join(str(m) for m in msg)) def warning(*msg): _logger.warning(' '.join(str(m) for m in msg))
def error(*msg): _logger.error(' '.join(str(m) for m in msg)) def error(*msg): _logger.error(' '.join(str(m) for m in msg))
try:
import scipy.linalg.blas
_have_fblas = True
_blas_gemv_fns = {
numpy.dtype('float32'):scipy.linalg.blas.fblas.sgemv,
numpy.dtype('float64'):scipy.linalg.blas.fblas.dgemv,
numpy.dtype('complex64'):scipy.linalg.blas.fblas.cgemv,
numpy.dtype('complex128'):scipy.linalg.blas.fblas.zgemv,
}
except ImportError, e:
_have_fblas = False
warning('Failed to import scipy.linalg.blas.fblas. Falling back on slower implementations (%s)' % str(e))
class Gemv(Op):
"""
expression is beta * y + alpha * A x
A is matrix
x, y are vectors
alpha, beta are scalars
"""
def __init__(self, inplace):
self.inplace=inplace
if inplace:
self.destroy_map={0:[0]}
def __eq__(self, other):
return type(self)==type(other) and self.inplace == other.inplace
def __str__(self):
if self.inplace:
return 'Gemv{inplace}'
else:
return 'Gemv{no_inplace}'
def __hash__(self):
return hash(type(self)) ^ hash(self.inplace)
def make_node(self, y, alpha, A, x, beta):
y = T.as_tensor_variable(y)
x = T.as_tensor_variable(x)
A = T.as_tensor_variable(A)
alpha = T.as_tensor_variable(alpha)
beta = T.as_tensor_variable(beta)
if y.dtype != A.dtype or y.dtype != x.dtype:
raise TypeError('Gemv requires matching dtypes', (y.dtype, A.dtype, x.dtype))
if A.ndim != 2: raise TypeError('gemv requires matrix for A', A.type)
if x.ndim != 1: raise TypeError('gemv requires vector for x', x.type)
if y.ndim != 1: raise TypeError('gemv requires vector for y', y.type)
if y.broadcastable[0] != A.broadcastable[0]:
raise TypeError('broadcastable mismatch between y and A', (y.type, A.type))
# The following is not grounds for error
# because as long as sizes are 1 at time of perform() there is no problem
#if x.broadcastable[0] != A.broadcastable[1]:
#raise TypeError('broadcastable mismatch between x and A', (x.type, A.type))
return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs
if _have_fblas:
if not self.inplace:
y = y.copy()
gemv = _blas_gemv_fns[y.dtype]
out_storage[0][0] = gemv(alpha, A, x, beta, y, overwrite_y=self.inplace)
else:
out_storage[0][0] = numpy.asarray(
beta * y + alpha * numpy.dot(A, x)
, dtype=y.dtype)
gemv_no_inplace = Gemv(inplace=False)
gemv_inplace = Gemv(inplace=True)
def default_blas_ldflags(): def default_blas_ldflags():
try: try:
return ' '.join( return ' '.join(
...@@ -520,6 +589,9 @@ class Gemm(GemmRelated): ...@@ -520,6 +589,9 @@ class Gemm(GemmRelated):
""" """
def c_code(self, node, name, (_z, _a, _x, _y, _b), (_zout, ), sub): #DEBUG def c_code(self, node, name, (_z, _a, _x, _y, _b), (_zout, ), sub): #DEBUG
if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code' \
% self.__class__.__name__)
if not config.blas.ldflags: if not config.blas.ldflags:
return super(Gemm, self).c_code(node, name, (_z, _a, _x, _y, _b), (_zout, ), sub) return super(Gemm, self).c_code(node, name, (_z, _a, _x, _y, _b), (_zout, ), sub)
full_code = self.build_gemm_call() % dict(locals(), **sub) full_code = self.build_gemm_call() % dict(locals(), **sub)
...@@ -571,6 +643,10 @@ def _is_real_matrix(res): ...@@ -571,6 +643,10 @@ def _is_real_matrix(res):
and res.type.ndim == 2 \ and res.type.ndim == 2 \
and res.type.broadcastable[0] == False \ and res.type.broadcastable[0] == False \
and res.type.broadcastable[1] == False #cope with tuple vs. list and res.type.broadcastable[1] == False #cope with tuple vs. list
def _is_real_vector(res):
return res.type.dtype in ('float32', 'float64') \
and res.type.ndim == 1 \
and res.type.broadcastable[0] == False
def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
#print 'BETA L + ALPHA M', beta, L, alpha, M, recurse_flip #print 'BETA L + ALPHA M', beta, L, alpha, M, recurse_flip
...@@ -579,9 +655,41 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): ...@@ -579,9 +655,41 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
# we've already checked the client counts, now just make the type check. # we've already checked the client counts, now just make the type check.
####if res_is_a(M, _dot22, 1): ####if res_is_a(M, _dot22, 1):
if M.owner and M.owner.op == _dot22: if M.owner and M.owner.op == _dot22:
if M.broadcastable == L.broadcastable:
Ml, Mr = M.owner.inputs
rval = [gemm_no_inplace(L, alpha, Ml, Mr, beta)]
#print 'GEMM 0', rval, beta, L, alpha, M
return rval
if M.owner and M.owner.op == T.dot\
and L.broadcastable==(False,) \
and M.broadcastable==(False,):
Ml, Mr = M.owner.inputs Ml, Mr = M.owner.inputs
rval = [gemm_no_inplace(L, alpha, Ml, Mr, beta)] rval = None
#print 'GEMM 0', rval, beta, L, alpha, M if Ml.ndim == 1:
if Mr.ndim == 1:
#TODO: insert a BLAS ddot Op
pass
if Mr.ndim == 2:
#print "RETURNING GEMV (case 2)"
if Mr.dtype == Ml.dtype:
rval = [gemv_no_inplace(L, alpha, Mr.T, Ml, beta)]
assert L.type == rval[0].type, (L.type, rval[0].type)
else:
# TODO
pass
if Ml.ndim == 2:
if Mr.ndim == 1:
#print "RETURNING GEMV (case 3)"
if Mr.dtype == Ml.dtype:
rval = [gemv_no_inplace(L, alpha, Ml, Mr, beta)]
assert L.type == rval[0].type, (L.type, rval[0].type)
else:
# TODO
pass
if Mr.ndim == 2:
# should have already got this case with a _dot22
pass
return rval return rval
# this is False'd out because of inadequate testing. # this is False'd out because of inadequate testing.
...@@ -616,7 +724,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): ...@@ -616,7 +724,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
def _gemm_canonicalize(r, scale, rval, maxclients): def _gemm_canonicalize(r, scale, rval, maxclients):
# Tries to interpret node as a sum of scalars * matrices # Tries to interpret node as a sum of scalars * (vectors or matrices)
def scaled(thing): def scaled(thing):
if scale == 1: if scale == 1:
return thing return thing
...@@ -629,7 +737,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -629,7 +737,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
except: except:
return None return None
if (tuple(r.type.broadcastable) != (False, False) or if ((r.type.ndim not in (1, 2)) or
r.type.dtype not in ('float32', 'float64', 'complex64', 'complex128')): r.type.dtype not in ('float32', 'float64', 'complex64', 'complex128')):
rval.append(scaled(r)) rval.append(scaled(r))
return rval return rval
...@@ -651,6 +759,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -651,6 +759,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
elif r.owner and r.owner.op == T.mul: elif r.owner and r.owner.op == T.mul:
scalars = [] scalars = []
vectors = []
matrices = [] matrices = []
for i in r.owner.inputs: for i in r.owner.inputs:
if numpy.all(i.type.broadcastable): if numpy.all(i.type.broadcastable):
...@@ -660,6 +769,8 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -660,6 +769,8 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
scalars.append(i.dimshuffle()) scalars.append(i.dimshuffle())
else: else:
scalars.append(i) scalars.append(i)
elif _is_real_vector(i):
vectors.append(i)
elif _is_real_matrix(i): elif _is_real_matrix(i):
matrices.append(i) matrices.append(i)
else: else:
...@@ -667,6 +778,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -667,6 +778,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
rval.append((scale,r)) rval.append((scale,r))
return rval return rval
if len(matrices)==1: if len(matrices)==1:
assert len(vectors)==0
m = matrices[0] m = matrices[0]
if len(scalars) == 0: if len(scalars) == 0:
_gemm_canonicalize(m, scale, rval, 1) _gemm_canonicalize(m, scale, rval, 1)
...@@ -674,7 +786,16 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -674,7 +786,16 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
_gemm_canonicalize(m, scaled(scalars[0]), rval, 1) _gemm_canonicalize(m, scaled(scalars[0]), rval, 1)
else: else:
_gemm_canonicalize(m, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1) _gemm_canonicalize(m, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1)
else: #there are many matrices... lets not open this up elif len(vectors)==1:
assert len(matrices)==0
v = vectors[0]
if len(scalars) == 0:
_gemm_canonicalize(v, scale, rval, 1)
elif len(scalars) == 1:
_gemm_canonicalize(v, scaled(scalars[0]), rval, 1)
else:
_gemm_canonicalize(v, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1)
else: #lets not open this up
rval.append((scale,r)) rval.append((scale,r))
else: else:
rval.append((scale,r)) rval.append((scale,r))
...@@ -735,8 +856,8 @@ def _gemm_from_factored_list(lst): ...@@ -735,8 +856,8 @@ def _gemm_from_factored_list(lst):
#print 'TRYING', (s_i, M_i, s_j, M_j) #print 'TRYING', (s_i, M_i, s_j, M_j)
gemm_of_sM_list = _beta_L_plus_alpha_M(s_i, M_i, s_j, M_j) gemm_of_sM_list = _beta_L_plus_alpha_M(s_i, M_i, s_j, M_j)
#print 'GOT IT', gemm_of_sM_list
if gemm_of_sM_list: if gemm_of_sM_list:
#print 'GOT IT', gemm_of_sM_list
def item_to_var(t): def item_to_var(t):
try: s,M = t try: s,M = t
except: return t except: return t
...@@ -749,9 +870,11 @@ def _gemm_from_factored_list(lst): ...@@ -749,9 +870,11 @@ def _gemm_from_factored_list(lst):
for k, input in enumerate(lst) if k not in (i,j)] for k, input in enumerate(lst) if k not in (i,j)]
add_inputs.extend(gemm_of_sM_list) add_inputs.extend(gemm_of_sM_list)
if len(add_inputs) > 1: if len(add_inputs) > 1:
return [T.add(*add_inputs)] rval = [T.add(*add_inputs)]
else: else:
return add_inputs rval = add_inputs
#print "RETURNING GEMM THIGN", rval
return rval
def _gemm_from_node2(node): def _gemm_from_node2(node):
""" """
...@@ -762,9 +885,13 @@ def _gemm_from_node2(node): ...@@ -762,9 +885,13 @@ def _gemm_from_node2(node):
""" """
lst = [] lst = []
_gemm_canonicalize(node.outputs[0], 1.0, lst, 0) _gemm_canonicalize(node.outputs[0], 1.0, lst, 0)
#print "GEMM CANON", lst
if len(lst) > 1: if len(lst) > 1:
lst = _factor_canonicalized(lst) lst = _factor_canonicalized(lst)
rval = _gemm_from_factored_list(lst) rval = _gemm_from_factored_list(lst)
#print "RVAL", rval
if rval:
assert rval[0].type == node.outputs[0].type, (rval[0].type, node.outputs[0].type)
return rval return rval
class GemmOptimizer(Optimizer): class GemmOptimizer(Optimizer):
...@@ -783,7 +910,6 @@ class GemmOptimizer(Optimizer): ...@@ -783,7 +910,6 @@ class GemmOptimizer(Optimizer):
did_something = False did_something = False
nodelist.reverse() nodelist.reverse()
for node in nodelist: for node in nodelist:
#new_outputs = _gemm_from_node(node)
try: try:
new_outputs = _gemm_from_node2(node) new_outputs = _gemm_from_node2(node)
except InconsistencyError, e: except InconsistencyError, e:
...@@ -805,13 +931,13 @@ class Dot22(GemmRelated): ...@@ -805,13 +931,13 @@ class Dot22(GemmRelated):
This is a specialization of the more general Dot() This is a specialization of the more general Dot()
""" """
def make_node(self, x, y): def make_node(self, x, y):
if not _is_real_matrix(x): if x.type.ndim != 2 or x.type.dtype not in ('float32', 'float64'):
raise TypeError(x) raise TypeError(x)
if not _is_real_matrix(x): if y.type.ndim != 2 or y.type.dtype not in ('float32', 'float64'):
raise TypeError(y) raise TypeError(y)
if y.type.dtype != x.type.dtype: if y.type.dtype != x.type.dtype:
raise TypeError('dtype mismatch to Dot22') raise TypeError('dtype mismatch to Dot22')
bz = [False, False] bz = (x.type.broadcastable[0], y.type.broadcastable[1])
outputs = [T.tensor(x.type.dtype, bz)] outputs = [T.tensor(x.type.dtype, bz)]
return Apply(self, [x,y], outputs) return Apply(self, [x,y], outputs)
...@@ -855,6 +981,9 @@ class Dot22(GemmRelated): ...@@ -855,6 +981,9 @@ class Dot22(GemmRelated):
double b = 0.0; double b = 0.0;
""" """
def c_code(self, node, name, (_x, _y), (_zout, ), sub): #DEBUG def c_code(self, node, name, (_x, _y), (_zout, ), sub): #DEBUG
if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code' \
% self.__class__.__name__)
if len(self.c_libraries())<=0: if len(self.c_libraries())<=0:
return super(Dot22, self).c_code(node, name, (_x, _y), (_zout, ), sub) return super(Dot22, self).c_code(node, name, (_x, _y), (_zout, ), sub)
full_code = self.build_gemm_call() % dict(locals(), **sub) full_code = self.build_gemm_call() % dict(locals(), **sub)
...@@ -870,19 +999,35 @@ _dot22 = Dot22() ...@@ -870,19 +999,35 @@ _dot22 = Dot22()
@local_optimizer([T.dot]) @local_optimizer([T.dot])
def local_dot_to_dot22(node): def local_dot_to_dot22(node):
if node.op == T.dot: if node.op != T.dot:
x,y = node.inputs return
if _is_real_matrix(x) and _is_real_matrix(y) and y.type.dtype == x.type.dtype:
x,y = node.inputs
if y.type.dtype != x.type.dtype:
# TODO: upcast one so the types match
info('Not optimizing dot with inputs', x, y, x.type, y.type)
return
if y.type.dtype.startswith('float'):
if _is_real_matrix(x) and _is_real_matrix(y):
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
else: if 0:
info('Not optimizing dot with inputs', x, y, x.type, y.type) if _is_real_matrix(x) and _is_real_vector(y):
else: return [_dot22(x, y.dimshuffle(0,'x')).dimshuffle(0)]
return False if _is_real_vector(x) and _is_real_matrix(y):
return [_dot22(x.dimshuffle('x',0), y).dimshuffle(1)]
if _is_real_vector(x) and _is_real_vector(x):
return [_dot22(x.dimshuffle('x',0), y.dimshuffle(0,'x')).dimshuffle()]
info('Not optimizing dot with inputs', x, y, x.type, y.type)
@local_optimizer([gemm_no_inplace]) @local_optimizer([gemm_no_inplace])
def local_inplace_gemm(node): def local_inplace_gemm(node):
if node.op == gemm_no_inplace: if node.op == gemm_no_inplace:
return [gemm_inplace(*node.inputs)] return [gemm_inplace(*node.inputs)]
@local_optimizer([gemv_no_inplace])
def local_inplace_gemv(node):
if node.op == gemv_no_inplace:
return [gemv_inplace(*node.inputs)]
################################# #################################
# #
...@@ -906,7 +1051,7 @@ blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run') ...@@ -906,7 +1051,7 @@ blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run')
# Try to make gemm inplace # Try to make gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71) # Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
optdb.register('InplaceBlasOpt', optdb.register('InplaceBlasOpt',
EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace, EquilibriumOptimizer([local_inplace_gemm, local_inplace_gemv], failure_callback=EquilibriumOptimizer.warn_inplace,
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace')
...@@ -1048,3 +1193,10 @@ blas_optdb.register('local_dot22_to_dot22scalar', ...@@ -1048,3 +1193,10 @@ blas_optdb.register('local_dot22_to_dot22scalar',
11, 'fast_run') 11, 'fast_run')
from opt import register_specialize, register_canonicalize
#@register_specialize
@local_optimizer([])
def local_print_as_we_go_along(node):
if node.op in (T.sub, T.add):
debugprint(node)
...@@ -397,41 +397,41 @@ def local_softmax_with_bias(node): ...@@ -397,41 +397,41 @@ def local_softmax_with_bias(node):
return return
return [sm_bias] return [sm_bias]
if 0: def softmax_simplifier(numerators, denominators):
def softmax_simplifier(numerators, denominators): for numerator in list(numerators):
for numerator in list(numerators): #TODO: a single softmax'd vector??
#TODO: a single softmax'd vector?? if not numerator.type.dtype.startswith('float'):
if not numerator.type.dtype.startswith('float'): continue
continue
if not numerator.type.broadcastable == (False, False):
if not numerator.type.broadcastable == (False, False): continue
continue if numerator.owner and numerator.owner.op == tensor.exp:
if numerator.owner and numerator.owner.op == tensor.exp: x = numerator.owner.inputs[0]
x = numerator.owner.inputs[0] else:
else: continue
continue
matching_denom = None
matching_denom = None
for denominator in denominators:
for denominator in denominators: if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle):
if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle): if denominator.owner.op.new_order == (0,'x'):
if denominator.owner.op.new_order == (0,'x'): z = denominator.owner.inputs[0] # thing getting dimshuffled
z = denominator.owner.inputs[0] # thing getting dimshuffled if z.owner and isinstance(z.owner.op, tensor.Sum):
if z.owner and isinstance(z.owner.op, tensor.Sum): #print 'ASDF', denominator.owner.op.new_order
#print 'ASDF', denominator.owner.op.new_order #print z.owner.op.axis
#print z.owner.op.axis if z.owner.op.axis == (1,):
if z.owner.op.axis == (1,): #print "almost there.. softmax", x, z.owner.inputs[0]
#print "almost there.. softmax", x, z.owner.inputs[0] if z.owner.inputs[0] is numerator:
if z.owner.inputs[0] is numerator: matching_denom = denominator
matching_denom = denominator break
break if matching_denom:
if matching_denom: numerators.remove(numerator)
numerators.remove(numerator) denominators.remove(matching_denom)
denominators.remove(matching_denom) numerators.append(softmax(x))
numerators.append(softmax(x)) return numerators, denominators
return numerators, denominators opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
if 0:
def softmax_grad_simplifier(numerators, denominators): def softmax_grad_simplifier(numerators, denominators):
print "mul simplify numerators" print "mul simplify numerators"
printing.debugprint(numerators) printing.debugprint(numerators)
......
...@@ -8,7 +8,7 @@ from theano import gof ...@@ -8,7 +8,7 @@ from theano import gof
from theano import scalar from theano import scalar
from theano import printing from theano import printing
from theano.tensor import basic as tensor from theano.tensor import basic as tensor
from theano.printing import pprint from theano.printing import pprint, debugprint
from theano.tensor import elemwise from theano.tensor import elemwise
from theano.tensor import opt from theano.tensor import opt
from theano.compile import optdb from theano.compile import optdb
...@@ -95,10 +95,17 @@ softplus = elemwise.Elemwise(scalar_softplus, name='softplus') ...@@ -95,10 +95,17 @@ softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
pprint.assign(softplus, printing.FunctionPrinter('softplus')) pprint.assign(softplus, printing.FunctionPrinter('softplus'))
def _skip_mul_1(r):
if r.owner and r.owner.op == tensor.mul:
not_is_1 = [i for i in r.owner.inputs if not _is_1(i) ]
if len(not_is_1)==1:
return not_is_1[0]
logsigm_to_softplus = gof.PatternSub( logsigm_to_softplus = gof.PatternSub(
(tensor.log, (sigmoid, 'x')), (tensor.log, (sigmoid, 'x')),
(tensor.neg, (softplus, (tensor.neg, 'x'))), (tensor.neg, (softplus, (tensor.neg, 'x'))),
allow_multiple_clients = True) allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
def _is_1(expr): def _is_1(expr):
"""rtype bool. True iff expr is a constant close to 1 """rtype bool. True iff expr is a constant close to 1
...@@ -115,7 +122,8 @@ log1msigm_to_softplus = gof.PatternSub( ...@@ -115,7 +122,8 @@ log1msigm_to_softplus = gof.PatternSub(
dict(pattern='y', constraint = _is_1), dict(pattern='y', constraint = _is_1),
(sigmoid, 'x'))), (sigmoid, 'x'))),
(tensor.neg, (softplus, 'x')), (tensor.neg, (softplus, 'x')),
allow_multiple_clients = True) allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
log1pexp_to_softplus = gof.PatternSub( log1pexp_to_softplus = gof.PatternSub(
(tensor.log1p, (tensor.log1p,
...@@ -329,3 +337,48 @@ register_local_1msigmoid = False ...@@ -329,3 +337,48 @@ register_local_1msigmoid = False
if register_local_1msigmoid: if register_local_1msigmoid:
opt.register_canonicalize(local_1msigmoid) opt.register_canonicalize(local_1msigmoid)
if 0:
# This code is if'd out because it is not complete,
# and it isn't obviously a good idea anyway.
# The motivation here was to identify the last exp() node
# in the SciPy2010 article, which was not optimized away at the time of publication,
# so the example is actually not numerically stable, even though it should be.
@opt.register_stabilize
@gof.local_optimizer([tensor.mul])
def local_sigm_gest(node):
print "CANONICALIZE"
print sigm_canonicalize(node)
def sigm_canonicalize(node):
add = tensor.add
mul = tensor.mul
div = tensor.true_div
if node.op == tensor.add:
rval = []
for i in node.inputs:
rval += sigm_canonicalize(i)
return rval
if node.op == tensor.mul:
rval = sigm_canonicalize(node.inputs[0])
for i in node.inputs[1:]:
old_rval = rval
rval = []
for t1 in sigm_canonicalize(i):
for t0 in old_rval:
assert t1.owner.op == div
assert t0.owner.op == div
t0top, t0bot = t0.owner.inputs
t1top, t1bot = t1.owner.inputs
rval.append(div(mul(*(t0top+t1top)), mul(*(t0bot+t1bot))))
if len(rval) > 100:
# This loop can be exponentially long.
# aborting
return []
elif len(node.outputs)>1:
return []
else:
return [node.outputs[0]]
...@@ -924,6 +924,10 @@ class Test_softmax_opt(): ...@@ -924,6 +924,10 @@ class Test_softmax_opt():
assert softmax in f_ops assert softmax in f_ops
f(self.rng.rand(3,4)) f(self.rng.rand(3,4))
def test_grad(self):
c = T.matrix()
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0,'x')
# test that function contains softmax and no div. # test that function contains softmax and no div.
w = T.matrix() w = T.matrix()
g = theano.function([c,w],T.grad((p_y*w).sum(), c)) g = theano.function([c,w],T.grad((p_y*w).sum(), c))
......
...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs): ...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags) compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
def register_specialize_device(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['specialize_device'].register(name, lopt, 'fast_run', *tags)
return lopt
def register_stabilize(lopt, *tags, **kwargs): def register_stabilize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags) compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
...@@ -189,6 +194,11 @@ def local_dimshuffle_lift(node): ...@@ -189,6 +194,11 @@ def local_dimshuffle_lift(node):
register_canonicalize(local_dimshuffle_lift) register_canonicalize(local_dimshuffle_lift)
register_specialize(local_dimshuffle_lift) register_specialize(local_dimshuffle_lift)
@register_canonicalize
@gof.local_optimizer([])
def local_dimshuffle_no_inplace_at_canonicalize(node):
if isinstance(node.op, T.DimShuffle) and node.op.inplace:
return [T.DimShuffle(node.op.input_broadcastable, node.op.new_order, inplace=False)(node.inputs[0])]
##################################### #####################################
...@@ -1603,18 +1613,20 @@ def local_sum_mul_by_scalar(node): ...@@ -1603,18 +1613,20 @@ def local_sum_mul_by_scalar(node):
@register_canonicalize @register_canonicalize
@gof.local_optimizer([]) @gof.local_optimizer([])
def local_sum_div_dimshuffle(node): def local_sum_div_dimshuffle(node):
'''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis=l) / b, '''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis={...}) / b,
if dimension l of the DimShuffle is 'x'.''' if dimension l of the DimShuffle is 'x'.'''
# TODO: extend it to product, and quotient of products # TODO: extend it to product, and quotient of products
if isinstance(node.op, T.Sum): if isinstance(node.op, T.Sum):
axis = node.op.axis axis = node.op.axis
if axis is None:
axis = range(node.inputs[0].ndim)
#print 'axis =', axis #print 'axis =', axis
thing_summed = node.inputs[0] thing_summed = node.inputs[0]
dimshuffled = None dimshuffled = None
if thing_summed.owner and thing_summed.owner.op == T.true_div: if thing_summed.owner and thing_summed.owner.op == T.true_div:
numerator, denominator = thing_summed.owner.inputs numerator, denominator = thing_summed.owner.inputs
if isinstance(numerator.owner.op, T.DimShuffle): if numerator.owner and isinstance(numerator.owner.op, T.DimShuffle):
new_order = numerator.owner.op.new_order new_order = numerator.owner.op.new_order
#print 'new_order =', new_order #print 'new_order =', new_order
# check compatibility # check compatibility
...@@ -1630,7 +1642,7 @@ def local_sum_div_dimshuffle(node): ...@@ -1630,7 +1642,7 @@ def local_sum_div_dimshuffle(node):
#else: #else:
# print 'incompatible dims:', axis, new_order # print 'incompatible dims:', axis, new_order
if isinstance(denominator.owner.op, T.DimShuffle): if denominator.owner and isinstance(denominator.owner.op, T.DimShuffle):
new_order = denominator.owner.op.new_order new_order = denominator.owner.op.new_order
#print 'new_order =', new_order #print 'new_order =', new_order
# check compatibility # check compatibility
...@@ -1827,9 +1839,31 @@ def local_pow_specialize(node): ...@@ -1827,9 +1839,31 @@ def local_pow_specialize(node):
rval = [T.inv(xsym)] rval = [T.inv(xsym)]
if N.all(y == -2): if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))] rval = [T.inv(T.sqr(xsym))]
if rval:
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
# Optimize all integral powers in [-RANGE, RANGE] @register_specialize_device
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu! @gof.local_optimizer([T.pow])
def local_pow_specialize_device(node):
"""
This optimization is not the same on all device. We do it only on cpu here.
"""
if node.op == T.pow:
#the idea here is that we have pow(x, y)
odtype = node.outputs[0].dtype
xsym = node.inputs[0]
ysym = node.inputs[1]
y = local_mul_canonizer.get_constant(ysym)
if (y is not None) \
and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
rval = None
# 512 is too small for the cpu and too big for some gpu!
if abs(y)==int(abs(y)) and abs(y) <= 512:
pow2 = [xsym] pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()] pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y) y_to_do = abs(y)
...@@ -1859,14 +1893,7 @@ def local_pow_specialize(node): ...@@ -1859,14 +1893,7 @@ def local_pow_specialize(node):
rval[0] = T.cast(rval[0], odtype) rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs) assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval return rval
else:
return False
register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
@gof.local_optimizer([T.mul]) @gof.local_optimizer([T.mul])
def local_mul_specialize(node): def local_mul_specialize(node):
"""Remove special-case constants from mul arguments """Remove special-case constants from mul arguments
...@@ -1965,20 +1992,28 @@ register_specialize(local_add_specialize) ...@@ -1965,20 +1992,28 @@ register_specialize(local_add_specialize)
mul_canonizer = in2out(gof.LocalOptGroup(local_mul_canonizer, local_fill_cut, local_fill_sink)) mul_canonizer = in2out(gof.LocalOptGroup(local_mul_canonizer, local_fill_cut, local_fill_sink))
def check_for_x_over_absX(numerators, denominators): def check_for_x_over_absX(numerators, denominators):
"""Convert x/abs(x) into sign(x). """
# TODO: this function should dig/search through dimshuffles # TODO: this function should dig/search through dimshuffles
# This won't catch a dimshuffled absolute value # This won't catch a dimshuffled absolute value
for den in list(denominators): for den in list(denominators):
if den.owner and den.owner.op == T.abs_ and den.owner.inputs[0] in numerators: if den.owner and den.owner.op == T.abs_ and den.owner.inputs[0] in numerators:
denominators.remove(den) if den.owner.inputs[0].type.dtype.startswith('complex'):
numerators.remove(den.owner.inputs[0]) #TODO: Make an Op that projects a complex number to have unit length
numerators.append(T.sgn(den.owner.inputs[0])) # but projects 0 to 0. That would be a weird Op, but consistent with the
# special case below. I heard there's some convention in Matlab that is
# similar to this... but not sure.
pass
else:
denominators.remove(den)
numerators.remove(den.owner.inputs[0])
numerators.append(T.sgn(den.owner.inputs[0]))
return numerators, denominators return numerators, denominators
local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'teststest') local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'teststest')
@register_stabilize @register_stabilize
@gof.local_optimizer([T.log]) @gof.local_optimizer([T.log])
def local_log1p(node): def local_log1p(node):
# log(1+exp(x)) -> log1p(x) # log(1+x) -> log1p(x)
if node.op == T.log: if node.op == T.log:
log_arg, = node.inputs log_arg, = node.inputs
if log_arg.owner and log_arg.owner.op == T.add: if log_arg.owner and log_arg.owner.op == T.add:
...@@ -2207,7 +2242,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2207,7 +2242,7 @@ def local_elemwise_fusion_op(OP):
""" """
def local_fuse(node): def local_fuse(node):
""" """
As part of specialisation, we fusion two consecutif elemwise op of the same shape. As part of specialisation, we fuse two consecutive elemwise op of the same shape.
For mixed dtype, we let the Compise op do the cast. It let the C compile do the cast. For mixed dtype, we let the Compise op do the cast. It let the C compile do the cast.
The number of dimension is validated at call time by theano itself. The number of dimension is validated at call time by theano itself.
...@@ -2240,7 +2275,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2240,7 +2275,7 @@ def local_elemwise_fusion_op(OP):
for i in node.inputs: for i in node.inputs:
do_fusion = False do_fusion = False
catch = False catch = False
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)<=1: if i.owner and isinstance(i.owner.op, OP) and len(i.clients)==1:
#if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops. #if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops.
do_fusion=True do_fusion=True
try: try:
...@@ -2296,7 +2331,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2296,7 +2331,7 @@ def local_elemwise_fusion_op(OP):
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function. # There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much. # Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
if True: if OP != T.Elemwise:
argument_limit = 240 # 16 bytes are used for block and thread coords etc. argument_limit = 240 # 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8 #TODO: read in from architecture to make this 4 or 8
int_size = 8 int_size = 8
......
...@@ -1604,6 +1604,54 @@ class t_dot(unittest.TestCase): ...@@ -1604,6 +1604,54 @@ class t_dot(unittest.TestCase):
#utt.verify_grad(dot, [self.rand(), self.rand(2)]) #utt.verify_grad(dot, [self.rand(), self.rand(2)])
#utt.verify_grad(dot, [self.rand(), self.rand(2,5)]) #utt.verify_grad(dot, [self.rand(), self.rand(2,5)])
def test_broadcastable_patterns(self):
    """Exercise dot() over every dtype pair and 1-d/2-d broadcastable
    pattern.

    All inputs have size 1 in every dimension, so regardless of the
    broadcastable flags the products are well defined and the compiled
    function must run (debugmode validates the numerical result).
    Complex dtype combinations are skipped and recorded in `failures`.
    """
    def val_for(r):
        # Build a size-1 numpy value matching r's ndim and dtype.
        if r.ndim == 0:
            return numpy.asarray(1.1, dtype=r.dtype)
        if r.ndim == 1:
            return numpy.asarray([1.2], dtype=r.dtype)
        elif r.ndim == 2:
            return numpy.asarray([[1.3]], dtype=r.dtype)
        raise ValueError()

    dtypes = ('float32', 'float64', 'complex64', 'complex128')
    patterns = ((True,), (False,), (True, True), (True, False),
                (False, True), (False, False))
    failures = []
    for dtype0 in dtypes:
        for dtype1 in dtypes:
            for bc0 in patterns:
                for bc1 in patterns:
                    x = TensorType(dtype=dtype0, broadcastable=bc0)()
                    y = TensorType(dtype=dtype1, broadcastable=bc1)()
                    z = dot(x, y)
                    t = TensorType(dtype=dtype0,
                                   broadcastable=z.broadcastable)()
                    rval = z * 3 + 2 * t
                    if rval.type.dtype.startswith('complex'):
                        # There is a problem with complex numbers right
                        # now: Elemwise code doesn't compile when both
                        # precisions of complex numbers are used in the
                        # same file because the operators aren't
                        # declared properly.
                        failures.append((dtype0, dtype1, bc0, bc1))
                        continue
                    f = function([x, y, t], rval)
                    f(val_for(x), val_for(y), val_for(t))  # debugmode checks result
    #if failures:
    #    print failures
    assert not failures
class T_tensorfromscalar(unittest.TestCase): class T_tensorfromscalar(unittest.TestCase):
def test0(self): def test0(self):
s = scal.constant(56) s = scal.constant(56)
......
...@@ -416,7 +416,7 @@ def test_gemm_canonicalize(): ...@@ -416,7 +416,7 @@ def test_gemm_canonicalize():
can = [] can = []
_gemm_canonicalize(X + Y + u, 1.0, can, 0) _gemm_canonicalize(X + Y + u, 1.0, can, 0)
assert can == [(1.0, X), (1.0, Y), u] assert can == [(1.0, X), (1.0, Y), u], can
can = [] can = []
_gemm_canonicalize(a*X + Y - b*Z*c, 1.0, can, 0) _gemm_canonicalize(a*X + Y - b*Z*c, 1.0, can, 0)
......
...@@ -14,7 +14,6 @@ from theano.gof import Env ...@@ -14,7 +14,6 @@ from theano.gof import Env
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano import pprint, shared from theano import pprint, shared
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import scalar as scal
from theano import function, compile from theano import function, compile
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -467,6 +466,22 @@ class test_canonize(unittest.TestCase): ...@@ -467,6 +466,22 @@ class test_canonize(unittest.TestCase):
topo=f.maker.env.toposort() topo=f.maker.env.toposort()
assert len(topo)==0 assert len(topo)==0
assert(out_dtype==out.dtype) assert(out_dtype==out.dtype)
#test x / abs(x) -> sign(x)
# NOTE(review): fragment of a larger test method (enclosing def not visible
# here). dx/fx/dv/fv and dxv/fxv/dvv/fvv are presumably symbolic variables
# and their numeric values defined earlier in the method — verify in context.
# Each case: (graph, symbolic inputs, numeric inputs, expected output dtype).
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
(dx/abs(dx),[dx],[0.5-dxv],'float64'),
(fx/abs(fx),[fx],[0.5-fxv],'float32'),
(dx/abs(dx),[dx],[0.0*dxv],'float64'),
(fx/abs(fx),[fx],[0.0*fxv],'float32'),
(dv/abs(dv),[dv],[0.5-dvv],'float64'),
(fv/abs(fv),[fv],[0.5-fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
# The optimizer should have rewritten x/abs(x) into sign(x): output must
# be finite (no 0/0 NaN) and match numpy.sign of the input.
assert numpy.all(numpy.isfinite(out))
assert numpy.allclose(out,numpy.sign(val_inputs[0]))
assert(out_dtype==out.dtype)
finally: finally:
mode._optimizer = old_optimizer mode._optimizer = old_optimizer
...@@ -599,34 +614,34 @@ class test_fusion(unittest.TestCase): ...@@ -599,34 +614,34 @@ class test_fusion(unittest.TestCase):
izv = theano._asarray(my_init(shp,num=70),dtype='int32') izv = theano._asarray(my_init(shp,num=70),dtype='int32')
fwx=fw+fx fwx=fw+fx
cases = [ cases = [
(fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#1 (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#0
(fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'), (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),#1
(fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'), (fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),
(fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'), (fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),
(fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#5 (fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#5
(((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), (((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#10 (fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#10
(fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'), (fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'),
(fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'), (fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'),
(fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'), (fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'),
(fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),#15 (fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),
(fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),2,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),#expect 2 as their is limit to the fusion on the gpu. (fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),#15
#test with constant #test with constant
((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), ((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), (((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), ((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20 ((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), (fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20
(2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), (2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
#mix float32 and float64 #mix float32 and float64
(2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'), (2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'),
(2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'), (2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'),
(2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),#25 (2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),
(2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'), (2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'),#25
#test when their is other op then elemwise. #test when their is other op then elemwise.
#the good output for the next test. #the good output for the next test.
# (Pdb) p f.maker.env.toposort() # (Pdb) p f.maker.env.toposort()
...@@ -642,33 +657,33 @@ class test_fusion(unittest.TestCase): ...@@ -642,33 +657,33 @@ class test_fusion(unittest.TestCase):
#test other elemwise op #test other elemwise op
(fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'), (fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'),
(fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'), (fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'),
(fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),#30 (fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),
(ix+iy+abs(iz),(ix,iy,iz),(ixv,iyv,izv),1,ixv+iyv+numpy.absolute(izv),'int32'), (ix+iy+abs(iz),(ix,iy,iz),(ixv,iyv,izv),1,ixv+iyv+numpy.absolute(izv),'int32'),#30
(fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'), (fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'),
(fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'), (fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'),
(fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'), (fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'),
(fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow #35 (fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow
(fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'), (fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'),#35
(fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'), (fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'),
(fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'), (fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
(fx-theano.tensor.true_div(fy,2),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/2),'float32'), (fx-theano.tensor.true_div(fy,2),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
(fx-theano.tensor.true_div(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),#40 (fx-theano.tensor.true_div(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
(fx-theano.tensor.int_div(ix*100,iy*1000),(fx,ix,iy),(fxv,ixv,iyv),4,fxv-((ixv*100)//(iyv*1000)),'float64'),#int32 - float32 = float64 #No c_code for int_div (fx-theano.tensor.int_div(ix*100,iy*1000),(fx,ix,iy),(fxv,ixv,iyv),4,fxv-((ixv*100)//(iyv*1000)),'float64'),#int32 - float32 = float64 #No c_code for int_div#40
(fx-(fy/2),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/2),'float32'), (fx-(fy/2),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
(fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'), (fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'),
(fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'), (fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'),
(fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),#45 (fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),
(fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'), (fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'),#45
(fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),#TODO: bugged on the gpu (fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),
(fx-(fy==fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),#TODO: bugged (fx-T.eq(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),
(fx-(fy!=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'), (fx-T.neq(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'),
(fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),#50 (fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),
(fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'), (fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'),#50
(fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'), (fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'),
(fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'), (fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'),
(fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'), (fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'),
(fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),#55 (fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),
(fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'), (fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'),#55
(fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'), (fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'),
# (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this? # (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this?
...@@ -714,10 +729,9 @@ class test_fusion(unittest.TestCase): ...@@ -714,10 +729,9 @@ class test_fusion(unittest.TestCase):
fail1.append(id) fail1.append(id)
topo=f.maker.env.toposort() topo=f.maker.env.toposort()
if gpu: if gpu:
import theano_cuda_ndarray as tcn import theano.sandbox.cuda as cuda
topo_ = [x for x in topo if not isinstance(x.op,cuda.basic_ops.GpuFromHost) and not isinstance(x.op,cuda.basic_ops.HostFromGpu)]
topo_ = [x for x in topo if not isinstance(x.op,tcn.basic_ops.GpuFromHost)] gpu_ = [x for x in topo if isinstance(x.op,cuda.basic_ops.GpuFromHost)]
gpu_ = [x for x in topo if isinstance(x.op,tcn.basic_ops.GpuFromHost)]
if not len(gpu_)==len(sym_inputs): if not len(gpu_)==len(sym_inputs):
fail2.append((id,gpu_,sym_inputs)) fail2.append((id,gpu_,sym_inputs))
else: topo_=topo else: topo_=topo
...@@ -727,9 +741,6 @@ class test_fusion(unittest.TestCase): ...@@ -727,9 +741,6 @@ class test_fusion(unittest.TestCase):
if not out_dtype==out.dtype: if not out_dtype==out.dtype:
fail4.append((id,out_dtype,out.dtype)) fail4.append((id,out_dtype,out.dtype))
# cases[id]=None #to remove g, that link to out that link to the ndarray!
#g.owner.inputs[0] is out... make owner a weakref?
failed=len(fail1+fail2+fail3+fail4) failed=len(fail1+fail2+fail3+fail4)
print "Executed",len(cases),"cases", "failed", failed print "Executed",len(cases),"cases", "failed", failed
if failed>0: if failed>0:
...@@ -751,8 +762,9 @@ class test_fusion(unittest.TestCase): ...@@ -751,8 +762,9 @@ class test_fusion(unittest.TestCase):
mode=compile.mode.predefined_modes['FAST_COMPILE'] mode=compile.mode.predefined_modes['FAST_COMPILE']
mode=compile.mode.predefined_modes['FAST_RUN'] mode=compile.mode.predefined_modes['FAST_RUN']
mode=compile.mode.predefined_modes['DEBUG_MODE'] mode=compile.mode.predefined_modes['DEBUG_MODE']
import theano.sandbox.cuda as tcn mode = theano.compile.mode.get_mode(mode).including('gpu')
self.do(mode, tcn.shared_constructor, shp, gpu=True) import theano.sandbox.cuda as cuda
self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
def speed_fusion(self, shared_fn = shared, gpu = False, s=None): def speed_fusion(self, shared_fn = shared, gpu = False, s=None):
""" """
...@@ -788,8 +800,8 @@ class test_fusion(unittest.TestCase): ...@@ -788,8 +800,8 @@ class test_fusion(unittest.TestCase):
print "min", d.min(), "argmin", d.argmin(), "max", d.max(), "mean", d.mean(), "std", d.std() print "min", d.min(), "argmin", d.argmin(), "max", d.max(), "mean", d.mean(), "std", d.std()
def speed_fusion_gpu(self): def speed_fusion_gpu(self):
import theano_cuda_ndarray as tcn import theano.sandbox.cuda as cuda
self.speed_fusion(shared_fn=tcn.shared_constructor, gpu=True, s=slice(0,15)) self.speed_fusion(shared_fn=tcn.float32_shared_constructor, gpu=True, s=slice(0,15))
def speed_log_exp(self): def speed_log_exp(self):
s=slice(31,36) s=slice(31,36)
...@@ -1260,6 +1272,7 @@ def test_local_pow_specialize(): ...@@ -1260,6 +1272,7 @@ def test_local_pow_specialize():
v = T.vector() v = T.vector()
val = numpy.arange(10,dtype=theano.config.floatX) val = numpy.arange(10,dtype=theano.config.floatX)
val_no0 = numpy.arange(1,10,dtype=theano.config.floatX) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
f = function([v], v**0, mode=mode) f = function([v], v**0, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()] nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [Shape_i(0), T.alloc] assert nodes == [Shape_i(0), T.alloc]
...@@ -1301,33 +1314,44 @@ def test_local_pow_specialize(): ...@@ -1301,33 +1314,44 @@ def test_local_pow_specialize():
# assert nodes == [T.sqrt,T.inv]#Why this don't work? # assert nodes == [T.sqrt,T.inv]#Why this don't work?
assert numpy.allclose(f(val_no0),val_no0**(-.5)) assert numpy.allclose(f(val_no0),val_no0**(-.5))
if config.experimental.pow: def test_local_pow_specialize_device():
print "Test experimental.pow=True"
f = function([v], v**(15), mode=mode) # test that on cpu we use more agressive optimization
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==1 mode = theano.config.mode
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) if mode == 'FAST_COMPILE':
assert numpy.allclose(f(val),val**15) mode = 'FAST_RUN'
mode = compile.mode.get_mode(mode)
f = function([v], v**(-15), mode=mode) mode = mode.excluding('fusion').excluding('gpu')
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==2 v = T.vector()
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) val = numpy.arange(10,dtype=theano.config.floatX)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
assert numpy.allclose(f(val_no0),val_no0**(-15)) f = function([v], v**(15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(16), mode=mode) assert len(nodes)==1
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 1 assert numpy.allclose(f(val),val**15)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16) f = function([v], v**(-15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(-16), mode=mode) assert len(nodes)==2
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 2 assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) assert numpy.allclose(f(val_no0),val_no0**(-15))
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16)) f = function([v], v**(16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 1
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16)
f = function([v], v**(-16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 2
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16))
class T_Rebroadcast(unittest.TestCase): class T_Rebroadcast(unittest.TestCase):
......
from nose.plugins.skip import SkipTest
import unittest
import theano
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
'''
Miscellaneous tests that are not tied to any particular Op or other Theano
functionality. For example, code that we publish in papers goes here, so we
can ensure that it remains operational.
'''
class T_diverse(unittest.TestCase):
    """Smoke tests for example code published in papers.

    These examples are compiled and run to make sure the published
    snippets stay operational.
    """

    def setUp(self):
        # Seed the shared RNGs so the examples are reproducible.
        utt.seed_rng()

    def scipy_paper_example1(self):
        """Compile and evaluate b = a + a**10 (first SciPy-paper example)."""
        a = theano.tensor.vector('a')      # declare variable
        b = a + a ** 10                    # build expression
        f = theano.function([a], b)        # compile function
        assert numpy.all(f([0, 1, 2]) == numpy.array([0, 2, 1026]))

    def scipy_papaer_example2(self):
        """This just sees if things compile well and if they run."""
        # NOTE(review): method name typo ("papaer") kept so existing
        # callers/test runners keep finding it.
        # Bind the names the snippet uses; the original referenced
        # T/function/shared/rng without defining them (NameError).
        T = theano.tensor
        function = theano.function
        shared = theano.shared
        rng = numpy.random  # seeded via utt.seed_rng() in setUp

        x = T.matrix()
        y = T.vector()
        w = shared(rng.randn(100))
        b = shared(numpy.zeros(()))
        # Construct Theano expression graph
        p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))           # logistic output
        xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1) # cross-entropy
        prediction = p_1 > 0.5
        cost = xent.mean() + 0.01 * (w ** 2).sum()        # L2-regularized
        gw, gb = T.grad(cost, [w, b])
        # Compile expressions to functions
        train = function(
            inputs=[x, y],
            outputs=[prediction, xent],
            updates={w: w - 0.1 * gw, b: b - 0.1 * gb})
        predict = function(inputs=[x], outputs=prediction)

        N = 4
        feats = 100
        D = (rng.randn(N, feats), rng.randint(size=4, low=0, high=2))
        training_steps = 10
        for i in range(training_steps):
            pred, err = train(D[0], D[1])
# Allow running this test module directly from the command line.
if __name__ == '__main__':
    unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论