Commit 56267692 authored by Simon Lemieux

merge

......@@ -150,38 +150,35 @@ def execs_timeit_2vector(exprs, fname=None):
assert len(colors)>=len(times)
fig = pylab.figure()
for idx,(time,expr) in enumerate(zip(times,str_expr)):
###
# Creating each subplot
###
pylab.subplot(220+idx+1)
pylab.subplots_adjust(wspace=0.25, hspace=0.25)
#legend=[]
#plot = fig.add_subplot(1,len(exprs),idx)
speedup = [t[0].min()/t[1].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r')
speedup = [t[0].min()/t[2].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b')
pylab.grid(True)
if (idx == 2) or (idx == 3):
pylab.xlabel('Dimension of vectors a and b')
pylab.xlabel('Dimension of vectors a and b', fontsize = 15)
if (idx == 0) or (idx == 2):
pylab.ylabel('Speed up vs NumPy')
pylab.ylabel('Speed up vs NumPy', fontsize = 15)
pylab.axhline(y=1, linewidth=1.0, color='black')
pylab.xlim(1e3,1e7)
pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7'])
pylab.title(expr)
#for time,expr,color in zip(times,str_expr,colors):
# speedup = [t[0].min()/t[1].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, linestyle='--', color=color)
# speedup = [t[0].min()/t[2].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, color=color)
#legend += ["Numexpr "+expr,"Theano "+expr]
#pylab.title('Speed up Numexpr and Theano vs NumPy')
#pylab.grid(True)
#pylab.xlabel('Nb element')
#pylab.ylabel('Speed up vs NumPy')
#pylab.legend(legend,loc='upper left')
# fig.legend(legend,loc='upper left')
if fname:
fig.savefig(fname)
......
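For context, the speedup curves above divide the best NumPy time by the best Numexpr/Theano time at each problem size. A minimal sketch of how such a `times` structure could be gathered with the standard-library timeit module (the three benchmark callables are hypothetical; taking min() over repeats is the usual way to suppress timing noise):

import timeit
import numpy

def gather_times(numpy_fn, numexpr_fn, theano_fn, sizes, repeat=3):
    times = []
    for n in sizes:
        a = numpy.random.rand(n)
        b = numpy.random.rand(n)
        row = []
        for fn in (numpy_fn, numexpr_fn, theano_fn):
            # best-of-`repeat` wall time for 10 calls of this backend
            row.append(numpy.array(timeit.repeat(lambda: fn(a, b),
                                                 number=10, repeat=repeat)))
        times.append(row)   # row[0]=numpy, row[1]=numexpr, row[2]=theano
    return times            # then: speedup = t[0].min() / t[1].min()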
......@@ -126,6 +126,18 @@ class AddDestroyHandler(gof.Optimizer):
super(AddDestroyHandler, self).add_requirements(env)
env.extend(gof.DestroyHandler())
class PrintCurrentEnv(gof.Optimizer):
"""This optimizer is for debugging.
Toss it into the optimization pipeline to see the state of things at any given point.
"""
def __init__(self, header):
self.header =header
def apply(self, env):
import theano.printing
print "PrintCurrentEnv:", self.header
theano.printing.debugprint(env.outputs)
optdb = gof.SequenceDB()
optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile')
......@@ -133,10 +145,19 @@ optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwis
1, 'fast_run', 'fast_compile')
optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False),
1.2, 'fast_run', 'fast_compile')
optdb.register('Print1.21', PrintCurrentEnv('Post-canonicalize'),
1.21,)# 'fast_run', 'fast_compile')
optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs
1.5, 'fast_run')
optdb.register('Print1.51', PrintCurrentEnv('Post-stabilize'),
1.51,) #'fast_run', 'fast_compile')
optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed
2, 'fast_run')
optdb.register('Print2.01', PrintCurrentEnv('Post-specialize'),
2.01, )#'fast_run', 'fast_compile')
optdb.register('specialize_device', gof.EquilibriumDB(), # misc special cases for speed that are dependent on the device.
48.6, 'fast_run')#must be after gpu stuff at 48.5
optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge
49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(),
......
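The PrintCurrentEnv registrations above use fractional priorities (1.21, 1.51, 2.01) so each dump lands just after the stage it is named for; with the 'fast_run'/'fast_compile' tags commented out they only fire when enabled explicitly. A hedged sketch of dropping one between two other stages:

# Hypothetical: dump the graph between canonicalize (1) and merge1.2 (1.2).
optdb.register('PrintPostCanonicalize',
               PrintCurrentEnv('Post-canonicalize, pre-merge'),
               1.1)  # no tags: not part of any default optimization mode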
......@@ -341,6 +341,8 @@ class Value(Variable):
if value is not None:
raise ValueError("Value instances cannot have an owner.")
owner = property(lambda self: None, __set_owner)
value = property(lambda self: self.data,
doc='read-only data access method')
# index is not defined, because the `owner` attribute must necessarily be None
......
......@@ -525,7 +525,8 @@ class PatternSub(LocalOptimizer):
(scrabble, 'x'))
"""
def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False):
def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False,
skip_identities_fn=None):
"""
Creates a PatternSub that replaces occurrences of
in_pattern by occurrences of out_pattern.
......@@ -543,7 +544,12 @@ class PatternSub(LocalOptimizer):
raise TypeError("The pattern to search for must start with a specific Op instance.")
self.__doc__ = self.__class__.__doc__ + "\n\nThis instance does: " + str(self) + "\n"
self.allow_multiple_clients = allow_multiple_clients
self.skip_identities_fn = skip_identities_fn
def skip_identities(self, expr):
if self.skip_identities_fn:
return self.skip_identities_fn(expr)
def op_key(self):
return self.op
......@@ -568,13 +574,22 @@ class PatternSub(LocalOptimizer):
if node.op != self.op:
return False
def match(pattern, expr, u, allow_multiple_clients = False):
def retry_with_equiv():
expr_equiv = self.skip_identities(expr)
if expr_equiv is None:
return False
#TODO: Not sure how to handle multiple_clients flag
###print 'retrying match', pattern, expr_equiv
return match(pattern, expr_equiv, u,
allow_multiple_clients=allow_multiple_clients)
if isinstance(pattern, (list, tuple)):
if expr.owner is None:
return False
if not (expr.owner.op == pattern[0]) or (not allow_multiple_clients and len(expr.clients) > 1):
return False
return retry_with_equiv()
if len(pattern) - 1 != len(expr.owner.inputs):
return False
return retry_with_equiv()
for p, v in zip(pattern[1:], expr.owner.inputs):
u = match(p, v, u, self.allow_multiple_clients)
if not u:
......@@ -588,17 +603,17 @@ class PatternSub(LocalOptimizer):
if constraint(expr):
return match(real_pattern, expr, u, pattern.get('allow_multiple_clients', False))
else:
return False
return retry_with_equiv()
elif isinstance(pattern, str):
v = unify.Var(pattern)
if u[v] is not v and u[v] is not expr:
return False
return retry_with_equiv()
else:
u = u.merge(expr, v)
elif isinstance(pattern, graph.Constant) and isinstance(expr, graph.Constant) and pattern.equals(expr):
return u
else:
return False
return retry_with_equiv()
return u
def build(pattern, u):
......@@ -614,6 +629,7 @@ class PatternSub(LocalOptimizer):
if u:
p = self.out_pattern
new = build(p, u)
####print "PatternSub matched:", new
return [new]
else:
return False
......
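The new skip_identities_fn hook lets a failed structural match retry on an equivalent sub-expression before giving up, e.g. seeing through a multiplication by 1. A sketch of wiring one up, mirroring the _skip_mul_1 helper this commit adds in nnet/sigm.py (tensor.mul, sigmoid, softplus and the _is_1 predicate are taken from that file):

def skip_mul_by_one(expr):
    # If expr is mul(x, 1), return x so matching is retried on it;
    # returning None keeps the normal failure path.
    if expr.owner and expr.owner.op == tensor.mul:
        rest = [i for i in expr.owner.inputs if not _is_1(i)]
        if len(rest) == 1:
            return rest[0]

sub = PatternSub((tensor.log, (sigmoid, 'x')),
                 (tensor.neg, (softplus, (tensor.neg, 'x'))),
                 allow_multiple_clients=True,
                 skip_identities_fn=skip_mul_by_one)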
......@@ -359,7 +359,8 @@ pprint.assign(lambda pstate, r: hasattr(pstate, 'target') and pstate.target is n
pp = pprint
def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'), compact=True, mode=None, format='png'):
def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'),
compact=True, mode=None, format='png', with_ids=False):
"""
Print to a file, in png format, the graph of the ops of a compiled theano fct.
......@@ -390,14 +391,15 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
g=pd.Dot()
var_str={}
all_strings = set()
def var_name(var):
if var in var_str:
return var_str[var]
if var.name is not None:
varstr = var.name+" "+str(var.type)
varstr = 'name='+var.name+" "+str(var.type)
elif isinstance(var,gof.Constant):
dstr = str(var.data)
dstr = 'val='+str(var.data)
if '\n' in dstr:
dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30:
......@@ -408,12 +410,17 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
else:
#a var id is needed, as otherwise vars with the same type would be merged in the graph.
varstr = str(var.type)
varstr += ' ' + str(len(var_str))
if (varstr in all_strings) or with_ids:
varstr += ' id=' + str(len(var_str))
var_str[var]=varstr
all_strings.add(varstr)
return varstr
topo = fct.maker.env.toposort()
apply_name_cache = {}
def apply_name(node):
if node in apply_name_cache:
return apply_name_cache[node]
prof_str=''
if mode:
time = mode.apply_time.get((topo.index(node),node),0)
......@@ -425,7 +432,12 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
pf=0
else: pf = time*100/mode.fct_call_time[fct]
prof_str=' (%.3fs,%.3f%%,%.3f%%)'%(time,pt,pf)
return str(node.op).replace(':','_')+' '+str(topo.index(node))+prof_str
applystr = str(node.op).replace(':','_')
if (applystr in all_strings) or with_ids:
applystr = applystr+' id='+str(topo.index(node))+prof_str
all_strings.add(applystr)
apply_name_cache[node] = applystr
return applystr
# Update the inputs that have an update function
input_update={}
......@@ -434,16 +446,18 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
if i.update is not None:
input_update[outputs.pop()] = i
apply_shape='ellipse'
var_shape='box'
for node_idx,node in enumerate(topo):
astr=apply_name(node)
g.add_node(pd.Node(astr,shape='box'))
g.add_node(pd.Node(astr,shape=apply_shape))
for id,var in enumerate(node.inputs):
varstr=var_name(var)
label=''
if len(node.inputs)>1:
label=str(id)
if var.owner is None:
g.add_node(pd.Node(varstr,color='green'))
g.add_node(pd.Node(varstr,color='green',shape=var_shape))
g.add_edge(pd.Edge(varstr,astr, label=label))
elif var.name or not compact:
g.add_edge(pd.Edge(varstr,astr, label=label))
......@@ -460,10 +474,10 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
label=str(id)
if out:
g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='blue'))
g.add_node(pd.Node(varstr,color='blue',shape=var_shape))
elif len(var.clients)==0:
g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='grey'))
g.add_node(pd.Node(varstr,color='grey',shape=var_shape))
elif var.name or not compact:
g.add_edge(pd.Edge(astr, varstr, label=label))
# else:
......@@ -495,9 +509,9 @@ def pydot_var(vars, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
return var_str[var]
if var.name is not None:
varstr = var.name
varstr = 'name='+var.name
elif isinstance(var,gof.Constant):
dstr = str(var.data)
dstr = 'val='+str(var.data)
if '\n' in dstr:
dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30:
......
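Callers do not need to change anything; passing the new flag forces an ' id=N' suffix on every label, so distinct variables or applies can never collapse into a single pydot node even when their labels collide. A usage sketch (assuming this module is importable as theano.printing):

import theano
import theano.tensor as T

x = T.vector('x')
f = theano.function([x], x + x)
theano.printing.pydotprint(f, outfile='graph.png', with_ids=True)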
......@@ -21,8 +21,8 @@ def debug(*msg):
# printed and this module will not be working properly (we set `cuda_available`
# to False).
# This variable is True by default, and set to False if something goes wrong
# when trying to initialize cuda.
# This variable is True by default, and set to False if nvcc is not available or
# there is no cuda card or something goes wrong when trying to initialize cuda.
cuda_available = True
# Global variable to avoid displaying the same warning multiple times.
......@@ -89,6 +89,9 @@ except Exception, e:
error( "Failed to compile cuda_ndarray.cu: %s" % str(e))
set_cuda_disabled()
if cuda_available:
cuda_available=device_available()
if cuda_available:
#check if there is an old cuda_ndarray that was loaded instead of the one we compiled!
import cuda_ndarray.cuda_ndarray
......
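With the extra device_available() check, cuda_available is False not only when compilation fails but also when no usable card is present, so callers can rely on it as a single guard (run_gpu_path is a hypothetical caller-side function; import path assumed):

from theano.sandbox.cuda import cuda_available

if cuda_available:
    run_gpu_path()   # nvcc worked, a device exists and its properties were queried
else:
    print "skipping GPU path: no usable cuda device"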
......@@ -1715,14 +1715,21 @@ class GpuSubtensor(tensor.Subtensor):
cdata = tuple(map(convert, self.idx_list))
if len(cdata) == 1:
cdata = cdata[0]
out[0] = x.__getitem__(cdata)
# some numpy installations don't expose the __index__() methods for
# numpy.int8/16/32/64. Casting to python's int instead
start = int(cdata.start) if cdata.start!=None else None
stop = int(cdata.stop) if cdata.stop!=None else None
step = int(cdata.step) if cdata.step!=None else None
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice)
if 0:
# JSB: commenting this out because it breaks code and does not look right
# Dumi could you try to run the examples in the DeepLearningBenchmarks
# for example? This logic doesn't seem right -- we just
# cast cdata to a tuple, so it doesn't have a .start field.
# some numpy installations don't expose the __index__() methods for
# numpy.int8/16/32/64. Casting to python's int instead
start = int(cdata.start) if cdata.start!=None else None
stop = int(cdata.stop) if cdata.stop!=None else None
step = int(cdata.step) if cdata.step!=None else None
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice)
class GpuIncSubtensor(tensor.IncSubtensor):
def make_node(self, x, y, *inputs):
......
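The replacement code works around NumPy builds whose integer scalars (numpy.int8/16/32/64) do not implement __index__, which makes them unusable directly in a slice. A standalone illustration of the cast (whether the failure occurs depends on the NumPy build):

import numpy

s = slice(numpy.int64(1), numpy.int64(5), None)
# On affected builds, x[s] raises "slice indices must be integers".
# Rebuilding the slice from Python ints is safe everywhere:
safe = slice(int(s.start) if s.start is not None else None,
             int(s.stop) if s.stop is not None else None,
             int(s.step) if s.step is not None else None)
x = numpy.arange(10)
print x[safe]   # [1 2 3 4]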
......@@ -722,7 +722,9 @@ conv_rows_stack2( float* img, float* kern, float* out,
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We go through an intermediate variable to get more precision.
}
}
}
......
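Accumulating each row's contribution into a fresh local before folding it into the running total keeps rounding error from compounding over the whole convolution. The effect is easy to reproduce in float32 on the CPU (a sketch of the idea, not the kernel itself):

import numpy

x = numpy.random.rand(2 ** 20).astype('float32')
exact = x.astype('float64').sum()

naive = numpy.float32(0)            # one long float32 accumulation
for v in x:
    naive += v

total = numpy.float32(0)            # partial sums first, then fold them together
for block in x.reshape(1024, 1024):
    part = numpy.float32(0)
    for v in block:
        part += v
    total += part

print abs(naive - exact), abs(total - exact)   # total is typically much closer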
......@@ -1604,6 +1604,35 @@ static PyTypeObject CudaNdarrayType =
CudaNdarray_new, /* tp_new */
};
//This function returns True if it is able to find a cuda card and query its properties.
//Otherwise it returns False.
PyObject *
device_available(PyObject* _unsed, PyObject * args)
{
int deviceCount;
cudaError err = cudaGetDeviceCount(&deviceCount);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if (deviceCount <= 0) {
Py_RETURN_FALSE;
}
cudaDeviceProp deviceProp;
err=cudaGetDeviceProperties(&deviceProp, 0);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
Py_RETURN_FALSE;
}
Py_RETURN_TRUE;
}
PyObject *
CudaNdarray_gpu_init(PyObject* _unsed, PyObject * args)
{
......@@ -1810,6 +1839,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"device_available", device_available, METH_VARARGS, "Return Py_True if a cuda card is available."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
......
......@@ -165,7 +165,9 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on
ret = _params_allgood(ishape, kshape, mode,
subsample=subshape, img_stride=istride, kern_stride=kstride,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones)
except:
except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride)
print e
pass
if not ret:
failed_version.add(ver)
......
......@@ -11,6 +11,7 @@ if cuda_available:
import unittest
from theano.tests import unittest_tools as utt
from nose.plugins.skip import SkipTest
#TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel}
......@@ -22,7 +23,6 @@ from theano.tests import unittest_tools as utt
#TODO: make tests work when no flags are given. For now they need: THEANO_FLAGS=device=gpu0,floatX=float32
# Partly done, in test_consistency_GPU_{serial,parallel}
#TODO: bug fix test_normal0; in the normal() fct, n_samples currently needs to be numpy.prod(size), not self.n_streams(size)
mode = config.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
......@@ -287,6 +287,7 @@ def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=[],
for i in xrange(steps):
t0 = time.time()
ival = f(*inputs)
assert ival.shape==sample_size
dt += time.time() - t0
ival = numpy.asarray(ival)
if i == 0:
......@@ -324,7 +325,7 @@ def test_uniform():
sample_size = (10,100)
steps = 50
else:
sample_size = (500,100)
sample_size = (500,50)
steps = int(1e3)
x = tensor.matrix()
......@@ -381,9 +382,9 @@ def test_binomial():
if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (10,50)
steps = 70
steps = 50
else:
sample_size = (500,100)
sample_size = (500,50)
steps = int(1e3)
x = tensor.matrix()
......@@ -430,9 +431,9 @@ def test_normal0():
steps = 50
if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (99,100)
sample_size = (99,30)
else:
sample_size = (999,100)
sample_size = (999,50)
print ''
print 'ON CPU:'
......@@ -464,8 +465,8 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----'
sys.stdout.flush()
basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print ''
print 'ON CPU w NUMPY:'
......@@ -476,7 +477,7 @@ def test_normal0():
basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True)
def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
def basic_multinomialtest(f, steps, sample_size, target_pvals, prefix="", mean_rtol=0.04):
dt = 0.0
avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX)
......@@ -484,6 +485,7 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
for i in xrange(steps):
t0 = time.time()
ival = f()
assert ival.shape==sample_size
dt += time.time() - t0
#ival = numpy.asarray(ival)
avg_pvals += ival
......@@ -518,7 +520,7 @@ def test_multinomial():
f = theano.function([], m, mode=mode_)
theano.printing.debugprint(f)
basic_multinomialtest(f, steps, pvals, prefix='mrg ')
basic_multinomialtest(f, steps, sample_size, pvals, prefix='mrg ')
sys.stdout.flush()
......@@ -535,4 +537,4 @@ def test_multinomial():
theano.printing.debugprint(f)
sys.stdout.flush()
basic_multinomialtest(f, steps, pvals, prefix='gpu mrg ')
basic_multinomialtest(f, steps, sample_size, pvals, prefix='gpu mrg ')
......@@ -302,7 +302,7 @@ class Scalar(Type):
return ""
def c_code_cache_version(self):
return (8,) # put const around operators and added unary '-' operator
return (8, numpy.__version__) # put const around operators and added unary '-' operator
# no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (7,) # make complex c code optional
return (6,) # added implementations of operators that work with scalar arguments
......@@ -932,6 +932,7 @@ class IntDiv(BinaryScalarOp):
return [None] * len(inputs)
int_div = IntDiv(upcast_out, name = 'int_div')
floor_div = int_div
class Mod(BinaryScalarOp):
def impl(self, x, y):
......
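Folding numpy.__version__ into c_code_cache_version means C code compiled against one NumPy's headers is recompiled after a NumPy upgrade, instead of being reloaded from the cache with a possibly incompatible binary layout. Any Op whose generated C depends on NumPy internals can adopt the same pattern (MyScalarOp is a hypothetical stub for the sketch):

import numpy

class MyScalarOp(BinaryScalarOp):
    def c_code_cache_version(self):
        # bump the leading integer whenever the C code changes; the numpy
        # version string invalidates cached binaries on numpy upgrades
        return (1, numpy.__version__)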
......@@ -887,6 +887,11 @@ class _tensor_py_operators:
except Exception, e:
return NotImplemented
def __truediv__(self,other): return true_div(self, other)
def __floordiv__(self,other): return floor_div(self, other)
def __rtruediv__(self,other): return true_div(other, self)
def __rfloordiv__(self,other): return floor_div(other, self)
# ##### DON'T USE THESE BECAUSE INPLACE OPS SHOULD BE INSERTED BY OPTIMIZATION ONLY
# #ARITHMETIC - INPLACE
# def __iadd__(self,other): return _add_inplace(self,other)
......@@ -2066,6 +2071,11 @@ def true_div(a, b):
"""elementwise [true] division (inverse of multiplication)"""
# see decorator for function body
@_scal_elemwise
def floor_div(a, b):
"""elementwise [floor] division (inverse of multiplication)"""
# see decorator for function body
@_scal_elemwise
def int_div(a, b):
"""elementwise integer-division"""
......@@ -3607,8 +3617,12 @@ class Dot(Op):
nx = x.type.ndim
ny = y.type.ndim
if nx not in (1,2): raise TypeError('not matrix or vector', x)
if ny not in (1,2): raise TypeError('not matrix or vector', y)
if nx not in (1,2):
raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), x)
if ny not in (1,2):
raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), y)
if nx == 2 and ny == 2:
bz = [x.type.broadcastable[0], y.type.broadcastable[1]]
......
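With __floordiv__/__rfloordiv__ in place, the Python // operator on tensor variables now builds a floor_div node (an alias of int_div, per the scalar change above). A quick check, kept to non-negative operands since the exact behaviour for negatives follows the underlying int_div implementation:

import theano
import theano.tensor as T

a = T.lvector('a')
b = T.lvector('b')
f = theano.function([a, b], a // b)   # __floordiv__ builds floor_div(a, b)
print f([7, 9], [2, 4])               # [3 2]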
(diff collapsed)
......@@ -397,41 +397,41 @@ def local_softmax_with_bias(node):
return
return [sm_bias]
if 0:
def softmax_simplifier(numerators, denominators):
for numerator in list(numerators):
#TODO: a single softmax'd vector??
if not numerator.type.dtype.startswith('float'):
continue
if not numerator.type.broadcastable == (False, False):
continue
if numerator.owner and numerator.owner.op == tensor.exp:
x = numerator.owner.inputs[0]
else:
continue
matching_denom = None
for denominator in denominators:
if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle):
if denominator.owner.op.new_order == (0,'x'):
z = denominator.owner.inputs[0] # thing getting dimshuffled
if z.owner and isinstance(z.owner.op, tensor.Sum):
#print 'ASDF', denominator.owner.op.new_order
#print z.owner.op.axis
if z.owner.op.axis == (1,):
#print "almost there.. softmax", x, z.owner.inputs[0]
if z.owner.inputs[0] is numerator:
matching_denom = denominator
break
if matching_denom:
numerators.remove(numerator)
denominators.remove(matching_denom)
numerators.append(softmax(x))
return numerators, denominators
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
def softmax_simplifier(numerators, denominators):
for numerator in list(numerators):
#TODO: a single softmax'd vector??
if not numerator.type.dtype.startswith('float'):
continue
if not numerator.type.broadcastable == (False, False):
continue
if numerator.owner and numerator.owner.op == tensor.exp:
x = numerator.owner.inputs[0]
else:
continue
matching_denom = None
for denominator in denominators:
if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle):
if denominator.owner.op.new_order == (0,'x'):
z = denominator.owner.inputs[0] # thing getting dimshuffled
if z.owner and isinstance(z.owner.op, tensor.Sum):
#print 'ASDF', denominator.owner.op.new_order
#print z.owner.op.axis
if z.owner.op.axis == (1,):
#print "almost there.. softmax", x, z.owner.inputs[0]
if z.owner.inputs[0] is numerator:
matching_denom = denominator
break
if matching_denom:
numerators.remove(numerator)
denominators.remove(matching_denom)
numerators.append(softmax(x))
return numerators, denominators
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
if 0:
def softmax_grad_simplifier(numerators, denominators):
print "mul simplify numerators"
printing.debugprint(numerators)
......
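The simplifier runs inside the mul canonizer: it pairs a numerator exp(x) with a denominator sum(exp(x), axis=1).dimshuffle(0, 'x') and replaces the pair with softmax(x). A sketch of verifying that the rewrite fired, along the lines of the Test_softmax_opt test later in this commit (the softmax import path is assumed, and the default mode is assumed to include the canonicalizer):

import theano
import theano.tensor as T
from theano.tensor.nnet import softmax

c = T.matrix('c')
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0, 'x')
f = theano.function([c], p_y)
ops = [node.op for node in f.maker.env.toposort()]
assert softmax in ops   # the exp/sum/div subgraph was collapsed into softmax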
......@@ -8,7 +8,7 @@ from theano import gof
from theano import scalar
from theano import printing
from theano.tensor import basic as tensor
from theano.printing import pprint
from theano.printing import pprint, debugprint
from theano.tensor import elemwise
from theano.tensor import opt
from theano.compile import optdb
......@@ -95,10 +95,17 @@ softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
pprint.assign(softplus, printing.FunctionPrinter('softplus'))
def _skip_mul_1(r):
if r.owner and r.owner.op == tensor.mul:
not_is_1 = [i for i in r.owner.inputs if not _is_1(i) ]
if len(not_is_1)==1:
return not_is_1[0]
logsigm_to_softplus = gof.PatternSub(
(tensor.log, (sigmoid, 'x')),
(tensor.neg, (softplus, (tensor.neg, 'x'))),
allow_multiple_clients = True)
allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
def _is_1(expr):
"""rtype bool. True iff expr is a constant close to 1
......@@ -115,7 +122,8 @@ log1msigm_to_softplus = gof.PatternSub(
dict(pattern='y', constraint = _is_1),
(sigmoid, 'x'))),
(tensor.neg, (softplus, 'x')),
allow_multiple_clients = True)
allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
log1pexp_to_softplus = gof.PatternSub(
(tensor.log1p,
......@@ -329,3 +337,48 @@ register_local_1msigmoid = False
if register_local_1msigmoid:
opt.register_canonicalize(local_1msigmoid)
if 0:
# This code is if'd out because it is not complete,
# and it isn't obviously a good idea anyway.
# The motivation here was to identify the last exp() node
# in the SciPy2010 article, which was not optimized away at the time of publication,
# so the example is actually not numerically stable, even though it should be.
@opt.register_stabilize
@gof.local_optimizer([tensor.mul])
def local_sigm_gest(node):
print "CANONICALIZE"
print sigm_canonicalize(node)
def sigm_canonicalize(node):
add = tensor.add
mul = tensor.mul
div = tensor.true_div
if node.op == tensor.add:
rval = []
for i in node.inputs:
rval += sigm_canonicalize(i)
return rval
if node.op == tensor.mul:
rval = sigm_canonicalize(node.inputs[0])
for i in node.inputs[1:]:
old_rval = rval
rval = []
for t1 in sigm_canonicalize(i):
for t0 in old_rval:
assert t1.owner.op == div
assert t0.owner.op == div
t0top, t0bot = t0.owner.inputs
t1top, t1bot = t1.owner.inputs
rval.append(div(mul(*(t0top+t1top)), mul(*(t0bot+t1bot))))
if len(rval) > 100:
# This loop can be exponentially long.
# aborting
return []
elif len(node.outputs)>1:
return []
else:
return [node.outputs[0]]
......@@ -924,6 +924,10 @@ class Test_softmax_opt():
assert softmax in f_ops
f(self.rng.rand(3,4))
def test_grad(self):
c = T.matrix()
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0,'x')
# test that function contains softmax and no div.
w = T.matrix()
g = theano.function([c,w],T.grad((p_y*w).sum(), c))
......
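The reason logsigm_to_softplus (now with skip_identities_fn, so a stray mul-by-1 no longer blocks the match) matters is numerical stability: evaluating log(sigmoid(x)) literally underflows to log(0) = -inf for very negative x, while the rewritten -softplus(-x) stays finite. A plain-NumPy illustration; the large-x shortcut below stands in for whatever stable branch the scalar_softplus C code uses:

import numpy

def softplus(x):
    if x > 30.0:
        return x                      # exp(-x) is negligible here
    return numpy.log1p(numpy.exp(x))

x = -745.0
# NumPy emits overflow warnings on the naive path:
print numpy.log(1.0 / (1.0 + numpy.exp(-x)))   # -inf: sigmoid underflowed to 0
print -softplus(-x)                            # -745.0, the correct value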
......@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt
def register_specialize_device(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['specialize_device'].register(name, lopt, 'fast_run', *tags)
return lopt
def register_stabilize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
......@@ -189,6 +194,11 @@ def local_dimshuffle_lift(node):
register_canonicalize(local_dimshuffle_lift)
register_specialize(local_dimshuffle_lift)
@register_canonicalize
@gof.local_optimizer([])
def local_dimshuffle_no_inplace_at_canonicalize(node):
if isinstance(node.op, T.DimShuffle) and node.op.inplace:
return [T.DimShuffle(node.op.input_broadcastable, node.op.new_order, inplace=False)(node.inputs[0])]
#####################################
......@@ -1603,18 +1613,20 @@ def local_sum_mul_by_scalar(node):
@register_canonicalize
@gof.local_optimizer([])
def local_sum_div_dimshuffle(node):
'''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis=l) / b,
'''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis={...}) / b,
if dimension l of the DimShuffle is 'x'.'''
# TODO: extend it to product, and quotient of products
if isinstance(node.op, T.Sum):
axis = node.op.axis
if axis is None:
axis = range(node.inputs[0].ndim)
#print 'axis =', axis
thing_summed = node.inputs[0]
dimshuffled = None
if thing_summed.owner and thing_summed.owner.op == T.true_div:
numerator, denominator = thing_summed.owner.inputs
if isinstance(numerator.owner.op, T.DimShuffle):
if numerator.owner and isinstance(numerator.owner.op, T.DimShuffle):
new_order = numerator.owner.op.new_order
#print 'new_order =', new_order
# check compatibility
......@@ -1630,7 +1642,7 @@ def local_sum_div_dimshuffle(node):
#else:
# print 'incompatible dims:', axis, new_order
if isinstance(denominator.owner.op, T.DimShuffle):
if denominator.owner and isinstance(denominator.owner.op, T.DimShuffle):
new_order = denominator.owner.op.new_order
#print 'new_order =', new_order
# check compatibility
......@@ -1827,9 +1839,31 @@ def local_pow_specialize(node):
rval = [T.inv(xsym)]
if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))]
if rval:
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
# Optimize all integral powers in [-RANGE, RANGE]
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu!
@register_specialize_device
@gof.local_optimizer([T.pow])
def local_pow_specialize_device(node):
"""
This optimization is not the same on all devices. We do it only on the cpu here.
"""
if node.op == T.pow:
#the idea here is that we have pow(x, y)
odtype = node.outputs[0].dtype
xsym = node.inputs[0]
ysym = node.inputs[1]
y = local_mul_canonizer.get_constant(ysym)
if (y is not None) \
and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
rval = None
# 512 is too small for the cpu and too big for some gpu!
if abs(y)==int(abs(y)) and abs(y) <= 512:
pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y)
......@@ -1859,14 +1893,7 @@ def local_pow_specialize(node):
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
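local_pow_specialize_device expands pow(x, k) for a constant integer |k| <= 512 into a chain of multiplications built by repeated squaring (the pow2/pow2_scal lists above), using O(log k) muls instead of a pow call; registering it under 'specialize_device' (priority 48.6) lets the cpu and gpu make different choices. The underlying idea in plain Python (a sketch, not the optimizer itself):

def pow_by_squaring(x, k):
    """x**k for integer k >= 1 using O(log k) multiplications."""
    result = None
    square = x
    while k:
        if k & 1:
            result = square if result is None else result * square
        square = square * square
        k >>= 1
    return result

assert pow_by_squaring(3, 13) == 3 ** 13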
@gof.local_optimizer([T.mul])
def local_mul_specialize(node):
"""Remove special-case constants from mul arguments
......@@ -1965,20 +1992,28 @@ register_specialize(local_add_specialize)
mul_canonizer = in2out(gof.LocalOptGroup(local_mul_canonizer, local_fill_cut, local_fill_sink))
def check_for_x_over_absX(numerators, denominators):
"""Convert x/abs(x) into sign(x). """
# TODO: this function should dig/search through dimshuffles
# This won't catch a dimshuffled absolute value
for den in list(denominators):
if den.owner and den.owner.op == T.abs_ and den.owner.inputs[0] in numerators:
denominators.remove(den)
numerators.remove(den.owner.inputs[0])
numerators.append(T.sgn(den.owner.inputs[0]))
if den.owner.inputs[0].type.dtype.startswith('complex'):
#TODO: Make an Op that projects a complex number to have unit length
# but projects 0 to 0. That would be a weird Op, but consistent with the
# special case below. I heard there's some convention in Matlab that is
# similar to this... but not sure.
pass
else:
denominators.remove(den)
numerators.remove(den.owner.inputs[0])
numerators.append(T.sgn(den.owner.inputs[0]))
return numerators, denominators
local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'teststest')
@register_stabilize
@gof.local_optimizer([T.log])
def local_log1p(node):
# log(1+exp(x)) -> log1p(x)
# log(1+x) -> log1p(x)
if node.op == T.log:
log_arg, = node.inputs
if log_arg.owner and log_arg.owner.op == T.add:
......@@ -2207,7 +2242,7 @@ def local_elemwise_fusion_op(OP):
"""
def local_fuse(node):
"""
As part of specialisation, we fusion two consecutif elemwise op of the same shape.
As part of specialisation, we fuse two consecutive elemwise ops of the same shape.
For mixed dtypes, we let the Composite op do the cast; this lets the C compiler do the cast.
The number of dimension is validated at call time by theano itself.
......@@ -2240,7 +2275,7 @@ def local_elemwise_fusion_op(OP):
for i in node.inputs:
do_fusion = False
catch = False
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)<=1:
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)==1:
#if the scalar_op doesn't have a c implementation, we skip its fusion to allow the fusion of the other ops.
do_fusion=True
try:
......@@ -2296,7 +2331,7 @@ def local_elemwise_fusion_op(OP):
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
if True:
if OP != T.Elemwise:
argument_limit = 240 # 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8
int_size = 8
......
......@@ -1604,6 +1604,54 @@ class t_dot(unittest.TestCase):
#utt.verify_grad(dot, [self.rand(), self.rand(2)])
#utt.verify_grad(dot, [self.rand(), self.rand(2,5)])
def test_broadcastable_patterns(self):
#
# These examples should all work because, broadcastable or not, all dimensions of all
# results have size 1.
#
def val_for(r):
if r.ndim == 0:
return numpy.asarray(1.1, dtype=r.dtype)
if r.ndim == 1:
return numpy.asarray([1.2], dtype=r.dtype)
elif r.ndim == 2:
return numpy.asarray([[1.3]], dtype=r.dtype)
raise ValueError()
failures = []
for dtype0 in ('float32', 'float64', 'complex64', 'complex128'):
for dtype1 in ('float32', 'float64', 'complex64', 'complex128'):
for bc0 in ((True,), (False,), (True, True), (True, False), (False, True),
(False,False)):
for bc1 in ((True,), (False,), (True, True), (True, False), (False, True),
(False,False)):
x = TensorType(dtype=dtype0, broadcastable=bc0)()
y = TensorType(dtype=dtype1, broadcastable=bc1)()
z = dot(x,y)
t = TensorType(dtype=dtype0, broadcastable=z.broadcastable)()
rval = z * 3 + 2*t
if rval.type.dtype.startswith('complex'):
# there is a problem with complex numbers right now
# Elemwise code doesn't compile when both precisions of complex
# numbers are used in the same file because the operators
# aren't declared properly.
failures.append((dtype0, dtype1, bc0, bc1))
continue
f = function([x,y,t], rval)
xval = val_for(x)
yval = val_for(y)
tval = val_for(t)
f(xval, yval, tval) #debugmode checks result
#if failures:
#print failures
assert not failures
class T_tensorfromscalar(unittest.TestCase):
def test0(self):
s = scal.constant(56)
......
......@@ -416,7 +416,7 @@ def test_gemm_canonicalize():
can = []
_gemm_canonicalize(X + Y + u, 1.0, can, 0)
assert can == [(1.0, X), (1.0, Y), u]
assert can == [(1.0, X), (1.0, Y), u], can
can = []
_gemm_canonicalize(a*X + Y - b*Z*c, 1.0, can, 0)
......
from nose.plugins.skip import SkipTest
import unittest
import theano
import theano.tensor as T
from theano import function, shared
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
'''
Different tests that are not connected to any particular Op or functionality of
Theano. For example, code that we publish in papers goes here, so we can
ensure that it remains operational.
'''
class T_diverse(unittest.TestCase):
def setUp(self):
utt.seed_rng()
def scipy_paper_example1(self):
a = theano.tensor.vector('a') # declare variable
b = a + a**10 # build expression
f = theano.function([a], b) # compile function
assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026]))
def scipy_paper_example2(self):
''' This just sees if things compile well and if they run '''
rng = numpy.random.RandomState(42)  # rng was previously undefined; any fixed seed works
x = T.matrix()
y = T.vector()
w = shared(rng.randn(100))
b = shared(numpy.zeros(()))
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
# Compile expressions to functions
train = function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb})
predict = function(inputs=[x], outputs=prediction)
N = 4
feats = 100
D = (rng.randn(N, feats), rng.randint(size=4,low=0, high=2))
training_steps = 10
for i in range(training_steps):
pred, err = train(D[0], D[1])
if __name__ == '__main__':
unittest.main()