Commit fe55d742 authored by Frederic Bastien

merge

@@ -188,9 +188,22 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
         update_val = store_into.filter_update(update_val)  # typically this might be a cast()
         if update_val.type != store_into.type:
-            raise TypeError('an update must have the same type as the original shared variable(dest, dest.type, update_val, update_val.type',
+            # the number of clients doesn't seem to be defined
+            def get_first_node(node, dtype):
+                if node is None:
+                    return None
+                if any([o.dtype != dtype for o in node.outputs]):
+                    for i in node.inputs:
+                        n = get_first_node(i.owner, dtype)
+                        if n is not None:
+                            return n
+                    return node  # no parent generated a different type
+                else:
+                    return None
+            raise TypeError('an update must have the same type as the original shared variable (dest, dest.type, update_val, update_val.type)',
                             (store_into, store_into.type,
-                             update_val, update_val.type))
+                             update_val, update_val.type),
+                            'Here is the first node we found that generated a type that is not the same as the output wanted.',
+                            get_first_node(update_val.owner, store_into.dtype))
         update_d[store_into] = update_val
         update_expr.append((store_into, update_val))
...
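In plain terms, this check fires when an update expression ends up with a different type than the shared variable it writes into, most commonly a float32 shared variable whose update was silently upcast to float64; the new get_first_node helper walks the graph upward to report the first node that produced the unwanted dtype. Below is a minimal sketch of triggering and fixing the error, assuming Theano's standard shared/function API; the variable names are illustrative:

    import numpy
    import theano
    import theano.tensor as T

    # A float32 shared variable that we will update in place.
    w = theano.shared(numpy.zeros(3, dtype='float32'), name='w')

    lr = T.dscalar('lr')      # a float64 scalar input
    bad_update = w - lr * w   # float32 * float64 is upcast to float64

    # This would raise the TypeError above: the update is float64, w is float32.
    # f = theano.function([lr], [], updates=[(w, bad_update)])

    # Casting the update back to the shared variable's dtype fixes it.
    good_update = T.cast(w - lr * w, 'float32')
    f = theano.function([lr], [], updates=[(w, good_update)])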
@@ -343,6 +343,74 @@ class ProfileMode(Mode):
             if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
                 print fct.name, i.name, i.type, i
+        print
+        print "Here are some tips that may make your code faster. If you think of a new one, suggest it on the mailing list. Test them before use, as they are not guaranteed to always give a speedup."
+        from theano import tensor as T
+        from theano.tensor.raw_random import RandomFunction
+        import theano
+        import theano.scalar as scal
+        scalar_op_amdlibm_no_speed_up = [
+            scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ,
+            scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND,
+            scal.Invert, scal.Maximum, scal.Minimum, scal.Add, scal.Mul,
+            scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First,
+            scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg,
+            scal.Inv, scal.Sqr]
+        scalar_op_amdlibm_speed_up = [
+            scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven,
+            scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10,
+            scal.Log1p, scal.Exp, scal.Sqrt, scal.Abs, scal.Cos, scal.Sin,
+            scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh,
+            T.nnet.sigm.ScalarSigmoid,
+            T.nnet.sigm.ScalarSoftplus]  # Abs and Mod in float{32,64} only
+        def get_scalar_ops(s):
+            # Recursively collect the scalar ops inside a Composite.
+            if isinstance(s, theano.scalar.Composite):
+                l = []
+                for node in s.env.toposort():
+                    l += get_scalar_ops(node.op)
+                return l
+            else:
+                return [s]
+        def list_scalar_op(op):
+            if isinstance(op.scalar_op, theano.scalar.Composite):
+                return get_scalar_ops(op.scalar_op)
+            else:
+                return [op.scalar_op]
+        def amdlibm_speed_up(op):
+            if not isinstance(op, T.Elemwise):
+                return False
+            else:
+                l = list_scalar_op(op)
+                for s_op in l:
+                    if s_op.__class__ in scalar_op_amdlibm_speed_up:
+                        return True
+                    elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
+                        print "We don't know if amdlibm will accelerate this scalar op.", s_op
+                return False
+        def exp_float32_op(op):
+            if not isinstance(op, T.Elemwise):
+                return False
+            else:
+                l = list_scalar_op(op)
+                return any([s_op.__class__ in [scal.Exp] for s_op in l])
+        # tip 1
+        if config.floatX == 'float64':
+            print "  - Try the Theano flag floatX=float32"
+        # tip 2
+        if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for i, a in apply_time]):
+            print "  - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speeds up only some Elemwise operations."
+        # tip 3
+        if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype == 'float32' for i, a in apply_time]):
+            print "  - With the default gcc libm, exp in float32 is slower than in float64! Try the Theano flag floatX=float64, or install amdlibm and set the Theano flag lib.amdlibm=True."
+        # tip 4
+        for a, t in apply_time.iteritems():
+            node = a[1]
+            if isinstance(node.op, T.Dot) and all([len(i.type.broadcastable) == 2 for i in node.inputs]):
+                print "  - You have a dot operation that was not optimized to the faster dot22. Make sure both inputs are float32 or float64 and have the same dtype. Currently they are:", [i.type for i in node.inputs]
+        # tip 5
+        for a, t in apply_time.iteritems():
+            node = a[1]
+            if isinstance(node.op, RandomFunction):
+                print "  - Replace the default random number generator with 'from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams', as it is faster. It is still experimental, but seems to work correctly."
+                if config.device.startswith("gpu"):
+                    print "  - MRG_RandomStreams is the only random number generator supported on the GPU."
+                break
 register_mode('PROFILE_MODE', ProfileMode())
 #needed to print the profile at the end automatically
...
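These tips surface automatically when a function is run under the profiling mode registered above, since the profile is printed at program exit. A rough usage sketch, assuming the 'PROFILE_MODE' registration from this file and the MRG generator suggested in tip 5; the function and variable names are illustrative:

    # Run e.g. as: THEANO_FLAGS=floatX=float32 python script.py   (tip 1)
    import numpy
    import theano
    import theano.tensor as T

    # Tip 5: a one-line swap to the faster MRG generator.
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    rng = RandomStreams(seed=1234)
    x = T.matrix('x')
    y = T.exp(x) + rng.uniform(size=(3, 3))

    # 'PROFILE_MODE' is the mode registered above; the profile, including
    # the tips, is printed automatically when the program exits.
    f = theano.function([x], y, mode='PROFILE_MODE')
    f(numpy.ones((3, 3), dtype=theano.config.floatX))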
@@ -1032,13 +1032,10 @@ class GpuSum(Op):
                      std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                               NUM_VECTOR_OP_THREADS_PER_BLOCK));
             dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
-            while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS)
+            while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2])
             {
-                if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
-                    break;
                 n_blocks.y += 1;
             }
-            n_blocks.y -= 1;
             %(makecall)s
         }
         """ % locals()
...
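The rewritten loop folds the old break test and trailing decrement into the loop condition: n_blocks.y now only grows while the grown grid still fits within NUM_VECTOR_OP_BLOCKS, so it is never decremented below 1. A small Python simulation of the two forms, not part of the commit, with hypothetical stand-in values for the runtime block budget and CudaNdarray_HOST_DIMS(x)[2]:

    def old_grid_y(x_blocks, max_blocks, dim2):
        y = 1                       # dim3 initializes .y to 1
        while x_blocks * y <= max_blocks:
            if y > dim2:
                break
            y += 1
        return y - 1                # overshoots, then steps back

    def new_grid_y(x_blocks, max_blocks, dim2):
        y = 1
        while x_blocks * (y + 1) <= max_blocks and y <= dim2:
            y += 1                  # grow only while the grown grid still fits
        return y

    # When the x dimension alone exceeds the block budget, the old loop body
    # never runs and the trailing decrement yields an invalid grid of y == 0;
    # the new form never drops below 1.
    assert old_grid_y(5000, 4096, 8) == 0
    assert new_grid_y(5000, 4096, 8) == 1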