提交 70e25931 authored 作者: abergeron's avatar abergeron

Merge pull request #1756 from nouiz/mixed

Fix gpu crash and faster optimization
...@@ -18,7 +18,7 @@ those operations will run in parallel in Theano. ...@@ -18,7 +18,7 @@ those operations will run in parallel in Theano.
The most frequent way to control the number of threads used is via the The most frequent way to control the number of threads used is via the
``OMP_NUM_THREADS`` environment variable. Set it to the number of ``OMP_NUM_THREADS`` environment variable. Set it to the number of
threads you want to use before starting the python process. Some BLAS threads you want to use before starting the python process. Some BLAS
implementation support other environment variable. implementations support other environment variables.
Parallel element wise ops with OpenMP Parallel element wise ops with OpenMP
...@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP ...@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP
Because element wise ops work on every tensor entry independently they Because element wise ops work on every tensor entry independently they
can be easily parallelized using OpenMP. can be easily parallelized using OpenMP.
To use OpenMP you must set the ``openmp`` flag to ``True`` in Theano To use OpenMP you must set the ``openmp`` :ref:`flag <libdoc_config>`
configuration. to ``True``.
You can use the flag ``openmp_elemwise_minsize`` to set the minimum You can use the flag ``openmp_elemwise_minsize`` to set the minimum
tensor size for which the operation is parallelized because for short tensor size for which the operation is parallelized because for short
......
...@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer, ...@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer,
LocalOptimizer, local_optimizer, LocalOptGroup, LocalOptimizer, local_optimizer, LocalOptGroup,
OpSub, OpRemove, PatternSub, OpSub, OpRemove, PatternSub,
NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer, NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer,
InplaceOptimizer, PureThenInplaceOptimizer,
OpKeyOptimizer) OpKeyOptimizer)
from theano.gof.optdb import \ from theano.gof.optdb import \
......
...@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer): ...@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
return self.fn(*args, **kwargs) return self.fn(*args, **kwargs)
def __str__(self):
return self.__name__
def optimizer(f): def optimizer(f):
"""decorator for FromFunctionOptimizer""" """decorator for FromFunctionOptimizer"""
...@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer): ...@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer):
print >> stream, blanc, " replace_time", replace_time print >> stream, blanc, " replace_time", replace_time
print >> stream, blanc, " validate_time", validate_time print >> stream, blanc, " validate_time", validate_time
print >> stream, blanc, " callback_time", callback_time print >> stream, blanc, " callback_time", callback_time
print >> stream, blanc, " callback_times", callbacks_time print >> stream, blanc, " callbacks_time"
for i in sorted(callbacks_time.iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
print >> stream, blanc, " nb_merged", nb_merged print >> stream, blanc, " nb_merged", nb_merged
print >> stream, blanc, " nb_constant", nb_constant print >> stream, blanc, " nb_constant", nb_constant
...@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self, def __init__(self,
optimizers, optimizers,
failure_callback=None, failure_callback=None,
max_depth=None,
max_use_ratio=None): max_use_ratio=None):
""" """
:param optimizers: list or set of local or global optimizations to :param optimizers: list or set of local or global optimizations to
...@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
:param max_use_ratio: each optimizer can be applied at most :param max_use_ratio: each optimizer can be applied at most
(size of graph * this number) times (size of graph * this number) times
:param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
""" """
super(EquilibriumOptimizer, self).__init__( super(EquilibriumOptimizer, self).__init__(
...@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.local_optimizers_map.setdefault(c, []).append(opt) self.local_optimizers_map.setdefault(c, []).append(opt)
else: else:
self.global_optimizers.append(opt) self.global_optimizers.append(opt)
self.max_depth = max_depth
self.max_use_ratio = max_use_ratio self.max_use_ratio = max_use_ratio
assert self.max_use_ratio is not None, ( assert self.max_use_ratio is not None, (
'max_use_ratio has to be a number') 'max_use_ratio has to be a number')
...@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, opt) in count_opt[::-1]: for (t, count, opt) in count_opt[::-1]:
print >> stream, blanc, ' %.3fs - %d - %s' % ( print >> stream, blanc, ' %.3fs - %d - %s' % (
t, count, opt) t, count, opt)
print >> stream, blanc, ' %.3fs - in %d optimization that where not used' % ( print >> stream, blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)) not_used_time, len(not_used))
not_used.sort() not_used.sort()
for (t, opt) in not_used[::-1]: for (t, opt) in not_used[::-1]:
if t > 0:
# Skip opts with a runtime of 0; they probably weren't even tried.
print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt) print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt)
print >> stream print >> stream
...@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out): ...@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out):
final_outs, optimized_nodes = local_recursive_function( final_outs, optimized_nodes = local_recursive_function(
list_optimizations, out, {}, 0) list_optimizations, out, {}, 0)
return final_outs[out_index] return final_outs[out_index]
############
### Misc ###
############
class InplaceOptimizer(Optimizer):
def __init__(self, inplace):
self.inplace = inplace
def apply(self, fgraph):
self.inplace(fgraph)
def add_requirements(self, fgraph):
fgraph.attach_feature(dh.DestroyHandler())
class PureThenInplaceOptimizer(Optimizer):
def __init__(self, pure, inplace):
self.pure = pure
self.inplace = inplace
def apply(self, fgraph):
self.pure(fgraph)
fgraph.attach_feature(dh.DestroyHandler())
self.inplace(fgraph)
...@@ -194,7 +194,6 @@ class EquilibriumDB(DB): ...@@ -194,7 +194,6 @@ class EquilibriumDB(DB):
def query(self, *tags, **kwtags): def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags) opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts, return opt.EquilibriumOptimizer(opts,
max_depth=5,
max_use_ratio=config.optdb.max_use_ratio, max_use_ratio=config.optdb.max_use_ratio,
failure_callback=opt.NavigatorOptimizer.warn_inplace) failure_callback=opt.NavigatorOptimizer.warn_inplace)
......
...@@ -671,7 +671,7 @@ class GpuConv(GpuOp): ...@@ -671,7 +671,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 20) return (0, 21)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
......
...@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
(kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
//version 3 without split //version 3 without split
......
...@@ -14,7 +14,7 @@ import theano.ifelse ...@@ -14,7 +14,7 @@ import theano.ifelse
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler) Optimizer, toolbox)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
device_properties, gpu_eye, device_properties, gpu_eye,
...@@ -62,7 +62,7 @@ optdb.register('gpu_opt', ...@@ -62,7 +62,7 @@ optdb.register('gpu_opt',
# inside the elemwise. When there is no float64 op, this is working. # inside the elemwise. When there is no float64 op, this is working.
optdb.register('gpu_after_fusion', optdb.register('gpu_after_fusion',
ProxyDB(gpu_seqopt), ProxyDB(gpu_seqopt),
optdb.__position__.get('elemwise_fusion', 71) + .1, optdb.__position__.get('elemwise_fusion', 49) + .1,
'gpu') 'gpu')
...@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
for input in fgraph.inputs: for input in fgraph.inputs:
...@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( ...@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
max_inputs_to_GpuElemwise) max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion: if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
#Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
71.00, 'fast_run', 'fusion', 49, 'fast_run', 'fusion',
'local_elemwise_fusion', 'gpu') 'local_elemwise_fusion', 'gpu')
else: else:
_logger.debug(("not enabling optimization fusion of gpu elemwise in " _logger.debug(("not enabling optimization fusion of gpu elemwise in "
......
...@@ -679,6 +679,7 @@ def test_full(): ...@@ -679,6 +679,7 @@ def test_full():
#Test more than maxThreadsDim0 #Test more than maxThreadsDim0
, ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1)) , ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1)) , ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
] ]
# shapes=shapes[:277] # shapes=shapes[:277]
......
...@@ -5,7 +5,7 @@ from theano import tensor, scalar ...@@ -5,7 +5,7 @@ from theano import tensor, scalar
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
...@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
for input in fgraph.inputs: for input in fgraph.inputs:
......
...@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer): ...@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
......
...@@ -139,7 +139,7 @@ except ImportError: ...@@ -139,7 +139,7 @@ except ImportError:
pass pass
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, view_roots, DestroyHandler, from theano.gof import (utils, Op, view_roots,
local_optimizer, Optimizer, local_optimizer, Optimizer,
InconsistencyError, toolbox, SequenceDB, InconsistencyError, toolbox, SequenceDB,
EquilibriumOptimizer, Apply, EquilibriumOptimizer, Apply,
...@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer): ...@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
did_something = True did_something = True
...@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer): ...@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer):
time_factor_can = 0 time_factor_can = 0
time_factor_list = 0 time_factor_list = 0
time_toposort = 0 time_toposort = 0
if fgraph.profile:
validate_before = fgraph.profile.validate_time
callbacks_before = fgraph.execute_callbacks_times.copy()
callback_before = fgraph.execute_callbacks_time
class Updater:
def on_import(self, fgraph, new_node, reason):
if new_node is not node:
nodelist.append(new_node)
u = Updater()
fgraph.attach_feature(u)
while did_something: while did_something:
nb_iter += 1
t0 = time.time() t0 = time.time()
nodelist = list(fgraph.toposort()) nodelist = theano.gof.graph.io_toposort(fgraph.inputs, fgraph.outputs)
time_toposort += time.time() - t0 time_toposort += time.time() - t0
did_something = False did_something = False
nodelist.reverse() nodelist.reverse()
...@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer): ...@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer):
except ReplacementDidntRemovedError, e: except ReplacementDidntRemovedError, e:
nb_replacement_didn_t_remove += 1 nb_replacement_didn_t_remove += 1
self.warned = True self.warned = True
nb_iter += 1 fgraph.remove_feature(u)
if fgraph.profile:
validate_time = fgraph.profile.validate_time - validate_before
callback_time = fgraph.execute_callbacks_time - callback_before
callbacks_time = {}
for k, v in fgraph.execute_callbacks_times.iteritems():
if k in callbacks_before:
callbacks_time[k] = v - callbacks_before[k]
else:
callbacks_time[k] = v
else:
validate_time = None
callback_time = None
callbacks_time = {}
return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove, return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove,
nb_inconsistency_make, nb_inconsistency_replace, nb_inconsistency_make, nb_inconsistency_replace,
time_canonicalize, time_factor_can, time_canonicalize, time_factor_can,
time_factor_list, time_toposort) time_factor_list, time_toposort,
validate_time, callback_time, callbacks_time,)
@staticmethod @staticmethod
def print_profile(stream, prof, level=0): def print_profile(stream, prof, level=0):
blanc = (' ' * level) blanc = (' ' * level)
#1946.912556s - ('gemm_optimizer', 'GemmOptimizer', 1)
print >> stream, blanc, "GemmOptimizer" print >> stream, blanc, "GemmOptimizer"
print >> stream, blanc, " nb_iter", prof[1] print >> stream, blanc, " nb_iter", prof[1]
print >> stream, blanc, " nb_replacement", prof[2] print >> stream, blanc, " nb_replacement", prof[2]
...@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer): ...@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer):
print >> stream, blanc, " time_factor_can", prof[7] print >> stream, blanc, " time_factor_can", prof[7]
print >> stream, blanc, " time_factor_list", prof[8] print >> stream, blanc, " time_factor_list", prof[8]
print >> stream, blanc, " time_toposort", prof[9] print >> stream, blanc, " time_toposort", prof[9]
print >> stream, blanc, " validate_time", prof[10]
print >> stream, blanc, " callback_time", prof[11]
print >> stream, blanc, " callbacks_time"
for i in sorted(prof[12].iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
class Dot22(GemmRelated): class Dot22(GemmRelated):
...@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv', ...@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv',
15, 'fast_run') 15, 'fast_run')
# After destroyhandler is in but before we try to make elemwise things inplace # After destroyhandler(49.5) but before we try to make elemwise things
# Try to make gemm inplace # inplace (75)
# Also, need to make the gemm optimisation(step 70) happen before the
# fusion of elemwise(step 71)
blas_opt_inplace = in2out(local_inplace_gemm, blas_opt_inplace = in2out(local_inplace_gemm,
local_inplace_gemv, local_inplace_gemv,
local_inplace_ger, local_inplace_ger,
name="blas_opt_inplace") name="blas_opt_inplace")
optdb.register('InplaceBlasOpt', optdb.register('InplaceBlasOpt',
blas_opt_inplace, blas_opt_inplace,
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace', 'blas_opt_inplace')
class Dot22Scalar(GemmRelated): class Dot22Scalar(GemmRelated):
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论