提交 f92215df authored 作者: Nicholas Leonard's avatar Nicholas Leonard

Merge branch 'master' of https://github.com/Theano/Theano

.. _install: .. _install:
...@@ -130,20 +129,11 @@ by typing ...@@ -130,20 +129,11 @@ by typing
You may need to add ``sudo`` before this command to install into your You may need to add ``sudo`` before this command to install into your
system's ``site-packages`` directory. If you do not have administrator access system's ``site-packages`` directory. If you do not have administrator access
to your machine, you can install to an alternate prefix using to your machine, you can install Theano locally (to ~/.local) using
.. code-block:: bash .. code-block:: bash
pip install Theano --install-option='--prefix=~/.local' pip install Theano --user
e.g. using ``--install-option='--prefix=~/.local'`` on Python 2.4 would
install Theano into ``.local/lib/python2.4/site-packages`` inside your home
directory on Mac OS X or Unix/Linux (this ``site-packages`` directory must be
listed in your ``PYTHONPATH`` environment variable; for Python 2.6 and later,
``~/.local`` is
automatically searched and does *not* need to be explicitly included in
``PYTHONPATH``, see :ref:`config_pythonpath` for instructions).
You can change ``~/.local``, but you need to change your ``PYTHONPATH`` as said above.
Alternatively you can use virtualenv_ to create an isolated ``site-packages`` Alternatively you can use virtualenv_ to create an isolated ``site-packages``
directory; see the `virtualenv documentation`_ for details. directory; see the `virtualenv documentation`_ for details.
...@@ -225,7 +215,7 @@ or (if you want to install it for the current user only): ...@@ -225,7 +215,7 @@ or (if you want to install it for the current user only):
.. code-block:: bash .. code-block:: bash
pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --install-option='--prefix=~/.local' pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --user
The following are general instructions that will set you up with the The following are general instructions that will set you up with the
bleeding-edge version of Theano and allow you to hack it. First, bleeding-edge version of Theano and allow you to hack it. First,
......
...@@ -18,7 +18,7 @@ those operations will run in parallel in Theano. ...@@ -18,7 +18,7 @@ those operations will run in parallel in Theano.
The most frequent way to control the number of threads used is via the The most frequent way to control the number of threads used is via the
``OMP_NUM_THREADS`` environment variable. Set it to the number of ``OMP_NUM_THREADS`` environment variable. Set it to the number of
threads you want to use before starting the python process. Some BLAS threads you want to use before starting the python process. Some BLAS
implementation support other environment variable. implementations support other environment variables.
Parallel element wise ops with OpenMP Parallel element wise ops with OpenMP
...@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP ...@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP
Because element wise ops work on every tensor entry independently they Because element wise ops work on every tensor entry independently they
can be easily parallelized using OpenMP. can be easily parallelized using OpenMP.
To use OpenMP you must set the ``openmp`` flag to ``True`` in Theano To use OpenMP you must set the ``openmp`` :ref:`flag <libdoc_config>`
configuration. to ``True``.
You can use the flag ``openmp_elemwise_minsize`` to set the minimum You can use the flag ``openmp_elemwise_minsize`` to set the minimum
tensor size for which the operation is parallelized because for short tensor size for which the operation is parallelized because for short
......
import theano
from theano import gof from theano import gof
from theano import gradient as G
from theano.compile.function_module import orig_function from theano.compile.function_module import orig_function
from theano.compile import SharedVariable, rebuild_collect_shared from theano.compile import SharedVariable, rebuild_collect_shared
from theano.gof import ops_with_inner_function from theano.gof import ops_with_inner_function
...@@ -142,7 +142,7 @@ class OpFromGraph(gof.Op): ...@@ -142,7 +142,7 @@ class OpFromGraph(gof.Op):
if hasattr(self, "grad_ops"): if hasattr(self, "grad_ops"):
grad_ops = self.grad_ops grad_ops = self.grad_ops
else: else:
gs = G.grad(cost=None, gs = theano.gradient.grad(cost=None,
known_grads=dict(zip(self.new_outputs, output_grads)), known_grads=dict(zip(self.new_outputs, output_grads)),
wrt=self.new_inputs, wrt=self.new_inputs,
disconnected_inputs='ignore') disconnected_inputs='ignore')
......
...@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer, ...@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer,
LocalOptimizer, local_optimizer, LocalOptGroup, LocalOptimizer, local_optimizer, LocalOptGroup,
OpSub, OpRemove, PatternSub, OpSub, OpRemove, PatternSub,
NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer, NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer,
InplaceOptimizer, PureThenInplaceOptimizer,
OpKeyOptimizer) OpKeyOptimizer)
from theano.gof.optdb import \ from theano.gof.optdb import \
......
...@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1): ...@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
my_pid = os.getpid() my_pid = os.getpid()
no_display = (verbosity == 0) no_display = (verbosity == 0)
# Acquire lock.
nb_error = 0 nb_error = 0
# The number of times we have waited when there were no errors.
# Used to skip the message on the first wait, so it is displayed
# less frequently (and generates fewer emails about it).
nb_wait = 0
# Acquire lock.
while True: while True:
try: try:
last_owner = 'no_owner' last_owner = 'no_owner'
...@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1): ...@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
last_owner = read_owner last_owner = read_owner
time_start = time.time() time_start = time.time()
no_display = (verbosity == 0) no_display = (verbosity == 0)
if not no_display: if not no_display and nb_wait > 0:
if read_owner == 'failure': if read_owner == 'failure':
msg = 'unknown process' msg = 'unknown process'
else: else:
...@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1): ...@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
tmp_dir) tmp_dir)
if verbosity <= 1: if verbosity <= 1:
no_display = True no_display = True
nb_wait += 1
time.sleep(random.uniform(min_wait, max_wait)) time.sleep(random.uniform(min_wait, max_wait))
try: try:
......
...@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer): ...@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
return self.fn(*args, **kwargs) return self.fn(*args, **kwargs)
def __str__(self):
return self.__name__
def optimizer(f): def optimizer(f):
"""decorator for FromFunctionOptimizer""" """decorator for FromFunctionOptimizer"""
...@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer): ...@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer):
print >> stream, blanc, " replace_time", replace_time print >> stream, blanc, " replace_time", replace_time
print >> stream, blanc, " validate_time", validate_time print >> stream, blanc, " validate_time", validate_time
print >> stream, blanc, " callback_time", callback_time print >> stream, blanc, " callback_time", callback_time
print >> stream, blanc, " callback_times", callbacks_time print >> stream, blanc, " callbacks_time"
for i in sorted(callbacks_time.iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
print >> stream, blanc, " nb_merged", nb_merged print >> stream, blanc, " nb_merged", nb_merged
print >> stream, blanc, " nb_constant", nb_constant print >> stream, blanc, " nb_constant", nb_constant
...@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self, def __init__(self,
optimizers, optimizers,
failure_callback=None, failure_callback=None,
max_depth=None,
max_use_ratio=None): max_use_ratio=None):
""" """
:param optimizers: list or set of local or global optimizations to :param optimizers: list or set of local or global optimizations to
...@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
:param max_use_ratio: each optimizer can be applied at most :param max_use_ratio: each optimizer can be applied at most
(size of graph * this number) times (size of graph * this number) times
:param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
""" """
super(EquilibriumOptimizer, self).__init__( super(EquilibriumOptimizer, self).__init__(
...@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.local_optimizers_map.setdefault(c, []).append(opt) self.local_optimizers_map.setdefault(c, []).append(opt)
else: else:
self.global_optimizers.append(opt) self.global_optimizers.append(opt)
self.max_depth = max_depth
self.max_use_ratio = max_use_ratio self.max_use_ratio = max_use_ratio
assert self.max_use_ratio is not None, ( assert self.max_use_ratio is not None, (
'max_use_ratio has to be a number') 'max_use_ratio has to be a number')
...@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, opt) in count_opt[::-1]: for (t, count, opt) in count_opt[::-1]:
print >> stream, blanc, ' %.3fs - %d - %s' % ( print >> stream, blanc, ' %.3fs - %d - %s' % (
t, count, opt) t, count, opt)
print >> stream, blanc, ' %.3fs - in %d optimization that where not used' % ( print >> stream, blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)) not_used_time, len(not_used))
not_used.sort() not_used.sort()
for (t, opt) in not_used[::-1]: for (t, opt) in not_used[::-1]:
if t > 0:
# Skip optimizations with zero recorded time; they probably weren't even tried.
print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt) print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt)
print >> stream print >> stream
...@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out): ...@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out):
final_outs, optimized_nodes = local_recursive_function( final_outs, optimized_nodes = local_recursive_function(
list_optimizations, out, {}, 0) list_optimizations, out, {}, 0)
return final_outs[out_index] return final_outs[out_index]
############
### Misc ###
############
class InplaceOptimizer(Optimizer):
def __init__(self, inplace):
self.inplace = inplace
def apply(self, fgraph):
self.inplace(fgraph)
def add_requirements(self, fgraph):
fgraph.attach_feature(dh.DestroyHandler())
class PureThenInplaceOptimizer(Optimizer):
def __init__(self, pure, inplace):
self.pure = pure
self.inplace = inplace
def apply(self, fgraph):
self.pure(fgraph)
fgraph.attach_feature(dh.DestroyHandler())
self.inplace(fgraph)
...@@ -194,7 +194,6 @@ class EquilibriumDB(DB): ...@@ -194,7 +194,6 @@ class EquilibriumDB(DB):
def query(self, *tags, **kwtags): def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags) opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts, return opt.EquilibriumOptimizer(opts,
max_depth=5,
max_use_ratio=config.optdb.max_use_ratio, max_use_ratio=config.optdb.max_use_ratio,
failure_callback=opt.NavigatorOptimizer.warn_inplace) failure_callback=opt.NavigatorOptimizer.warn_inplace)
......
...@@ -23,6 +23,7 @@ from theano.gof import Variable ...@@ -23,6 +23,7 @@ from theano.gof import Variable
from theano.gof.python25 import OrderedDict from theano.gof.python25 import OrderedDict
from theano.gof.null_type import NullType from theano.gof.null_type import NullType
from theano.gof.op import get_debug_values from theano.gof.op import get_debug_values
from theano.compile import ViewOp
# we can't do "import theano.tensor" # we can't do "import theano.tensor"
# tensor depends on theano.compile # tensor depends on theano.compile
...@@ -1788,3 +1789,29 @@ def _is_zero(x): ...@@ -1788,3 +1789,29 @@ def _is_zero(x):
return 'no' return 'no'
return 'yes' return 'yes'
class ConsiderConstant(ViewOp):
def grad(self, args, g_outs):
return [g_out.zeros_like(g_out) for g_out in g_outs]
consider_constant_ = ConsiderConstant()
# A wrapper function is defined only so the documentation renders well.
def consider_constant(x):
""" Consider an expression constant when computing gradients.
The expression itself is unaffected, but when its gradient is
computed, or the gradient of another expression that this
expression is a subexpression of, it will not be backpropagated
through. In other words, the gradient of the expression is
truncated to 0.
:param x: A Theano expression whose gradient should be truncated.
:return: The expression is returned unmodified, but its gradient
is now truncated to 0.
.. versionadded:: 0.6.1
"""
return consider_constant_(x)
...@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp): ...@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1; n_threads.z += 1;
else else
break; break;
}""" % locals() }
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2: if len(self.reduce_mask) == 2:
threads_y = '' threads_y = ''
...@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp): ...@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1; n_threads.z += 1;
} }
n_threads.z -= 1; n_threads.z -= 1;
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1); dim3 n_blocks(1,1,1);
%(makecall)s %(makecall)s
......
...@@ -671,7 +671,7 @@ class GpuConv(GpuOp): ...@@ -671,7 +671,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 20) return (0, 21)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
......
...@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
(kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
//version 3 without split //version 3 without split
......
...@@ -14,7 +14,7 @@ import theano.ifelse ...@@ -14,7 +14,7 @@ import theano.ifelse
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler) Optimizer, toolbox)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
device_properties, gpu_eye, device_properties, gpu_eye,
...@@ -62,7 +62,7 @@ optdb.register('gpu_opt', ...@@ -62,7 +62,7 @@ optdb.register('gpu_opt',
# inside the elemwise. When there is no float64 op, this is working. # inside the elemwise. When there is no float64 op, this is working.
optdb.register('gpu_after_fusion', optdb.register('gpu_after_fusion',
ProxyDB(gpu_seqopt), ProxyDB(gpu_seqopt),
optdb.__position__.get('elemwise_fusion', 71) + .1, optdb.__position__.get('elemwise_fusion', 49) + .1,
'gpu') 'gpu')
...@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
for input in fgraph.inputs: for input in fgraph.inputs:
...@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( ...@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
max_inputs_to_GpuElemwise) max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion: if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
#Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
71.00, 'fast_run', 'fusion', 49, 'fast_run', 'fusion',
'local_elemwise_fusion', 'gpu') 'local_elemwise_fusion', 'gpu')
else: else:
_logger.debug(("not enabling optimization fusion of gpu elemwise in " _logger.debug(("not enabling optimization fusion of gpu elemwise in "
......
...@@ -109,11 +109,13 @@ def test_careduce(): ...@@ -109,11 +109,13 @@ def test_careduce():
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011 ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111 ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011 ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101 ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011 ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111 ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111 ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
......
...@@ -679,6 +679,7 @@ def test_full(): ...@@ -679,6 +679,7 @@ def test_full():
#Test more than maxThreadsDim0 #Test more than maxThreadsDim0
, ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1)) , ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1)) , ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
] ]
# shapes=shapes[:277] # shapes=shapes[:277]
......
...@@ -61,7 +61,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -61,7 +61,7 @@ class GpuGemv(BlasOp, Gemv):
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(A)s, %(x)s, %(A)s, %(x)s,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0], ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
%(out)s) == NULL) { %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars """ % vars
...@@ -72,7 +72,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -72,7 +72,7 @@ class GpuGemv(BlasOp, Gemv):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
gpugemv_no_inplace = GpuGemv(inplace=False) gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True) gpugemv_inplace = GpuGemv(inplace=True)
...@@ -117,7 +117,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -117,7 +117,7 @@ class GpuGemm(BlasOp, Gemm):
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(A)s, %(B)s, %(A)s, %(B)s,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0], ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
%(out)s) == NULL) { %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars """ % vars
...@@ -128,7 +128,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -128,7 +128,7 @@ class GpuGemm(BlasOp, Gemm):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
...@@ -176,7 +176,7 @@ class GpuDot22(BlasOp, Dot22): ...@@ -176,7 +176,7 @@ class GpuDot22(BlasOp, Dot22):
one, one,
%(A)s, %(B)s, %(A)s, %(B)s,
zero, zero,
%(out)s) == NULL) { %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars """ % vars
...@@ -187,7 +187,7 @@ class GpuDot22(BlasOp, Dot22): ...@@ -187,7 +187,7 @@ class GpuDot22(BlasOp, Dot22):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
def c_headers(self): def c_headers(self):
ret = super(GpuDot22, self).c_headers() ret = super(GpuDot22, self).c_headers()
......
...@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce): ...@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1; n_threads.z += 1;
else else
break; break;
}""" % locals() }
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2: if len(self.reduce_mask) == 2:
threads_y = '' threads_y = ''
...@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce): ...@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1; n_threads.z += 1;
} }
n_threads.z -= 1; n_threads.z -= 1;
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1); dim3 n_blocks(1,1,1);
%(makecall)s %(makecall)s
......
...@@ -5,7 +5,7 @@ from theano import tensor, scalar ...@@ -5,7 +5,7 @@ from theano import tensor, scalar
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
...@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
for input in fgraph.inputs: for input in fgraph.inputs:
......
import unittest
from theano import scalar, gof from theano import scalar, gof
from theano.gof import FunctionGraph
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle, from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce) test_CAReduce)
...@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011 ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111 ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011 ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101 ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011 ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111 ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111 ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
#test pattern implemented by reshape #test pattern implemented by reshape
......
...@@ -28,16 +28,8 @@ if cuda_available: ...@@ -28,16 +28,8 @@ if cuda_available:
def matVecModM(A, s, m): def matVecModM(A, s, m):
# return (A * s) % m assert A.dtype == 'int64'
x = numpy.zeros_like(s) return numpy.int32(numpy.sum((A*s) % m, 1) % m)
for i in xrange(len(x)):
for j in xrange(len(s)):
r = numpy.int32((numpy.int64(A[i][j]) * s[j] + x[i]) % m)
if r >= 0:
x[i] = r
else:
x[i] = r + m
return x
def multMatVect(v, A, m1, B, m2): def multMatVect(v, A, m1, B, m2):
...@@ -63,24 +55,30 @@ MASK2 = numpy.int32(65535) #2^16 - 1 ...@@ -63,24 +55,30 @@ MASK2 = numpy.int32(65535) #2^16 - 1
MULT2 = numpy.int32(21069) MULT2 = numpy.int32(21069)
NORM = 4.656612873077392578125e-10; #1./2^31 NORM = 4.656612873077392578125e-10; #1./2^31
A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]]) #A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]],
A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]]) # dtype='int64')
#A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]],
# dtype='int64')
A1p72 = numpy.asarray([[1516919229, 758510237, 499121365], A1p72 = numpy.asarray([[1516919229, 758510237, 499121365],
[1884998244, 1516919229, 335398200], [1884998244, 1516919229, 335398200],
[601897748, 1884998244, 358115744]]) [601897748, 1884998244, 358115744]],
dtype='int64')
A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935], A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935],
[1133297478, 1407477216, 1496414766], [1133297478, 1407477216, 1496414766],
[2002613992, 1639496704, 1407477216]]) [2002613992, 1639496704, 1407477216]],
dtype='int64')
A1p134 = numpy.asarray( A1p134 = numpy.asarray(
[[1702500920, 1849582496, 1656874625], [[1702500920, 1849582496, 1656874625],
[828554832, 1702500920, 1512419905], [828554832, 1702500920, 1512419905],
[1143731069, 828554832, 102237247]]) [1143731069, 828554832, 102237247]],
dtype='int64')
A2p134 = numpy.asarray( A2p134 = numpy.asarray(
[[796789021, 1464208080, 607337906], [[796789021, 1464208080, 607337906],
[1241679051, 1431130166, 1464208080], [1241679051, 1431130166, 1464208080],
[1401213391, 1178684362, 1431130166]]) [1401213391, 1178684362, 1431130166]],
dtype='int64')
np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)] np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
......
...@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer): ...@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
......
...@@ -58,7 +58,7 @@ def shared(*args, **kw): ...@@ -58,7 +58,7 @@ def shared(*args, **kw):
from theano.tensor import nnet # used for softmax, sigmoid, etc. from theano.tensor import nnet # used for softmax, sigmoid, etc.
from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \ from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
jacobian, hessian jacobian, hessian, consider_constant
from theano.tensor.sort import sort, argsort from theano.tensor.sort import sort, argsort
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze, from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
......
...@@ -139,7 +139,7 @@ except ImportError: ...@@ -139,7 +139,7 @@ except ImportError:
pass pass
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, view_roots, DestroyHandler, from theano.gof import (utils, Op, view_roots,
local_optimizer, Optimizer, local_optimizer, Optimizer,
InconsistencyError, toolbox, SequenceDB, InconsistencyError, toolbox, SequenceDB,
EquilibriumOptimizer, Apply, EquilibriumOptimizer, Apply,
...@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer): ...@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
did_something = True did_something = True
...@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer): ...@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer):
time_factor_can = 0 time_factor_can = 0
time_factor_list = 0 time_factor_list = 0
time_toposort = 0 time_toposort = 0
if fgraph.profile:
validate_before = fgraph.profile.validate_time
callbacks_before = fgraph.execute_callbacks_times.copy()
callback_before = fgraph.execute_callbacks_time
class Updater:
def on_import(self, fgraph, new_node, reason):
if new_node is not node:
nodelist.append(new_node)
u = Updater()
fgraph.attach_feature(u)
while did_something: while did_something:
nb_iter += 1
t0 = time.time() t0 = time.time()
nodelist = list(fgraph.toposort()) nodelist = theano.gof.graph.io_toposort(fgraph.inputs, fgraph.outputs)
time_toposort += time.time() - t0 time_toposort += time.time() - t0
did_something = False did_something = False
nodelist.reverse() nodelist.reverse()
...@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer): ...@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer):
except ReplacementDidntRemovedError, e: except ReplacementDidntRemovedError, e:
nb_replacement_didn_t_remove += 1 nb_replacement_didn_t_remove += 1
self.warned = True self.warned = True
nb_iter += 1 fgraph.remove_feature(u)
if fgraph.profile:
validate_time = fgraph.profile.validate_time - validate_before
callback_time = fgraph.execute_callbacks_time - callback_before
callbacks_time = {}
for k, v in fgraph.execute_callbacks_times.iteritems():
if k in callbacks_before:
callbacks_time[k] = v - callbacks_before[k]
else:
callbacks_time[k] = v
else:
validate_time = None
callback_time = None
callbacks_time = {}
return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove, return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove,
nb_inconsistency_make, nb_inconsistency_replace, nb_inconsistency_make, nb_inconsistency_replace,
time_canonicalize, time_factor_can, time_canonicalize, time_factor_can,
time_factor_list, time_toposort) time_factor_list, time_toposort,
validate_time, callback_time, callbacks_time,)
@staticmethod @staticmethod
def print_profile(stream, prof, level=0): def print_profile(stream, prof, level=0):
blanc = (' ' * level) blanc = (' ' * level)
#1946.912556s - ('gemm_optimizer', 'GemmOptimizer', 1)
print >> stream, blanc, "GemmOptimizer" print >> stream, blanc, "GemmOptimizer"
print >> stream, blanc, " nb_iter", prof[1] print >> stream, blanc, " nb_iter", prof[1]
print >> stream, blanc, " nb_replacement", prof[2] print >> stream, blanc, " nb_replacement", prof[2]
...@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer): ...@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer):
print >> stream, blanc, " time_factor_can", prof[7] print >> stream, blanc, " time_factor_can", prof[7]
print >> stream, blanc, " time_factor_list", prof[8] print >> stream, blanc, " time_factor_list", prof[8]
print >> stream, blanc, " time_toposort", prof[9] print >> stream, blanc, " time_toposort", prof[9]
print >> stream, blanc, " validate_time", prof[10]
print >> stream, blanc, " callback_time", prof[11]
print >> stream, blanc, " callbacks_time"
for i in sorted(prof[12].iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
class Dot22(GemmRelated): class Dot22(GemmRelated):
...@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv', ...@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv',
15, 'fast_run') 15, 'fast_run')
# After destroyhandler is in but before we try to make elemwise things inplace # After destroyhandler(49.5) but before we try to make elemwise things
# Try to make gemm inplace # inplace (75)
# Also, need to make the gemm optimisation(step 70) happen before the
# fusion of elemwise(step 71)
blas_opt_inplace = in2out(local_inplace_gemm, blas_opt_inplace = in2out(local_inplace_gemm,
local_inplace_gemv, local_inplace_gemv,
local_inplace_ger, local_inplace_ger,
name="blas_opt_inplace") name="blas_opt_inplace")
optdb.register('InplaceBlasOpt', optdb.register('InplaceBlasOpt',
blas_opt_inplace, blas_opt_inplace,
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace', 'blas_opt_inplace')
class Dot22Scalar(GemmRelated): class Dot22Scalar(GemmRelated):
......
...@@ -7,11 +7,12 @@ Tensor optimizations addressing the ops in basic.py ...@@ -7,11 +7,12 @@ Tensor optimizations addressing the ops in basic.py
import logging import logging
_logger = logging.getLogger('theano.tensor.opt') _logger = logging.getLogger('theano.tensor.opt')
import operator
import itertools import itertools
from itertools import izip
import operator
import sys import sys
import time
import traceback import traceback
from itertools import izip
import numpy import numpy
import numpy as N # guys... please don't do this in the library :( import numpy as N # guys... please don't do this in the library :(
...@@ -25,7 +26,8 @@ from theano.gof.utils import MethodNotDefined ...@@ -25,7 +26,8 @@ from theano.gof.utils import MethodNotDefined
from theano.configparser import config from theano.configparser import config
from theano.tensor.elemwise import Elemwise, DimShuffle from theano.tensor.elemwise import Elemwise, DimShuffle
from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice, from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
Subtensor, IncSubtensor, AdvancedIncSubtensor1) Subtensor, IncSubtensor, make_constant,
AdvancedIncSubtensor1)
from theano import scalar from theano import scalar
from theano.tensor import basic as T from theano.tensor import basic as T
from theano import compile # to register the optimizer built by this file from theano import compile # to register the optimizer built by this file
...@@ -35,7 +37,7 @@ from theano.gof.python25 import any, all ...@@ -35,7 +37,7 @@ from theano.gof.python25 import any, all
from theano.gof.opt import (Optimizer, pre_constant_merge, from theano.gof.opt import (Optimizer, pre_constant_merge,
pre_greedy_local_optimizer) pre_greedy_local_optimizer)
from theano.gof.opt import merge_optimizer from theano.gof.opt import merge_optimizer
from theano.gof import toolbox, DestroyHandler from theano.gof import toolbox
from theano.tensor.basic import get_scalar_constant_value, ShapeError, NotScalarConstantError from theano.tensor.basic import get_scalar_constant_value, ShapeError, NotScalarConstantError
from theano.compat.six import StringIO from theano.compat.six import StringIO
...@@ -1955,6 +1957,7 @@ def local_subtensor_merge(node): ...@@ -1955,6 +1957,7 @@ def local_subtensor_merge(node):
else: else:
merged_slices += slices1[pos_1:] merged_slices += slices1[pos_1:]
merged_slices = make_constant(merged_slices)
subtens = Subtensor(merged_slices) subtens = Subtensor(merged_slices)
sl_ins = Subtensor.collapse( sl_ins = Subtensor.collapse(
merged_slices, merged_slices,
...@@ -4072,20 +4075,22 @@ local_one_plus_erf = gof.PatternSub((T.add, ...@@ -4072,20 +4075,22 @@ local_one_plus_erf = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.erf, 'x')), (T.erf, 'x')),
(T.erfc, (T.neg, 'x')), (T.erfc, (T.neg, 'x')),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_plus_erf, name='local_one_plus_erf') name='local_one_plus_erf')
register_stabilize(local_one_plus_erf, name='local_one_plus_erf') register_canonicalize(local_one_plus_erf)
register_specialize(local_one_plus_erf, name='local_one_plus_erf') register_stabilize(local_one_plus_erf)
register_specialize(local_one_plus_erf)
#1-erf(x)=>erfc(x) #1-erf(x)=>erfc(x)
local_one_minus_erf = gof.PatternSub((T.sub, local_one_minus_erf = gof.PatternSub((T.sub,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.erf, 'x')), (T.erf, 'x')),
(T.erfc, 'x'), (T.erfc, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_minus_erf, name='local_one_minus_erf') name='local_one_minus_erf',)
register_stabilize(local_one_minus_erf, name='local_one_minus_erf') register_canonicalize(local_one_minus_erf)
register_specialize(local_one_minus_erf, name='local_one_minus_erf') register_stabilize(local_one_minus_erf)
register_specialize(local_one_minus_erf)
local_one_minus_erf2 = gof.PatternSub((T.add, local_one_minus_erf2 = gof.PatternSub((T.add,
1, 1,
...@@ -4103,10 +4108,11 @@ local_one_plus_neg_erf = gof.PatternSub((T.add, ...@@ -4103,10 +4108,11 @@ local_one_plus_neg_erf = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.neg, (T.erf, 'x'))), (T.neg, (T.erf, 'x'))),
(T.erfc, 'x'), (T.erfc, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_plus_neg_erf, name='local_one_plus_neg_erf') name='local_one_plus_neg_erf')
register_stabilize(local_one_plus_neg_erf, name='local_one_plus_neg_erf') register_canonicalize(local_one_plus_neg_erf)
register_specialize(local_one_plus_neg_erf, name='local_one_plus_neg_erf') register_stabilize(local_one_plus_neg_erf)
register_specialize(local_one_plus_neg_erf)
#(-1)+erf(x) => -erfc(x) don't need erf(x)+(-1) as the canonicalize #(-1)+erf(x) => -erfc(x) don't need erf(x)+(-1) as the canonicalize
#will put the -1 as the first argument. #will put the -1 as the first argument.
...@@ -4114,20 +4120,22 @@ local_erf_minus_one = gof.PatternSub((T.add, ...@@ -4114,20 +4120,22 @@ local_erf_minus_one = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_minus1), dict(pattern='y', constraint=_is_minus1),
(T.erf, 'x')), (T.erf, 'x')),
(T.neg, (T.erfc, 'x')), (T.neg, (T.erfc, 'x')),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_erf_minus_one, name='local_erf_minus_one') name='local_erf_minus_one')
register_stabilize(local_erf_minus_one, name='local_erf_minus_one') register_canonicalize(local_erf_minus_one)
register_specialize(local_erf_minus_one, name='local_erf_minus_one') register_stabilize(local_erf_minus_one)
register_specialize(local_erf_minus_one)
#1-erfc(x) => erf(x) #1-erfc(x) => erf(x)
local_one_minus_erfc = gof.PatternSub((T.sub, local_one_minus_erfc = gof.PatternSub((T.sub,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.erfc, 'x')), (T.erfc, 'x')),
(T.erf, 'x'), (T.erf, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_minus_erfc, name='local_one_minus_erfc') name='local_one_minus_erfc')
register_stabilize(local_one_minus_erfc, name='local_one_minus_erfc') register_canonicalize(local_one_minus_erfc)
register_specialize(local_one_minus_erfc, name='local_one_minus_erfc') register_stabilize(local_one_minus_erfc)
register_specialize(local_one_minus_erfc)
local_one_minus_erfc2 = gof.PatternSub((T.add, local_one_minus_erfc2 = gof.PatternSub((T.add,
1, 1,
...@@ -4155,20 +4163,22 @@ local_one_add_neg_erfc = gof.PatternSub((T.add, ...@@ -4155,20 +4163,22 @@ local_one_add_neg_erfc = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.neg, (T.erfc, 'x'))), (T.neg, (T.erfc, 'x'))),
(T.erf, 'x'), (T.erf, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_add_neg_erfc, name='local_one_add_neg_erfc') name='local_one_add_neg_erfc')
register_stabilize(local_one_add_neg_erfc, name='local_one_add_neg_erfc') register_canonicalize(local_one_add_neg_erfc)
register_specialize(local_one_add_neg_erfc, name='local_one_add_neg_erfc') register_stabilize(local_one_add_neg_erfc)
register_specialize(local_one_add_neg_erfc)
#(-1)+erfc(-x)=>erf(x) #(-1)+erfc(-x)=>erf(x)
local_erf_neg_minus_one = gof.PatternSub((T.add, local_erf_neg_minus_one = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_minus1), dict(pattern='y', constraint=_is_minus1),
(T.erfc, (T.neg, 'x'))), (T.erfc, (T.neg, 'x'))),
(T.erf, 'x'), (T.erf, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_erf_neg_minus_one, name='local_erf_neg_minus_one') name='local_erf_neg_minus_one')
register_stabilize(local_erf_neg_minus_one, name='local_erf_neg_minus_one') register_canonicalize(local_erf_neg_minus_one)
register_specialize(local_erf_neg_minus_one, name='local_erf_neg_minus_one') register_stabilize(local_erf_neg_minus_one)
register_specialize(local_erf_neg_minus_one)
#(-1)+erfc(-1*x)=>erf(x) #(-1)+erfc(-1*x)=>erf(x)
local_erf_neg_minus_one2 = gof.PatternSub((T.add, local_erf_neg_minus_one2 = gof.PatternSub((T.add,
...@@ -4775,12 +4785,21 @@ class FusionOptimizer(Optimizer): ...@@ -4775,12 +4785,21 @@ class FusionOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
did_something = True did_something = True
nb_iter = 0
nb_replacement = 0
nb_inconsistency_replace = 0
time_toposort = 0
if fgraph.profile:
validate_before = fgraph.profile.validate_time
callbacks_before = fgraph.execute_callbacks_times.copy()
callback_before = fgraph.execute_callbacks_time
while did_something: while did_something:
t0 = time.time()
nodelist = list(fgraph.toposort()) nodelist = list(fgraph.toposort())
time_toposort += time.time() - t0
nodelist.reverse() nodelist.reverse()
did_something = False did_something = False
for node in nodelist: for node in nodelist:
...@@ -4794,18 +4813,66 @@ class FusionOptimizer(Optimizer): ...@@ -4794,18 +4813,66 @@ class FusionOptimizer(Optimizer):
zip(node.outputs, new_outputs), zip(node.outputs, new_outputs),
reason=self.__class__.__name__) reason=self.__class__.__name__)
did_something = True did_something = True
nb_replacement += 1
except InconsistencyError: except InconsistencyError:
nb_inconsistency_replace += 1
pass pass
nb_iter += 1
if fgraph.profile:
validate_time = fgraph.profile.validate_time - validate_before
callback_time = fgraph.execute_callbacks_time - callback_before
callbacks_time = {}
for k, v in fgraph.execute_callbacks_times.iteritems():
if k in callbacks_before:
callbacks_time[k] = v - callbacks_before[k]
else:
callbacks_time[k] = v
else:
validate_time = None
callback_time = None
callbacks_time = {}
return (self, nb_iter, nb_replacement,
nb_inconsistency_replace,
validate_time, callback_time, callbacks_time,
time_toposort)
@staticmethod
def print_profile(stream, prof, level=0):
blanc = (' ' * level)
print >> stream, blanc, "FusionOptimizer"
print >> stream, blanc, " nb_iter", prof[1]
print >> stream, blanc, " nb_replacement", prof[2]
print >> stream, blanc, " nb_inconsistency_replace", prof[3]
print >> stream, blanc, " validate_time", prof[4]
print >> stream, blanc, " callback_time", prof[5]
print >> stream, blanc, " callbacks_time"
for i in sorted(prof[6].iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
print >> stream, blanc, " time_toposort", prof[7]
if config.tensor.local_elemwise_fusion: if config.tensor.local_elemwise_fusion:
_logger.debug("enabling optimization fusion elemwise in fast_run") _logger.debug("enabling optimization fusion elemwise in fast_run")
#Must be after gpu(48.5) and before AddDestroyHandler(49.5)
compile.optdb.register('elemwise_fusion', compile.optdb.register('elemwise_fusion',
FusionOptimizer(local_elemwise_fusion), 71.00, FusionOptimizer(local_elemwise_fusion), 49,
'fast_run', 'fusion', 'local_elemwise_fusion', 'fast_run', 'fusion', 'local_elemwise_fusion',
'FusionOptimizer') 'FusionOptimizer')
else: else:
_logger.debug("not enabling optimization fusion elemwise in fast_run") _logger.debug("not enabling optimization fusion elemwise in fast_run")
compile.optdb.register('elemwise_fusion', compile.optdb.register('elemwise_fusion',
FusionOptimizer(local_elemwise_fusion), 71.00, FusionOptimizer(local_elemwise_fusion), 49,
'fusion', 'local_elemwise_fusion', 'fusion', 'local_elemwise_fusion',
'FusionOptimizer') 'FusionOptimizer')
# ############################
# # Remove consider_constant #
# ############################
# Although the op just returns its input, it should be removed from
# the graph to make sure all possible optimizations can be applied.
register_canonicalize(gof.OpRemove(theano.gradient.consider_constant_),
'fast_compile', name='remove_consider_constant')
...@@ -47,6 +47,23 @@ class AdvancedIndexingError(TypeError): ...@@ -47,6 +47,23 @@ class AdvancedIndexingError(TypeError):
# Helpful functions to deal with Subtensor and IncSubtensor # Helpful functions to deal with Subtensor and IncSubtensor
########## ##########
def make_constant(args):
"""
Convert python litterals to theano constants in subtensor arguments.
"""
def conv(a):
if a is None:
return a
elif isinstance(a, slice):
return slice(conv(a.start),
conv(a.stop),
conv(a.step))
elif isinstance(a, (int, long, numpy.integer)):
return scal.ScalarConstant(scal.int64, a)
else:
return a
return tuple(map(conv, args))
def get_idx_list(inputs, idx_list): def get_idx_list(inputs, idx_list):
''' '''
Given a list of inputs to the subtensor and its idx_list reorders Given a list of inputs to the subtensor and its idx_list reorders
......
import numpy as np import numpy as np
import numpy import numpy
import unittest
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
......
...@@ -164,7 +164,8 @@ class TensorType(Type): ...@@ -164,7 +164,8 @@ class TensorType(Type):
" Theano C code does not support that.", " Theano C code does not support that.",
msg, msg,
"object shape", data.shape, "object shape", data.shape,
"object strides", data.strides) "object strides", data.strides,
"object dtype", data.dtype)
i = 0 i = 0
for b in self.broadcastable: for b in self.broadcastable:
......
...@@ -4,8 +4,7 @@ import numpy ...@@ -4,8 +4,7 @@ import numpy
import theano import theano
from theano.compat import all, PY3 from theano.compat import all, PY3
from theano.scalar import (ComplexError, IntegerDivisionError, from theano.scalar import ComplexError, IntegerDivisionError
ScalarConstant, int64)
from theano.gof import Constant, Variable from theano.gof import Constant, Variable
from theano.gof.utils import hashtype from theano.gof.utils import hashtype
from theano.tensor.utils import hash_from_ndarray from theano.tensor.utils import hash_from_ndarray
...@@ -350,18 +349,7 @@ class _tensor_py_operators: ...@@ -350,18 +349,7 @@ class _tensor_py_operators:
if not isinstance(args, tuple): if not isinstance(args, tuple):
args = args, args = args,
# Convert python literals to theano constants # Convert python literals to theano constants
def conv(a): args = theano.tensor.subtensor.make_constant(args)
if a is None:
return a
elif isinstance(a, slice):
return slice(conv(a.start),
conv(a.stop),
conv(a.step))
elif isinstance(a, (int, long, numpy.integer)):
return ScalarConstant(int64, a)
else:
return a
args = tuple(map(conv, args))
# Determine if advanced indexing is needed or not # Determine if advanced indexing is needed or not
# The logic is already in Subtensor.convert: if it succeeds, # The logic is already in Subtensor.convert: if it succeeds,
# standard indexing is used; if it fails with # standard indexing is used; if it fails with
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import unittest import unittest
import theano import theano
from theano import gof from theano import gof
from theano.tests import unittest_tools as utt
from theano import gradient from theano import gradient
from theano.tensor.nnet.Conv3D import conv3D from theano.tensor.nnet.Conv3D import conv3D
...@@ -601,5 +602,45 @@ def test_subgrad(): ...@@ -601,5 +602,45 @@ def test_subgrad():
print(true_grad, pgrad) print(true_grad, pgrad)
assert(np.sum(np.abs(true_grad - pgrad)) < 0.00001) assert(np.sum(np.abs(true_grad - pgrad)) < 0.00001)
class TestConsiderConstant(unittest.TestCase):
def setUp(self):
utt.seed_rng()
self.rng = np.random.RandomState(seed=utt.fetch_seed())
def test_op_removed(self):
x = theano.tensor.matrix('x')
y = x * gradient.consider_constant(x)
f = theano.function([x], y)
# need to refer to theano.gradient.consider_constant_ here,
# theano.gradient.consider_constant is a wrapper function!
assert gradient.consider_constant_ not in \
[node.op for node in f.maker.fgraph.toposort()]
def test_grad(self):
T = theano.tensor
a = np.asarray(self.rng.randn(5, 5),
dtype=config.floatX)
x = T.matrix('x')
expressions_gradients = [
(x * gradient.consider_constant(x), x),
(x * gradient.consider_constant(T.exp(x)), T.exp(x)),
(gradient.consider_constant(x), T.constant(0.)),
(x**2 * gradient.consider_constant(x), 2 * x**2),
]
for expr, expr_grad in expressions_gradients:
g = gradient.grad(expr.sum(), x)
# gradient according to theano
f = theano.function([x], g, on_unused_input='ignore')
# desired gradient
f2 = theano.function([x], expr_grad, on_unused_input='ignore')
assert np.allclose(f(a), f2(a))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论