提交 f92215df authored 作者: Nicholas Leonard's avatar Nicholas Leonard

Merge branch 'master' of https://github.com/Theano/Theano

.. _install:
......@@ -130,20 +129,11 @@ by typing
You may need to add ``sudo`` before this command to install into your
system's ``site-packages`` directory. If you do not have administrator access
to your machine, you can install to an alternate prefix using
to your machine, you can install Theano locally (to ~/.local) using
.. code-block:: bash
pip install Theano --install-option='--prefix=~/.local'
e.g. using ``--install-option='--prefix=~/.local'`` on Python 2.4 would
install Theano into ``.local/lib/python2.4/site-packages`` inside your home
directory on Mac OS X or Unix/Linux (this ``site-packages`` directory must be
listed in your ``PYTHONPATH`` environment variable; for Python 2.6 and later,
``~/.local`` is
automatically searched and does *not* need to be explicitly included in
``PYTHONPATH``, see :ref:`config_pythonpath` for instructions).
You can change ``~/.local``, but you need to change your ``PYTHONPATH`` as said above.
pip install Theano --user
Alternatively you can use virtualenv_ to create an isolated ``site-packages``
directory; see the `virtualenv documentation`_ for details.
......@@ -225,7 +215,7 @@ or (if you want to install it for the current user only):
.. code-block:: bash
pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --install-option='--prefix=~/.local'
pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --user
The following are general instructions that will set you up with the
bleeding-edge version of Theano and allow you to hack it. First,
......
......@@ -18,7 +18,7 @@ those operations will run in parallel in Theano.
The most frequent way to control the number of threads used is via the
``OMP_NUM_THREADS`` environment variable. Set it to the number of
threads you want to use before starting the python process. Some BLAS
implementation support other enviroment variable.
implementations support other environment variables.
Parallel element wise ops with OpenMP
......@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP
Because element wise ops work on every tensor entry independently they
can be easily parallelized using OpenMP.
To use OpenMP you must set the ``openmp`` flag to ``True`` in Theano
configuration.
To use OpenMP you must set the ``openmp`` :ref:`flag <libdoc_config>`
to ``True``.
You can use the flag ``openmp_elemwise_minsize`` to set the minimum
tensor size for which the operation is parallelized because for short
......
import theano
from theano import gof
from theano import gradient as G
from theano.compile.function_module import orig_function
from theano.compile import SharedVariable, rebuild_collect_shared
from theano.gof import ops_with_inner_function
......@@ -142,7 +142,7 @@ class OpFromGraph(gof.Op):
if hasattr(self, "grad_ops"):
grad_ops = self.grad_ops
else:
gs = G.grad(cost=None,
gs = theano.gradient.grad(cost=None,
known_grads=dict(zip(self.new_outputs, output_grads)),
wrt=self.new_inputs,
disconnected_inputs='ignore')
......
......@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer,
LocalOptimizer, local_optimizer, LocalOptGroup,
OpSub, OpRemove, PatternSub,
NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer,
InplaceOptimizer, PureThenInplaceOptimizer,
OpKeyOptimizer)
from theano.gof.optdb import \
......
......@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
my_pid = os.getpid()
no_display = (verbosity == 0)
# Acquire lock.
nb_error = 0
# Number of times we have slept waiting for the lock when there was no error.
# The wait message is not displayed on the first wait, so that it is shown
# less frequently and we do not get as much e-mail about it.
nb_wait = 0
# Acquire lock.
while True:
try:
last_owner = 'no_owner'
......@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
last_owner = read_owner
time_start = time.time()
no_display = (verbosity == 0)
if not no_display:
if not no_display and nb_wait > 0:
if read_owner == 'failure':
msg = 'unknown process'
else:
......@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
tmp_dir)
if verbosity <= 1:
no_display = True
nb_wait += 1
time.sleep(random.uniform(min_wait, max_wait))
try:
......
......@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer):
def __call__(self, *args, **kwargs):
return self.fn(*args, **kwargs)
def __str__(self):
    # The printable form of the optimizer is its ``__name__`` attribute.
    return self.__name__
def optimizer(f):
"""decorator for FromFunctionOptimizer"""
......@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer):
print >> stream, blanc, " replace_time", replace_time
print >> stream, blanc, " validate_time", validate_time
print >> stream, blanc, " callback_time", callback_time
print >> stream, blanc, " callback_times", callbacks_time
print >> stream, blanc, " callbacks_time"
for i in sorted(callbacks_time.iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
print >> stream, blanc, " nb_merged", nb_merged
print >> stream, blanc, " nb_constant", nb_constant
......@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self,
optimizers,
failure_callback=None,
max_depth=None,
max_use_ratio=None):
"""
:param optimizers: list or set of local or global optimizations to
......@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
:param max_use_ratio: each optimizer can be applied at most
(size of graph * this number) times
:param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
"""
super(EquilibriumOptimizer, self).__init__(
......@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.local_optimizers_map.setdefault(c, []).append(opt)
else:
self.global_optimizers.append(opt)
self.max_depth = max_depth
self.max_use_ratio = max_use_ratio
assert self.max_use_ratio is not None, (
'max_use_ratio has to be a number')
......@@ -1723,11 +1725,13 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, opt) in count_opt[::-1]:
print >> stream, blanc, ' %.3fs - %d - %s' % (
t, count, opt)
print >> stream, blanc, ' %.3fs - in %d optimization that where not used' % (
print >> stream, blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used))
not_used.sort()
for (t, opt) in not_used[::-1]:
print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt)
if t > 0:
# Skip optimizations with a time of 0; they probably weren't even tried.
print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt)
print >> stream
@staticmethod
......@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out):
final_outs, optimized_nodes = local_recursive_function(
list_optimizations, out, {}, 0)
return final_outs[out_index]
############
### Misc ###
############
class InplaceOptimizer(Optimizer):
    """Optimizer that wraps a single ``inplace`` callable.

    ``apply`` calls the wrapped callable on the function graph, and
    ``add_requirements`` attaches a ``dh.DestroyHandler`` feature to it.
    """

    def __init__(self, inplace):
        # ``inplace`` is stored as-is and later called as ``inplace(fgraph)``.
        self.inplace = inplace

    def apply(self, fgraph):
        self.inplace(fgraph)

    def add_requirements(self, fgraph):
        # NOTE(review): ``dh`` is not defined in the visible part of this
        # file; presumably the destroy-handler module -- confirm.
        fgraph.attach_feature(dh.DestroyHandler())
class PureThenInplaceOptimizer(Optimizer):
    """Optimizer that runs a ``pure`` callable, then an ``inplace`` one.

    Both callables are invoked with the function graph as their only
    argument.
    """

    def __init__(self, pure, inplace):
        self.pure = pure
        self.inplace = inplace

    def apply(self, fgraph):
        # Order matters: the ``dh.DestroyHandler`` feature is attached only
        # after ``pure`` has run and before ``inplace`` runs.
        self.pure(fgraph)
        fgraph.attach_feature(dh.DestroyHandler())
        self.inplace(fgraph)
......@@ -194,7 +194,6 @@ class EquilibriumDB(DB):
def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts,
max_depth=5,
max_use_ratio=config.optdb.max_use_ratio,
failure_callback=opt.NavigatorOptimizer.warn_inplace)
......
......@@ -23,6 +23,7 @@ from theano.gof import Variable
from theano.gof.python25 import OrderedDict
from theano.gof.null_type import NullType
from theano.gof.op import get_debug_values
from theano.compile import ViewOp
# we can't do "import theano.tensor"
# tensor depends on theano.compile
......@@ -1788,3 +1789,29 @@ def _is_zero(x):
return 'no'
return 'yes'
class ConsiderConstant(ViewOp):
    """``ViewOp`` subclass that overrides ``grad`` to return zero gradients."""

    def grad(self, args, g_outs):
        # One gradient per incoming output gradient, each built with
        # ``zeros_like``; ``args`` (the op inputs) is not used.
        return [g_out.zeros_like(g_out) for g_out in g_outs]
consider_constant_ = ConsiderConstant()
# This is wrapped in a function only so that the documentation renders well.
def consider_constant(x):
    """Consider an expression constant when computing gradients.

    The expression itself is unaffected, but when its gradient is
    computed, or the gradient of another expression that this
    expression is a subexpression of, it will not be backpropagated
    through. In other words, the gradient of the expression is
    truncated to 0.

    :param x: A Theano expression whose gradient should be truncated.

    :return: The expression is returned unmodified, but its gradient
        is now truncated to 0.

    .. versionadded:: 0.6.1
    """
    # Delegates to the module-level ``ConsiderConstant`` instance.
    return consider_constant_(x)
......@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1;
else
break;
}""" % locals()
}
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2:
threads_y = ''
......@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1;
}
n_threads.z -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1);
%(makecall)s
......
......@@ -671,7 +671,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 20)
return (0, 21)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......
......@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
(version==3||version==4||version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
(kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
!work_complete) //conv_full_patch_stack_padded
{
//version 3 without split
......
......@@ -14,7 +14,7 @@ import theano.ifelse
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler)
Optimizer, toolbox)
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import (
device_properties, gpu_eye,
......@@ -62,7 +62,7 @@ optdb.register('gpu_opt',
# inside the elemwise. When there is no float64 op, this is working.
optdb.register('gpu_after_fusion',
ProxyDB(gpu_seqopt),
optdb.__position__.get('elemwise_fusion', 71) + .1,
optdb.__position__.get('elemwise_fusion', 49) + .1,
'gpu')
......@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph):
for input in fgraph.inputs:
......@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
#Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
71.00, 'fast_run', 'fusion',
49, 'fast_run', 'fusion',
'local_elemwise_fusion', 'gpu')
else:
_logger.debug(("not enabling optimization fusion of gpu elemwise in "
......
......@@ -109,11 +109,13 @@ def test_careduce():
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
......
......@@ -679,6 +679,7 @@ def test_full():
#Test more than maxThreadsDim0
, ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
]
# shapes=shapes[:277]
......
......@@ -61,7 +61,7 @@ class GpuGemv(BlasOp, Gemv):
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(A)s, %(x)s,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
%(out)s) == NULL) {
%(out)s, 0) == -1) {
%(fail)s
}
""" % vars
......@@ -72,7 +72,7 @@ class GpuGemv(BlasOp, Gemv):
return code
def c_code_cache_version(self):
return (0,)
return (1,)
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
......@@ -117,7 +117,7 @@ class GpuGemm(BlasOp, Gemm):
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(A)s, %(B)s,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
%(out)s) == NULL) {
%(out)s, 0) == -1) {
%(fail)s
}
""" % vars
......@@ -128,7 +128,7 @@ class GpuGemm(BlasOp, Gemm):
return code
def c_code_cache_version(self):
return (0,)
return (1,)
gpugemm_no_inplace = GpuGemm(inplace=False)
......@@ -176,7 +176,7 @@ class GpuDot22(BlasOp, Dot22):
one,
%(A)s, %(B)s,
zero,
%(out)s) == NULL) {
%(out)s, 0) == -1) {
%(fail)s
}
""" % vars
......@@ -187,7 +187,7 @@ class GpuDot22(BlasOp, Dot22):
return code
def c_code_cache_version(self):
return (0,)
return (1,)
def c_headers(self):
ret = super(GpuDot22, self).c_headers()
......
......@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1;
else
break;
}""" % locals()
}
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2:
threads_y = ''
......@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1;
}
n_threads.z -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1);
%(makecall)s
......
......@@ -5,7 +5,7 @@ from theano import tensor, scalar
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
......@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph):
for input in fgraph.inputs:
......
import unittest
from theano import scalar, gof
from theano.gof import FunctionGraph
from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce)
......@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
#test pattern implemented by reshape
......
......@@ -28,16 +28,8 @@ if cuda_available:
def matVecModM(A, s, m):
# return (A * s) % m
x = numpy.zeros_like(s)
for i in xrange(len(x)):
for j in xrange(len(s)):
r = numpy.int32((numpy.int64(A[i][j]) * s[j] + x[i]) % m)
if r >= 0:
x[i] = r
else:
x[i] = r + m
return x
assert A.dtype == 'int64'
return numpy.int32(numpy.sum((A*s) % m, 1) % m)
def multMatVect(v, A, m1, B, m2):
......@@ -63,24 +55,30 @@ MASK2 = numpy.int32(65535) #2^16 - 1
MULT2 = numpy.int32(21069)
NORM = 4.656612873077392578125e-10; #1./2^31
A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]])
A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]])
#A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]],
# dtype='int64')
#A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]],
# dtype='int64')
A1p72 = numpy.asarray([[1516919229, 758510237, 499121365],
[1884998244, 1516919229, 335398200],
[601897748, 1884998244, 358115744]])
[601897748, 1884998244, 358115744]],
dtype='int64')
A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935],
[1133297478, 1407477216, 1496414766],
[2002613992, 1639496704, 1407477216]])
[2002613992, 1639496704, 1407477216]],
dtype='int64')
A1p134 = numpy.asarray(
[[1702500920, 1849582496, 1656874625],
[828554832, 1702500920, 1512419905],
[1143731069, 828554832, 102237247]])
[1143731069, 828554832, 102237247]],
dtype='int64')
A2p134 = numpy.asarray(
[[796789021, 1464208080, 607337906],
[1241679051, 1431130166, 1464208080],
[1401213391, 1178684362, 1431130166]])
[1401213391, 1178684362, 1431130166]],
dtype='int64')
np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
......
......@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer):
def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph):
......
......@@ -58,7 +58,7 @@ def shared(*args, **kw):
from theano.tensor import nnet # used for softmax, sigmoid, etc.
from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
jacobian, hessian
jacobian, hessian, consider_constant
from theano.tensor.sort import sort, argsort
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
......
......@@ -139,7 +139,7 @@ except ImportError:
pass
from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, view_roots, DestroyHandler,
from theano.gof import (utils, Op, view_roots,
local_optimizer, Optimizer,
InconsistencyError, toolbox, SequenceDB,
EquilibriumOptimizer, Apply,
......@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer):
def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph):
did_something = True
......@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer):
time_factor_can = 0
time_factor_list = 0
time_toposort = 0
if fgraph.profile:
validate_before = fgraph.profile.validate_time
callbacks_before = fgraph.execute_callbacks_times.copy()
callback_before = fgraph.execute_callbacks_time
class Updater:
def on_import(self, fgraph, new_node, reason):
if new_node is not node:
nodelist.append(new_node)
u = Updater()
fgraph.attach_feature(u)
while did_something:
nb_iter += 1
t0 = time.time()
nodelist = list(fgraph.toposort())
nodelist = theano.gof.graph.io_toposort(fgraph.inputs, fgraph.outputs)
time_toposort += time.time() - t0
did_something = False
nodelist.reverse()
......@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer):
except ReplacementDidntRemovedError, e:
nb_replacement_didn_t_remove += 1
self.warned = True
nb_iter += 1
fgraph.remove_feature(u)
if fgraph.profile:
validate_time = fgraph.profile.validate_time - validate_before
callback_time = fgraph.execute_callbacks_time - callback_before
callbacks_time = {}
for k, v in fgraph.execute_callbacks_times.iteritems():
if k in callbacks_before:
callbacks_time[k] = v - callbacks_before[k]
else:
callbacks_time[k] = v
else:
validate_time = None
callback_time = None
callbacks_time = {}
return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove,
nb_inconsistency_make, nb_inconsistency_replace,
time_canonicalize, time_factor_can,
time_factor_list, time_toposort)
time_factor_list, time_toposort,
validate_time, callback_time, callbacks_time,)
@staticmethod
def print_profile(stream, prof, level=0):
blanc = (' ' * level)
#1946.912556s - ('gemm_optimizer', 'GemmOptimizer', 1)
print >> stream, blanc, "GemmOptimizer"
print >> stream, blanc, " nb_iter", prof[1]
print >> stream, blanc, " nb_replacement", prof[2]
......@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer):
print >> stream, blanc, " time_factor_can", prof[7]
print >> stream, blanc, " time_factor_list", prof[8]
print >> stream, blanc, " time_toposort", prof[9]
print >> stream, blanc, " validate_time", prof[10]
print >> stream, blanc, " callback_time", prof[11]
print >> stream, blanc, " callbacks_time"
for i in sorted(prof[12].iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
class Dot22(GemmRelated):
......@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv',
15, 'fast_run')
# After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the
# fusion of elemwise(step 71)
# After destroyhandler(49.5) but before we try to make elemwise things
# inplace (75)
blas_opt_inplace = in2out(local_inplace_gemm,
local_inplace_gemv,
local_inplace_ger,
name="blas_opt_inplace")
optdb.register('InplaceBlasOpt',
blas_opt_inplace,
70.0, 'fast_run', 'inplace')
blas_opt_inplace,
70.0, 'fast_run', 'inplace', 'blas_opt_inplace')
class Dot22Scalar(GemmRelated):
......
差异被折叠。
......@@ -47,6 +47,23 @@ class AdvancedIndexingError(TypeError):
# Helpful functions to deal with Subtensor and IncSubtensor
##########
def make_constant(args):
    """
    Convert Python literals to Theano constants in subtensor arguments.

    :param args: iterable of index entries. Each entry may be ``None``,
        a ``slice``, an integer (Python or numpy), or any other object.

    :return: a tuple with one element per entry of ``args``. Integers are
        replaced by ``scal.ScalarConstant(scal.int64, value)``; slices are
        rebuilt with their ``start``/``stop``/``step`` converted the same
        way; ``None`` and every other object are returned unchanged.
    """
    # ``long`` is only a builtin on Python 2. Naming it unconditionally in
    # the isinstance check raises NameError on interpreters without it, for
    # every entry that is neither None nor a slice.
    try:
        integer_types = (int, long, numpy.integer)
    except NameError:
        integer_types = (int, numpy.integer)

    def conv(a):
        if a is None:
            return a
        elif isinstance(a, slice):
            # Convert each bound recursively; missing bounds stay None.
            return slice(conv(a.start),
                         conv(a.stop),
                         conv(a.step))
        elif isinstance(a, integer_types):
            return scal.ScalarConstant(scal.int64, a)
        else:
            return a
    return tuple(map(conv, args))
def get_idx_list(inputs, idx_list):
'''
Given a list of inputs to the subtensor and its idx_list reorders
......
import numpy as np
import numpy
import unittest
import theano
from theano.tests import unittest_tools as utt
......
......@@ -164,7 +164,8 @@ class TensorType(Type):
" Theano C code does not support that.",
msg,
"object shape", data.shape,
"object strides", data.strides)
"object strides", data.strides,
"object dtype", data.dtype)
i = 0
for b in self.broadcastable:
......
......@@ -4,8 +4,7 @@ import numpy
import theano
from theano.compat import all, PY3
from theano.scalar import (ComplexError, IntegerDivisionError,
ScalarConstant, int64)
from theano.scalar import ComplexError, IntegerDivisionError
from theano.gof import Constant, Variable
from theano.gof.utils import hashtype
from theano.tensor.utils import hash_from_ndarray
......@@ -350,18 +349,7 @@ class _tensor_py_operators:
if not isinstance(args, tuple):
args = args,
# Convert python literals to theano constants
def conv(a):
if a is None:
return a
elif isinstance(a, slice):
return slice(conv(a.start),
conv(a.stop),
conv(a.step))
elif isinstance(a, (int, long, numpy.integer)):
return ScalarConstant(int64, a)
else:
return a
args = tuple(map(conv, args))
args = theano.tensor.subtensor.make_constant(args)
# Determine if advanced indexing is needed or not
# The logic is already in Subtensor.convert: if it succeeds,
# standard indexing is used; if it fails with
......
......@@ -5,6 +5,7 @@
import unittest
import theano
from theano import gof
from theano.tests import unittest_tools as utt
from theano import gradient
from theano.tensor.nnet.Conv3D import conv3D
......@@ -601,5 +602,45 @@ def test_subgrad():
print(true_grad, pgrad)
assert(np.sum(np.abs(true_grad - pgrad)) < 0.00001)
class TestConsiderConstant(unittest.TestCase):
    """Tests for ``gradient.consider_constant``."""

    def setUp(self):
        # Seed through the shared test utilities so runs are reproducible.
        utt.seed_rng()
        self.rng = np.random.RandomState(seed=utt.fetch_seed())

    def test_op_removed(self):
        # The op must not appear in the toposort of the compiled function.
        x = theano.tensor.matrix('x')
        y = x * gradient.consider_constant(x)
        f = theano.function([x], y)
        # need to refer to theano.gradient.consider_constant_ here,
        # theano.gradient.consider_constant is a wrapper function!
        assert gradient.consider_constant_ not in \
            [node.op for node in f.maker.fgraph.toposort()]

    def test_grad(self):
        T = theano.tensor
        # NOTE(review): ``config`` is not imported in the visible part of
        # this file -- confirm it is in scope.
        a = np.asarray(self.rng.randn(5, 5),
                       dtype=config.floatX)

        x = T.matrix('x')

        # Pairs of (expression, expected gradient with respect to x). The
        # argument of consider_constant acts as a constant factor.
        expressions_gradients = [
            (x * gradient.consider_constant(x), x),
            (x * gradient.consider_constant(T.exp(x)), T.exp(x)),
            (gradient.consider_constant(x), T.constant(0.)),
            (x**2 * gradient.consider_constant(x), 2 * x**2),
        ]

        for expr, expr_grad in expressions_gradients:
            g = gradient.grad(expr.sum(), x)
            # gradient according to theano
            f = theano.function([x], g, on_unused_input='ignore')
            # desired gradient
            f2 = theano.function([x], expr_grad, on_unused_input='ignore')

            assert np.allclose(f(a), f2(a))
if __name__ == '__main__':
unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论