提交 8deed652 authored 作者: abergeron's avatar abergeron

Merge pull request #2314 from nouiz/mixed

Fix test, compile less, doc, add tests.
......@@ -184,6 +184,20 @@ There are less methods to define for an Op than for a Type:
Overrides :meth:`c_code_cache_version` if defined, but
otherwise has the same contract.
.. method:: python_constant_folding(node)
Optional. If present, this method will be called before doing
constant folding of a node, with that node as a parameter. If
it returns True, we will not generate C code when doing constant
folding of this node. This is useful when the compilation of
the C code would take longer than the computation in Python
(e.g. Elemwise of scalars).
In addition, this allows lowering the number of compiled modules
and the amount of disk access. This is particularly useful when the
file system load is high or when the Theano compilation directory is
shared by many processes (like on a network file server on a cluster).
The ``name`` argument is currently given an invalid value, so steer
away from it. As was the case with Type, ``sub['fail']`` provides
failure code that you *must* use if you want to raise an exception,
......
......@@ -187,3 +187,5 @@ Reference
Replacements specified with
givens are different from optimizations in that Var2 is not expected to be
equivalent to Var1.
.. autofunction:: theano.compile.function.function_dump
......@@ -36,6 +36,11 @@ There are also some top-level imports that you might find more convenient:
Alias for :func:`function.function`
.. function:: function_dump(...)
Alias for :func:`theano.compile.function.function_dump`
.. function:: shared(...)
Alias for :func:`shared.shared`
......
......@@ -474,6 +474,7 @@ TensorVariable
* (2, 0, 1) -> AxBxC to CxAxB
* (0, 'x', 1) -> AxB to Ax1xB
* (1, 'x', 0) -> AxB to Bx1xA
* (1,) -> This removes dimension 0. It must be a broadcastable dimension (1xA to A)
.. method:: flatten(ndim=1)
......
......@@ -25,7 +25,7 @@
.. note::
As of October 21st, 2014, the default GPU image convolution
changed: By default, if :ref:`cuDNN <_libdoc_cuda_dnn>`
changed: By default, if :ref:`cuDNN <libdoc_cuda_dnn>`
is available, we will use it, otherwise we will fall back to using the
gemm version (slower than cuDNN in most cases, uses more memory, but
faster than the legacy version we used before).
......
......@@ -129,6 +129,22 @@ def test_careduce():
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# reduce over 2d
((4100,4,3,2),[1,2]),((4,4100,3,2),[1,2]),((4,3,4100,2),[1,2]),((4,3,2,4100),[1,2]),#0110
# ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001 need 101
# ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
((4100,4,3,2),[0,1]),((4,4100,3,2),[0,1]),((4,3,4100,2),[0,1]),((4,3,2,4100),[0,1]),#1100
# reduce over 3d
# 3d not tested: 1101, 1110, 1111
# ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101 need 101
((4100,4,3,2),[0,1,2]),((4,4100,3,2),[0,1,2]),((4,3,4100,2),[0,1,2]),((4,3,2,4100),[0,1,2]),#1110
# reduce over 4d
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1111
# reduce over 5d
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
......
......@@ -907,24 +907,37 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
if direction == 'fprop':
i = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_img, npy_kern[:,:,::-1,::-1])
elif direction == 'bprop img':
i = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in
npy_kern.transpose(1, 0, 2, 3).shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_img.shape])()
cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
border_mode='valid', subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
elif direction == 'bprop kern':
i = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in
npy_img.transpose(1, 0, 2, 3).shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in
npy_kern.transpose(1, 0, 2, 3).shape])()
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
border_mode='valid', subsample=subsample)(i, k)
......
......@@ -1193,6 +1193,13 @@ class Elemwise(OpenMPOp):
else:
return ()
def python_constant_folding(self, node):
    """Tell constant folding whether to skip C-code compilation.

    Returns True when the node's first output is 0-dimensional (a
    scalar), meaning constant folding should evaluate it in Python
    instead of compiling C code for it.
    """
    first_output = node.outputs[0]
    return first_output.ndim == 0
# def elemwise_to_scal(fgraph):
# TODO: why is this commented out? should it be removed?
# it has needed maintenance despite being commented
......
......@@ -1605,7 +1605,7 @@ compile.optdb['specialize'].register('local_remove_all_assert',
local_remove_all_assert,
use_db_name_as_tag=False)
@register_specialize
@register_specialize("local_alloc_elemwise")
@gof.local_optimizer([T.Elemwise])
def local_elemwise_alloc(node):
"""
......@@ -4508,9 +4508,21 @@ def constant_folding(node):
for o in node.outputs:
storage_map[o] = [None]
compute_map[o] = [False]
if (hasattr(node.op, 'python_constant_folding') and
node.op.python_constant_folding(node)):
thunk = node.op.make_thunk(node, storage_map, compute_map,
no_recycling=[])
old_value = getattr(node.op, '_op_use_c_code', False)
try:
node.op._op_use_c_code = False
thunk = node.op.make_thunk(node,
storage_map,
compute_map,
[])
finally:
node.op._op_use_c_code = old_value
else:
thunk = node.op.make_thunk(node, storage_map, compute_map,
no_recycling=[])
required = thunk()
assert not required # a node whose inputs are all provided should always
......
......@@ -3671,6 +3671,17 @@ def test_constant_folding():
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
# Test that we do not crash when constant folding elemwise scalar
# as they should not generate c code.
x = tensor.constant(3)
assert x.ndim == 0
mode = theano.compile.get_mode("FAST_COMPILE").excluding("fusion")
f = theano.function([], [x * 2, x + x], mode=mode)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert all([isinstance(n.op, DeepCopyOp) for n in topo])
def test_constant_get_stabilized():
"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论