Commit 275dd0d0 authored by Frédéric Bastien, committed via GitHub

Merge pull request #5434 from nouiz/bn4

mixed
......@@ -547,6 +547,7 @@ class Constant(Variable):
def __init__(self, type, data, name=None):
    """Create a Constant of the given `type` holding `data`.

    The raw `data` is validated/coerced through ``type.filter`` before
    being stored.
    """
    # A Constant is never the output of an Apply node, so owner and
    # index are both None.
    Variable.__init__(self, type, None, None, name)
    # filter() validates and converts `data` into the type's canonical form.
    self.data = type.filter(data)
    # Record a creation traceback on the variable for later error reporting.
    utils.add_tag_trace(self)
def equals(self, other):
# this does what __eq__ should do, but Variable and Apply should always be hashable by id
......
......@@ -346,7 +346,14 @@ class TestAutoName:
r1 = tensor.constant(1.5)
r2 = tensor.constant(1.5)
assert r1.auto_name == "auto_" + str(autoname_id)
assert r2.auto_name == "auto_" + str(autoname_id + 1)
# We reuse the same variable
assert r2.auto_name == "auto_" + str(autoname_id)
assert r1 is r2
r3 = tensor.constant(1.6)
# The cache still creates a new object that we do not return.
# This is why we must increase by 2 and not 1.
assert r3.auto_name == "auto_" + str(autoname_id + 2)
def test_tensorvariable(self):
# Get counter value
......
......@@ -192,7 +192,7 @@ class InputToGpuOptimizer(Optimizer):
# This happen frequently as we do 2 pass of the gpu optimizations
if (len(input.clients) == 1 and
(input.clients[0][0] == 'output' or
input.clients[0][0].op == gpu_from_host)):
isinstance(input.clients[0][0].op, GpuFromHost))):
continue
try:
......@@ -215,7 +215,7 @@ gpu_seqopt.register('InputToGpuOptimizer', InputToGpuOptimizer(),
'merge') # TODO: how to make it mandatory for gpu_seqopt?
@local_optimizer([gpu_from_host, host_from_gpu])
@local_optimizer([GpuFromHost, HostFromGpu])
def local_cut_gpu_host_gpu(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
return [node.inputs[0].owner.inputs[0]]
......@@ -336,7 +336,7 @@ def local_gpu_elemwise_0(node):
@register_opt()
@local_optimizer([gpu_from_host])
@local_optimizer([GpuFromHost])
def local_gpu_elemwise_1(node):
"""
gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
......@@ -346,6 +346,7 @@ def local_gpu_elemwise_1(node):
host_i, = node.inputs
if (host_i.owner and
isinstance(host_i.owner.op, tensor.Elemwise) and
len(host_i.owner.outputs) == 1 and
len(host_i.clients) == 1 and
dtype_in_elemwise_supported(node.op)):
......@@ -392,7 +393,7 @@ def local_gpu_split(node):
@register_opt()
@local_optimizer([tensor.DimShuffle, gpu_from_host])
@local_optimizer([tensor.DimShuffle, GpuFromHost])
def local_gpu_dimshuffle_0(node):
"""
dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle)
......@@ -421,7 +422,7 @@ def local_gpu_dimshuffle_0(node):
@register_opt()
@local_optimizer([tensor.SpecifyShape, gpu_from_host])
@local_optimizer([tensor.SpecifyShape, GpuFromHost])
def local_gpu_specifyShape_0(node):
"""
specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape)
......@@ -445,7 +446,7 @@ def local_gpu_specifyShape_0(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.basic.Dot])
@local_optimizer([GpuFromHost, tensor.basic.Dot])
def local_gpu_dot_to_dot22(node):
"""
gpu_from_host(dot) -> gpudot(gpu_from_host)
......@@ -537,7 +538,7 @@ optdb.register('gpu_assert_no_cpu_op', assert_no_cpu_op, 49.2,
@register_opt()
@local_optimizer([theano.ifelse.IfElse, gpu_from_host])
@local_optimizer([theano.ifelse.IfElse, GpuFromHost])
def local_gpu_lazy_ifelse(node):
"""
gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host)
......@@ -606,7 +607,7 @@ def local_gpu_lazy_ifelse(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas.Dot22])
@local_optimizer([GpuFromHost, tensor.blas.Dot22])
def local_gpu_dot22(node):
"""
gpu_from_host(dot22) -> gpudot(gpu_from_host)
......@@ -631,7 +632,7 @@ def local_gpu_dot22(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas.BatchedDot])
@local_optimizer([GpuFromHost, tensor.blas.BatchedDot])
def local_gpu_batched_dot(node):
"""
gpu_from_host(batched_dot) -> gpu_batched_dot(gpu_from_host)
......@@ -670,7 +671,7 @@ def local_gpu_batched_dot(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
@local_optimizer([GpuFromHost, tensor.blas.Dot22Scalar])
def local_gpu_dot22scalar(node):
"""
gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
......@@ -699,7 +700,7 @@ def local_gpu_dot22scalar(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
@local_optimizer([GpuFromHost, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node):
"""
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
......@@ -737,7 +738,7 @@ def local_gpu_gemv(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGer, tensor.blas.Ger,
@local_optimizer([GpuFromHost, tensor.blas_c.CGer, tensor.blas.Ger,
tensor.blas_scipy.ScipyGer])
def local_gpu_ger(node):
"""
......@@ -777,7 +778,7 @@ def local_gpu_ger(node):
@register_opt()
@local_optimizer([tensor.blas.Gemm, gpu_from_host])
@local_optimizer([tensor.blas.Gemm, GpuFromHost])
def local_gpu_gemm(node):
"""
gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
......@@ -966,7 +967,7 @@ def local_gpu_elemwise_careduce(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.Reshape])
@local_optimizer([GpuFromHost, tensor.Reshape])
def local_gpu_reshape(node):
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -999,7 +1000,7 @@ def local_gpu_reshape(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.Flatten])
@local_optimizer([GpuFromHost, tensor.Flatten])
def local_gpu_flatten(node):
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -1019,7 +1020,7 @@ def local_gpu_flatten(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.Subtensor])
@local_optimizer([GpuFromHost, tensor.Subtensor])
def local_gpu_subtensor(node):
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -1062,7 +1063,7 @@ def local_gpu_subtensor(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.AdvancedSubtensor1])
@local_optimizer([GpuFromHost, tensor.AdvancedSubtensor1])
def local_gpu_advanced_subtensor1(node):
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -1083,7 +1084,7 @@ def local_gpu_advanced_subtensor1(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.AdvancedIncSubtensor1])
@local_optimizer([GpuFromHost, tensor.AdvancedIncSubtensor1])
def local_gpu_advanced_incsubtensor1(node):
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -1153,7 +1154,7 @@ def local_gpu_advanced_incsubtensor1(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.IncSubtensor])
@local_optimizer([GpuFromHost, tensor.IncSubtensor])
def local_gpu_incsubtensor(node):
if isinstance(node.op, GpuFromHost):
host_output = node.inputs[0]
......@@ -1463,7 +1464,7 @@ def values_eq_approx_high_tol(a, b):
return CudaNdarrayType.values_eq_approx(a, b, atol=atol)
@local_optimizer([gpu_from_host, conv.ConvOp])
@local_optimizer([GpuFromHost, conv.ConvOp])
def local_gpu_conv(node):
"""
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
......@@ -2309,7 +2310,7 @@ def local_gpu_contiguous(node):
@register_opt()
@local_optimizer([gpu_from_host, tensor.Eye])
@local_optimizer([GpuFromHost, tensor.Eye])
def local_gpu_eye(node):
"""
gpu_from_host(eye) -> gpueye(gpu_from_host)
......@@ -2438,7 +2439,7 @@ def typeConstructor(broadcastable, dtype):
@register_opt('scan')
@local_optimizer([gpu_from_host, scan_op.Scan])
@local_optimizer([GpuFromHost, scan_op.Scan])
def gpuScanOptimization(node):
"""
scan(host_from_gpu) -> host_from_gpu(GPUscan)
......@@ -2560,7 +2561,7 @@ def gpuScanOptimization(node):
@register_opt()
@local_optimizer([tensor.AllocEmpty, gpu_from_host])
@local_optimizer([tensor.AllocEmpty, GpuFromHost])
def local_gpu_allocempty(node):
if (isinstance(node.op, tensor.AllocEmpty) and
node.op.dtype == "float32"):
......@@ -2727,7 +2728,7 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
# Move to Gpu optimization
@local_optimizer([gpu_from_host,
@local_optimizer([GpuFromHost,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
......
......@@ -277,10 +277,10 @@ def convert(x, dtype=None):
return x_
def constant(x, name=None, dtype=None):
    """Return a ``ScalarConstant`` wrapping `x`.

    Parameters
    ----------
    x : scalar value
        Converted via ``convert`` before being wrapped.
    name : str, optional
        Name to attach to the resulting constant.
    dtype : str, optional
        Explicit dtype; when None, ``convert`` infers it from `x`.
    """
    # NOTE(review): the scraped diff left both the old and the new
    # definition interleaved here (duplicate `def` and an unreachable
    # `return`); this is the clean post-change version.
    x = convert(x, dtype=dtype)
    # This constructor is for scalars only; convert() must not have
    # produced an array.
    assert x.ndim == 0
    return ScalarConstant(get_scalar_type(str(x.dtype)), x, name=name)
class Scalar(Type):
......
......@@ -488,5 +488,14 @@ def test_grad_abs():
# in test_fusion, TestCompositeCodegen
def test_constant():
    """Check that scalar ``constant`` forwards `name` and `dtype`."""
    # `name` is forwarded to the constant; dtype is inferred (int8 for 2).
    named = constant(2, name='a')
    assert named.name == 'a'
    assert named.dtype == 'int8'
    # An explicit dtype overrides inference; name defaults to None.
    typed = constant(2, dtype='float32')
    assert typed.name is None
    assert typed.dtype == 'float32'
# Allow running this test module directly from the command line.
if __name__ == '__main__':
    unittest.main()
......@@ -276,7 +276,9 @@ def constant(x, name=None, ndim=None, dtype=None):
if (sig not in constant_cache and ret.data.size == 1 and
(-10) <= ret.data <= 10 and
(ret.dtype in int_dtypes or ret.dtype in uint_dtypes or
(ret.dtype in float_dtypes and int(ret.data) == ret.data))):
(ret.dtype in float_dtypes and
# Limit the size of the cache.
len(constant_cache) < 10000))):
constant_cache[sig] = ret
# This is needed to raise a good error to the user.
ret.cached = True
......
......@@ -1045,8 +1045,8 @@ second dimension
Py_XINCREF(%(oname)s);
""" % locals()
# We alias the scalar variables
defines += "#define %(oname)s_i %(iname)s_i" % locals()
undefs += "#undef %(oname)s_i" % locals()
defines += "#define %(oname)s_i %(iname)s_i\n" % locals()
undefs += "#undef %(oname)s_i\n" % locals()
# Note: here, olv_index is either the index of the last output
# which is allocated, OR, if there are any aliased outputs,
......
......@@ -23,10 +23,12 @@ class BNComposite(Composite):
def grad(self, inps, grads):
    """Gradient of the batch-normalization expression
    ``gamma * (x - mean) / std + beta`` with respect to each input.

    Parameters
    ----------
    inps : sequence of 5 variables
        ``(x, mean, std, gamma, beta)``.
    grads : sequence of 1 variable
        The gradient flowing into the single output.

    Returns
    -------
    list
        Gradients w.r.t. ``x``, ``mean``, ``std``, ``gamma``, ``beta``.
    """
    # The scraped merge left both the pre-change and post-change
    # assignment groups in this body; the first four were dead stores.
    # Only the factored (post-change) version is kept.
    x, mean, std, gamma, beta = inps
    top, = grads
    # Factor out the shared subexpressions to keep the graph small.
    top_gamma = top * gamma
    x_mean = x - mean
    dx = top_gamma / std
    # d(out)/d(mean) = -d(out)/d(x), so reuse dx.
    dmean = -dx
    dstd = -(top_gamma * x_mean) / (std * std)
    dgamma = top * x_mean / std
    # d(out)/d(beta) is 1, so the beta gradient is `top` itself.
    return [dx, dmean, dstd, dgamma, top]
......
......@@ -1041,11 +1041,14 @@ class ShapeFeature(object):
rval.append(None)
return rval
def unpack(self, s_i):
def unpack(self, s_i, var):
"""Return a symbolic integer scalar for the shape element s_i.
The s_i argument was produced by the infer_shape() of an Op subclass.
var: the variable that correspond to s_i. This is just for
error reporting.
"""
# unpack the s_i that the Op returned
assert s_i is not None
......@@ -1059,7 +1062,10 @@ class ShapeFeature(object):
isinstance(s_i, numpy.integer) or
(isinstance(s_i, numpy.ndarray) and s_i.ndim == 0)):
# this shape is a constant
assert s_i >= 0
if s_i < 0:
msg = "There is a negative shape in the graph!"
msg += gof.utils.get_variable_trace_string(var)
raise ValueError(msg)
return T.constant(s_i, dtype='int64')
if type(s_i) in (tuple, list):
# this dimension is the same as many of the inputs
......@@ -1137,7 +1143,7 @@ class ShapeFeature(object):
r.type.broadcastable[i]):
shape_vars.append(self.lscalar_one)
else:
shape_vars.append(self.unpack(s[i]))
shape_vars.append(self.unpack(s[i], r))
assert all([not hasattr(r.type, "broadcastable") or
not r.type.broadcastable[i] or
# The two following comparison are a speed optimization
......@@ -1238,7 +1244,7 @@ class ShapeFeature(object):
new_shape = []
for j, s_j in enumerate(prev_shape):
if j == i:
new_shape.append(self.unpack(s_i))
new_shape.append(self.unpack(s_i, r))
else:
new_shape.append(s_j)
assert all([not hasattr(r.type, "broadcastable") or
......@@ -7001,6 +7007,10 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 32,
if type(node.op) is not OP:
return False
if len(node.outputs) > 1:
# We don't support the fusion for nodes with multiple outputs.
return
inputs = [] # inputs of the new Elemwise op.
s_inputs = [] # inputs of the new scalar op used by the Composite.
# Inputs of the new scalar op that represents the current node.
......@@ -7331,6 +7341,26 @@ else:
'FusionOptimizer')
@register_canonicalize
@gof.local_optimizer([Elemwise])
def local_useless_composite(node):
    """Prune unused outputs from an Elemwise Composite node.

    When an Elemwise whose scalar_op is a Composite has outputs with no
    clients, rebuild it with only the outputs that are actually used and
    map the kept old outputs to the new ones.
    """
    op = node.op
    # Short-circuit keeps us from touching scalar_op on non-Elemwise ops.
    if not (isinstance(op, Elemwise) and
            isinstance(op.scalar_op, scalar.Composite)):
        return
    comp = op.scalar_op
    used = [i for i, out in enumerate(node.outputs) if out.clients]
    # Nothing to prune: every output has at least one client.
    if len(used) == len(node.outputs):
        return
    trimmed = scalar.Composite(inputs=comp.inputs,
                               outputs=[comp.outputs[i] for i in used])
    replacements = Elemwise(scalar_op=trimmed)(*node.inputs,
                                               return_list=True)
    return dict(zip([node.outputs[i] for i in used], replacements))
# ############################
# # Remove consider_constant #
# ############################
......
......@@ -1526,6 +1526,26 @@ class TestCompositeCodegen(unittest.TestCase):
fval = numpy.asarray(f([1, 2, 3]))
assert numpy.all(fval == [6, 12, 18]), fval
def test_local_useless_composite(self):
    """Each compiled function keeps only the Composite output it uses."""
    s = theano.scalar.float32()
    comp = theano.scalar.Composite([s], [s + 1, s - 1])
    X = theano.tensor.matrix()
    outs = theano.tensor.Elemwise(scalar_op=comp)(X)
    mode = theano.compile.mode.get_default_mode().including(
        'local_useless_composite')
    # Compile against each output in turn; the other one must be pruned.
    for out, expected in ((outs[0], [[2.]]), (outs[1], [[0.]])):
        f = theano.function([X], out, mode=mode)
        topo = f.maker.fgraph.toposort()
        assert len(topo) == 1
        assert len(topo[0].outputs) == 1
        utt.assert_allclose(f([[1.]]), expected)
def test_log1p():
m = theano.config.mode
......
Markdown format supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment