Commit 13989ba3 authored by lamblin

Merge pull request #1207 from nouiz/infer_shape_broadcast

Infer shape broadcast
# Prevent git from showing duplicate names with commands like "git shortlog"
# # See the manpage of git-shortlog for details.
# # The syntax is:
# # Name that should be used <email that should be used> Bad name <bad email>
# #
# # You can skip Bad name if it is the same as the one that should be used, and is unique.
# #
# # This file is up-to-date if the command git log --format="%aN <%aE>" | sort -u
# # gives no duplicates.
<abergeron@gmail.com> <anakha@kami.(none)>
David Warde-Farley <wardefar@iro.umontreal.ca> David Warde-Farley <dwf@cs.toronto.edu>
David Warde-Farley <wardefar@iro.umontreal.ca> David Warde Farley <dwf@cs.toronto.edu>
......
......@@ -39,7 +39,7 @@ probably do something similar on older computer.
Installation steps
~~~~~~~~~~~~~~~~~~
Ubuntu 11.10/12.04:
Ubuntu 11.10/12.04/12.10:
1) ``sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git``
2) ``sudo pip install Theano``
......@@ -70,7 +70,7 @@ Theano/BLAS speed test:
.. code-block:: bash
python /usr/lib/python2.*/site-packages/theano/misc/check_blas.py
python `python -c "import os, theano; print os.path.dirname(theano.__file__)"`/misc/check_blas.py
This will print a table with different versions of BLAS/numbers of
threads on multiple CPUs and GPUs. It will also print some Theano/NumPy
......@@ -163,6 +163,8 @@ Test GPU configuration
Ubuntu 12.04 LTS: default gcc version 4.6.3. gcc 4.4.7 and 4.5.3 available.
Ubuntu 12.10: default gcc version 4.7.2. gcc 4.4.7, 4.5.4 and 4.6.3 available.
......
......@@ -1472,7 +1472,7 @@ class GCC_compiler(object):
#cxxflags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
# numpy 1.7 deprecated the following macro but the didn't
# numpy 1.7 deprecated the following macro but the new one didn't
# existed in the past
if bool(numpy_ver < [1, 7]):
cxxflags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
......@@ -1609,6 +1609,7 @@ class GCC_compiler(object):
try:
p = call_subprocess_Popen(cmd, stderr=subprocess.PIPE)
p.wait()
compile_stderr = p.communicate()[1]
except Exception:
# An exception can occur e.g. if `g++` is not found.
......
......@@ -194,41 +194,28 @@ if __name__ == "__main__":
goto2 1.13/16 3.16s
Test time in float32
(cuda version 3.2RC and up have a faster gemm on the Fermi/GTX[45]??)
gpu/cuda version
M2050(Amazon)/5.0 0.25s
GTX680/4.2 0.154s
GTX580/4.2 0.164s
GTX480/4.2 0.192s
GTX470/4.2 0.238s
C2075/4.2 0.25s
GTX285/4.2 0.452s #cuda 3.0 seam faster? driver version?
GT520/4.2 2.68s
GTX560/4.2 0.30s
GTX460/4.0 0.45s
GTX580/3.2 0.203s
GTX680/3.2 0.218s
GTX480/3.2 0.237s
GTX470/3.2 0.297s
GTX285/3.2 0.452s #cuda 3.0 seam faster? driver version?
GTX480/3.0 0.27s
M2070/4.1 0.27s
GTX470/3.2 0.29s
M2070/3.2 0.32s
GTX470/3.0 0.34s
GTX285/3.0 0.40s
C1060/3.2 0.46s
GTX550Ti/4.0 0.57s
520/3.2 3.06s
520M/3.2 3.19s with bumblebee on Ubuntu 12.04
GT220/3.2RC 3.80s
GT210/4.0 6.35s
8500GT/3.0 10.68s
cuda version 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
M2070 0.25s 0.27s 0.32s
M2050(Amazon) 0.25s
C2075 0.25s
C1060 0.46s
GTX680 0.154s 0.218s
GTX580 0.164s 0.203s
GTX480 0.192s 0.237s 0.27s
GTX470 0.238s 0.297s 0.34s
GTX660 0.24s
GTX560 0.30s
GTX460 0.37s 0.45s
GTX285                       0.452s        0.452s 0.40s  # cuda 3.0 seems faster? driver version?
GTX550Ti 0.57s
GT520 2.68s 3.06s
520M 3.19s # with bumblebee on Ubuntu 12.04
GT220 3.80s
GT210 6.35s
8500GT 10.68s
"""
t, impl = execute(not options.print_only, not options.quiet,
......
......@@ -218,7 +218,7 @@ if cuda_available:
atexit.register(gpu_shutdown)
except EnvironmentError, e:
cuda_available = False
cuda_initialization_error_message = e.message
cuda_initialization_error_message = " ".join(e.args)
class GpuOp(theano.gof.Op):
......
......@@ -561,6 +561,9 @@ class ScalarVariable(_scalar_py_operators, Variable):
class ScalarConstant(_scalar_py_operators, Constant):
pass
# Register ScalarConstant as the type of Constant corresponding to Scalar
Scalar.Constant = ScalarConstant
# Easy constructors
......
......@@ -519,7 +519,6 @@ def get_scalar_constant_value(v):
if isinstance(v, numpy.ndarray):
return numpy_scalar(v)
if isinstance(v, Constant):
if getattr(v.tag, 'unique_value', None) is not None:
data = v.tag.unique_value
......@@ -528,11 +527,9 @@ def get_scalar_constant_value(v):
return numpy_scalar(data)
if v.owner:
if isinstance(v.owner.op, Alloc):
return get_scalar_constant_value(v.owner.inputs[0])
if isinstance(v.owner.op, DimShuffle):
return get_scalar_constant_value(v.owner.inputs[0])
if isinstance(v.owner.op, Rebroadcast):
if isinstance(v.owner.op, (Alloc, DimShuffle, Rebroadcast,
compile.ops.OutputGuard,
compile.DeepCopyOp)):
return get_scalar_constant_value(v.owner.inputs[0])
if isinstance(v.owner.op, Elemwise) and \
isinstance(v.owner.op.scalar_op, scal.Second):
......@@ -2007,6 +2004,13 @@ class TensorConstant(_tensor_py_operators, Constant):
def signature(self):
    """Build this constant's signature from its type and data."""
    sig_contents = (self.type, self.data)
    return TensorConstantSignature(sig_contents)
def equals(self, other):
    """Override Constant.equals so that comparison against a raw
    numpy.ndarray also works: the array is first wrapped in a
    TensorConstant, then the two signatures are compared."""
    if isinstance(other, numpy.ndarray):
        # Wrap the ndarray so both sides expose signature().
        other = constant(other)
    if not isinstance(other, TensorConstant):
        return False
    return self.signature() == other.signature()
TensorType.Constant = TensorConstant
......
......@@ -813,7 +813,18 @@ class ShapeFeature(object):
"for a variable with %d dimensions." % (
len(s), r.ndim))
shape_vars = [self.unpack(s_i) for s_i in s]
shape_vars = []
for i in range(r.ndim):
if (hasattr(r.type, 'broadcastable') and
r.type.broadcastable[i]):
shape_vars.append(self.lscalar_one)
else:
shape_vars.append(self.unpack(s[i]))
assert all([not r.type.broadcastable[i] or
self.lscalar_one.equals(shape_vars[i]) or
self.lscalar_one.equals(
T.extract_constant(shape_vars[i]))
for i in range(r.ndim)])
self.shape_of[r] = tuple(shape_vars)
for sv in shape_vars:
self.shape_of_reverse_index.setdefault(sv, set()).add(r)
......@@ -855,6 +866,12 @@ class ShapeFeature(object):
merged_shape.append(r_shape[i])
else:
merged_shape.append(other_shape[i])
assert all([(not r.type.broadcastable[i] and
not other_r.type.broadcastable[i]) or
self.lscalar_one.equals(merged_shape[i]) or
self.lscalar_one.equals(
T.extract_constant(merged_shape[i]))
for i in range(r.ndim)])
self.shape_of[r] = tuple(merged_shape)
for sv in self.shape_of[r]:
self.shape_of_reverse_index.setdefault(sv, set()).add(r)
......@@ -871,6 +888,10 @@ class ShapeFeature(object):
new_shape.append(self.unpack(s_i))
else:
new_shape.append(s_j)
assert all([not r.type.broadcastable[i] or
self.lscalar_one.equals(new_shape[i]) or
self.lscalar_one.equals(T.extract_constant(new_shape[i]))
for i in range(r.ndim)])
self.shape_of[r] = tuple(new_shape)
for sv in self.shape_of[r]:
self.shape_of_reverse_index.setdefault(sv, set()).add(r)
......
......@@ -5456,8 +5456,9 @@ class test_tensordot(unittest.TestCase):
f1 = inplace_func([avec, bvec], c)
aval = rand(5)
bval = rand(5)
self.assertTrue(numpy.tensordot(aval, bval, axes) == \
f1(aval, bval))
out0 = numpy.tensordot(aval, bval, axes)
out1 = f1(aval, bval)
self.assertTrue(numpy.allclose(out0, out1), (out0, out1))
utt.verify_grad(self.TensorDot(axes), [aval, bval])
# Test matrix-vector
......
......@@ -2475,6 +2475,57 @@ class test_shapeoptimizer(unittest.TestCase):
assert len(topo) == 1
assert topo[0].op == deep_copy_op
@staticmethod
def max_pool_c01b(c01b, pool_shp, pool_stride, img_shp):
    """Like max_pool but with input using axes ('c', 0, 1, 'b')
    (Alex Krizhevsky format)

    pool_shp, pool_stride and img_shp are int that represent
    the same shp in x and y.
    """
    # Running elementwise maximum over all pool-window offsets.
    mx = None

    # Compute index in pooled space of last needed pool
    # (needed = each input pixel must appear in at least one pool)
    def last_pool(im_shp, p_shp, p_strd):
        # Smallest pool index whose window reaches the image's last pixel.
        rval = int(numpy.ceil(float(im_shp - p_shp) / p_strd))
        # Sanity checks: this pool covers the end of the image, and the
        # previous one did not.
        assert p_strd * rval + p_shp >= im_shp
        assert p_strd * (rval - 1) + p_shp < im_shp
        return rval

    # Compute starting row of the last pool
    last_pool_r = last_pool(img_shp, pool_shp, pool_stride) * pool_stride
    # Compute number of rows needed in img for all indexes to work out
    required_r = last_pool_r + pool_shp

    # Same computation for columns; the image is square (img_shp for both
    # dimensions), so the values are identical.
    last_pool_c = last_pool(img_shp, pool_shp, pool_stride) * pool_stride
    required_c = last_pool_c + pool_shp

    # Pad the image with -inf so positions outside the original image can
    # never win the max below.
    wide_infinity = T.alloc(-numpy.inf, c01b.shape[0],
                            required_r, required_c, c01b.shape[3])
    c01b = T.set_subtensor(wide_infinity[:, 0:img_shp, 0:img_shp, :], c01b)

    # For each offset inside the pool window, take a strided slice that
    # picks that offset's pixel from every pool, and fold it into the
    # elementwise maximum.
    for row_within_pool in xrange(pool_shp):
        row_stop = last_pool_r + row_within_pool + 1
        for col_within_pool in xrange(pool_shp):
            col_stop = last_pool_c + col_within_pool + 1
            cur = c01b[:, row_within_pool:row_stop:pool_stride,
                       col_within_pool:col_stop:pool_stride, :]
            if mx is None:
                mx = cur
            else:
                mx = T.maximum(mx, cur)
    return mx
def test_broadcasted_dims(self):
    """Regression test: max_pool_c01b on a fully-broadcastable
    (1, 1, 1, 1) input used to crash during optimization."""
    input_shape = (1, 1, 1, 1)
    seed = utt.fetch_seed()
    rng = numpy.random.RandomState(seed)
    data = rng.rand(*input_shape).astype(config.floatX)
    a = shared(data)
    pooled = self.max_pool_c01b(a, 1, 1, 1)
    fn = theano.function([], pooled)
    # Compilation and execution must not raise.
    fn()
def test_local_track_shape_i(self):
class IdentityNoShape(gof.Op):
'''Op that does not infer the output shape from the input one'''
......
......@@ -55,10 +55,12 @@ nosetests.
# Standard-library imports (the duplicated `import datetime` was removed).
import cPickle
import datetime
import os
import subprocess
import sys
import time

# Project imports.
import theano
from theano.misc.windows import call_subprocess_Popen
......@@ -261,8 +263,8 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
n_tests + 1)):
# Print the test we will start in the raw log to help
# debug tests that are too long.
f_rawlog.write("\nWill run test #%d %s\n" % (test_id,
data["ids"][test_id]))
f_rawlog.write("\n%s Will run test #%d %s\n" % (
time.ctime(), test_id, data["ids"][test_id]))
f_rawlog.flush()
proc = call_subprocess_Popen(
......
......@@ -64,7 +64,8 @@ class OrderedUpdates(OrderedDict):
# Warn about non-determinism.
warnings.warn('Updating an `OrderedUpdates` with a '
'non-ordered dictionary with 2+ elements could '
'make your code non-deterministic')
'make your code non-deterministic',
stacklevel=2)
for key, val in OrderedDict(other).iteritems():
if key in self:
if self[key] == val:
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment