Merge pull request #754 from jaberg/ipam2012

Misc for IPAM GSS 2012

Merge pull request #754 from jaberg/ipam2012
db2bd471 · lamblin · 44e57b3c · adb7ffd3 · db2bd471 · db2bd471
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -605,7 +605,7 @@ def guess_n_streams(size, warn=True):
    Return a guess at a good number of streams.
    :param warn: If True, warn when a guess cannot be made (in which case
-    we return 30 * 256).
+    we return 60 * 256).
    """
    # TODO: a smart way of choosing the number of streams, see #612.
    # Note that this code was moved out of `MRG_RandomStreams` so that it can
@@ -618,14 +618,22 @@ def guess_n_streams(size, warn=True):
            r *= s
        if r > 6:
            r = r/6 # chosen as fastest for rbm_benchmark
-        return r
+        # The purpose of sampling from many streams is to be able to use
+        # the GPU to its full capacity.  Allocating more streams than the
+        # GPU needs just wastes RAM and stream-initialization time.
+        # XXX: This number is chosen to be good for 280 and 480 architectures.
+        #      It would be better to use pycuda to query the number of
+        #      processors on the GPU device,
+        #      rather than guessing 60.
+        return min(r, 60 * 256)
    else:
        if warn:
            warnings.warn((
                    "MRG_RandomStreams Can't determine #streams from "
-                    "size (%s), guessing 30*256") % str(size),
+                    "size (%s), guessing 60*256") % str(size),
                    stacklevel=3)
-        return 30 * 256
+        return 60 * 256
 class MRG_RandomStreams(object):

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -1085,9 +1085,9 @@ class Maximum(BinaryScalarOp):
        # max is not defined for complex_types
        gx, gy = None, None
        if x.type in float_types:
-            gx = eq(maximum(x, y), x) * gz
+            gx = cast(eq(maximum(x, y), x) * gz, x.type.dtype)
        if y.type in float_types:
-            gy = eq(maximum(x, y), y) * gz
+            gy = cast(eq(maximum(x, y), y) * gz, y.type.dtype)
        return (gx, gy)
 maximum = Maximum(upcast_out, name='maximum')
@@ -1110,9 +1110,9 @@ class Minimum(BinaryScalarOp):
        # max is not defined for complex_types
        gx, gy = None, None
        if x.type in float_types:
-            gx = eq(minimum(x, y), x) * gz
+            gx = cast(eq(minimum(x, y), x) * gz, x.type.dtype)
        if y.type in float_types:
-            gy = eq(minimum(x, y), y) * gz
+            gy = cast(eq(minimum(x, y), y) * gz, y.type.dtype)
        return (gx, gy)
 minimum = Minimum(upcast_out, name='minimum')

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -375,7 +375,7 @@ def default_blas_ldflags():
                        # options part.
                        ['-L%s' % l for l in blas_info['library_dirs']] +
                        ['-l%s' % l for l in blas_info['libraries']] +
-                        extra)
+                        [])
 #                       ['-I%s' % l for l in blas_info['include_dirs']])
    except KeyError:
        return "-lblas"

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -1594,8 +1594,9 @@ class Sum(CAReduceDtype):
            else:
                new_dims.append(i)
                i += 1
-        return Elemwise(scalar.second)(
+        ds_op = DimShuffle(gz.type.broadcastable, new_dims)
-                        x, DimShuffle(gz.type.broadcastable, new_dims)(gz)),
+        gx = Elemwise(scalar.second)(x, ds_op(gz).astype(x.dtype))
+        return [gx]
    def R_op(self, inputs, eval_points):
        # There is just one element in inputs and eval_points, the axis are

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -3136,6 +3136,39 @@ def local_cut_useless_reduce(node):
            return [summed]
+@register_canonicalize
+@gof.local_optimizer([])
+def local_sum_broadcastable(node):
+    """Remove reduction over broadcastable dimensions.
+
+    Applies to any node whose op is a `T.CAReduce`.  Reduced axes that are
+    broadcastable in the input are dropped with a dimshuffle instead of
+    being reduced over; the reduction is kept only for the remaining axes.
+
+    :param node: the Apply node being inspected by the optimizer.
+    :return: a one-element list holding the replacement output, or None
+        when the node is not a CAReduce or nothing can be removed.
+    """
+    if not isinstance(node.op, T.CAReduce):
+        return
+    reduced, = node.inputs
+    odtype = node.outputs[0].dtype
+    if node.op.axis is None:
+        # -- reduction over every axis: it can only be removed when all
+        #    of the input's dimensions are broadcastable
+        if all(reduced.broadcastable):
+            return [reduced.dimshuffle().astype(odtype)]
+        return
+    axis = list(node.op.axis)
+    cuttable = [a for a in axis if reduced.broadcastable[a]]
+    if not cuttable:
+        return
+    # -- we can remove some axes of summation,
+    #    which simplifies the codegen for sum, especially on GPU
+    # `pattern` lists the input dimensions that survive the dimshuffle.
+    pattern = [p for p in range(reduced.ndim) if p not in cuttable]
+    # Positions, within the dimshuffled input, of the axes still reduced.
+    new_axis = [ii for ii, p in enumerate(pattern) if p in axis]
+    new_reduced = reduced.dimshuffle(*pattern)
+    if not new_axis:
+        # -- in this case we can remove the reduction completely
+        return [new_reduced.astype(odtype)]
+    if type(node.op) is T.CAReduce:
+        # A plain CAReduce takes its scalar op as first constructor
+        # argument, so it cannot be rebuilt from `axis` alone.
+        new_op = T.CAReduce(node.op.scalar_op, axis=new_axis)
+    else:
+        # NOTE(review): subclasses are rebuilt from `axis` only; any other
+        # constructor state they carry (e.g. an explicit dtype) is not
+        # forwarded here -- confirm this cannot change the output type.
+        new_op = node.op.__class__(axis=new_axis)
+    return [new_op(new_reduced)]
 @register_specialize
 @gof.local_optimizer([])
 def local_sum_alloc(node):

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -46,6 +46,7 @@ from theano.tensor import (
        )
 from theano.tensor.elemwise import DimShuffle
 from theano.tests import unittest_tools as utt
+from theano.compile.mode import optdb
 mode_opt = theano.config.mode
 if mode_opt == 'FAST_COMPILE':
@@ -3288,6 +3289,51 @@ class T_local_sum(unittest.TestCase):
        finally:
            config.on_opt_error = backup
+    def test_local_sum_broadcast_all_0(self):
+        # A full sum (axis=None) over an input whose three dimensions are
+        # all broadcastable must leave no CAReduce node after optimization.
+        opt = optdb.query(self.mode._optimizer)
+        inp = T.TensorType('int64', (True, True, True))()
+        graph = Env([inp], [inp.sum()])
+        opt.optimize(graph)
+        for apply_node in graph.toposort():
+            assert not isinstance(apply_node.op, T.CAReduce)
+    def test_local_sum_broadcast_all_1(self):
+        # Summing explicitly over both axes of an all-broadcastable matrix
+        # must leave no CAReduce node after optimization.
+        opt = optdb.query(self.mode._optimizer)
+        inp = T.TensorType('int64', (True, True))()
+        graph = Env([inp], [inp.sum(axis=[0, 1])])
+        opt.optimize(graph)
+        for apply_node in graph.toposort():
+            assert not isinstance(apply_node.op, T.CAReduce)
+    def test_local_sum_broadcast_some_0(self):
+        # Sum over axes 0 and 1 of a (broadcastable, regular, broadcastable)
+        # input: exactly one CAReduce must remain after optimization.
+        opt = optdb.query(self.mode._optimizer)
+        inp = T.TensorType('int64', (True, False, True))()
+        graph = Env([inp], [inp.sum(axis=[0, 1])])
+        opt.optimize(graph)
+        nodes = graph.toposort()
+        reductions = [nd for nd in nodes if isinstance(nd.op, T.CAReduce)]
+        assert len(reductions) == 1
+        penultimate = nodes[-2]
+        assert isinstance(penultimate.op, T.CAReduce)
+        # -- the leading broadcastable dimension has been dropped
+        #   by the local_sum_broadcastable optimization;
+        #   summation is now over what was dimension 1 of the original input.
+        assert penultimate.inputs[0].ndim == 2, penultimate
+        assert penultimate.op.axis == (0,), penultimate.op.axis
+    def test_local_sum_broadcast_some_1(self):
+        # Sum over axes 0 and 2, both broadcastable, of a three-dimensional
+        # input: no CAReduce node may remain after optimization.
+        opt = optdb.query(self.mode._optimizer)
+        inp = T.TensorType('int64', (True, False, True))()
+        graph = Env([inp], [inp.sum(axis=[0, 2])])
+        opt.optimize(graph)
+        reductions = [nd for nd in graph.toposort()
+                      if isinstance(nd.op, T.CAReduce)]
+        assert not reductions
 class T_local_sum_dimshuffle(unittest.TestCase):
    def setUp(self):