Merge pull request #2725 from nouiz/alloc_empty

Disable GpuAllocEmpty merge

Merge pull request #2725 from nouiz/alloc_empty
d304bb64 · abergeron · e85ae3bd · 4dad8e79 · d304bb64 · d304bb64
--- a/doc/library/tensor/signal/downsample.txt
+++ b/doc/library/tensor/signal/downsample.txt
@@ -11,6 +11,7 @@
 .. autofunction:: theano.tensor.signal.downsample.max_pool_2d
+.. autofunction:: theano.tensor.signal.downsample.max_pool_2d_same_size
 .. function:: fft(*todo)

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -655,6 +655,16 @@ class PureOp(object):
        """
        return True
+    def do_merge(self, node):
+        """Return False to disable the merge of this op in the graph.
+        Disabling the merge is very rarely a good idea. Do not override
+        this if you do not understand this small comment. You probably
+        do not need it.
+        """
+        return True
 class Op(utils.object2, PureOp, CLinkerOp):
    """Convenience class to bundle `PureOp` and `CLinkerOp`"""

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -517,6 +517,8 @@ class MergeFeature(object):
        """Check if a node can be merged, and queue that replacement."""
        if node in self.nodes_seen:
            return
+        if not node.op.do_merge(node):
+            return
        # These asserts ensure that the fgraph has set the clients field
        # properly.

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2584,6 +2584,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
+        if y_.type.ndim > x_.type.ndim:
+            # Reject a y with more dimensions than x, naming the
+            # requested operation (set vs. increment) in the message.
+            opname = 'set' if self.set_instead_of_inc else 'increment'
+            # The format string has exactly one placeholder per value.
+            raise TypeError(
+                'cannot %s x subtensor with ndim=%s'
+                ' by y with ndim=%s' % (
+                    opname, x_.type.ndim, y_.type.ndim))
        return Apply(self, [x_, y_, ilist_], [x_.type()])
@@ -2750,6 +2759,15 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
+        if y_.type.ndim > x_.type.ndim:
+            # Reject a y with more dimensions than x, naming the
+            # requested operation (set vs. increment) in the message.
+            opname = 'set' if self.set_instead_of_inc else 'increment'
+            # The format string has exactly one placeholder per value.
+            raise TypeError(
+                'cannot %s x subtensor with ndim=%s'
+                ' by y with ndim=%s' % (
+                    opname, x_.type.ndim, y_.type.ndim))
        return Apply(self, [x_, y_, ilist_], [x_.type()])
@@ -3288,6 +3306,9 @@ class GpuAllocEmpty(GpuOp):
            # XXX: We could implement and call CudaNdarray.empty(sh) instead.
            out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
+    def do_merge(self, node):
+        # Opt out of the merge: identical GpuAllocEmpty nodes are kept as
+        # separate nodes in the graph.
+        # NOTE(review): presumably because each output is a scratch
+        # buffer that its consumer fills in -- confirm.
+        return False
    def c_code(self, node, name, inputs, out_, sub):
        out, = out_
        fail = sub['fail']
@@ -3340,6 +3361,9 @@ class GpuAlloc(GpuAllocEmpty):
    """
    __props__ = ('memset_0',)
+    def do_merge(self, node):
+        # GpuAlloc subclasses GpuAllocEmpty, whose do_merge returns False;
+        # override it here so that GpuAlloc nodes can still be merged.
+        return True
    def __init__(self, memset_0=False):
        self.memset_0 = memset_0

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -372,6 +372,26 @@ def test_reshape():
        pass
+def test_alloc_empty():
+    """Check GpuAllocEmpty allocation and that equal nodes stay unmerged."""
+    alloc = cuda.basic_ops.gpu_alloc_empty
+    # A single allocation compiles down to exactly one apply node.
+    fn = theano.function([], alloc(2, 3))
+    assert len(fn.maker.fgraph.apply_nodes) == 1
+    result = fn()
+    assert result.shape == (2, 3)
+    assert result.dtype == 'float32'
+    # Two identical allocations must remain two distinct nodes.
+    fn = theano.function([], [alloc(2, 3), alloc(2, 3)])
+    for res in fn():
+        assert res.shape == (2, 3)
+        assert res.dtype == 'float32'
+    alloc_nodes = [n for n in fn.maker.fgraph.apply_nodes
+                   if isinstance(n.op, cuda.basic_ops.GpuAllocEmpty)]
+    assert len(alloc_nodes) == 2
 def test_elemwise_empty():
    # test with 0 element
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(0, 0),
@@ -953,8 +973,7 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
        utt.seed_rng()
        self.mode = mode_with_gpu.excluding('constant_folding')
        self.join_op = cuda.GpuJoin()
-        # No gpu split.
+        self.split_op_class = cuda.GpuSplit
-        self.split_op_class = tensor.Split
        # No Make vector on the gpu, Join used instead
        self.make_vector_op = cuda.GpuJoin()
        self.floatX = "float32"