提交 ba67b348，作者：abergeron

Merge pull request #2829 from nouiz/merge_conv2

Merge conv2
......@@ -658,16 +658,6 @@ class PureOp(object):
"""
return True
def do_merge(self, node):
    """Return whether nodes applying this op may be merged in the graph.

    Override to return False to opt an op out of graph merging.
    Disabling merging is very rarely a good idea; do not override
    this unless you fully understand the consequences.
    """
    return True
class Op(utils.object2, PureOp, CLinkerOp):
    """Convenience class combining `PureOp` and `CLinkerOp` in one base."""
......
......@@ -509,8 +509,6 @@ class MergeFeature(object):
"""Check if a node can be merged, and queue that replacement."""
if node in self.nodes_seen:
return
if not node.op.do_merge(node):
return
# These asserts ensure that the fgraph has set the clients field
# properly.
......
......@@ -3299,9 +3299,6 @@ class GpuAllocEmpty(GpuOp):
# XXX: We could implement and call CudaNdarray.empty(sh) instead.
out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
def do_merge(self, node):
    """Opt this op out of graph merging (see ``PureOp.do_merge``)."""
    return False
def c_code(self, node, name, inputs, out_, sub):
out, = out_
fail = sub['fail']
......@@ -3354,9 +3351,6 @@ class GpuAlloc(GpuAllocEmpty):
"""
__props__ = ('memset_0',)
def do_merge(self, node):
    """Re-enable graph merging, overriding the parent's opt-out."""
    return True
def __init__(self, memset_0=False):
    # memset_0: presumably requests zero-filling the allocated buffer
    # (the op's only prop, per ``__props__``) — TODO confirm against c_code.
    self.memset_0 = memset_0
......
......@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu,
gpu_alloc_empty)
gpu_alloc_empty, GpuAllocEmpty)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
......@@ -1533,19 +1533,37 @@ if True:
def local_dnn_conv_inplace(node):
    """Replace a non-inplace GpuDnnConv with an inplace one.

    If the destination buffer (inputs[2]) comes from a GpuAllocEmpty
    that has other clients, substitute a fresh GpuAllocEmpty so the
    inplace op does not clobber a buffer shared elsewhere in the graph.
    """
    if type(node.op) != GpuDnnConv or node.op.inplace:
        return
    # NOTE(review): the scraped block had a stale early
    # `return [GpuDnnConv(...)(*node.inputs)]` here (pre-refactor diff
    # line) that made everything below unreachable; removed.
    inputs = list(node.inputs)
    dest = inputs[2]
    if (dest.owner and
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        # Give this conv its own private output buffer.
        inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
    return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
    """Replace a non-inplace GpuDnnConvGradW with an inplace one.

    As for the forward conv, a destination buffer produced by a
    GpuAllocEmpty with multiple clients is replaced by a fresh
    allocation so the inplace write stays private to this node.
    """
    if type(node.op) != GpuDnnConvGradW or node.op.inplace:
        return
    # NOTE(review): removed a stale early
    # `return [GpuDnnConvGradW(inplace=True)(*node.inputs)]` (leftover
    # pre-refactor diff line) that made the code below unreachable.
    inputs = list(node.inputs)
    dest = inputs[2]
    if (dest.owner and
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
    return [GpuDnnConvGradW(inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
    """Replace a non-inplace GpuDnnConvGradI with an inplace one.

    A destination buffer produced by a GpuAllocEmpty with multiple
    clients is replaced by a fresh allocation so the inplace write
    stays private to this node.
    """
    if type(node.op) != GpuDnnConvGradI or node.op.inplace:
        return
    # NOTE(review): removed a stale early
    # `return [GpuDnnConvGradI(inplace=True)(*node.inputs)]` (leftover
    # pre-refactor diff line) that made the code below unreachable.
    inputs = list(node.inputs)
    dest = inputs[2]
    if (dest.owner and
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
    return [GpuDnnConvGradI(inplace=True)(*inputs)]
optdb.register('local_dnn_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
......
......@@ -12,6 +12,7 @@ from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
......@@ -49,6 +50,99 @@ def test_dnn_conv_desc_merge():
assert d1 != d2
def test_dnn_conv_merge():
    """Check that duplicated dnn_conv nodes are merged into one.

    This case is harder than usual because the GpuAllocEmpty nodes
    feeding the convolutions are themselves not merged.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
    img = T.ftensor4('img')
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc = dnn.GpuDnnConvDesc(
        border_mode='valid')(img.shape, kern.shape)

    # Forward op: two identical convs must collapse into a single node.
    o1 = dnn.dnn_conv(img, kern)
    o2 = dnn.dnn_conv(img, kern)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
               numpy.random.rand(*kern_shp).astype('float32'))
    nodes = f.maker.fgraph.toposort()
    assert len([n for n in nodes if isinstance(n.op, dnn.GpuDnnConv)]) == 1

    # Gradient wrt the weights: same merging expectation.
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    nodes = f.maker.fgraph.toposort()
    assert len([n for n in nodes
                if isinstance(n.op, dnn.GpuDnnConvGradW)]) == 1

    # Gradient wrt the input image: same merging expectation.
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    nodes = f.maker.fgraph.toposort()
    assert len([n for n in nodes
                if isinstance(n.op, dnn.GpuDnnConvGradI)]) == 1
def test_dnn_conv_inplace():
    """Check that the inplace optimization still applies correctly
    even when GpuAllocEmpty nodes get merged together.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
    img = T.ftensor4('img')
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')(
        img.shape, kern.shape)
    desc2 = dnn.GpuDnnConvDesc(
        border_mode='valid', conv_mode='cross')(img.shape, kern.shape)

    def check_inplace(fn, op_class):
        # Both convs must be inplace, and each must keep its own
        # GpuAllocEmpty output buffer (two in total, not shared).
        topo = fn.maker.fgraph.toposort()
        convs = [n for n in topo if isinstance(n.op, op_class)]
        assert len(convs) == 2
        assert all([n.op.inplace for n in convs])
        assert len([n for n in topo
                    if isinstance(n.op, GpuAllocEmpty)]) == 2

    # Forward op.
    o1 = dnn.dnn_conv(img, kern, conv_mode='conv')
    o2 = dnn.dnn_conv(img, kern, conv_mode='cross')
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
               numpy.random.rand(*kern_shp).astype('float32'))
    check_inplace(f, dnn.GpuDnnConv)

    # Gradient wrt the weights.
    out = gpu_alloc_empty(*kern.shape)
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    check_inplace(f, dnn.GpuDnnConvGradW)

    # Gradient wrt the input image.
    out = gpu_alloc_empty(*img.shape)
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    check_inplace(f, dnn.GpuDnnConvGradI)
def pool_2d_i2n(input, ds=(2, 2), strides=None,
pad=(0, 0),
pool_function=T.max, mode='ignore_borders'):
......@@ -338,7 +432,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
numpy.random.rand(2, 1, 5, 6),
dtype='float32'
)
out_vals = numpy.zeros((3, 3, 1, 1), dtype='float32')
for params in product(
['valid', 'full'],
......@@ -500,7 +593,7 @@ def test_dnn_conv_border_mode():
dnn.dnn_conv(img, kern, border_mode='valid')
def test_dnn_conv_merge():
def test_dnn_conv_alpha_output_merge():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
img = T.ftensor4()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论