Fix the merge of GpuDnnConv* by enabling again the merge of GpuAllocEmpty and…

Fix the merging of GpuDnnConv* nodes by re-enabling the merging of GpuAllocEmpty nodes, and fix the inplace optimizations by duplicating a shared GpuAllocEmpty output buffer in the inplace opt

Fix the merge of GpuDnnConv* by enabling again the merge of GpuAllocEmpty and…
21351eed · Frederic · b75cf2e1 · 21351eed · 21351eed · 21351eed
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -3299,9 +3299,6 @@ class GpuAllocEmpty(GpuOp):
            # XXX: We could implement and call CudaNdarray.empty(sh) instead.
            out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
-    def do_merge(self, node):
-        return False
    def c_code(self, node, name, inputs, out_, sub):
        out, = out_
        fail = sub['fail']
@@ -3354,9 +3351,6 @@ class GpuAlloc(GpuAllocEmpty):
    """
    __props__ = ('memset_0',)
-    def do_merge(self, node):
-        return True
    def __init__(self, memset_0=False):
        self.memset_0 = memset_0

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           host_from_gpu,
                                           gpu_contiguous, HostFromGpu,
-                                           gpu_alloc_empty)
+                                           gpu_alloc_empty, GpuAllocEmpty)
 from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
                                      GpuDownsampleFactorMaxGrad)
 from theano.sandbox.cuda.nnet import GpuSoftmax
@@ -1533,19 +1533,37 @@ if True:
    def local_dnn_conv_inplace(node):
        if type(node.op) != GpuDnnConv or node.op.inplace:
            return
-        return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*node.inputs)]
+        inputs = list(node.inputs)
+        dest = inputs[2]
+        if (dest.owner and
+                isinstance(dest.owner.op, GpuAllocEmpty) and
+                len(dest.clients) > 1):
+            inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
+        return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)]
    @local_optimizer([GpuDnnConvGradW], inplace=True)
    def local_dnn_convgw_inplace(node):
        if type(node.op) != GpuDnnConvGradW or node.op.inplace:
            return
-        return [GpuDnnConvGradW(inplace=True)(*node.inputs)]
+        inputs = list(node.inputs)
+        dest = inputs[2]
+        if (dest.owner and
+                isinstance(dest.owner.op, GpuAllocEmpty) and
+                len(dest.clients) > 1):
+            inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
+        return [GpuDnnConvGradW(inplace=True)(*inputs)]
    @local_optimizer([GpuDnnConvGradI], inplace=True)
    def local_dnn_convgi_inplace(node):
        if type(node.op) != GpuDnnConvGradI or node.op.inplace:
            return
-        return [GpuDnnConvGradI(inplace=True)(*node.inputs)]
+        inputs = list(node.inputs)
+        dest = inputs[2]
+        if (dest.owner and
+                isinstance(dest.owner.op, GpuAllocEmpty) and
+                len(dest.clients) > 1):
+            inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
+        return [GpuDnnConvGradI(inplace=True)(*inputs)]
    optdb.register('local_dnn_conv_inplace',
                   tensor.opt.in2out(local_dnn_conv_inplace,

--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
@@ -12,6 +12,7 @@ from theano.sandbox.neighbours import images2neibs
 from theano.tensor.signal.downsample import max_pool_2d
 from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
 import theano.sandbox.cuda.dnn as dnn
+from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
 # Skip test if cuda_ndarray is not available.
 import theano.sandbox.cuda as cuda
@@ -49,6 +50,101 @@ def test_dnn_conv_desc_merge():
    assert d1 != d2
+def test_dnn_conv_merge():
+    """Test that multiple identical dnn_conv are merged correctly.
+    This is more difficult because the GpuAllocEmpty nodes they use
+    were previously not merged.
+    """
+    if not cuda.dnn.dnn_available():
+        raise SkipTest(cuda.dnn.dnn_available.msg)
+    img_shp = [2, 5, 6, 8]
+    kern_shp = [3, 5, 5, 6]
+    out_shp = [2, 3, 2, 3]
+    img = T.ftensor4('img')
+    kern = T.ftensor4('kern')
+    out = T.ftensor4('out')
+    desc = dnn.GpuDnnConvDesc(
+        border_mode='valid')(img.shape, kern.shape)
+    # Test forward op
+    o1 = dnn.dnn_conv(img, kern)
+    o2 = dnn.dnn_conv(img, kern)
+    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
+    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
+               numpy.random.rand(*kern_shp).astype('float32'))
+    topo = f.maker.fgraph.toposort()
+    assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]) == 1
+    # Test grad w op
+    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
+    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
+    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradW)]) == 1
+    # Test grad i op
+    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
+    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
+    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradI)]) == 1
+def test_dnn_conv_inplace():
+    """Test that the inplace optimizations work correctly even when
+    the GpuAllocEmpty nodes get merged together.
+    """
+    if not cuda.dnn.dnn_available():
+        raise SkipTest(cuda.dnn.dnn_available.msg)
+    img_shp = [2, 5, 6, 8]
+    kern_shp = [3, 5, 5, 6]
+    out_shp = [2, 3, 2, 3]
+    img = T.ftensor4('img')
+    kern = T.ftensor4('kern')
+    out = T.ftensor4('out')
+    desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')(
+        img.shape, kern.shape)
+    desc2 = dnn.GpuDnnConvDesc(
+        border_mode='valid', conv_mode='cross')(img.shape, kern.shape)
+    # Test forward op
+    o1 = dnn.dnn_conv(img, kern, conv_mode='conv')
+    o2 = dnn.dnn_conv(img, kern, conv_mode='cross')
+    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
+    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
+               numpy.random.rand(*kern_shp).astype('float32'))
+    topo = f.maker.fgraph.toposort()
+    convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]
+    assert len(convs) == 2
+    assert all([node.op.inplace for node in convs])
+    assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
+    # Test grad w op
+    out = gpu_alloc_empty(*kern.shape)
+    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
+    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
+    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradW)]
+    assert len(convs) == 2
+    assert all([node.op.inplace for node in convs])
+    assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
+    # Test grad i op
+    out = gpu_alloc_empty(*img.shape)
+    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
+    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
+    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradI)]
+    assert len(convs) == 2
+    assert all([node.op.inplace for node in convs])
+    assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
 def pool_2d_i2n(input, ds=(2, 2), strides=None,
                pad=(0, 0),
                pool_function=T.max, mode='ignore_borders'):
@@ -500,7 +596,7 @@ def test_dnn_conv_border_mode():
    dnn.dnn_conv(img, kern, border_mode='valid')
-def test_dnn_conv_merge():
+def test_dnn_conv_alpha_output_merge():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img = T.ftensor4()