提交 ba67b348，作者：abergeron

Merge pull request #2829 from nouiz/merge_conv2

Merge conv2
......@@ -658,16 +658,6 @@ class PureOp(object):
"""
return True
def do_merge(self, node):
    """Return whether nodes applying this op may be merged in the graph.

    Override to return False to opt an op out of graph merging.
    Disabling merging is very rarely a good idea; do not override
    this unless you fully understand the consequences.
    """
    return True
class Op(utils.object2, PureOp, CLinkerOp):
    """Convenience class combining `PureOp` and `CLinkerOp` in one base."""
......
......@@ -509,8 +509,6 @@ class MergeFeature(object):
"""Check if a node can be merged, and queue that replacement."""
if node in self.nodes_seen:
return
if not node.op.do_merge(node):
return
# These asserts ensure that the fgraph has set the clients field
# properly.
......
......@@ -3299,9 +3299,6 @@ class GpuAllocEmpty(GpuOp):
# XXX: We could implement and call CudaNdarray.empty(sh) instead.
out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
def do_merge(self, node):
    """Opt this op out of graph merging (see ``PureOp.do_merge``)."""
    return False
def c_code(self, node, name, inputs, out_, sub):
out, = out_
fail = sub['fail']
......@@ -3354,9 +3351,6 @@ class GpuAlloc(GpuAllocEmpty):
"""
__props__ = ('memset_0',)
def do_merge(self, node):
    """Re-enable graph merging, overriding the parent's opt-out."""
    return True
def __init__(self, memset_0=False):
    # memset_0: presumably requests zero-filling the allocated buffer
    # (the op's only prop, per ``__props__``) — TODO confirm against c_code.
    self.memset_0 = memset_0
......
......@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu,
gpu_alloc_empty)
gpu_alloc_empty, GpuAllocEmpty)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
......@@ -1533,19 +1533,37 @@ if True:
def local_dnn_conv_inplace(node):
    """Replace a non-inplace GpuDnnConv with an inplace one.

    If the destination buffer (inputs[2]) comes from a GpuAllocEmpty
    that has other clients, substitute a fresh GpuAllocEmpty so the
    inplace op does not clobber a buffer shared elsewhere in the graph.
    """
    if type(node.op) != GpuDnnConv or node.op.inplace:
        return
    # NOTE(review): the scraped block had a stale early
    # `return [GpuDnnConv(...)(*node.inputs)]` here (pre-refactor diff
    # line) that made everything below unreachable; removed.
    inputs = list(node.inputs)
    dest = inputs[2]
    if (dest.owner and
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        # Give this conv its own private output buffer.
        inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
    return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
    """Replace a non-inplace GpuDnnConvGradW with an inplace one.

    As for the forward conv, a destination buffer produced by a
    GpuAllocEmpty with multiple clients is replaced by a fresh
    allocation so the inplace write stays private to this node.
    """
    if type(node.op) != GpuDnnConvGradW or node.op.inplace:
        return
    # NOTE(review): removed a stale early
    # `return [GpuDnnConvGradW(inplace=True)(*node.inputs)]` (leftover
    # pre-refactor diff line) that made the code below unreachable.
    inputs = list(node.inputs)
    dest = inputs[2]
    if (dest.owner and
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
    return [GpuDnnConvGradW(inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
    """Replace a non-inplace GpuDnnConvGradI with an inplace one.

    A destination buffer produced by a GpuAllocEmpty with multiple
    clients is replaced by a fresh allocation so the inplace write
    stays private to this node.
    """
    if type(node.op) != GpuDnnConvGradI or node.op.inplace:
        return
    # NOTE(review): removed a stale early
    # `return [GpuDnnConvGradI(inplace=True)(*node.inputs)]` (leftover
    # pre-refactor diff line) that made the code below unreachable.
    inputs = list(node.inputs)
    dest = inputs[2]
    if (dest.owner and
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
    return [GpuDnnConvGradI(inplace=True)(*inputs)]
optdb.register('local_dnn_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
......
......@@ -12,6 +12,7 @@ from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
......@@ -49,6 +50,99 @@ def test_dnn_conv_desc_merge():
assert d1 != d2
def test_dnn_conv_merge():
    """Check that duplicated dnn_conv nodes are merged into one.

    This case is harder than usual because the GpuAllocEmpty nodes
    feeding the convolutions are themselves not merged.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
    img = T.ftensor4('img')
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc = dnn.GpuDnnConvDesc(
        border_mode='valid')(img.shape, kern.shape)

    # Forward op: two identical convs must collapse into a single node.
    o1 = dnn.dnn_conv(img, kern)
    o2 = dnn.dnn_conv(img, kern)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
               numpy.random.rand(*kern_shp).astype('float32'))
    nodes = f.maker.fgraph.toposort()
    assert len([n for n in nodes if isinstance(n.op, dnn.GpuDnnConv)]) == 1

    # Gradient wrt the weights: same merging expectation.
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    nodes = f.maker.fgraph.toposort()
    assert len([n for n in nodes
                if isinstance(n.op, dnn.GpuDnnConvGradW)]) == 1

    # Gradient wrt the input image: same merging expectation.
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    nodes = f.maker.fgraph.toposort()
    assert len([n for n in nodes
                if isinstance(n.op, dnn.GpuDnnConvGradI)]) == 1
def test_dnn_conv_inplace():
    """Check that the inplace optimization still applies correctly
    even when GpuAllocEmpty nodes get merged together.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
    img = T.ftensor4('img')
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')(
        img.shape, kern.shape)
    desc2 = dnn.GpuDnnConvDesc(
        border_mode='valid', conv_mode='cross')(img.shape, kern.shape)

    def check_inplace(fn, op_class):
        # Both convs must be inplace, and each must keep its own
        # GpuAllocEmpty output buffer (two in total, not shared).
        topo = fn.maker.fgraph.toposort()
        convs = [n for n in topo if isinstance(n.op, op_class)]
        assert len(convs) == 2
        assert all([n.op.inplace for n in convs])
        assert len([n for n in topo
                    if isinstance(n.op, GpuAllocEmpty)]) == 2

    # Forward op.
    o1 = dnn.dnn_conv(img, kern, conv_mode='conv')
    o2 = dnn.dnn_conv(img, kern, conv_mode='cross')
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
               numpy.random.rand(*kern_shp).astype('float32'))
    check_inplace(f, dnn.GpuDnnConv)

    # Gradient wrt the weights.
    out = gpu_alloc_empty(*kern.shape)
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    check_inplace(f, dnn.GpuDnnConvGradW)

    # Gradient wrt the input image.
    out = gpu_alloc_empty(*img.shape)
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    check_inplace(f, dnn.GpuDnnConvGradI)
def pool_2d_i2n(input, ds=(2, 2), strides=None,
pad=(0, 0),
pool_function=T.max, mode='ignore_borders'):
......@@ -338,7 +432,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
numpy.random.rand(2, 1, 5, 6),
dtype='float32'
)
out_vals = numpy.zeros((3, 3, 1, 1), dtype='float32')
for params in product(
['valid', 'full'],
......@@ -500,7 +593,7 @@ def test_dnn_conv_border_mode():
dnn.dnn_conv(img, kern, border_mode='valid')
def test_dnn_conv_merge():
def test_dnn_conv_alpha_output_merge():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
img = T.ftensor4()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论