提交 18fe0369 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2273 from f0k/conv-direction-hints

Make automatic insertions of dnn_conv() use the optimal cuDNN code path (2)
...@@ -1513,6 +1513,7 @@ class GpuConv(GpuOp): ...@@ -1513,6 +1513,7 @@ class GpuConv(GpuOp):
logical_kern_hw=None, logical_kern_hw=None,
logical_kern_align_top=True, logical_kern_align_top=True,
version=-1, version=-1,
direction_hint=None,
verbose=0, verbose=0,
kshp=None, kshp=None,
imshp=None, imshp=None,
...@@ -1525,6 +1526,10 @@ class GpuConv(GpuOp): ...@@ -1525,6 +1526,10 @@ class GpuConv(GpuOp):
convolution. By default we try to guess the best one. convolution. By default we try to guess the best one.
You can force one version with this parameter. This You can force one version with this parameter. This
parameter is used by the tests. parameter is used by the tests.
:param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
Serves as a hint for graph optimizers replacing
GpuConv by other implementations. If the GpuConv is
inserted automatically, we take its value from ConvOp.
:param verbose: for values of 1, 2 and 3, print more information during :param verbose: for values of 1, 2 and 3, print more information during
the execution of the convolution. Mostly used for the execution of the convolution. Mostly used for
optimization or debugging. optimization or debugging.
...@@ -1570,6 +1575,7 @@ class GpuConv(GpuOp): ...@@ -1570,6 +1575,7 @@ class GpuConv(GpuOp):
self.logical_kern_hw = logical_kern_hw self.logical_kern_hw = logical_kern_hw
self.logical_kern_align_top = logical_kern_align_top self.logical_kern_align_top = logical_kern_align_top
self.version = version self.version = version
self.direction_hint = direction_hint
self.verbose = verbose self.verbose = verbose
self.kshp = kshp self.kshp = kshp
self.imshp = imshp self.imshp = imshp
...@@ -1597,6 +1603,8 @@ class GpuConv(GpuOp): ...@@ -1597,6 +1603,8 @@ class GpuConv(GpuOp):
self.imshp = None self.imshp = None
if not hasattr(self, "max_threads_dim0"): if not hasattr(self, "max_threads_dim0"):
self.max_threads_dim0 = None self.max_threads_dim0 = None
if not hasattr(self, "direction_hint"):
self.direction_hint = None
def __hash__(self): def __hash__(self):
# don't use hash(self.version) as hash(-1)==-2 and # don't use hash(self.version) as hash(-1)==-2 and
......
...@@ -469,7 +469,7 @@ class GpuDnnConvGradI(GpuDnnConvBase): ...@@ -469,7 +469,7 @@ class GpuDnnConvGradI(GpuDnnConvBase):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv'): conv_mode='conv', direction_hint=None):
""" """
GPU convolution using cuDNN from NVIDIA. GPU convolution using cuDNN from NVIDIA.
...@@ -481,13 +481,41 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -481,13 +481,41 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
:param border_mode: one of 'valid', 'full'; additionally, the padding size :param border_mode: one of 'valid', 'full'; additionally, the padding size
could be directly specified by an integer or a pair of integers could be directly specified by an integer or a pair of integers
:param subsample: perform subsampling of the output (default: (1, 1)) :param subsample: perform subsampling of the output (default: (1, 1))
:param conv_mode: perform convolution (kernels flipped) or cross-correlation. One of 'conv', 'cross'. (default: 'conv') :param conv_mode: perform convolution (kernels flipped) or cross-correlation.
One of 'conv', 'cross'. (default: 'conv')
:param direction_hint: Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1,1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW.
If border_mode is 'full', subsample is (1,1) and direction_hint is
*not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned.
:warning: The cuDNN library only works with GPUs that have a compute :warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not capability of 3.0 or higher. This means that older GPUs will not
work with this Op. work with this Op.
""" """
if border_mode == 'full' and subsample == (1, 1): if (border_mode == 'valid' and subsample == (1,1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
if conv_mode == 'conv':
# We need to flip manually. These 'kerns' are not the kernels
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape = theano.tensor.stack(kerns.shape[1], img.shape[1],
img.shape[2] - kerns.shape[2] + 1,
img.shape[3] - kerns.shape[3] + 1)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, shape)
conv = GpuDnnConvGradW()(img, kerns, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!'):
# Special case: We can be faster by using GpuDnnConvGradI to compute # Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution. # the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution. # We just need to set up a suitable 'fake' valid convolution.
...@@ -501,6 +529,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -501,6 +529,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(shape, kerns.shape) conv_mode=conv_mode)(shape, kerns.shape)
return GpuDnnConvGradI()(kerns, img, desc) return GpuDnnConvGradI()(kerns, img, desc)
# Standard case: We use GpuDnnConv with suitable padding.
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
...@@ -1134,8 +1163,11 @@ if cuda_available: ...@@ -1134,8 +1163,11 @@ if cuda_available:
img, kern = node.inputs img, kern = node.inputs
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
return [dnn_conv(gpu_contiguous(img), gpu_contiguous(kern), direction_hint = node.op.direction_hint
border_mode=border_mode, subsample=subsample)] return [dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)]
# DISABLED as there are problems in the handling of borders # DISABLED as there are problems in the handling of borders
# @register_opt('cudnn') # @register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax]) @local_optimizer([GpuDownsampleFactorMax])
......
...@@ -1181,6 +1181,7 @@ def local_gpu_conv(node): ...@@ -1181,6 +1181,7 @@ def local_gpu_conv(node):
logical_kern_align_top=op.kshp_logical_top_aligned, logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp, kshp=op.kshp,
version=op.version, version=op.version,
direction_hint=op.direction_hint,
verbose=op.verbose, verbose=op.verbose,
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern, nkern=op.nkern,
......
...@@ -594,7 +594,7 @@ def test_gemm_valid(): ...@@ -594,7 +594,7 @@ def test_gemm_valid():
def test_dnn_valid(): def test_dnn_valid():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_valid(GpuDnnConv, mode=theano_mode.including("cudnn")): for t in _test_valid(GpuDnnConvBase, mode=theano_mode.including("cudnn")):
yield t yield t
...@@ -708,7 +708,7 @@ def test_gemm_full(): ...@@ -708,7 +708,7 @@ def test_gemm_full():
def test_dnn_full(): def test_dnn_full():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_full(GpuDnnConv, mode=theano_mode.including("cudnn")): for t in _test_full(GpuDnnConvBase, mode=theano_mode.including("cudnn")):
yield t yield t
...@@ -760,7 +760,7 @@ def test_gemm_subsample(): ...@@ -760,7 +760,7 @@ def test_gemm_subsample():
def test_dnn_subsample(): def test_dnn_subsample():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_subsample(GpuDnnConv, theano_mode.including('cudnn')): for t in _test_subsample(GpuDnnConvBase, theano_mode.including('cudnn')):
yield t yield t
......
...@@ -280,6 +280,7 @@ class ConvOp(OpenMPOp): ...@@ -280,6 +280,7 @@ class ConvOp(OpenMPOp):
kshp_logical_top_aligned=True, kshp_logical_top_aligned=True,
verbose=0, verbose=0,
version=-1, version=-1,
direction_hint='forward',
openmp=None): openmp=None):
""" """
Initializes a ConvOp with given output_mode (full/valid). All other Initializes a ConvOp with given output_mode (full/valid). All other
...@@ -348,6 +349,8 @@ class ConvOp(OpenMPOp): ...@@ -348,6 +349,8 @@ class ConvOp(OpenMPOp):
:type version: int or str :type version: int or str
:param version: passed to GpuConv, if version='no_fft', fft :param version: passed to GpuConv, if version='no_fft', fft
optimization will be deactivated at the op level. optimization will be deactivated at the op level.
:param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
Passed to GpuConv, used by graph optimizers to aid algorithm choice.
The 3 following parameters are used internally when we generate The 3 following parameters are used internally when we generate
the gradient when dx!=1 or dy!=1. the gradient when dx!=1 or dy!=1.
...@@ -423,6 +426,7 @@ class ConvOp(OpenMPOp): ...@@ -423,6 +426,7 @@ class ConvOp(OpenMPOp):
self.dy = dy self.dy = dy
self.verbose = verbose self.verbose = verbose
self.version = version self.version = version
self.direction_hint = direction_hint
# a triple # a triple
if imshp_logical is None: if imshp_logical is None:
...@@ -565,6 +569,7 @@ class ConvOp(OpenMPOp): ...@@ -565,6 +569,7 @@ class ConvOp(OpenMPOp):
def __setstate__(self, d): def __setstate__(self, d):
super(ConvOp, self).__setstate__(d) super(ConvOp, self).__setstate__(d)
self.direction_hint = d.get("direction_hint", None)
self._rehash() self._rehash()
def _rehash(self): def _rehash(self):
...@@ -888,6 +893,7 @@ class ConvOp(OpenMPOp): ...@@ -888,6 +893,7 @@ class ConvOp(OpenMPOp):
kshp_logical=kshp_logical, kshp_logical=kshp_logical,
kshp_logical_top_aligned=kshp_logical_top_aligned, kshp_logical_top_aligned=kshp_logical_top_aligned,
version=self.version, version=self.version,
direction_hint='bprop weights',
verbose=self.verbose) verbose=self.verbose)
else: # let __init__ choose the C params automatically from shapes else: # let __init__ choose the C params automatically from shapes
...@@ -897,6 +903,7 @@ class ConvOp(OpenMPOp): ...@@ -897,6 +903,7 @@ class ConvOp(OpenMPOp):
kshp_logical=kshp_logical, kshp_logical=kshp_logical,
kshp_logical_top_aligned=kshp_logical_top_aligned, kshp_logical_top_aligned=kshp_logical_top_aligned,
version=self.version, version=self.version,
direction_hint='bprop weights',
verbose=self.verbose) verbose=self.verbose)
dw = dw(img, filters) dw = dw(img, filters)
...@@ -929,6 +936,7 @@ class ConvOp(OpenMPOp): ...@@ -929,6 +936,7 @@ class ConvOp(OpenMPOp):
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
kshp_logical=None, kshp_logical=None,
version=-1, # when we change the mode, we don't forward the version. version=-1, # when we change the mode, we don't forward the version.
direction_hint='bprop inputs',
verbose=self.verbose) verbose=self.verbose)
else: # let __init__ figure out the unrolling / patch sizes else: # let __init__ figure out the unrolling / patch sizes
din = ConvOp(imshp, self.kshp, nkern, self.bsize, din = ConvOp(imshp, self.kshp, nkern, self.bsize,
...@@ -938,6 +946,7 @@ class ConvOp(OpenMPOp): ...@@ -938,6 +946,7 @@ class ConvOp(OpenMPOp):
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
kshp_logical=None, kshp_logical=None,
version=-1, # when we change the mode, we don't forward the version. version=-1, # when we change the mode, we don't forward the version.
direction_hint='bprop inputs',
verbose=self.verbose) verbose=self.verbose)
din = din(gz, filters) din = din(gz, filters)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论