提交 1477ded8 · 作者: f0k

GpuConv now adopts nkern and bsize from ConvOp, conv_gemm and conv_fft_*…

GpuConv now adopts nkern and bsize from ConvOp, conv_gemm and conv_fft_* optimizers use it if available
上级 fe7d8bab
...@@ -972,6 +972,8 @@ class GpuConv(GpuOp): ...@@ -972,6 +972,8 @@ class GpuConv(GpuOp):
kshp=None, kshp=None,
imshp=None, imshp=None,
max_threads_dim0=None, max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True): fft_opt=True):
""" """
:param version: each version of c_code implements many kernel for the :param version: each version of c_code implements many kernel for the
...@@ -991,7 +993,15 @@ class GpuConv(GpuOp): ...@@ -991,7 +993,15 @@ class GpuConv(GpuOp):
:param max_threads_dim0: The maximum number of threads for the :param max_threads_dim0: The maximum number of threads for the
block size dimensions 0 (blockDim.x) used by the block size dimensions 0 (blockDim.x) used by the
GPU function. GPU function.
:param fft_opt: desactivate fft_opt optimization at the op level when :param nkern: The number of kernels. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param bsize: The batch size. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param fft_opt: deactivate fft_opt optimization at the op level when
set to False. Note that by default fft optimization set to False. Note that by default fft optimization
aren't enabled. See aren't enabled. See
:ref:`convolution documentation <libdoc_tensor_nnet_conv>` :ref:`convolution documentation <libdoc_tensor_nnet_conv>`
...@@ -1019,6 +1029,8 @@ class GpuConv(GpuOp): ...@@ -1019,6 +1029,8 @@ class GpuConv(GpuOp):
self.kshp = kshp self.kshp = kshp
self.imshp = imshp self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0 self.max_threads_dim0 = max_threads_dim0
self.nkern = nkern
self.bsize = bsize
self.fft_opt = fft_opt self.fft_opt = fft_opt
def __eq__(self, other): def __eq__(self, other):
......
...@@ -1122,6 +1122,8 @@ def local_gpu_conv(node): ...@@ -1122,6 +1122,8 @@ def local_gpu_conv(node):
version=op.version, version=op.version,
verbose=op.verbose, verbose=op.verbose,
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt fft_opt=op.fft_opt
) )
if op.imshp_logical is not None: if op.imshp_logical is not None:
...@@ -1207,14 +1209,19 @@ def _gpu_conv_to_fftconv(node): ...@@ -1207,14 +1209,19 @@ def _gpu_conv_to_fftconv(node):
node.op.imshp[-1] is not None and node.op.imshp[-1] is not None and
node.op.imshp[-1] % 2 == 1): node.op.imshp[-1] % 2 == 1):
kwargs['pad_last_dim'] = True kwargs['pad_last_dim'] = True
# TODO: If the user supplied the full nonsymbolic image_shape and # If the user supplied the full nonsymbolic image_shape and
# filter_shape in conv2d(), we could pass it on to conv2d_fft(). However, # filter_shape in conv2d(), we can pass it on to conv2d_fft().
# information on batch size and channel counts is currently discarded if ((node.op.imshp is not None) and
# when a ConvOp is replaced by a GpuConv, so this would need more changes. (len(node.op.imshp) == 3) and
#if (node.op.imshp is not None) and (None not in node.op.imshp): (None not in node.op.imshp) and
# kwargs['image_shape'] = (bsize, inchannels) + node.op.imshp (node.op.bsize is not None)):
#if (node.op.kshp is not None) and (None not in node.op.kshp): kwargs['image_shape'] = (node.op.bsize,) + node.op.imshp
# kwargs['filter_shape'] = (outchannels, inchannels) + node.op.kshp if ((node.op.kshp is not None) and
(None not in node.op.kshp) and
(node.op.nkern is not None) and
(len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)):
kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + node.op.kshp
return conv2d_fft(node.inputs[0], node.inputs[1], **kwargs) return conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
...@@ -1363,25 +1370,34 @@ def local_conv_gemm(node): ...@@ -1363,25 +1370,34 @@ def local_conv_gemm(node):
# need to flip the kernel for valid convolution # need to flip the kernel for valid convolution
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM or GpuCorrMM_gradWeights # call GpuCorrMM or GpuCorrMM_gradWeights
# (GpuCorrMM seems faster if batchsize * kernelHeight * kernelWidth # (the latter is faster if batchsize * kernelHeight * kernelWidth
# is smaller than inputChannels * outputHeight * outputWidth. # is larger than inputChannels * outputHeight * outputWidth.
# GpuConv does not store information on the batchsize and not always # GpuConv does not always store information on the batchsize and
# on the channels, so we only use what information we have.) # channels, though, so we only use what information we have.)
if ((subsample == (1,1)) and if ((subsample == (1,1)) and
(node.op.imshp is not None) and (node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and (None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and (node.op.kshp is not None) and
(None not in node.op.kshp) and (None not in node.op.kshp)):
(node.op.kshp[0] * node.op.kshp[1] > # we know the kernel and output size
(node.op.imshp[-2] - node.op.kshp[0] + 1) * prod1 = node.op.kshp[0] * node.op.kshp[1]
(node.op.imshp[-1] - node.op.kshp[1] + 1))): prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
return [gpu_contiguous(GpuCorrMM_gradWeights('valid', subsample, pad)( (node.op.imshp[-1] - node.op.kshp[1] + 1))
if ((node.op.bsize is not None) and
(len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)):
# we also know batchsize and input channels
prod1 *= node.op.bsize
prod2 *= node.op.imshp[0]
# compare to decide
if prod1 > prod2:
return [gpu_contiguous(GpuCorrMM_gradWeights('valid', subsample, pad)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)), gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)) gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))] ).dimshuffle(1, 0, 2, 3))]
else: # use GpuCorrMM if we did not choose GpuCorrMM_gradWeights above
return [GpuCorrMM('valid', subsample, pad)( return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))] gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'): elif (border_mode == 'full'):
# need to dimshuffle the kernel for full convolution # need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(1, 0, 2, 3)
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论