提交 4ad36ddc authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3788 from nouiz/carriepl-v4

Rebased cudnn v4
......@@ -219,30 +219,91 @@ AddConfigVar('gpuarray.sync',
BoolParam(False),
in_c_key=True)
def safe_no_dnn_workmem(workmem):
    """Filter for the removed config option ``dnn.conv.workmem``.

    Accepts only an empty/falsy value; any attempt to actually set the
    option raises, pointing the user at the replacement flag.
    """
    if not workmem:
        # Empty value: option unused, nothing to complain about.
        return True
    raise RuntimeError(
        'The option `dnn.conv.workmem` has been removed and should '
        'not be used anymore. Please use the option '
        '`dnn.conv.algo_fwd` instead.')
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
ConfigParam('', allow_override=False, filter=safe_no_dnn_workmem),
in_c_key=False)
def safe_no_dnn_workmem_bwd(workmem):
    """Filter for the removed config option ``dnn.conv.workmem_bwd``.

    Returns True for an unset (falsy) value and raises otherwise,
    directing the user to the two flags that replaced it.
    """
    # Message kept as a single constant so the raise site stays short.
    removed_msg = (
        'The option `dnn.conv.workmem_bwd` has been removed and '
        'should not be used anymore. Please use the options '
        '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
    if workmem:
        raise RuntimeError(removed_msg)
    return True
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
ConfigParam('', allow_override=False,
filter=safe_no_dnn_workmem_bwd),
in_c_key=False)
def safe_no_dnn_algo_bwd(algo):
    """Filter for the removed config option ``dnn.conv.algo_bwd``.

    Only a falsy (unset) value is accepted; setting the option raises
    with a pointer to the per-direction replacement flags.
    """
    if not algo:
        return True
    raise RuntimeError(
        'The option `dnn.conv.algo_bwd` has been removed and '
        'should not be used anymore. Please use the options '
        '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
"dnn.conv.algo_bwd_filter.",
ConfigParam('', allow_override=False,
filter=safe_no_dnn_algo_bwd),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
EnumStr('small', 'none', 'large', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd_data',
"Default implementation to use for CuDNN backward convolution to "
"get the gradients of the convolution with regard to the inputs.",
EnumStr('none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
AddConfigVar('dnn.conv.algo_bwd_filter',
"Default implementation to use for CuDNN backward convolution to "
"get the gradients of the convolution with regard to the "
"filters.",
EnumStr('none', 'deterministic', 'fft', 'small', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in CuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions).",
EnumStr('as_input', 'float16', 'float32', 'float64'),
in_c_key=False)
def default_dnn_path(suffix):
def f(suffix=suffix):
......
......@@ -3,6 +3,15 @@
#include <cudnn.h>
// If needed, define element of the V4 interface in terms of elements of
// previous versions
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#endif
#ifndef CUDNN_VERSION
#include <assert.h>
......
......@@ -92,27 +92,13 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
" from one version, but we link with"
" a different version %s" % str(v))
raise RuntimeError(dnn_available.msg)
if v == -1:
dnn_available.avail = False
dnn_available.msg = (
"CuDNN v1 detected. This version is no longer "
"supported by Theano. Update your CuDNN installation "
"to a more recent version")
raise RuntimeError(dnn_available.msg)
if v == (20, 20):
dnn_available.avail = False
dnn_available.msg = (
"You have installed a release candidate of CuDNN v2."
" This isn't supported anymore."
" Update to CuDNN v2 final version.")
raise RuntimeError(dnn_available.msg)
if 3000 <= v[0] < 3007:
if v == -1 or v[0] < 3007:
# 3007 is the final release of cudnn v3
dnn_available.avail = False
dnn_available.msg = (
"You have installed a release candidate of CuDNN v3."
" This isn't supported anymore."
" Update to CuDNN v3 final version.")
"You have an old release of CuDNN (or a release "
"candidate) that isn't supported. Please update to "
"at least v3 final version.")
raise RuntimeError(dnn_available.msg)
return dnn_available.avail
......@@ -248,7 +234,7 @@ class GpuDnnConvDesc(GpuOp):
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
__props__ = ('border_mode', 'subsample', 'conv_mode', 'precision')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -265,7 +251,8 @@ class GpuDnnConvDesc(GpuOp):
def do_constant_folding(self, node):
return False
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv',
precision="float32"):
if isinstance(border_mode, int):
border_mode = (border_mode,) * len(subsample)
if isinstance(border_mode, tuple):
......@@ -283,6 +270,9 @@ class GpuDnnConvDesc(GpuOp):
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
assert precision in ['float16', 'float32', 'float64']
self.precision = precision
def make_node(self, img_shape, kern_shape):
if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
raise TypeError('img must be 1D shape tensor')
......@@ -321,6 +311,14 @@ class GpuDnnConvDesc(GpuOp):
subsample_str = ", ".join([str(s) for s in self.subsample])
upscale_str = ", ".join(["1"] * nb_dim)
if self.precision == 'float16':
precision = 'CUDNN_DATA_HALF'
elif self.precision == 'float32':
precision = 'CUDNN_DATA_FLOAT'
else:
assert self.precision == 'float64'
precision = 'CUDNN_DATA_DOUBLE'
return """
{
cudnnStatus_t err;
......@@ -346,11 +344,11 @@ class GpuDnnConvDesc(GpuOp):
}
}
err = cudnnSetConvolutionNdDescriptor(
err = cudnnSetConvolutionNdDescriptor_v3(
%(desc)s,
%(nb_dim)d,
pad, subsample, upscale,
%(conv_flag)s
%(conv_flag)s, %(precision)s
);
#else
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: CUDNN_VERSION must be >= 30");
......@@ -364,10 +362,10 @@ class GpuDnnConvDesc(GpuOp):
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
pad_str=pad_str, subsample_str=subsample_str,
upscale_str=upscale_str, nb_dim=nb_dim)
upscale_str=upscale_str, nb_dim=nb_dim, precision=precision)
def c_code_cache_version(self):
return (2, version())
return (3, version())
# scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float32'))
......@@ -401,9 +399,8 @@ class GpuDnnConv(DnnBase, COp):
workmem
*deprecated*, use parameter algo instead.
algo
['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
['none', 'small', 'large', 'fft', 'fft_tiling', 'guess_once',
'guess_on_shape_change', 'time_once', 'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_fwd`.
......@@ -445,9 +442,15 @@ class GpuDnnConv(DnnBase, COp):
raise RuntimeError("CuDNN convolution timing requires CuDNN "
"v3")
assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The fft_tiling implementation is only available from CuDNN V4 onward
if version() < (4000, 4000):
if self.algo == 'fft_tiling':
raise RuntimeError("CuDNN tiled-FFT convolution requires "
"CuDNN v4 or more recent")
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -477,8 +480,15 @@ class GpuDnnConv(DnnBase, COp):
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
elif self.algo == 'direct':
# need v2
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft':
# need v3
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling':
# need v4
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be choosen according
# to a heuristic
......@@ -652,8 +662,8 @@ class GpuDnnConvGradW(DnnBase, COp):
The convolution descriptor.
workmem
*deprecated*, use parameter algo instead.
algo : {'none', 'deterministic', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
algo : {'none', 'deterministic', 'fft', 'small', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`.
"""
......@@ -671,15 +681,16 @@ class GpuDnnConvGradW(DnnBase, COp):
self.algo = workmem
else:
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_filter
self.algo = algo
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
assert self.algo in ['none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -687,7 +698,7 @@ class GpuDnnConvGradW(DnnBase, COp):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_bwd
self.algo = config.dnn.conv.algo_bwd_filter
if not hasattr(self, 'inplace'):
self.inplace = False
......@@ -724,11 +735,15 @@ class GpuDnnConvGradW(DnnBase, COp):
alg = "0"
else:
if self.algo == 'none':
# non-deterministic
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
elif self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
elif self.algo == 'small':
# need v3, non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be chosen according
# to a heuristic
......@@ -788,7 +803,7 @@ class GpuDnnConv3dGradW(GpuDnnConvGradW):
:param workmem:
*deprecated*, use parameter algo instead.
:param algo: ['none', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`.
"""
__props__ = ('algo', 'inplace',)
......@@ -856,11 +871,11 @@ class GpuDnnConvGradI(DnnBase, COp):
workmem
*deprecated*, use parameter algo instead.
algo
['none', 'deterministic', 'fft', 'guess_once',
['none', 'deterministic', 'fft', 'fft_tiling', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
Default is the value of :attr:`config.dnn.conv.algo_bwd_data`.
"""
......@@ -879,15 +894,23 @@ class GpuDnnConvGradI(DnnBase, COp):
self.algo = workmem
else:
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_data
self.algo = algo
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The small-workspace implementation is only available from CuDNN V4
# onward.
if version() < (4000, 4000):
if self.algo == 'fft_tiling':
raise RuntimeError("CuDNN's tiled-FFT convolution requires "
"CuDNN v4 or more recent")
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -895,7 +918,7 @@ class GpuDnnConvGradI(DnnBase, COp):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_bwd
self.algo = config.dnn.conv.algo_bwd_data
if not hasattr(self, 'inplace'):
self.inplace = False
......@@ -936,7 +959,11 @@ class GpuDnnConvGradI(DnnBase, COp):
elif self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft':
# need v3, big workspace
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo == 'fft_tiling':
# need v4, big workspace, but less then fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be chosen according
# to a heuristic
......@@ -998,7 +1025,7 @@ class GpuDnnConv3dGradI(GpuDnnConvGradI):
:param algo: ['none', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
Default is the value of :attr:`config.dnn.conv.algo_bwd_data`.
"""
__props__ = ('algo', 'inplace',)
......@@ -1055,7 +1082,8 @@ class GpuDnnConv3dGradI(GpuDnnConvGradI):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None, algo=None):
conv_mode='conv', direction_hint=None, workmem=None, algo=None,
precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -1090,13 +1118,24 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
removed at any time without a deprecation period. You have been warned.
workmem
*deprecated*, use parameter algo instead.
algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Convolution implementation to use. Some of its values may require certain
versions of CuDNN to be installed. Default is the value of
:attr:`config.dnn.conv.algo_fwd`.
precision : {'as_input', 'float16', 'float32', 'float64'}
Description of the dtype in which the computation of the convolution
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
"""
# Establish dtype in which to perform the computation of the convolution
if precision is None:
precision = theano.config.dnn.conv.precision
if precision == 'as_input':
precision = theano.scalar.upcast(img.dtype, kerns.dtype)
# Check if deprecated param 'workmem' is used
if workmem is not None:
warnings.warn(("dnn_conv: parameter 'workmem' is deprecated. Use "
......@@ -1123,7 +1162,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape)
conv_mode='cross', precision=precision)(img.shape,
out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
......@@ -1139,7 +1179,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape)
conv_mode=conv_mode, precision=precision)(out.shape,
kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
......@@ -1148,7 +1189,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
conv_mode=conv_mode, precision=precision)(img.shape,
kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
......@@ -1159,7 +1201,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv', direction_hint=None, workmem=None,
algo='none'):
algo='none', precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -1186,6 +1228,10 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
:param algo: convolution implementation to use. Only 'none' is implemented
for the conv3d. Default is the value of
:attr:`config.dnn.conv.algo_fwd`.
:param precision : dtype in which the computation of the convolution
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -1194,6 +1240,12 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
"""
# Establish dtype in which to perform the computation of the convolution
if precision is None:
precision = theano.config.dnn.conv.precision
if precision == 'as_input':
precision = theano.scalar.upcast(img.dtype, kerns.dtype)
# Check if deprecated param 'workmem' is used
if workmem is not None:
warnings.warn(("dnn_conv3d: parameter 'workmem' is deprecated. Use "
......@@ -1221,7 +1273,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3, shape4)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross')(img.shape, out.shape)
conv_mode='cross', precision=precision)(img.shape,
out.shape)
conv = GpuDnnConv3dGradW()(img, kerns, out, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3, 4))
......@@ -1231,7 +1284,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
conv_mode=conv_mode, precision=precision)(img.shape,
kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
......
......@@ -15,11 +15,8 @@ int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
#endif
#section init_code_struct
......@@ -55,10 +52,8 @@ APPLY_SPECIFIC(previous_algo_set) = false;
// Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#endif
#section cleanup_code_struct
......
......@@ -81,7 +81,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
......@@ -102,7 +101,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
}
chosen_algo = choosen_algo_perf.algo;
#endif
}
else
{
......@@ -161,24 +159,28 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo = CONV_ALGO;
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// The tiled-FFT implementation (only in V4 onward) does not support
// strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't.
// Following code is 2d-specific, but it is fine as ftt is defined only for
// 2d-filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
// Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// defined only for 2d-filters
if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -197,36 +199,23 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
// chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
#endif
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 3000
// In versions before V3, CuDNN did not support kernels larger than the
// inputs in any spatial dimension, even if padding was used such that the
// padded inputs were larger than the kernels. If the kernels are larger
// then the inputs, raise an error message.
bool shape_mismatch = false;
for (int i=2; i < nb_dim; i++){
shape_mismatch = shape_mismatch || (CudaNdarray_HOST_DIMS(kerns)[i] >
CudaNdarray_HOST_DIMS(input)[i]);
}
if (shape_mismatch){
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: the current version of CuDNN does not support "
"kernels larger than the inputs in any spatial dimension, "
"even if the inputs are padded such that the padded inputs "
"are larger than the kernels. Update your installation of "
"CuDNN to V3 or more recent to solve the issue.");
return 1;
}
#endif
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
......
......@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{
size_t worksize;
void *workspace;
......@@ -159,21 +158,28 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
chosen_algo = CONV_ALGO;
}
// The FFT implementation (only in v3 and onward) does not support strides,
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// The tiled-FFT implementation (only in V4 onward) does not support
// strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4)
// Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// defined only for 2d-filters
if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -192,10 +198,21 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
}
......@@ -231,16 +248,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
}
#else
err = cudnnConvolutionBackwardData(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
......
......@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{
size_t worksize;
void *workspace;
......@@ -168,12 +167,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -192,7 +193,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
......@@ -232,16 +233,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
}
#else
err = cudnnConvolutionBackwardFilter(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
(void *)&beta,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
......
......@@ -29,7 +29,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale,
CONV_MODE);
err = cudnnSetConvolutionNdDescriptor_v3(*desc, NB_DIMS, pad, strides,
upscale, CONV_MODE, PRECISION);
return 0;
}
......@@ -13,99 +13,12 @@ static inline int cudnnGetVersion() {
#include <assert.h>
#if CUDNN_VERSION < 3000
// Here we define the R3 API in terms of functions in the R2 interface
// This is only for what we use
// If needed, define element of the V4 interface in terms of elements of
// previous versions
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
typedef int cudnnConvolutionBwdDataAlgo_t;
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2
static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnnHandle_t handle,
const cudnnFilterDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t gradDesc,
cudnnConvolutionBwdDataAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static cudnnStatus_t cudnnConvolutionBackwardData_v3(
cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdDataAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
return cudnnConvolutionBackwardData(
handle,
alpha,
filterDesc,
filterData,
diffDesc,
diffData,
convDesc,
beta,
gradDesc,
gradData);
}
typedef int cudnnConvolutionBwdFilterAlgo_t;
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnFilterDescriptor_t gradDesc,
cudnnConvolutionBwdDataAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdFilterAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
return cudnnConvolutionBackwardFilter(
handle,
alpha,
srcDesc,
srcData,
diffDesc,
diffData,
convDesc,
beta,
gradDesc,
gradData);
}
#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#endif
......
......@@ -75,15 +75,11 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
def _dnn_check_version():
v = version()
if v < 2000:
if v < 3007:
return False, (
"You have an old release of CuDNN (or a release candidate) "
"that isn't supported. Please update to at least v2 final "
"that isn't supported. Please update to at least v3 final "
"version.")
if 3000 <= v < 3007:
return False, (
"You have installed a release candidate of CuDNN v3. This "
"isn't supported. Please update to v3 final version.")
return True, None
......@@ -241,7 +237,7 @@ class GpuDnnConvDesc(COp):
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
__props__ = ('border_mode', 'subsample', 'conv_mode', 'precision')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -258,7 +254,8 @@ class GpuDnnConvDesc(COp):
def do_constant_folding(self, node):
return False
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv',
precision="float32"):
COp.__init__(self, ["conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
if isinstance(border_mode, int):
......@@ -278,6 +275,9 @@ class GpuDnnConvDesc(COp):
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
assert precision in ['float16', 'float32', 'float64']
self.precision = precision
def make_node(self, kern_shape):
if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
raise TypeError('kern must be 1D shape tensor')
......@@ -315,11 +315,20 @@ class GpuDnnConvDesc(COp):
else:
sub2 = '0'
if self.precision == 'float16':
precision = 'CUDNN_DATA_HALF'
elif self.precision == 'float32':
precision = 'CUDNN_DATA_FLOAT'
else:
assert self.precision == 'float64'
precision = 'CUDNN_DATA_DOUBLE'
return [('NB_DIMS', str(len(self.subsample))),
('BORDER_MODE', bmode),
('PAD_0', pad0), ('PAD_1', pad1), ('PAD_2', pad2),
('CONV_MODE', conv_flag),
('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2)]
('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2),
('PRECISION', precision)]
def c_code_cache_version(self):
    # Combine the parent class's cache version with the runtime cuDNN
    # version so the compiled C code is regenerated whenever the
    # installed cuDNN library changes.
    return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
......@@ -353,7 +362,8 @@ class GpuDnnConv(DnnBase):
kernel
descr
The convolution descriptor.
algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'guess_once',
        'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Default is the value of :attr:`config.dnn.conv.algo_fwd`.
"""
......@@ -382,9 +392,15 @@ class GpuDnnConv(DnnBase):
elif self.algo in ['time_once', 'time_on_shape_change']:
raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The fft_tiling implementation is only available from CuDNN V4 onward
if version() < 4000:
if self.algo == 'fft_tiling':
raise RuntimeError("CuDNN tiled-FFT convolution requires "
"CuDNN v4 or more recent")
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -408,8 +424,13 @@ class GpuDnnConv(DnnBase):
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
elif self.algo == 'direct':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling':
# need v4
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
defs.append(('CONV_ALGO', alg))
if self.algo in ['guess_once', 'guess_on_shape_change',
......@@ -439,9 +460,10 @@ class GpuDnnConv(DnnBase):
raise TypeError("The number of dimensions of "
"img, kern and output must match")
if img.type.ndim == 5 and self.algo == 'fft':
raise ValueError("convolution algo fft can't be used for "
"3d convolutions")
if (img.type.ndim == 5 and
self.algo in ['small', 'large', 'fft', 'fft_tiling']):
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
......@@ -479,6 +501,14 @@ class GpuDnnConv(DnnBase):
or scalar.
"""
# if ishape and/or kshape are not tuples or list, but rather symbolic
# vectors, turn them into lists of symbolic scalars.
if not isinstance(ishape, (list, tuple)):
ishape = [ishape[i] for i in range(len(subsample) + 2)]
if not isinstance(kshape, (list, tuple)):
kshape = [kshape[i] for i in range(len(subsample) + 2)]
return get_conv_output_shape(
ishape,
kshape,
......@@ -511,18 +541,19 @@ class GpuDnnConvGradW(DnnBase):
if self.inplace:
self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_filter
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
assert self.algo in ['none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
    """Restore pickled state, filling in defaults for attributes that
    did not exist when older pickles were created.

    Parameters
    ----------
    d : dict
        The unpickled instance ``__dict__``.
    """
    self.__dict__.update(d)
    if not hasattr(self, 'inplace'):
        self.inplace = False
    if not hasattr(self, 'algo'):
        # The scrape contained a leftover assignment to the removed
        # `config.dnn.conv.algo_bwd` option; only the new-side option
        # (`algo_bwd_filter`) is kept.
        self.algo = config.dnn.conv.algo_bwd_filter
def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp
......@@ -557,7 +588,9 @@ class GpuDnnConvGradW(DnnBase):
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo == 'small':
# non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
......@@ -587,7 +620,8 @@ class GpuDnnConvGradW(DnnBase):
raise TypeError("The number of dimensions of "
"img, topgrad and output must match")
if img.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
if (img.type.ndim == 5 and
self.algo in ['fft', 'deterministic', 'small']):
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
......@@ -627,16 +661,23 @@ class GpuDnnConvGradI(DnnBase):
if self.inplace:
self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
algo = config.dnn.conv.algo_bwd_data
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
# The small-workspace implementation is only available from CuDNN V4
# onward.
if version() < 4000 and self.algo == 'fft_tiling':
raise RuntimeError("CuDNN's tiled-FFT convolution requires CuDNN "
"v4 or more recent")
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
    """Restore pickled state, filling in defaults for attributes that
    did not exist when older pickles were created.

    Parameters
    ----------
    d : dict
        The unpickled instance ``__dict__``.
    """
    self.__dict__.update(d)
    if not hasattr(self, 'algo'):
        # The scrape contained a leftover assignment to the removed
        # `config.dnn.conv.algo_bwd` option; only the new-side option
        # (`algo_bwd_data`) is kept.
        self.algo = config.dnn.conv.algo_bwd_data
    if not hasattr(self, 'inplace'):
        self.inplace = False
......@@ -673,6 +714,9 @@ class GpuDnnConvGradI(DnnBase):
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
if self.algo == 'fft_tiling':
# big workspace but less than fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
......@@ -703,7 +747,8 @@ class GpuDnnConvGradI(DnnBase):
raise TypeError("The number of dimensions of "
"kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
if (kern.type.ndim == 5 and
self.algo in ['fft', 'deterministic', 'fft_tiling']):
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
......@@ -723,7 +768,7 @@ class GpuDnnConvGradI(DnnBase):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None,
algo=None):
algo=None, precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -757,12 +802,24 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
Convolution implementation to use. Some of its values may
require certain versions of CuDNN to be installed. Default is
the value of :attr:`config.dnn.conv.algo_fwd`.
precision : {'as_input', 'float16', 'float32', 'float64'}
Description of the dtype in which the computation of the convolution
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
.. warning:: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higer. This means that older GPUs will not
work with this Op.
"""
# Establish dtype in which to perform the computation of the convolution
if precision is None:
precision = theano.config.dnn.conv.precision
if precision == 'as_input':
precision = theano.scalar.upcast(img.dtype, kerns.dtype)
if workmem is not None:
if algo is not None:
raise ValueError("You can't use both algo and workmem")
......@@ -786,7 +843,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(out.shape)
conv_mode='cross', precision=precision)(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
......@@ -804,7 +861,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(kerns.shape)
conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
......@@ -813,7 +870,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape)
conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
......
......@@ -136,15 +136,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
......@@ -153,30 +164,24 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
#endif
#if CUDNN_VERSION < 3000
/* cuDNN before v3 does not support kernels larger than input even
* if appropriate padding is selected. */
for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
PyErr_SetString(PyExc_RuntimeError, "the current version "
"of CuDNN does not support kernels larger than the "
"inputs in any spatial dimension, even if the inputs "
"are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue.");
cuda_exit(c->ctx);
return 1;
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
#endif
{
size_t worksize;
......
......@@ -128,15 +128,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
......@@ -145,13 +156,24 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
}
#endif
size_t worksize;
gpudata *workspace;
......
......@@ -130,15 +130,24 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
......@@ -153,7 +162,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论