Commit 1b6e6389 authored by notoraptor

Cancel some changes into CEnumType.

Cancel changes into dnn_fwd.c Heavy simplification of check_dnn. Make check_dnn runnable as a python script.
Parent 6cc0c5ca
......@@ -909,11 +909,7 @@ class EnumType(Type, dict):
.. note::
:class:`EnumType` is not complete and should never be used for regular graph operations.
:class:`EnumList` is not complete and should never be used for regular graph operations.
**:class:`CEnumType` is complete.**
This Type (and subclasses) is not complete and should never be used for regular graph operations.
"""
......@@ -1053,9 +1049,6 @@ class EnumType(Type, dict):
#ifndef PyInt_AsLong
#define PyInt_AsLong PyLong_AsLong
#endif
#ifndef PyInt_FromLong
#define PyInt_FromLong PyLong_FromLong
#endif
#endif
"""
......@@ -1248,22 +1241,5 @@ class CEnumType(EnumList):
""" % dict(i=i, name=name, constant_cname=swapped_dict[i]) for i in sorted(swapped_dict.keys())),
fail=sub['fail'])
def c_sync(self, name, sub):
return """
int py_value = -1;
Py_XDECREF(py_%(name)s);
/* We assume that ctype is an integer type usable in a switch. */
switch (%(name)s) {
%(cases)s
default:
PyErr_SetString(PyExc_ValueError, "CEnumType: cannot map C value to Python constant.");
{%(fail)s}
break;
}
py_%(name)s = PyInt_FromLong(py_value);
""" % dict(name=name, fail=sub['fail'], cases=''.join("""
case %(constant_cname)s: py_value = %(constant_pyvalue)d; break;
""" % dict(constant_cname=k, constant_pyvalue=v) for k, v in sorted(self.items(), key=lambda t: t[1])))
def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version())
......@@ -19,8 +19,27 @@ from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# Symbolic names for the floating-point data types handled by cuDNN.
HALF, FLOAT, DOUBLE = ('float16', 'float32', 'float64')

# (data type, computation precision) pairs, named as in the cuDNN documentation.
TRUE_HALF_CONFIG = (HALF, HALF)
PSEUDO_HALF_CONFIG = (HALF, FLOAT)
FLOAT_CONFIG = (FLOAT, FLOAT)
DOUBLE_CONFIG = (DOUBLE, DOUBLE)


def is_true_half_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's TRUE_HALF_CONFIG."""
    return (dtype, precision) == TRUE_HALF_CONFIG


def is_pseudo_half_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's PSEUDO_HALF_CONFIG."""
    return (dtype, precision) == PSEUDO_HALF_CONFIG


def is_float_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's FLOAT_CONFIG."""
    return (dtype, precision) == FLOAT_CONFIG


def is_double_config(dtype, precision):
    """Return True iff (dtype, precision) is cuDNN's DOUBLE_CONFIG."""
    return (dtype, precision) == DOUBLE_CONFIG
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
......@@ -103,22 +122,97 @@ class CuDNNV51(object):
# empty list of enum to don't crash with cudnn 5
cudnnReduceTensorOp_t = CEnumType()
def supported_precisions(self, dtype):
def get_supported_dtype_configs(self):
"""
Return the tuple of precisions supported by cuDNN for given input data type.
Return the tuple of data type configurations supported by this version of cuDNN.
This is currently convenient for both cuDNN V5.1 and V6, as Theano does not
yet support new data types (like INT8, INT8x4, etc.).
"""
assert dtype in (HALF, FLOAT, DOUBLE)
if dtype == HALF:
# TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG
return (HALF, FLOAT)
if dtype == FLOAT:
# FLOAT_CONFIG
return (FLOAT,)
if dtype == DOUBLE:
# DOUBLE_CONFIG
return (DOUBLE,)
return (TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
def get_fwd_dtype_configs(self, check_runtime=None):
    """Return the dtype configurations usable for FWD convolution.

    TRUE_HALF_CONFIG "is only supported on architectures with true fp16
    support (compute capability 5.3 and 6.0)", which only a runtime check
    can detect.

    :param check_runtime: optional callable taking (dtype, precision) and
        returning True if that configuration actually runs on this GPU.
        When None, TRUE_HALF_CONFIG is optimistically kept.
    """
    if check_runtime is not None and not check_runtime(*TRUE_HALF_CONFIG):
        # fp16 hardware check failed: drop TRUE_HALF_CONFIG.
        return (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
    return self.get_supported_dtype_configs()
def get_bwd_filter_dtype_configs(self, check_runtime=None):
    """Return dtype configurations supported for BWD-filter (gradient of weights) convolution.

    :param check_runtime: unused; accepted for API symmetry with
        get_fwd_dtype_configs (no runtime-only restriction is applied here).
    """
    return self.get_supported_dtype_configs()
def get_bwd_data_dtype_configs(self, check_runtime=None):
    """Return dtype configurations supported for BWD-data (gradient of inputs) convolution.

    :param check_runtime: unused; accepted for API symmetry with
        get_fwd_dtype_configs (no runtime-only restriction is applied here).
    """
    return self.get_supported_dtype_configs()
def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Tell whether a cuDNN FWD convolution algorithm supports a data type configuration.

    Encodes the "Data Type Config Support" entries of the cuDNN documentation
    for cudnnConvolutionForward, for this cuDNN version.

    :param algo: algorithm alias or enum constant (resolved through fromalias).
    :param dtype: data type of inputs/outputs ('float16', 'float32' or 'float64').
    :param precision: computation precision.
    :param ndim: number of convolved dimensions (2 or 3).
    :return: True if the (dtype, precision) configuration is supported.
    """
    algorithms = self.cudnnConvolutionFwdAlgo_t
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
        # Everything except TRUE_HALF_CONFIG.
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
        # 2D: all configs; 3D: everything except TRUE_HALF_CONFIG.
        return ndim == 2 or not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
        # 2D only, TRUE_HALF_CONFIG excluded.
        return ndim == 2 and not is_true_half_config(dtype, precision)
    # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
        if ndim == 2:
            return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
        # NB: For cuDNN V6:
        # " Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
        # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    # Unknown or unimplemented algorithm: treated as unsupported.
    return False
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Tell whether a cuDNN BWD-filter (gradient of weights) algorithm supports a data type configuration.

    Encodes the "Data Type Config Support" entries of the cuDNN documentation
    for cudnnConvolutionBackwardFilter, for this cuDNN version.

    :param algo: algorithm alias or enum constant (resolved through fromalias).
    :param dtype: data type of inputs/outputs ('float16', 'float32' or 'float64').
    :param precision: computation precision.
    :param ndim: number of convolved dimensions (2 or 3).
    :return: True if the (dtype, precision) configuration is supported.
    """
    algorithms = self.cudnnConvolutionBwdFilterAlgo_t
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
        # Everything except TRUE_HALF_CONFIG.
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
        # 2D only; all data type configurations accepted.
        return ndim == 2
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    # Unknown or unimplemented algorithm: treated as unsupported.
    return False
def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Tell whether a cuDNN BWD-data (gradient of inputs) algorithm supports a data type configuration.

    Encodes the "Data Type Config Support" entries of the cuDNN documentation
    for cudnnConvolutionBackwardData, for this cuDNN version.

    :param algo: algorithm alias or enum constant (resolved through fromalias).
    :param dtype: data type of inputs/outputs ('float16', 'float32' or 'float64').
    :param precision: computation precision.
    :param ndim: number of convolved dimensions (2 or 3).
    :return: True if the (dtype, precision) configuration is supported.
    """
    algorithms = self.cudnnConvolutionBwdDataAlgo_t
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
        # Everything except TRUE_HALF_CONFIG.
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
        # All data type configs supported (per cuDNN doc). Previously this
        # case only had a comment and fell through to `return False`,
        # contradicting the comment.
        return True
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
        if ndim == 2:
            return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
        # NB: For cuDNN V6: "(DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
        # Bug fix: parenthesize the `or`. `and` binds tighter than `or`, so
        # the previous unparenthesized form accepted FLOAT_CONFIG for any
        # ndim (e.g. 3D), contradicting the cuDNN doc (WINOGRAD is 2D-only)
        # and the parallel FWD_ALGO_WINOGRAD case.
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    # Unknown or unimplemented algorithm: treated as unsupported.
    return False
class CuDNNV6(CuDNNV51):
......@@ -162,6 +256,17 @@ class CuDNNV6(CuDNNV51):
('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
ctype='cudnnReduceTensorOp_t')
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """Extend the v5.1 BWD-filter support table with cuDNN v6 additions.

    cuDNN v6 adds FFT_TILING support for the backward-filter pass
    (2D, PSEUDO_HALF / FLOAT / DOUBLE configurations).
    """
    parent_ok = super(CuDNNV6, self).bwd_filter_algo_supports_dtype_config(
        algo, dtype, precision, ndim)
    if parent_ok:
        return parent_ok
    enum = self.cudnnConvolutionBwdFilterAlgo_t
    if enum.fromalias(algo) == enum.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision) or
                              is_double_config(dtype, precision))
    return parent_ok
class CuDNNV7(CuDNNV6):
version = 7
......
#!/usr/bin/env python
# You can pass nosetests args when running this script. Examples:
# python theano/gpuarray/tests/check_dnn.py # Normal mode.
# python theano/gpuarray/tests/check_dnn.py -xvs # Verbose mode, capture output, exit at first error.
from __future__ import absolute_import, print_function, division
from itertools import ifilter, product
from itertools import ifilter, product, chain
import nose
import numpy as np
from nose.plugins.skip import SkipTest
import theano
import theano.tests.unittest_tools as utt
from theano.compile.ops import shape_i_op
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from theano.gof import COp, Apply, ParamsType
from theano.gof.type import CDataType
from theano.gpuarray import cudnn_defs
from theano.gpuarray.basic_ops import infer_context_name, as_gpuarray_variable, gpu_contiguous, GpuAllocEmpty
from theano.gpuarray.dnn import (GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision,
DnnBase, handle_type, DNN_CONV_ALGO_CHOOSE_ONCE, DNN_CONV_ALGO_CHOOSE_TIME)
from theano.gpuarray.tests.check_dnn_doc import check_fwd_algorithm
from theano.gpuarray.dnn import GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision
from theano.gpuarray.tests.config import mode_with_gpu, ref_cast
from theano.scalar import bool as bool_t
from theano.tensor.nnet.abstract_conv import get_conv_output_shape, assert_conv_shape
from theano.tensor.opt import Assert
cudnn = cudnn_defs.get_definitions(version(raises=False))
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
cudnnConvolutionBwdFilterAlgo_t = cudnn.cudnnConvolutionBwdFilterAlgo_t
cudnnConvolutionBwdDataAlgo_t = cudnn.cudnnConvolutionBwdDataAlgo_t
AVAILABLE_PRECISIONS = cudnn.supported_precisions(theano.config.floatX)
class DnnCaseGenerator:
"""
......@@ -81,11 +76,11 @@ class DnnCaseGenerator:
@staticmethod
def get_if_valid_conv_output_shape(case_tuple):
out_shp = get_conv_output_shape(case_tuple[0][0], # input shape
case_tuple[0][1], # filter shape
case_tuple[1], # border mode
case_tuple[0][2], # subsample
case_tuple[0][3] # dilation
out_shp = get_conv_output_shape(case_tuple[0], # input shape
case_tuple[1], # filter shape
case_tuple[4], # border mode
case_tuple[2], # subsample
case_tuple[3] # dilation
)
try:
return assert_conv_shape(out_shp)
......@@ -94,7 +89,7 @@ class DnnCaseGenerator:
def get_cases(self):
# Generate an iterator of tuples with format:
# ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta )
# (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
all_batch_sizes = (self.batch_size,)
all_input_channels = (self.input_channels,)
all_input_sizes = self._shapes(self.input_size)
......@@ -114,7 +109,7 @@ class DnnCaseGenerator:
all_filter_shapes = ((oc, ic) + fis
for oc in all_output_channels for ic in all_input_channels for fis in all_filter_sizes)
return ifilter(DnnCaseGenerator.get_if_valid_conv_output_shape,
product(product(all_input_shapes, all_filter_shapes, all_subsamples, all_dilations),
product(all_input_shapes, all_filter_shapes, all_subsamples, all_dilations,
all_border_modes, all_conv_modes, all_alphas, all_betas))
......@@ -142,14 +137,14 @@ def dnn_conv(img, kerns, alpha=1, beta=0, out=None, border_mode='valid', subsamp
desc_op.subsample,
filter_dilation=dilation)
out_shp = assert_conv_shape(out_shp)
if beta != 0:
if beta == 0:
real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
else:
assert out is not None
out = as_gpuarray_variable(out, ctx_name)
out = gpu_contiguous(out)
check = Assert('GpuDnnConv: qiven output (for beta not null) does not have expected shape')
real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, out_shp)))
else:
real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, real_out, desc, alpha, beta)
......@@ -207,78 +202,28 @@ def dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mod
return GpuDnnConvGradI(algo=algo)(kerns, topgrad, real_out, desc, alpha, beta)
class BaseGpuDnnConvChooseAlgo(DnnBase):
"""
This class and its subclasses allow to retrieve a cuDNN algorithm
at runtime without any computation, given the user choose option
(time_once, time_on_shape_change, guess_once or guess_on_shape_change).
To help reduce whole test time, I suggest we use these classes when
algo is one of choose options, as any chosen algorithm would have
been tested by the other exhaustive tests.
"""
_f16_ok = True
check_input = False
__props__ = ('choice',)
params_type = ParamsType(choose_once=bool_t, choose_time=bool_t, handle=handle_type)
# Abstract attributes.
func_file = None
func_name = None
def __init__(self, choice):
COp.__init__(self, ["../dnn_base.c", "../dnn_conv_base.c", self.func_file], self.func_name)
assert choice in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choice = choice
self.choose_once = self.choice in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.choice in DNN_CONV_ALGO_CHOOSE_TIME
def dnn_context(self, node):
return node.inputs[0].type.context_name
def _prepare_inputs(self, i1, name_i1, i2, name_i2, output, desc):
ctx_name = infer_context_name(i1, i2, output)
i1 = as_gpuarray_variable(i1, ctx_name)
i2 = as_gpuarray_variable(i2, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if i1.type.ndim not in (4, 5):
raise TypeError('%s must be 4D or 5D tensor' % name_i1)
if i2.type.ndim not in (4, 5):
raise TypeError('%s must be 4D or 5D tensor' % name_i2)
if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D or 5D tensor')
if i1.type.ndim != i2.type.ndim or i1.type.ndim != output.type.ndim:
raise TypeError("The number of dimensions of %s, %s and output must match" % (name_i1, name_i2))
if not isinstance(desc.type, CDataType) or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
return (i1, i2, output, desc)
class GpuDnnConvChooseFwdAlgo(BaseGpuDnnConvChooseAlgo):
func_file = 'dnn_choose_fwd.c'
func_name = 'APPLY_SPECIFIC(choose_fwd_algo)'
def check_fwd_dtype_config_support(dtype, precision):
    """Return True if a FWD convolution runs with this (dtype, precision) on the current GPU.

    Compiles and executes a tiny 2D cuDNN convolution. A RuntimeError
    mentioning CUDNN_STATUS_ARCH_MISMATCH means the configuration
    (typically TRUE_HALF_CONFIG) is not supported by the hardware.
    """
    inputs_shape = (1, 1, 3, 3)
    filters_shape = (1, 1, 2, 2)
    inputs = np.zeros(inputs_shape, dtype=dtype)
    filters = np.zeros(filters_shape, dtype=dtype)
    inputs = theano.shared(inputs)
    filters = theano.shared(filters)
    conv = dnn_conv(inputs, filters, precision=precision)
    f = theano.function([], conv, mode=mode_with_gpu)
    try:
        f()
    except RuntimeError as e:
        # Bug fix: `e.message` exists only on Python 2; str(e) works on
        # both Python 2 and 3 (this file uses __future__ imports for
        # py2/3 compatibility).
        assert 'CUDNN_STATUS_ARCH_MISMATCH' in str(e)
        return False
    return True
def make_node(self, img, kern, output, desc):
img, kern, output, desc = self._prepare_inputs(img, 'img', kern, 'kern', output, desc)
return Apply(self, [img, kern, output, desc], [cudnn.cudnnConvolutionFwdAlgo_t()])
class GpuDnnConvChooseBwdFilterAlgo(BaseGpuDnnConvChooseAlgo):
func_file = 'dnn_choose_gw.c'
func_name = 'APPLY_SPECIFIC(choose_bwd_filter_algo)'
def make_node(self, img, topgrad, output, desc):
img, topgrad, output, desc = self._prepare_inputs(img, 'img', topgrad, 'topgrad', output, desc)
return Apply(self, [img, topgrad, output, desc], [cudnn.cudnnConvolutionBwdFilterAlgo_t()])
class GpuDnnConvChooseBwdDataAlgo(BaseGpuDnnConvChooseAlgo):
func_file = 'dnn_choose_gi.c'
func_name = 'APPLY_SPECIFIC(choose_bwd_data_algo)'
def make_node(self, kern, topgrad, output, desc):
kern, topgrad, output, desc = self._prepare_inputs(kern, 'kern', topgrad, 'topgrad', output, desc)
return Apply(self, [kern, topgrad, output, desc], [cudnn.cudnnConvolutionBwdDataAlgo_t()])
def test_fwd_true_half_config_support():
    """Skip-guard test: fail fast (as a skip) when the GPU lacks true fp16 FWD support."""
    # For cuDNN V5.1 and V6.0:
    # "TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0)"
    if not check_fwd_dtype_config_support('float16', 'float16'):
        raise SkipTest('FWD: TRUE_HALF_CONFIG not supported on this GPU.')
class BaseTestDnnConv(object):
......@@ -287,10 +232,6 @@ class BaseTestDnnConv(object):
to run actual tests.
"""
_functions_checked_for_fwd = False
_functions_checked_for_gradinput = False
_functions_checked_for_gradweight = False
# Abstract attributes.
ndim = 2
......@@ -303,25 +244,25 @@ class BaseTestDnnConv(object):
cpu_gradinput_class = None
cpu_gradweight_class = None
# Utility methods.
def get_cases(self):
# Return an iterable of test cases. Each test case is a tuple (or list) with following syntax:
# ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta )
# (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
generator = DnnCaseGenerator(ndim=self.ndim)
return generator.get_cases()
# Run and utility methods.
def array_like_conv_output(self, inputs_shape, filters_shape, border_mode, subsample, dilation):
def array_like_conv_output(self, inputs_shape, filters_shape, border_mode, subsample, dilation, dtype):
# Return an random array with inferred convolution output shape.
out_shp = get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation)
out_shp = assert_conv_shape(out_shp)
return np.random.random(out_shp).astype(theano.config.floatX)
return np.random.random(out_shp).astype(dtype)
def run_conv_fwd(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
def run_conv_fwd(self, algo, dtype, precision, parameters):
inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
inputs_val = np.random.random(inputs_shape).astype(dtype)
filters_val = np.random.random(filters_shape).astype(dtype)
# Scale down the input values to prevent very large absolute errors
# due to float rounding
......@@ -331,8 +272,11 @@ class BaseTestDnnConv(object):
inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val)
out = None if beta == 0 else self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample,
dilation)
if beta == 0:
out = None
else:
out = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
out /= 10
# Compile a theano function for the cuDNN implementation
conv = dnn_conv(img=inputs, kerns=filters, alpha=alpha, beta=beta, out=out, border_mode=border_mode,
subsample=subsample, dilation=dilation, conv_mode=conv_mode, algo=algo, precision=precision)
......@@ -354,15 +298,6 @@ class BaseTestDnnConv(object):
filter_dilation=dilation)(ref_cast(inputs), flipped_filters)
f_ref = theano.function([], conv_ref, mode="FAST_RUN")
if not self._functions_checked_for_fwd:
self._functions_checked_for_fwd = True
assert any(isinstance(node.op, GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConvGradI, GpuDnnConvGradW))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
......@@ -371,19 +306,26 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2)
# Raise tolerance for float16
rtol = 6e-2 if theano.config.floatX == 'float16' else None
rtol = 6e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
# print('(conv: beta not null) ', end='')
utt.assert_allclose(alpha * res_ref + beta * out, res, rtol=rtol)
def run_conv_gradinput(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
def run_conv_gradinput(self, algo, dtype, precision, parameters):
inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
if beta == 0:
inputs_val = None
else:
inputs_val = np.random.random(inputs_shape).astype(dtype)
inputs_val /= 10
filters_val = np.random.random(filters_shape).astype(dtype)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
# Scale down the input values to prevent absolute errors in utt.assert_allclose.
filters_val /= 10
topgrad_val /= 10
filters = theano.shared(filters_val)
topgrad = theano.shared(topgrad_val)
......@@ -412,15 +354,6 @@ class BaseTestDnnConv(object):
)(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:])
f_ref = theano.function([], grad_i_ref, mode="FAST_RUN")
if not self._functions_checked_for_gradinput:
self._functions_checked_for_gradinput = True
assert any(isinstance(node.op, GpuDnnConvGradI) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
......@@ -429,15 +362,26 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2)
# Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None
utt.assert_allclose(alpha * res_ref + beta * inputs_val, res, rtol=rtol)
rtol = 5e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
utt.assert_allclose(alpha * res_ref + beta * inputs_val, res, rtol=rtol)
def run_conv_gradweight(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
def run_conv_gradweight(self, algo, dtype, precision, parameters):
inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
inputs_val = np.random.random(inputs_shape).astype(dtype)
if beta == 0:
filters_val = None
else:
filters_val = np.random.random(filters_shape).astype(dtype)
filters_val /= 10
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
# Scale down the input values to prevent absolute errors in utt.assert_allclose.
inputs_val /= 10
topgrad_val /= 10
inputs = theano.shared(inputs_val)
topgrad = theano.shared(topgrad_val)
......@@ -458,15 +402,6 @@ class BaseTestDnnConv(object):
grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1]
f_ref = theano.function([], grad_w_ref, mode="FAST_RUN")
if not self._functions_checked_for_gradweight:
self._functions_checked_for_gradweight = True
assert any(isinstance(node.op, GpuDnnConvGradW) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradI))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
......@@ -475,119 +410,58 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2)
# Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None
utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol)
def run_choose_runtime_algos(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
out_shp = assert_conv_shape(
get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation))
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val)
topgrad = theano.shared(topgrad_val)
ctx_name = infer_context_name(inputs, topgrad)
desc_filter = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(filters_shape)
array_like_filters = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*filters_shape)
array_like_inputs = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*inputs_shape)
array_like_conv_output = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*out_shp)
algo_filter = GpuDnnConvChooseBwdFilterAlgo(algo)(inputs, topgrad, array_like_filters, desc_filter)
algo_input = GpuDnnConvChooseBwdDataAlgo(algo)(filters, topgrad, array_like_inputs, desc_filter)
algo_conv = GpuDnnConvChooseFwdAlgo(algo)(inputs, filters, array_like_conv_output, desc_filter)
f = theano.function([], [algo_filter, algo_input, algo_conv], mode=mode_with_gpu)
# Just test that it runs.
algo_filter_val, algo_input_val, algo_conv_val = f()
# How to test if it "works" ?
rtol = 5e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol)
def get_expected_tcount(self):
"""
Utility function to get expected test count
without actually run nosetests.
"""
len_cases = 0
for c in self.get_cases():
len_cases += 1
print(len_cases, 'conv cases for %dD' % self.ndim)
return len(AVAILABLE_PRECISIONS) * len_cases * len(self.fwd_algorithms +
self.bwd_data_algorithms +
self.bwd_filter_algorithms +
SUPPORTED_DNN_CONV_ALGO_RUNTIME)
len_cases = sum(1 for case in self.get_cases())
count_contexts = 0
for dtype, precision in cudnn.get_fwd_dtype_configs(check_runtime=check_fwd_dtype_config_support):
algos = (algo for algo in self.fwd_algorithms
if cudnn.fwd_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
for dtype, precision in cudnn.get_bwd_data_dtype_configs():
algos = (algo for algo in self.bwd_data_algorithms
if cudnn.bwd_data_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
for dtype, precision in cudnn.get_bwd_filter_dtype_configs():
algos = (algo for algo in self.bwd_filter_algorithms
if cudnn.bwd_filter_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
return len_cases * count_contexts
# Iterable test methods.
def test_fwd(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms, self.get_cases()):
yield (self.run_conv_fwd, algo, precision, parameters)
for dtype, precision in cudnn.get_fwd_dtype_configs(check_runtime=check_fwd_dtype_config_support):
algos = (algo for algo in self.fwd_algorithms
if cudnn.fwd_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_fwd, algo, dtype, precision, parameters)
def test_gradinput(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_data_algorithms, self.get_cases()):
yield (self.run_conv_gradinput, algo, precision, parameters)
for dtype, precision in cudnn.get_bwd_data_dtype_configs():
algos = (algo for algo in self.bwd_data_algorithms
if cudnn.bwd_data_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_gradinput, algo, dtype, precision, parameters)
def test_gradweight(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_filter_algorithms, self.get_cases()):
yield (self.run_conv_gradweight, algo, precision, parameters)
def test_choose_runtime_algos(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, SUPPORTED_DNN_CONV_ALGO_RUNTIME,
self.get_cases()):
yield (self.run_choose_runtime_algos, algo, precision, parameters)
def check_fwd_predictions(self):
"""
Call this method to check if tests fail when they
don't follow cuDNN V5.1 doc conditions for FWD algorithms.
Script will exit as soon as there is a test that does not fail when expected.
"""
print()
print('TESTING FWD FAILURES PREDICTED FOR %dD' % self.ndim)
count = 0
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms,
self.get_cases()):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
# Scale down the input values to prevent very large absolute errors
# due to float rounding
inputs_val /= 10
filters_val /= 10
out = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
desc_op = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)
should_compute = check_fwd_algorithm(inputs_val, filters_val, out, desc_op,
algo, precision, subsample, dilation)
if not should_compute.ok:
infos = ['ndim : %s' % (len(inputs_shape) - 2),
'precision : %s' % precision]
infos += should_compute.messages
try:
self.run_conv_fwd(algo, precision, parameters)
except Exception as e:
print('(FAILS as expected)', algo, precision, parameters)
print(e.message.split('\n')[0])
for info in infos:
print(info)
# exit(0)
else:
print('**SHOULD FAIL**|', algo, precision, parameters)
for info in infos:
print(info)
exit(-1)
count += 1
if count % 200 == 0:
print(count, 'passed')
print(count, 'finished')
for dtype, precision in cudnn.get_bwd_filter_dtype_configs():
algos = (algo for algo in self.bwd_filter_algorithms
if cudnn.bwd_filter_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_gradweight, algo, dtype, precision, parameters)
class TestDnnConv2D(BaseTestDnnConv):
......@@ -615,23 +489,39 @@ class TestDnnConv3D(BaseTestDnnConv):
if __name__ == '__main__':
def dtype_config_to_str(dtype_config):
    """Return the cuDNN documentation name of a (dtype, precision) configuration.

    Raises ValueError for any pair outside the four documented configurations.
    """
    dtype, precision = dtype_config
    known_configs = {
        ('float16', 'float16'): 'TRUE_HALF_CONFIG',
        ('float16', 'float32'): 'PSEUDO_HALF_CONFIG',
        ('float32', 'float32'): 'FLOAT_CONFIG',
        ('float64', 'float64'): 'DOUBLE_CONFIG',
    }
    if (dtype, precision) in known_configs:
        return known_configs[(dtype, precision)]
    raise ValueError
test_2d = TestDnnConv2D()
test_3d = TestDnnConv3D()
print()
print('Available data type configurations :',
', '.join(dtype_config_to_str(d) for d in cudnn.get_supported_dtype_configs()))
print()
print('2D algorithms:')
print('FWD :', test_2d.fwd_algorithms)
print('BWD FILTER:', test_2d.bwd_filter_algorithms)
print('BWD DATA :', test_2d.bwd_data_algorithms)
print('FWD :', ', '.join(test_2d.fwd_algorithms))
print('BWD FILTER :', ', '.join(test_2d.bwd_filter_algorithms))
print('BWD DATA :', ', '.join(test_2d.bwd_data_algorithms))
print()
print('3D algorithms:')
print('FWD :', test_3d.fwd_algorithms)
print('BWD FILTER:', test_3d.bwd_filter_algorithms)
print('BWD DATA :', test_3d.bwd_data_algorithms)
print('FWD :', ', '.join(test_3d.fwd_algorithms))
print('BWD FILTER :', ', '.join(test_3d.bwd_filter_algorithms))
print('BWD DATA :', ', '.join(test_3d.bwd_data_algorithms))
print()
count_tests_2d = test_2d.get_expected_tcount()
count_tests_3d = test_3d.get_expected_tcount()
print(count_tests_2d, 'total cases for 2D.')
print(count_tests_3d, 'total cases for 3D.')
print(count_tests_2d + count_tests_3d, 'total cases.')
import sys
if len(sys.argv) == 2 and sys.argv[1] == 'run':
test_2d.check_fwd_predictions()
test_3d.check_fwd_predictions()
print(count_tests_2d, 'conv2D test cases.')
print(count_tests_3d, 'conv3D test cases.')
print(count_tests_2d + count_tests_3d, 'total conv test cases.')
print()
nose.main(defaultTest='theano.gpuarray.tests.check_dnn')
"""
This module is just a collection of definitions to be used by `check_dnn.py`.
The following classes, functions and definitions are used to check whether
tests fail as expected when the conditions listed in the cuDNN documentation are not met.
Checking is currently implemented only for the 2D/3D FWD algorithms of cuDNN V5.1,
and in practice, many tests pass even when they do not satisfy the documented cuDNN conditions.
Therefore, it may be better to simply run all the tests and check by hand
which tests pass, which tests fail, and why they fail.
Reminder:
N: batch number
C: number of feature maps
D: depth
H: height
W: width
NB: We assume that we **always** use NC(D)HW tensors in Theano.
"""
from __future__ import absolute_import, print_function, division
import theano
from ..cudnn_defs import HALF, FLOAT, DOUBLE, get_definitions
from ..dnn import version
# Data type configuration codes. UNKNOWN marks a (floatX, precision) pair
# that does not match any configuration listed in the cuDNN documentation.
UNKNOWN, TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG = -1, 0, 1, 2, 3
# cuDNN definitions matching the locally available cuDNN version.
# NOTE(review): presumably version(raises=False) returns a sentinel when
# cuDNN is unavailable -- confirm get_definitions handles that case.
cudnn = get_definitions(version(raises=False))
# Enum type of the cuDNN forward convolution algorithms.
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
class Success:
    """Result of a check: a success flag plus explanatory messages."""

    ok = True
    # Class-level default; every instance gets its own list in __init__.
    messages = []

    def __init__(self, messages=()):
        # BUG FIX (smell): the default used to be a mutable list literal.
        # An immutable tuple default avoids the shared-mutable-default trap;
        # list() copies whatever iterable the caller passed.
        self.messages = list(messages)

    def add_message(self, *parts):
        # Concatenate all parts (stringified) into a single message.
        self.messages.append(''.join(str(part) for part in parts))
class Failure(Success):
    """Check result meaning the check did not pass; messages explain why."""
    ok = False
def _and(*tests):
    """Require every (predicate, description) pair to pass.

    Each element of `tests` is a tuple (callable returning a truthy value,
    description string). Returns a Failure listing the descriptions of all
    predicates that returned false, or a Success when every predicate passed.
    """
    failed_descriptions = [description for predicate, description in tests
                           if not predicate()]
    if failed_descriptions:
        return Failure(failed_descriptions)
    return Success()
def _or(*tests):
    """Require at least one (predicate, description) pair to pass.

    Returns a Success as soon as any predicate returns true; otherwise
    returns a Failure listing the descriptions of every predicate tried
    before giving up.
    """
    failed_descriptions = []
    for predicate, description in tests:
        if predicate():
            return Success()
        failed_descriptions.append(description)
    return Failure(failed_descriptions)
def type_conf(precision):
    """Map the current floatX plus `precision` to a dtype configuration code.

    All Op input tensors are floatX tensors, so the configuration is the
    pair (theano.config.floatX, precision). Returns UNKNOWN for pairs not
    listed in the cuDNN documentation.
    """
    dtype = theano.config.floatX
    known_configs = {
        (HALF, HALF): TRUE_HALF_CONFIG,
        (HALF, FLOAT): PSEUDO_HALF_CONFIG,
        (FLOAT, FLOAT): FLOAT_CONFIG,
        (DOUBLE, DOUBLE): DOUBLE_CONFIG,
    }
    return known_configs.get((dtype, precision), UNKNOWN)
def type_conf_to_string(conf):
    """Return the symbolic name of a dtype configuration code.

    Returns None for a code outside the known -1..3 range (same implicit
    behavior as the original if/return chain).
    """
    names = {
        -1: 'UNKNOWN',
        0: 'TRUE_HALF_CONFIG',
        1: 'PSEUDO_HALF_CONFIG',
        2: 'FLOAT_CONFIG',
        3: 'DOUBLE_CONFIG',
    }
    return names.get(conf)
def strideof(tensor, i):
    """Return the stride of dimension ``i`` counted in elements, not bytes."""
    stride_in_bytes = tensor.strides[i]
    return stride_in_bytes // tensor.itemsize
def tensor_is_partially_packed(tensor, packed_dim_names):
    """Check that the dimensions named in `packed_dim_names` are packed.

    `packed_dim_names` is a string of dimension letters (e.g. 'HW', 'DHW')
    taken from the NC(D)HW layout assumed module-wide. A named (non-last)
    dimension is "packed" when its stride exactly equals size * stride of
    the next dimension (no gap); unnamed dimensions only need strides large
    enough not to overlap the inner dimensions.
    """
    if tensor.ndim == 4:
        dim_names = 'NCHW'
    else:
        # NOTE(review): any ndim other than 4 is assumed to be 5 (NCDHW) --
        # confirm callers only ever pass 4D or 5D tensors.
        dim_names = 'NCDHW'
    packed_dims = []
    unpacked_dims = []
    # Classify every dimension except the last one.
    for i in range(tensor.ndim - 1):
        if dim_names[i] in packed_dim_names:
            packed_dims.append(i)
        else:
            unpacked_dims.append(i)
    if dim_names[tensor.ndim - 1] in packed_dim_names and strideof(tensor, -1) != 1:
        # We won't put last dimension in the list of packed dims.
        # We just need to check if stride of that dimension is 1.
        return False
    # Unpacked dims may have padding (>=); packed dims must be exact (==).
    return (all(strideof(tensor, i) >= tensor.shape[i + 1] * strideof(tensor, i + 1) for i in unpacked_dims) and
            all(strideof(tensor, i) == tensor.shape[i + 1] * strideof(tensor, i + 1) for i in packed_dims))
def tensor_is_fully_packed(tensor):
    """Return True if the tensor is fully packed (C-contiguous, no gaps).

    The innermost stride must be 1 element, and every other dimension's
    stride must exactly equal size * stride of the next inner dimension.
    """
    if strideof(tensor, -1) != 1:
        return False
    return all(strideof(tensor, k) == tensor.shape[k + 1] * strideof(tensor, k + 1)
               for k in range(tensor.ndim - 1))
def check_fwd_algorithm(img, kern, out, desc_op, algo, precision, subsample, dilation):
    """Check documented cuDNN v5.1 pre-conditions for a FWD convolution algorithm.

    Parameters
    ----------
    img, kern, out
        Input, kernel and output tensors (NC(D)HW layout assumed module-wide).
    desc_op
        Convolution descriptor Op; provides pad0/pad1/pad2 and border mode.
    algo
        Forward algorithm (alias accepted by cudnnConvolutionFwdAlgo_t.fromalias).
    precision
        Computation precision dtype.
    subsample, dilation
        Filter strides and dilations per spatial dimension.

    Returns a Success when all documented conditions for `algo` hold, else a
    Failure whose messages list debugging context followed by the unmet
    conditions.
    """
    # Based on cuDNN v5.1 user guide.
    ndim = img.ndim - 2
    if ndim == 2:
        # rD won't be used.
        rD, rH, rW = -1, 0, 1
    else:
        rD, rH, rW = 0, 1, 2
    algo = cudnnConvolutionFwdAlgo_t.fromalias(algo)
    kern_shape = kern.shape[2:]
    # Effective kernel spatial shape once dilation is applied.
    kern_shape = tuple((kern_shape[i] - 1) * dilation[i] + 1 for i in range(len(dilation)))
    pad = (desc_op.pad0, desc_op.pad1, desc_op.pad2)[:len(kern_shape)]
    if desc_op.bmode == 'full':
        pad = tuple(kern_shape[i] - 1 for i in range(len(pad)))
    elif desc_op.bmode == 'half':
        pad = tuple(kern_shape[i] // 2 for i in range(len(pad)))
    img_shape = img.shape[2:]
    img_with_borders = tuple(img_shape[i] + 2 * pad[i] for i in range(len(pad)))

    def check_algo():
        # Return a Success/Failure describing whether `algo` may run on the
        # current configuration, per cuDNN v5.1 documentation.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
            return _and((lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                         "Data Type Config Support: All except TRUE_HALF_CONFIG"))
        # CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM: 2D: everything supported.
        if ndim == 3 and algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: tensor_is_fully_packed(img),
                 "xDesc Format Support: NCDHW-fully-packed"),
                (lambda: tensor_is_fully_packed(out),
                 "yDesc Format Support: NCDHW-fully-packed"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d")
            )
        # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
            return _and(
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: tensor_is_partially_packed(img, 'HW'),
                 "xDesc Format Support: NCHW HW-packed"),
                (lambda: tensor_is_partially_packed(out, 'HW'),
                 "yDesc Format Support: NCHW HW-packed"),
                (lambda: img_with_borders[rH] <= 256,
                 "xDesc 's feature map height + 2 * convDesc 's zero-padding height must equal 256 or less"),
                (lambda: img_with_borders[rW] <= 256,
                 "xDesc 's feature map width + 2 * convDesc 's zero-padding width must equal 256 or less"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] > pad[rH],
                 "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                (lambda: kern_shape[rW] > pad[rW],
                 "wDesc 's filter width must be greater than convDesc 's zero-padding width")
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
            if ndim == 2:
                return _and(
                    (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                     "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'HW'),
                     "xDesc Format Support: NCHW HW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'HW'),
                     "yDesc Format Support: NCHW HW-packed"),
                    (lambda: kern_shape[rH] <= 32,
                     "wDesc 's filter height must equal 32 or less"),
                    (lambda: kern_shape[rW] <= 32,
                     "wDesc 's filter width must equal 32 or less"),
                    (lambda: subsample[rH] == subsample[rW] == 1,
                     "convDesc 's vertical and horizontal filter stride must equal 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                )
            if ndim == 3:
                return _and(
                    (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                     "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'DHW'),
                     "xDesc Format Support: NCDHW DHW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'DHW'),
                     "yDesc Format Support: NCDHW DHW-packed"),
                    (lambda: kern_shape[rH] <= 16,
                     "wDesc 's filter height must equal 16 or less"),
                    (lambda: kern_shape[rW] <= 16,
                     "wDesc 's filter width must equal 16 or less"),
                    (lambda: kern_shape[rD] <= 16,
                     "wDesc 's filter depth must equal 16 or less"),
                    (lambda: all(s == 1 for s in subsample),
                     "convDesc 's must have all filter strides equal to 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                    # BUG FIX: originally compared pad[rW] against kern_shape[rD]
                    # (width padding against depth filter size) and the message
                    # said "width"; the cuDNN doc condition is on the depth.
                    (lambda: pad[rD] < kern_shape[rD],
                     "wDesc 's filter depth must be greater than convDesc 's zero-padding depth"),
                )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
            return _and(
                # BUG FIX: DOUBLE_CONFIG was listed in this tuple although the
                # failure message and the cuDNN v5.1 doc list only
                # PSEUDO_HALF_CONFIG and FLOAT_CONFIG for WINOGRAD.
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == 3,
                 "wDesc 's filter height must be 3"),
                (lambda: kern_shape[rW] == 3,
                 "wDesc 's filter width must be 3"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
            data_type_conf = type_conf(precision)
            return _and(
                (lambda: data_type_conf != DOUBLE_CONFIG,
                 "Data Type Config Support: All except DOUBLE_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == kern_shape[rW] and kern_shape[rH] in (3, 5),
                 "wDesc 's filter (height, width) must be (3,3) or (5,5)"),
                (lambda: kern_shape[rH] == 3 or data_type_conf != TRUE_HALF_CONFIG,
                 "If wDesc 's filter (height, width) is (5,5), "
                 "data type config TRUE_HALF_CONFIG is not supported")
            )

    checking = check_algo()
    if not checking.ok:
        # On failure, prepend debugging context before the failure reasons.
        messages = checking.messages
        checking.messages = []
        checking.add_message('config : ', type_conf_to_string(type_conf(precision)))
        checking.add_message('computed borders : ', pad)
        checking.add_message('img with borders : ', img_with_borders)
        checking.add_message('computed kern shape: ', kern_shape)
        checking.add_message('== why should fail ==')
        checking.messages += messages
    return checking
#section init_code_struct
/* Initialization of the forward-algorithm cache: nothing selected yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
/* When an algorithm may be re-chosen per call, clear the cached shapes so
   the first call can never accidentally reuse a stale algorithm. */
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
#section support_code_struct
/* Cached state: whether the previous algorithm may be reused, the last
   algorithm chosen, and the input/kernel shapes it was chosen for. */
int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo;
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
/* Select a cuDNN forward-convolution algorithm for (input, kerns) -> output.
 *
 * Fills the cuDNN tensor/filter descriptors, validates the output shape
 * against what cuDNN computes from the descriptor, then either reuses the
 * previously selected algorithm (same shapes, or params->choose_once) or
 * queries cuDNN for a new one (benchmarking when params->choose_time is set).
 * Finally falls back to IMPLICIT_GEMM when the chosen algorithm does not
 * support the current shapes/strides.
 * Returns 0 on success, non-zero on error with a Python exception set. */
int
APPLY_SPECIFIC(choose_fwd_algo)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                                PyGpuArrayObject *output,
                                cudnnConvolutionDescriptor_t desc,
                                cudnnConvolutionFwdAlgo_t *output_algo,
                                PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

  cuda_enter(c->ctx);

  /* Validate the output shape against what cuDNN expects. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm only when input and kernel shapes match the
     ones it was selected for. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionFwdAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUG FIX: cuda_enter() is active here; the original returned
           without the matching cuda_exit(), unlike every other error path. */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionForwardAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
        1, &count, &choice, *(void **)tmpmem,
        free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionForwardAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
        desc, APPLY_SPECIFIC(output),
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selected algorithm applies to. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
    }
  }

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct
/* Initialization of the backward-data-algorithm cache: nothing selected yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
/* When an algorithm may be re-chosen per call, clear the cached shapes so
   the first call can never accidentally reuse a stale algorithm. */
if (!PARAMS->choose_once) {
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct
/* Cached state: whether the previous algorithm may be reused, the last
   algorithm chosen, and the kernel/top shapes it was chosen for. */
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5];
size_t prev_top_dims[5];
/* Select a cuDNN backward-data convolution algorithm for
 * (kerns, output-gradient) -> input-gradient.
 *
 * Mirrors choose_fwd_algo: sets descriptors, validates the output-gradient
 * shape, reuses or re-selects the algorithm (benchmarking when
 * params->choose_time is set), and falls back to ALGO_0 when an FFT-based
 * choice does not support the current shapes/strides.
 * Returns 0 on success, non-zero on error with a Python exception set. */
int
APPLY_SPECIFIC(choose_bwd_data_algo)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                                     PyGpuArrayObject *input,
                                     cudnnConvolutionDescriptor_t desc,
                                     cudnnConvolutionBwdDataAlgo_t *output_algo,
                                     PARAMS_TYPE* params) {
  PyGpuContextObject *c = kerns->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdDataAlgo_t algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;

  cuda_enter(c->ctx);

  /* Validate the top (output-gradient) shape against what cuDNN expects. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm only when kernel and top shapes match the
     ones it was selected for. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdDataAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUG FIX: cuda_enter() is active here; the original returned
           without the matching cuda_exit(), unlike every other error path. */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionBackwardDataAlgorithmEx(
        params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionBackwardDataAlgorithm(
        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(input),
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selected algorithm applies to. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct
/* Initialization of the backward-filter-algorithm cache: nothing selected yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
/* When an algorithm may be re-chosen per call, clear the cached shapes so
   the first call can never accidentally reuse a stale algorithm. */
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct
/* Cached state: whether the previous algorithm may be reused, the last
   algorithm chosen, and the image/top shapes it was chosen for. */
int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo;
size_t prev_img_dims[5];
size_t prev_top_dims[5];
/* Select a cuDNN backward-filter convolution algorithm for
 * (input, output-gradient) -> kernel-gradient.
 *
 * Mirrors choose_fwd_algo: sets descriptors, validates the output-gradient
 * shape, reuses or re-selects the algorithm (benchmarking when
 * params->choose_time is set), and falls back to ALGO_0 when the FFT choice
 * does not support the current shapes/strides.
 * Returns 0 on success, non-zero on error with a Python exception set. */
int
APPLY_SPECIFIC(choose_bwd_filter_algo)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                                       PyGpuArrayObject *kerns,
                                       cudnnConvolutionDescriptor_t desc,
                                       cudnnConvolutionBwdFilterAlgo_t *output_algo,
                                       PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;

  cuda_enter(c->ctx);

  /* Validate the top (output-gradient) shape against what cuDNN expects. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm only when image and top shapes match the
     ones it was selected for. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdFilterAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUG FIX: cuda_enter() is active here; the original returned
           without the matching cuda_exit(), unlike every other error path. */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionBackwardFilterAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(kerns),
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selected algorithm applies to. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论