提交 1b6e6389 authored 作者: notoraptor's avatar notoraptor

Cancel some changes into CEnumType.

Cancel changes in dnn_fwd.c. Heavily simplify check_dnn. Make check_dnn runnable as a Python script.
上级 6cc0c5ca
...@@ -909,11 +909,7 @@ class EnumType(Type, dict): ...@@ -909,11 +909,7 @@ class EnumType(Type, dict):
.. note:: .. note::
:class:`EnumType` is not complete and should never be used for regular graph operations. This Type (and subclasses) is not complete and should never be used for regular graph operations.
:class:`EnumList` is not complete and should never be used for regular graph operations.
**:class:`CEnumType` is complete.**
""" """
...@@ -1053,9 +1049,6 @@ class EnumType(Type, dict): ...@@ -1053,9 +1049,6 @@ class EnumType(Type, dict):
#ifndef PyInt_AsLong #ifndef PyInt_AsLong
#define PyInt_AsLong PyLong_AsLong #define PyInt_AsLong PyLong_AsLong
#endif #endif
#ifndef PyInt_FromLong
#define PyInt_FromLong PyLong_FromLong
#endif
#endif #endif
""" """
...@@ -1248,22 +1241,5 @@ class CEnumType(EnumList): ...@@ -1248,22 +1241,5 @@ class CEnumType(EnumList):
""" % dict(i=i, name=name, constant_cname=swapped_dict[i]) for i in sorted(swapped_dict.keys())), """ % dict(i=i, name=name, constant_cname=swapped_dict[i]) for i in sorted(swapped_dict.keys())),
fail=sub['fail']) fail=sub['fail'])
def c_sync(self, name, sub):
    """
    Return C code that converts the C-level enum value back to its Python
    integer constant after execution (the "sync" step).

    The generated C switches on the C variable ``name``: each C constant
    (a key of this dict) maps to its Python value, and the Python object
    ``py_<name>`` is rebuilt from that value.  Unknown C values raise a
    Python ``ValueError`` and run the failure code.

    Parameters
    ----------
    name : str
        Name of the C variable holding the enum value.
    sub : dict
        Code-substitution dict; only ``sub['fail']`` (failure snippet) is used.
    """
    # Cases are emitted sorted by Python value, so generated code is deterministic.
    return """
int py_value = -1;
Py_XDECREF(py_%(name)s);
/* We assume that ctype is an integer type usable in a switch. */
switch (%(name)s) {
%(cases)s
default:
PyErr_SetString(PyExc_ValueError, "CEnumType: cannot map C value to Python constant.");
{%(fail)s}
break;
}
py_%(name)s = PyInt_FromLong(py_value);
""" % dict(name=name, fail=sub['fail'], cases=''.join("""
case %(constant_cname)s: py_value = %(constant_pyvalue)d; break;
""" % dict(constant_cname=k, constant_pyvalue=v) for k, v in sorted(self.items(), key=lambda t: t[1])))
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version()) return (1, super(CEnumType, self).c_code_cache_version())
...@@ -19,8 +19,27 @@ from __future__ import absolute_import, print_function, division ...@@ -19,8 +19,27 @@ from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType from theano.gof import CEnumType
HALF, FLOAT, DOUBLE = ('float16', 'float32', 'float64') HALF, FLOAT, DOUBLE = ('float16', 'float32', 'float64')
# cuDNN "data type configurations": (data dtype, compute precision) pairs
# built from the HALF/FLOAT/DOUBLE dtype constants defined above.
TRUE_HALF_CONFIG = (HALF, HALF)      # compute in float16 as well as store in float16
PSEUDO_HALF_CONFIG = (HALF, FLOAT)   # float16 data, float32 accumulation
FLOAT_CONFIG = (FLOAT, FLOAT)
DOUBLE_CONFIG = (DOUBLE, DOUBLE)
def is_true_half_config(dtype, precision):
    """Tell whether (dtype, precision) is cuDNN's TRUE_HALF_CONFIG (float16 data, float16 compute)."""
    return (dtype, precision) == (HALF, HALF)
def is_pseudo_half_config(dtype, precision):
    """Tell whether (dtype, precision) is cuDNN's PSEUDO_HALF_CONFIG (float16 data, float32 compute)."""
    return (dtype, precision) == (HALF, FLOAT)
def is_float_config(dtype, precision):
    """Tell whether (dtype, precision) is cuDNN's FLOAT_CONFIG (float32 data, float32 compute)."""
    return (dtype, precision) == (FLOAT, FLOAT)
def is_double_config(dtype, precision):
    """Tell whether (dtype, precision) is cuDNN's DOUBLE_CONFIG (float64 data, float64 compute)."""
    return (dtype, precision) == (DOUBLE, DOUBLE)
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented. # NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
...@@ -103,22 +122,97 @@ class CuDNNV51(object): ...@@ -103,22 +122,97 @@ class CuDNNV51(object):
# empty list of enum to don't crash with cudnn 5 # empty list of enum to don't crash with cudnn 5
cudnnReduceTensorOp_t = CEnumType() cudnnReduceTensorOp_t = CEnumType()
def supported_precisions(self, dtype): def get_supported_dtype_configs(self):
""" """
Return the tuple of precisions supported by cuDNN for given input data type. Return the tuple of data type configurations supported by this version of cuDNN.
This is currently convenient for both cuDNN V5.1 and V6, as Theano does not This is currently convenient for both cuDNN V5.1 and V6, as Theano does not
yet support new data types (like INT8, INT8x4, etc.). yet support new data types (like INT8, INT8x4, etc.).
""" """
assert dtype in (HALF, FLOAT, DOUBLE) return (TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
if dtype == HALF:
# TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG def get_fwd_dtype_configs(self, check_runtime=None):
return (HALF, FLOAT) # NB: "TRUE_HALF_CONFIG is only supported on architectures with true fp16 support
if dtype == FLOAT: # (compute capability 5.3 and 6.0)". Can be checked at runtime only.
# FLOAT_CONFIG if check_runtime is None or check_runtime(*TRUE_HALF_CONFIG):
return (FLOAT,) return self.get_supported_dtype_configs()
if dtype == DOUBLE: return (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
# DOUBLE_CONFIG
return (DOUBLE,) def get_bwd_filter_dtype_configs(self, check_runtime=None):
return self.get_supported_dtype_configs()
def get_bwd_data_dtype_configs(self, check_runtime=None):
return self.get_supported_dtype_configs()
def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Tell whether the given cuDNN forward-convolution algorithm supports the
    data type configuration (dtype, precision) for a ndim-D convolution.

    ``algo`` may be an alias; it is resolved through the
    ``cudnnConvolutionFwdAlgo_t`` enum. Returns False for any algorithm
    not explicitly handled below (e.g. ALGO_DIRECT, which is not implemented).
    """
    fwd = self.cudnnConvolutionFwdAlgo_t
    resolved = fwd.fromalias(algo)
    half_or_float = (is_pseudo_half_config(dtype, precision) or
                     is_float_config(dtype, precision))
    if resolved == fwd.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
        return not is_true_half_config(dtype, precision)
    if resolved == fwd.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
        return ndim == 2 or not is_true_half_config(dtype, precision)
    if resolved == fwd.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
        return ndim == 2 and not is_true_half_config(dtype, precision)
    # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
    if resolved == fwd.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
        return ndim == 2 and half_or_float
    if resolved == fwd.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
        if ndim == 2:
            return half_or_float
        # NB: For cuDNN V6:
        # " Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
        # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if resolved == fwd.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
        return ndim == 2 and half_or_float
    if resolved == fwd.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    return False
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Tell whether the given cuDNN backward-filter algorithm supports the
    data type configuration (dtype, precision) for a ndim-D convolution.

    ``algo`` may be an alias; it is resolved through the
    ``cudnnConvolutionBwdFilterAlgo_t`` enum. Unhandled algorithms yield False.
    """
    bwd_f = self.cudnnConvolutionBwdFilterAlgo_t
    resolved = bwd_f.fromalias(algo)
    # ALGO_0 and ALGO_3 share the same rule: everything but TRUE_HALF_CONFIG.
    if resolved in (bwd_f.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
                    bwd_f.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3):
        return not is_true_half_config(dtype, precision)
    if resolved == bwd_f.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
        return ndim == 2
    if resolved == bwd_f.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if resolved == bwd_f.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    return False
def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Tell whether the given cuDNN backward-data algorithm supports the
    data type configuration (dtype, precision) for a ndim-D convolution.

    ``algo`` may be an alias; it is resolved through the
    ``cudnnConvolutionBwdDataAlgo_t`` enum. Unhandled algorithms yield False.
    """
    algorithms = self.cudnnConvolutionBwdDataAlgo_t
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
        return not is_true_half_config(dtype, precision)
    # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1: all data type configs supported.
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
        if ndim == 2:
            return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
        # NB: For cuDNN V6: "(DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
        # BUG FIX: the original expression was
        #   ndim == 2 and is_pseudo_half_config(...) or is_float_config(...)
        # Because `and` binds tighter than `or`, FLOAT_CONFIG was accepted
        # for ANY ndim. Parenthesized to match the equivalent
        # CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD case (2D only).
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    return False
class CuDNNV6(CuDNNV51): class CuDNNV6(CuDNNV51):
...@@ -162,6 +256,17 @@ class CuDNNV6(CuDNNV51): ...@@ -162,6 +256,17 @@ class CuDNNV6(CuDNNV51):
('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'), ('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
ctype='cudnnReduceTensorOp_t') ctype='cudnnReduceTensorOp_t')
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Extend the V5.1 rules: cuDNN V6 adds FFT_TILING support for the
    backward-filter pass (2D only, non-TRUE_HALF configs).
    """
    if super(CuDNNV6, self).bwd_filter_algo_supports_dtype_config(algo, dtype, precision, ndim):
        return True
    bwd_f = self.cudnnConvolutionBwdFilterAlgo_t
    if bwd_f.fromalias(algo) == bwd_f.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision) or
                              is_double_config(dtype, precision))
    return False
class CuDNNV7(CuDNNV6): class CuDNNV7(CuDNNV6):
version = 7 version = 7
......
#!/usr/bin/env python
# You can pass nosetests args when running this script. Examples:
# python theano/gpuarray/tests/check_dnn.py # Normal mode.
# python theano/gpuarray/tests/check_dnn.py -xvs # Verbose mode, capture output, exit at first error.
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from itertools import ifilter, product from itertools import ifilter, product, chain
import nose
import numpy as np import numpy as np
from nose.plugins.skip import SkipTest
import theano import theano
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.compile.ops import shape_i_op from theano.compile.ops import shape_i_op
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from theano.gof import COp, Apply, ParamsType
from theano.gof.type import CDataType
from theano.gpuarray import cudnn_defs from theano.gpuarray import cudnn_defs
from theano.gpuarray.basic_ops import infer_context_name, as_gpuarray_variable, gpu_contiguous, GpuAllocEmpty from theano.gpuarray.basic_ops import infer_context_name, as_gpuarray_variable, gpu_contiguous, GpuAllocEmpty
from theano.gpuarray.dnn import (GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision, from theano.gpuarray.dnn import GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision
DnnBase, handle_type, DNN_CONV_ALGO_CHOOSE_ONCE, DNN_CONV_ALGO_CHOOSE_TIME)
from theano.gpuarray.tests.check_dnn_doc import check_fwd_algorithm
from theano.gpuarray.tests.config import mode_with_gpu, ref_cast from theano.gpuarray.tests.config import mode_with_gpu, ref_cast
from theano.scalar import bool as bool_t
from theano.tensor.nnet.abstract_conv import get_conv_output_shape, assert_conv_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape, assert_conv_shape
from theano.tensor.opt import Assert from theano.tensor.opt import Assert
cudnn = cudnn_defs.get_definitions(version(raises=False)) cudnn = cudnn_defs.get_definitions(version(raises=False))
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
cudnnConvolutionBwdFilterAlgo_t = cudnn.cudnnConvolutionBwdFilterAlgo_t
cudnnConvolutionBwdDataAlgo_t = cudnn.cudnnConvolutionBwdDataAlgo_t
AVAILABLE_PRECISIONS = cudnn.supported_precisions(theano.config.floatX)
class DnnCaseGenerator: class DnnCaseGenerator:
""" """
...@@ -81,11 +76,11 @@ class DnnCaseGenerator: ...@@ -81,11 +76,11 @@ class DnnCaseGenerator:
@staticmethod @staticmethod
def get_if_valid_conv_output_shape(case_tuple): def get_if_valid_conv_output_shape(case_tuple):
out_shp = get_conv_output_shape(case_tuple[0][0], # input shape out_shp = get_conv_output_shape(case_tuple[0], # input shape
case_tuple[0][1], # filter shape case_tuple[1], # filter shape
case_tuple[1], # border mode case_tuple[4], # border mode
case_tuple[0][2], # subsample case_tuple[2], # subsample
case_tuple[0][3] # dilation case_tuple[3] # dilation
) )
try: try:
return assert_conv_shape(out_shp) return assert_conv_shape(out_shp)
...@@ -94,7 +89,7 @@ class DnnCaseGenerator: ...@@ -94,7 +89,7 @@ class DnnCaseGenerator:
def get_cases(self): def get_cases(self):
# Generate an iterator of tuples with format: # Generate an iterator of tuples with format:
# ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta ) # (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
all_batch_sizes = (self.batch_size,) all_batch_sizes = (self.batch_size,)
all_input_channels = (self.input_channels,) all_input_channels = (self.input_channels,)
all_input_sizes = self._shapes(self.input_size) all_input_sizes = self._shapes(self.input_size)
...@@ -114,7 +109,7 @@ class DnnCaseGenerator: ...@@ -114,7 +109,7 @@ class DnnCaseGenerator:
all_filter_shapes = ((oc, ic) + fis all_filter_shapes = ((oc, ic) + fis
for oc in all_output_channels for ic in all_input_channels for fis in all_filter_sizes) for oc in all_output_channels for ic in all_input_channels for fis in all_filter_sizes)
return ifilter(DnnCaseGenerator.get_if_valid_conv_output_shape, return ifilter(DnnCaseGenerator.get_if_valid_conv_output_shape,
product(product(all_input_shapes, all_filter_shapes, all_subsamples, all_dilations), product(all_input_shapes, all_filter_shapes, all_subsamples, all_dilations,
all_border_modes, all_conv_modes, all_alphas, all_betas)) all_border_modes, all_conv_modes, all_alphas, all_betas))
...@@ -142,14 +137,14 @@ def dnn_conv(img, kerns, alpha=1, beta=0, out=None, border_mode='valid', subsamp ...@@ -142,14 +137,14 @@ def dnn_conv(img, kerns, alpha=1, beta=0, out=None, border_mode='valid', subsamp
desc_op.subsample, desc_op.subsample,
filter_dilation=dilation) filter_dilation=dilation)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
if beta != 0: if beta == 0:
real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
else:
assert out is not None assert out is not None
out = as_gpuarray_variable(out, ctx_name) out = as_gpuarray_variable(out, ctx_name)
out = gpu_contiguous(out) out = gpu_contiguous(out)
check = Assert('GpuDnnConv: qiven output (for beta not null) does not have expected shape') check = Assert('GpuDnnConv: qiven output (for beta not null) does not have expected shape')
real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, out_shp))) real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, out_shp)))
else:
real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, real_out, desc, alpha, beta) return GpuDnnConv(algo=algo)(img, kerns, real_out, desc, alpha, beta)
...@@ -207,78 +202,28 @@ def dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mod ...@@ -207,78 +202,28 @@ def dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mod
return GpuDnnConvGradI(algo=algo)(kerns, topgrad, real_out, desc, alpha, beta) return GpuDnnConvGradI(algo=algo)(kerns, topgrad, real_out, desc, alpha, beta)
class BaseGpuDnnConvChooseAlgo(DnnBase): def check_fwd_dtype_config_support(dtype, precision):
""" inputs_shape = (1, 1, 3, 3)
This class and its subclasses allow to retrieve a cuDNN algorithm filters_shape = (1, 1, 2, 2)
at runtime without any computation, given the user choose option inputs = np.zeros(inputs_shape, dtype=dtype)
(time_once, time_on_shape_change, guess_once or guess_on_shape_change). filters = np.zeros(filters_shape, dtype=dtype)
To help reduce whole test time, I suggest we use these classes when inputs = theano.shared(inputs)
algo is one of choose options, as any chosen algorithm would have filters = theano.shared(filters)
been tested by the other exhaustive tests. conv = dnn_conv(inputs, filters, precision=precision)
""" f = theano.function([], conv, mode=mode_with_gpu)
try:
_f16_ok = True f()
check_input = False except RuntimeError as e:
__props__ = ('choice',) assert 'CUDNN_STATUS_ARCH_MISMATCH' in e.message
params_type = ParamsType(choose_once=bool_t, choose_time=bool_t, handle=handle_type) return False
return True
# Abstract attributes: subclasses must override both.
# func_file: C source file passed to COp.__init__ (see __init__ below).
func_file = None
# func_name: name of the C function implementing this Op.
func_name = None
def __init__(self, choice):
    """
    Parameters
    ----------
    choice : str
        Runtime algorithm-choice option; must belong to
        SUPPORTED_DNN_CONV_ALGO_RUNTIME.
    """
    # Load shared cuDNN C sources plus the subclass-specific file/function.
    COp.__init__(self, ["../dnn_base.c", "../dnn_conv_base.c", self.func_file], self.func_name)
    assert choice in SUPPORTED_DNN_CONV_ALGO_RUNTIME
    self.choice = choice
    # Boolean flags derived from the choice; presumably read from C via
    # params_type (choose_once / choose_time fields) -- TODO confirm.
    self.choose_once = self.choice in DNN_CONV_ALGO_CHOOSE_ONCE
    self.choose_time = self.choice in DNN_CONV_ALGO_CHOOSE_TIME
def dnn_context(self, node):
    """Return the GPU context name carried by the node's first input."""
    first_input = node.inputs[0]
    return first_input.type.context_name
def _prepare_inputs(self, i1, name_i1, i2, name_i2, output, desc):
    """
    Validate and convert the inputs of a choose-algorithm Op.

    Converts ``i1``, ``i2`` and ``output`` to GPU-array variables sharing
    one context, checks that all three are 4D or 5D tensors with matching
    ndim, and checks that ``desc`` is a cuDNN convolution descriptor.

    Parameters
    ----------
    i1, i2, output
        Tensor-like inputs; ``name_i1``/``name_i2`` are the names used in
        error messages.
    desc
        Variable expected to hold a ``cudnnConvolutionDescriptor_t``.

    Returns
    -------
    tuple
        (i1, i2, output, desc) with tensors converted to GPU variables.

    Raises
    ------
    TypeError
        If any tensor is not 4D/5D, if ndims mismatch, or if ``desc`` is
        not a cudnnConvolutionDescriptor_t CDataType.
    """
    ctx_name = infer_context_name(i1, i2, output)
    i1 = as_gpuarray_variable(i1, ctx_name)
    i2 = as_gpuarray_variable(i2, ctx_name)
    output = as_gpuarray_variable(output, ctx_name)
    if i1.type.ndim not in (4, 5):
        raise TypeError('%s must be 4D or 5D tensor' % name_i1)
    if i2.type.ndim not in (4, 5):
        raise TypeError('%s must be 4D or 5D tensor' % name_i2)
    if output.type.ndim not in (4, 5):
        raise TypeError('output must be 4D or 5D tensor')
    if i1.type.ndim != i2.type.ndim or i1.type.ndim != output.type.ndim:
        raise TypeError("The number of dimensions of %s, %s and output must match" % (name_i1, name_i2))
    if not isinstance(desc.type, CDataType) or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')
    return (i1, i2, output, desc)
class GpuDnnConvChooseFwdAlgo(BaseGpuDnnConvChooseAlgo):
    """Op that selects a cuDNN forward-convolution algorithm at runtime."""

    func_file = 'dnn_choose_fwd.c'
    func_name = 'APPLY_SPECIFIC(choose_fwd_algo)'

    def make_node(self, img, kern, output, desc):
        # Validate/convert inputs (4D or 5D GPU tensors + conv descriptor).
        img, kern, output, desc = self._prepare_inputs(img, 'img', kern, 'kern', output, desc)
        # Output is the chosen algorithm as a cudnnConvolutionFwdAlgo_t enum value.
        return Apply(self, [img, kern, output, desc], [cudnn.cudnnConvolutionFwdAlgo_t()])
class GpuDnnConvChooseBwdFilterAlgo(BaseGpuDnnConvChooseAlgo):
    """Op that selects a cuDNN backward-filter (gradient of weights) algorithm at runtime."""

    func_file = 'dnn_choose_gw.c'
    func_name = 'APPLY_SPECIFIC(choose_bwd_filter_algo)'

    def make_node(self, img, topgrad, output, desc):
        # Validate/convert inputs (4D or 5D GPU tensors + conv descriptor).
        img, topgrad, output, desc = self._prepare_inputs(img, 'img', topgrad, 'topgrad', output, desc)
        # Output is the chosen algorithm as a cudnnConvolutionBwdFilterAlgo_t enum value.
        return Apply(self, [img, topgrad, output, desc], [cudnn.cudnnConvolutionBwdFilterAlgo_t()])
class GpuDnnConvChooseBwdDataAlgo(BaseGpuDnnConvChooseAlgo):
func_file = 'dnn_choose_gi.c'
func_name = 'APPLY_SPECIFIC(choose_bwd_data_algo)'
def make_node(self, kern, topgrad, output, desc): def test_fwd_true_half_config_support():
kern, topgrad, output, desc = self._prepare_inputs(kern, 'kern', topgrad, 'topgrad', output, desc) # For cuDNN V5.1 and V6.0:
return Apply(self, [kern, topgrad, output, desc], [cudnn.cudnnConvolutionBwdDataAlgo_t()]) # "TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0)"
if not check_fwd_dtype_config_support('float16', 'float16'):
raise SkipTest('FWD: TRUE_HALF_CONFIG not supported on this GPU.')
class BaseTestDnnConv(object): class BaseTestDnnConv(object):
...@@ -287,10 +232,6 @@ class BaseTestDnnConv(object): ...@@ -287,10 +232,6 @@ class BaseTestDnnConv(object):
to run actual tests. to run actual tests.
""" """
# One-shot flags: the graph-structure assertions in the run_conv_* methods
# are executed only the first time each kind of function is compiled.
_functions_checked_for_fwd = False
_functions_checked_for_gradinput = False
_functions_checked_for_gradweight = False
# Abstract attributes. # Abstract attributes.
ndim = 2 ndim = 2
...@@ -303,25 +244,25 @@ class BaseTestDnnConv(object): ...@@ -303,25 +244,25 @@ class BaseTestDnnConv(object):
cpu_gradinput_class = None cpu_gradinput_class = None
cpu_gradweight_class = None cpu_gradweight_class = None
# Utility methods.
def get_cases(self): def get_cases(self):
# Return an iterable of test cases. Each test case is a tuple (or list) with following syntax: # Return an iterable of test cases. Each test case is a tuple (or list) with following syntax:
# ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta ) # (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
generator = DnnCaseGenerator(ndim=self.ndim) generator = DnnCaseGenerator(ndim=self.ndim)
return generator.get_cases() return generator.get_cases()
# Run and utility methods. def array_like_conv_output(self, inputs_shape, filters_shape, border_mode, subsample, dilation, dtype):
def array_like_conv_output(self, inputs_shape, filters_shape, border_mode, subsample, dilation):
# Return an random array with inferred convolution output shape. # Return an random array with inferred convolution output shape.
out_shp = get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation) out_shp = get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
return np.random.random(out_shp).astype(theano.config.floatX) return np.random.random(out_shp).astype(dtype)
def run_conv_fwd(self, algo, precision, parameters): def run_conv_fwd(self, algo, dtype, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX) inputs_val = np.random.random(inputs_shape).astype(dtype)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX) filters_val = np.random.random(filters_shape).astype(dtype)
# Scale down the input values to prevent very large absolute errors # Scale down the input values to prevent very large absolute errors
# due to float rounding # due to float rounding
...@@ -331,8 +272,11 @@ class BaseTestDnnConv(object): ...@@ -331,8 +272,11 @@ class BaseTestDnnConv(object):
inputs = theano.shared(inputs_val) inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val) filters = theano.shared(filters_val)
out = None if beta == 0 else self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, if beta == 0:
dilation) out = None
else:
out = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
out /= 10
# Compile a theano function for the cuDNN implementation # Compile a theano function for the cuDNN implementation
conv = dnn_conv(img=inputs, kerns=filters, alpha=alpha, beta=beta, out=out, border_mode=border_mode, conv = dnn_conv(img=inputs, kerns=filters, alpha=alpha, beta=beta, out=out, border_mode=border_mode,
subsample=subsample, dilation=dilation, conv_mode=conv_mode, algo=algo, precision=precision) subsample=subsample, dilation=dilation, conv_mode=conv_mode, algo=algo, precision=precision)
...@@ -354,15 +298,6 @@ class BaseTestDnnConv(object): ...@@ -354,15 +298,6 @@ class BaseTestDnnConv(object):
filter_dilation=dilation)(ref_cast(inputs), flipped_filters) filter_dilation=dilation)(ref_cast(inputs), flipped_filters)
f_ref = theano.function([], conv_ref, mode="FAST_RUN") f_ref = theano.function([], conv_ref, mode="FAST_RUN")
if not self._functions_checked_for_fwd:
self._functions_checked_for_fwd = True
assert any(isinstance(node.op, GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConvGradI, GpuDnnConvGradW))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations # Compare the results of the two implementations
res_ref = f_ref() res_ref = f_ref()
res = f() res = f()
...@@ -371,19 +306,26 @@ class BaseTestDnnConv(object): ...@@ -371,19 +306,26 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2) utt.assert_allclose(res, res2)
# Raise tolerance for float16 # Raise tolerance for float16
rtol = 6e-2 if theano.config.floatX == 'float16' else None rtol = 6e-2 if dtype == 'float16' else None
if beta == 0: if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol) utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else: else:
# print('(conv: beta not null) ', end='')
utt.assert_allclose(alpha * res_ref + beta * out, res, rtol=rtol) utt.assert_allclose(alpha * res_ref + beta * out, res, rtol=rtol)
def run_conv_gradinput(self, algo, precision, parameters): def run_conv_gradinput(self, algo, dtype, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
if beta == 0:
inputs_val = None
else:
inputs_val = np.random.random(inputs_shape).astype(dtype)
inputs_val /= 10
filters_val = np.random.random(filters_shape).astype(dtype)
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX) # Scale down the input values to prevent absolute errors in utt.assert_allclose.
filters_val = np.random.random(filters_shape).astype(theano.config.floatX) filters_val /= 10
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation) topgrad_val /= 10
filters = theano.shared(filters_val) filters = theano.shared(filters_val)
topgrad = theano.shared(topgrad_val) topgrad = theano.shared(topgrad_val)
...@@ -412,15 +354,6 @@ class BaseTestDnnConv(object): ...@@ -412,15 +354,6 @@ class BaseTestDnnConv(object):
)(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:]) )(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:])
f_ref = theano.function([], grad_i_ref, mode="FAST_RUN") f_ref = theano.function([], grad_i_ref, mode="FAST_RUN")
if not self._functions_checked_for_gradinput:
self._functions_checked_for_gradinput = True
assert any(isinstance(node.op, GpuDnnConvGradI) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations # Compare the results of the two implementations
res_ref = f_ref() res_ref = f_ref()
res = f() res = f()
...@@ -429,15 +362,26 @@ class BaseTestDnnConv(object): ...@@ -429,15 +362,26 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2) utt.assert_allclose(res, res2)
# Raise tolerance for float16 # Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None rtol = 5e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
utt.assert_allclose(alpha * res_ref + beta * inputs_val, res, rtol=rtol) utt.assert_allclose(alpha * res_ref + beta * inputs_val, res, rtol=rtol)
def run_conv_gradweight(self, algo, precision, parameters): def run_conv_gradweight(self, algo, dtype, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters inputs_shape, filters_shape, subsample, dilation, border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(dtype)
if beta == 0:
filters_val = None
else:
filters_val = np.random.random(filters_shape).astype(dtype)
filters_val /= 10
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation, dtype)
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX) # Scale down the input values to prevent absolute errors in utt.assert_allclose.
filters_val = np.random.random(filters_shape).astype(theano.config.floatX) inputs_val /= 10
topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation) topgrad_val /= 10
inputs = theano.shared(inputs_val) inputs = theano.shared(inputs_val)
topgrad = theano.shared(topgrad_val) topgrad = theano.shared(topgrad_val)
...@@ -458,15 +402,6 @@ class BaseTestDnnConv(object): ...@@ -458,15 +402,6 @@ class BaseTestDnnConv(object):
grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1] grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1]
f_ref = theano.function([], grad_w_ref, mode="FAST_RUN") f_ref = theano.function([], grad_w_ref, mode="FAST_RUN")
if not self._functions_checked_for_gradweight:
self._functions_checked_for_gradweight = True
assert any(isinstance(node.op, GpuDnnConvGradW) for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradI))
for node in f.maker.fgraph.apply_nodes)
assert not any(isinstance(node.op, (GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI))
for node in f_ref.maker.fgraph.apply_nodes)
# Compare the results of the two implementations # Compare the results of the two implementations
res_ref = f_ref() res_ref = f_ref()
res = f() res = f()
...@@ -475,119 +410,58 @@ class BaseTestDnnConv(object): ...@@ -475,119 +410,58 @@ class BaseTestDnnConv(object):
utt.assert_allclose(res, res2) utt.assert_allclose(res, res2)
# Raise tolerance for float16 # Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None rtol = 5e-2 if dtype == 'float16' else None
if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else:
utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol) utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol)
def run_choose_runtime_algos(self, algo, precision, parameters):
    """
    Smoke-test the three runtime algorithm-choosing Ops (fwd, bwd-filter,
    bwd-data) on one convolution case: build them, compile, run.

    Parameters
    ----------
    algo : str
        Runtime choose option (e.g. guess/time variants).
    precision : str
        cuDNN computation precision.
    parameters : tuple
        ((input shape, filter shape, subsample, dilation),
         border mode, conv mode, alpha, beta) -- alpha/beta unused here.
    """
    (inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
    out_shp = assert_conv_shape(
        get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation))
    inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
    filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
    topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
    inputs = theano.shared(inputs_val)
    filters = theano.shared(filters_val)
    topgrad = theano.shared(topgrad_val)
    ctx_name = infer_context_name(inputs, topgrad)
    # One descriptor is shared by the three choose Ops.
    desc_filter = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                                 conv_mode=conv_mode, precision=precision)(filters_shape)
    # Empty GPU arrays with the right shapes: the choose Ops only need
    # shape/dtype information, not meaningful values.
    array_like_filters = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*filters_shape)
    array_like_inputs = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*inputs_shape)
    array_like_conv_output = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)(*out_shp)
    algo_filter = GpuDnnConvChooseBwdFilterAlgo(algo)(inputs, topgrad, array_like_filters, desc_filter)
    algo_input = GpuDnnConvChooseBwdDataAlgo(algo)(filters, topgrad, array_like_inputs, desc_filter)
    algo_conv = GpuDnnConvChooseFwdAlgo(algo)(inputs, filters, array_like_conv_output, desc_filter)
    f = theano.function([], [algo_filter, algo_input, algo_conv], mode=mode_with_gpu)
    # Just test that it runs.
    algo_filter_val, algo_input_val, algo_conv_val = f()
    # How to test if it "works" ?
def get_expected_tcount(self): def get_expected_tcount(self):
""" """
Utility function to get expected test count Utility function to get expected test count
without actually run nosetests. without actually run nosetests.
""" """
len_cases = 0 len_cases = sum(1 for case in self.get_cases())
for c in self.get_cases(): count_contexts = 0
len_cases += 1 for dtype, precision in cudnn.get_fwd_dtype_configs(check_runtime=check_fwd_dtype_config_support):
print(len_cases, 'conv cases for %dD' % self.ndim) algos = (algo for algo in self.fwd_algorithms
return len(AVAILABLE_PRECISIONS) * len_cases * len(self.fwd_algorithms + if cudnn.fwd_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
self.bwd_data_algorithms + count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
self.bwd_filter_algorithms + for dtype, precision in cudnn.get_bwd_data_dtype_configs():
SUPPORTED_DNN_CONV_ALGO_RUNTIME) algos = (algo for algo in self.bwd_data_algorithms
if cudnn.bwd_data_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
for dtype, precision in cudnn.get_bwd_filter_dtype_configs():
algos = (algo for algo in self.bwd_filter_algorithms
if cudnn.bwd_filter_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
count_contexts += sum(1 for algo in algos) + len(SUPPORTED_DNN_CONV_ALGO_RUNTIME)
return len_cases * count_contexts
# Iterable test methods. # Iterable test methods.
def test_fwd(self): def test_fwd(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms, self.get_cases()): for dtype, precision in cudnn.get_fwd_dtype_configs(check_runtime=check_fwd_dtype_config_support):
yield (self.run_conv_fwd, algo, precision, parameters) algos = (algo for algo in self.fwd_algorithms
if cudnn.fwd_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_fwd, algo, dtype, precision, parameters)
def test_gradinput(self): def test_gradinput(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_data_algorithms, self.get_cases()): for dtype, precision in cudnn.get_bwd_data_dtype_configs():
yield (self.run_conv_gradinput, algo, precision, parameters) algos = (algo for algo in self.bwd_data_algorithms
if cudnn.bwd_data_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for parameters in self.get_cases():
yield (self.run_conv_gradinput, algo, dtype, precision, parameters)
def test_gradweight(self): def test_gradweight(self):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_filter_algorithms, self.get_cases()): for dtype, precision in cudnn.get_bwd_filter_dtype_configs():
yield (self.run_conv_gradweight, algo, precision, parameters) algos = (algo for algo in self.bwd_filter_algorithms
if cudnn.bwd_filter_algo_supports_dtype_config(algo, dtype, precision, self.ndim))
def test_choose_runtime_algos(self): for algo in chain(algos, SUPPORTED_DNN_CONV_ALGO_RUNTIME):
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, SUPPORTED_DNN_CONV_ALGO_RUNTIME, for parameters in self.get_cases():
self.get_cases()): yield (self.run_conv_gradweight, algo, dtype, precision, parameters)
yield (self.run_choose_runtime_algos, algo, precision, parameters)
def check_fwd_predictions(self):
"""
Call this method to check if tests fail when they
don't follow cuDNN V5.1 doc conditions for FWD algorithms.
Script will exit as soon as there is a test that does not fail when expected.
"""
print()
print('TESTING FWD FAILURES PREDICTED FOR %dD' % self.ndim)
count = 0
for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms,
self.get_cases()):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
# Scale down the input values to prevent very large absolute errors
# due to float rounding
inputs_val /= 10
filters_val /= 10
out = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
desc_op = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)
should_compute = check_fwd_algorithm(inputs_val, filters_val, out, desc_op,
algo, precision, subsample, dilation)
if not should_compute.ok:
infos = ['ndim : %s' % (len(inputs_shape) - 2),
'precision : %s' % precision]
infos += should_compute.messages
try:
self.run_conv_fwd(algo, precision, parameters)
except Exception as e:
print('(FAILS as expected)', algo, precision, parameters)
print(e.message.split('\n')[0])
for info in infos:
print(info)
# exit(0)
else:
print('**SHOULD FAIL**|', algo, precision, parameters)
for info in infos:
print(info)
exit(-1)
count += 1
if count % 200 == 0:
print(count, 'passed')
print(count, 'finished')
class TestDnnConv2D(BaseTestDnnConv): class TestDnnConv2D(BaseTestDnnConv):
...@@ -615,23 +489,39 @@ class TestDnnConv3D(BaseTestDnnConv): ...@@ -615,23 +489,39 @@ class TestDnnConv3D(BaseTestDnnConv):
if __name__ == '__main__': if __name__ == '__main__':
def dtype_config_to_str(dtype_config):
dtype, precision = dtype_config
if dtype == precision == 'float16':
return 'TRUE_HALF_CONFIG'
if dtype == 'float16' and precision == 'float32':
return 'PSEUDO_HALF_CONFIG'
if dtype == precision == 'float32':
return 'FLOAT_CONFIG'
if dtype == precision == 'float64':
return 'DOUBLE_CONFIG'
raise ValueError
test_2d = TestDnnConv2D() test_2d = TestDnnConv2D()
test_3d = TestDnnConv3D() test_3d = TestDnnConv3D()
print()
print('Available data type configurations :',
', '.join(dtype_config_to_str(d) for d in cudnn.get_supported_dtype_configs()))
print()
print('2D algorithms:') print('2D algorithms:')
print('FWD :', test_2d.fwd_algorithms) print('FWD :', ', '.join(test_2d.fwd_algorithms))
print('BWD FILTER:', test_2d.bwd_filter_algorithms) print('BWD FILTER :', ', '.join(test_2d.bwd_filter_algorithms))
print('BWD DATA :', test_2d.bwd_data_algorithms) print('BWD DATA :', ', '.join(test_2d.bwd_data_algorithms))
print()
print('3D algorithms:') print('3D algorithms:')
print('FWD :', test_3d.fwd_algorithms) print('FWD :', ', '.join(test_3d.fwd_algorithms))
print('BWD FILTER:', test_3d.bwd_filter_algorithms) print('BWD FILTER :', ', '.join(test_3d.bwd_filter_algorithms))
print('BWD DATA :', test_3d.bwd_data_algorithms) print('BWD DATA :', ', '.join(test_3d.bwd_data_algorithms))
print()
count_tests_2d = test_2d.get_expected_tcount() count_tests_2d = test_2d.get_expected_tcount()
count_tests_3d = test_3d.get_expected_tcount() count_tests_3d = test_3d.get_expected_tcount()
print(count_tests_2d, 'total cases for 2D.') print(count_tests_2d, 'conv2D test cases.')
print(count_tests_3d, 'total cases for 3D.') print(count_tests_3d, 'conv3D test cases.')
print(count_tests_2d + count_tests_3d, 'total cases.') print(count_tests_2d + count_tests_3d, 'total conv test cases.')
import sys print()
nose.main(defaultTest='theano.gpuarray.tests.check_dnn')
if len(sys.argv) == 2 and sys.argv[1] == 'run':
test_2d.check_fwd_predictions()
test_3d.check_fwd_predictions()
"""
This module is just a collection of definitions to be used by `check_dnn.py`.
The following classes, functions and definitions are used to check whether
tests fail as expected when the conditions listed in the cuDNN documentation are not met.
Checking is currently implemented only for 2D/3D FWD algorithms in cuDNN V5.1,
and in practice, many tests pass even when they don't follow the cuDNN doc conditions.
So it may be better to just run all the tests and check ourselves
which tests pass, which fail, and why they fail.
Reminder:
N: batch number
C: number of feature maps
D: depth
H: height
W: width
NB: We assume that we **always** use NC(D)HW tensors in Theano.
"""
from __future__ import absolute_import, print_function, division

import theano
from ..cudnn_defs import HALF, FLOAT, DOUBLE, get_definitions
from ..dnn import version

# Numeric codes for the cuDNN "data type configuration" of a convolution
# (combination of tensor dtype and computation precision).
UNKNOWN, TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG = -1, 0, 1, 2, 3

# Definitions matching the locally installed cuDNN version.
# raises=False: presumably avoids failing at import time when cuDNN is
# unavailable -- TODO confirm against `version` in ..dnn.
cudnn = get_definitions(version(raises=False))
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
class Success:
    """Outcome of a support-condition check: a positive result.

    Attributes
    ----------
    ok : bool
        True for this class; `Failure` overrides it with False.
    messages : list of str
        Human-readable details about the check.
    """

    ok = True
    messages = []

    def __init__(self, messages=None):
        # Fix: avoid a mutable default argument (`messages=[]`); behavior is
        # unchanged since the sequence was already copied with list().
        self.messages = [] if messages is None else list(messages)

    def add_message(self, *parts):
        """Append one message built from the string form of every part."""
        self.messages.append(''.join(str(part) for part in parts))
class Failure(Success):
    # Negative check outcome: same message container as Success, ok is False.
    ok = False
def _and(*tests):
    """Conjunction of checks.

    Each item of `tests` is a (predicate, description) pair. Every predicate
    is evaluated; the descriptions of those that do not hold are collected.
    Returns a Success when all predicates hold, otherwise a Failure carrying
    the collected descriptions.
    """
    failed = [description for predicate, description in tests if not predicate()]
    return Failure(failed) if failed else Success()
def _or(*tests):
    """Disjunction of checks.

    Each item of `tests` is a (predicate, description) pair. Predicates are
    evaluated in order; as soon as one holds, a Success is returned. If none
    holds, a Failure is returned with the description of every predicate.
    """
    collected = []
    for predicate, description in tests:
        if predicate():
            return Success()
        collected.append(description)
    return Failure(collected)
def type_conf(precision):
    """Map (theano.config.floatX, precision) to a cuDNN data type configuration.

    All Op input tensors are floatX tensors, so floatX is the data dtype.
    Returns TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG,
    or UNKNOWN when the pair matches no known configuration.
    """
    data_dtype = theano.config.floatX
    known_configs = {
        (HALF, HALF): TRUE_HALF_CONFIG,
        (HALF, FLOAT): PSEUDO_HALF_CONFIG,
        (FLOAT, FLOAT): FLOAT_CONFIG,
        (DOUBLE, DOUBLE): DOUBLE_CONFIG,
    }
    return known_configs.get((data_dtype, precision), UNKNOWN)
def type_conf_to_string(conf):
    """Return the symbolic name of a data type configuration code.

    Known codes are -1 (UNKNOWN) through 3 (DOUBLE_CONFIG). Any other value
    yields None, matching the original fall-through behavior.
    """
    names = {
        -1: 'UNKNOWN',
        0: 'TRUE_HALF_CONFIG',
        1: 'PSEUDO_HALF_CONFIG',
        2: 'FLOAT_CONFIG',
        3: 'DOUBLE_CONFIG',
    }
    return names.get(conf)
def strideof(tensor, i):
    """Return the stride of dimension `i` of `tensor`, in elements (not bytes)."""
    byte_stride = tensor.strides[i]
    return byte_stride // tensor.itemsize
def tensor_is_partially_packed(tensor, packed_dim_names):
    """Tell whether `tensor` is packed along the dimensions named in `packed_dim_names`.

    Dimensions are labelled 'NCHW' (4D) or 'NCDHW' (otherwise). A labelled
    dimension counts as packed when its element stride equals the next
    dimension's size times its element stride; an unlabelled dimension only
    needs a stride at least that large. The last dimension, when requested
    packed, must simply have a unit element stride.
    """
    labels = 'NCHW' if tensor.ndim == 4 else 'NCDHW'

    def stride(axis):
        # numpy strides are in bytes; convert to elements.
        return tensor.strides[axis] // tensor.itemsize

    last = tensor.ndim - 1
    if labels[last] in packed_dim_names and stride(-1) != 1:
        return False
    for axis in range(last):
        tight = tensor.shape[axis + 1] * stride(axis + 1)
        if labels[axis] in packed_dim_names:
            if stride(axis) != tight:
                return False
        elif stride(axis) < tight:
            return False
    return True
def tensor_is_fully_packed(tensor):
    """Tell whether `tensor` is fully packed (C-contiguous, no inter-dim gaps).

    True when the last dimension has unit element stride and every other
    dimension's element stride equals the size of the block it spans.
    """
    elem_strides = [s // tensor.itemsize for s in tensor.strides]
    if elem_strides[-1] != 1:
        return False
    return all(elem_strides[d] == tensor.shape[d + 1] * elem_strides[d + 1]
               for d in range(tensor.ndim - 1))
def check_fwd_algorithm(img, kern, out, desc_op, algo, precision, subsample, dilation):
    """Check cuDNN v5.1 documented support conditions for a FWD algorithm.

    Parameters
    ----------
    img, kern, out
        Arrays for inputs, filters and outputs (NC(D)HW layout assumed).
    desc_op
        GpuDnnConvDesc-like object (provides bmode, pad0/pad1/pad2).
    algo
        FWD algorithm name/alias understood by cudnnConvolutionFwdAlgo_t.
    precision
        Computation precision ('float16', 'float32' or 'float64').
    subsample, dilation
        Tuples of strides / dilations, one per spatial dimension.

    Returns
    -------
    Success if every documented condition holds for this algorithm, else a
    Failure whose messages describe the context and the failed conditions.
    """
    # Based on cuDNN v5.1 user guide.
    ndim = img.ndim - 2
    if ndim == 2:
        # rD won't be used.
        rD, rH, rW = -1, 0, 1
    else:
        rD, rH, rW = 0, 1, 2
    algo = cudnnConvolutionFwdAlgo_t.fromalias(algo)
    kern_shape = kern.shape[2:]
    # Effective (dilated) filter shape.
    kern_shape = tuple((kern_shape[i] - 1) * dilation[i] + 1 for i in range(len(dilation)))
    pad = (desc_op.pad0, desc_op.pad1, desc_op.pad2)[:len(kern_shape)]
    if desc_op.bmode == 'full':
        pad = tuple(kern_shape[i] - 1 for i in range(len(pad)))
    elif desc_op.bmode == 'half':
        pad = tuple(kern_shape[i] // 2 for i in range(len(pad)))
    img_shape = img.shape[2:]
    img_with_borders = tuple(img_shape[i] + 2 * pad[i] for i in range(len(pad)))

    def check_algo():
        # One branch per algorithm; each returns the Success/Failure of _and().
        # NOTE(review): branches not listed here (e.g. ALGO_DIRECT) return None,
        # which would break `checking.ok` below -- confirm callers only pass
        # algorithms covered here.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
            return _and((lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                         "Data Type Config Support: All except TRUE_HALF_CONFIG"))
        # CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM: 2D: everything supported.
        if ndim == 3 and algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: tensor_is_fully_packed(img),
                 "xDesc Format Support: NCDHW-fully-packed"),
                (lambda: tensor_is_fully_packed(out),
                 "yDesc Format Support: NCDHW-fully-packed"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d")
            )
        # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
            return _and(
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: tensor_is_partially_packed(img, 'HW'),
                 "xDesc Format Support: NCHW HW-packed"),
                (lambda: tensor_is_partially_packed(out, 'HW'),
                 "yDesc Format Support: NCHW HW-packed"),
                (lambda: img_with_borders[rH] <= 256,
                 "xDesc 's feature map height + 2 * convDesc 's zero-padding height must equal 256 or less"),
                (lambda: img_with_borders[rW] <= 256,
                 "xDesc 's feature map width + 2 * convDesc 's zero-padding width must equal 256 or less"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] > pad[rH],
                 "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                (lambda: kern_shape[rW] > pad[rW],
                 "wDesc 's filter width must be greater than convDesc 's zero-padding width")
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
            if ndim == 2:
                return _and(
                    (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                     "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'HW'),
                     "xDesc Format Support: NCHW HW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'HW'),
                     "yDesc Format Support: NCHW HW-packed"),
                    (lambda: kern_shape[rH] <= 32,
                     "wDesc 's filter height must equal 32 or less"),
                    (lambda: kern_shape[rW] <= 32,
                     "wDesc 's filter width must equal 32 or less"),
                    (lambda: subsample[rH] == subsample[rW] == 1,
                     "convDesc 's vertical and horizontal filter stride must equal 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                )
            if ndim == 3:
                return _and(
                    (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                     "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'DHW'),
                     "xDesc Format Support: NCDHW DHW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'DHW'),
                     "yDesc Format Support: NCDHW DHW-packed"),
                    (lambda: kern_shape[rH] <= 16,
                     "wDesc 's filter height must equal 16 or less"),
                    (lambda: kern_shape[rW] <= 16,
                     "wDesc 's filter width must equal 16 or less"),
                    (lambda: kern_shape[rD] <= 16,
                     "wDesc 's filter depth must equal 16 or less"),
                    (lambda: all(s == 1 for s in subsample),
                     "convDesc 's must have all filter strides equal to 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                    # Bug fix: compared pad[rW] (width padding) against the filter
                    # depth; the depth condition must use pad[rD].
                    (lambda: pad[rD] < kern_shape[rD],
                     "wDesc 's filter depth must be greater than convDesc 's zero-padding depth"),
                )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
            return _and(
                # Bug fix: DOUBLE_CONFIG was accepted here although both the
                # message and the cuDNN v5.1 doc list only PSEUDO_HALF_CONFIG
                # and FLOAT_CONFIG for WINOGRAD.
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == 3,
                 "wDesc 's filter height must be 3"),
                (lambda: kern_shape[rW] == 3,
                 "wDesc 's filter width must be 3"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
            data_type_conf = type_conf(precision)
            return _and(
                (lambda: data_type_conf != DOUBLE_CONFIG,
                 "Data Type Config Support: All except DOUBLE_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == kern_shape[rW] and kern_shape[rH] in (3, 5),
                 "wDesc 's filter (height, width) must be (3,3) or (5,5)"),
                (lambda: kern_shape[rH] == 3 or data_type_conf != TRUE_HALF_CONFIG,
                 "If wDesc 's filter (height, width) is (5,5), "
                 "data type config TRUE_HALF_CONFIG is not supported")
            )

    checking = check_algo()
    if not checking.ok:
        # Prepend context information before the failed-condition messages.
        messages = checking.messages
        checking.messages = []
        checking.add_message('config : ', type_conf_to_string(type_conf(precision)))
        checking.add_message('computed borders : ', pad)
        checking.add_message('img with borders : ', img_with_borders)
        checking.add_message('computed kern shape: ', kern_shape)
        checking.add_message('== why should fail ==')
        checking.messages += messages
    return checking
#section init_code_struct

/* No algorithm cached yet: force a selection on the first call. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

if (!PARAMS->choose_once) {
  /* Shape caches only matter when the algorithm may be re-chosen per shape. */
  memset(prev_img_dims, 0, sizeof(prev_img_dims));
  memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
#section support_code_struct

int reuse_algo;                      /* non-zero when prev_algo can be reused */
cudnnConvolutionFwdAlgo_t prev_algo; /* algorithm chosen on the previous call */
size_t prev_img_dims[5];             /* input shape seen on the previous call */
size_t prev_kern_dims[5];            /* kernel shape seen on the previous call */
int
APPLY_SPECIFIC(choose_fwd_algo)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                                PyGpuArrayObject *output,
                                cudnnConvolutionDescriptor_t desc,
                                cudnnConvolutionFwdAlgo_t *output_algo,
                                PARAMS_TYPE* params) {
  /* Select a cuDNN forward-convolution algorithm for the given tensors.
     The choice is made once and cached (params->choose_once), or re-made
     whenever the input/kernel shapes change; it is based on actual timing
     (params->choose_time) or on cuDNN's workspace-bounded heuristic.
     Returns 0 on success, non-zero after setting a Python exception. */
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

  cuda_enter(c->ctx);

  /* Validate the given output shape against what cuDNN computes from the
     input/kernel shapes and the convolution descriptor. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  /* NOTE(review): expected_output_dims is int[] but formatted with %ld
     below -- confirm the varargs promotion is safe on the target platforms. */
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                                     " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                                     " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm when shapes match the previous call. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;

    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                                       "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }

    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionFwdAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* Bug fix: leave the CUDA context before returning, as on every
           other error path taken after cuda_enter(). */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionForwardAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
        1, &count, &choice, *(void **)tmpmem,
        free);
      gpudata_release(tmpmem);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionForwardAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
        desc, APPLY_SPECIFIC(output),
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
    }
  }

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct

/* No algorithm cached yet: force a selection on the first call. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;

if (!PARAMS->choose_once) {
  /* Shape caches only matter when the algorithm may be re-chosen per shape. */
  memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
  memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct

int reuse_algo;                          /* non-zero when prev_algo can be reused */
cudnnConvolutionBwdDataAlgo_t prev_algo; /* algorithm chosen on the previous call */
size_t prev_kern_dims[5];                /* kernel shape seen on the previous call */
size_t prev_top_dims[5];                 /* gradient (top) shape seen on the previous call */
int
APPLY_SPECIFIC(choose_bwd_data_algo)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                                     PyGpuArrayObject *input,
                                     cudnnConvolutionDescriptor_t desc,
                                     cudnnConvolutionBwdDataAlgo_t *output_algo,
                                     PARAMS_TYPE* params) {
  /* Select a cuDNN backward-data convolution algorithm for the given tensors.
     The choice is made once and cached (params->choose_once), or re-made
     whenever the kernel/gradient shapes change; it is based on actual timing
     (params->choose_time) or on cuDNN's workspace-bounded heuristic.
     Returns 0 on success, non-zero after setting a Python exception. */
  PyGpuContextObject *c = kerns->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdDataAlgo_t algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;

  cuda_enter(c->ctx);

  /* Validate the given gradient shape against what cuDNN computes from the
     input/kernel shapes and the convolution descriptor. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  /* NOTE(review): expected_output_dims is int[] but formatted with %ld
     below -- confirm the varargs promotion is safe on the target platforms. */
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                                     " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                                     " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  /* Reuse the cached algorithm when shapes match the previous call. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;

    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                                       "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }

    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdDataAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* Bug fix: leave the CUDA context before returning, as on every
           other error path taken after cuda_enter(). */
        cuda_exit(c->ctx);
        return -1;
      }
      err = cudnnFindConvolutionBackwardDataAlgorithmEx(
        params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionBackwardDataAlgorithm(
        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(input),
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct

/* No algorithm cached yet: force a selection on the first call. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;

if (!PARAMS->choose_once) {
  /* Shape caches only matter when the algorithm may be re-chosen per shape. */
  memset(prev_img_dims, 0, sizeof(prev_img_dims));
  memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct

int reuse_algo;                            /* non-zero when prev_algo can be reused */
cudnnConvolutionBwdFilterAlgo_t prev_algo; /* algorithm chosen on the previous call */
size_t prev_img_dims[5];                   /* input shape seen on the previous call */
size_t prev_top_dims[5];                   /* gradient (top) shape seen on the previous call */
/*
 * Choose a cuDNN algorithm for the backward-filter (gradient w.r.t. the
 * weights) convolution and store it in *output_algo.
 *
 * input   : images array
 * output  : gradient of the convolution output (top diffs)
 * kerns   : filters array (gradient target); used for descriptor setup
 * desc    : already-configured cuDNN convolution descriptor
 * output_algo : out-parameter receiving the selected algorithm
 * params  : op parameters (cuDNN handle, choose_once, choose_time)
 *
 * Returns 0 on success, nonzero on failure with a Python exception set.
 *
 * The selection is cached in the struct-level variables reuse_algo,
 * prev_algo, prev_img_dims and prev_top_dims:
 *   - choose_once: select on the first call only, reuse forever after;
 *   - otherwise  : reuse only while input/output shapes are unchanged.
 */
int
APPLY_SPECIFIC(choose_bwd_filter_algo)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                                       PyGpuArrayObject *kerns,
                                       cudnnConvolutionDescriptor_t desc,
                                       cudnnConvolutionBwdFilterAlgo_t *output_algo,
                                       PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  // Images and filters must agree on the channel (stack) dimension.
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  // Refresh the cuDNN descriptors from the current array shapes/strides.
  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;

  cuda_enter(c->ctx);

  // Sanity-check that the provided gradient has the shape cuDNN expects for
  // the forward output of this convolution.
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      // BUG FIX: the dimension arguments are int / size_t but the format
      // uses %ld; varargs are not converted automatically, so cast each
      // argument to long to match the conversion specifiers.
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   (long)expected_output_dims[0], (long)expected_output_dims[1],
                   (long)expected_output_dims[2], (long)expected_output_dims[3],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      // BUG FIX: same varargs/%ld mismatch as above; cast to long.
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   (long)expected_output_dims[0], (long)expected_output_dims[1],
                   (long)expected_output_dims[2], (long)expected_output_dims[3],
                   (long)expected_output_dims[4],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3],
                   (long)PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  // In per-shape mode, we may reuse the previous algorithm only if both the
  // input and output-gradient shapes match the cached ones exactly.
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    // Workspace budget for algorithm selection: the largest allocatable
    // block, or a 4MB guess if the property is unavailable.
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      // Benchmark-based selection: time the candidate algorithms on the
      // actual data with a scratch workspace.
      int count;
      cudnnConvolutionBwdFilterAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        // BUG FIX: the original returned here without cuda_exit(), leaving
        // the CUDA context entered, and returned -1 unlike every other
        // error path; leave the context and return 1 for consistency.
        cuda_exit(c->ctx);
        return 1;
      }
      // NOTE(review): *(void **)tmpmem assumes the raw device pointer is the
      // first member of the opaque gpudata struct — matches existing usage
      // in this backend, but confirm against the libgpuarray version in use.
      err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      // Heuristic-based selection bounded by the workspace budget.
      err = cudnnGetConvolutionBackwardFilterAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(kerns),
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  // Update the cache bookkeeping for the next call.
  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论