提交 1b6e6389 authored 作者: notoraptor's avatar notoraptor

Cancel some changes into CEnumType.

Cancel changes into dnn_fwd.c Heavy simplification of check_dnn. Make check_dnn runnable as a python script.
上级 6cc0c5ca
...@@ -909,11 +909,7 @@ class EnumType(Type, dict): ...@@ -909,11 +909,7 @@ class EnumType(Type, dict):
.. note:: .. note::
:class:`EnumType` is not complete and should never be used for regular graph operations. This Type (and subclasses) is not complete and should never be used for regular graph operations.
:class:`EnumList` is not complete and should never be used for regular graph operations.
**:class:`CEnumType` is complete.**
""" """
...@@ -1053,9 +1049,6 @@ class EnumType(Type, dict): ...@@ -1053,9 +1049,6 @@ class EnumType(Type, dict):
#ifndef PyInt_AsLong #ifndef PyInt_AsLong
#define PyInt_AsLong PyLong_AsLong #define PyInt_AsLong PyLong_AsLong
#endif #endif
#ifndef PyInt_FromLong
#define PyInt_FromLong PyLong_FromLong
#endif
#endif #endif
""" """
...@@ -1248,22 +1241,5 @@ class CEnumType(EnumList): ...@@ -1248,22 +1241,5 @@ class CEnumType(EnumList):
""" % dict(i=i, name=name, constant_cname=swapped_dict[i]) for i in sorted(swapped_dict.keys())), """ % dict(i=i, name=name, constant_cname=swapped_dict[i]) for i in sorted(swapped_dict.keys())),
fail=sub['fail']) fail=sub['fail'])
def c_sync(self, name, sub):
    """
    Return C code that maps the C-level enum value back to its Python
    integer constant and stores it into the output variable py_<name>.

    Parameters
    ----------
    name : str
        C variable name holding the enum value to convert.
    sub : dict
        Substitution dict from the code generator; only ``sub['fail']``
        (the failure code block) is used.

    The generated C assumes the underlying ctype is an integer type usable
    in a ``switch``; unknown values raise a Python ValueError.
    """
    # ``self.items()`` maps C constant names -> Python integer values;
    # one ``case`` is emitted per constant, sorted by Python value.
    return """
int py_value = -1;
Py_XDECREF(py_%(name)s);
/* We assume that ctype is an integer type usable in a switch. */
switch (%(name)s) {
%(cases)s
default:
PyErr_SetString(PyExc_ValueError, "CEnumType: cannot map C value to Python constant.");
{%(fail)s}
break;
}
py_%(name)s = PyInt_FromLong(py_value);
""" % dict(name=name, fail=sub['fail'], cases=''.join("""
case %(constant_cname)s: py_value = %(constant_pyvalue)d; break;
""" % dict(constant_cname=k, constant_pyvalue=v) for k, v in sorted(self.items(), key=lambda t: t[1])))
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version()) return (1, super(CEnumType, self).c_code_cache_version())
...@@ -19,8 +19,27 @@ from __future__ import absolute_import, print_function, division ...@@ -19,8 +19,27 @@ from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType from theano.gof import CEnumType
# Data type names as Theano spells them; cuDNN pairs a data type with a
# computation precision into a "data type configuration".
HALF, FLOAT, DOUBLE = ('float16', 'float32', 'float64')

# The four (data type, precision) configurations cuDNN documents.
TRUE_HALF_CONFIG = (HALF, HALF)
PSEUDO_HALF_CONFIG = (HALF, FLOAT)
FLOAT_CONFIG = (FLOAT, FLOAT)
DOUBLE_CONFIG = (DOUBLE, DOUBLE)


def is_true_half_config(dtype, precision):
    """Return True for cuDNN TRUE_HALF_CONFIG (half data, half precision)."""
    return (dtype, precision) == TRUE_HALF_CONFIG


def is_pseudo_half_config(dtype, precision):
    """Return True for cuDNN PSEUDO_HALF_CONFIG (half data, float precision)."""
    return (dtype, precision) == PSEUDO_HALF_CONFIG


def is_float_config(dtype, precision):
    """Return True for cuDNN FLOAT_CONFIG (float data, float precision)."""
    return (dtype, precision) == FLOAT_CONFIG


def is_double_config(dtype, precision):
    """Return True for cuDNN DOUBLE_CONFIG (double data, double precision)."""
    return (dtype, precision) == DOUBLE_CONFIG
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented. # NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
...@@ -103,22 +122,97 @@ class CuDNNV51(object): ...@@ -103,22 +122,97 @@ class CuDNNV51(object):
# empty list of enum to don't crash with cudnn 5 # empty list of enum to don't crash with cudnn 5
cudnnReduceTensorOp_t = CEnumType() cudnnReduceTensorOp_t = CEnumType()
def supported_precisions(self, dtype): def get_supported_dtype_configs(self):
""" """
Return the tuple of precisions supported by cuDNN for given input data type. Return the tuple of data type configurations supported by this version of cuDNN.
This is currently convenient for both cuDNN V5.1 and V6, as Theano does not This is currently convenient for both cuDNN V5.1 and V6, as Theano does not
yet support new data types (like INT8, INT8x4, etc.). yet support new data types (like INT8, INT8x4, etc.).
""" """
assert dtype in (HALF, FLOAT, DOUBLE) return (TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
if dtype == HALF:
# TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG def get_fwd_dtype_configs(self, check_runtime=None):
return (HALF, FLOAT) # NB: "TRUE_HALF_CONFIG is only supported on architectures with true fp16 support
if dtype == FLOAT: # (compute capability 5.3 and 6.0)". Can be checked at runtime only.
# FLOAT_CONFIG if check_runtime is None or check_runtime(*TRUE_HALF_CONFIG):
return (FLOAT,) return self.get_supported_dtype_configs()
if dtype == DOUBLE: return (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
# DOUBLE_CONFIG
return (DOUBLE,) def get_bwd_filter_dtype_configs(self, check_runtime=None):
return self.get_supported_dtype_configs()
def get_bwd_data_dtype_configs(self, check_runtime=None):
    # All dtype configurations supported by this cuDNN version are valid for
    # gradient-of-inputs algorithms. ``check_runtime`` is accepted for API
    # symmetry with get_fwd_dtype_configs but is unused here.
    return self.get_supported_dtype_configs()
def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Tell whether a cuDNN forward-convolution algorithm supports the given
    data type configuration for a convolution with ``ndim`` spatial
    dimensions (rules from the cuDNN v5.1/v6 documentation).

    Parameters
    ----------
    algo : alias understood by ``self.cudnnConvolutionFwdAlgo_t.fromalias``
    dtype : str
        Data type of the tensors (one of HALF, FLOAT, DOUBLE).
    precision : str
        Computation precision.
    ndim : int
        Number of spatial dimensions (2 or 3).

    Returns
    -------
    bool
        False for any algorithm not explicitly handled below.
    """
    algorithms = self.cudnnConvolutionFwdAlgo_t
    # Normalize the alias to the enum constant before comparing.
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
        return ndim == 2 or not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
        return ndim == 2 and not is_true_half_config(dtype, precision)
    # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
        if ndim == 2:
            return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
        # NB: For cuDNN V6:
        # " Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
        # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    return False
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Tell whether a cuDNN gradient-of-weights algorithm supports the given
    data type configuration for ``ndim`` spatial dimensions (cuDNN v5.1
    rules; CuDNNV6 extends this for FFT_TILING).

    Returns False for any algorithm not explicitly handled below.
    """
    algorithms = self.cudnnConvolutionBwdFilterAlgo_t
    # Normalize the alias to the enum constant before comparing.
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
        return ndim == 2
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
        return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    return False
def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Tell whether a cuDNN gradient-of-inputs algorithm supports the given
    data type configuration for ``ndim`` spatial dimensions (cuDNN v5.1
    rules).

    Returns False for any algorithm not explicitly handled below.
    """
    algorithms = self.cudnnConvolutionBwdDataAlgo_t
    # Normalize the alias to the enum constant before comparing.
    algo = algorithms.fromalias(algo)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
        return not is_true_half_config(dtype, precision)
    # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1: all data type configs supported.
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
        if ndim == 2:
            return is_pseudo_half_config(dtype, precision) or is_float_config(dtype, precision)
        # NB: For cuDNN V6: "(DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
        # ie, one of the filter dimension, width or height is 1)"
        # Could be checked only when being in C code.
        if ndim == 3:
            return not is_true_half_config(dtype, precision)
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
        # BUGFIX: the `or` must be parenthesized. Without parentheses,
        # `ndim == 2 and a or b` is `(ndim == 2 and a) or b`, so FLOAT_CONFIG
        # wrongly reported WINOGRAD support for any ndim (compare the
        # correctly parenthesized fwd/FFT cases above).
        return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                              is_float_config(dtype, precision))
    if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
        # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
        # We could not check it before being in C code.
        return ndim == 2 and not is_double_config(dtype, precision)
    return False
class CuDNNV6(CuDNNV51): class CuDNNV6(CuDNNV51):
...@@ -162,6 +256,17 @@ class CuDNNV6(CuDNNV51): ...@@ -162,6 +256,17 @@ class CuDNNV6(CuDNNV51):
('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'), ('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
ctype='cudnnReduceTensorOp_t') ctype='cudnnReduceTensorOp_t')
def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
    """
    Extend the cuDNN v5.1 gradient-of-weights rules with the algorithm
    added in cuDNN v6 (FFT_TILING); everything else defers to the parent.
    """
    is_supported = super(CuDNNV6, self).bwd_filter_algo_supports_dtype_config(algo, dtype, precision, ndim)
    if not is_supported:
        algorithms = self.cudnnConvolutionBwdFilterAlgo_t
        algo = algorithms.fromalias(algo)
        # FFT_TILING is new in v6 and also accepts DOUBLE_CONFIG (2d only).
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
            return ndim == 2 and (is_pseudo_half_config(dtype, precision) or
                                  is_float_config(dtype, precision) or
                                  is_double_config(dtype, precision))
    return is_supported
class CuDNNV7(CuDNNV6): class CuDNNV7(CuDNNV6):
version = 7 version = 7
......
差异被折叠。
#section init_code_struct

/* No algorithm cached yet; start from the cheap default. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
if (!PARAMS->choose_once) {
  /* Zero the cached shapes so the first call never matches them. */
  memset(prev_img_dims, 0, sizeof(prev_img_dims));
  memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
#section support_code_struct

/* Per-op cached state for forward-algorithm selection. */
int reuse_algo;                       /* 1 when prev_algo may be reused */
cudnnConvolutionFwdAlgo_t prev_algo;  /* algorithm chosen on a previous call */
size_t prev_img_dims[5];              /* input shape seen on a previous call */
size_t prev_kern_dims[5];             /* kernel shape seen on a previous call */
/*
 * Select a cuDNN forward-convolution algorithm for the given tensors,
 * caching the choice across calls.
 *
 * input/kerns/output: device arrays for images, filters and result.
 * desc: convolution descriptor (padding, strides, dilation, mode, dtype).
 * output_algo: receives the selected algorithm.
 * params: op parameters (handle; choose_once: pick once and keep;
 *         choose_time: benchmark with cudnnFindConvolutionForwardAlgorithmEx
 *         instead of the heuristic cudnnGetConvolutionForwardAlgorithm).
 *
 * Returns 0 on success; nonzero with a Python exception set on failure.
 */
int
APPLY_SPECIFIC(choose_fwd_algo)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                                PyGpuArrayObject *output,
                                cudnnConvolutionDescriptor_t desc,
                                cudnnConvolutionFwdAlgo_t *output_algo,
                                PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

  cuda_enter(c->ctx);

  /* Validate that the given output buffer has exactly the shape cuDNN
     will produce for this descriptor. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      /* BUGFIX: expected_output_dims is int, so it must be printed with %d;
         the array dims are cast to long to match %ld (the original passed
         int arguments for %ld, which is undefined behavior). */
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3],
                   (long)PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  if (!params->choose_once) {
    /* Reuse the cached algorithm iff both shapes match the previous call. */
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;

    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }

    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      /* Benchmark the algorithms on the actual data and keep the fastest. */
      int count;
      cudnnConvolutionFwdAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUGFIX: leave the CUDA context before returning; every other
           error path below does cuda_exit, this one did not. */
        cuda_exit(c->ctx);
        return -1;
      }
      /* NOTE(review): *(void **)tmpmem presumably extracts the raw device
         pointer from the opaque gpudata handle — confirm against
         libgpuarray's gpudata layout. */
      err = cudnnFindConvolutionForwardAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
        1, &count, &choice, *(void **)tmpmem,
        free);
      gpudata_release(tmpmem);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      /* Cheap heuristic choice bounded by the available workspace. */
      err = cudnnGetConvolutionForwardAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
        desc, APPLY_SPECIFIC(output),
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes so the next call can detect a match. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
    }
  }

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {

    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct

/* No algorithm cached yet; start from the default gradient-of-inputs algo. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
if (!PARAMS->choose_once) {
  /* Zero the cached shapes so the first call never matches them. */
  memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
  memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct

/* Per-op cached state for gradient-of-inputs algorithm selection. */
int reuse_algo;                           /* 1 when prev_algo may be reused */
cudnnConvolutionBwdDataAlgo_t prev_algo;  /* algorithm chosen previously */
size_t prev_kern_dims[5];                 /* kernel shape seen previously */
size_t prev_top_dims[5];                  /* top-gradient shape seen previously */
/*
 * Select a cuDNN gradient-of-inputs (backward data) algorithm for the given
 * tensors, caching the choice across calls.
 *
 * kerns/output/input: device arrays for filters, top gradient and the
 *                     gradient w.r.t. the images being computed.
 * desc: convolution descriptor.
 * output_algo: receives the selected algorithm.
 * params: op parameters (handle; choose_once; choose_time — see fwd variant).
 *
 * Returns 0 on success; nonzero with a Python exception set on failure.
 */
int
APPLY_SPECIFIC(choose_bwd_data_algo)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                                     PyGpuArrayObject *input,
                                     cudnnConvolutionDescriptor_t desc,
                                     cudnnConvolutionBwdDataAlgo_t *output_algo,
                                     PARAMS_TYPE* params) {
  PyGpuContextObject *c = kerns->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdDataAlgo_t algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;

  cuda_enter(c->ctx);

  /* Validate that the top-gradient buffer has the shape cuDNN expects for
     the forward convolution described by desc. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      /* BUGFIX: expected_output_dims is int, so it must be printed with %d;
         the array dims are cast to long to match %ld (the original passed
         int arguments for %ld, which is undefined behavior). */
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3],
                   (long)PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  if (!params->choose_once) {
    /* Reuse the cached algorithm iff both shapes match the previous call. */
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;

    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }

    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      /* Benchmark the algorithms on the actual data and keep the fastest. */
      int count;
      cudnnConvolutionBwdDataAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUGFIX: leave the CUDA context before returning; every other
           error path below does cuda_exit, this one did not. */
        cuda_exit(c->ctx);
        return -1;
      }

      err = cudnnFindConvolutionBackwardDataAlgorithmEx(
        params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      /* Cheap heuristic choice bounded by the available workspace. */
      err = cudnnGetConvolutionBackwardDataAlgorithm(
        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(input),
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes so the next call can detect a match. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {

    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct

/* No algorithm cached yet; start from the default gradient-of-weights algo. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
if (!PARAMS->choose_once) {
  /* Zero the cached shapes so the first call never matches them. */
  memset(prev_img_dims, 0, sizeof(prev_img_dims));
  memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
#section support_code_struct

/* Per-op cached state for gradient-of-weights algorithm selection. */
int reuse_algo;                             /* 1 when prev_algo may be reused */
cudnnConvolutionBwdFilterAlgo_t prev_algo;  /* algorithm chosen previously */
size_t prev_img_dims[5];                    /* input shape seen previously */
size_t prev_top_dims[5];                    /* top-gradient shape seen previously */
/*
 * Select a cuDNN gradient-of-weights (backward filter) algorithm for the
 * given tensors, caching the choice across calls.
 *
 * input/output/kerns: device arrays for images, top gradient and the
 *                     gradient w.r.t. the filters being computed.
 * desc: convolution descriptor.
 * output_algo: receives the selected algorithm.
 * params: op parameters (handle; choose_once; choose_time — see fwd variant).
 *
 * Returns 0 on success; nonzero with a Python exception set on failure.
 */
int
APPLY_SPECIFIC(choose_bwd_filter_algo)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                                       PyGpuArrayObject *kerns,
                                       cudnnConvolutionDescriptor_t desc,
                                       cudnnConvolutionBwdFilterAlgo_t *output_algo,
                                       PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;

  cuda_enter(c->ctx);

  /* Validate that the top-gradient buffer has the shape cuDNN expects for
     the forward convolution described by desc. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      /* BUGFIX: expected_output_dims is int, so it must be printed with %d;
         the array dims are cast to long to match %ld (the original passed
         int arguments for %ld, which is undefined behavior). */
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
                   (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3],
                   (long)PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }

  if (!params->choose_once) {
    /* Reuse the cached algorithm iff both shapes match the previous call. */
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }

  if (!reuse_algo) {
    size_t free;

    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }

    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;

    if (params->choose_time) {
      /* Benchmark the algorithms on the actual data and keep the fastest. */
      int count;
      cudnnConvolutionBwdFilterAlgoPerf_t choice;
      gpudata *tmpmem;

      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* BUGFIX: leave the CUDA context before returning; every other
           error path below does cuda_exit, this one did not. */
        cuda_exit(c->ctx);
        return -1;
      }

      err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      /* Cheap heuristic choice bounded by the available workspace. */
      err = cudnnGetConvolutionBackwardFilterAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(kerns),
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes so the next call can detect a match. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) {

    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论