提交 560a4a56 authored 作者: notoraptor's avatar notoraptor

Wrap Op params for many gpuarray DNN Ops and add cuDNN v6 integration.

Ops rewritten: - GpuDnnConvDesc - GpuDnnPool - GpuDnnPoolGrad - GpuDnnConv - GpuDnnConvGradW - GpuDnnConvGradI - GpuDnnBatchNormInference - GpuDnnBatchNormGrad cuDNN v6 integration: - Support MAX DETERMINISTIC algorithm for GpuDnnPool with cuDNN v6. - Update pooling tests for DNN module so that they use the right available algorithms depending on runtime cuDNN version. - Allow CPU Pool and PoolGrad ops to use MAX_DETERMINISTIC algo when cuDNN v6 is used with GPU counterparts. - Encapsulate cuDNN constants used in DNN module, to help choose the right cuDNN definitions depending on the runtime cuDNN version. Currently supported cuDNN versions: v5.1, v6.0.
上级 043cb678
......@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
return True
# Those are the options provided by Theano to choose algorithms at runtime.
SUPPORTED_DNN_CONV_ALGO_RUNTIME = ('guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change')
# Those are the supported algorithm by Theano,
# The tests will reference those lists.
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_PRECISION = ('as_input_f32', 'as_input', 'float16', 'float32', 'float64')
AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
......@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).",
EnumStr('as_input_f32', 'as_input', 'float16', 'float32',
'float64'),
EnumStr(*SUPPORTED_DNN_CONV_PRECISION),
in_c_key=False)
......
......@@ -963,6 +963,12 @@ class EnumType(Type, dict):
"""
return alias in self.aliases
def get_aliases(self):
"""
Return the list of all aliases in this enumeration.
"""
return self.aliases.keys()
def __repr__(self):
names_to_aliases = {constant_name: '' for constant_name in self}
for alias in self.aliases:
......
#section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) {
cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) {
cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2};
int strides[3] = {SUB_0, SUB_1, SUB_2};
int dilation[3] = {DIL_0, DIL_1, DIL_2};
int pad[3] = {params->pad0, params->pad1, params->pad2};
int strides[3] = {params->sub0, params->sub1, params->sub2};
int dilation[3] = {params->dil0, params->dil1, params->dil2};
#if BORDER_MODE == 0
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0;
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1;
#if NB_DIMS > 2
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2;
#endif
#elif BORDER_MODE == 2
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0 + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1 + 1) / 2;
#if NB_DIMS > 2
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2 + 1) / 2;
#endif
#endif
if (params->bmode == BORDER_MODE_FULL) {
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
if (params->nb_dims > 2) {
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
}
} else if(params->bmode == BORDER_MODE_HALF) {
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
if (params->nb_dims > 2) {
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
}
}
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS,
"expected %d, got %lld.", params->nb_dims,
(long long)PyArray_DIM(filt_shp, 0));
return -1;
}
......@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides,
dilation, CONV_MODE, PRECISION);
err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
dilation, params->conv_mode, params->precision);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not set convolution "
"descriptor: %s", cudnnGetErrorString(err));
......
"""
Declarations of cuDNN types and constants used in Theano gpuarray DNN module.
For every cuDNN API supported by Theano, this module defines a class that
provides the set of cuDNN definitions to be used in Theano Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1
- v6.0
"""
from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51(object):
version = 5
cudnnConvolutionMode_t = CEnumType(('CUDNN_CONVOLUTION', 'conv'),
('CUDNN_CROSS_CORRELATION', 'cross'),
ctype='cudnnConvolutionMode_t')
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'),
# CUDNN_DATA_INT8 # new in v6
# CUDNN_DATA_INT32 # new in v6
# CUDNN_DATA_INT8x4 # new in v6
ctype='cudnnDataType_t')
cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM', 'small'),
('CUDNN_CONVOLUTION_FWD_ALGO_GEMM', 'large'),
# not implemented:
('CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD', 'winograd'),
# Not yet tested/documented:
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionFwdAlgo_t')
conv3d_fwd_algorithms = ('none', 'small', 'fft_tiling')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
# not implemented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
# not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
conv3d_bwd_filter_algorithms = ('none', 'small')
cudnnConvolutionBwdDataAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_DATA_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD', 'winograd'),
# not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdDataAlgo_t')
conv3d_bwd_data_algorithms = ('none', 'deterministic', 'fft_tiling')
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
ctype='cudnnPoolingMode_t')
cudnnSoftmaxAlgorithm_t = CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'),
('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
('CUDNN_SOFTMAX_LOG', 'log'),
ctype='cudnnSoftmaxAlgorithm_t')
cudnnSoftmaxMode_t = CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
ctype='cudnnSoftmaxMode_t')
cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
ctype='cudnnBatchNormMode_t')
class CuDNNV6(CuDNNV51):
version = 6
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
# tested but not yet documented:
# new in v6:
('CUDNN_POOLING_MAX_DETERMINISTIC', 'max_deterministic'),
ctype='cudnnPoolingMode_t')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
# not yet tested/documented:
# new in v6:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
def get_definitions(cudnn_version=None):
"""
Return cuDNN definitions to be used by Theano for the given cuDNN version.
``cudnn_version`` must be None or an integer
(typically the version returned by :func:`theano.gpuarray.dnn.version`).
if None, return definitions for the most recent supported cuDNN version.
"""
if cudnn_version is not None and cudnn_version // 1000 == 5:
return CuDNNV51()
# By default, we use definitions for the last supported cuDNN version.
return CuDNNV6()
差异被折叠。
......@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationBackward(
_handle,
MODE,
params->handle,
params->mode,
alphaData,
betaData,
alphaParam,
......
......@@ -3,7 +3,7 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuArrayObject **outp, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
} else {
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
}
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
......@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
_handle,
MODE,
params->handle,
params->mode,
alpha,
beta,
bn_input,
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
......@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
} else {
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
......@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (!reuse_algo) {
size_t free;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
......@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
} else {
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
if (stride[0] != 1 || stride[1] != 1) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
......@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
// #ifdef CHOOSE_ALGO
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
// #ifndef CHOOSE_ONCE
if (!PARAMS->choose_once) {
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
// #endif
}
// #endif
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo = 0;
cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0};
#endif
#endif
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = kerns->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
// #ifdef CONV_INPLACE
if (params->inplace) {
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
// #else
} else {
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
}
// #endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0);
......@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
// #ifdef CHOOSE_ALGO
if (params->choose_algo) {
// #ifndef CHOOSE_ONCE
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// #endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
// #ifdef CHOOSE_TIME
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
algo = choice.algo;
// #else
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
// #endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
// #ifdef CHOOSE_ONCE
if (params->choose_once) {
reuse_algo = 1;
// #else
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
// #endif
}
#endif
#endif
// #endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
......@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_top_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
} else {
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0);
......@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
if (!reuse_algo) {
size_t free;
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
......@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = img->context;
size_t dims[5];
cudnnStatus_t err;
......@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err;
......@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
......@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
mode_with_gpu.check_py_code = False
# This variable will store the list of pooling modes available with the current runtime cuDNN version.
# Don't use this variable directly, always call `get_dnn_pool_modes()` instead.
dnn_pool_modes = None
def get_dnn_pool_modes():
# This function is called only by pooling tests to initialize and/or get dnn_pool_modes.
global dnn_pool_modes
if dnn_pool_modes is None:
from .. import cudnn_defs
dnn_pool_modes = cudnn_defs.get_definitions(dnn.version(raises=False)).cudnnPoolingMode_t.get_aliases()
return dnn_pool_modes
# If using float16, set CUDNN precision to float32
def set_precision(floatX):
if floatX == "float16":
......@@ -155,11 +169,7 @@ def test_pooling():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor4()
for mode, pad in product(modes,
......@@ -242,7 +252,9 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()])
def test_pooling_with_tensor_vars():
# This test will be run with different values of 'mode'
# (see next test below).
def run_pooling_with_tensor_vars(mode):
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
......@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
ws = theano.shared(np.array([2, 2], dtype='int32'))
stride = theano.shared(np.array([1, 1], dtype='int32'))
pad = theano.shared(np.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x):
dnn_op = dnn.dnn_pool(
......@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
i += 1
def test_pooling_with_tensor_vars():
# Let's test for mode 'max' and also for 'max_deterministic' if available.
for mode in [m for m in get_dnn_pool_modes() if m in ('max', 'max_deterministic')]:
yield (run_pooling_with_tensor_vars, mode)
def test_pooling3d():
# 3d pooling requires version 3 or newer.
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000:
......@@ -307,11 +324,7 @@ def test_pooling3d():
mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpuarray')
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor5()
for mode, pad in product(modes,
......@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
for n_non_pool_dims in (0, 1, 2, 3):
for ws in ((2, 2), (3, 3, 3)):
......@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
fc = theano.function([], out, mode=mode_without_gpu)
assert any([isinstance(node.op, Pool)
for node in fc.maker.fgraph.toposort()])
if mode == 'max':
if mode in ('max', 'max_deterministic'):
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
......@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1), (2, 2), (3, 3)],
......@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
......@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......
......@@ -433,6 +433,9 @@ class Pool(OpenMPOp):
super(Pool, self).__init__(openmp=openmp)
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......@@ -1040,6 +1043,9 @@ class PoolGrad(OpenMPOp):
def __init__(self, ignore_border, mode='max', ndim=2, openmp=None):
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool grad algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论