Commit 560a4a56 authored by notoraptor

Wrap Op params for many gpuarray DNN Ops and add cuDNN v6 integration.

Ops rewritten: - GpuDnnConvDesc - GpuDnnPool - GpuDnnPoolGrad - GpuDnnConv - GpuDnnConvGradW - GpuDnnConvGradI - GpuDnnBatchNormInference - GpuDnnBatchNormGrad cuDNN v6 integration: - Support MAX DETERMINISTIC algorithm for GpuDnnPool with cuDNN v6. - Update pooling tests for DNN module so that they use the right available algorithms depending on runtime cuDNN version. - Allow CPU Pool and PoolGrad ops to use MAX_DETERMINISTIC algo when cuDNN v6 is used with GPU counterparts. - Encapsulate cuDNN constants used in DNN module, to help choose the right cuDNN definitions depending on the runtime cuDNN version. Currently supported cuDNN versions: v5.1, v6.0.
Parent 043cb678
...@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo): ...@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.') '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
return True return True
# These are the options provided by Theano to choose a convolution
# algorithm at runtime ('guess_*' use cuDNN heuristics, 'time_*' benchmark
# the candidate algorithms).
SUPPORTED_DNN_CONV_ALGO_RUNTIME = ('guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change')
# Those are the supported algorithm by Theano, # Those are the supported algorithm by Theano,
# The tests will reference those lists. # The tests will reference those lists.
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change') SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small', SUPPORTED_DNN_CONV_PRECISION = ('as_input_f32', 'as_input', 'float16', 'float32', 'float64')
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
AddConfigVar('dnn.conv.algo_bwd', AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and " "This flag is deprecated; use dnn.conv.algo_bwd_data and "
...@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision', ...@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in cuDNN " "Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the " "convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).", "convolutions, or float32 if inputs are float16).",
EnumStr('as_input_f32', 'as_input', 'float16', 'float32', EnumStr(*SUPPORTED_DNN_CONV_PRECISION),
'float64'),
in_c_key=False) in_c_key=False)
......
...@@ -963,6 +963,12 @@ class EnumType(Type, dict): ...@@ -963,6 +963,12 @@ class EnumType(Type, dict):
""" """
return alias in self.aliases return alias in self.aliases
def get_aliases(self):
"""
Return the list of all aliases in this enumeration.
"""
return self.aliases.keys()
def __repr__(self): def __repr__(self):
names_to_aliases = {constant_name: '' for constant_name in self} names_to_aliases = {constant_name: '' for constant_name in self}
for alias in self.aliases: for alias in self.aliases:
......
#section support_code_apply #section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) { cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) {
cudnnStatus_t err; cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2}; int pad[3] = {params->pad0, params->pad1, params->pad2};
int strides[3] = {SUB_0, SUB_1, SUB_2}; int strides[3] = {params->sub0, params->sub1, params->sub2};
int dilation[3] = {DIL_0, DIL_1, DIL_2}; int dilation[3] = {params->dil0, params->dil1, params->dil2};
#if BORDER_MODE == 0 if (params->bmode == BORDER_MODE_FULL) {
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0; pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1; pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
#if NB_DIMS > 2 if (params->nb_dims > 2) {
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2; pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
#endif }
#elif BORDER_MODE == 2 } else if(params->bmode == BORDER_MODE_HALF) {
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0 + 1) / 2; pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1 + 1) / 2; pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
#if NB_DIMS > 2 if (params->nb_dims > 2) {
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2 + 1) / 2; pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
#endif }
#endif }
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) { if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: " PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS, "expected %d, got %lld.", params->nb_dims,
(long long)PyArray_DIM(filt_shp, 0)); (long long)PyArray_DIM(filt_shp, 0));
return -1; return -1;
} }
...@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, ...@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1; return -1;
} }
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
dilation, CONV_MODE, PRECISION); dilation, params->conv_mode, params->precision);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not set convolution " PyErr_Format(PyExc_MemoryError, "could not set convolution "
"descriptor: %s", cudnnGetErrorString(err)); "descriptor: %s", cudnnGetErrorString(err));
......
"""
Declarations of cuDNN types and constants used in Theano gpuarray DNN module.
For every cuDNN API supported by Theano, this module defines a class that
provides the set of cuDNN definitions to be used in Theano Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1
- v6.0
"""
from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51(object):
    """
    cuDNN definitions for cuDNN v5.1.

    Each ``cudnn*_t`` attribute mirrors the homonymous cuDNN C enum as a
    :class:`CEnumType`, mapping cuDNN constant names to the Theano alias
    strings used on the Python side.  Constants listed without an alias
    (plain strings) exist in the cuDNN enum but are not usable from Theano.
    The ``conv3d_*_algorithms`` tuples list the aliases supported for
    3D convolutions with this cuDNN version.
    """
    # Major cuDNN version these definitions correspond to.
    version = 5
    cudnnConvolutionMode_t = CEnumType(('CUDNN_CONVOLUTION', 'conv'),
                                       ('CUDNN_CROSS_CORRELATION', 'cross'),
                                       ctype='cudnnConvolutionMode_t')
    cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
                                ('CUDNN_DATA_DOUBLE', 'float64'),
                                ('CUDNN_DATA_HALF', 'float16'),
                                # CUDNN_DATA_INT8 # new in v6
                                # CUDNN_DATA_INT32 # new in v6
                                # CUDNN_DATA_INT8x4 # new in v6
                                ctype='cudnnDataType_t')
    cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM', 'small'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_GEMM', 'large'),
                                          # not implemented:
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_FFT', 'fft'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING', 'fft_tiling'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD', 'winograd'),
                                          # Not yet tested/documented:
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                          ctype='cudnnConvolutionFwdAlgo_t')
    # Forward-conv aliases usable with 5D (3D convolution) inputs.
    conv3d_fwd_algorithms = ('none', 'small', 'fft_tiling')
    cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
                                                # not implemented:
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
                                                # not yet tested/documented:
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                                ctype='cudnnConvolutionBwdFilterAlgo_t')
    # Backward-filter aliases usable with 5D (3D convolution) inputs.
    conv3d_bwd_filter_algorithms = ('none', 'small')
    cudnnConvolutionBwdDataAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_DATA_ALGO_0', 'none'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_1', 'deterministic'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT', 'fft'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING', 'fft_tiling'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD', 'winograd'),
                                              # not yet tested/documented:
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                              ctype='cudnnConvolutionBwdDataAlgo_t')
    # Backward-data aliases usable with 5D (3D convolution) inputs.
    conv3d_bwd_data_algorithms = ('none', 'deterministic', 'fft_tiling')
    cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
                                   ctype='cudnnPoolingMode_t')
    cudnnSoftmaxAlgorithm_t = CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'),
                                        ('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
                                        ('CUDNN_SOFTMAX_LOG', 'log'),
                                        ctype='cudnnSoftmaxAlgorithm_t')
    cudnnSoftmaxMode_t = CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
                                   ('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
                                   ctype='cudnnSoftmaxMode_t')
    cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
                                     ('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
                                     ctype='cudnnBatchNormMode_t')
class CuDNNV6(CuDNNV51):
    """
    cuDNN definitions for cuDNN v6.0.

    Inherits everything from :class:`CuDNNV51` and overrides only the enums
    that changed in v6: pooling gains CUDNN_POOLING_MAX_DETERMINISTIC, and
    backward-filter gains FFT_TILING and an alias for WINOGRAD_NONFUSED.
    """
    # Major cuDNN version these definitions correspond to.
    version = 6
    cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
                                   # tested but not yet documented:
                                   # new in v6:
                                   ('CUDNN_POOLING_MAX_DETERMINISTIC', 'max_deterministic'),
                                   ctype='cudnnPoolingMode_t')
    cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
                                                # no alias: still not usable from Theano.
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                                # not yet tested/documented:
                                                # new in v6:
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
                                                ctype='cudnnConvolutionBwdFilterAlgo_t')
def get_definitions(cudnn_version=None):
    """
    Return cuDNN definitions to be used by Theano for the given cuDNN version.

    ``cudnn_version`` must be None or an integer
    (typically the version returned by :func:`theano.gpuarray.dnn.version`).
    If None, return definitions for the most recent supported cuDNN version.
    """
    if cudnn_version is None or cudnn_version // 1000 != 5:
        # Anything but an explicit v5.x gets the definitions for the most
        # recent supported cuDNN version.
        return CuDNNV6()
    return CuDNNV51()
...@@ -9,10 +9,10 @@ from six import integer_types ...@@ -9,10 +9,10 @@ from six import integer_types
import theano import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log, get_scalar_type from theano.scalar import as_scalar, constant, Log, get_scalar_type, int32 as int_t, bool as bool_t
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, CEnumType from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
...@@ -28,7 +28,7 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -28,7 +28,7 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape) assert_conv_shape)
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu, cudnn_defs
from .type import (get_context, gpu_context_type, list_contexts, from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable) GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
...@@ -44,7 +44,10 @@ from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2, ...@@ -44,7 +44,10 @@ from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
# Runtime algorithm specifiers that select an algorithm a single time
# (not re-selected on shape change).
DNN_CONV_ALGO_CHOOSE_ONCE = ['guess_once', 'time_once']
# Runtime algorithm specifiers that benchmark candidate algorithms
# instead of relying on cuDNN heuristics.
DNN_CONV_ALGO_CHOOSE_TIME = ['time_once', 'time_on_shape_change']
try: try:
from pygpu import gpuarray from pygpu import gpuarray
...@@ -59,12 +62,12 @@ def _dnn_lib(): ...@@ -59,12 +62,12 @@ def _dnn_lib():
lib_name = ctypes.util.find_library('cudnn') lib_name = ctypes.util.find_library('cudnn')
if lib_name is None and sys.platform == 'win32': if lib_name is None and sys.platform == 'win32':
# Update these names when new versions of cudnn are supported. # Update these names when new versions of cudnn are supported.
for name in ['cudnn64_5.dll']: for name in ['cudnn64_6.dll', 'cudnn64_5.dll']:
lib_name = ctypes.util.find_library(name) lib_name = ctypes.util.find_library(name)
if lib_name: if lib_name:
break break
if lib_name is None: if lib_name is None:
raise RuntimeError('Could not find cudnn library (looked for v5[.1])') raise RuntimeError('Could not find cudnn library (looked for v5* or v6*)')
_dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name) _dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
cudnn = _dnn_lib.handle cudnn = _dnn_lib.handle
cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)] cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
...@@ -116,10 +119,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -116,10 +119,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
# default gpu, not the one selected by the user. If mixed # default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in # GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
avail, out, err = GCC_compiler.try_flags(
# NB: GCC_compiler.try_flags() may return just a boolean instead of a tuple (avail, out, err).
compiler_res = GCC_compiler.try_flags(
params, preambule=preambule, body=body, params, preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
avail, out, err = compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None)
if not avail: if not avail:
return False, ("cannot compile with cuDNN. " return False, ("cannot compile with cuDNN. "
"We got this error:\n" + str(err)) "We got this error:\n" + str(err))
...@@ -129,13 +136,12 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -129,13 +136,12 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
def _dnn_check_version(): def _dnn_check_version():
v = version() v = version()
if v < 5000: if v < 5000:
return False, "cuDNN version is too old. Update to v5, was %d." % v return False, "cuDNN version is too old. Update to v5* or higher, was %d." % v
# 5200 should not print warning with cudnn 5.1 final.
if v >= 6100: if v >= 6100:
warnings.warn("Your cuDNN version is more recent than " warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try " "Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to " "updating Theano or downgrading cuDNN to "
"version 6.0.") "a version >= v5 and < v6.1.")
return True, None return True, None
...@@ -281,6 +287,9 @@ handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy', ...@@ -281,6 +287,9 @@ handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy',
lib_dirs=[config.dnn.library_path], lib_dirs=[config.dnn.library_path],
version=version(raises=False)) version=version(raises=False))
# Get cuDNN definitions to be used.
cudnn = cudnn_defs.get_definitions(version(raises=False))
def get_precision(precision, inputs): def get_precision(precision, inputs):
if precision is None: if precision is None:
...@@ -367,6 +376,15 @@ class GpuDnnConvDesc(COp): ...@@ -367,6 +376,15 @@ class GpuDnnConvDesc(COp):
""" """
__props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode', 'precision') __props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode', 'precision')
params_type = ParamsType(pad0=int_t, pad1=int_t, pad2=int_t,
sub0=int_t, sub1=int_t, sub2=int_t,
dil0=int_t, dil1=int_t, dil2=int_t,
nb_dims=int_t,
bmode=EnumList(('BORDER_MODE_FULL', 'full'),
('BORDER_MODE_VALID', 'valid'),
('BORDER_MODE_HALF', 'half')),
conv_mode=cudnn.cudnnConvolutionMode_t,
precision=cudnn.cudnnDataType_t)
def c_headers(self): def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h'] return ['cudnn.h', 'cudnn_helper.h']
...@@ -404,13 +422,13 @@ class GpuDnnConvDesc(COp): ...@@ -404,13 +422,13 @@ class GpuDnnConvDesc(COp):
self.border_mode = border_mode self.border_mode = border_mode
assert len(subsample) in (2, 3) assert len(subsample) in (2, 3)
self.subsample = subsample self.subsample = subsample
assert conv_mode in ('conv', 'cross') assert cudnn.cudnnConvolutionMode_t.has_alias(conv_mode)
self.conv_mode = conv_mode self.conv_mode = conv_mode
assert len(dilation) == len(subsample) assert len(dilation) == len(subsample)
self.dilation = dilation self.dilation = dilation
assert precision in ['float16', 'float32', 'float64'] assert cudnn.cudnnDataType_t.has_alias(precision)
self.precision = precision self.precision = precision
def make_node(self, kern_shape): def make_node(self, kern_shape):
...@@ -430,59 +448,18 @@ class GpuDnnConvDesc(COp): ...@@ -430,59 +448,18 @@ class GpuDnnConvDesc(COp):
out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
return node return node
def get_op_params(self): bmode = property(lambda self: 'valid' if isinstance(self.border_mode, tuple) else self.border_mode)
pad0 = '0' pad0 = property(lambda self: self.border_mode[0] if isinstance(self.border_mode, tuple) else 0)
pad1 = '0' pad1 = property(lambda self: self.border_mode[1] if isinstance(self.border_mode, tuple) else 0)
pad2 = '0' pad2 = property(lambda self: self.border_mode[2] if (isinstance(self.border_mode, tuple) and
if isinstance(self.border_mode, tuple): len(self.border_mode) > 2) else 0)
pad0 = str(self.border_mode[0]) sub0 = property(lambda self: self.subsample[0])
pad1 = str(self.border_mode[1]) sub1 = property(lambda self: self.subsample[1])
if len(self.border_mode) > 2: sub2 = property(lambda self: self.subsample[2] if len(self.subsample) > 2 else 0)
pad2 = str(self.border_mode[2]) dil0 = property(lambda self: self.dilation[0])
bmode = '1' dil1 = property(lambda self: self.dilation[1])
elif self.border_mode == "valid": dil2 = property(lambda self: self.dilation[2] if len(self.dilation) > 2 else 0)
bmode = '1' nb_dims = property(lambda self: len(self.subsample))
elif self.border_mode == "half":
bmode = '2'
elif self.border_mode == "full":
bmode = '0'
else:
raise ValueError("Invalid value for border_mode")
if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION'
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
sub0 = str(self.subsample[0])
sub1 = str(self.subsample[1])
if len(self.subsample) > 2:
sub2 = str(self.subsample[2])
else:
sub2 = '0'
dil0 = str(self.dilation[0])
dil1 = str(self.dilation[1])
if len(self.dilation) > 2:
dil2 = str(self.dilation[2])
else:
dil2 = '0'
if self.precision == 'float16':
precision = 'CUDNN_DATA_HALF'
elif self.precision == 'float32':
precision = 'CUDNN_DATA_FLOAT'
else:
assert self.precision == 'float64'
precision = 'CUDNN_DATA_DOUBLE'
return [('NB_DIMS', str(len(self.subsample))),
('BORDER_MODE', bmode),
('PAD_0', pad0), ('PAD_1', pad1), ('PAD_2', pad2),
('DIL_0', dil0), ('DIL_1', dil1), ('DIL_2', dil2),
('CONV_MODE', conv_flag),
('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2),
('PRECISION', precision)]
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(GpuDnnConvDesc, self).c_code_cache_version(), version()) return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
...@@ -533,6 +510,12 @@ class GpuDnnConv(DnnBase): ...@@ -533,6 +510,12 @@ class GpuDnnConv(DnnBase):
_f16_ok = True _f16_ok = True
__props__ = ('algo', 'inplace') __props__ = ('algo', 'inplace')
check_input = False
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, algo=None, inplace=False): def __init__(self, algo=None, inplace=False):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)") "APPLY_SPECIFIC(conv_fwd)")
...@@ -541,13 +524,18 @@ class GpuDnnConv(DnnBase): ...@@ -541,13 +524,18 @@ class GpuDnnConv(DnnBase):
algo = config.dnn.conv.algo_fwd algo = config.dnn.conv.algo_fwd
self.algo = algo self.algo = algo
self.inplace = inplace self.inplace = bool(inplace)
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling', assert cudnn.cudnnConvolutionFwdAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change'] self.conv_algo = cudnn.cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
self.conv_algo = self.algo
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -559,38 +547,6 @@ class GpuDnnConv(DnnBase): ...@@ -559,38 +547,6 @@ class GpuDnnConv(DnnBase):
if not hasattr(self, 'inplace'): if not hasattr(self, 'inplace'):
self.inplace = False self.inplace = False
def get_op_params(self):
defs = []
if self.inplace:
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif self.algo == 'small': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
elif self.algo == 'direct':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
elif self.algo == 'winograd':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD'
defs.append(('CONV_ALGO', alg))
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
return defs
def make_node(self, img, kern, output, desc, alpha=None, beta=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None):
ctx_name = infer_context_name(img, kern, output) ctx_name = infer_context_name(img, kern, output)
img = as_gpuarray_variable(img, ctx_name) img = as_gpuarray_variable(img, ctx_name)
...@@ -609,7 +565,7 @@ class GpuDnnConv(DnnBase): ...@@ -609,7 +565,7 @@ class GpuDnnConv(DnnBase):
raise TypeError("The number of dimensions of " raise TypeError("The number of dimensions of "
"img, kern and output must match") "img, kern and output must match")
if img.type.ndim == 5 and self.algo in ['large', 'fft']: if img.type.ndim == 5 and self.algo not in cudnn.conv3d_fwd_algorithms:
raise ValueError("convolution algo %s can't be used for " raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,)) "3d convolutions", (self.algo,))
...@@ -687,17 +643,30 @@ class GpuDnnConvGradW(DnnBase): ...@@ -687,17 +643,30 @@ class GpuDnnConvGradW(DnnBase):
_f16_ok = True _f16_ok = True
__props__ = ('algo', 'inplace') __props__ = ('algo', 'inplace')
check_input = False
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, inplace=False, algo=None): def __init__(self, inplace=False, algo=None):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace self.inplace = bool(inplace)
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None: if algo is None:
algo = config.dnn.conv.algo_bwd_filter algo = config.dnn.conv.algo_bwd_filter
self.algo = algo self.algo = algo
assert self.algo in SUPPORTED_DNN_CONV_ALGO_BWD_FILTER assert cudnn.cudnnConvolutionBwdFilterAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.conv_algo = cudnn.cudnnConvolutionBwdFilterAlgo_t.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
self.conv_algo = self.algo
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -724,33 +693,6 @@ class GpuDnnConvGradW(DnnBase): ...@@ -724,33 +693,6 @@ class GpuDnnConvGradW(DnnBase):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
defs = []
if self.inplace:
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo == 'small': # 3d
# non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def op_may_fail_with_subsample(self, img, desc): def op_may_fail_with_subsample(self, img, desc):
return (version() < 6000 and return (version() < 6000 and
img.type.dtype == 'float32' and img.type.dtype == 'float32' and
...@@ -793,8 +735,7 @@ class GpuDnnConvGradW(DnnBase): ...@@ -793,8 +735,7 @@ class GpuDnnConvGradW(DnnBase):
raise TypeError("The number of dimensions of " raise TypeError("The number of dimensions of "
"img, topgrad and output must match") "img, topgrad and output must match")
if (img.type.ndim == 5 and if img.type.ndim == 5 and self.algo not in cudnn.conv3d_bwd_filter_algorithms:
self.algo in ['fft', 'deterministic']):
raise ValueError("convolution algo %s can't be used for " raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,)) "3d convolutions", (self.algo,))
...@@ -830,19 +771,30 @@ class GpuDnnConvGradI(DnnBase): ...@@ -830,19 +771,30 @@ class GpuDnnConvGradI(DnnBase):
_f16_ok = True _f16_ok = True
__props__ = ('algo', 'inplace',) __props__ = ('algo', 'inplace',)
check_input = False
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, inplace=False, algo=None): def __init__(self, inplace=False, algo=None):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace self.inplace = bool(inplace)
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None: if algo is None:
algo = config.dnn.conv.algo_bwd_data algo = config.dnn.conv.algo_bwd_data
self.algo = algo self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling', assert cudnn.cudnnConvolutionBwdDataAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change'] self.conv_algo = cudnn.cudnnConvolutionBwdDataAlgo_t.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
self.conv_algo = self.algo
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -869,36 +821,6 @@ class GpuDnnConvGradI(DnnBase): ...@@ -869,36 +821,6 @@ class GpuDnnConvGradI(DnnBase):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
defs = []
if self.inplace:
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
elif self.algo == 'deterministic': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d
# big workspace but less than fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
elif self.algo == 'winograd':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
ctx_name = infer_context_name(kern, topgrad, output) ctx_name = infer_context_name(kern, topgrad, output)
kern = as_gpuarray_variable(kern, ctx_name) kern = as_gpuarray_variable(kern, ctx_name)
...@@ -916,7 +838,7 @@ class GpuDnnConvGradI(DnnBase): ...@@ -916,7 +838,7 @@ class GpuDnnConvGradI(DnnBase):
raise TypeError("The number of dimensions of " raise TypeError("The number of dimensions of "
"kern, topgrad and output must match") "kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft']: if kern.type.ndim == 5 and self.algo not in cudnn.conv3d_bwd_data_algorithms:
raise ValueError("convolution algo %s can't be used for " raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,)) "3d convolutions", (self.algo,))
...@@ -1349,7 +1271,33 @@ class GpuDnnPoolDesc(Op): ...@@ -1349,7 +1271,33 @@ class GpuDnnPoolDesc(Op):
return (4, version()) return (4, version())
class GpuDnnPool(DnnBase): class GpuDnnPoolBase(DnnBase):
"""
Abstract base class for GpuDnnPool and GpuDnnPoolGrad.
"""
# c_file and c_function must be defined in sub-classes.
c_file = None
c_function = None
_f16_ok = True
__props__ = ('mode',)
check_input = False
params_type = ParamsType(mode=cudnn.cudnnPoolingMode_t,
handle=handle_type)
def __init__(self, mode='max'):
DnnBase.__init__(self, [self.c_file], self.c_function)
if mode == 'average':
mode = 'average_inc_pad'
# Supported modes depend on runtime cuDNN version.
assert cudnn.cudnnPoolingMode_t.has_alias(mode)
self.mode = mode
class GpuDnnPool(GpuDnnPoolBase):
""" """
Parameters Parameters
...@@ -1366,25 +1314,8 @@ class GpuDnnPool(DnnBase): ...@@ -1366,25 +1314,8 @@ class GpuDnnPool(DnnBase):
(padX, padY) or (padX, padY, padZ) (padX, padY) or (padX, padY, padZ)
""" """
_f16_ok = True c_file = "dnn_pool.c"
__props__ = ('mode',) c_function = "APPLY_SPECIFIC(dnn_pool)"
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
return [('MODE_FLAG', mode_flag)]
def make_node(self, img, ws, stride, pad): def make_node(self, img, ws, stride, pad):
ctx_name = infer_context_name(img) ctx_name = infer_context_name(img)
...@@ -1428,7 +1359,7 @@ class GpuDnnPool(DnnBase): ...@@ -1428,7 +1359,7 @@ class GpuDnnPool(DnnBase):
return [[1], [0], [0], [0]] return [[1], [0], [0], [0]]
class GpuDnnPoolGrad(DnnBase): class GpuDnnPoolGrad(GpuDnnPoolBase):
""" """
The pooling gradient. The pooling gradient.
...@@ -1451,26 +1382,8 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1451,26 +1382,8 @@ class GpuDnnPoolGrad(DnnBase):
(padX, padY) or (padX, padY, padZ) (padX, padY) or (padX, padY, padZ)
""" """
_f16_ok = True c_file = "dnn_pool_grad.c"
__props__ = ('mode',) c_function = "APPLY_SPECIFIC(dnn_pool_grad)"
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool_grad.c"],
"APPLY_SPECIFIC(dnn_pool_grad)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
return [('MODE_FLAG', mode_flag)]
def make_node(self, inp, out, out_grad, ws, stride, pad): def make_node(self, inp, out, out_grad, ws, stride, pad):
ctx_name = infer_context_name(inp, out, out_grad) ctx_name = infer_context_name(inp, out, out_grad)
...@@ -1513,7 +1426,8 @@ def dnn_pool(img, ws, stride=None, mode='max', pad=None): ...@@ -1513,7 +1426,8 @@ def dnn_pool(img, ws, stride=None, mode='max', pad=None):
Subsampling window size. Should have 2 or 3 elements. Subsampling window size. Should have 2 or 3 elements.
stride : tuple stride : tuple
Subsampling stride (default: (1, 1) or (1, 1, 1)). Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'} mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum', 'max_deterministic'}
**NB**: 'max_deterministic' is supported since cuDNN v6.
pad : tuple pad : tuple
(padX, padY) or (padX, padY, padZ) (padX, padY) or (padX, padY, padZ)
default: (0, 0) or (0, 0, 0) default: (0, 0) or (0, 0, 0)
...@@ -1562,22 +1476,17 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1562,22 +1476,17 @@ class GpuDnnSoftmaxBase(DnnBase):
# neither in dnn_base.c nor in dnn_softmax*.c, # neither in dnn_base.c nor in dnn_softmax*.c,
# so we can disable input checking. # so we can disable input checking.
check_input = False check_input = False
params_type = ParamsType(algo=CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'), params_type = ParamsType(algo=cudnn.cudnnSoftmaxAlgorithm_t,
('CUDNN_SOFTMAX_LOG', 'log'), mode=cudnn.cudnnSoftmaxMode_t,
('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
ctype='cudnnSoftmaxAlgorithm_t'),
mode=CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
ctype='cudnnSoftmaxMode_t'),
handle=handle_type) handle=handle_type)
def __init__(self, algo, mode): def __init__(self, algo, mode):
DnnBase.__init__(self, [self.file], self.c_func) DnnBase.__init__(self, [self.file], self.c_func)
assert(algo in ('fast', 'accurate', 'log')) assert cudnn.cudnnSoftmaxAlgorithm_t.has_alias(algo)
self.algo = algo self.algo = algo
assert(mode in ('instance', 'channel')) assert cudnn.cudnnSoftmaxMode_t.has_alias(mode)
self.mode = mode self.mode = mode
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
...@@ -1810,13 +1719,18 @@ class GpuDnnBatchNormInference(DnnBase): ...@@ -1810,13 +1719,18 @@ class GpuDnnBatchNormInference(DnnBase):
__props__ = ('mode', 'inplace') __props__ = ('mode', 'inplace')
check_input = False
params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, mode='per-activation', inplace=False): def __init__(self, mode='per-activation', inplace=False):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'], DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
'dnn_batchnorm_op') 'dnn_batchnorm_op')
assert (mode in ('per-activation', 'spatial')) assert cudnn.cudnnBatchNormMode_t.has_alias(mode)
self.mode = mode self.mode = mode
self.inplace = inplace self.inplace = bool(inplace)
if self.inplace: if self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
...@@ -1825,15 +1739,6 @@ class GpuDnnBatchNormInference(DnnBase): ...@@ -1825,15 +1739,6 @@ class GpuDnnBatchNormInference(DnnBase):
if not hasattr(self, 'inplace'): if not hasattr(self, 'inplace'):
self.inplace = False self.inplace = False
def get_op_params(self):
params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION")))
return params
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
...@@ -1882,20 +1787,17 @@ class GpuDnnBatchNormInference(DnnBase): ...@@ -1882,20 +1787,17 @@ class GpuDnnBatchNormInference(DnnBase):
class GpuDnnBatchNormGrad(DnnBase): class GpuDnnBatchNormGrad(DnnBase):
__props__ = ('mode',) __props__ = ('mode',)
check_input = False
params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t,
handle=handle_type)
def __init__(self, mode='per-activation'): def __init__(self, mode='per-activation'):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'], DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'],
'dnn_batchnorm_grad') 'dnn_batchnorm_grad')
assert (mode in ('per-activation', 'spatial')) assert cudnn.cudnnBatchNormMode_t.has_alias(mode)
self.mode = mode self.mode = mode
def get_op_params(self):
params = []
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION")))
return params
def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4): def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
ctx_name = infer_context_name(x, dy, scale, x_mean, x_invstd) ctx_name = infer_context_name(x, dy, scale, x_mean, x_invstd)
x = as_gpuarray_variable(x, ctx_name) x = as_gpuarray_variable(x, ctx_name)
......
...@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp, ...@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean, PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon, PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale, PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, cudnnHandle_t _handle) { PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
...@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp, ...@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta; betaParam = (void *)&fbeta;
} }
cudnnStatus_t err = cudnnBatchNormalizationBackward( cudnnStatus_t err = cudnnBatchNormalizationBackward(
_handle, params->handle,
MODE, params->mode,
alphaData, alphaData,
betaData, betaData,
alphaParam, alphaParam,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean, PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon, PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, cudnnHandle_t _handle) { PyGpuArrayObject **outp, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
...@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1; return 1;
} }
#ifdef INPLACE_OUTPUT if (params->inplace) {
Py_XDECREF(*outp); Py_XDECREF(*outp);
*outp = inp; *outp = inp;
Py_INCREF(*outp); Py_INCREF(*outp);
#else } else {
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0) if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
#endif }
if (c_set_tensorNd(*outp, bn_output) != 0) if (c_set_tensorNd(*outp, bn_output) != 0)
return 1; return 1;
...@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta; beta = (void *)&fbeta;
} }
cudnnStatus_t err = cudnnBatchNormalizationForwardInference( cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
_handle, params->handle,
MODE, params->mode,
alpha, alpha,
beta, beta,
bn_input, bn_input,
......
#section init_code_struct #section init_code_struct
#ifdef CHOOSE_ALGO if (PARAMS->choose_algo) {
reuse_algo = 0; reuse_algo = 0;
prev_algo = CONV_ALGO; prev_algo = PARAMS->conv_algo;
#ifndef CHOOSE_ONCE if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif }
#endif }
#section support_code_struct #section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo; int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo; cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5]; size_t prev_img_dims[5];
size_t prev_kern_dims[5]; size_t prev_kern_dims[5];
#endif
#endif
int int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, double alpha, double beta,
PyGpuArrayObject **output, PyGpuArrayObject **output,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context; PyGpuContextObject *c = input->context;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
...@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
#ifdef CONV_INPLACE if (params->inplace) {
Py_XDECREF(*output); Py_XDECREF(*output);
*output = om; *output = om;
Py_INCREF(*output); Py_INCREF(*output);
#else } else {
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0) om->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*output, om)) if (beta != 0.0 && pygpu_move(*output, om))
return 1; return 1;
#endif }
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0); int err2 = GpuArray_memset(&(*output)->ga, 0);
...@@ -83,11 +79,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -83,11 +79,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE if (params->choose_algo) {
if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
...@@ -95,7 +92,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -95,7 +92,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
} }
#endif }
if (!reuse_algo) { if (!reuse_algo) {
size_t free; size_t free;
...@@ -111,7 +108,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -111,7 +108,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// Guess 4Mb if the info is not available // Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024; if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME if (params->choose_time) {
int count; int count;
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
...@@ -123,7 +120,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -123,7 +120,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
...@@ -138,9 +135,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -138,9 +135,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
algo = choice.algo; algo = choice.algo;
#else } else {
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -150,22 +147,21 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -150,22 +147,21 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
#endif }
prev_algo = algo; prev_algo = algo;
} else { } else {
algo = prev_algo; algo = prev_algo;
} }
#ifdef CHOOSE_ONCE if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
#else } else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
} }
#endif }
}
#endif
/* These two algos are not supported for 3d conv */ /* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 && if (PyGpuArray_NDIM(input) == 5 &&
...@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
{
if (stride[0] != 1 || stride[1] != 1 || if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{ {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} }
} } else {
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1) if (stride[0] != 1 || stride[1] != 1) {
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} }
} }
...@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
...@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning // TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
...@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
_handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
#section init_code_struct #section init_code_struct
#ifdef CHOOSE_ALGO // #ifdef CHOOSE_ALGO
reuse_algo = 0; if (PARAMS->choose_algo) {
prev_algo = CONV_ALGO; reuse_algo = 0;
#ifndef CHOOSE_ONCE prev_algo = PARAMS->conv_algo;
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); // #ifndef CHOOSE_ONCE
memset(prev_top_dims, 0, sizeof(prev_top_dims)); if (!PARAMS->choose_once) {
#endif memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
// #endif
}
// #endif
#section support_code_struct #section support_code_struct
#ifdef CHOOSE_ALGO int reuse_algo;
int reuse_algo = 0; cudnnConvolutionBwdDataAlgo_t prev_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
size_t prev_kern_dims[5] = {0}; size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0}; size_t prev_top_dims[5] = {0};
#endif
#endif
int int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im, PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input, double alpha, double beta, PyGpuArrayObject **input,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = kerns->context; PyGpuContextObject *c = kerns->context;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
...@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
#ifdef CONV_INPLACE // #ifdef CONV_INPLACE
if (params->inplace) {
Py_XDECREF(*input); Py_XDECREF(*input);
*input = im; *input = im;
Py_INCREF(*input); Py_INCREF(*input);
#else // #else
} else {
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0) im->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*input, im)) if (beta != 0.0 && pygpu_move(*input, im))
return 1; return 1;
#endif }
// #endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0); int err2 = GpuArray_memset(&(*input)->ga, 0);
...@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -128,8 +131,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -128,8 +131,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
#ifdef CHOOSE_ALGO // #ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE if (params->choose_algo) {
// #ifndef CHOOSE_ONCE
if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
...@@ -137,7 +142,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -137,7 +142,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]); PyGpuArray_DIM(output, i) == prev_top_dims[i]);
} }
#endif }
// #endif
if (!reuse_algo) { if (!reuse_algo) {
size_t free; size_t free;
...@@ -153,7 +159,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -153,7 +159,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
// Guess 4Mb if the info is not available // Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024; if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME // #ifdef CHOOSE_TIME
if (params->choose_time) {
int count; int count;
cudnnConvolutionBwdDataAlgoPerf_t choice; cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
...@@ -165,7 +172,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -165,7 +172,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
err = cudnnFindConvolutionBackwardDataAlgorithmEx( err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, free);
...@@ -179,9 +186,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -179,9 +186,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
#else // #else
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input), desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -190,22 +198,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -190,22 +198,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
#endif }
// #endif
prev_algo = algo; prev_algo = algo;
} else { } else {
algo = prev_algo; algo = prev_algo;
} }
#ifdef CHOOSE_ONCE // #ifdef CHOOSE_ONCE
if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
#else // #else
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i); prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
#endif }
// #endif
#endif }
// #endif
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation // with a spatial dimension larger than 1024. The tiled-FFT implementation
...@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize); APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
_handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
#section init_code_struct #section init_code_struct
#ifdef CHOOSE_ALGO if (PARAMS->choose_algo) {
reuse_algo = 0; reuse_algo = 0;
prev_algo = CONV_ALGO; prev_algo = PARAMS->conv_algo;
#ifndef CHOOSE_ONCE if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims)); memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif }
#endif }
#section support_code_struct #section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo; int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo; cudnnConvolutionBwdFilterAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5]; size_t prev_img_dims[5];
size_t prev_top_dims[5]; size_t prev_top_dims[5];
#endif
#endif
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns, double alpha, double beta, PyGpuArrayObject **kerns,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context; PyGpuContextObject *c = input->context;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
...@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
#ifdef CONV_INPLACE if (params->inplace) {
Py_XDECREF(*kerns); Py_XDECREF(*kerns);
*kerns = km; *kerns = km;
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else } else {
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0) km->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*kerns, km)) if (beta != 0.0 && pygpu_move(*kerns, km))
return 1; return 1;
#endif }
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) { if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0); int err2 = GpuArray_memset(&(*kerns)->ga, 0);
...@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -128,8 +124,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -128,8 +124,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
#ifdef CHOOSE_ALGO if (params->choose_algo) {
#ifndef CHOOSE_ONCE if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
...@@ -137,7 +133,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -137,7 +133,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]); PyGpuArray_DIM(output, i) == prev_top_dims[i]);
} }
#endif }
if (!reuse_algo) { if (!reuse_algo) {
size_t free; size_t free;
...@@ -153,7 +149,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -153,7 +149,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
// Guess 4Mb if the info is not available // Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024; if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME if (params->choose_time) {
int count; int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice; cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
...@@ -165,7 +161,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -165,7 +161,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
err = cudnnFindConvolutionBackwardFilterAlgorithmEx( err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, free);
...@@ -180,9 +176,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -180,9 +176,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
#else } else {
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -192,22 +188,21 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -192,22 +188,21 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
#endif }
prev_algo = algo; prev_algo = algo;
} else { } else {
algo = prev_algo; algo = prev_algo;
} }
#ifdef CHOOSE_ONCE if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
#else } else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i); prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
#endif }
}
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. // with a spatial dimension larger than 1024.
...@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
_handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
...@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride, PyArrayObject *stride,
PyArrayObject *pad, PyArrayObject *pad,
PyGpuArrayObject **out, PyGpuArrayObject **out,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = img->context; PyGpuContextObject *c = img->context;
size_t dims[5]; size_t dims[5];
cudnnStatus_t err; cudnnStatus_t err;
...@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0) if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1; return 1;
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s); err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
...@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward( err = cudnnPoolingForward(
_handle, APPLY_SPECIFIC(pool), params->handle, APPLY_SPECIFIC(pool),
alpha, alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta, beta,
......
...@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride, PyArrayObject *stride,
PyArrayObject *pad, PyArrayObject *pad,
PyGpuArrayObject **inp_grad, PyGpuArrayObject **inp_grad,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
cudnnStatus_t err; cudnnStatus_t err;
...@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i)); s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
} }
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s); err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
...@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward( err = cudnnPoolingBackward(
_handle, APPLY_SPECIFIC(pool), params->handle, APPLY_SPECIFIC(pool),
alpha, alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad), APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
...@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including() ...@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
mode_with_gpu.check_py_code = False mode_with_gpu.check_py_code = False
# This variable will store the list of pooling modes available with the current runtime cuDNN version.
# Don't use this variable directly, always call `get_dnn_pool_modes()` instead.
dnn_pool_modes = None
def get_dnn_pool_modes():
# This function is called only by pooling tests to initialize and/or get dnn_pool_modes.
global dnn_pool_modes
if dnn_pool_modes is None:
from .. import cudnn_defs
dnn_pool_modes = cudnn_defs.get_definitions(dnn.version(raises=False)).cudnnPoolingMode_t.get_aliases()
return dnn_pool_modes
# If using float16, set CUDNN precision to float32 # If using float16, set CUDNN precision to float32
def set_precision(floatX): def set_precision(floatX):
if floatX == "float16": if floatX == "float16":
...@@ -155,11 +169,7 @@ def test_pooling(): ...@@ -155,11 +169,7 @@ def test_pooling():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.tensor4() x = T.tensor4()
for mode, pad in product(modes, for mode, pad in product(modes,
...@@ -242,7 +252,9 @@ def test_pooling(): ...@@ -242,7 +252,9 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()]) for node in fg.maker.fgraph.toposort()])
def test_pooling_with_tensor_vars(): # This test will be run with different values of 'mode'
# (see next test below).
def run_pooling_with_tensor_vars(mode):
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
...@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars(): ...@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
ws = theano.shared(np.array([2, 2], dtype='int32')) ws = theano.shared(np.array([2, 2], dtype='int32'))
stride = theano.shared(np.array([1, 1], dtype='int32')) stride = theano.shared(np.array([1, 1], dtype='int32'))
pad = theano.shared(np.array([0, 0], dtype='int32')) pad = theano.shared(np.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x): def fn(x):
dnn_op = dnn.dnn_pool( dnn_op = dnn.dnn_pool(
...@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars(): ...@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
i += 1 i += 1
def test_pooling_with_tensor_vars():
# Let's test for mode 'max' and also for 'max_deterministic' if available.
for mode in [m for m in get_dnn_pool_modes() if m in ('max', 'max_deterministic')]:
yield (run_pooling_with_tensor_vars, mode)
def test_pooling3d(): def test_pooling3d():
# 3d pooling requires version 3 or newer. # 3d pooling requires version 3 or newer.
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000: if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000:
...@@ -307,11 +324,7 @@ def test_pooling3d(): ...@@ -307,11 +324,7 @@ def test_pooling3d():
mode_without_gpu_ref = theano.compile.mode.get_mode( mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpuarray') 'FAST_RUN').excluding('gpuarray')
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.tensor5() x = T.tensor5()
for mode, pad in product(modes, for mode, pad in product(modes,
...@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions(): ...@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
for n_non_pool_dims in (0, 1, 2, 3): for n_non_pool_dims in (0, 1, 2, 3):
for ws in ((2, 2), (3, 3, 3)): for ws in ((2, 2), (3, 3, 3)):
...@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions(): ...@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
fc = theano.function([], out, mode=mode_without_gpu) fc = theano.function([], out, mode=mode_without_gpu)
assert any([isinstance(node.op, Pool) assert any([isinstance(node.op, Pool)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
if mode == 'max': if mode in ('max', 'max_deterministic'):
assert any([isinstance(node.op, MaxPoolGrad) assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
else: else:
...@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX dtype=theano.config.floatX
) )
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
for params in product( for params in product(
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
...@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX dtype=theano.config.floatX
) )
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
for params in product( for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)], [(1, 1, 1), (2, 2, 2), (3, 3, 3)],
...@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product( for params in product(
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad'] # modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
): ):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])( pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img, img,
...@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product( for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)], [(1, 1, 1), (2, 2, 2), (3, 3, 3)],
[(1, 1, 1), (2, 2, 2), (3, 3, 3)], [(1, 1, 1), (2, 2, 2), (3, 3, 3)],
['max', 'average_inc_pad'] # modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
): ):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])( pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img, img,
......
...@@ -433,6 +433,9 @@ class Pool(OpenMPOp): ...@@ -433,6 +433,9 @@ class Pool(OpenMPOp):
super(Pool, self).__init__(openmp=openmp) super(Pool, self).__init__(openmp=openmp)
self.ndim = ndim self.ndim = ndim
self.ignore_border = ignore_border self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']: if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError( raise ValueError(
"Pool mode parameter only support 'max', 'sum'," "Pool mode parameter only support 'max', 'sum',"
...@@ -1040,6 +1043,9 @@ class PoolGrad(OpenMPOp): ...@@ -1040,6 +1043,9 @@ class PoolGrad(OpenMPOp):
def __init__(self, ignore_border, mode='max', ndim=2, openmp=None): def __init__(self, ignore_border, mode='max', ndim=2, openmp=None):
self.ndim = ndim self.ndim = ndim
self.ignore_border = ignore_border self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool grad algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']: if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
raise ValueError( raise ValueError(
"Pool mode parameter only support 'max', 'sum'," "Pool mode parameter only support 'max', 'sum',"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论