提交 5d3433c7 authored 作者: xiaoqie's avatar xiaoqie

Merge branch 'master' of https://github.com/Theano/Theano into gpureduce-fix

FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
ENV DEBIAN_FRONTEND noninteractive
# Install tools
RUN apt-get update
RUN apt-get install -y build-essential apt-utils wget git dvipng time
# Install magma lib
RUN apt-get install -y libopenblas-dev
RUN wget http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz
RUN tar xvf magma-2.2.0.tar.gz
RUN cp magma-2.2.0/make.inc-examples/make.inc.openblas magma-2.2.0/make.inc
ENV OPENBLASDIR /usr
ENV CUDADIR /usr/local/cuda
RUN (cd magma-2.2.0 && make && make install prefix=/usr/local)
RUN ldconfig
# Setup conda python for Theano
RUN wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
RUN bash Miniconda2-latest-Linux-x86_64.sh -p /miniconda -b
ENV PATH=/miniconda/bin:${PATH}
RUN conda install numpy scipy sympy mkl nose pydot-ng graphviz cython cmake
RUN conda update -y conda
RUN pip install nose-timer parameterized "flake8<3" "sphinx==1.5.2" sphinx_rtd_theme
RUN pip install --upgrade-strategy only-if-needed git+https://github.com/lebedov/scikit-cuda.git
# Setup latex for doc test
RUN apt-get install -y texlive-latex-base texlive-latex-extra
# Install SSH server and Java runtime for Jenkins
RUN apt-get install -y openssh-server default-jre-headless
# Add jenkins user and setup environment
RUN useradd -m -s /bin/bash jenkins
RUN echo jenkins:jenkins | chpasswd
RUN echo "export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/miniconda/bin:\$PATH" >> /home/jenkins/.bashrc
RUN echo "export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:\$LD_LIBRARY_PATH" >> /home/jenkins/.bashrc
# Copy jenkins slave.jar into container
RUN wget -P home/jenkins http://earlgrey.iro.umontreal.ca:8080/jnlpJars/slave.jar
RUN chown -R jenkins:jenkins /home/jenkins/*
# Set launch command as Jenkins slave.jar
CMD java -jar /home/jenkins/slave.jar
\ No newline at end of file
......@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -79,6 +80,9 @@ FLAGS=on_shape_error=raise,$FLAGS
# while we want all other runs to run with 'floatX=float64'.
FLAGS=${FLAGS},device=cpu,floatX=float64
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
# Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800
......
......@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -79,6 +80,9 @@ FLAGS=${FLAGS},device=cpu,floatX=float64
# Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
#we change the seed and record it everyday to test different combination. We record it to be able to reproduce bug caused by different seed. We don't want multiple test in DEBUG_MODE each day as this take too long.
seed=$RANDOM
echo "Executing tests with mode=DEBUG_MODE with seed of the day $seed"
......
......@@ -8,6 +8,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -19,4 +20,4 @@ echo
FILE=${BUILDBOT_DIR}/theano_python3_tests.xml
set -x
PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE}
PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,magma.enabled=true python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE}
......@@ -5,9 +5,6 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# Test flake8
echo "===== Testing flake8"
bin/theano-nose theano/tests/test_flake8.py --with-xunit --xunit-file=theano_pre_tests.xml || exit 1
......
......@@ -5,9 +5,6 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
echo "===== Testing theano core"
# Test theano core
......
......@@ -5,11 +5,9 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -54,4 +52,5 @@ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray \
theano/scan_module/tests/test_scan_checkpoints.py:TestScanCheckpoint.test_memory"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800"
FLAGS=${FLAGS},magma.enabled=true # Enable magma GPU library
THEANO_FLAGS=${FLAGS} time nosetests --with-xunit --xunit-file=theanogpuarray_tests.xml ${THEANO_GPUARRAY_TESTS}
......@@ -271,6 +271,7 @@ class OpFromGraph(gof.Op):
is_inline = self.is_inline
return '%(name)s{inline=%(is_inline)s}' % locals()
@theano.configparser.change_flags(compute_test_value='off')
def _recompute_grad_op(self):
'''
converts self._grad_op from user supplied form to type(self) instance
......
......@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
from functools import partial
import numpy as np
import theano
from theano import config, shared
from theano.gradient import DisconnectedType
......@@ -313,3 +314,14 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
[np.ones([3, 4], dtype=config.floatX),
np.ones([3, 4], dtype=config.floatX)],
OpFromGraph)
@theano.configparser.change_flags(compute_test_value='raise')
def test_compute_test_value(self):
x = T.scalar('x')
x.tag.test_value = np.array(1., dtype=config.floatX)
op = OpFromGraph([x], [x ** 3])
y = T.scalar('y')
y.tag.test_value = np.array(1., dtype=config.floatX)
f = op(y)
grad_f = T.grad(f, y)
assert grad_f.tag.test_value is not None
......@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
return True
# Those are the options provided by Theano to choose algorithms at runtime.
SUPPORTED_DNN_CONV_ALGO_RUNTIME = ('guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change')
# Those are the supported algorithm by Theano,
# The tests will reference those lists.
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_PRECISION = ('as_input_f32', 'as_input', 'float16', 'float32', 'float64')
AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
......@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).",
EnumStr('as_input_f32', 'as_input', 'float16', 'float32',
'float64'),
EnumStr(*SUPPORTED_DNN_CONV_PRECISION),
in_c_key=False)
......
......@@ -1413,7 +1413,10 @@ class COp(Op):
return []
def c_code_cache_version(self):
return hash(tuple(self.func_codes))
version = (hash(tuple(self.func_codes)), )
if hasattr(self, 'params_type'):
version += (self.params_type.c_code_cache_version(), )
return version
def c_init_code(self):
"""
......
......@@ -963,6 +963,12 @@ class EnumType(Type, dict):
"""
return alias in self.aliases
def get_aliases(self):
"""
Return the list of all aliases in this enumeration.
"""
return self.aliases.keys()
def __repr__(self):
names_to_aliases = {constant_name: '' for constant_name in self}
for alias in self.aliases:
......@@ -1184,4 +1190,6 @@ class CEnumType(EnumList):
fail=sub['fail'])
def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version())
# C code depends on (C constant name, Python value) associations (given by `self.items()`),
# so we should better take them into account in C code version.
return (1, tuple(self.items()), super(CEnumType, self).c_code_cache_version())
#section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) {
cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) {
cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2};
int strides[3] = {SUB_0, SUB_1, SUB_2};
int dilation[3] = {DIL_0, DIL_1, DIL_2};
int pad[3] = {params->pad0, params->pad1, params->pad2};
int strides[3] = {params->sub0, params->sub1, params->sub2};
int dilation[3] = {params->dil0, params->dil1, params->dil2};
#if BORDER_MODE == 0
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0;
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1;
#if NB_DIMS > 2
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2;
#endif
#elif BORDER_MODE == 2
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0 + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1 + 1) / 2;
#if NB_DIMS > 2
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2 + 1) / 2;
#endif
#endif
if (params->bmode == BORDER_MODE_FULL) {
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
if (params->nb_dims > 2) {
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
}
} else if(params->bmode == BORDER_MODE_HALF) {
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
if (params->nb_dims > 2) {
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
}
}
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS,
"expected %d, got %lld.", params->nb_dims,
(long long)PyArray_DIM(filt_shp, 0));
return -1;
}
......@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides,
dilation, CONV_MODE, PRECISION);
err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
dilation, params->conv_mode, params->precision);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not set convolution "
"descriptor: %s", cudnnGetErrorString(err));
......
"""
Declarations of cuDNN types and constants used in Theano gpuarray DNN module.
For every cuDNN API supported by Theano, this module defines a class that
provides the set of cuDNN definitions to be used in Theano Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1
- v6.0
"""
from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51(object):
version = 5
cudnnConvolutionMode_t = CEnumType(('CUDNN_CONVOLUTION', 'conv'),
('CUDNN_CROSS_CORRELATION', 'cross'),
ctype='cudnnConvolutionMode_t')
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'),
# CUDNN_DATA_INT8 # new in v6
# CUDNN_DATA_INT32 # new in v6
# CUDNN_DATA_INT8x4 # new in v6
ctype='cudnnDataType_t')
cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM', 'small'),
('CUDNN_CONVOLUTION_FWD_ALGO_GEMM', 'large'),
# not implemented:
('CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD', 'winograd'),
# TODO: Not yet tested/documented:
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionFwdAlgo_t')
conv3d_fwd_algorithms = ('none', 'small', 'fft_tiling')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
# TODO: not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
conv3d_bwd_filter_algorithms = ('none', 'small')
cudnnConvolutionBwdDataAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_DATA_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD', 'winograd'),
# TODO: not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdDataAlgo_t')
conv3d_bwd_data_algorithms = ('none', 'deterministic', 'fft_tiling')
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
ctype='cudnnPoolingMode_t')
cudnnSoftmaxAlgorithm_t = CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'),
('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
('CUDNN_SOFTMAX_LOG', 'log'),
ctype='cudnnSoftmaxAlgorithm_t')
cudnnSoftmaxMode_t = CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
ctype='cudnnSoftmaxMode_t')
cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
ctype='cudnnBatchNormMode_t')
class CuDNNV6(CuDNNV51):
version = 6
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
# new in v6:
('CUDNN_POOLING_MAX_DETERMINISTIC', 'max_deterministic'),
ctype='cudnnPoolingMode_t')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
# not implemented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
# TODO: not yet tested/documented:
# new in v6:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
def get_definitions(cudnn_version=None):
"""
Return cuDNN definitions to be used by Theano for the given cuDNN version.
``cudnn_version`` must be None or an integer
(typically the version returned by :func:`theano.gpuarray.dnn.version`).
if None, return definitions for the most recent supported cuDNN version.
"""
if cudnn_version is not None and cudnn_version // 1000 == 5:
return CuDNNV51()
# By default, we use definitions for the last supported cuDNN version.
return CuDNNV6()
......@@ -9,10 +9,10 @@ from six import integer_types
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log, get_scalar_type
from theano.scalar import as_scalar, constant, Log, get_scalar_type, int32 as int_t, bool as bool_t
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, CEnumType
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic
from theano.compile import optdb
......@@ -28,7 +28,7 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from . import pygpu, cudnn_defs
from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
......@@ -44,7 +44,10 @@ from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
DNN_CONV_ALGO_CHOOSE_ONCE = ['guess_once', 'time_once']
DNN_CONV_ALGO_CHOOSE_TIME = ['time_once', 'time_on_shape_change']
try:
from pygpu import gpuarray
......@@ -59,12 +62,12 @@ def _dnn_lib():
lib_name = ctypes.util.find_library('cudnn')
if lib_name is None and sys.platform == 'win32':
# Update these names when new versions of cudnn are supported.
for name in ['cudnn64_5.dll']:
for name in ['cudnn64_6.dll', 'cudnn64_5.dll']:
lib_name = ctypes.util.find_library(name)
if lib_name:
break
if lib_name is None:
raise RuntimeError('Could not find cudnn library (looked for v5[.1])')
raise RuntimeError('Could not find cudnn library (looked for v5* or v6*)')
_dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
cudnn = _dnn_lib.handle
cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
......@@ -116,10 +119,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
avail, out, err = GCC_compiler.try_flags(
# NB: GCC_compiler.try_flags() may return just a boolean instead of a tuple (avail, out, here).
compiler_res = GCC_compiler.try_flags(
params, preambule=preambule, body=body,
try_run=False, output=True)
avail, out, err = compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None)
if not avail:
return False, ("cannot compile with cuDNN. "
"We got this error:\n" + str(err))
......@@ -129,13 +136,12 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
def _dnn_check_version():
v = version()
if v < 5000:
return False, "cuDNN version is too old. Update to v5, was %d." % v
# 5200 should not print warning with cudnn 5.1 final.
return False, "cuDNN version is too old. Update to v5* or higher, was %d." % v
if v >= 6100:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 6.0.")
"a version >= v5 and <= v6.")
return True, None
......@@ -281,6 +287,9 @@ handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy',
lib_dirs=[config.dnn.library_path],
version=version(raises=False))
# Get cuDNN definitions to be used.
cudnn = cudnn_defs.get_definitions(version(raises=False))
def get_precision(precision, inputs):
if precision is None:
......@@ -367,6 +376,15 @@ class GpuDnnConvDesc(COp):
"""
__props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode', 'precision')
params_type = ParamsType(pad0=int_t, pad1=int_t, pad2=int_t,
sub0=int_t, sub1=int_t, sub2=int_t,
dil0=int_t, dil1=int_t, dil2=int_t,
nb_dims=int_t,
bmode=EnumList(('BORDER_MODE_FULL', 'full'),
('BORDER_MODE_VALID', 'valid'),
('BORDER_MODE_HALF', 'half')),
conv_mode=cudnn.cudnnConvolutionMode_t,
precision=cudnn.cudnnDataType_t)
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -404,13 +422,13 @@ class GpuDnnConvDesc(COp):
self.border_mode = border_mode
assert len(subsample) in (2, 3)
self.subsample = subsample
assert conv_mode in ('conv', 'cross')
assert cudnn.cudnnConvolutionMode_t.has_alias(conv_mode)
self.conv_mode = conv_mode
assert len(dilation) == len(subsample)
self.dilation = dilation
assert precision in ['float16', 'float32', 'float64']
assert cudnn.cudnnDataType_t.has_alias(precision)
self.precision = precision
def make_node(self, kern_shape):
......@@ -430,59 +448,18 @@ class GpuDnnConvDesc(COp):
out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
return node
def get_op_params(self):
pad0 = '0'
pad1 = '0'
pad2 = '0'
if isinstance(self.border_mode, tuple):
pad0 = str(self.border_mode[0])
pad1 = str(self.border_mode[1])
if len(self.border_mode) > 2:
pad2 = str(self.border_mode[2])
bmode = '1'
elif self.border_mode == "valid":
bmode = '1'
elif self.border_mode == "half":
bmode = '2'
elif self.border_mode == "full":
bmode = '0'
else:
raise ValueError("Invalid value for border_mode")
if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION'
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
sub0 = str(self.subsample[0])
sub1 = str(self.subsample[1])
if len(self.subsample) > 2:
sub2 = str(self.subsample[2])
else:
sub2 = '0'
dil0 = str(self.dilation[0])
dil1 = str(self.dilation[1])
if len(self.dilation) > 2:
dil2 = str(self.dilation[2])
else:
dil2 = '0'
if self.precision == 'float16':
precision = 'CUDNN_DATA_HALF'
elif self.precision == 'float32':
precision = 'CUDNN_DATA_FLOAT'
else:
assert self.precision == 'float64'
precision = 'CUDNN_DATA_DOUBLE'
return [('NB_DIMS', str(len(self.subsample))),
('BORDER_MODE', bmode),
('PAD_0', pad0), ('PAD_1', pad1), ('PAD_2', pad2),
('DIL_0', dil0), ('DIL_1', dil1), ('DIL_2', dil2),
('CONV_MODE', conv_flag),
('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2),
('PRECISION', precision)]
bmode = property(lambda self: 'valid' if isinstance(self.border_mode, tuple) else self.border_mode)
pad0 = property(lambda self: self.border_mode[0] if isinstance(self.border_mode, tuple) else 0)
pad1 = property(lambda self: self.border_mode[1] if isinstance(self.border_mode, tuple) else 0)
pad2 = property(lambda self: self.border_mode[2] if (isinstance(self.border_mode, tuple) and
len(self.border_mode) > 2) else 0)
sub0 = property(lambda self: self.subsample[0])
sub1 = property(lambda self: self.subsample[1])
sub2 = property(lambda self: self.subsample[2] if len(self.subsample) > 2 else 0)
dil0 = property(lambda self: self.dilation[0])
dil1 = property(lambda self: self.dilation[1])
dil2 = property(lambda self: self.dilation[2] if len(self.dilation) > 2 else 0)
nb_dims = property(lambda self: len(self.subsample))
def c_code_cache_version(self):
return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
......@@ -533,6 +510,12 @@ class GpuDnnConv(DnnBase):
_f16_ok = True
__props__ = ('algo', 'inplace')
check_input = False
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, algo=None, inplace=False):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
......@@ -541,13 +524,18 @@ class GpuDnnConv(DnnBase):
algo = config.dnn.conv.algo_fwd
self.algo = algo
self.inplace = inplace
self.inplace = bool(inplace)
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
assert cudnn.cudnnConvolutionFwdAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.conv_algo = cudnn.cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
self.conv_algo = self.algo
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -559,38 +547,6 @@ class GpuDnnConv(DnnBase):
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self):
defs = []
if self.inplace:
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif self.algo == 'small': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
elif self.algo == 'direct':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
elif self.algo == 'winograd':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD'
defs.append(('CONV_ALGO', alg))
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
return defs
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
ctx_name = infer_context_name(img, kern, output)
img = as_gpuarray_variable(img, ctx_name)
......@@ -609,7 +565,7 @@ class GpuDnnConv(DnnBase):
raise TypeError("The number of dimensions of "
"img, kern and output must match")
if img.type.ndim == 5 and self.algo in ['large', 'fft']:
if img.type.ndim == 5 and self.algo not in cudnn.conv3d_fwd_algorithms:
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
......@@ -687,17 +643,30 @@ class GpuDnnConvGradW(DnnBase):
_f16_ok = True
__props__ = ('algo', 'inplace')
check_input = False
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, inplace=False, algo=None):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace
self.inplace = bool(inplace)
if self.inplace:
self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd_filter
self.algo = algo
assert self.algo in SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
assert cudnn.cudnnConvolutionBwdFilterAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.conv_algo = cudnn.cudnnConvolutionBwdFilterAlgo_t.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
self.conv_algo = self.algo
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -724,33 +693,6 @@ class GpuDnnConvGradW(DnnBase):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
defs = []
if self.inplace:
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo == 'small': # 3d
# non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def op_may_fail_with_subsample(self, img, desc):
return (version() < 6000 and
img.type.dtype == 'float32' and
......@@ -793,8 +735,7 @@ class GpuDnnConvGradW(DnnBase):
raise TypeError("The number of dimensions of "
"img, topgrad and output must match")
if (img.type.ndim == 5 and
self.algo in ['fft', 'deterministic']):
if img.type.ndim == 5 and self.algo not in cudnn.conv3d_bwd_filter_algorithms:
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
......@@ -830,19 +771,30 @@ class GpuDnnConvGradI(DnnBase):
_f16_ok = True
__props__ = ('algo', 'inplace',)
check_input = False
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, inplace=False, algo=None):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace
self.inplace = bool(inplace)
if self.inplace:
self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd_data
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
assert cudnn.cudnnConvolutionBwdDataAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.conv_algo = cudnn.cudnnConvolutionBwdDataAlgo_t.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
self.conv_algo = self.algo
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -869,36 +821,6 @@ class GpuDnnConvGradI(DnnBase):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
defs = []
if self.inplace:
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
elif self.algo == 'deterministic': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d
# big workspace but less than fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
elif self.algo == 'winograd':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
ctx_name = infer_context_name(kern, topgrad, output)
kern = as_gpuarray_variable(kern, ctx_name)
......@@ -916,7 +838,7 @@ class GpuDnnConvGradI(DnnBase):
raise TypeError("The number of dimensions of "
"kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft']:
if kern.type.ndim == 5 and self.algo not in cudnn.conv3d_bwd_data_algorithms:
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
......@@ -1059,7 +981,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
conv_mode='conv', direction_hint=None,
algo='none', precision=None):
algo=None, precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -1349,7 +1271,33 @@ class GpuDnnPoolDesc(Op):
return (4, version())
class GpuDnnPool(DnnBase):
class GpuDnnPoolBase(DnnBase):
"""
Abstract base class for GpuDnnPool and GpuDnnPoolGrad.
"""
# c_file and c_function must be defined in sub-classes.
c_file = None
c_function = None
_f16_ok = True
__props__ = ('mode',)
check_input = False
params_type = ParamsType(mode=cudnn.cudnnPoolingMode_t,
handle=handle_type)
def __init__(self, mode='max'):
DnnBase.__init__(self, [self.c_file], self.c_function)
if mode == 'average':
mode = 'average_inc_pad'
# Supported modes depend on runtime cuDNN version.
assert cudnn.cudnnPoolingMode_t.has_alias(mode)
self.mode = mode
class GpuDnnPool(GpuDnnPoolBase):
"""
Parameters
......@@ -1366,25 +1314,8 @@ class GpuDnnPool(DnnBase):
(padX, padY) or (padX, padY, padZ)
"""
_f16_ok = True
__props__ = ('mode',)
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
return [('MODE_FLAG', mode_flag)]
c_file = "dnn_pool.c"
c_function = "APPLY_SPECIFIC(dnn_pool)"
def make_node(self, img, ws, stride, pad):
ctx_name = infer_context_name(img)
......@@ -1428,7 +1359,7 @@ class GpuDnnPool(DnnBase):
return [[1], [0], [0], [0]]
class GpuDnnPoolGrad(DnnBase):
class GpuDnnPoolGrad(GpuDnnPoolBase):
"""
The pooling gradient.
......@@ -1451,26 +1382,8 @@ class GpuDnnPoolGrad(DnnBase):
(padX, padY) or (padX, padY, padZ)
"""
_f16_ok = True
__props__ = ('mode',)
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool_grad.c"],
"APPLY_SPECIFIC(dnn_pool_grad)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
return [('MODE_FLAG', mode_flag)]
c_file = "dnn_pool_grad.c"
c_function = "APPLY_SPECIFIC(dnn_pool_grad)"
def make_node(self, inp, out, out_grad, ws, stride, pad):
ctx_name = infer_context_name(inp, out, out_grad)
......@@ -1513,7 +1426,8 @@ def dnn_pool(img, ws, stride=None, mode='max', pad=None):
Subsampling window size. Should have 2 or 3 elements.
stride : tuple
Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum', 'max_deterministic'}
**NB**: 'max_deterministic' is supported since cuDNN v6.
pad : tuple
(padX, padY) or (padX, padY, padZ)
default: (0, 0) or (0, 0, 0)
......@@ -1562,22 +1476,17 @@ class GpuDnnSoftmaxBase(DnnBase):
# neither in dnn_base.c nor in dnn_softmax*.c,
# so we can disable input checking.
check_input = False
params_type = ParamsType(algo=CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'),
('CUDNN_SOFTMAX_LOG', 'log'),
('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
ctype='cudnnSoftmaxAlgorithm_t'),
mode=CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
ctype='cudnnSoftmaxMode_t'),
params_type = ParamsType(algo=cudnn.cudnnSoftmaxAlgorithm_t,
mode=cudnn.cudnnSoftmaxMode_t,
handle=handle_type)
def __init__(self, algo, mode):
DnnBase.__init__(self, [self.file], self.c_func)
assert(algo in ('fast', 'accurate', 'log'))
assert cudnn.cudnnSoftmaxAlgorithm_t.has_alias(algo)
self.algo = algo
assert(mode in ('instance', 'channel'))
assert cudnn.cudnnSoftmaxMode_t.has_alias(mode)
self.mode = mode
def infer_shape(self, node, shape):
......@@ -1810,13 +1719,18 @@ class GpuDnnBatchNormInference(DnnBase):
__props__ = ('mode', 'inplace')
check_input = False
params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t,
inplace=bool_t,
handle=handle_type)
def __init__(self, mode='per-activation', inplace=False):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
'dnn_batchnorm_op')
assert (mode in ('per-activation', 'spatial'))
assert cudnn.cudnnBatchNormMode_t.has_alias(mode)
self.mode = mode
self.inplace = inplace
self.inplace = bool(inplace)
if self.inplace:
self.destroy_map = {0: [0]}
......@@ -1825,15 +1739,6 @@ class GpuDnnBatchNormInference(DnnBase):
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self):
params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION")))
return params
def infer_shape(self, node, shape):
return [shape[0]]
......@@ -1882,20 +1787,17 @@ class GpuDnnBatchNormInference(DnnBase):
class GpuDnnBatchNormGrad(DnnBase):
__props__ = ('mode',)
check_input = False
params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t,
handle=handle_type)
def __init__(self, mode='per-activation'):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'],
'dnn_batchnorm_grad')
assert (mode in ('per-activation', 'spatial'))
assert cudnn.cudnnBatchNormMode_t.has_alias(mode)
self.mode = mode
def get_op_params(self):
params = []
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION")))
return params
def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
ctx_name = infer_context_name(x, dy, scale, x_mean, x_invstd)
x = as_gpuarray_variable(x, ctx_name)
......
......@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationBackward(
_handle,
MODE,
params->handle,
params->mode,
alphaData,
betaData,
alphaParam,
......
......@@ -3,7 +3,7 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuArrayObject **outp, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
} else {
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
}
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
......@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
_handle,
MODE,
params->handle,
params->mode,
alpha,
beta,
bn_input,
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
......@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
} else {
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
......@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (!reuse_algo) {
size_t free;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
......@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
} else {
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
if (stride[0] != 1 || stride[1] != 1) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
......@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
// #ifdef CHOOSE_ALGO
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
// #ifndef CHOOSE_ONCE
if (!PARAMS->choose_once) {
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
// #endif
}
// #endif
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo = 0;
cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0};
#endif
#endif
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = kerns->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
// #ifdef CONV_INPLACE
if (params->inplace) {
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
// #else
} else {
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
}
// #endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0);
......@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
// #ifdef CHOOSE_ALGO
if (params->choose_algo) {
// #ifndef CHOOSE_ONCE
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// #endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
// #ifdef CHOOSE_TIME
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
algo = choice.algo;
// #else
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
// #endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
// #ifdef CHOOSE_ONCE
if (params->choose_once) {
reuse_algo = 1;
// #else
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
// #endif
}
#endif
#endif
// #endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
......@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_top_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
} else {
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0);
......@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
if (!reuse_algo) {
size_t free;
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
......@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = img->context;
size_t dims[5];
cudnnStatus_t err;
......@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err;
......@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
......@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self):
return (3,)
return (5,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
......@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op):
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = """
KERNEL void %(kname)s(float* input, float* output,
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x,
ga_ssize inputStrides_y,
ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
......@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc',
'intc', 'intc',
......@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op):
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, ]
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """
// helper functions
WITHIN_KERNEL
......@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumOp(float* input, float* output,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum) {
KERNEL void k_blockCumOp(float* input, ga_size input_offset,
float* output, ga_size output_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
......@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op):
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = """
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
......@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] %(op)s= currentBlockSum;
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params,
......@@ -381,7 +398,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
......@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) deviceBlockSum->ga.data
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
};
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){
......@@ -410,7 +430,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
......@@ -431,7 +453,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1;
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
......
......@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
mode_with_gpu.check_py_code = False
# This variable will store the list of pooling modes available with the current runtime cuDNN version.
# Don't use this variable directly, always call `get_dnn_pool_modes()` instead.
dnn_pool_modes = None
def get_dnn_pool_modes():
# This function is called only by pooling tests to initialize and/or get dnn_pool_modes.
global dnn_pool_modes
if dnn_pool_modes is None:
from .. import cudnn_defs
dnn_pool_modes = cudnn_defs.get_definitions(dnn.version(raises=False)).cudnnPoolingMode_t.get_aliases()
return dnn_pool_modes
# If using float16, set CUDNN precision to float32
def set_precision(floatX):
if floatX == "float16":
......@@ -155,11 +169,7 @@ def test_pooling():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor4()
for mode, pad in product(modes,
......@@ -242,7 +252,9 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()])
def test_pooling_with_tensor_vars():
# This test will be run with different values of 'mode'
# (see next test below).
def run_pooling_with_tensor_vars(mode):
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
......@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
ws = theano.shared(np.array([2, 2], dtype='int32'))
stride = theano.shared(np.array([1, 1], dtype='int32'))
pad = theano.shared(np.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x):
dnn_op = dnn.dnn_pool(
......@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
i += 1
def test_pooling_with_tensor_vars():
# Let's test for mode 'max' and also for 'max_deterministic' if available.
for mode in [m for m in get_dnn_pool_modes() if m in ('max', 'max_deterministic')]:
yield (run_pooling_with_tensor_vars, mode)
def test_pooling3d():
# 3d pooling requires version 3 or newer.
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000:
......@@ -307,11 +324,7 @@ def test_pooling3d():
mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpuarray')
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor5()
for mode, pad in product(modes,
......@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
for n_non_pool_dims in (0, 1, 2, 3):
for ws in ((2, 2), (3, 3, 3)):
......@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
fc = theano.function([], out, mode=mode_without_gpu)
assert any([isinstance(node.op, Pool)
for node in fc.maker.fgraph.toposort()])
if mode == 'max':
if mode in ('max', 'max_deterministic'):
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
......@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1), (2, 2), (3, 3)],
......@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
......@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......
......@@ -5710,41 +5710,50 @@ def local_opt_alloc(node):
if node_inps.owner and isinstance(node_inps.owner.op, T.Alloc):
input = node_inps.owner.inputs[0]
shapes = node_inps.owner.inputs[1:]
if (node.op.axis is None or
node.op.axis == tuple(range(input.ndim))):
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
# check which type of op
casted = T.mul(*shapes).astype(str(input.dtype))
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
val = val.reshape(1)[0]
# check which type of op
size = T.mul(*shapes)
if input.dtype in ["float16", "float32"]:
# shapes are ints and normally int64.
# We don't want to have a float64 upcast
# We don't want to downcast to float16
# as we fear it could loose too much precision
# that will be amplified by the mul/pow below.
size = size.astype('float32')
if (node.op.axis is None or
node.op.axis == tuple(range(input.ndim))):
if isinstance(node.op, T.Sum):
val = val.reshape(1)[0] * casted
val = val * size
else:
val = val.reshape(1)[0] ** casted
val = val ** size
# Sum can change the input dtype (upcast or bool
# -> float32) by default or by user request.
# We can ignore the acc_dtype, as there is only 1
# elemwise we will do and not a sequence, so there is no
# accumulation of errors.
# So mostly, we just need to cast the output to the old
# dtype.
val = val.astype(node.outputs[0].dtype)
return [val]
except NotScalarConstantError:
pass
else:
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
val = val.reshape(1)[0]
to_prod = [shapes[i] for i in xrange(len(shapes))
if i in node.op.axis]
if to_prod:
casted = T.mul(*to_prod).astype(str(input.dtype))
if isinstance(node.op, T.Sum):
val *= casted
else:
val = val ** casted
return [T.alloc(val,
*[shapes[i] for i in xrange(len(shapes))
if i not in node.op.axis])]
except NotScalarConstantError:
pass
to_prod = [shapes[i] for i in xrange(len(shapes))
if i in node.op.axis]
if to_prod:
size = T.mul(*to_prod)
if isinstance(node.op, T.Sum):
val *= size
else:
val = val ** size
# See comments above.
val = val.astype(node.outputs[0].dtype)
return [T.alloc(val,
*[shapes[i] for i in xrange(len(shapes))
if i not in node.op.axis])]
except NotScalarConstantError:
pass
@register_specialize
......
......@@ -59,7 +59,8 @@ def pool_2d(input, ws=None, ignore_border=None, stride=None, pad=(0, 0),
stride : tuple of two ints or theano vector of ints of size 2.
Stride size, which is the number of shifts over rows/cols to get the
next pool region. If stride is None, it is considered equal to ws
(no overlap on pooling regions).
(no overlap on pooling regions), eg: stride=(1,1) will shifts over
one row and one col for every iteration.
pad : tuple of two ints or theano vector of ints of size 2.
(pad_h, pad_w), pad zeros to extend beyond four borders of the
images, pad_h is the size of the top and bottom margins, and
......@@ -433,6 +434,9 @@ class Pool(OpenMPOp):
super(Pool, self).__init__(openmp=openmp)
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......@@ -1040,6 +1044,9 @@ class PoolGrad(OpenMPOp):
def __init__(self, ignore_border, mode='max', ndim=2, openmp=None):
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool grad algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......
......@@ -5558,9 +5558,11 @@ class T_local_sum_prod(unittest.TestCase):
class T_local_opt_alloc(unittest.TestCase):
dtype = 'float32'
def test_sum_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s)
a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64
theano.config.warn_float64 = "raise"
try:
......@@ -5571,7 +5573,7 @@ class T_local_opt_alloc(unittest.TestCase):
def test_prod_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s)
a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64
theano.config.warn_float64 = "raise"
try:
......@@ -5580,6 +5582,24 @@ class T_local_opt_alloc(unittest.TestCase):
finally:
theano.config.warn_float64 = orig
@theano.configparser.change_flags(on_opt_error='raise')
def test_sum_bool_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(True, dtype='bool'), s, s)
f = theano.function([s], a.sum())
f(5)
# test with user specified dtype
f = theano.function([s], a.sum(dtype=self.dtype))
f(5)
# test only 1 axis summed
f = theano.function([s], a.sum(axis=0, dtype=self.dtype))
f(5)
print(self.dtype)
class T_local_opt_alloc_f16(T_local_opt_alloc):
dtype = 'float16'
class T_local_reduce(unittest.TestCase):
def setUp(self):
......
......@@ -804,8 +804,8 @@ theano.compile.register_specify_shape_c_code(
PyErr_Format(PyExc_AssertionError,
"SpecifyShape: vector of shape has %%d elements,"
" but the input has %%d dimensions.",
PyArray_NDIM(%(iname)s),
PyArray_DIMS(%(shape)s)[0]);
PyArray_DIMS(%(shape)s)[0],
PyArray_NDIM(%(iname)s));
%(fail)s;
}
for(int i = 0; i < PyArray_NDIM(%(iname)s); i++){
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论