提交 5d3433c7 authored 作者: xiaoqie's avatar xiaoqie

Merge branch 'master' of https://github.com/Theano/Theano into gpureduce-fix

FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04

# Avoid interactive prompts from apt during the build.
ENV DEBIAN_FRONTEND=noninteractive

# Install build tools, OpenBLAS (required to build magma), LaTeX for the
# doc tests, and the SSH server / Java runtime needed by the Jenkins slave.
# `update` + `install` in ONE layer avoids the stale-apt-cache bug, and
# removing the package lists in the same layer keeps the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        apt-utils \
        build-essential \
        default-jre-headless \
        dvipng \
        git \
        libopenblas-dev \
        openssh-server \
        texlive-latex-base \
        texlive-latex-extra \
        time \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Build and install the magma GPU linear-algebra library (used by Theano's
# magma-backed Ops). Download, build, install and clean up in one layer so
# the tarball and build tree do not persist in the image.
ENV OPENBLASDIR=/usr \
    CUDADIR=/usr/local/cuda
RUN wget http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz \
    && tar xf magma-2.2.0.tar.gz \
    && cp magma-2.2.0/make.inc-examples/make.inc.openblas magma-2.2.0/make.inc \
    && (cd magma-2.2.0 && make && make install prefix=/usr/local) \
    && rm -rf magma-2.2.0 magma-2.2.0.tar.gz \
    && ldconfig

# Setup conda python for Theano; delete the installer once it has run.
RUN wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh \
    && bash Miniconda2-latest-Linux-x86_64.sh -p /miniconda -b \
    && rm Miniconda2-latest-Linux-x86_64.sh
ENV PATH=/miniconda/bin:${PATH}
# Update conda FIRST so the packages below are resolved by a current solver;
# `-y` is required because there is no TTY to answer the confirmation prompt.
RUN conda update -y conda \
    && conda install -y numpy scipy sympy mkl nose pydot-ng graphviz cython cmake
RUN pip install nose-timer parameterized "flake8<3" "sphinx==1.5.2" sphinx_rtd_theme \
    && pip install --upgrade-strategy only-if-needed git+https://github.com/lebedov/scikit-cuda.git

# Add jenkins user and set up its runtime environment.
RUN useradd -m -s /bin/bash jenkins \
    && echo jenkins:jenkins | chpasswd \
    && echo "export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/miniconda/bin:\$PATH" >> /home/jenkins/.bashrc \
    && echo "export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:\$LD_LIBRARY_PATH" >> /home/jenkins/.bashrc

# Download the Jenkins slave.jar into the container.
# BUGFIX: the original used `-P home/jenkins` (relative path), so slave.jar
# was written to ./home/jenkins inside the build layer instead of
# /home/jenkins, and the launch command below could never find it.
RUN wget -P /home/jenkins http://earlgrey.iro.umontreal.ca:8080/jnlpJars/slave.jar \
    && chown -R jenkins:jenkins /home/jenkins

# Launch the Jenkins slave agent. Exec form: java runs as PID 1 and
# receives SIGTERM directly from `docker stop`.
CMD ["java", "-jar", "/home/jenkins/slave.jar"]
\ No newline at end of file
...@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda ...@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
...@@ -79,6 +80,9 @@ FLAGS=on_shape_error=raise,$FLAGS ...@@ -79,6 +80,9 @@ FLAGS=on_shape_error=raise,$FLAGS
# while we want all other runs to run with 'floatX=float64'. # while we want all other runs to run with 'floatX=float64'.
FLAGS=${FLAGS},device=cpu,floatX=float64 FLAGS=${FLAGS},device=cpu,floatX=float64
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
# Only use elements in the cache for < 7 days # Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800 FLAGS=${FLAGS},cmodule.age_thresh_use=604800
......
...@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda ...@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
...@@ -79,6 +80,9 @@ FLAGS=${FLAGS},device=cpu,floatX=float64 ...@@ -79,6 +80,9 @@ FLAGS=${FLAGS},device=cpu,floatX=float64
# Only use elements in the cache for < 7 days # Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800 FLAGS=${FLAGS},cmodule.age_thresh_use=604800
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
#we change the seed and record it everyday to test different combination. We record it to be able to reproduce bug caused by different seed. We don't want multiple test in DEBUG_MODE each day as this take too long. #we change the seed and record it everyday to test different combination. We record it to be able to reproduce bug caused by different seed. We don't want multiple test in DEBUG_MODE each day as this take too long.
seed=$RANDOM seed=$RANDOM
echo "Executing tests with mode=DEBUG_MODE with seed of the day $seed" echo "Executing tests with mode=DEBUG_MODE with seed of the day $seed"
......
...@@ -8,6 +8,7 @@ export THEANO_FLAGS=init_gpu_device=cuda ...@@ -8,6 +8,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
...@@ -19,4 +20,4 @@ echo ...@@ -19,4 +20,4 @@ echo
FILE=${BUILDBOT_DIR}/theano_python3_tests.xml FILE=${BUILDBOT_DIR}/theano_python3_tests.xml
set -x set -x
PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE} PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,magma.enabled=true python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE}
...@@ -5,9 +5,6 @@ ...@@ -5,9 +5,6 @@
# Print commands as they are executed # Print commands as they are executed
set -x set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# Test flake8 # Test flake8
echo "===== Testing flake8" echo "===== Testing flake8"
bin/theano-nose theano/tests/test_flake8.py --with-xunit --xunit-file=theano_pre_tests.xml || exit 1 bin/theano-nose theano/tests/test_flake8.py --with-xunit --xunit-file=theano_pre_tests.xml || exit 1
......
...@@ -5,9 +5,6 @@ ...@@ -5,9 +5,6 @@
# Print commands as they are executed # Print commands as they are executed
set -x set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
echo "===== Testing theano core" echo "===== Testing theano core"
# Test theano core # Test theano core
......
...@@ -5,11 +5,9 @@ ...@@ -5,11 +5,9 @@
# Print commands as they are executed # Print commands as they are executed
set -x set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
...@@ -54,4 +52,5 @@ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \ ...@@ -54,4 +52,5 @@ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray \ theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray \
theano/scan_module/tests/test_scan_checkpoints.py:TestScanCheckpoint.test_memory" theano/scan_module/tests/test_scan_checkpoints.py:TestScanCheckpoint.test_memory"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800" FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800"
FLAGS=${FLAGS},magma.enabled=true # Enable magma GPU library
THEANO_FLAGS=${FLAGS} time nosetests --with-xunit --xunit-file=theanogpuarray_tests.xml ${THEANO_GPUARRAY_TESTS} THEANO_FLAGS=${FLAGS} time nosetests --with-xunit --xunit-file=theanogpuarray_tests.xml ${THEANO_GPUARRAY_TESTS}
...@@ -271,6 +271,7 @@ class OpFromGraph(gof.Op): ...@@ -271,6 +271,7 @@ class OpFromGraph(gof.Op):
is_inline = self.is_inline is_inline = self.is_inline
return '%(name)s{inline=%(is_inline)s}' % locals() return '%(name)s{inline=%(is_inline)s}' % locals()
@theano.configparser.change_flags(compute_test_value='off')
def _recompute_grad_op(self): def _recompute_grad_op(self):
''' '''
converts self._grad_op from user supplied form to type(self) instance converts self._grad_op from user supplied form to type(self) instance
......
...@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division ...@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
from functools import partial from functools import partial
import numpy as np import numpy as np
import theano
from theano import config, shared from theano import config, shared
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
...@@ -313,3 +314,14 @@ class T_OpFromGraph(unittest_tools.InferShapeTester): ...@@ -313,3 +314,14 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
[np.ones([3, 4], dtype=config.floatX), [np.ones([3, 4], dtype=config.floatX),
np.ones([3, 4], dtype=config.floatX)], np.ones([3, 4], dtype=config.floatX)],
OpFromGraph) OpFromGraph)
@theano.configparser.change_flags(compute_test_value='raise')
def test_compute_test_value(self):
    # Regression test: building an OpFromGraph and taking its gradient
    # must propagate `tag.test_value` through the op when
    # compute_test_value='raise' (the decorator forces that mode so a
    # missing test value would raise instead of passing silently).
    x = T.scalar('x')
    x.tag.test_value = np.array(1., dtype=config.floatX)
    # Build the op from a graph on `x`...
    op = OpFromGraph([x], [x ** 3])
    # ...then apply it to a DIFFERENT variable `y` that carries its own
    # test value, and differentiate through the op.
    y = T.scalar('y')
    y.tag.test_value = np.array(1., dtype=config.floatX)
    f = op(y)
    grad_f = T.grad(f, y)
    # The gradient node must have received a computed test value.
    assert grad_f.tag.test_value is not None
...@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo): ...@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.') '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
return True return True
# Those are the options provided by Theano to choose algorithms at runtime.
SUPPORTED_DNN_CONV_ALGO_RUNTIME = ('guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change')
# Those are the supported algorithm by Theano, # Those are the supported algorithm by Theano,
# The tests will reference those lists. # The tests will reference those lists.
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change') SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small', SUPPORTED_DNN_CONV_PRECISION = ('as_input_f32', 'as_input', 'float16', 'float32', 'float64')
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
AddConfigVar('dnn.conv.algo_bwd', AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and " "This flag is deprecated; use dnn.conv.algo_bwd_data and "
...@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision', ...@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in cuDNN " "Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the " "convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).", "convolutions, or float32 if inputs are float16).",
EnumStr('as_input_f32', 'as_input', 'float16', 'float32', EnumStr(*SUPPORTED_DNN_CONV_PRECISION),
'float64'),
in_c_key=False) in_c_key=False)
......
...@@ -1413,7 +1413,10 @@ class COp(Op): ...@@ -1413,7 +1413,10 @@ class COp(Op):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return hash(tuple(self.func_codes)) version = (hash(tuple(self.func_codes)), )
if hasattr(self, 'params_type'):
version += (self.params_type.c_code_cache_version(), )
return version
def c_init_code(self): def c_init_code(self):
""" """
......
...@@ -963,6 +963,12 @@ class EnumType(Type, dict): ...@@ -963,6 +963,12 @@ class EnumType(Type, dict):
""" """
return alias in self.aliases return alias in self.aliases
def get_aliases(self):
"""
Return the list of all aliases in this enumeration.
"""
return self.aliases.keys()
def __repr__(self): def __repr__(self):
names_to_aliases = {constant_name: '' for constant_name in self} names_to_aliases = {constant_name: '' for constant_name in self}
for alias in self.aliases: for alias in self.aliases:
...@@ -1184,4 +1190,6 @@ class CEnumType(EnumList): ...@@ -1184,4 +1190,6 @@ class CEnumType(EnumList):
fail=sub['fail']) fail=sub['fail'])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version()) # C code depends on (C constant name, Python value) associations (given by `self.items()`),
# so we should better take them into account in C code version.
return (1, tuple(self.items()), super(CEnumType, self).c_code_cache_version())
#section support_code_apply #section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) { cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) {
cudnnStatus_t err; cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2}; int pad[3] = {params->pad0, params->pad1, params->pad2};
int strides[3] = {SUB_0, SUB_1, SUB_2}; int strides[3] = {params->sub0, params->sub1, params->sub2};
int dilation[3] = {DIL_0, DIL_1, DIL_2}; int dilation[3] = {params->dil0, params->dil1, params->dil2};
#if BORDER_MODE == 0 if (params->bmode == BORDER_MODE_FULL) {
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0; pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1; pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
#if NB_DIMS > 2 if (params->nb_dims > 2) {
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2; pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
#endif }
#elif BORDER_MODE == 2 } else if(params->bmode == BORDER_MODE_HALF) {
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0 + 1) / 2; pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1 + 1) / 2; pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
#if NB_DIMS > 2 if (params->nb_dims > 2) {
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2 + 1) / 2; pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
#endif }
#endif }
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) { if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: " PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS, "expected %d, got %lld.", params->nb_dims,
(long long)PyArray_DIM(filt_shp, 0)); (long long)PyArray_DIM(filt_shp, 0));
return -1; return -1;
} }
...@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, ...@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1; return -1;
} }
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
dilation, CONV_MODE, PRECISION); dilation, params->conv_mode, params->precision);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not set convolution " PyErr_Format(PyExc_MemoryError, "could not set convolution "
"descriptor: %s", cudnnGetErrorString(err)); "descriptor: %s", cudnnGetErrorString(err));
......
"""
Declarations of cuDNN types and constants used in Theano gpuarray DNN module.
For every cuDNN API supported by Theano, this module defines a class that
provides the set of cuDNN definitions to be used in Theano Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1
- v6.0
"""
from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51(object):
    """
    cuDNN definitions (enum types and algorithm lists) for the cuDNN v5.1 API.

    Each ``CEnumType`` attribute mirrors the corresponding cuDNN C enum;
    the second element of each pair is the Theano-side alias for that
    constant. Entries listed without an alias are cuDNN constants that
    Theano does not implement (see module-level note).
    """
    # Major cuDNN version these definitions target.
    version = 5

    cudnnConvolutionMode_t = CEnumType(('CUDNN_CONVOLUTION', 'conv'),
                                       ('CUDNN_CROSS_CORRELATION', 'cross'),
                                       ctype='cudnnConvolutionMode_t')

    cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
                                ('CUDNN_DATA_DOUBLE', 'float64'),
                                ('CUDNN_DATA_HALF', 'float16'),
                                # CUDNN_DATA_INT8 # new in v6
                                # CUDNN_DATA_INT32 # new in v6
                                # CUDNN_DATA_INT8x4 # new in v6
                                ctype='cudnnDataType_t')

    cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM', 'small'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_GEMM', 'large'),
                                          # not implemented:
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_FFT', 'fft'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING', 'fft_tiling'),
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD', 'winograd'),
                                          # TODO: Not yet tested/documented:
                                          ('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                          ctype='cudnnConvolutionFwdAlgo_t')
    # Forward algorithms usable for 3D convolutions (subset of the enum above).
    conv3d_fwd_algorithms = ('none', 'small', 'fft_tiling')

    cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
                                                # TODO: not yet tested/documented:
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                                ctype='cudnnConvolutionBwdFilterAlgo_t')
    # Backward-filter algorithms usable for 3D convolutions.
    conv3d_bwd_filter_algorithms = ('none', 'small')

    cudnnConvolutionBwdDataAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_DATA_ALGO_0', 'none'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_1', 'deterministic'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT', 'fft'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING', 'fft_tiling'),
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD', 'winograd'),
                                              # TODO: not yet tested/documented:
                                              ('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                              ctype='cudnnConvolutionBwdDataAlgo_t')
    # Backward-data algorithms usable for 3D convolutions.
    conv3d_bwd_data_algorithms = ('none', 'deterministic', 'fft_tiling')

    cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
                                   ctype='cudnnPoolingMode_t')

    cudnnSoftmaxAlgorithm_t = CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'),
                                        ('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
                                        ('CUDNN_SOFTMAX_LOG', 'log'),
                                        ctype='cudnnSoftmaxAlgorithm_t')

    cudnnSoftmaxMode_t = CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
                                   ('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
                                   ctype='cudnnSoftmaxMode_t')

    cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
                                     ('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
                                     ctype='cudnnBatchNormMode_t')
class CuDNNV6(CuDNNV51):
    """
    cuDNN definitions for the cuDNN v6.0 API.

    Inherits all v5.1 definitions and overrides only the enums that
    gained new members in v6 (pooling modes and backward-filter
    convolution algorithms).
    """
    # Major cuDNN version these definitions target.
    version = 6

    cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
                                   ('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
                                   # new in v6:
                                   ('CUDNN_POOLING_MAX_DETERMINISTIC', 'max_deterministic'),
                                   ctype='cudnnPoolingMode_t')

    cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
                                                # not implemented:
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
                                                # TODO: not yet tested/documented:
                                                # new in v6:
                                                ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
                                                ctype='cudnnConvolutionBwdFilterAlgo_t')
def get_definitions(cudnn_version=None):
    """
    Return cuDNN definitions to be used by Theano for the given cuDNN version.

    ``cudnn_version`` must be None or an integer
    (typically the version returned by :func:`theano.gpuarray.dnn.version`).
    If None, return definitions for the most recent supported cuDNN version.
    """
    # cuDNN encodes versions as e.g. 5110 for v5.1: the thousands digit
    # is the major release, which is all we dispatch on here.
    if cudnn_version is None or cudnn_version // 1000 != 5:
        # Default: definitions for the last supported cuDNN version.
        return CuDNNV6()
    return CuDNNV51()
差异被折叠。
...@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp, ...@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean, PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon, PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale, PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, cudnnHandle_t _handle) { PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
...@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp, ...@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta; betaParam = (void *)&fbeta;
} }
cudnnStatus_t err = cudnnBatchNormalizationBackward( cudnnStatus_t err = cudnnBatchNormalizationBackward(
_handle, params->handle,
MODE, params->mode,
alphaData, alphaData,
betaData, betaData,
alphaParam, alphaParam,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean, PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon, PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, cudnnHandle_t _handle) { PyGpuArrayObject **outp, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
...@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1; return 1;
} }
#ifdef INPLACE_OUTPUT if (params->inplace) {
Py_XDECREF(*outp); Py_XDECREF(*outp);
*outp = inp; *outp = inp;
Py_INCREF(*outp); Py_INCREF(*outp);
#else } else {
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0) if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
#endif }
if (c_set_tensorNd(*outp, bn_output) != 0) if (c_set_tensorNd(*outp, bn_output) != 0)
return 1; return 1;
...@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta; beta = (void *)&fbeta;
} }
cudnnStatus_t err = cudnnBatchNormalizationForwardInference( cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
_handle, params->handle,
MODE, params->mode,
alpha, alpha,
beta, beta,
bn_input, bn_input,
......
#section init_code_struct #section init_code_struct
#ifdef CHOOSE_ALGO if (PARAMS->choose_algo) {
reuse_algo = 0; reuse_algo = 0;
prev_algo = CONV_ALGO; prev_algo = PARAMS->conv_algo;
#ifndef CHOOSE_ONCE if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif }
#endif }
#section support_code_struct #section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo; int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo; cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5]; size_t prev_img_dims[5];
size_t prev_kern_dims[5]; size_t prev_kern_dims[5];
#endif
#endif
int int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, double alpha, double beta,
PyGpuArrayObject **output, PyGpuArrayObject **output,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context; PyGpuContextObject *c = input->context;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
...@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
#ifdef CONV_INPLACE if (params->inplace) {
Py_XDECREF(*output); Py_XDECREF(*output);
*output = om; *output = om;
Py_INCREF(*output); Py_INCREF(*output);
#else } else {
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0) om->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*output, om)) if (beta != 0.0 && pygpu_move(*output, om))
return 1; return 1;
#endif }
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0); int err2 = GpuArray_memset(&(*output)->ga, 0);
...@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); if (params->choose_algo) {
if (err2 != GA_NO_ERROR) { if (params->choose_once) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " reuse_algo = 1;
"memory information on the GPU"); for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
cuda_exit(c->ctx); reuse_algo = (reuse_algo &&
return 1; PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
} }
// Guess 4Mb if the info is not available if (!reuse_algo) {
if (free == 0) free = 4 * 1024 * 1024; size_t free;
#ifdef CHOOSE_TIME int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
int count; if (err2 != GA_NO_ERROR) {
cudnnConvolutionFwdAlgoPerf_t choice; PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
gpudata *tmpmem; "memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); // Guess 4Mb if the info is not available
if (tmpmem == NULL) { if (free == 0) free = 4 * 1024 * 1024;
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1; if (params->choose_time) {
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
} }
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (params->choose_once) {
PyErr_Format(PyExc_RuntimeError, reuse_algo = 1;
"error selecting convolution algo: %s", } else {
cudnnGetErrorString(err)); for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
cuda_exit(c->ctx); prev_img_dims[i] = PyGpuArray_DIM(input, i);
return 1; prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
} }
algo = choice.algo;
#else
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} }
#endif
prev_algo = algo;
} else {
algo = prev_algo;
} }
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */ /* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 && if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
...@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
{
if (stride[0] != 1 || stride[1] != 1 || if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{ {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} }
} } else {
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1) if (stride[0] != 1 || stride[1] != 1) {
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} }
} }
...@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
...@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning // TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
...@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
_handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
#section init_code_struct #section init_code_struct
#ifdef CHOOSE_ALGO // #ifdef CHOOSE_ALGO
reuse_algo = 0; if (PARAMS->choose_algo) {
prev_algo = CONV_ALGO; reuse_algo = 0;
#ifndef CHOOSE_ONCE prev_algo = PARAMS->conv_algo;
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); // #ifndef CHOOSE_ONCE
memset(prev_top_dims, 0, sizeof(prev_top_dims)); if (!PARAMS->choose_once) {
#endif memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
// #endif
}
// #endif
#section support_code_struct #section support_code_struct
#ifdef CHOOSE_ALGO int reuse_algo;
int reuse_algo = 0; cudnnConvolutionBwdDataAlgo_t prev_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
size_t prev_kern_dims[5] = {0}; size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0}; size_t prev_top_dims[5] = {0};
#endif
#endif
int int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im, PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input, double alpha, double beta, PyGpuArrayObject **input,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = kerns->context; PyGpuContextObject *c = kerns->context;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
...@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
#ifdef CONV_INPLACE // #ifdef CONV_INPLACE
Py_XDECREF(*input); if (params->inplace) {
*input = im; Py_XDECREF(*input);
Py_INCREF(*input); *input = im;
#else Py_INCREF(*input);
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), // #else
im->ga.typecode, GA_C_ORDER, c) != 0) } else {
return 1; if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
if (beta != 0.0 && pygpu_move(*input, im)) im->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
#endif if (beta != 0.0 && pygpu_move(*input, im))
return 1;
}
// #endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0); int err2 = GpuArray_memset(&(*input)->ga, 0);
...@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
#ifdef CHOOSE_ALGO // #ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE if (params->choose_algo) {
reuse_algo = 1; // #ifndef CHOOSE_ONCE
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { if (!params->choose_once) {
reuse_algo = (reuse_algo && reuse_algo = 1;
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]); PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
} reuse_algo = (reuse_algo &&
#endif PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// #endif
if (!reuse_algo) { if (!reuse_algo) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) { if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU"); "memory information on the GPU");
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// Guess 4Mb if the info is not available // Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024; if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME // #ifdef CHOOSE_TIME
int count; if (params->choose_time) {
cudnnConvolutionBwdDataAlgoPerf_t choice; int count;
gpudata *tmpmem; cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1; return -1;
} }
err = cudnnFindConvolutionBackwardDataAlgorithmEx( err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (err != CUDNN_STATUS_SUCCESS) { algo = choice.algo;
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", // #else
cudnnGetErrorString(err)); } else {
cuda_exit(c->ctx); err = cudnnGetConvolutionBackwardDataAlgorithm(
return 1; params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
} }
// #endif
algo = choice.algo; prev_algo = algo;
#else } else {
err = cudnnGetConvolutionBackwardDataAlgorithm( algo = prev_algo;
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} }
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE // #ifdef CHOOSE_ONCE
reuse_algo = 1; if (params->choose_once) {
#else reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { // #else
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } else {
prev_top_dims[i] = PyGpuArray_DIM(output, i); for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
// #endif
} }
#endif // #endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation // with a spatial dimension larger than 1024. The tiled-FFT implementation
...@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize); APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
_handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
#section init_code_struct #section init_code_struct
#ifdef CHOOSE_ALGO if (PARAMS->choose_algo) {
reuse_algo = 0; reuse_algo = 0;
prev_algo = CONV_ALGO; prev_algo = PARAMS->conv_algo;
#ifndef CHOOSE_ONCE if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims)); memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif }
#endif }
#section support_code_struct #section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo; int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo; cudnnConvolutionBwdFilterAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5]; size_t prev_img_dims[5];
size_t prev_top_dims[5]; size_t prev_top_dims[5];
#endif
#endif
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns, double alpha, double beta, PyGpuArrayObject **kerns,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context; PyGpuContextObject *c = input->context;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
...@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
#ifdef CONV_INPLACE if (params->inplace) {
Py_XDECREF(*kerns); Py_XDECREF(*kerns);
*kerns = km; *kerns = km;
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else } else {
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0) km->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*kerns, km)) if (beta != 0.0 && pygpu_move(*kerns, km))
return 1; return 1;
#endif }
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) { if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0); int err2 = GpuArray_memset(&(*kerns)->ga, 0);
...@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
#ifdef CHOOSE_ALGO if (params->choose_algo) {
#ifndef CHOOSE_ONCE if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]); PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]); PyGpuArray_DIM(output, i) == prev_top_dims[i]);
} }
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
} }
// Guess 4Mb if the info is not available if (!reuse_algo) {
if (free == 0) free = 4 * 1024 * 1024; size_t free;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx( int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), if (err2 != GA_NO_ERROR) {
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns), "memory information on the GPU");
1, &count, &choice, *(void **)tmpmem, free); cuda_exit(c->ctx);
gpudata_release(tmpmem); return 1;
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, // Guess 4Mb if the info is not available
"error selecting convolution algo: %s", if (free == 0) free = 4 * 1024 * 1024;
cudnnGetErrorString(err));
cuda_exit(c->ctx); if (params->choose_time) {
return 1; int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
} }
algo = choice.algo; if (params->choose_once) {
#else reuse_algo = 1;
err = cudnnGetConvolutionBackwardFilterAlgorithm( } else {
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
desc, APPLY_SPECIFIC(kerns), prev_img_dims[i] = PyGpuArray_DIM(input, i);
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); prev_top_dims[i] = PyGpuArray_DIM(output, i);
if (err != CUDNN_STATUS_SUCCESS) { }
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} }
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
#endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. // with a spatial dimension larger than 1024.
...@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
_handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
...@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride, PyArrayObject *stride,
PyArrayObject *pad, PyArrayObject *pad,
PyGpuArrayObject **out, PyGpuArrayObject **out,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = img->context; PyGpuContextObject *c = img->context;
size_t dims[5]; size_t dims[5];
cudnnStatus_t err; cudnnStatus_t err;
...@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0) if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1; return 1;
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s); err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
...@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward( err = cudnnPoolingForward(
_handle, APPLY_SPECIFIC(pool), params->handle, APPLY_SPECIFIC(pool),
alpha, alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta, beta,
......
...@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride, PyArrayObject *stride,
PyArrayObject *pad, PyArrayObject *pad,
PyGpuArrayObject **inp_grad, PyGpuArrayObject **inp_grad,
cudnnHandle_t _handle) { PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
cudnnStatus_t err; cudnnStatus_t err;
...@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i)); s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
} }
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s); err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
...@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward( err = cudnnPoolingBackward(
_handle, APPLY_SPECIFIC(pool), params->handle, APPLY_SPECIFIC(pool),
alpha, alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad), APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
...@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode) return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (5,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
...@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x) flags = Kernel.get_flags(dtype_x)
code = """ code = """
KERNEL void %(kname)s(float* input, float* output, KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_y,
ga_ssize inputStrides_z, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ, ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){ const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY; int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int idZ = blockIdx.z + offsetZ;
...@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast]; output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
} }
""" % locals() """ % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE, params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
...@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op):
# blockCumOp # blockCumOp
kname = "k_blockCumOp" kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, ] 'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """ code = """
// helper functions // helper functions
WITHIN_KERNEL WITHIN_KERNEL
...@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1]; output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
} }
KERNEL void k_blockCumOp(float* input, float* output, KERNEL void k_blockCumOp(float* input, ga_size input_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x, float* output, ga_size output_offset,
ga_ssize inputStrides_y, ga_ssize inputStrides_z, size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_z, int offsetY, ga_ssize outputStrides_x, ga_ssize outputStrides_y,
int offsetZ, float* blockSum) { ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis. // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case. // The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
...@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op):
kname = "k_finalCumOp" kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename k_var = "k_finalCumOp_" + nodename
code = """ code = """
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp, KERNEL void k_finalCumOp(float* output, ga_size output_offset,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, float* blockSum, ga_size blockSum_offset,
int offsetY, int offsetZ) { size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process. // Check if current has data to process.
...@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] %(op)s= currentBlockSum; output[idx_odd] %(op)s= currentBlockSum;
} }
""" % locals() """ % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ] 'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
...@@ -381,7 +398,9 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -381,7 +398,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block. size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float); size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data, void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data, (void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp, (void*) &nbElementsPerCumOp,
(void*) &inputStrides_x, (void*) &inputStrides_x,
(void*) &inputStrides_y, (void*) &inputStrides_y,
...@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &outputStrides_z, (void*) &outputStrides_z,
(void*) &offsetY, (void*) &offsetY,
(void*) &offsetZ, (void*) &offsetZ,
(void*) deviceBlockSum->ga.data (void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
}; };
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params); int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
...@@ -410,7 +430,9 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -410,7 +430,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data, void* kernel_params[] = {(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) deviceBlockSum->ga.data, (void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp, (void*) &nbElementsPerCumOp,
(void*) &outputStrides_x, (void*) &outputStrides_x,
(void*) &outputStrides_y, (void*) &outputStrides_y,
...@@ -431,7 +453,9 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -431,7 +453,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t tmp0 = shape[axis]-2; size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1; size_t tmp1 = shape[axis]-1;
void* kernel_params[] = {(void*) input->ga.data, void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data, (void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x, (void*) &inputStrides_x,
(void*) &inputStrides_y, (void*) &inputStrides_y,
(void*) &inputStrides_z, (void*) &inputStrides_z,
......
...@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including() ...@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
mode_with_gpu.check_py_code = False mode_with_gpu.check_py_code = False
# This variable will store the list of pooling modes available with the current runtime cuDNN version.
# Don't use this variable directly, always call `get_dnn_pool_modes()` instead.
dnn_pool_modes = None
def get_dnn_pool_modes():
# This function is called only by pooling tests to initialize and/or get dnn_pool_modes.
global dnn_pool_modes
if dnn_pool_modes is None:
from .. import cudnn_defs
dnn_pool_modes = cudnn_defs.get_definitions(dnn.version(raises=False)).cudnnPoolingMode_t.get_aliases()
return dnn_pool_modes
# If using float16, set CUDNN precision to float32 # If using float16, set CUDNN precision to float32
def set_precision(floatX): def set_precision(floatX):
if floatX == "float16": if floatX == "float16":
...@@ -155,11 +169,7 @@ def test_pooling(): ...@@ -155,11 +169,7 @@ def test_pooling():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.tensor4() x = T.tensor4()
for mode, pad in product(modes, for mode, pad in product(modes,
...@@ -242,7 +252,9 @@ def test_pooling(): ...@@ -242,7 +252,9 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()]) for node in fg.maker.fgraph.toposort()])
def test_pooling_with_tensor_vars(): # This test will be run with different values of 'mode'
# (see next test below).
def run_pooling_with_tensor_vars(mode):
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
...@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars(): ...@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
ws = theano.shared(np.array([2, 2], dtype='int32')) ws = theano.shared(np.array([2, 2], dtype='int32'))
stride = theano.shared(np.array([1, 1], dtype='int32')) stride = theano.shared(np.array([1, 1], dtype='int32'))
pad = theano.shared(np.array([0, 0], dtype='int32')) pad = theano.shared(np.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x): def fn(x):
dnn_op = dnn.dnn_pool( dnn_op = dnn.dnn_pool(
...@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars(): ...@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
i += 1 i += 1
def test_pooling_with_tensor_vars():
# Let's test for mode 'max' and also for 'max_deterministic' if available.
for mode in [m for m in get_dnn_pool_modes() if m in ('max', 'max_deterministic')]:
yield (run_pooling_with_tensor_vars, mode)
def test_pooling3d(): def test_pooling3d():
# 3d pooling requires version 3 or newer. # 3d pooling requires version 3 or newer.
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000: if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000:
...@@ -307,11 +324,7 @@ def test_pooling3d(): ...@@ -307,11 +324,7 @@ def test_pooling3d():
mode_without_gpu_ref = theano.compile.mode.get_mode( mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpuarray') 'FAST_RUN').excluding('gpuarray')
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.tensor5() x = T.tensor5()
for mode, pad in product(modes, for mode, pad in product(modes,
...@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions(): ...@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
for n_non_pool_dims in (0, 1, 2, 3): for n_non_pool_dims in (0, 1, 2, 3):
for ws in ((2, 2), (3, 3, 3)): for ws in ((2, 2), (3, 3, 3)):
...@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions(): ...@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
fc = theano.function([], out, mode=mode_without_gpu) fc = theano.function([], out, mode=mode_without_gpu)
assert any([isinstance(node.op, Pool) assert any([isinstance(node.op, Pool)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
if mode == 'max': if mode in ('max', 'max_deterministic'):
assert any([isinstance(node.op, MaxPoolGrad) assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
else: else:
...@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX dtype=theano.config.floatX
) )
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
for params in product( for params in product(
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
...@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX dtype=theano.config.floatX
) )
# 'average_exc_pad' is disabled for versions < 4004 modes = get_dnn_pool_modes()
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
for params in product( for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)], [(1, 1, 1), (2, 2, 2), (3, 3, 3)],
...@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product( for params in product(
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad'] # modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
): ):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])( pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img, img,
...@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product( for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)], [(1, 1, 1), (2, 2, 2), (3, 3, 3)],
[(1, 1, 1), (2, 2, 2), (3, 3, 3)], [(1, 1, 1), (2, 2, 2), (3, 3, 3)],
['max', 'average_inc_pad'] # modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
): ):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])( pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img, img,
......
...@@ -5710,41 +5710,50 @@ def local_opt_alloc(node): ...@@ -5710,41 +5710,50 @@ def local_opt_alloc(node):
if node_inps.owner and isinstance(node_inps.owner.op, T.Alloc): if node_inps.owner and isinstance(node_inps.owner.op, T.Alloc):
input = node_inps.owner.inputs[0] input = node_inps.owner.inputs[0]
shapes = node_inps.owner.inputs[1:] shapes = node_inps.owner.inputs[1:]
if (node.op.axis is None or try:
node.op.axis == tuple(range(input.ndim))): val = get_scalar_constant_value(input,
try: only_process_constants=True)
val = get_scalar_constant_value(input, assert val.size == 1
only_process_constants=True) val = val.reshape(1)[0]
assert val.size == 1 # check which type of op
# check which type of op size = T.mul(*shapes)
casted = T.mul(*shapes).astype(str(input.dtype)) if input.dtype in ["float16", "float32"]:
# shapes are ints and normally int64.
# We don't want to have a float64 upcast
# We don't want to downcast to float16
# as we fear it could loose too much precision
# that will be amplified by the mul/pow below.
size = size.astype('float32')
if (node.op.axis is None or
node.op.axis == tuple(range(input.ndim))):
if isinstance(node.op, T.Sum): if isinstance(node.op, T.Sum):
val = val.reshape(1)[0] * casted val = val * size
else: else:
val = val.reshape(1)[0] ** casted val = val ** size
# Sum can change the input dtype (upcast or bool
# -> float32) by default or by user request.
# We can ignore the acc_dtype, as there is only 1
# elemwise we will do and not a sequence, so there is no
# accumulation of errors.
# So mostly, we just need to cast the output to the old
# dtype.
val = val.astype(node.outputs[0].dtype)
return [val] return [val]
to_prod = [shapes[i] for i in xrange(len(shapes))
except NotScalarConstantError: if i in node.op.axis]
pass if to_prod:
else: size = T.mul(*to_prod)
try: if isinstance(node.op, T.Sum):
val = get_scalar_constant_value(input, val *= size
only_process_constants=True) else:
assert val.size == 1 val = val ** size
val = val.reshape(1)[0] # See comments above.
to_prod = [shapes[i] for i in xrange(len(shapes)) val = val.astype(node.outputs[0].dtype)
if i in node.op.axis] return [T.alloc(val,
if to_prod: *[shapes[i] for i in xrange(len(shapes))
casted = T.mul(*to_prod).astype(str(input.dtype)) if i not in node.op.axis])]
if isinstance(node.op, T.Sum): except NotScalarConstantError:
val *= casted pass
else:
val = val ** casted
return [T.alloc(val,
*[shapes[i] for i in xrange(len(shapes))
if i not in node.op.axis])]
except NotScalarConstantError:
pass
@register_specialize @register_specialize
......
...@@ -59,7 +59,8 @@ def pool_2d(input, ws=None, ignore_border=None, stride=None, pad=(0, 0), ...@@ -59,7 +59,8 @@ def pool_2d(input, ws=None, ignore_border=None, stride=None, pad=(0, 0),
stride : tuple of two ints or theano vector of ints of size 2. stride : tuple of two ints or theano vector of ints of size 2.
Stride size, which is the number of shifts over rows/cols to get the Stride size, which is the number of shifts over rows/cols to get the
next pool region. If stride is None, it is considered equal to ws next pool region. If stride is None, it is considered equal to ws
(no overlap on pooling regions). (no overlap on pooling regions), eg: stride=(1,1) will shifts over
one row and one col for every iteration.
pad : tuple of two ints or theano vector of ints of size 2. pad : tuple of two ints or theano vector of ints of size 2.
(pad_h, pad_w), pad zeros to extend beyond four borders of the (pad_h, pad_w), pad zeros to extend beyond four borders of the
images, pad_h is the size of the top and bottom margins, and images, pad_h is the size of the top and bottom margins, and
...@@ -433,6 +434,9 @@ class Pool(OpenMPOp): ...@@ -433,6 +434,9 @@ class Pool(OpenMPOp):
super(Pool, self).__init__(openmp=openmp) super(Pool, self).__init__(openmp=openmp)
self.ndim = ndim self.ndim = ndim
self.ignore_border = ignore_border self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']: if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError( raise ValueError(
"Pool mode parameter only support 'max', 'sum'," "Pool mode parameter only support 'max', 'sum',"
...@@ -1040,6 +1044,9 @@ class PoolGrad(OpenMPOp): ...@@ -1040,6 +1044,9 @@ class PoolGrad(OpenMPOp):
def __init__(self, ignore_border, mode='max', ndim=2, openmp=None): def __init__(self, ignore_border, mode='max', ndim=2, openmp=None):
self.ndim = ndim self.ndim = ndim
self.ignore_border = ignore_border self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool grad algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']: if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
raise ValueError( raise ValueError(
"Pool mode parameter only support 'max', 'sum'," "Pool mode parameter only support 'max', 'sum',"
......
...@@ -5558,9 +5558,11 @@ class T_local_sum_prod(unittest.TestCase): ...@@ -5558,9 +5558,11 @@ class T_local_sum_prod(unittest.TestCase):
class T_local_opt_alloc(unittest.TestCase): class T_local_opt_alloc(unittest.TestCase):
dtype = 'float32'
def test_sum_upcast(self): def test_sum_upcast(self):
s = theano.tensor.lscalar() s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s) a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64 orig = theano.config.warn_float64
theano.config.warn_float64 = "raise" theano.config.warn_float64 = "raise"
try: try:
...@@ -5571,7 +5573,7 @@ class T_local_opt_alloc(unittest.TestCase): ...@@ -5571,7 +5573,7 @@ class T_local_opt_alloc(unittest.TestCase):
def test_prod_upcast(self): def test_prod_upcast(self):
s = theano.tensor.lscalar() s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s) a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64 orig = theano.config.warn_float64
theano.config.warn_float64 = "raise" theano.config.warn_float64 = "raise"
try: try:
...@@ -5580,6 +5582,24 @@ class T_local_opt_alloc(unittest.TestCase): ...@@ -5580,6 +5582,24 @@ class T_local_opt_alloc(unittest.TestCase):
finally: finally:
theano.config.warn_float64 = orig theano.config.warn_float64 = orig
@theano.configparser.change_flags(on_opt_error='raise')
def test_sum_bool_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(True, dtype='bool'), s, s)
f = theano.function([s], a.sum())
f(5)
# test with user specified dtype
f = theano.function([s], a.sum(dtype=self.dtype))
f(5)
# test only 1 axis summed
f = theano.function([s], a.sum(axis=0, dtype=self.dtype))
f(5)
print(self.dtype)
class T_local_opt_alloc_f16(T_local_opt_alloc):
dtype = 'float16'
class T_local_reduce(unittest.TestCase): class T_local_reduce(unittest.TestCase):
def setUp(self): def setUp(self):
......
...@@ -804,8 +804,8 @@ theano.compile.register_specify_shape_c_code( ...@@ -804,8 +804,8 @@ theano.compile.register_specify_shape_c_code(
PyErr_Format(PyExc_AssertionError, PyErr_Format(PyExc_AssertionError,
"SpecifyShape: vector of shape has %%d elements," "SpecifyShape: vector of shape has %%d elements,"
" but the input has %%d dimensions.", " but the input has %%d dimensions.",
PyArray_NDIM(%(iname)s), PyArray_DIMS(%(shape)s)[0],
PyArray_DIMS(%(shape)s)[0]); PyArray_NDIM(%(iname)s));
%(fail)s; %(fail)s;
} }
for(int i = 0; i < PyArray_NDIM(%(iname)s); i++){ for(int i = 0; i < PyArray_NDIM(%(iname)s); i++){
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论