Commit 5d3433c7 authored by xiaoqie

Merge branch 'master' of https://github.com/Theano/Theano into gpureduce-fix

# Jenkins slave image for Theano GPU testing.
# Base: CUDA 8.0 + cuDNN 6 devel image on Ubuntu 16.04.
FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
ENV DEBIAN_FRONTEND=noninteractive

# Install all OS packages in a single layer:
#  - build tools and helpers for Theano builds/docs,
#  - OpenBLAS headers for magma,
#  - latex for the documentation tests,
#  - SSH server and a Java runtime for the Jenkins agent.
# `apt-get update` is combined with `install` (avoids stale-cache bug) and the
# apt lists are removed in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y \
        build-essential apt-utils wget git dvipng time \
        libopenblas-dev \
        texlive-latex-base texlive-latex-extra \
        openssh-server default-jre-headless \
    && rm -rf /var/lib/apt/lists/*

# Build and install the magma GPU linear-algebra library against OpenBLAS.
# OPENBLASDIR/CUDADIR are read by magma's make.inc.openblas template.
ENV OPENBLASDIR=/usr \
    CUDADIR=/usr/local/cuda
RUN wget http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz \
    && tar xvf magma-2.2.0.tar.gz \
    && cp magma-2.2.0/make.inc-examples/make.inc.openblas magma-2.2.0/make.inc \
    && (cd magma-2.2.0 && make && make install prefix=/usr/local) \
    && ldconfig \
    && rm -rf magma-2.2.0 magma-2.2.0.tar.gz

# Setup conda python for Theano; the installer is removed in the same layer.
RUN wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh \
    && bash Miniconda2-latest-Linux-x86_64.sh -p /miniconda -b \
    && rm Miniconda2-latest-Linux-x86_64.sh
ENV PATH=/miniconda/bin:${PATH}
RUN conda install numpy scipy sympy mkl nose pydot-ng graphviz cython cmake \
    && conda update -y conda
RUN pip install nose-timer parameterized "flake8<3" "sphinx==1.5.2" sphinx_rtd_theme \
    && pip install --upgrade-strategy only-if-needed git+https://github.com/lebedov/scikit-cuda.git

# Add the jenkins user and expose CUDA + conda on its interactive PATH.
RUN useradd -m -s /bin/bash jenkins \
    && echo jenkins:jenkins | chpasswd \
    && echo "export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/miniconda/bin:\$PATH" >> /home/jenkins/.bashrc \
    && echo "export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:\$LD_LIBRARY_PATH" >> /home/jenkins/.bashrc

# Fetch the Jenkins slave.jar into the container.
# BUG FIX: the original used a relative destination (`-P home/jenkins`) that
# only worked because the build working directory happens to be `/`.
RUN wget -P /home/jenkins http://earlgrey.iro.umontreal.ca:8080/jnlpJars/slave.jar \
    && chown -R jenkins:jenkins /home/jenkins

# Launch the Jenkins slave agent. Exec form so java runs as PID 1 and
# receives SIGTERM directly on `docker stop`.
CMD ["java", "-jar", "/home/jenkins/slave.jar"]
\ No newline at end of file
......@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -79,6 +80,9 @@ FLAGS=on_shape_error=raise,$FLAGS
# while we want all other runs to run with 'floatX=float64'.
FLAGS=${FLAGS},device=cpu,floatX=float64
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
# Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800
......
......@@ -6,6 +6,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -79,6 +80,9 @@ FLAGS=${FLAGS},device=cpu,floatX=float64
# Only use elements in the cache for < 7 days
FLAGS=${FLAGS},cmodule.age_thresh_use=604800
# Enable magma GPU library
FLAGS=${FLAGS},magma.enabled=true
#we change the seed and record it everyday to test different combination. We record it to be able to reproduce bug caused by different seed. We don't want multiple test in DEBUG_MODE each day as this take too long.
seed=$RANDOM
echo "Executing tests with mode=DEBUG_MODE with seed of the day $seed"
......
......@@ -8,6 +8,7 @@ export THEANO_FLAGS=init_gpu_device=cuda
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -19,4 +20,4 @@ echo
FILE=${BUILDBOT_DIR}/theano_python3_tests.xml
set -x
PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE}
PYTHONPATH= THEANO_FLAGS=$THEANO_FLAGS,compiledir=$HOME/.theano/buildbot_theano_python3,mode=FAST_COMPILE,warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,magma.enabled=true python3 bin/theano-nose ${THEANO_PARAM} ${XUNIT}${FILE}
......@@ -5,9 +5,6 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# Test flake8
echo "===== Testing flake8"
bin/theano-nose theano/tests/test_flake8.py --with-xunit --xunit-file=theano_pre_tests.xml || exit 1
......
......@@ -5,9 +5,6 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
echo "===== Testing theano core"
# Test theano core
......
......@@ -5,11 +5,9 @@
# Print commands as they are executed
set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export CPATH=/usr/local/cuda/include/:$CPATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -54,4 +52,5 @@ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray \
theano/scan_module/tests/test_scan_checkpoints.py:TestScanCheckpoint.test_memory"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800"
FLAGS=${FLAGS},magma.enabled=true # Enable magma GPU library
THEANO_FLAGS=${FLAGS} time nosetests --with-xunit --xunit-file=theanogpuarray_tests.xml ${THEANO_GPUARRAY_TESTS}
......@@ -271,6 +271,7 @@ class OpFromGraph(gof.Op):
is_inline = self.is_inline
return '%(name)s{inline=%(is_inline)s}' % locals()
@theano.configparser.change_flags(compute_test_value='off')
def _recompute_grad_op(self):
'''
converts self._grad_op from user supplied form to type(self) instance
......
......@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
from functools import partial
import numpy as np
import theano
from theano import config, shared
from theano.gradient import DisconnectedType
......@@ -313,3 +314,14 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
[np.ones([3, 4], dtype=config.floatX),
np.ones([3, 4], dtype=config.floatX)],
OpFromGraph)
@theano.configparser.change_flags(compute_test_value='raise')
def test_compute_test_value(self):
    # Regression test: with compute_test_value='raise', building a graph
    # through an OpFromGraph and taking its gradient must propagate test
    # values instead of raising.
    x = T.scalar('x')
    x.tag.test_value = np.array(1., dtype=config.floatX)
    # Wrap x ** 3 as an OpFromGraph so gradient computation goes through
    # the op's internal grad machinery.
    op = OpFromGraph([x], [x ** 3])
    y = T.scalar('y')
    y.tag.test_value = np.array(1., dtype=config.floatX)
    f = op(y)
    grad_f = T.grad(f, y)
    # The gradient variable should carry a computed test value.
    assert grad_f.tag.test_value is not None
......@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
return True
# Those are the options provided by Theano to choose algorithms at runtime.
SUPPORTED_DNN_CONV_ALGO_RUNTIME = ('guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change')
# Those are the supported algorithm by Theano,
# The tests will reference those lists.
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling', 'winograd') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small') + SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change')
SUPPORTED_DNN_CONV_PRECISION = ('as_input_f32', 'as_input', 'float16', 'float32', 'float64')
AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
......@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).",
EnumStr('as_input_f32', 'as_input', 'float16', 'float32',
'float64'),
EnumStr(*SUPPORTED_DNN_CONV_PRECISION),
in_c_key=False)
......
......@@ -1413,7 +1413,10 @@ class COp(Op):
return []
def c_code_cache_version(self):
return hash(tuple(self.func_codes))
version = (hash(tuple(self.func_codes)), )
if hasattr(self, 'params_type'):
version += (self.params_type.c_code_cache_version(), )
return version
def c_init_code(self):
"""
......
......@@ -963,6 +963,12 @@ class EnumType(Type, dict):
"""
return alias in self.aliases
def get_aliases(self):
    """
    Return the aliases registered in this enumeration.

    NOTE(review): this returns ``self.aliases.keys()`` directly — a list
    under Python 2 but a view object under Python 3; callers that need a
    concrete list should wrap the result in ``list()``.
    """
    return self.aliases.keys()
def __repr__(self):
names_to_aliases = {constant_name: '' for constant_name in self}
for alias in self.aliases:
......@@ -1184,4 +1190,6 @@ class CEnumType(EnumList):
fail=sub['fail'])
def c_code_cache_version(self):
return (1, super(CEnumType, self).c_code_cache_version())
# C code depends on (C constant name, Python value) associations (given by `self.items()`),
# so we should better take them into account in C code version.
return (1, tuple(self.items()), super(CEnumType, self).c_code_cache_version())
#section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) {
cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) {
cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2};
int strides[3] = {SUB_0, SUB_1, SUB_2};
int dilation[3] = {DIL_0, DIL_1, DIL_2};
int pad[3] = {params->pad0, params->pad1, params->pad2};
int strides[3] = {params->sub0, params->sub1, params->sub2};
int dilation[3] = {params->dil0, params->dil1, params->dil2};
#if BORDER_MODE == 0
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0;
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1;
#if NB_DIMS > 2
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2;
#endif
#elif BORDER_MODE == 2
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0 + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1 + 1) / 2;
#if NB_DIMS > 2
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2 + 1) / 2;
#endif
#endif
if (params->bmode == BORDER_MODE_FULL) {
pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
if (params->nb_dims > 2) {
pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
}
} else if(params->bmode == BORDER_MODE_HALF) {
pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
if (params->nb_dims > 2) {
pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
}
}
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS,
"expected %d, got %lld.", params->nb_dims,
(long long)PyArray_DIM(filt_shp, 0));
return -1;
}
......@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides,
dilation, CONV_MODE, PRECISION);
err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
dilation, params->conv_mode, params->precision);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not set convolution "
"descriptor: %s", cudnnGetErrorString(err));
......
"""
Declarations of cuDNN types and constants used in Theano gpuarray DNN module.
For every cuDNN API supported by Theano, this module defines a class that
provides the set of cuDNN definitions to be used in Theano Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1
- v6.0
"""
from __future__ import absolute_import, print_function, division
from theano.gof import CEnumType
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51(object):
version = 5
cudnnConvolutionMode_t = CEnumType(('CUDNN_CONVOLUTION', 'conv'),
('CUDNN_CROSS_CORRELATION', 'cross'),
ctype='cudnnConvolutionMode_t')
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'),
# CUDNN_DATA_INT8 # new in v6
# CUDNN_DATA_INT32 # new in v6
# CUDNN_DATA_INT8x4 # new in v6
ctype='cudnnDataType_t')
cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM', 'small'),
('CUDNN_CONVOLUTION_FWD_ALGO_GEMM', 'large'),
# not implemented:
('CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD', 'winograd'),
# TODO: Not yet tested/documented:
('CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionFwdAlgo_t')
conv3d_fwd_algorithms = ('none', 'small', 'fft_tiling')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
# TODO: not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
conv3d_bwd_filter_algorithms = ('none', 'small')
cudnnConvolutionBwdDataAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_DATA_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING', 'fft_tiling'),
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD', 'winograd'),
# TODO: not yet tested/documented:
('CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
ctype='cudnnConvolutionBwdDataAlgo_t')
conv3d_bwd_data_algorithms = ('none', 'deterministic', 'fft_tiling')
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
ctype='cudnnPoolingMode_t')
cudnnSoftmaxAlgorithm_t = CEnumType(('CUDNN_SOFTMAX_FAST', 'fast'),
('CUDNN_SOFTMAX_ACCURATE', 'accurate'),
('CUDNN_SOFTMAX_LOG', 'log'),
ctype='cudnnSoftmaxAlgorithm_t')
cudnnSoftmaxMode_t = CEnumType(('CUDNN_SOFTMAX_MODE_INSTANCE', 'instance'),
('CUDNN_SOFTMAX_MODE_CHANNEL', 'channel'),
ctype='cudnnSoftmaxMode_t')
cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
ctype='cudnnBatchNormMode_t')
class CuDNNV6(CuDNNV51):
version = 6
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
# new in v6:
('CUDNN_POOLING_MAX_DETERMINISTIC', 'max_deterministic'),
ctype='cudnnPoolingMode_t')
cudnnConvolutionBwdFilterAlgo_t = CEnumType(('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0', 'none'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1', 'deterministic'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT', 'fft'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3', 'small'),
# not implemented:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'),
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED', 'winograd_non_fused'),
# TODO: not yet tested/documented:
# new in v6:
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
ctype='cudnnConvolutionBwdFilterAlgo_t')
def get_definitions(cudnn_version=None):
    """
    Return cuDNN definitions to be used by Theano for the given cuDNN version.

    ``cudnn_version`` must be None or an integer
    (typically the version returned by :func:`theano.gpuarray.dnn.version`).
    If None, return definitions for the most recent supported cuDNN version.
    """
    # cuDNN encodes major*1000 + minor*100 + patch, so // 1000 extracts
    # the major version.
    if cudnn_version is not None and cudnn_version // 1000 == 5:
        return CuDNNV51()
    # By default, we use definitions for the last supported cuDNN version.
    return CuDNNV6()
Diff collapsed.
......@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationBackward(
_handle,
MODE,
params->handle,
params->mode,
alphaData,
betaData,
alphaParam,
......
......@@ -3,7 +3,7 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuArrayObject **outp, PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
} else {
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
}
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
......@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
_handle,
MODE,
params->handle,
params->mode,
alpha,
beta,
bn_input,
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
......@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
} else {
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
......@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (!reuse_algo) {
size_t free;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
......@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
} else {
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
if (stride[0] != 1 || stride[1] != 1) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
......@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
// #ifdef CHOOSE_ALGO
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
// #ifndef CHOOSE_ONCE
if (!PARAMS->choose_once) {
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
// #endif
}
// #endif
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo = 0;
cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0};
#endif
#endif
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = kerns->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
// #ifdef CONV_INPLACE
if (params->inplace) {
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
// #else
} else {
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
}
// #endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0);
......@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
// #ifdef CHOOSE_ALGO
if (params->choose_algo) {
// #ifndef CHOOSE_ONCE
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// #endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
// #ifdef CHOOSE_TIME
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
algo = choice.algo;
// #else
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
// #endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
// #ifdef CHOOSE_ONCE
if (params->choose_once) {
reuse_algo = 1;
// #else
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
// #endif
}
#endif
#endif
// #endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
......@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
#section init_code_struct
#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#endif
#endif
if (PARAMS->choose_algo) {
reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
if (!PARAMS->choose_once) {
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
}
#section support_code_struct
#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionBwdFilterAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_top_dims[5];
#endif
#endif
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
......@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (params->inplace) {
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
} else {
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0);
......@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
if (params->choose_algo) {
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
if (!reuse_algo) {
size_t free;
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) {
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
prev_algo = algo;
} else {
algo = prev_algo;
}
algo = choice.algo;
#else
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
......@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter(
_handle,
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = img->context;
size_t dims[5];
cudnnStatus_t err;
......@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
cudnnHandle_t _handle) {
PARAMS_TYPE* params) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err;
......@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
_handle, APPLY_SPECIFIC(pool),
params->handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
......@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self):
return (3,)
return (5,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
......@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op):
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = """
KERNEL void %(kname)s(float* input, float* output,
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x,
ga_ssize inputStrides_y,
ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
......@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc',
'intc', 'intc',
......@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op):
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, ]
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """
// helper functions
WITHIN_KERNEL
......@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumOp(float* input, float* output,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum) {
KERNEL void k_blockCumOp(float* input, ga_size input_offset,
float* output, ga_size output_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
......@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op):
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = """
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
......@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] %(op)s= currentBlockSum;
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params,
......@@ -381,7 +398,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
......@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) deviceBlockSum->ga.data
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
};
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){
......@@ -410,7 +430,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
......@@ -431,7 +453,9 @@ class GpuCumOp(GpuKernelBase, Op):
size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1;
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
......
......@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
mode_with_gpu.check_py_code = False
# This variable will store the list of pooling modes available with the current runtime cuDNN version.
# Don't use this variable directly, always call `get_dnn_pool_modes()` instead.
dnn_pool_modes = None
def get_dnn_pool_modes():
    """Lazily initialize and return the pooling modes supported by the
    runtime cuDNN version.

    Only pooling tests call this; the result is cached in the module-level
    ``dnn_pool_modes`` so the cudnn definitions are looked up at most once.
    """
    global dnn_pool_modes
    if dnn_pool_modes is None:
        # Deferred import: cudnn_defs is only needed the first time a
        # pooling test asks for the mode list.
        from .. import cudnn_defs
        defs = cudnn_defs.get_definitions(dnn.version(raises=False))
        dnn_pool_modes = defs.cudnnPoolingMode_t.get_aliases()
    return dnn_pool_modes
# If using float16, set CUDNN precision to float32
def set_precision(floatX):
if floatX == "float16":
......@@ -155,11 +169,7 @@ def test_pooling():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor4()
for mode, pad in product(modes,
......@@ -242,7 +252,9 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()])
def test_pooling_with_tensor_vars():
# This test will be run with different values of 'mode'
# (see next test below).
def run_pooling_with_tensor_vars(mode):
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
......@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
ws = theano.shared(np.array([2, 2], dtype='int32'))
stride = theano.shared(np.array([1, 1], dtype='int32'))
pad = theano.shared(np.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x):
dnn_op = dnn.dnn_pool(
......@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
i += 1
def test_pooling_with_tensor_vars():
    # Exercise pooling with tensor variables for mode 'max', and also for
    # 'max_deterministic' when the runtime cuDNN version provides it.
    wanted = ('max', 'max_deterministic')
    for mode in get_dnn_pool_modes():
        if mode in wanted:
            yield (run_pooling_with_tensor_vars, mode)
def test_pooling3d():
# 3d pooling requires version 3 or newer.
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 3000:
......@@ -307,11 +324,7 @@ def test_pooling3d():
mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpuarray')
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
x = T.tensor5()
for mode, pad in product(modes,
......@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
modes = get_dnn_pool_modes()
for n_non_pool_dims in (0, 1, 2, 3):
for ws in ((2, 2), (3, 3, 3)):
......@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
fc = theano.function([], out, mode=mode_without_gpu)
assert any([isinstance(node.op, Pool)
for node in fc.maker.fgraph.toposort()])
if mode == 'max':
if mode in ('max', 'max_deterministic'):
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
......@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1), (2, 2), (3, 3)],
......@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype=theano.config.floatX
)
# 'average_exc_pad' is disabled for versions < 4004
if dnn.version(raises=False) < 4004:
modes = ['max', 'average_inc_pad']
else:
modes = ['max', 'average_inc_pad', 'average_exc_pad']
modes = get_dnn_pool_modes()
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
......@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for params in product(
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
[(1, 1, 1), (2, 2, 2), (3, 3, 3)],
['max', 'average_inc_pad']
# modes without `average_exc_pad`
[m for m in get_dnn_pool_modes() if m != 'average_exc_pad']
):
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
......
......@@ -5710,41 +5710,50 @@ def local_opt_alloc(node):
if node_inps.owner and isinstance(node_inps.owner.op, T.Alloc):
input = node_inps.owner.inputs[0]
shapes = node_inps.owner.inputs[1:]
if (node.op.axis is None or
node.op.axis == tuple(range(input.ndim))):
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
# check which type of op
casted = T.mul(*shapes).astype(str(input.dtype))
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
val = val.reshape(1)[0]
# check which type of op
size = T.mul(*shapes)
if input.dtype in ["float16", "float32"]:
# shapes are ints and normally int64.
# We don't want to have a float64 upcast
# We don't want to downcast to float16
# as we fear it could loose too much precision
# that will be amplified by the mul/pow below.
size = size.astype('float32')
if (node.op.axis is None or
node.op.axis == tuple(range(input.ndim))):
if isinstance(node.op, T.Sum):
val = val.reshape(1)[0] * casted
val = val * size
else:
val = val.reshape(1)[0] ** casted
val = val ** size
# Sum can change the input dtype (upcast or bool
# -> float32) by default or by user request.
# We can ignore the acc_dtype, as there is only 1
# elemwise we will do and not a sequence, so there is no
# accumulation of errors.
# So mostly, we just need to cast the output to the old
# dtype.
val = val.astype(node.outputs[0].dtype)
return [val]
except NotScalarConstantError:
pass
else:
try:
val = get_scalar_constant_value(input,
only_process_constants=True)
assert val.size == 1
val = val.reshape(1)[0]
to_prod = [shapes[i] for i in xrange(len(shapes))
if i in node.op.axis]
if to_prod:
casted = T.mul(*to_prod).astype(str(input.dtype))
if isinstance(node.op, T.Sum):
val *= casted
else:
val = val ** casted
return [T.alloc(val,
*[shapes[i] for i in xrange(len(shapes))
if i not in node.op.axis])]
except NotScalarConstantError:
pass
to_prod = [shapes[i] for i in xrange(len(shapes))
if i in node.op.axis]
if to_prod:
size = T.mul(*to_prod)
if isinstance(node.op, T.Sum):
val *= size
else:
val = val ** size
# See comments above.
val = val.astype(node.outputs[0].dtype)
return [T.alloc(val,
*[shapes[i] for i in xrange(len(shapes))
if i not in node.op.axis])]
except NotScalarConstantError:
pass
@register_specialize
......
......@@ -59,7 +59,8 @@ def pool_2d(input, ws=None, ignore_border=None, stride=None, pad=(0, 0),
stride : tuple of two ints or theano vector of ints of size 2.
Stride size, which is the number of shifts over rows/cols to get the
next pool region. If stride is None, it is considered equal to ws
(no overlap on pooling regions).
(no overlap on pooling regions), eg: stride=(1,1) will shifts over
one row and one col for every iteration.
pad : tuple of two ints or theano vector of ints of size 2.
(pad_h, pad_w), pad zeros to extend beyond four borders of the
images, pad_h is the size of the top and bottom margins, and
......@@ -433,6 +434,9 @@ class Pool(OpenMPOp):
super(Pool, self).__init__(openmp=openmp)
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......@@ -1040,6 +1044,9 @@ class PoolGrad(OpenMPOp):
def __init__(self, ignore_border, mode='max', ndim=2, openmp=None):
self.ndim = ndim
self.ignore_border = ignore_border
if mode == 'max_deterministic':
# It seems max pool grad algo is already deterministic in CPU.
mode = 'max'
if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
......
......@@ -5558,9 +5558,11 @@ class T_local_sum_prod(unittest.TestCase):
class T_local_opt_alloc(unittest.TestCase):
dtype = 'float32'
def test_sum_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s)
a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64
theano.config.warn_float64 = "raise"
try:
......@@ -5571,7 +5573,7 @@ class T_local_opt_alloc(unittest.TestCase):
def test_prod_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(5, dtype='float32'), s, s)
a = theano.tensor.alloc(np.asarray(5, dtype=self.dtype), s, s)
orig = theano.config.warn_float64
theano.config.warn_float64 = "raise"
try:
......@@ -5580,6 +5582,24 @@ class T_local_opt_alloc(unittest.TestCase):
finally:
theano.config.warn_float64 = orig
@theano.configparser.change_flags(on_opt_error='raise')
def test_sum_bool_upcast(self):
s = theano.tensor.lscalar()
a = theano.tensor.alloc(np.asarray(True, dtype='bool'), s, s)
f = theano.function([s], a.sum())
f(5)
# test with user specified dtype
f = theano.function([s], a.sum(dtype=self.dtype))
f(5)
# test only 1 axis summed
f = theano.function([s], a.sum(axis=0, dtype=self.dtype))
f(5)
print(self.dtype)
class T_local_opt_alloc_f16(T_local_opt_alloc):
    # Re-run every T_local_opt_alloc test case with float16 instead of the
    # base class's float32, to cover the half-precision upcast paths.
    dtype = 'float16'
class T_local_reduce(unittest.TestCase):
def setUp(self):
......
......@@ -804,8 +804,8 @@ theano.compile.register_specify_shape_c_code(
PyErr_Format(PyExc_AssertionError,
"SpecifyShape: vector of shape has %%d elements,"
" but the input has %%d dimensions.",
PyArray_NDIM(%(iname)s),
PyArray_DIMS(%(shape)s)[0]);
PyArray_DIMS(%(shape)s)[0],
PyArray_NDIM(%(iname)s));
%(fail)s;
}
for(int i = 0; i < PyArray_NDIM(%(iname)s); i++){
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论