提交 1ef9be9d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3356 from abergeron/gpuarray_cudnnv3

cuDNN v3 support for gpuarray
...@@ -10,7 +10,6 @@ from theano.gof import Optimizer, local_optimizer, COp ...@@ -10,7 +10,6 @@ from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import ( from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
...@@ -28,6 +27,8 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt ...@@ -28,6 +27,8 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
import theano.sandbox.dnn_flags
def dnn_available(): def dnn_available():
if dnn_available.avail is None: if dnn_available.avail is None:
...@@ -62,8 +63,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -62,8 +63,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
comp, out, err = NVCC_compiler.try_flags( comp, out, err = NVCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__), ["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + os.path.join(theano.config.cuda.root, 'include'), "-I" + config.dnn.include_path,
"-L" + os.path.join(theano.config.cuda.root, 'lib64')], "-L" + config.dnn.library_path],
preambule=preambule, body=body, preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
...@@ -141,7 +142,6 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) { ...@@ -141,7 +142,6 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
%(fail)s %(fail)s
} }
} }
""" % dict(var=var, err=err, desc=desc, fail=fail) """ % dict(var=var, err=err, desc=desc, fail=fail)
...@@ -359,37 +359,9 @@ class GpuDnnConvDesc(GpuOp): ...@@ -359,37 +359,9 @@ class GpuDnnConvDesc(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (2, version()) return (2, version())
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float32')) _zero = constant(numpy.asarray(0.0, dtype='float32'))
_one = constant(numpy.asarray(1.0, dtype='float32')) _one = constant(numpy.asarray(1.0, dtype='float32'))
_ifour = constant(numpy.asarray(4, dtype='int32'))
_ifive = constant(numpy.asarray(5, dtype='int32'))
def ensure_float(val, default, name): def ensure_float(val, default, name):
...@@ -406,20 +378,6 @@ def ensure_float(val, default, name): ...@@ -406,20 +378,6 @@ def ensure_float(val, default, name):
return val return val
def ensure_int(val, default, name):
if val is None:
return default.clone()
if not isinstance(val, Variable):
val = constant(val)
if hasattr(val, 'ndim') and val.ndim == 0:
val = as_scalar(val)
if not isinstance(val.type, theano.scalar.Scalar):
raise TypeError("%s: expected a scalar value" % (name,))
if not val.type.dtype == 'int32':
raise TypeError("%s: type is not int32" % (name,))
return val
class GpuDnnConv(DnnBase, COp): class GpuDnnConv(DnnBase, COp):
""" """
The forward convolution. The forward convolution.
...@@ -1448,11 +1406,12 @@ class GpuDnnPool(DnnBase): ...@@ -1448,11 +1406,12 @@ class GpuDnnPool(DnnBase):
or desc.type.ctype != 'cudnnPoolingDescriptor_t': or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t') raise TypeError('desc must be cudnnPoolingDescriptor_t')
dop = desc.owner.op if desc.owner is not None:
e_ndim = dop.get_ndim() + 2 # 4 or 5 dop = desc.owner.op
e_ndim = dop.get_ndim() + 2 # 4 or 5
if img.type.ndim != e_ndim: if img.type.ndim != e_ndim:
raise TypeError('img must be %dD tensor' % e_ndim) raise TypeError('img must be %dD tensor' % e_ndim)
return Apply(self, [img, desc], [img.type()]) return Apply(self, [img, desc], [img.type()])
...@@ -1616,19 +1575,21 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1616,19 +1575,21 @@ class GpuDnnPoolGrad(DnnBase):
or desc.type.ctype != 'cudnnPoolingDescriptor_t': or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t') raise TypeError('desc must be cudnnPoolingDescriptor_t')
nd = desc.owner.op.get_ndim() + 2 # 4 or 5
inp = as_cuda_ndarray_variable(inp) inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
inp_grad = as_cuda_ndarray_variable(inp_grad) inp_grad = as_cuda_ndarray_variable(inp_grad)
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,))
out = as_cuda_ndarray_variable(out) out = as_cuda_ndarray_variable(out)
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,)) if desc.owner is not None:
nd = desc.owner.op.get_ndim() + 2 # 4 or 5
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,))
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,))
return Apply(self, [inp, out, inp_grad, desc], return Apply(self, [inp, out, inp_grad, desc],
[inp.type()]) [inp.type()])
...@@ -1819,7 +1780,7 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1819,7 +1780,7 @@ class GpuDnnSoftmaxBase(DnnBase):
Parameters Parameters
---------- ----------
tensor_format tensor_format
Whether the data format is 'bc01' or 'b01c'. Always set this to 'bc01'.
algo algo
'fast', 'accurate' or 'log' indicating whether, respectively, computations 'fast', 'accurate' or 'log' indicating whether, respectively, computations
should be optimized for speed, for accuracy, or if CuDNN should rather should be optimized for speed, for accuracy, or if CuDNN should rather
...@@ -1834,7 +1795,13 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1834,7 +1795,13 @@ class GpuDnnSoftmaxBase(DnnBase):
__props__ = ('tensor_format', 'mode', 'algo') __props__ = ('tensor_format', 'mode', 'algo')
def __init__(self, tensor_format, algo, mode): def __init__(self, tensor_format, algo, mode):
assert(tensor_format in ('bc01', 'b01c')) if tensor_format != 'bc01':
raise ValueError(
"It was discovered that since December 2014, the "
"tensor_format parameter was ignored and the equivalent of "
"'bc01' is always used. Since your code seems to be using "
"another value, this might have affected previous results "
"ran with this code.")
DnnBase.__init__(self) DnnBase.__init__(self)
self.tensor_format = tensor_format self.tensor_format = tensor_format
...@@ -1976,7 +1943,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1976,7 +1943,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
Parameters Parameters
---------- ----------
tensor_format tensor_format
Whether the data format is 'bc01' or 'b01c'. Always set to 'bc01'.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively. optimized for speed or accuracy respectively.
...@@ -2044,7 +2011,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -2044,7 +2011,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
Parameters Parameters
---------- ----------
tensor_format tensor_format
Whether the data format is 'bc01' or 'b01c'. Always set to 'bc01'.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively. optimized for speed or accuracy respectively.
......
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import os.path
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano import config
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
#section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) {
cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2};
int strides[3] = {SUB_0, SUB_1, SUB_2};
int upscale[3] = {1, 1, 1};
#if BORDER_MODE == 0
pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1;
pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1;
#if NB_DIMS > 2
pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1;
#endif
#endif
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS,
(long long)PyArray_DIM(filt_shp, 0));
return -1;
}
err = cudnnCreateConvolutionDescriptor(desc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %s", cudnnGetErrorString(err));
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale,
CONV_MODE);
return 0;
}
...@@ -4,193 +4,109 @@ ...@@ -4,193 +4,109 @@
#include <cudnn.h> #include <cudnn.h>
#ifndef CUDNN_VERSION #ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This is only for what we use
static inline const char *cudnnGetErrorString(cudnnStatus_t err) { #define CUDNN_VERSION -1
switch (err) { static inline int cudnnGetVersion() {
case CUDNN_STATUS_SUCCESS: return -1;
return "The operation completed successfully.";
case CUDNN_STATUS_NOT_INITIALIZED:
return "The handle was not initialized(Is your driver recent enought?).";
case CUDNN_STATUS_ALLOC_FAILED:
return "Ressource allocation failed inside the library.";
case CUDNN_STATUS_BAD_PARAM:
return "An incorrect value was passed in.";
case CUDNN_STATUS_ARCH_MISMATCH:
return "The current GPU does not support the required features (only cc 3.0+ are supported).";
case CUDNN_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed (probably due to a failure to bind texture).";
case CUDNN_STATUS_EXECUTION_FAILED:
return "A kernel failed to execute.";
case CUDNN_STATUS_INTERNAL_ERROR:
return "An internal cuDNN operation failed.";
case CUDNN_STATUS_NOT_SUPPORTED:
return "The combination of parameters is not currently supported.";
default:
return "Unknown error code.";
}
} }
#endif
// some macros to help support cudnn R1 while using R2 code. #include <assert.h>
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
static inline cudnnStatus_t #if CUDNN_VERSION < 3000
cudnnGetConvolution2dForwardOutputDim( // Here we define the R3 API in terms of functions in the R2 interface
const cudnnConvolutionDescriptor_t convDesc, // This is only for what we use
const cudnnTensorDescriptor_t inputTensorDesc,
const cudnnFilterDescriptor_t filterDesc,
int *n,
int *c,
int *h,
int *w) {
return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t; typedef int cudnnConvolutionBwdDataAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0 #define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2
static inline cudnnStatus_t static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnnGetConvolutionForwardAlgorithm(
cudnnHandle_t handle, cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc, const cudnnFilterDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t destDesc, const cudnnTensorDescriptor_t gradDesc,
cudnnConvolutionFwdPreference_t preference, cudnnConvolutionBwdDataAlgo_t algo,
size_t memoryLimitInbytes, size_t *sizeInBytes) {
cudnnConvolutionFwdAlgo_t *algo) {
*algo = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensor4dDescriptor_t destDesc,
cudnnConvolutionFwdAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0; *sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS; return CUDNN_STATUS_SUCCESS;
} }
static cudnnStatus_t cudnnConvolutionBackwardData_v3(
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
cudnnHandle_t handle, cudnnHandle_t handle,
const void *alpha, const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnFilterDescriptor_t filterDesc, const cudnnFilterDescriptor_t filterDesc,
const void *filterData, const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo, cudnnConvolutionBwdDataAlgo_t algo,
void *workSpace, void *workspace,
size_t workSpaceSizeInBytes, size_t workspaceSizeInBytes,
const void *beta, const void *beta,
const cudnnTensorDescriptor_t destDesc, const cudnnTensorDescriptor_t gradDesc,
void *destData) { void *gradData) {
assert(*(float *)alpha == 1.0); return cudnnConvolutionBackwardData(
cudnnAccumulateResult_t r; handle,
if (*(float *)beta == 0.0) { alpha,
r = CUDNN_RESULT_NO_ACCUMULATE; filterDesc,
} else if (*(float *)beta == 1.0) { filterData,
r = CUDNN_RESULT_ACCUMULATE; diffDesc,
} else { diffData,
assert(0 && "beta must be 0.0 or 1.0"); convDesc,
} beta,
return cudnnConvolutionForward(handle, srcDesc, srcData, gradDesc,
filterDesc, filterData, gradData);
convDesc, destDesc, destData,
r);
} }
#define cudnnConvolutionForward cudnnConvolutionForward_v2
static inline cudnnStatus_t typedef int cudnnConvolutionBwdFilterAlgo_t;
cudnnConvolutionBackwardFilter_v2(
cudnnHandle_t handle, #define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
const void *alpha, #define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
const cudnnTensorDescriptor_t srcDesc, #define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
const void *srcData,
static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc, const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnFilterDescriptor_t gradDesc, const cudnnFilterDescriptor_t gradDesc,
void *gradData) { cudnnConvolutionBwdDataAlgo_t algo,
assert(*(float *)alpha == 1.0); size_t *sizeInBytes) {
cudnnAccumulateResult_t r; *sizeInBytes = 0;
if (*(float *)beta == 0.0) { return CUDNN_STATUS_SUCCESS;
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
r);
} }
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2 static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
cudnnHandle_t handle,
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
cudnnHandle_t handle,
const void *alpha, const void *alpha,
const cudnnFilterDescriptor_t filterDesc, const cudnnTensorDescriptor_t srcDesc,
const void *filterData, const void *srcData,
const cudnnTensorDescriptor_t diffDesc, const cudnnTensorDescriptor_t diffDesc,
const void *diffData, const void *diffData,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdFilterAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta, const void *beta,
const cudnnTensorDescriptor_t gradDesc, const cudnnFilterDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); return cudnnConvolutionBackwardFilter(
cudnnAccumulateResult_t r; handle,
if (*(float *)beta == 0.0) { alpha,
r = CUDNN_RESULT_NO_ACCUMULATE; srcDesc,
} else if (*(float *)beta == 1.0) { srcData,
r = CUDNN_RESULT_ACCUMULATE; diffDesc,
} else { diffData,
assert(0 && "beta must be 0.0 or 1.0"); convDesc,
} beta,
/* This function needs the casting because its params are not gradDesc,
declared as const */ gradData);
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
(cudnnTensorDescriptor_t)diffDesc,
diffData,
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
r);
} }
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't checked if this the right combination, but as we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif #endif
#endif #endif
import os import os
import numpy import numpy
import warnings
import theano import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant from theano.scalar import as_scalar, constant, Log
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import ( from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
...@@ -19,6 +19,7 @@ from . import pygpu, init_dev ...@@ -19,6 +19,7 @@ from . import pygpu, init_dev
from .basic_ops import (as_gpuarray_variable, from .basic_ops import (as_gpuarray_variable,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like) GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise
from .conv import GpuConv from .conv import GpuConv
# These don't exist in gpuarray # These don't exist in gpuarray
...@@ -27,21 +28,8 @@ from .nnet import GpuSoftmax ...@@ -27,21 +28,8 @@ from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
# This is to avoid conflict with the one in cuda/dnn.py # We need to import this to define the flags.
if not hasattr(config, 'dnn'): from theano.sandbox import dnn_flags # noqa
AddConfigVar('dnn.conv.workmem',
"Default value for the workmem attribute of cudnn "
"convolutions.",
EnumStr('small', 'none', 'large'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
def dnn_available(): def dnn_available():
...@@ -57,7 +45,7 @@ def dnn_available(): ...@@ -57,7 +45,7 @@ def dnn_available():
return False return False
# This is a hack because bin_id is in the from of # This is a hack because bin_id is in the from of
# "sm_<major><minor>" for cuda devices. # "sm_<major><minor>" for cuda devices.
if pygpu.get_default_context().bin_id < 'sm_30': if pygpu.get_default_context().bin_id[:-2] < '30':
dnn_available.msg = "Device not supported by cuDNN" dnn_available.msg = "Device not supported by cuDNN"
dnn_available.avail = False dnn_available.avail = False
preambule = """ preambule = """
...@@ -95,68 +83,26 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -95,68 +83,26 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
else: else:
# If we can compile, check that we can import and run. # If we can compile, check that we can import and run.
v = version() v = version()
if isinstance(v, tuple) and v[0] != v[1]: if v < 2000:
dnn_available.avail = False dnn_available.avail = False
dnn_available.msg = ("Mixed dnn version. The header is" dnn_available.msg = (
" from one version, but we link with" "You have an old release of CuDNN (or a release candidate) "
" a different version %s" % str(v)) "that isn't supported. Please update to at least v2 final "
"version.")
raise RuntimeError(dnn_available.msg) raise RuntimeError(dnn_available.msg)
if version() == (20, 20): if v >= 3000 and v < 3007:
dnn_available.avail = False dnn_available.avail = False
dnn_available.msg = ( dnn_available.msg = (
"You have installed a release candidate of CuDNN v2." "You have installed a release candidate of CuDNN v3. This "
" This isn't supported anymore." "isn't supported. Please update to v3 final version.")
" Update to CuDNN v2 final version.")
raise RuntimeError(dnn_available.msg) raise RuntimeError(dnn_available.msg)
return dnn_available.avail
return dnn_available.avail
dnn_available.avail = None dnn_available.avail = None
dnn_available.msg = None dnn_available.msg = None
def c_set_tensor4d(var, desc, err, fail):
return """
{
cudnnDataType_t dt;
size_t ds;
switch (%(var)s->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(%(var)s->ga.typecode);
int str0, str1, str2, str3;
// cudnn do not like 0s in strides
str3 = PyGpuArray_STRIDES(%(var)s)[3]?PyGpuArray_STRIDES(%(var)s)[3]/ds:1;
str2 = PyGpuArray_STRIDES(%(var)s)[2]?PyGpuArray_STRIDES(%(var)s)[2]/ds:PyGpuArray_DIMS(%(var)s)[3];
str1 = PyGpuArray_STRIDES(%(var)s)[1]?PyGpuArray_STRIDES(%(var)s)[1]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3];
str0 = PyGpuArray_STRIDES(%(var)s)[0]?PyGpuArray_STRIDES(%(var)s)[0]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3]*PyGpuArray_DIMS(%(var)s)[1];
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, dt,
PyGpuArray_DIMS(%(var)s)[0],
PyGpuArray_DIMS(%(var)s)[1],
PyGpuArray_DIMS(%(var)s)[2],
PyGpuArray_DIMS(%(var)s)[3],
str0, str1, str2, str3);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"could not set tensor4d descriptor: %%s",
cudnnGetErrorString(%(err)s));
%(fail)s
}
}
""" % dict(var=var, err=err, desc=desc, fail=fail)
class DnnBase(COp): class DnnBase(COp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
...@@ -166,13 +112,15 @@ class DnnBase(COp): ...@@ -166,13 +112,15 @@ class DnnBase(COp):
# the input broadcasting pattern. # the input broadcasting pattern.
check_broadcast = False check_broadcast = False
def __init__(self): def __init__(self, files=None, c_func=None):
COp.__init__(self, "dnn_base.c") if files is None:
files = []
COp.__init__(self, ["dnn_base.c"] + files, c_func)
def c_headers(self): def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h', return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h', 'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
'gpuarray_api.h', 'numpy_compat.h'] 'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h']
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include(), return [os.path.dirname(__file__), pygpu.get_include(),
...@@ -184,9 +132,11 @@ class DnnBase(COp): ...@@ -184,9 +132,11 @@ class DnnBase(COp):
def c_lib_dirs(self): def c_lib_dirs(self):
return [config.dnn.library_path] return [config.dnn.library_path]
def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version())
class DnnVersion(Op):
class DnnVersion(Op):
__props__ = () __props__ = ()
def c_headers(self): def c_headers(self):
...@@ -214,11 +164,7 @@ class DnnVersion(Op): ...@@ -214,11 +164,7 @@ class DnnVersion(Op):
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
o = outputs[0] o = outputs[0]
return """ return """
#if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion())); %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
%(o)s = PyInt_FromLong(-1);
#endif
""" % locals() """ % locals()
def do_constant_folding(self, node): def do_constant_folding(self, node):
...@@ -232,11 +178,9 @@ class DnnVersion(Op): ...@@ -232,11 +178,9 @@ class DnnVersion(Op):
def version(): def version():
""" """
Return the current cuDNN version we compile with. Return the current cuDNN version we link with.
This return a tuple with the header version and the library version we link
with. For older cudnn version without version information, we return -1.
This also does a check that the header version matches the runtime version.
""" """
if not dnn_available(): if not dnn_available():
raise Exception( raise Exception(
...@@ -247,12 +191,16 @@ def version(): ...@@ -247,12 +191,16 @@ def version():
f = theano.function([], DnnVersion()(), f = theano.function([], DnnVersion()(),
theano.Mode(optimizer=None), theano.Mode(optimizer=None),
profile=False) profile=False)
version.v = f() v = f()
if v[0] != v[1]:
raise RuntimeError("Mixed dnn version. The header is version %s "
"while the library is version %s." % v)
version.v = v[1]
return version.v return version.v
version.v = None version.v = None
class GpuDnnConvDesc(Op): class GpuDnnConvDesc(COp):
""" """
This Op builds a convolution descriptor for use in the other convolution This Op builds a convolution descriptor for use in the other convolution
operations. operations.
...@@ -275,12 +223,17 @@ class GpuDnnConvDesc(Op): ...@@ -275,12 +223,17 @@ class GpuDnnConvDesc(Op):
def c_lib_dirs(self): def c_lib_dirs(self):
return [config.dnn.library_path] return [config.dnn.library_path]
def do_constant_folding(self, node):
return False
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'): def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
COp.__init__(self, ["conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
if isinstance(border_mode, int): if isinstance(border_mode, int):
border_mode = (border_mode, border_mode) border_mode = (border_mode,) * len(subsample)
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
pad_h, pad_w = map(int, border_mode) assert len(border_mode) == len(subsample)
border_mode = (pad_h, pad_w) border_mode = tuple(map(int, border_mode))
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full')): border_mode in ('valid', 'full')):
raise ValueError( raise ValueError(
...@@ -288,105 +241,56 @@ class GpuDnnConvDesc(Op): ...@@ -288,105 +241,56 @@ class GpuDnnConvDesc(Op):
'"valid", "full", an integer or a pair of' '"valid", "full", an integer or a pair of'
' integers'.format(border_mode)) ' integers'.format(border_mode))
self.border_mode = border_mode self.border_mode = border_mode
assert len(subsample) == 2 assert len(subsample) in (2, 3)
self.subsample = subsample self.subsample = subsample
assert conv_mode in ('conv', 'cross') assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode self.conv_mode = conv_mode
def make_node(self, img_shape, kern_shape): def make_node(self, kern_shape):
if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
raise TypeError('img must be 1D shape tensor')
if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64': if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
raise TypeError('kern must be 1D shape tensor') raise TypeError('kern must be 1D shape tensor')
return Apply(self, [img_shape, kern_shape], return Apply(self, [kern_shape],
[CDataType("cudnnConvolutionDescriptor_t", [CDataType("cudnnConvolutionDescriptor_t",
freefunc="cudnnDestroyConvolutionDescriptor")()]) freefunc="cudnnDestroyConvolutionDescriptor")()])
def c_code(self, node, name, inputs, outputs, sub): def get_op_params(self):
img_shape, kern_shape = inputs pad0 = '0'
desc, = outputs pad1 = '0'
pad2 = '0'
if isinstance(self.border_mode, tuple): if isinstance(self.border_mode, tuple):
pad_h_spec, pad_w_spec = map(int, self.border_mode) pad0 = str(self.border_mode[0])
assert pad_h_spec >= 0 and pad_w_spec >= 0 pad1 = str(self.border_mode[1])
bmode = 2 if len(self.border_mode) > 2:
pad2 = str(self.border_mode[2])
bmode = '2'
elif self.border_mode == "valid":
bmode = '1'
elif self.border_mode == "full":
bmode = '0'
else: else:
pad_h_spec = pad_w_spec = 0 raise ValueError("Invalid value for border_mode")
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
if self.conv_mode == 'conv': if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION' conv_flag = 'CUDNN_CONVOLUTION'
else: else:
conv_flag = 'CUDNN_CROSS_CORRELATION' conv_flag = 'CUDNN_CROSS_CORRELATION'
return """ sub0 = str(self.subsample[0])
{ sub1 = str(self.subsample[1])
cudnnStatus_t err; if len(self.subsample) > 2:
int pad_h%(name)s; sub2 = str(self.subsample[2])
int pad_w%(name)s; else:
sub2 = '0'
if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
if (%(bmode)d == 2) { return [('NB_DIMS', str(len(self.subsample))),
pad_h%(name)s = %(pad_h_spec)d; ('BORDER_MODE', bmode),
pad_w%(name)s = %(pad_w_spec)d; ('PAD_0', pad0), ('PAD_1', pad1), ('PAD_2', pad2),
} else if (%(bmode)d == 1) { ('CONV_MODE', conv_flag),
pad_h%(name)s = 0; ('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2)]
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 20
err = cudnnSetConvolution2dDescriptor(
%(desc)s,
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
#else
err = cudnnSetConvolutionDescriptorEx(
%(desc)s,
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
}
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
subsx=self.subsample[0], subsy=self.subsample[1],
pad_h_spec=pad_h_spec, pad_w_spec=pad_w_spec)
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, version()) return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float64')) _zero = constant(numpy.asarray(0.0, dtype='float64'))
...@@ -407,7 +311,7 @@ def ensure_dt(val, default, name, dtype): ...@@ -407,7 +311,7 @@ def ensure_dt(val, default, name, dtype):
return val return val
class GpuDnnConv(DnnBase, COp): class GpuDnnConv(DnnBase):
""" """
The forward convolution. The forward convolution.
...@@ -417,55 +321,97 @@ class GpuDnnConv(DnnBase, COp): ...@@ -417,55 +321,97 @@ class GpuDnnConv(DnnBase, COp):
kernel kernel
descr descr
The convolution descriptor. The convolution descriptor.
workmem algo : {'small', 'none', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Either 'none', 'small' or 'large'. Default is the value of Default is the value of :attr:`config.dnn.conv.algo_fwd`.
:attr:`config.dnn.conv.workmem`.
""" """
__props__ = ('workmem', 'inplace') __props__ = ('algo', 'inplace')
def __init__(self, algo=None, inplace=False):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
if algo is None:
algo = config.dnn.conv.algo_fwd
self.algo = algo
def __init__(self, workmem=None, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
if workmem is None:
workmem = config.dnn.conv.workmem
self.workmem = workmem
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large']
if version() < 3000:
if self.algo == 'fft':
raise RuntimeError("CuDNN FFT convolution requires CuDNN v3")
elif self.algo in ['guess_once', 'guess_on_shape_change']:
raise RuntimeError("CuDNN selection of convolution "
"implementation based on heuristics "
"requires CuDNN v3")
elif self.algo in ['time_once', 'time_on_shape_change']:
raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'algo'):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_fwd
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
inpl_def = [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
else:
inpl_def = [] alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
if version() == -1: if self.algo == 'none':
alg_def = ('CONV_ALGO', "0") alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
else: elif self.algo == 'small':
if self.workmem == 'none': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM' elif self.algo == 'large':
elif self.workmem == 'small': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM' elif self.algo == 'fft':
elif self.workmem == 'large': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM' defs.append(('CONV_ALGO', alg))
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
return defs
def make_node(self, img, kern, output, desc, alpha=None, beta=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img)
kern = as_gpuarray_variable(kern) kern = as_gpuarray_variable(kern)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if img.type.ndim != 4: if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D or 5D tensor')
if kern.type.ndim != 4: if kern.type.ndim not in (4, 5):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be a 4D tensor') raise TypeError('output must be a 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (img.type.ndim != kern.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': img.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"img, kern and output must match")
if img.type.ndim == 5 and self.algo == 'fft':
raise ValueError("convolution algo fft can't be used for "
"3d convolutions")
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
...@@ -507,28 +453,47 @@ class GpuDnnConv(DnnBase, COp): ...@@ -507,28 +453,47 @@ class GpuDnnConv(DnnBase, COp):
kh = kshape[2] # Height of each filter kh = kshape[2] # Height of each filter
kw = kshape[3] # Width of each filter kw = kshape[3] # Width of each filter
sh, sw = subsample nd = len(subsample)
if nd > 2:
d = ishape[4]
kd = ishape[4]
sh = subsample[0]
sw = subsample[1]
if nd > 2:
sd = subsample[2]
if border_mode == 'full': if border_mode == 'full':
padh = kh - 1 padh = kh - 1
padw = kw - 1 padw = kw - 1
if nd > 4:
padd = kd - 1
elif isinstance(border_mode, tuple): elif isinstance(border_mode, tuple):
padh, padw = border_mode padh = border_mode[0]
padw = border_mode[1]
if nd > 2:
padd = border_mode[2]
else: else:
assert border_mode == 'valid' assert border_mode == 'valid'
padh = 0 padh = 0
padw = 0 padw = 0
padd = 0
return ( res = [b, nb,
b, nb, (h + 2 * padh - kh) // sh + 1,
(h + 2 * padh - kh) // sh + 1, (w + 2 * padw - kw) // sw + 1]
(w + 2 * padw - kw) // sw + 1
) if nd > 2:
res.append(d + 2 * padd - kd // sd + 1)
return res
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[2]] return [shape[2]]
class GpuDnnConvGradW(DnnBase, COp): class GpuDnnConvGradW(DnnBase):
""" """
The convolution gradient with respect to the weights. The convolution gradient with respect to the weights.
...@@ -541,19 +506,27 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -541,19 +506,27 @@ class GpuDnnConvGradW(DnnBase, COp):
""" """
__props__ = ('inplace',) __props__ = ('algo', 'inplace')
def __init__(self, inplace=False): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self, 'inplace'): if not hasattr(self, 'inplace'):
self.inplace = False self.inplace = False
if not hasattr(self, 'algo'):
self.algo = config.dnn.conv.algo_bwd
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp img, top, output, desc, alpha, beta = inp
...@@ -574,24 +547,55 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -574,24 +547,55 @@ class GpuDnnConvGradW(DnnBase, COp):
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
return [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
if version() < 3000:
alg = '0'
else: else:
return [] alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img)
topgrad = as_gpuarray_variable(topgrad) topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if img.type.ndim != 4: if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D or 5D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim not in (4, 5):
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D tensor') raise TypeError('output must be 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (img.type.ndim != topgrad.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': img.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"img, topgrad and output must match")
if img.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
...@@ -617,14 +621,27 @@ class GpuDnnConvGradI(DnnBase): ...@@ -617,14 +621,27 @@ class GpuDnnConvGradI(DnnBase):
""" """
__props__ = ('inplace',) __props__ = ('algo', 'inplace',)
def __init__(self, inplace=False): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'algo'):
self.algo = config.dnn.conv.algo_bwd
if not hasattr(self, 'inplace'):
self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp kerns, top, output, desc, alpha, beta = inp
...@@ -645,24 +662,55 @@ class GpuDnnConvGradI(DnnBase): ...@@ -645,24 +662,55 @@ class GpuDnnConvGradI(DnnBase):
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
return [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
if version() < 3000:
alg = '0'
else: else:
return [] alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_gpuarray_variable(kern) kern = as_gpuarray_variable(kern)
topgrad = as_gpuarray_variable(topgrad) topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if kern.type.ndim != 4: if kern.type.ndim not in (4, 5):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D or 5D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim not in (4, 5):
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D tensor') raise TypeError('output must be 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (kern.type.ndim != topgrad.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': kern.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype) alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype)
...@@ -676,7 +724,8 @@ class GpuDnnConvGradI(DnnBase): ...@@ -676,7 +724,8 @@ class GpuDnnConvGradI(DnnBase):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None): conv_mode='conv', direction_hint=None, workmem=None,
algo=None):
""" """
GPU convolution using cuDNN from NVIDIA. GPU convolution using cuDNN from NVIDIA.
...@@ -700,22 +749,27 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -700,22 +749,27 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
direction_hint direction_hint
Used by graph optimizers to change algorithm choice. Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution. By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1,1) and direction_hint is If border_mode is 'valid', subsample is (1, 1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW. 'bprop weights', it will use GpuDnnConvGradW.
If border_mode is 'full', subsample is (1,1) and direction_hint is If border_mode is 'full', subsample is (1, 1) and direction_hint is
*not* 'forward!', it will use GpuDnnConvGradI. *not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned. removed at any time without a deprecation period. You have been warned.
workmem algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Specify the amount of working memory allowed. More memory is usually Convolution implementation to use. Some of its values may
faster. One of 'none', 'small' or 'large' (default is None which takes require certain versions of CuDNN to be installed. Default is
its value from :attr:`config.dnn.conv.workmem`). the value of :attr:`config.dnn.conv.algo_fwd`.
.. warning:: The cuDNN library only works with GPU that have a compute .. warning:: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higer. This means that older GPU will not capability of 3.0 or higer. This means that older GPUs will not
work with this Op. work with this Op.
""" """
if workmem is not None:
if algo is not None:
raise ValueError("You can't use both algo and workmem")
warnings.warn("workmem is deprecated, use algo instead", stacklevel=2)
algo = workmem
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None) fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1, 1) and if (border_mode == 'valid' and subsample == (1, 1) and
direction_hint == 'bprop weights'): direction_hint == 'bprop weights'):
...@@ -732,7 +786,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -732,7 +786,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph), out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape) conv_mode='cross')(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc) conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3)) return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3))
...@@ -741,7 +795,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -741,7 +795,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# Special case: We can be faster by using GpuDnnConvGradI to compute # Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution. # the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution. # We just need to set up a suitable 'fake' valid convolution.
img = gpu_contiguous(img) # cudnn v1 and v2 rc3 need contiguous data img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
...@@ -750,7 +804,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -750,7 +804,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape) conv_mode=conv_mode)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc) return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
...@@ -759,13 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -759,13 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape) conv_mode=conv_mode)(kerns.shape)
desc_op = desc.owner.op desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = GpuAllocEmpty(img.dtype)(*out_shp) out = GpuAllocEmpty(img.dtype)(*out_shp)
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
class GpuDnnPoolDesc(Op): class GpuDnnPoolDesc(Op):
...@@ -773,18 +827,18 @@ class GpuDnnPoolDesc(Op): ...@@ -773,18 +827,18 @@ class GpuDnnPoolDesc(Op):
This Op builds a pooling descriptor for use in the other This Op builds a pooling descriptor for use in the other
pooling operations. pooling operations.
`ws`, `stride` and `pad` must have the same length.
Parameters Parameters
---------- ----------
ws ws : tuple
Windows size. Window size.
stride stride : tuple
(dx, dy). (dx, dy) or (dx, dy, dz).
mode : {'max', 'average_inc_pad', 'average_exc_pad'} mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' correspond to 'average_inc_pad'. The old deprecated name 'average' corresponds to 'average_inc_pad'.
pad pad : tuple
(padX, padY) padding information. (padX, padY) or (padX, padY, padZ)
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
""" """
...@@ -810,14 +864,18 @@ class GpuDnnPoolDesc(Op): ...@@ -810,14 +864,18 @@ class GpuDnnPoolDesc(Op):
mode = 'average_inc_pad' mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad') assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode self.mode = mode
assert len(ws) == 2
assert len(ws) == len(stride) and len(stride) == len(pad)
assert len(ws) in (2, 3)
self.ws = ws self.ws = ws
assert len(stride) == 2
self.stride = stride self.stride = stride
assert len(stride) == 2
self.pad = pad self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2") if self.get_ndim() == 3 and version() < 3000:
raise RuntimeError("CuDNN 3d pooling requires v3")
def get_ndim(self):
return len(self.ws)
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -825,9 +883,6 @@ class GpuDnnPoolDesc(Op): ...@@ -825,9 +883,6 @@ class GpuDnnPoolDesc(Op):
self.pad = (0, 0) self.pad = (0, 0)
def make_node(self): def make_node(self):
if self.pad != (0, 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
return Apply(self, [], return Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t", [CDataType("cudnnPoolingDescriptor_t",
freefunc="cudnnDestroyPoolingDescriptor")()]) freefunc="cudnnDestroyPoolingDescriptor")()])
...@@ -841,8 +896,6 @@ class GpuDnnPoolDesc(Op): ...@@ -841,8 +896,6 @@ class GpuDnnPoolDesc(Op):
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad": elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else: else:
raise NotImplementedError("Unsupported pooling model.") raise NotImplementedError("Unsupported pooling model.")
...@@ -855,22 +908,13 @@ class GpuDnnPoolDesc(Op): ...@@ -855,22 +908,13 @@ class GpuDnnPoolDesc(Op):
"descriptor: %%s", cudnnGetErrorString(err)); "descriptor: %%s", cudnnGetErrorString(err));
%(fail)s %(fail)s
} }
#ifndef CUDNN_VERSION
err = cudnnSetPoolingDescriptor( static const int win[%(nd)d] = {%(win)s};
%(desc)s, static const int pad[%(nd)d] = {%(pad)s};
%(mode_flag)s, static const int str[%(nd)d] = {%(str)s};
%(wsX)d, %(wsY)d, err = cudnnSetPoolingNdDescriptor(
%(stridex)d, %(stridey)d %(desc)s, %(mode_flag)s, %(nd)d,
); win, pad, str);
#else
err = cudnnSetPooling2dDescriptor(
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
%(padX)d, %(padY)d,
%(stridex)d, %(stridey)d
);
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s", PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
...@@ -878,12 +922,12 @@ class GpuDnnPoolDesc(Op): ...@@ -878,12 +922,12 @@ class GpuDnnPoolDesc(Op):
} }
} }
""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'], """ % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
wsX=self.ws[0], wsY=self.ws[1], nd=self.get_ndim(), win=', '.join(map(str, self.ws)),
stridex=self.stride[0], stridey=self.stride[1], pad=', '.join(map(str, self.pad)),
padX=self.pad[0], padY=self.pad[1]) str=', '.join(map(str, self.stride)))
def c_code_cache_version(self): def c_code_cache_version(self):
return (2, version()) return (3, version())
class GpuDnnPool(DnnBase): class GpuDnnPool(DnnBase):
...@@ -901,146 +945,36 @@ class GpuDnnPool(DnnBase): ...@@ -901,146 +945,36 @@ class GpuDnnPool(DnnBase):
__props__ = () __props__ = ()
def __init__(self):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
def make_node(self, img, desc): def make_node(self, img, desc):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if not isinstance(desc.type, CDataType) \ if desc.owner is not None:
or desc.type.ctype != 'cudnnPoolingDescriptor_t': e_ndim = desc.owner.op.get_ndim() + 2
if img.type.ndim != e_ndim:
raise TypeError('img must be %dD tensor' % (e_ndim,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnPoolingDescriptor_t'):
raise TypeError('desc must be cudnnPoolingDescriptor_t') raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [img, desc], return Apply(self, [img, desc], [img.type()])
[img.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
desc = node.inputs[1].owner.op desc = node.inputs[1].owner.op
kh, kw = desc.ws w = desc.ws
sh, sw = desc.stride s = desc.stride
padh, padw = desc.pad p = desc.pad
return [( res = [shape[0][0], shape[0][1],
shape[0][0], (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
shape[0][1], (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
(shape[0][2] + 2 * padh - kh) // sh + 1, ]
(shape[0][3] + 2 * padw - kw) // sw + 1 if len(w) > 2:
)] res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [res]
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t output%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
output%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[1]
out, = outputs
set_in = c_set_tensor4d(inputs[0], "input" + str(name),
'err' + name, sub['fail'])
set_out = c_set_tensor4d(out, "output" + str(name),
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
size_t %(out)s_dims[4];
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
%(set_in)s
cudnnPoolingMode_t mode;
int wsX, wsY, vpad, hpad, strideX, strideY;
#ifndef CUDNN_VERSION
err%(name)s = cudnnGetPoolingDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&strideX, &strideY);
#else
err%(name)s = cudnnGetPooling2dDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&vpad, &hpad,
&strideX, &strideY);
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnGetPoolingDescriptor operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
%(out)s_dims[0] = PyGpuArray_DIMS(%(input)s)[0];
%(out)s_dims[1] = PyGpuArray_DIMS(%(input)s)[1];
%(out)s_dims[2] = (PyGpuArray_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
%(out)s_dims[3] = (PyGpuArray_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;
if (theano_prep_output(&%(out)s, 4, %(out)s_dims, %(input)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
%(fail)s
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = cudnnPoolingForward(
_handle,
%(desc)s,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
);
#else
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingForward(
_handle,
%(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
&beta,
%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(out=out, desc=desc, fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inputs[0],
input_desc="input" + name,
output_desc="output" + name)
def grad(self, inp, grads): def grad(self, inp, grads):
img, desc = inp img, desc = inp
...@@ -1058,9 +992,6 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -1058,9 +992,6 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
# not connected to desc # not connected to desc
return [[1], [0]] return [[1], [0]]
def c_code_cache_version(self):
return (7, version())
class GpuDnnPoolGrad(DnnBase): class GpuDnnPoolGrad(DnnBase):
""" """
...@@ -1081,167 +1012,32 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1081,167 +1012,32 @@ class GpuDnnPoolGrad(DnnBase):
__props__ = () __props__ = ()
def make_node(self, inp, out, inp_grad, desc): def __init__(self):
inp = as_gpuarray_variable(inp) DnnBase.__init__(self, ["dnn_pool_grad.c"],
if inp.type.ndim != 4: "APPLY_SPECIFIC(dnn_pool_grad)")
raise TypeError('inp must be 4D tensor')
inp_grad = as_gpuarray_variable(inp_grad)
if inp_grad.type.ndim != 4:
raise TypeError('inp_grad must be 4D tensor')
def make_node(self, inp, out, out_grad, desc):
inp = as_gpuarray_variable(inp)
out_grad = as_gpuarray_variable(out_grad)
out = as_gpuarray_variable(out) out = as_gpuarray_variable(out)
if out.type.ndim != 4:
raise TypeError('out must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()])
def c_support_code_struct(self, node, name):
    """Declare the per-apply cuDNN tensor descriptors for the pool gradient.

    Four descriptors are declared, one per tensor taking part in
    cudnnPoolingBackward (input, input_grad, output, output_grad).
    They are created in c_init_code_struct and destroyed in
    c_cleanup_code_struct; `name` makes the symbols unique per apply node.
    """
    return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t input_grad%(name)s;
cudnnTensorDescriptor_t output%(name)s;
cudnnTensorDescriptor_t output_grad%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
    """Allocate the four tensor descriptors declared in c_support_code_struct.

    Each descriptor is first set to NULL so that c_cleanup_code_struct can
    safely destroy only those that were successfully created.  Any
    cudnnCreateTensorDescriptor failure raises MemoryError and jumps to the
    `fail` label supplied by Theano in `sub`.
    """
    return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
input_grad%(name)s = NULL;
output%(name)s = NULL;
output_grad%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&input_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
    """Destroy the tensor descriptors created in c_init_code_struct.

    The NULL checks pair with the NULL initialization done there, so a
    partially-failed init does not destroy descriptors that were never
    created.
    """
    return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub): if desc.owner is not None:
# Here the name out and inp are based on the cudnn definition. nd = desc.owner.op.get_ndim() + 2
# Not the definition of this class.
# This make it complicated.
out, inp, inp_grad, desc = inputs
out_grad, = outputs
set_in = "\n".join([
c_set_tensor4d(inp, "input" + name,
'err' + name, sub['fail']),
c_set_tensor4d(inp_grad, "input_grad" + name,
'err' + name, sub['fail']),
c_set_tensor4d(out, "output" + name,
'err' + name, sub['fail'])
])
set_out = c_set_tensor4d(out, "output_grad" + name,
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous inputs are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(input_grad)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous input gradients are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(output)s->ga)) { if inp.type.ndim != nd:
PyErr_SetString(PyExc_ValueError, raise TypeError('inp must be %dD tensor' % (nd,))
"GpuDnnPoolGrad: Only contiguous outputs are supported.");
%(fail)s
}
%(set_in)s if out_grad.type.ndim != nd:
raise TypeError('out_grad must be %dD tensor' % (nd,))
if (theano_prep_output(&%(output_grad)s, PyGpuArray_NDIM(%(output)s), if out.type.ndim != nd:
PyGpuArray_DIMS(%(output)s), %(output)s->ga.typecode, raise TypeError('out must be %dD tensor' % (nd,))
GA_C_ORDER, pygpu_default_context()) != 0)
{
%(fail)s
}
%(set_out)s if (not isinstance(desc.type, CDataType) or
#ifndef CUDNN_VERSION desc.type.ctype != 'cudnnPoolingDescriptor_t'):
err%(name)s = cudnnPoolingBackward( raise TypeError('desc must be cudnnPoolingDescriptor_t')
_handle,
%(desc)s,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
);
#else
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingBackward(
_handle,
%(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
&beta,
%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s.",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc,
fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inp, input_grad=inp_grad, output=out,
input_desc="input" + name,
input_grad_desc="input_grad" + name,
output_desc="output" + name,
output_grad_desc="output_grad" + name)
def c_code_cache_version(self): return Apply(self, [inp, out, out_grad, desc], [inp.type()])
return (5, version())
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
...@@ -1254,19 +1050,20 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1254,19 +1050,20 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
The memory layout to use is 'bc01', that is 'batch', 'channel', The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order. 'first dim', 'second dim' in that order.
`ws`, `stride` and `pad` must have the same length.
Parameters Parameters
---------- ----------
img img
Images to do the pooling over. Images to do the pooling over.
ws ws : tuple
Subsampling window size. Subsampling window size.
stride stride : tuple
Subsampling stride (default: (1, 1)). Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad'} mode : {'max', 'average_inc_pad', 'average_exc_pad'}
pad pad : tuple
(padX, padY) padding information. (padX, padY) or (padX, padY, padZ)
padX is the size of the left and right borders, default: (0, 0)
padY is the size of the top and bottom borders.
.. warning:: The cuDNN library only works with GPU that have a compute .. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not capability of 3.0 or higer. This means that older GPU will not
...@@ -1288,8 +1085,6 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1288,8 +1085,6 @@ class GpuDnnSoftmaxBase(DnnBase):
Parameters Parameters
---------- ----------
tensor_format
Whether the data format is 'bc01' or 'b01c'.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively. optimized for speed or accuracy respectively.
...@@ -1300,149 +1095,45 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1300,149 +1095,45 @@ class GpuDnnSoftmaxBase(DnnBase):
""" """
__props__ = ('tensor_format', 'mode', 'algo') __props__ = ('mode', 'algo')
def __init__(self, tensor_format, algo, mode): def __init__(self, algo, mode):
assert(tensor_format in ('bc01', 'b01c')) DnnBase.__init__(self, [self.file], self.c_func)
DnnBase.__init__(self)
self.tensor_format = tensor_format
assert(algo in ('fast', 'accurate')) assert(algo in ('fast', 'accurate', 'log'))
if algo == 'log' and version() < 3000:
raise RuntimeError("Need CuDNN v3 for log-softmax")
self.algo = algo self.algo = algo
assert(mode in ('instance', 'channel')) assert(mode in ('instance', 'channel'))
self.mode = mode self.mode = mode
self.tensor_4d_descs = [softmax_input
for softmax_input in self.softmax_inputs]
self.tensor_4d_descs.append('softmax_output')
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
if self.direction == 'forward': if self.direction == 'forward':
return [shape[0]] return [shape[0]]
else: else:
return [shape[1]] return [shape[1]]
def _define_tensor4d_desc(self, name, id): def get_op_params(self):
return """
cudnnTensorDescriptor_t %(id)s_%(name)s;
""" % dict(name=name, id=id)
def _init_tensor4d_desc(self, name, id, fail):
    """Return C code allocating the cuDNN tensor descriptor `id`_`name`.

    The descriptor is NULLed first so _clean_tensor4d_desc can tell
    whether creation succeeded; on failure a MemoryError is raised and
    control jumps to the Theano-supplied `fail` label.
    """
    return """
%(id)s_%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
": %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, id=id, fail=fail)
def _clean_tensor4d_desc(self, name, id):
    """Return C code destroying `id`_`name` if it was allocated."""
    return """
if(%(id)s_%(name)s!= NULL)
cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
""" % dict(name=name, id=id)
def c_support_code_struct(self, node, name):
    # Declare one cudnnTensorDescriptor_t per entry of
    # self.tensor_4d_descs (the softmax inputs plus 'softmax_output').
    result = ''
    for id in self.tensor_4d_descs:
        result += self._define_tensor4d_desc(name, id)
    return result
def c_init_code_struct(self, node, name, sub):
    """Emit the error variable plus allocation code for every descriptor.

    `err%(name)s` is shared by all the per-descriptor init snippets
    produced by _init_tensor4d_desc.
    """
    result = """
cudnnStatus_t err%(name)s;
""" % dict(name=name)
    for id in self.tensor_4d_descs:
        result += self._init_tensor4d_desc(name, id, sub['fail'])
    return result
def c_cleanup_code_struct(self, node, name):
    # Destroy every descriptor allocated in c_init_code_struct; each
    # snippet is guarded by a NULL check, so partial init is safe.
    result = ''
    for id in self.tensor_4d_descs:
        result += self._clean_tensor4d_desc(name, id)
    return result
def c_code(self, node, name, inputs, outputs, sub):
ins = inputs
outs, = outputs
if self.tensor_format == 'b01c':
tensor_format = 1
else:
tensor_format = 0
if self.mode == 'instance': if self.mode == 'instance':
mode = 1 mode = "CUDNN_SOFTMAX_MODE_INSTANCE"
else: else:
mode = 0 mode = "CUDNN_SOFTMAX_MODE_CHANNEL"
if self.algo == 'fast': if self.algo == 'fast':
algo = 1 algo = "CUDNN_SOFTMAX_FAST"
elif self.algo == 'log':
algo = "CUDNN_SOFTMAX_LOG"
else: else:
algo = 0 algo = "CUDNN_SOFTMAX_ACCURATE"
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(name)s = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
format%(name)s = CUDNN_TENSOR_NHWC;
cudnnSoftmaxAlgorithm_t algo%(name)s = CUDNN_SOFTMAX_ACCURATE;
if (%(algo)d == 1)
algo%(name)s = CUDNN_SOFTMAX_FAST;
cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(name=name, tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs):
result += c_set_tensor4d(ins[input_idx], input_name + "_" + name,
"err" + name, sub['fail'])
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
name=name)
for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input
subs['ins%d' % idx] = inputs[idx]
# Build and prepare the output variable.
result += """
if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
PyGpuArray_DIMS(%(ins)s), %(ins)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0)
{
%(fail)s
}
""" % subs
result += c_set_tensor4d(outs,
"softmax_output_" + name,
"err" + name, sub['fail'])
# Add on a call to the method that does the actual work.
result += self.method() % subs
return result
def c_code_cache_version(self):
    # Manually-bumped revision pair plus the cuDNN version, so the
    # compiled module cache is invalidated on either a code change or a
    # library upgrade.
    return (0, 7, version())
def method(self): return [("SOFTMAX_MODE", mode), ("SOFTMAX_ALGO", algo)]
raise NotImplementedError('GpuDnnSoftmaxBase::method')
class GpuDnnSoftmax(GpuDnnSoftmaxBase): class GpuDnnSoftmax(GpuDnnSoftmaxBase):
""" """
Op for the cuDNN Softmax. Op for the cuDNN Softmax.
tensor_format
Whether the data format is 'bc01' or 'b01c'.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively. optimized for speed or accuracy respectively.
...@@ -1452,55 +1143,23 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1452,55 +1143,23 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
image across 'c'. image across 'c'.
""" """
direction = "forward"
direction = 'forward' file = "dnn_softmax.c"
softmax_inputs = ['softmax_input'] c_func = "APPLY_SPECIFIC(softmax)"
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
assert x.ndim == 4 assert x.ndim == 4
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def method(self):
    """Return the C snippet performing the cudnnSoftmaxForward call.

    The #ifndef CUDNN_VERSION branch targets the original cuDNN API,
    whose cudnnSoftmaxForward takes no scaling factors; the #else branch
    targets later releases, which take alpha/beta blend parameters
    (fixed here to 1/0, i.e. plain overwrite of the output).
    Placeholders (%(name)s, %(ins)s, %(outs)s) are filled in by
    GpuDnnSoftmaxBase.c_code.
    """
    return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(name)s,
mode%(name)s,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(name)s,
mode%(name)s,
(void*) &alpha,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
#endif
"""
def grad(self, inp, grads): def grad(self, inp, grads):
x, = inp x, = inp
g_sm, = grads g_sm, = grads
sm = self.make_node(x).outputs[0] sm = self.make_node(x).outputs[0]
return [GpuDnnSoftmaxGrad( return [GpuDnnSoftmaxGrad(
self.tensor_format, self.algo,
self.algo, self.mode
self.mode )(g_sm, sm)]
)(g_sm, sm)]
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
...@@ -1509,8 +1168,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1509,8 +1168,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
Parameters Parameters
---------- ----------
tensor_format
Whether the data format is 'bc01' or 'b01c'.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively. optimized for speed or accuracy respectively.
...@@ -1521,7 +1178,8 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1521,7 +1178,8 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
""" """
direction = 'backward' direction = 'backward'
softmax_inputs = ['softmax_gout', 'softmax_input'] file = "dnn_softmax_grad.c"
c_func = "APPLY_SPECIFIC(softmax_grad)"
def make_node(self, dy, sm): def make_node(self, dy, sm):
dy = as_gpuarray_variable(dy) dy = as_gpuarray_variable(dy)
...@@ -1530,41 +1188,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1530,41 +1188,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
assert sm.ndim == 4 assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
def method(self):
    """Return the C snippet performing the cudnnSoftmaxBackward call.

    Mirrors GpuDnnSoftmax.method: the #ifndef CUDNN_VERSION branch uses
    the original alpha/beta-less API, the #else branch the newer one
    with blend factors fixed to 1/0.  %(name1)s/%(ins1)s is the softmax
    output and %(name0)s/%(ins0)s its incoming gradient, per the
    ['softmax_gout', 'softmax_input'] ordering of softmax_inputs;
    the result is written through softmax_output_%(name)s into
    %(outs)s.
    """
    return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(name)s,
mode%(name)s,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(name)s,
mode%(name)s,
(void*) &alpha,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
#endif
"""
# @register_opt('cudnn') # this optimizer is registered in opt.py instead. # @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
...@@ -1612,9 +1235,6 @@ def local_conv_dnn_alternative(node): ...@@ -1612,9 +1235,6 @@ def local_conv_dnn_alternative(node):
rval = dnn_conv(img, kern, rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample, border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint) direction_hint=direction_hint)
if node.outputs[0].broadcastable != rval.broadcastable:
rval = tensor.patternbroadcast(
rval, node.outputs[0].type.broadcastable)
return [rval] return [rval]
...@@ -1632,7 +1252,7 @@ def local_dnn_conv_inplace(node): ...@@ -1632,7 +1252,7 @@ def local_dnn_conv_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)] return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True) @local_optimizer([GpuDnnConvGradW], inplace=True)
...@@ -1645,7 +1265,7 @@ def local_dnn_convgw_inplace(node): ...@@ -1645,7 +1265,7 @@ def local_dnn_convgw_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(inplace=True)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True) @local_optimizer([GpuDnnConvGradI], inplace=True)
...@@ -1658,7 +1278,7 @@ def local_dnn_convgi_inplace(node): ...@@ -1658,7 +1278,7 @@ def local_dnn_convgi_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(inplace=True)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
...@@ -1671,46 +1291,40 @@ optdb.register('local_dnna_conv_inplace', ...@@ -1671,46 +1291,40 @@ optdb.register('local_dnna_conv_inplace',
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: return [GpuDnnConv(algo=node.op.algo)(*inputs)]
return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
return None
return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
return None
return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
...@@ -1736,7 +1350,7 @@ def local_pool_dnn_grad_stride(node): ...@@ -1736,7 +1350,7 @@ def local_pool_dnn_grad_stride(node):
return return
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, inp_grad = node.inputs inp, out, out_grad = node.inputs
ds = node.op.ds ds = node.op.ds
st = node.op.st st = node.op.st
pad = node.op.padding pad = node.op.padding
...@@ -1745,7 +1359,7 @@ def local_pool_dnn_grad_stride(node): ...@@ -1745,7 +1359,7 @@ def local_pool_dnn_grad_stride(node):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)() desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
return GpuDnnPoolGrad()(gpu_contiguous(inp), return GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
gpu_contiguous(inp_grad), gpu_contiguous(out_grad),
desc) desc)
...@@ -1756,18 +1370,19 @@ def local_avg_pool_dnn_grad_stride(node): ...@@ -1756,18 +1370,19 @@ def local_avg_pool_dnn_grad_stride(node):
return return
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, inp_grad = node.inputs inp, out_grad = node.inputs
ds = node.op.ds ds = node.op.ds
st = node.op.st st = node.op.st
pad = node.op.padding pad = node.op.padding
mode = node.op.mode mode = node.op.mode
cg = gpu_contiguous(out_grad)
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)() desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
contiguous_inp_grad = gpu_contiguous(inp_grad) # We reuse cg because CuDNN does not use the value of the `out`
return GpuDnnPoolGrad()(gpu_contiguous(inp), # argument but still checks its shape for average pooling. This
contiguous_inp_grad, # has been observed in v2 and v3 as far as I know.
contiguous_inp_grad, return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)
desc)
@register_opt('cudnn') @register_opt('cudnn')
...@@ -1778,11 +1393,27 @@ def local_softmax_dnn(node): ...@@ -1778,11 +1393,27 @@ def local_softmax_dnn(node):
if isinstance(node.op, GpuSoftmax): if isinstance(node.op, GpuSoftmax):
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x') ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins) ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins) out = GpuDnnSoftmax('accurate', 'channel')(ins)
out = as_gpuarray_variable(out.dimshuffle(0, 1)) out = as_gpuarray_variable(out.dimshuffle(0, 1))
return [out] return [out]
@register_opt('cudnn')
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
    """Fuse ``log(GpuDnnSoftmax(x))`` into one log-softmax cuDNN op.

    Rewrites an elementwise Log applied to a GpuDnnSoftmax output into a
    single GpuDnnSoftmax with the 'log' algorithm, which requires
    cuDNN v3 (version() >= 3000).  The softmax output must have exactly
    one client, otherwise the plain (non-log) softmax value is still
    needed elsewhere in the graph and the fusion would be invalid.
    """
    if not dnn_available() or version() < 3000:
        # No log-softmax before cudnn v3
        return
    if (isinstance(node.op, GpuElemwise) and
            isinstance(node.op.scalar_op, Log) and
            node.inputs[0].owner and
            isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
            len(node.inputs[0].clients) == 1):
        softmax_node = node.inputs[0].owner
        # Keep the original mode ('instance' or 'channel'); only the
        # algorithm changes to 'log'.
        new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
        return [new_softmax(softmax_node.inputs[0])]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
""" """
...@@ -1813,6 +1444,6 @@ def local_softmax_dnn_grad(node): ...@@ -1813,6 +1444,6 @@ def local_softmax_dnn_grad(node):
return return
ins.append(n.dimshuffle(0, 1, 'x', 'x')) ins.append(n.dimshuffle(0, 1, 'x', 'x'))
out = GpuDnnSoftmaxGrad('bc01', 'accurate', 'channel')( out = GpuDnnSoftmaxGrad('accurate', 'channel')(
gpu_contiguous(ins[0]), gpu_contiguous(ins[1])) gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
return [out.dimshuffle(0, 1)] return [out.dimshuffle(0, 1)]
#section support_code #section support_code
static cudnnHandle_t _handle = NULL;
static int static int
c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
cudnnDataType_t dt; cudnnDataType_t dt;
size_t ds; size_t ds;
switch (var->ga.typecode) { switch (var->ga.typecode) {
...@@ -12,26 +11,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -12,26 +11,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#if CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
#endif
default: default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d"); PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
return -1; return -1;
} }
ds = gpuarray_get_elsize(var->ga.typecode); ds = gpuarray_get_elsize(var->ga.typecode);
int str0, str1, str2, str3; int strs[5], dims[5], default_stride = 1;
// cudnn do not like 0s in strides unsigned int nd = PyGpuArray_NDIM(var);
str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3]; if (nd > 5) {
str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]; PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1]; return -1;
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx( }
desc, dt,
PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1), for (unsigned int _i = nd; _i > 0; _i--) {
PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3), unsigned int i = _i - 1;
str0, str1, str2, str3); strs[i] = PyGpuArray_STRIDE(var, i) ?
PyGpuArray_STRIDE(var, i)/ds : default_stride;
default_stride *= PyGpuArray_DIM(var, i);
dims[i] = PyGpuArray_DIM(var, i);
}
cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s", "Could not set tensorNd descriptor: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return -1; return -1;
} }
...@@ -53,14 +63,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -53,14 +63,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#if CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
#endif
default: default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter"); PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
return -1; return -1;
} }
cudnnStatus_t err = cudnnSetFilter4dDescriptor(
desc, dt, int dims[5];
PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1], unsigned int nd = PyGpuArray_NDIM(var);
PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
if (nd > 5) {
PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
return -1;
}
for (unsigned int _i = nd; _i > 0; _i--) {
unsigned int i = _i - 1;
dims[i] = PyGpuArray_DIM(var, i);
}
cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s.", "Could not set filter descriptor: %s.",
...@@ -72,15 +98,23 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -72,15 +98,23 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
#section init_code #section init_code
setup_ext_cuda();
#section support_code_struct
cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct
{ {
cuda_enter(pygpu_default_context()->ctx);
cudnnStatus_t err; cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { APPLY_SPECIFIC(_handle) = NULL;
if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s", PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
#if PY_MAJOR_VERSION >= 3 cuda_exit(pygpu_default_context()->ctx);
return NULL; FAIL;
#else
return;
#endif
} }
cuda_exit(pygpu_default_context()->ctx);
} }
...@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL; ...@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL; APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL; APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
......
...@@ -10,14 +10,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -10,14 +10,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size"); "images and kernel must have the same stack size");
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -28,6 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -28,6 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -42,56 +44,179 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -42,56 +44,179 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
Py_INCREF(*output); Py_INCREF(*output);
#else #else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, om->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*output, om)) if (beta != 0.0 && pygpu_move(*output, om))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
/* Static variables are only initialized once so this will not
* reset the previous algo every time */
static int reuse_algo = 0;
static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_kern_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU: %s\n",
cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
}
#endif
#if CUDNN_VERSION < 3000
/* cuDNN before v3 does not support kernels larger than input even
* if appropriate padding is selected. */
for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
PyErr_SetString(PyExc_RuntimeError, "the current version "
"of CuDNN does not support kernels larger than the "
"inputs in any spatial dimension, even if the inputs "
"are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue.");
cuda_exit(c->ctx);
return 1;
}
}
#endif
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c; err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
CONV_ALGO, algo,
&worksize); &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s", "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
* to place a nice get_work_mem() function in. * to place a nice get_work_mem() function in.
*/ */
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory"); "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
_handle, APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, CONV_ALGO, desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
...@@ -99,9 +224,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -99,9 +224,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
} }
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
...@@ -9,14 +9,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -9,14 +9,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"GpuDnnConv images and kernel must have the same stack size"); "stack size");
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -41,26 +43,156 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -41,26 +43,156 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, im->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*input, im)) if (beta != 0.0 && pygpu_move(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardData( cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
_handle,
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_kern_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 0) > 1024 || PyGpuArray_DIM(*input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (worksize != 0) {
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
}
err = cudnnConvolutionBackwardData_v3(
APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (worksize != 0)
c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
#section support_code_struct #section support_code_struct
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -16,9 +17,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -16,9 +17,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
switch (input->ga.typecode) { switch (input->ga.typecode) {
...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -41,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -41,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else #else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, km->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*kerns, km)) if (beta != 0.0 && pygpu_move(*kerns, km))
return 1; return 1;
...@@ -51,16 +52,148 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -51,16 +52,148 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardFilter( cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
_handle,
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#ifdef CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(*kerns, 0) == 1 && PyGpuArray_DIM(*kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (worksize != 0) {
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
}
err = cudnnConvolutionBackwardFilter_v3(
APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (worksize != 0)
c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
#section support_code_struct

/* Per-apply cuDNN tensor descriptors for the pooling input and output. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);

#section init_code_struct

/* Status of descriptor creation, so init failures can be reported. */
cudnnStatus_t APPLY_SPECIFIC(err);
/* Start at NULL so cleanup can tell which descriptors were actually created. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

#section cleanup_code_struct

/* Only destroy descriptors that were successfully created. */
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
#section support_code_struct
/* Forward pooling (mode taken from `desc`: max or average) using cuDNN.
 *
 * img:  contiguous input, 4d or 5d (batch, channels, spatial...).
 * desc: cuDNN pooling descriptor (mode, window, padding, strides).
 * out:  receives the pooled output (allocated/reused by theano_prep_output).
 *
 * Returns 0 on success, 1 on error with a Python exception set.
 */
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             cudnnPoolingDescriptor_t desc,
                             PyGpuArrayObject **out) {
  cudnnStatus_t err;
  size_t dims[5];
  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
    return 1;

  cudnnPoolingMode_t mode;
  int w[3];
  int p[3];
  int s[3];
  int ndims;

  /* Query window/padding/stride back from the descriptor; ndims is the
     number of pooled spatial dimensions (2 or 3). */
  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Fixed: previous message named the wrong API (cudnnGetPoolingDescriptor). */
    PyErr_Format(PyExc_RuntimeError,
                 "error doing cudnnGetPoolingNdDescriptor operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  /* Standard pooling output size per spatial dim:
     (in + 2*pad - window) / stride + 1. */
  dims[0] = PyGpuArray_DIM(img, 0);
  dims[1] = PyGpuArray_DIM(img, 1);
  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
  if (ndims == 3)
    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;

  if (theano_prep_output(out, ndims+2, dims, img->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  {
    /* cuDNN scaling factors must match the compute type: double scalars
       for float64 data, float scalars for float32/float16. */
    const float alphaf = 1;
    const float betaf = 0;
    const double alphad = 1;
    const double betad = 0;
    void *alpha, *beta;
    switch (img->ga.typecode) {
    case GA_DOUBLE:
      alpha = (void *)&alphad;
      beta = (void *)&betad;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha = (void *)&alphaf;
      beta = (void *)&betaf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling");
      return 1;
    }
    cuda_enter(c->ctx);
    err = cudnnPoolingForward(
      APPLY_SPECIFIC(_handle), desc,
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct

/* Per-apply cuDNN tensor descriptors for pooling backward: the forward
   input and output, plus the gradients with respect to each. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);

#section init_code_struct

/* Start at NULL so cleanup can tell which descriptors were actually created. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;

{
  cudnnStatus_t err;
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

/* Only destroy descriptors that were successfully created. */
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
#section support_code_struct
/* Backward pooling using cuDNN.
 *
 * inp:      the forward pass's input (contiguous).
 * out:      the forward pass's output (contiguous).
 * out_grad: gradient with respect to `out` (contiguous).
 * desc:     the pooling descriptor used in the forward pass.
 * inp_grad: receives the gradient with respect to `inp`.
 *
 * Returns 0 on success, 1 on error with a Python exception set.
 */
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
                                  cudnnPoolingDescriptor_t desc,
                                  PyGpuArrayObject **inp_grad) {
  cudnnStatus_t err;
  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous output gradients are supported.");
    return 1;
  }

  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
    return 1;
  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  /* Fixed: reuse the context already fetched into `c` instead of calling
     pygpu_default_context() a second time (consistent with dnn_pool). */
  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
                         PyGpuArray_DIMS(inp), inp->ga.typecode,
                         GA_C_ORDER, c) != 0) {
    return 1;
  }

  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;

  {
    /* cuDNN scaling factors must match the compute type: double scalars
       for float64 data, float scalars for float32/float16. */
    const float alphaf = 1;
    const float betaf = 0;
    const double alphad = 1;
    const double betad = 0;
    void *alpha, *beta;
    switch (inp->ga.typecode) {
    case GA_DOUBLE:
      alpha = (void *)&alphad;
      beta = (void *)&betad;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha = (void *)&alphaf;
      beta = (void *)&betaf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling gradient");
      return 1;
    }
    cuda_enter(c->ctx);
    err = cudnnPoolingBackward(
      APPLY_SPECIFIC(_handle), desc,
      alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
      beta,
      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
    );
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct

/* Per-apply cuDNN tensor descriptors for softmax input and output. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);

#section init_code_struct

/* Start at NULL so cleanup can tell which descriptors were actually created. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;

{
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

/* Only destroy descriptors that were successfully created. */
if (APPLY_SPECIFIC(input) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
#section support_code_struct
/* Apply cuDNN softmax forward to `x`, writing the result into `*out`.
 *
 * The algorithm and mode come from the compile-time SOFTMAX_ALGO and
 * SOFTMAX_MODE macros.  Returns 0 on success, 1 on error with a Python
 * exception set.
 */
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
                            PyGpuArrayObject **out) {
  PyGpuContextObject *ctx = pygpu_default_context();
  cudnnStatus_t status;

  /* Describe the input and the (freshly prepared) same-shape output. */
  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (theano_prep_output(out, PyGpuArray_NDIM(x),
                         PyGpuArray_DIMS(x), x->ga.typecode,
                         GA_C_ORDER, ctx) != 0)
    return 1;
  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  /* cuDNN scaling factors must match the compute type: double scalars
     for float64 data, float scalars for float32/float16. */
  const double one_d = 1, zero_d = 0;
  const float one_f = 1, zero_f = 0;
  void *scale, *blend;
  if (x->ga.typecode == GA_DOUBLE) {
    scale = (void *)&one_d;
    blend = (void *)&zero_d;
  } else if (x->ga.typecode == GA_FLOAT || x->ga.typecode == GA_HALF) {
    scale = (void *)&one_f;
    blend = (void *)&zero_f;
  } else {
    PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax");
    return 1;
  }

  cuda_enter(ctx->ctx);
  status = cudnnSoftmaxForward(
    APPLY_SPECIFIC(_handle), SOFTMAX_ALGO, SOFTMAX_MODE,
    scale,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(x),
    blend,
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
  cuda_exit(ctx->ctx);

  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
                 cudnnGetErrorString(status));
    return 1;
  }
  return 0;
}
#section support_code_struct

/* Per-apply cuDNN tensor descriptors: gradient wrt the softmax output (dy),
   the softmax output itself (sm), and the gradient wrt its input (dx). */
cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dx);

#section init_code_struct

/* Start at NULL so cleanup can tell which descriptors were actually created. */
APPLY_SPECIFIC(dy) = NULL;
APPLY_SPECIFIC(sm) = NULL;
APPLY_SPECIFIC(dx) = NULL;

{
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dx));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

/* Only destroy descriptors that were successfully created. */
if (APPLY_SPECIFIC(dy) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
if (APPLY_SPECIFIC(sm) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
if (APPLY_SPECIFIC(dx) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dx));
#section support_code_struct
/* Compute the gradient of cuDNN softmax.
 *
 * dy: gradient with respect to the softmax output.
 * sm: the softmax output from the forward pass.
 * dx: receives the gradient with respect to the softmax input.
 *
 * Returns 0 on success, 1 on error with a Python exception set.
 */
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                 PyGpuArrayObject *sm,
                                 PyGpuArrayObject **dx) {
  PyGpuContextObject *ctx = pygpu_default_context();
  cudnnStatus_t status;

  /* Describe the incoming arrays and the (freshly prepared) result. */
  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
    return 1;
  if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
    return 1;
  if (theano_prep_output(dx, PyGpuArray_NDIM(dy),
                         PyGpuArray_DIMS(dy), dy->ga.typecode,
                         GA_C_ORDER, ctx) != 0)
    return 1;
  if (c_set_tensorNd(*dx, APPLY_SPECIFIC(dx)) != 0)
    return 1;

  /* cuDNN scaling factors must match the compute type: double scalars
     for float64 data, float scalars for float32/float16. */
  const double one_d = 1, zero_d = 0;
  const float one_f = 1, zero_f = 0;
  void *scale, *blend;
  if (sm->ga.typecode == GA_DOUBLE) {
    scale = (void *)&one_d;
    blend = (void *)&zero_d;
  } else if (sm->ga.typecode == GA_FLOAT || sm->ga.typecode == GA_HALF) {
    scale = (void *)&one_f;
    blend = (void *)&zero_f;
  } else {
    PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax gradient");
    return 1;
  }

  cuda_enter(ctx->ctx);
  status = cudnnSoftmaxBackward(
    APPLY_SPECIFIC(_handle), SOFTMAX_ALGO, SOFTMAX_MODE,
    scale,
    APPLY_SPECIFIC(sm), PyGpuArray_DEV_DATA(sm),
    APPLY_SPECIFIC(dy), PyGpuArray_DEV_DATA(dy),
    blend,
    APPLY_SPECIFIC(dx), PyGpuArray_DEV_DATA(*dx));
  cuda_exit(ctx->ctx);

  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
                 cudnnGetErrorString(status));
    return 1;
  }
  return 0;
}
...@@ -22,14 +22,12 @@ from . import test_nnet ...@@ -22,14 +22,12 @@ from . import test_nnet
def test_dnn_conv_desc_merge(): def test_dnn_conv_desc_merge():
if not dnn.dnn_available(): if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img_shp = T.as_tensor_variable(
numpy.asarray([2, 1, 8, 8]).astype('int64'))
kern_shp = T.as_tensor_variable( kern_shp = T.as_tensor_variable(
numpy.asarray([3, 1, 2, 2]).astype('int64')) numpy.asarray([3, 1, 2, 2]).astype('int64'))
desc1 = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(2, 2), desc1 = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(2, 2),
conv_mode='conv')(img_shp, kern_shp) conv_mode='conv')(kern_shp)
desc2 = dnn.GpuDnnConvDesc(border_mode='full', subsample=(1, 1), desc2 = dnn.GpuDnnConvDesc(border_mode='full', subsample=(1, 1),
conv_mode='cross')(img_shp, kern_shp) conv_mode='cross')(kern_shp)
# CDataType is not DeepCopyable so this will crash if we don't use # CDataType is not DeepCopyable so this will crash if we don't use
# borrow=True # borrow=True
f = theano.function([], [theano.Out(desc1, borrow=True), f = theano.function([], [theano.Out(desc1, borrow=True),
...@@ -51,7 +49,7 @@ def test_dnn_conv_merge(): ...@@ -51,7 +49,7 @@ def test_dnn_conv_merge():
kern = T.ftensor4('kern') kern = T.ftensor4('kern')
out = T.ftensor4('out') out = T.ftensor4('out')
desc = dnn.GpuDnnConvDesc( desc = dnn.GpuDnnConvDesc(
border_mode='valid')(img.shape, kern.shape) border_mode='valid')(kern.shape)
# Test forward op # Test forward op
o1 = dnn.dnn_conv(img, kern) o1 = dnn.dnn_conv(img, kern)
...@@ -90,9 +88,9 @@ def test_dnn_conv_inplace(): ...@@ -90,9 +88,9 @@ def test_dnn_conv_inplace():
kern = T.ftensor4('kern') kern = T.ftensor4('kern')
out = T.ftensor4('out') out = T.ftensor4('out')
desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')( desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')(
img.shape, kern.shape) kern.shape)
desc2 = dnn.GpuDnnConvDesc( desc2 = dnn.GpuDnnConvDesc(
border_mode='valid', conv_mode='cross')(img.shape, kern.shape) border_mode='valid', conv_mode='cross')(kern.shape)
# Test forward op # Test forward op
o1 = dnn.dnn_conv(img, kern, conv_mode='conv') o1 = dnn.dnn_conv(img, kern, conv_mode='conv')
...@@ -175,8 +173,6 @@ def test_pooling(): ...@@ -175,8 +173,6 @@ def test_pooling():
func = T.max func = T.max
else: else:
func = T.mean func = T.mean
if pad != (0, 0) and dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean: if pad != (0, 0) and func is T.mean:
continue continue
...@@ -209,11 +205,10 @@ def test_pooling(): ...@@ -209,11 +205,10 @@ def test_pooling():
(32, 1, 147, 197), (32, 1, 147, 197),
]: ]:
data = numpy.random.normal(0, 1, shp).astype("float32") data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__() a = f1(data)
b = f2(data)
b = f2(data).__array__() utt.assert_allclose(a, b)
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad # Test the grad
for shp in [(1, 1, 2, 2), for shp in [(1, 1, 2, 2),
...@@ -230,9 +225,9 @@ def test_pooling(): ...@@ -230,9 +225,9 @@ def test_pooling():
def fn(x): def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True, return max_pool_2d(x, (ws, ws), ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
theano.tests.unittest_tools.verify_grad(fn, [data], utt.verify_grad(fn, [data],
cast_to_output_type=False, cast_to_output_type=False,
mode=mode_with_gpu) mode=mode_with_gpu)
# Confirm that the opt would have inserted it. # Confirm that the opt would have inserted it.
fg = theano.function([x], theano.grad(fn(x).sum(), x), fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -247,10 +242,9 @@ def test_pooling(): ...@@ -247,10 +242,9 @@ def test_pooling():
pad=pad, pad=pad,
mode=mode) mode=mode)
return dnn_op return dnn_op
theano.tests.unittest_tools.verify_grad( utt.verify_grad(fn, [data],
fn, [data], cast_to_output_type=False,
cast_to_output_type=False, mode=mode_with_gpu)
mode=mode_with_gpu)
# Confirm that we get the good op. # Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x), fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -258,7 +252,7 @@ def test_pooling(): ...@@ -258,7 +252,7 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()]) for node in fg.maker.fgraph.toposort()])
g_out = fg(data) g_out = fg(data)
# Compare again the CPU result # Compare against the CPU result
out = max_pool_2d(x, (ws, ws), out = max_pool_2d(x, (ws, ws),
padding=pad, padding=pad,
ignore_border=True, mode=mode) ignore_border=True, mode=mode)
...@@ -271,7 +265,7 @@ def test_pooling(): ...@@ -271,7 +265,7 @@ def test_pooling():
assert any([isinstance(node.op, AveragePoolGrad) assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
c_out = fc(data) c_out = fc(data)
assert numpy.allclose(c_out, g_out) utt.assert_allclose(c_out, g_out)
def test_pooling_opt(): def test_pooling_opt():
...@@ -353,7 +347,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -353,7 +347,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
) )
self._compile_and_check( self._compile_and_check(
[t], [t],
[dnn.GpuDnnSoftmax('bc01', 'accurate', 'channel')(t)], [dnn.GpuDnnSoftmax('accurate', 'channel')(t)],
[rand_tensor], [rand_tensor],
dnn.GpuDnnSoftmax dnn.GpuDnnSoftmax
) )
...@@ -363,7 +357,6 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -363,7 +357,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
[ [
T.grad( T.grad(
dnn.GpuDnnSoftmax( dnn.GpuDnnSoftmax(
'bc01',
'accurate', 'accurate',
'channel' 'channel'
)(t).mean(), )(t).mean(),
...@@ -403,7 +396,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -403,7 +396,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
border_mode=params[0], border_mode=params[0],
subsample=params[1], subsample=params[1],
conv_mode=params[2] conv_mode=params[2]
)(img.shape, kerns.shape) )(kerns.shape)
conv = dnn.GpuDnnConv()(img, kerns, out, desc) conv = dnn.GpuDnnConv()(img, kerns, out, desc)
self._compile_and_check( self._compile_and_check(
[img, kerns, out], [img, kerns, out],
...@@ -447,7 +440,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -447,7 +440,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
border_mode=params[0], border_mode=params[0],
subsample=params[1], subsample=params[1],
conv_mode=params[2] conv_mode=params[2]
)(temp_img.shape, out.shape) )(out.shape)
conv_grad_w = dnn.GpuDnnConvGradW()( conv_grad_w = dnn.GpuDnnConvGradW()(
temp_img, temp_img,
temp_kerns, temp_kerns,
...@@ -467,42 +460,41 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -467,42 +460,41 @@ class TestDnnInferShapes(utt.InferShapeTester):
img = T.ftensor4('img') img = T.ftensor4('img')
kerns = T.ftensor4('kerns') kerns = T.ftensor4('kerns')
out = T.ftensor4('out') out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(3, 4, 5, 6),
dtype='float32'
)
kern_vals = numpy.asarray( kern_vals = numpy.asarray(
numpy.random.rand(13, 14, 15, 16), numpy.random.rand(13, 14, 15, 16),
dtype='float32' dtype='float32'
) )
out_vals = numpy.asarray(
numpy.random.rand(3, 13, 5, 6),
dtype='float32'
)
for params in product( for params in product(
['valid'], # Should this work for 'full'? ['valid'], # Should this work for 'full'?
[(1, 1)], [(1, 1)],
['conv', 'cross'] ['conv', 'cross']
): ):
temp_kerns = kerns.dimshuffle(1, 0, 2, 3)
shape = ( shape = (
img_val.shape[0], kern_vals.shape[1], out_vals.shape[0], kern_vals.shape[1],
img_val.shape[2] + kern_vals.shape[2] - 1, out_vals.shape[2] + kern_vals.shape[2] - 1,
img_val.shape[3] + kern_vals.shape[3] - 1 out_vals.shape[3] + kern_vals.shape[3] - 1
) )
out_vals = numpy.zeros(shape, dtype='float32') img_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc( desc = dnn.GpuDnnConvDesc(
border_mode=params[0], border_mode=params[0],
subsample=params[1], subsample=params[1],
conv_mode=params[2] conv_mode=params[2]
)(out.shape, temp_kerns.shape) )(kerns.shape)
conv_grad_i = dnn.GpuDnnConvGradI()( conv_grad_i = dnn.GpuDnnConvGradI()(
temp_kerns, kerns,
img,
out, out,
img,
desc, desc,
) )
self._compile_and_check( self._compile_and_check(
[temp_kerns, img, out], [kerns, img, out],
[conv_grad_i], [conv_grad_i],
[kern_vals, img_val, out_vals], [kern_vals, img_vals, out_vals],
dnn.GpuDnnConvGradI dnn.GpuDnnConvGradI
) )
...@@ -612,15 +604,9 @@ def test_dnn_conv_alpha_output_merge(): ...@@ -612,15 +604,9 @@ def test_dnn_conv_alpha_output_merge():
lr = numpy.asarray(0.05, dtype='float32') lr = numpy.asarray(0.05, dtype='float32')
if dnn.version() == -1: fr = lr * (conv + out)
# Can't merge alpha with cudnn v1 wr = kern + lr * gw
fr = conv + out ir = img + lr * gi
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu) f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op, assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
...@@ -657,9 +643,6 @@ def test_dnn_conv_alpha_output_merge(): ...@@ -657,9 +643,6 @@ def test_dnn_conv_alpha_output_merge():
def test_dnn_conv_grad(): def test_dnn_conv_grad():
if not dnn.dnn_available() or dnn.version() == -1:
raise SkipTest('alpha != 1.0 not supported in cudnn v1')
b = 1 b = 1
c = 4 c = 4
f = 3 f = 3
...@@ -674,18 +657,18 @@ def test_dnn_conv_grad(): ...@@ -674,18 +657,18 @@ def test_dnn_conv_grad():
def dconv(img, kern, out): def dconv(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(kern.shape)
return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75) return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)
def dconvi(img, kern, out): def dconvi(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(kern.shape)
return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0, return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
beta=0.0) beta=0.0)
def dconvw(img, kern, out): def dconvw(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(kern.shape)
return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75, return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
beta=-1.0) beta=-1.0)
...@@ -697,7 +680,7 @@ def test_dnn_conv_grad(): ...@@ -697,7 +680,7 @@ def test_dnn_conv_grad():
def test_version(): def test_version():
if not dnn.dnn_available(): if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
assert isinstance(dnn.version(), (int, tuple)) assert isinstance(dnn.version(), int)
class test_SoftMax(test_nnet.test_SoftMax): class test_SoftMax(test_nnet.test_SoftMax):
...@@ -706,7 +689,7 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -706,7 +689,7 @@ class test_SoftMax(test_nnet.test_SoftMax):
mode = mode_with_gpu mode = mode_with_gpu
def test_softmax_shape_0(self): def test_softmax_shape_0(self):
raise SkipTest("Cudnn do not suport 0 shapes") raise SkipTest("Cudnn doesn't support 0 shapes")
def test_softmax_grad(self): def test_softmax_grad(self):
def cmp(n, m, f, f_gpu): def cmp(n, m, f, f_gpu):
...@@ -715,13 +698,12 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -715,13 +698,12 @@ class test_SoftMax(test_nnet.test_SoftMax):
out = f(data) out = f(data)
gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0] gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
assert numpy.allclose(out, gout), numpy.absolute(out - gout) utt.assert_allclose(out, gout)
x = T.matrix('x', 'float32') x = T.matrix('x', 'float32')
x_gpu = T.tensor4('x_gpu', 'float32') x_gpu = T.tensor4('x_gpu', 'float32')
f_z = T.nnet.softmax_op f_z = T.nnet.softmax_op
f_gpu = dnn.GpuDnnSoftmax( f_gpu = dnn.GpuDnnSoftmax(
'bc01',
'accurate', 'accurate',
'channel' 'channel'
) )
...@@ -763,14 +745,14 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -763,14 +745,14 @@ class test_SoftMax(test_nnet.test_SoftMax):
for i in sorted_f for i in sorted_f
if isinstance( if isinstance(
i.op, i.op,
self.gpu_grad_op self.gpu_grad_op)
)]) == 1) ]) == 1)
assert(len([i assert(len([i
for i in sorted_f for i in sorted_f
if isinstance( if isinstance(
i.op, i.op,
theano.tensor.nnet.SoftmaxGrad theano.tensor.nnet.SoftmaxGrad)
)]) == 0) ]) == 0)
# Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
# optimization is not applied when cudnn is excluded or not # optimization is not applied when cudnn is excluded or not
...@@ -787,14 +769,14 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -787,14 +769,14 @@ class test_SoftMax(test_nnet.test_SoftMax):
for i in sorted_f for i in sorted_f
if isinstance( if isinstance(
i.op, i.op,
self.gpu_grad_op self.gpu_grad_op)
)]) == 0) ]) == 0)
assert(len([i assert(len([i
for i in sorted_f for i in sorted_f
if isinstance( if isinstance(
i.op, i.op,
theano.tensor.nnet.SoftmaxGrad theano.tensor.nnet.SoftmaxGrad)
)]) == 1) ]) == 1)
# Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not
# crash with manual graph # crash with manual graph
...@@ -806,11 +788,49 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -806,11 +788,49 @@ class test_SoftMax(test_nnet.test_SoftMax):
for i in sorted_f for i in sorted_f
if isinstance( if isinstance(
i.op, i.op,
self.gpu_grad_op self.gpu_grad_op)
)]) == 1) ]) == 1)
assert(len([i assert(len([i
for i in sorted_f for i in sorted_f
if isinstance( if isinstance(
i.op, i.op,
theano.tensor.nnet.SoftmaxGrad theano.tensor.nnet.SoftmaxGrad)
)]) == 0) ]) == 0)
def test_log_softmax(self):
# This is a test for an optimization that depends on CuDNN v3 or
# more recent. Don't test if the CuDNN version is too old.
if dnn.version() < 3000:
raise SkipTest("Log-softmax is only in cudnn v3+")
x = T.ftensor4()
softmax_out = dnn.GpuDnnSoftmax('accurate', 'channel')(x)
log_out = T.log(T.as_tensor_variable(softmax_out))
f = theano.function([x], log_out, mode=mode_with_gpu)
# Ensure that the optimization has been applied
dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
isinstance(n.op, dnn.GpuDnnSoftmax)]
assert len(dnn_softmax_nodes) == 1
assert dnn_softmax_nodes[0].op.algo == "log"
# Ensure that the output of the function is valid
input_shapes = [(3, 4, 5, 6),
(1025, 2, 3, 4),
(2, 1025, 3, 4),
(2, 3, 1025, 4),
(2, 3, 4, 1025),
(66000, 2, 3, 4),
(2, 66000, 3, 4),
(2, 3, 66000, 4),
(2, 3, 4, 66000)]
for inp_shape in input_shapes:
input_val = numpy.random.normal(0, 1, inp_shape).astype("float32")
out = f(input_val)
expected_out = numpy.log(numpy.exp(input_val) /
numpy.exp(input_val).sum(1)[:, None, :, :])
utt.assert_allclose(out, expected_out)
...@@ -326,7 +326,6 @@ class test_SoftMax(unittest.TestCase): ...@@ -326,7 +326,6 @@ class test_SoftMax(unittest.TestCase):
return f, f_gpu return f, f_gpu
def _cmp(self, n, m, f, f_gpu): def _cmp(self, n, m, f, f_gpu):
# print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
...@@ -349,8 +348,6 @@ class test_SoftMax(unittest.TestCase): ...@@ -349,8 +348,6 @@ class test_SoftMax(unittest.TestCase):
self._cmp self._cmp
) )
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu) self._cmp(2 << 15, 5, f, f_gpu)
def test_softmax_shape_0(self): def test_softmax_shape_0(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论