提交 1ef9be9d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3356 from abergeron/gpuarray_cudnnv3

cuDNN v3 support for gpuarray
...@@ -10,7 +10,6 @@ from theano.gof import Optimizer, local_optimizer, COp ...@@ -10,7 +10,6 @@ from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import ( from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
...@@ -28,6 +27,8 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt ...@@ -28,6 +27,8 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
import theano.sandbox.dnn_flags
def dnn_available(): def dnn_available():
if dnn_available.avail is None: if dnn_available.avail is None:
...@@ -62,8 +63,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -62,8 +63,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
comp, out, err = NVCC_compiler.try_flags( comp, out, err = NVCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__), ["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + os.path.join(theano.config.cuda.root, 'include'), "-I" + config.dnn.include_path,
"-L" + os.path.join(theano.config.cuda.root, 'lib64')], "-L" + config.dnn.library_path],
preambule=preambule, body=body, preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
...@@ -141,7 +142,6 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) { ...@@ -141,7 +142,6 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
%(fail)s %(fail)s
} }
} }
""" % dict(var=var, err=err, desc=desc, fail=fail) """ % dict(var=var, err=err, desc=desc, fail=fail)
...@@ -359,37 +359,9 @@ class GpuDnnConvDesc(GpuOp): ...@@ -359,37 +359,9 @@ class GpuDnnConvDesc(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (2, version()) return (2, version())
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float32')) _zero = constant(numpy.asarray(0.0, dtype='float32'))
_one = constant(numpy.asarray(1.0, dtype='float32')) _one = constant(numpy.asarray(1.0, dtype='float32'))
_ifour = constant(numpy.asarray(4, dtype='int32'))
_ifive = constant(numpy.asarray(5, dtype='int32'))
def ensure_float(val, default, name): def ensure_float(val, default, name):
...@@ -406,20 +378,6 @@ def ensure_float(val, default, name): ...@@ -406,20 +378,6 @@ def ensure_float(val, default, name):
return val return val
def ensure_int(val, default, name):
if val is None:
return default.clone()
if not isinstance(val, Variable):
val = constant(val)
if hasattr(val, 'ndim') and val.ndim == 0:
val = as_scalar(val)
if not isinstance(val.type, theano.scalar.Scalar):
raise TypeError("%s: expected a scalar value" % (name,))
if not val.type.dtype == 'int32':
raise TypeError("%s: type is not int32" % (name,))
return val
class GpuDnnConv(DnnBase, COp): class GpuDnnConv(DnnBase, COp):
""" """
The forward convolution. The forward convolution.
...@@ -1448,11 +1406,12 @@ class GpuDnnPool(DnnBase): ...@@ -1448,11 +1406,12 @@ class GpuDnnPool(DnnBase):
or desc.type.ctype != 'cudnnPoolingDescriptor_t': or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t') raise TypeError('desc must be cudnnPoolingDescriptor_t')
dop = desc.owner.op if desc.owner is not None:
e_ndim = dop.get_ndim() + 2 # 4 or 5 dop = desc.owner.op
e_ndim = dop.get_ndim() + 2 # 4 or 5
if img.type.ndim != e_ndim: if img.type.ndim != e_ndim:
raise TypeError('img must be %dD tensor' % e_ndim) raise TypeError('img must be %dD tensor' % e_ndim)
return Apply(self, [img, desc], [img.type()]) return Apply(self, [img, desc], [img.type()])
...@@ -1616,19 +1575,21 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1616,19 +1575,21 @@ class GpuDnnPoolGrad(DnnBase):
or desc.type.ctype != 'cudnnPoolingDescriptor_t': or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t') raise TypeError('desc must be cudnnPoolingDescriptor_t')
nd = desc.owner.op.get_ndim() + 2 # 4 or 5
inp = as_cuda_ndarray_variable(inp) inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
inp_grad = as_cuda_ndarray_variable(inp_grad) inp_grad = as_cuda_ndarray_variable(inp_grad)
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,))
out = as_cuda_ndarray_variable(out) out = as_cuda_ndarray_variable(out)
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,)) if desc.owner is not None:
nd = desc.owner.op.get_ndim() + 2 # 4 or 5
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,))
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,))
return Apply(self, [inp, out, inp_grad, desc], return Apply(self, [inp, out, inp_grad, desc],
[inp.type()]) [inp.type()])
...@@ -1819,7 +1780,7 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1819,7 +1780,7 @@ class GpuDnnSoftmaxBase(DnnBase):
Parameters Parameters
---------- ----------
tensor_format tensor_format
Whether the data format is 'bc01' or 'b01c'. Always set this to 'bc01'.
algo algo
'fast', 'accurate' or 'log' indicating whether, respectively, computations 'fast', 'accurate' or 'log' indicating whether, respectively, computations
should be optimized for speed, for accuracy, or if CuDNN should rather should be optimized for speed, for accuracy, or if CuDNN should rather
...@@ -1834,7 +1795,13 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1834,7 +1795,13 @@ class GpuDnnSoftmaxBase(DnnBase):
__props__ = ('tensor_format', 'mode', 'algo') __props__ = ('tensor_format', 'mode', 'algo')
def __init__(self, tensor_format, algo, mode): def __init__(self, tensor_format, algo, mode):
assert(tensor_format in ('bc01', 'b01c')) if tensor_format != 'bc01':
raise ValueError(
"It was discovered that since December 2014, the "
"tensor_format parameter was ignored and the equivalent of "
"'bc01' is always used. Since your code seems to be using "
"another value, this might have affected previous results "
"ran with this code.")
DnnBase.__init__(self) DnnBase.__init__(self)
self.tensor_format = tensor_format self.tensor_format = tensor_format
...@@ -1976,7 +1943,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1976,7 +1943,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
Parameters Parameters
---------- ----------
tensor_format tensor_format
Whether the data format is 'bc01' or 'b01c'. Always set to 'bc01'.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively. optimized for speed or accuracy respectively.
...@@ -2044,7 +2011,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -2044,7 +2011,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
Parameters Parameters
---------- ----------
tensor_format tensor_format
Whether the data format is 'bc01' or 'b01c'. Always set to 'bc01'.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively. optimized for speed or accuracy respectively.
......
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import os.path
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano import config
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
#section support_code_apply
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc) {
cudnnStatus_t err;
int pad[3] = {PAD_0, PAD_1, PAD_2};
int strides[3] = {SUB_0, SUB_1, SUB_2};
int upscale[3] = {1, 1, 1};
#if BORDER_MODE == 0
pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1;
pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1;
#if NB_DIMS > 2
pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1;
#endif
#endif
if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
"expected %d, got %lld.", NB_DIMS,
(long long)PyArray_DIM(filt_shp, 0));
return -1;
}
err = cudnnCreateConvolutionDescriptor(desc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %s", cudnnGetErrorString(err));
return -1;
}
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale,
CONV_MODE);
return 0;
}
...@@ -4,193 +4,109 @@ ...@@ -4,193 +4,109 @@
#include <cudnn.h> #include <cudnn.h>
#ifndef CUDNN_VERSION #ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This is only for what we use
static inline const char *cudnnGetErrorString(cudnnStatus_t err) { #define CUDNN_VERSION -1
switch (err) { static inline int cudnnGetVersion() {
case CUDNN_STATUS_SUCCESS: return -1;
return "The operation completed successfully.";
case CUDNN_STATUS_NOT_INITIALIZED:
return "The handle was not initialized(Is your driver recent enought?).";
case CUDNN_STATUS_ALLOC_FAILED:
return "Ressource allocation failed inside the library.";
case CUDNN_STATUS_BAD_PARAM:
return "An incorrect value was passed in.";
case CUDNN_STATUS_ARCH_MISMATCH:
return "The current GPU does not support the required features (only cc 3.0+ are supported).";
case CUDNN_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed (probably due to a failure to bind texture).";
case CUDNN_STATUS_EXECUTION_FAILED:
return "A kernel failed to execute.";
case CUDNN_STATUS_INTERNAL_ERROR:
return "An internal cuDNN operation failed.";
case CUDNN_STATUS_NOT_SUPPORTED:
return "The combination of parameters is not currently supported.";
default:
return "Unknown error code.";
}
} }
#endif
// some macros to help support cudnn R1 while using R2 code. #include <assert.h>
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
static inline cudnnStatus_t #if CUDNN_VERSION < 3000
cudnnGetConvolution2dForwardOutputDim( // Here we define the R3 API in terms of functions in the R2 interface
const cudnnConvolutionDescriptor_t convDesc, // This is only for what we use
const cudnnTensorDescriptor_t inputTensorDesc,
const cudnnFilterDescriptor_t filterDesc,
int *n,
int *c,
int *h,
int *w) {
return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t; typedef int cudnnConvolutionBwdDataAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0 #define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2
static inline cudnnStatus_t static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnnGetConvolutionForwardAlgorithm(
cudnnHandle_t handle, cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc, const cudnnFilterDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t destDesc, const cudnnTensorDescriptor_t gradDesc,
cudnnConvolutionFwdPreference_t preference, cudnnConvolutionBwdDataAlgo_t algo,
size_t memoryLimitInbytes, size_t *sizeInBytes) {
cudnnConvolutionFwdAlgo_t *algo) {
*algo = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensor4dDescriptor_t destDesc,
cudnnConvolutionFwdAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0; *sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS; return CUDNN_STATUS_SUCCESS;
} }
static cudnnStatus_t cudnnConvolutionBackwardData_v3(
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
cudnnHandle_t handle, cudnnHandle_t handle,
const void *alpha, const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnFilterDescriptor_t filterDesc, const cudnnFilterDescriptor_t filterDesc,
const void *filterData, const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo, cudnnConvolutionBwdDataAlgo_t algo,
void *workSpace, void *workspace,
size_t workSpaceSizeInBytes, size_t workspaceSizeInBytes,
const void *beta, const void *beta,
const cudnnTensorDescriptor_t destDesc, const cudnnTensorDescriptor_t gradDesc,
void *destData) { void *gradData) {
assert(*(float *)alpha == 1.0); return cudnnConvolutionBackwardData(
cudnnAccumulateResult_t r; handle,
if (*(float *)beta == 0.0) { alpha,
r = CUDNN_RESULT_NO_ACCUMULATE; filterDesc,
} else if (*(float *)beta == 1.0) { filterData,
r = CUDNN_RESULT_ACCUMULATE; diffDesc,
} else { diffData,
assert(0 && "beta must be 0.0 or 1.0"); convDesc,
} beta,
return cudnnConvolutionForward(handle, srcDesc, srcData, gradDesc,
filterDesc, filterData, gradData);
convDesc, destDesc, destData,
r);
} }
#define cudnnConvolutionForward cudnnConvolutionForward_v2
static inline cudnnStatus_t typedef int cudnnConvolutionBwdFilterAlgo_t;
cudnnConvolutionBackwardFilter_v2(
cudnnHandle_t handle, #define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
const void *alpha, #define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
const cudnnTensorDescriptor_t srcDesc, #define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
const void *srcData,
static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc, const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnFilterDescriptor_t gradDesc, const cudnnFilterDescriptor_t gradDesc,
void *gradData) { cudnnConvolutionBwdDataAlgo_t algo,
assert(*(float *)alpha == 1.0); size_t *sizeInBytes) {
cudnnAccumulateResult_t r; *sizeInBytes = 0;
if (*(float *)beta == 0.0) { return CUDNN_STATUS_SUCCESS;
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
r);
} }
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2 static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
cudnnHandle_t handle,
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
cudnnHandle_t handle,
const void *alpha, const void *alpha,
const cudnnFilterDescriptor_t filterDesc, const cudnnTensorDescriptor_t srcDesc,
const void *filterData, const void *srcData,
const cudnnTensorDescriptor_t diffDesc, const cudnnTensorDescriptor_t diffDesc,
const void *diffData, const void *diffData,
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdFilterAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta, const void *beta,
const cudnnTensorDescriptor_t gradDesc, const cudnnFilterDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); return cudnnConvolutionBackwardFilter(
cudnnAccumulateResult_t r; handle,
if (*(float *)beta == 0.0) { alpha,
r = CUDNN_RESULT_NO_ACCUMULATE; srcDesc,
} else if (*(float *)beta == 1.0) { srcData,
r = CUDNN_RESULT_ACCUMULATE; diffDesc,
} else { diffData,
assert(0 && "beta must be 0.0 or 1.0"); convDesc,
} beta,
/* This function needs the casting because its params are not gradDesc,
declared as const */ gradData);
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
(cudnnTensorDescriptor_t)diffDesc,
diffData,
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
r);
} }
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't checked if this the right combination, but as we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif #endif
#endif #endif
#section support_code #section support_code
static cudnnHandle_t _handle = NULL;
static int static int
c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
cudnnDataType_t dt; cudnnDataType_t dt;
size_t ds; size_t ds;
switch (var->ga.typecode) { switch (var->ga.typecode) {
...@@ -12,26 +11,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -12,26 +11,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#if CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
#endif
default: default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d"); PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
return -1; return -1;
} }
ds = gpuarray_get_elsize(var->ga.typecode); ds = gpuarray_get_elsize(var->ga.typecode);
int str0, str1, str2, str3; int strs[5], dims[5], default_stride = 1;
// cudnn do not like 0s in strides unsigned int nd = PyGpuArray_NDIM(var);
str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3]; if (nd > 5) {
str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]; PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1]; return -1;
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx( }
desc, dt,
PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1), for (unsigned int _i = nd; _i > 0; _i--) {
PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3), unsigned int i = _i - 1;
str0, str1, str2, str3); strs[i] = PyGpuArray_STRIDE(var, i) ?
PyGpuArray_STRIDE(var, i)/ds : default_stride;
default_stride *= PyGpuArray_DIM(var, i);
dims[i] = PyGpuArray_DIM(var, i);
}
cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s", "Could not set tensorNd descriptor: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return -1; return -1;
} }
...@@ -53,14 +63,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -53,14 +63,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#if CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
#endif
default: default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter"); PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
return -1; return -1;
} }
cudnnStatus_t err = cudnnSetFilter4dDescriptor(
desc, dt, int dims[5];
PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1], unsigned int nd = PyGpuArray_NDIM(var);
PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
if (nd > 5) {
PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
return -1;
}
for (unsigned int _i = nd; _i > 0; _i--) {
unsigned int i = _i - 1;
dims[i] = PyGpuArray_DIM(var, i);
}
cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s.", "Could not set filter descriptor: %s.",
...@@ -72,15 +98,23 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -72,15 +98,23 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
#section init_code #section init_code
setup_ext_cuda();
#section support_code_struct
cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct
{ {
cuda_enter(pygpu_default_context()->ctx);
cudnnStatus_t err; cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { APPLY_SPECIFIC(_handle) = NULL;
if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s", PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
#if PY_MAJOR_VERSION >= 3 cuda_exit(pygpu_default_context()->ctx);
return NULL; FAIL;
#else
return;
#endif
} }
cuda_exit(pygpu_default_context()->ctx);
} }
...@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL; ...@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL; APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL; APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
......
...@@ -10,14 +10,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -10,14 +10,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size"); "images and kernel must have the same stack size");
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -28,6 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -28,6 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -42,56 +44,179 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -42,56 +44,179 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
Py_INCREF(*output); Py_INCREF(*output);
#else #else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, om->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*output, om)) if (beta != 0.0 && pygpu_move(*output, om))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
/* Static variables are only initialized once so this will not
* reset the previous algo every time */
static int reuse_algo = 0;
static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_kern_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU: %s\n",
cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
}
#endif
#if CUDNN_VERSION < 3000
/* cuDNN before v3 does not support kernels larger than input even
* if appropriate padding is selected. */
for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
PyErr_SetString(PyExc_RuntimeError, "the current version "
"of CuDNN does not support kernels larger than the "
"inputs in any spatial dimension, even if the inputs "
"are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue.");
cuda_exit(c->ctx);
return 1;
}
}
#endif
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c; err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
CONV_ALGO, algo,
&worksize); &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s", "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
* to place a nice get_work_mem() function in. * to place a nice get_work_mem() function in.
*/ */
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory"); "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
_handle, APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, CONV_ALGO, desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
...@@ -99,9 +224,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -99,9 +224,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
} }
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
...@@ -9,14 +9,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -9,14 +9,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"GpuDnnConv images and kernel must have the same stack size"); "stack size");
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -41,26 +43,156 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -41,26 +43,156 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, im->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*input, im)) if (beta != 0.0 && pygpu_move(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardData( cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
_handle,
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_kern_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 0) > 1024 || PyGpuArray_DIM(*input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (worksize != 0) {
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
}
err = cudnnConvolutionBackwardData_v3(
APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (worksize != 0)
c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
#section support_code_struct #section support_code_struct
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -16,9 +17,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -16,9 +17,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
switch (input->ga.typecode) { switch (input->ga.typecode) {
...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -41,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -41,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else #else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, km->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*kerns, km)) if (beta != 0.0 && pygpu_move(*kerns, km))
return 1; return 1;
...@@ -51,16 +52,148 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -51,16 +52,148 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardFilter( cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
_handle,
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#ifdef CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(*kerns, 0) == 1 && PyGpuArray_DIM(*kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (worksize != 0) {
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
}
err = cudnnConvolutionBackwardFilter_v3(
APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (worksize != 0)
c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
#section support_code_struct
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **out) {
cudnnStatus_t err;
size_t dims[5];
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
return 1;
}
if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
return 1;
cudnnPoolingMode_t mode;
int w[3];
int p[3];
int s[3];
int ndims;
err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error doing cudnnGetPoolingDescriptor operation: %s",
cudnnGetErrorString(err));
return 1;
}
dims[0] = PyGpuArray_DIM(img, 0);
dims[1] = PyGpuArray_DIM(img, 1);
dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
if (ndims == 3)
dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;
if (theano_prep_output(out, ndims+2, dims, img->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
{
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (img->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling");
return 1;
}
cuda_enter(c->ctx);
err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), desc,
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
cuda_exit(c->ctx);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
#section init_code_struct
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;
{
cudnnStatus_t err;
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (input): %s",
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (input_grad): %s",
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (output): %s",
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (output_grad): %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
#section support_code_struct
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyGpuArrayObject *out,
PyGpuArrayObject *out_grad,
cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **inp_grad) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
return 1;
}
if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous output gradients are supported.");
return 1;
}
if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
return 1;
}
if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
return 1;
if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
return 1;
if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
return 1;
if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
PyGpuArray_DIMS(inp), inp->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
return 1;
}
if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
return 1;
{
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (inp->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling gradient");
return 1;
}
cuda_enter(c->ctx);
err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), desc,
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
beta,
APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
);
cuda_exit(c->ctx);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
#section init_code_struct
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
#section support_code_struct
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
return 1;
if (theano_prep_output(out, PyGpuArray_NDIM(x),
PyGpuArray_DIMS(x), x->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
{
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (x->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax");
return 1;
}
cuda_enter(c->ctx);
err = cudnnSoftmaxForward(
APPLY_SPECIFIC(_handle),
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
APPLY_SPECIFIC(input),
PyGpuArray_DEV_DATA(x),
beta,
APPLY_SPECIFIC(output),
PyGpuArray_DEV_DATA(*out)
);
cuda_exit(c->ctx);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dx);
#section init_code_struct
APPLY_SPECIFIC(dy) = NULL;
APPLY_SPECIFIC(sm) = NULL;
APPLY_SPECIFIC(dx) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dx));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(dy) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
if (APPLY_SPECIFIC(sm) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
if (APPLY_SPECIFIC(dx) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dx));
#section support_code_struct
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
return 1;
if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
return 1;
if (theano_prep_output(dx, PyGpuArray_NDIM(dy),
PyGpuArray_DIMS(dy), dy->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
if (c_set_tensorNd(*dx, APPLY_SPECIFIC(dx)) != 0)
return 1;
{
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (sm->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax gradient");
return 1;
}
cuda_enter(c->ctx);
err = cudnnSoftmaxBackward(
APPLY_SPECIFIC(_handle),
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
APPLY_SPECIFIC(sm),
PyGpuArray_DEV_DATA(sm),
APPLY_SPECIFIC(dy),
PyGpuArray_DEV_DATA(dy),
beta,
APPLY_SPECIFIC(dx),
PyGpuArray_DEV_DATA(*dx)
);
cuda_exit(c->ctx);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
...@@ -326,7 +326,6 @@ class test_SoftMax(unittest.TestCase): ...@@ -326,7 +326,6 @@ class test_SoftMax(unittest.TestCase):
return f, f_gpu return f, f_gpu
def _cmp(self, n, m, f, f_gpu): def _cmp(self, n, m, f, f_gpu):
# print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
...@@ -349,8 +348,6 @@ class test_SoftMax(unittest.TestCase): ...@@ -349,8 +348,6 @@ class test_SoftMax(unittest.TestCase):
self._cmp self._cmp
) )
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu) self._cmp(2 << 15, 5, f, f_gpu)
def test_softmax_shape_0(self): def test_softmax_shape_0(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论