Commit 4b94a811 authored by Arnaud Bergeron

Create the cudnn handle on context initialization and pass it around.

Parent 3f536fb7
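For orientation, the change works like this: init_dev() now creates one cuDNN handle per GPU context (via the new dnn._make_handle()) and stores it in that context's property dictionary, and the cuDNN ops later fetch the shared handle from those properties instead of each op creating and destroying its own handle in its C struct code. A minimal sketch of that flow, assuming a plain dict as a stand-in for the per-context property store and a make_handle callback standing in for dnn._make_handle(context); only the property names ('dev', 'cudnn_version', 'cudnn_handle') and the overall shape mirror the diff below:

# Illustrative sketch only -- not the real Theano code, just the flow this commit sets up.
_context_props = {}  # stand-in for the per-context property store behind _get_props()

def _get_props(name):
    return _context_props.setdefault(name, {})

def init_dev(dev, name, make_handle):
    # make_handle stands in for dnn._make_handle(context).
    ctx_props = _get_props(name)
    ctx_props['dev'] = dev
    if dev.startswith('cuda') and 'cudnn_handle' not in ctx_props:
        # One cuDNN handle per context, created once at initialization.
        ctx_props['cudnn_handle'] = make_handle()

def get_params(name):
    # What each cuDNN op now does: reuse the context's cached handle.
    return _get_props(name)['cudnn_handle']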
@@ -26,7 +26,7 @@ except ImportError:
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                    GpuArraySharedVariable, gpuarray_shared_constructor,
-                   reg_context, get_context, ContextNotDefined)
+                   reg_context, get_context, ContextNotDefined, _get_props)
 from .basic_ops import as_gpuarray_variable
 from . import fft, dnn, opt, nerv, extra_ops, multinomial
@@ -89,19 +89,24 @@ def init_dev(dev, name=None):
               (name, dev, context.devname),
               file=sys.stderr)
     pygpu_activated = True
+    ctx_props = _get_props(name)
+    ctx_props['dev'] = dev
     if dev.startswith('cuda'):
-        try:
-            cudnn_version = dnn.version()
-            # 5200 should not print warning with cudnn 5.1 final.
-            if cudnn_version >= 5200:
-                warnings.warn("Your cuDNN version is more recent than Theano."
-                              " If you see problems, try updating Theano or"
-                              " downgrading cuDNN to version 5.1.")
-            if config.print_active_device:
-                print("Using cuDNN version %d on context %s" %
-                      (cudnn_version, name), file=sys.stderr)
-        except Exception:
-            pass
+        if 'cudnn_version' not in ctx_props:
+            try:
+                ctx_props['cudnn_version'] = dnn.version()
+                # 5200 should not print warning with cudnn 5.1 final.
+                if ctx_props['cudnn_version'] >= 5200:
+                    warnings.warn("Your cuDNN version is more recent than "
+                                  "Theano. If you encounter problems, try "
+                                  "updating Theano or downgrading cuDNN to "
+                                  "version 5.1.")
+                if config.print_active_device:
+                    print("Using cuDNN version %d on context %s" %
+                          (ctx_props['cudnn_version'], name), file=sys.stderr)
+                ctx_props['cudnn_handle'] = dnn._make_handle(context)
+            except Exception:
+                pass
 
 # This maps things like 'cuda0' to the context object on that device.
 init_dev.devmap = {}
...
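Other code can read these context properties back later. A hedged sketch of such a lookup: get_prop is the accessor imported from the same .type module in the next file's diff, and KeyError is the exception it raises for a missing key there; the helper function itself and the absolute import path are hypothetical.

# Hypothetical helper, for illustration only (import path assumed).
from theano.gpuarray.type import get_prop

def context_cudnn_version(context_name):
    try:
        return get_prop(context_name, 'cudnn_version')
    except KeyError:
        return None  # cuDNN was not detected for this context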
@@ -26,7 +26,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
 from theano.tensor.signal.pool import (
     Pool, MaxPoolGrad, AveragePoolGrad)
 from . import pygpu
-from .type import get_context, gpu_context_type, list_contexts
+from .type import (get_context, gpu_context_type, list_contexts,
+                   get_prop, set_prop)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                         gpu_contiguous, gpu_alloc_empty,
                         empty_like, GpuArrayType)
@@ -42,6 +43,44 @@ from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, u
 from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
 
+try:
+    from pygpu import gpuarray
+except ImportError:
+    pass
+
+
+def _dnn_lib():
+    if _dnn_lib.handle is None:
+        import ctypes.util
+
+        lib_name = ctypes.util.find_library('cudnn')
+        if lib_name is None and sys.platform == 'win32':
+            # Update these names when new versions of cudnn are supported.
+            lib_name = ctypes.util.find_library('cudnn64_5.dll')
+            if lib_name is None:
+                lib_name = ctypes.util.find_library('cudnn64_4.dll')
+        if lib_name is None:
+            raise RuntimeError('Could not find cudnn library')
+        _dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
+        cudnn = _dnn_lib.handle
+        cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
+        cudnn.cudnnCreate.restype = ctypes.c_int
+        cudnn.cudnnDestroy.argtypes = [ctypes.c_void_p]
+        cudnn.cudnnDestroy.restype = ctypes.c_int
+    return _dnn_lib.handle
+
+_dnn_lib.handle = None
+
+
+def _make_handle(ctx):
+    cudnn = _dnn_lib()
+    handle = ctypes.c_void_p()
+    with ctx:
+        err = cudnn.cudnnCreate(ctypes.byref(handle))
+        if err != 0:
+            raise RuntimeError("error creating cudnn handle")
+    return handle
+
+
 def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"):
     raise RuntimeError(msg)
@@ -144,6 +183,12 @@ def dnn_available(context_name):
 dnn_available.msg = None
 
+handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy',
+                        headers=['cudnn.h'],
+                        header_dirs=[config.dnn.include_path],
+                        libraries=['cudnn'],
+                        lib_dirs=[config.dnn.library_path])
+
 
 class DnnBase(COp):
@@ -154,10 +199,20 @@ class DnnBase(COp):
     # dnn does not know about broadcasting, so we do not need to assert
     # the input broadcasting pattern.
     check_broadcast = False
-    params_type = gpu_context_type
+    params_type = handle_type
+
+    def dnn_context(self, node):
+        return node.outputs[0].type.context_name
 
     def get_params(self, node):
-        return node.outputs[0].type.context
+        try:
+            return get_prop(self.dnn_context(node), 'cudnn_handle_param')
+        except KeyError:
+            pass
+        ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value
+        res = handle_type.make_value(ptr)
+        set_prop(self.dnn_context(node), 'cudnn_handle_param', res)
+        return res
 
     def __init__(self, files=None, c_func=None):
         if files is None:
@@ -165,9 +220,10 @@ class DnnBase(COp):
         COp.__init__(self, ["dnn_base.c"] + files, c_func)
 
     def c_headers(self):
-        return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
-                'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
-                'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h']
+        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h',
+                'gpuarray/util.h', 'gpuarray/ext_cuda.h', 'gpuarray_api.h',
+                'numpy_compat.h', 'cudnn.h', 'cudnn_helper.h',
+                'gpuarray_helper.h']
 
     def c_header_dirs(self):
         return [os.path.dirname(__file__), pygpu.get_include(),
@@ -183,7 +239,7 @@ class DnnBase(COp):
         return ['-Wl,-rpath,' + config.dnn.library_path]
 
     def c_code_cache_version(self):
-        return (super(DnnBase, self).c_code_cache_version(), version())
+        return (super(DnnBase, self).c_code_cache_version(), version(), 1)
 
 
 class DnnVersion(Op):
...
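One detail worth noting in DnnBase.get_params() above: the raw ctypes handle stored by init_dev ('cudnn_handle') is wrapped once per context into a handle_type value ('cudnn_handle_param') and then reused, so every apply of every cuDNN op on a given context shares the same params object. A hedged sketch of that caching pattern, with stand-in names rather than Theano APIs:

# Illustrative caching pattern only; `wrap` stands in for handle_type.make_value()
# and the dict stands in for the per-context property store (get_prop/set_prop).
_wrapped = {}

def shared_handle(context_name, raw_ptr, wrap):
    # Wrap the raw cudnnHandle_t pointer once and cache it for this context,
    # mirroring the 'cudnn_handle' -> 'cudnn_handle_param' step in get_params().
    if context_name not in _wrapped:
        _wrapped[context_name] = wrap(raw_ptr)
    return _wrapped[context_name]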
@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
 #section init_code
 
 setup_ext_cuda();
-
-#section support_code_struct
-
-PyGpuContextObject *ctx;
-cudnnHandle_t APPLY_SPECIFIC(_handle);
-
-#section init_code_struct
-
-{
-  // We need to keep a reference here to have it available in the destructor.
-  ctx = PARAMS;
-  Py_INCREF(ctx);
-
-  cuda_enter(PARAMS->ctx);
-
-  cudnnStatus_t err;
-  APPLY_SPECIFIC(_handle) = NULL;
-
-  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(PARAMS->ctx);
-    FAIL;
-  }
-
-  if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
-                            cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(PARAMS->ctx);
-    FAIL;
-  }
-  cuda_exit(PARAMS->ctx);
-}
-
-#section cleanup_code_struct
-
-cuda_enter(ctx->ctx);
-cudnnDestroy(APPLY_SPECIFIC(_handle));
-cuda_exit(ctx->ctx);
-Py_DECREF((PyObject *)ctx);
@@ -3,7 +3,9 @@
 int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                      PyGpuArrayObject *bias, npy_float64 epsilon,
                      PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
-                     PyGpuArrayObject **x_invstd, PyGpuContextObject *c) {
+                     PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
+
   if (c_set_tensorNd(inp, bn_input) != 0)
     return 1;
   if (c_set_tensorNd(scale, bn_params) != 0)
@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
     beta = (void *)&fbeta;
   }
   cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     MODE,
     alpha,
     beta,
...
@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
                        PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
                        PyGpuArrayObject *x_invstd, npy_float64 epsilon,
                        PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
-                       PyGpuArrayObject **dbias, PyGpuContextObject *c) {
+                       PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
+
   if (c_set_tensorNd(inp, bn_input) != 0)
     return 1;
   if (c_set_tensorNd(doutp, bn_doutput) != 0)
@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
     betaParam = (void *)&fbeta;
   }
   cudnnStatus_t err = cudnnBatchNormalizationBackward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     MODE,
     alphaData,
     betaData,
...
@@ -3,7 +3,9 @@
 int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                      PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
                      PyGpuArrayObject *est_var, npy_float64 epsilon,
-                     PyGpuArrayObject **outp, PyGpuContextObject *c) {
+                     PyGpuArrayObject **outp, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
+
   if (c_set_tensorNd(inp, bn_input) != 0)
     return 1;
   if (c_set_tensorNd(scale, bn_params) != 0)
@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
     beta = (void *)&fbeta;
   }
   cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     MODE,
     alpha,
     beta,
...
@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                          cudnnConvolutionDescriptor_t desc,
                          double alpha, double beta,
                          PyGpuArrayObject **output,
-                         PyGpuContextObject *c) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-  float af = alpha, bf = beta;
+                         cudnnHandle_t _handle) {
+  PyGpuContextObject *c = input->context;
   void *alpha_p;
   void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
 
   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
     PyErr_SetString(PyExc_ValueError,
@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     int count;
     cudnnConvolutionFwdAlgoPerf_t choice;
     err = cudnnFindConvolutionForwardAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
       desc, APPLY_SPECIFIC(output), 1, &count, &choice);
 
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     }
     err = cudnnGetConvolutionForwardAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
       desc, APPLY_SPECIFIC(output),
       CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
   {
     size_t worksize;
     gpudata *workspace;
-    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
+    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                   APPLY_SPECIFIC(input),
                                                   APPLY_SPECIFIC(kerns),
                                                   desc,
@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
       // TODO: Print a warning
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-      err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
+      err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                     APPLY_SPECIFIC(input),
                                                     APPLY_SPECIFIC(kerns),
                                                     desc,
@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
   cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnConvolutionForward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     alpha_p,
     APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
     APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
...
@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                         PyGpuArrayObject *im,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta, PyGpuArrayObject **input,
-                        PyGpuContextObject *c) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-  float af = alpha, bf = beta;
+                        cudnnHandle_t _handle) {
+  PyGpuContextObject *c = kerns->context;
   void *alpha_p;
   void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
 
   if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
     PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
     cudnnConvolutionBwdDataAlgoPerf_t choice;
     err = cudnnFindConvolutionBackwardDataAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
       APPLY_SPECIFIC(input), 1, &count, &choice);
 
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
     }
     err = cudnnGetConvolutionBackwardDataAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
+      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
       desc, APPLY_SPECIFIC(input),
       CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
   gpudata *workspace;
   err = cudnnGetConvolutionBackwardDataWorkspaceSize(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+    _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
     APPLY_SPECIFIC(input), algo, &worksize);
   if (err != CUDNN_STATUS_SUCCESS) {
@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
   cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnConvolutionBackwardData(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     alpha_p,
     APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
     APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
...
@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                         PyGpuArrayObject *km,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta, PyGpuArrayObject **kerns,
-                        PyGpuContextObject *c) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-  float af = alpha, bf = beta;
+                        cudnnHandle_t _handle) {
+  PyGpuContextObject *c = input->context;
   void *alpha_p;
   void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
 
   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
     PyErr_SetString(PyExc_ValueError,
@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
     cudnnConvolutionBwdFilterAlgoPerf_t choice;
     err = cudnnFindConvolutionBackwardFilterAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), 1, &count, &choice);
 
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
     }
     err = cudnnGetConvolutionBackwardFilterAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
       desc, APPLY_SPECIFIC(kerns),
       CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
   gpudata *workspace;
   err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+    _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
     APPLY_SPECIFIC(kerns), algo, &worksize);
   if (err != CUDNN_STATUS_SUCCESS) {
@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
   cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnConvolutionBackwardFilter(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     alpha_p,
     APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
     APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
...
@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                              PyArrayObject *stride,
                              PyArrayObject *pad,
                              PyGpuArrayObject **out,
-                             PyGpuContextObject *c) {
-  cudnnStatus_t err;
+                             cudnnHandle_t _handle) {
+  PyGpuContextObject *c = img->context;
   size_t dims[5];
+  cudnnStatus_t err;
 
   if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
     PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
   cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnPoolingForward(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
+    _handle, APPLY_SPECIFIC(pool),
     alpha,
     APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
     beta,
...
@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                   PyArrayObject *stride,
                                   PyArrayObject *pad,
                                   PyGpuArrayObject **inp_grad,
-                                  PyGpuContextObject *c) {
+                                  cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
   cudnnStatus_t err;
 
   if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
   cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnPoolingBackward(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
+    _handle, APPLY_SPECIFIC(pool),
     alpha,
     APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
     APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
...
@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL)
 int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
                             PyGpuArrayObject **out,
-                            PyGpuContextObject *c) {
+                            cudnnHandle_t _handle) {
+  PyGpuContextObject *c = x->context;
   cudnnStatus_t err;
 
   if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
   cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnSoftmaxForward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     SOFTMAX_ALGO,
     SOFTMAX_MODE,
     alpha,
...
@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL)
 int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                  PyGpuArrayObject *sm,
                                  PyGpuArrayObject **dx,
-                                 PyGpuContextObject *c) {
+                                 cudnnHandle_t _handle) {
+  PyGpuContextObject *c = dy->context;
   cudnnStatus_t err;
 
   if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
   cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnSoftmaxBackward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     SOFTMAX_ALGO,
     SOFTMAX_MODE,
     alpha,
...