提交 9147b12e authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2542 from abergeron/cudnn_r2_alg

Enable algorithm selection in cudnn convolution
...@@ -484,6 +484,25 @@ import theano and print the config variable, as in: ...@@ -484,6 +484,25 @@ import theano and print the config variable, as in:
A directory with bin/, lib/, include/ folders containing cuda utilities. A directory with bin/, lib/, include/ folders containing cuda utilities.
.. attribute:: config.dnn.conv.workmem
String value: 'none', 'small', 'large'
Default: 'small'
The default value for the amount of working memory that is
tolerated in the convolution implementation in cudnn.
'none'
Don't allow any extra memory.
'small'
Allow extra memory that is much smaller than the input sizes.
'large'
Allow extra memory that is on the order of the input sizes.
.. attribute:: config.gcc.cxxflags .. attribute:: config.gcc.cxxflags
Default: "" Default: ""
......
...@@ -1146,10 +1146,27 @@ class COp(Op): ...@@ -1146,10 +1146,27 @@ class COp(Op):
raise ValueError("No valid section marker was found in file " raise ValueError("No valid section marker was found in file "
"%s" % self.func_files[i]) "%s" % self.func_files[i])
def get_op_params(self):
    """
    Returns a list of (name, value) pairs that will be turned into
    macros for use within the op code. This is intended to allow
    an op's properties to influence the generated C code.

    The names must be strings that are not a C keyword and the
    values must be strings of literal C representations.
    """
    # Base implementation contributes no macros; subclasses override
    # this to inject op-specific #defines into their C sources.
    return []
def c_code_cache_version(self): def c_code_cache_version(self):
return hash(tuple(self.func_codes)) return hash(tuple(self.func_codes))
c_init_code = simple_meth('init_code') def c_init_code(self):
if 'init_code' in self.code_sections:
return [self.code_sections['init_code']]
else:
raise utils.MethodNotDefined(
'c_init_code', type(self), type(self).__name__)
c_init_code_apply = apply_meth('init_code_apply') c_init_code_apply = apply_meth('init_code_apply')
c_support_code = simple_meth('support_code') c_support_code = simple_meth('support_code')
c_support_code_apply = apply_meth('support_code_apply') c_support_code_apply = apply_meth('support_code_apply')
...@@ -1208,6 +1225,10 @@ class COp(Op): ...@@ -1208,6 +1225,10 @@ class COp(Op):
"str##_%s" % name)) "str##_%s" % name))
undef_macros.append(undef_template % "APPLY_SPECIFIC") undef_macros.append(undef_template % "APPLY_SPECIFIC")
for n, v in self.get_op_params():
define_macros.append(define_template % (n, v))
undef_macros.append(undef_template % (n,))
return os.linesep.join(define_macros), os.linesep.join(undef_macros) return os.linesep.join(define_macros), os.linesep.join(undef_macros)
def _lquote_macro(self, txt): def _lquote_macro(self, txt):
......
...@@ -298,6 +298,25 @@ outstanding_mallocs(PyObject* self, PyObject * args) ...@@ -298,6 +298,25 @@ outstanding_mallocs(PyObject* self, PyObject * args)
return PyInt_FromLong(_outstanding_mallocs[0]); return PyInt_FromLong(_outstanding_mallocs[0]);
} }
/* Single cached scratch buffer shared by ops that need temporary
 * device memory.  Only one chunk may be outstanding at a time. */
static void *work_mem = NULL;
static size_t work_size = 0;

/*
 * Returns a chunk of memory for temporary work inside of an op. You can only
 * request a single chunk of memory at a time since it is reused.
 *
 * Returns NULL on allocation failure (device_malloc sets the Python
 * error message in that case).
 */
void *get_work_mem(size_t sz) {
  /* `<=` (not `<`): a request of exactly the cached size must reuse the
   * buffer instead of freeing and reallocating it on every call. */
  if (sz <= work_size)
    return work_mem;
  /* Skip freeing before anything was ever allocated. */
  if (work_mem != NULL)
    device_free(work_mem);
  work_mem = device_malloc(sz);
  work_size = sz;
  if (work_mem == NULL)
    work_size = 0;  /* keep the cached size consistent after failure */
  return work_mem;
}
///////////////////////// /////////////////////////
// Static helper methods // Static helper methods
///////////////////////// /////////////////////////
......
...@@ -88,7 +88,8 @@ typedef float real; ...@@ -88,7 +88,8 @@ typedef float real;
extern DllExport cublasHandle_t handle; extern DllExport cublasHandle_t handle;
/** /**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage. * Allocation and freeing of device memory should go through these functions so
* that the lib can track memory usage.
* *
* device_malloc will set the Python error message before returning None. * device_malloc will set the Python error message before returning None.
* device_free will return nonzero on failure (after setting the python error message) * device_free will return nonzero on failure (after setting the python error message)
...@@ -98,6 +99,7 @@ extern DllExport cublasHandle_t handle; ...@@ -98,6 +99,7 @@ extern DllExport cublasHandle_t handle;
DllExport void * device_malloc(size_t size); DllExport void * device_malloc(size_t size);
DllExport void * device_malloc(size_t size, int verbose); DllExport void * device_malloc(size_t size, int verbose);
DllExport int device_free(void * ptr); DllExport int device_free(void * ptr);
DllExport void *get_work_mem(size_t sz);
template <typename T> template <typename T>
static T ceil_intdiv(T a, T b) static T ceil_intdiv(T a, T b)
......
...@@ -73,6 +73,20 @@ cudnnGetConvolutionForwardAlgorithm( ...@@ -73,6 +73,20 @@ cudnnGetConvolutionForwardAlgorithm(
return CUDNN_STATUS_SUCCESS; return CUDNN_STATUS_SUCCESS;
} }
/*
 * NOTE(review): appears to be a compatibility shim for cuDNN releases
 * that lack cudnnGetConvolutionForwardWorkspaceSize — it reports that
 * no workspace is needed and always succeeds, so callers can invoke it
 * unconditionally.  Confirm against the cuDNN version this header
 * targets.
 */
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
    cudnnHandle_t handle,
    const cudnnTensorDescriptor_t srcDesc,
    const cudnnFilterDescriptor_t filterDesc,
    const cudnnConvolutionDescriptor_t convDesc,
    const cudnnTensor4dDescriptor_t destDesc,
    cudnnConvolutionFwdAlgo_t algo,
    size_t *sizeInBytes) {
  *sizeInBytes = 0;   /* no scratch space required */
  return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t static inline cudnnStatus_t
cudnnConvolutionForward_v2( cudnnConvolutionForward_v2(
cudnnHandle_t handle, cudnnHandle_t handle,
......
import os import os
import theano import theano
from theano import Apply, gof, tensor from theano import Apply, gof, tensor, config
from theano.scalar import as_scalar from theano.scalar import as_scalar
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compat import PY3 from theano.compat import PY3
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.basic import ShapeError from theano.tensor.basic import ShapeError
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -133,20 +134,6 @@ class DnnBase(GpuOp, COp): ...@@ -133,20 +134,6 @@ class DnnBase(GpuOp, COp):
def c_libraries(self): def c_libraries(self):
return ['cudnn'] return ['cudnn']
def c_init_code(self):
if PY3:
error_out = "NULL"
else:
error_out = ""
return ["""{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %%s",
cudnnGetErrorString(err));
return %s;
}
}""" % (error_out,)]
class DnnVersion(GpuOp): class DnnVersion(GpuOp):
def c_compiler(self): def c_compiler(self):
...@@ -342,6 +329,11 @@ class GpuDnnConvDesc(GpuOp): ...@@ -342,6 +329,11 @@ class GpuDnnConvDesc(GpuOp):
return (2, version()) return (2, version())
# Config flag giving the default `workmem` for cuDNN convolutions.
# Allowed values: 'small' (default), 'none', 'large'.
# NOTE(review): in_c_key=False presumably keeps this flag out of the
# compilation cache key — confirm against theano.configparser docs.
AddConfigVar('dnn.conv.workmem',
             "Default value for the workmem attribute of cudnn convolutions.",
             EnumStr('small', 'none', 'large'),
             in_c_key=False)
class GpuDnnConv(DnnBase, COp): class GpuDnnConv(DnnBase, COp):
""" """
The forward convolution. The forward convolution.
...@@ -349,13 +341,36 @@ class GpuDnnConv(DnnBase, COp): ...@@ -349,13 +341,36 @@ class GpuDnnConv(DnnBase, COp):
:param image: :param image:
:param kernel: :param kernel:
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
__props__ = () __props__ = ('workmem',)
def __init__(self): def __init__(self, workmem=None):
"""
:param workmem: either 'none', 'small' or 'large'. Default is
the value of :attr:`config.dnn.conv.workmem`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)") "APPLY_SPECIFIC(conv_fwd)")
if workmem is None:
workmem = config.dnn.conv.workmem
self.workmem = workmem
assert self.workmem in ['none', 'small', 'large']
def __setstate__(self, d):
    # Restore pickled state.  Pickles created before the 'workmem'
    # attribute existed lack it, so default to 'small' to keep old
    # graphs loadable.
    self.__dict__.update(d)
    if not hasattr(self, 'workmem'):
        self.workmem = 'small'
def get_op_params(self):
    """
    Return the (name, value) macro pairs selecting the cuDNN forward
    convolution algorithm in the generated C code.

    With an old cuDNN (version() == -1) algorithm selection is not
    available and CONV_ALGO is defined to 0.  Otherwise `self.workmem`
    is mapped to the matching CUDNN_CONVOLUTION_FWD_ALGO_* constant.

    :raises ValueError: if `self.workmem` is not one of 'none',
        'small' or 'large' (e.g. corrupted pickled state that slipped
        past the __init__ assertion).
    """
    if version() == -1:
        return [('CONV_ALGO', "0")]
    algos = {
        'none': 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM',
        'small': 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM',
        'large': 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM',
    }
    try:
        alg = algos[self.workmem]
    except KeyError:
        # Previously an invalid workmem fell through the elif chain and
        # raised an obscure NameError on `alg`; fail explicitly instead.
        raise ValueError("workmem must be one of 'none', 'small', "
                         "'large', got %r" % (self.workmem,))
    return [('CONV_ALGO', alg)]
def make_node(self, img, kern, desc): def make_node(self, img, kern, desc):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
...@@ -575,6 +590,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -575,6 +590,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
:warning: The cuDNN library only works with GPUs that have a compute :warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not capability of 3.0 or higher. This means that older GPUs will not
work with this Op. work with this Op.
:note: The working memory of the op is influenced by
:attr:`config.dnn.conv.workmem`.
""" """
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None) fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1,1) and if (border_mode == 'valid' and subsample == (1,1) and
......
...@@ -33,24 +33,27 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -33,24 +33,27 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1; return 1;
{ {
cudnnConvolutionFwdAlgo_t algo; size_t worksize;
err = cudnnGetConvolutionForwardAlgorithm( void *workspace;
_handle,
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, // TODO: add op param CONV_ALGO,
0, &worksize);
&algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: Couldn't select convolution algorithm: %s", "GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
const float alpha = 1; const float alpha = 1;
const float beta = 0; const float beta = 0;
...@@ -60,8 +63,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -60,8 +63,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input), APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
desc, desc,
algo, CONV_ALGO,
NULL, 0, workspace, worksize,
(void *)&beta, (void *)&beta,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output)); APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论