Commit f3900a58, authored by affanv14

modify cudnn implementation to support grouped convolutions

Parent commit: 19f14deb
......@@ -508,9 +508,10 @@ class GpuDnnConv(DnnBase):
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
handle=handle_type,
num_groups=int_t)
def __init__(self, algo=None, inplace=False):
def __init__(self, algo=None, inplace=False, num_groups=1):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
......@@ -530,6 +531,7 @@ class GpuDnnConv(DnnBase):
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
self.num_groups = num_groups
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -642,9 +644,10 @@ class GpuDnnConvGradW(DnnBase):
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
handle=handle_type,
num_groups=int_t)
def __init__(self, inplace=False, algo=None):
def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)")
self.inplace = bool(inplace)
......@@ -662,6 +665,7 @@ class GpuDnnConvGradW(DnnBase):
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
self.num_groups = num_groups
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -771,9 +775,10 @@ class GpuDnnConvGradI(DnnBase):
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t,
handle=handle_type)
handle=handle_type,
num_groups=int_t)
def __init__(self, inplace=False, algo=None):
def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)")
self.inplace = bool(inplace)
......@@ -791,6 +796,7 @@ class GpuDnnConvGradI(DnnBase):
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
self.num_groups = num_groups
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -854,7 +860,7 @@ class GpuDnnConvGradI(DnnBase):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None,
num_groups=1, conv_mode='conv', direction_hint=None, workmem=None,
algo=None, precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -973,7 +979,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
filter_dilation=dilation)
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
return GpuDnnConv(algo=algo, num_groups=num_groups)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
......@@ -1097,7 +1103,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None):
subsample=(1, 1), dilation=(1, 1), num_groups=1, conv_mode='conv', precision=None):
"""
TODO: document this
"""
......@@ -1112,7 +1118,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc)
return GpuDnnConvGradW(num_groups=num_groups)(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
......@@ -1125,7 +1131,7 @@ def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None):
subsample=(1, 1), dilation=(1, 1), num_groups=1, conv_mode='conv', precision=None):
"""
TODO: document this
"""
......@@ -1140,7 +1146,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape)
out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc)
return GpuDnnConvGradI(num_groups=num_groups)(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
......@@ -2665,6 +2671,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode,
subsample=op.subsample,
dilation=op.filter_dilation,
num_groups=op.num_groups,
direction_hint='forward!',
conv_mode=conv_mode)
elif isinstance(op, AbstractConv2d_gradWeights):
......@@ -2674,6 +2681,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode,
subsample=op.subsample,
dilation=op.filter_dilation,
num_groups=op.num_groups,
conv_mode=conv_mode)
elif isinstance(op, AbstractConv2d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1],
......@@ -2682,6 +2690,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode,
subsample=op.subsample,
dilation=op.filter_dilation,
num_groups=op.num_groups,
conv_mode=conv_mode)
return [rval]
......@@ -2767,17 +2776,17 @@ def local_abstractconv_gi_cudnn(node):
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    """Rebuild a GpuDnnConv op with inplace=True.

    Preserves the node's algo and num_groups so grouped convolutions
    keep their group count after the inplace optimization is applied.
    """
    # NOTE: the extraction of this diff duplicated the return line
    # (old and new versions interleaved); only the num_groups-aware
    # form is kept here.
    return [GpuDnnConv(algo=node.op.algo, inplace=True,
                       num_groups=node.op.num_groups)(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Rebuild a GpuDnnConvGradW op with inplace=True.

    Preserves the node's algo and num_groups so the gradient-of-weights
    op stays consistent with a grouped forward convolution.
    """
    # NOTE: the extraction of this diff duplicated the return line
    # (old and new versions interleaved); only the num_groups-aware
    # form is kept here.
    return [GpuDnnConvGradW(algo=node.op.algo, inplace=True,
                            num_groups=node.op.num_groups)(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Rebuild a GpuDnnConvGradI op with inplace=True.

    Preserves the node's algo and num_groups so the gradient-of-inputs
    op stays consistent with a grouped forward convolution.
    """
    # NOTE: the extraction of this diff duplicated the return line
    # (old and new versions interleaved); only the num_groups-aware
    # form is kept here.
    return [GpuDnnConvGradI(algo=node.op.algo, inplace=True,
                            num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
......
......@@ -54,6 +54,49 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
return 0;
}
/*
 * Fill a cuDNN tensor descriptor for one group of a grouped convolution.
 *
 * The descriptor gets the dimensions of `var` with the channel dimension
 * (dims[1]) divided by `groups`, while the strides remain those of the
 * full tensor: each group's slice is then selected by a per-group pointer
 * offset at the cudnnConvolution* call site.
 *
 * Assumes var has at least 2 dimensions (conv tensors are >= 3d) — the
 * channel-dimension division below reads dims[1].
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 */
static int
c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, int groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    /* Fixed: message previously named c_set_tensorNd (copy-paste from the
       function this was derived from), which misattributed the error. */
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor_for_conv");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  /* cuDNN Nd descriptors support at most 8 dimensions; guard the
     fixed-size local arrays against overflow. */
  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError,
                    "Tensor of more than 8 dimensions in c_set_tensor_for_conv");
    return -1;
  }

  /* Convert byte strides to element strides, walking from the innermost
     dimension outwards.  Broadcastable (size-1) or zero-stride dimensions
     get the accumulated contiguous stride so cuDNN accepts the
     descriptor. */
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
  }
  /* Each group sees only its share of the channels. */
  dims[1] = dims[1] / groups;

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(desc);
......@@ -71,7 +114,7 @@ static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc)
}
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc, int groups) {
cudnnDataType_t dt;
cudnnStatus_t err;
......@@ -111,6 +154,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
/* Filters can't be less than 3d so we pad */
for (unsigned int i = nd; i < 3; i++)
dims[i] = 1;
dims[0]/=groups;
if (nd < 3)
nd = 3;
......@@ -135,7 +179,7 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
cudnnGetErrorString(err));
return -1;
}
if (c_set_filter(var, *desc) != 0) {
if (c_set_filter(var, *desc, 1) != 0) {
cudnnDestroyFilterDescriptor(*desc);
return -1;
}
......
......@@ -29,7 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
"images and kernel must have the same stack size");
return 1;
......@@ -72,12 +72,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
......@@ -264,15 +267,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) {
err = cudnnConvolutionForward(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output) + output_offset * g);
}
if (worksize != 0)
gpudata_release(workspace);
......
......@@ -32,7 +32,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"stack size");
return 1;
......@@ -78,12 +78,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0;
}
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1;
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1;
size_t input_offset = PyGpuArray_STRIDE(*input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
......@@ -100,7 +103,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
if (PyGpuArray_NDIM(im) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
......@@ -294,14 +297,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++)
{
err = cudnnConvolutionBackwardData(
params->handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input) + input_offset * g);
}
if (worksize != 0)
gpudata_release(workspace);
......
......@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
......@@ -71,13 +71,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx);
......@@ -93,7 +97,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
if (PyGpuArray_NDIM(input) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld"
......@@ -264,14 +268,18 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++)
{
err = cudnnConvolutionBackwardFilter(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g ,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns) + kern_offset * g);
}
if (worksize != 0)
gpudata_release(workspace);
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment