提交 f3900a58 authored 作者: affanv14's avatar affanv14

modify cudnn implementation to support grouped convolutions

上级 19f14deb
...@@ -508,9 +508,10 @@ class GpuDnnConv(DnnBase): ...@@ -508,9 +508,10 @@ class GpuDnnConv(DnnBase):
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t, params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t, choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t, inplace=bool_t,
handle=handle_type) handle=handle_type,
num_groups=int_t)
def __init__(self, algo=None, inplace=False): def __init__(self, algo=None, inplace=False, num_groups=1):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)") "APPLY_SPECIFIC(conv_fwd)")
...@@ -530,6 +531,7 @@ class GpuDnnConv(DnnBase): ...@@ -530,6 +531,7 @@ class GpuDnnConv(DnnBase):
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
self.num_groups = num_groups
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -642,9 +644,10 @@ class GpuDnnConvGradW(DnnBase): ...@@ -642,9 +644,10 @@ class GpuDnnConvGradW(DnnBase):
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t, params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t, choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t, inplace=bool_t,
handle=handle_type) handle=handle_type,
num_groups=int_t)
def __init__(self, inplace=False, algo=None): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = bool(inplace) self.inplace = bool(inplace)
...@@ -662,6 +665,7 @@ class GpuDnnConvGradW(DnnBase): ...@@ -662,6 +665,7 @@ class GpuDnnConvGradW(DnnBase):
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
self.num_groups = num_groups
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -771,9 +775,10 @@ class GpuDnnConvGradI(DnnBase): ...@@ -771,9 +775,10 @@ class GpuDnnConvGradI(DnnBase):
params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t, params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t,
choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t, choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
inplace=bool_t, inplace=bool_t,
handle=handle_type) handle=handle_type,
num_groups=int_t)
def __init__(self, inplace=False, algo=None): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = bool(inplace) self.inplace = bool(inplace)
...@@ -791,6 +796,7 @@ class GpuDnnConvGradI(DnnBase): ...@@ -791,6 +796,7 @@ class GpuDnnConvGradI(DnnBase):
self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
self.num_groups = num_groups
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -854,7 +860,7 @@ class GpuDnnConvGradI(DnnBase): ...@@ -854,7 +860,7 @@ class GpuDnnConvGradI(DnnBase):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None, num_groups=1, conv_mode='conv', direction_hint=None, workmem=None,
algo=None, precision=None): algo=None, precision=None):
""" """
GPU convolution using cuDNN from NVIDIA. GPU convolution using cuDNN from NVIDIA.
...@@ -973,7 +979,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1), ...@@ -973,7 +979,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
filter_dilation=dilation) filter_dilation=dilation)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo, num_groups=num_groups)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1), def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
...@@ -1097,7 +1103,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1 ...@@ -1097,7 +1103,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None): subsample=(1, 1), dilation=(1, 1), num_groups=1, conv_mode='conv', precision=None):
""" """
TODO: document this TODO: document this
""" """
...@@ -1112,7 +1118,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -1112,7 +1118,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns_shp) conv_mode=conv_mode, precision=precision)(kerns_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc) return GpuDnnConvGradW(num_groups=num_groups)(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
...@@ -1125,7 +1131,7 @@ def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid', ...@@ -1125,7 +1131,7 @@ def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None): subsample=(1, 1), dilation=(1, 1), num_groups=1, conv_mode='conv', precision=None):
""" """
TODO: document this TODO: document this
""" """
...@@ -1140,7 +1146,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -1140,7 +1146,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp) out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc) return GpuDnnConvGradI(num_groups=num_groups)(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid', def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
...@@ -2665,6 +2671,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): ...@@ -2665,6 +2671,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode, border_mode=op.border_mode,
subsample=op.subsample, subsample=op.subsample,
dilation=op.filter_dilation, dilation=op.filter_dilation,
num_groups=op.num_groups,
direction_hint='forward!', direction_hint='forward!',
conv_mode=conv_mode) conv_mode=conv_mode)
elif isinstance(op, AbstractConv2d_gradWeights): elif isinstance(op, AbstractConv2d_gradWeights):
...@@ -2674,6 +2681,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): ...@@ -2674,6 +2681,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode, border_mode=op.border_mode,
subsample=op.subsample, subsample=op.subsample,
dilation=op.filter_dilation, dilation=op.filter_dilation,
num_groups=op.num_groups,
conv_mode=conv_mode) conv_mode=conv_mode)
elif isinstance(op, AbstractConv2d_gradInputs): elif isinstance(op, AbstractConv2d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1], shape = (inp2.shape[0], inp1.shape[1],
...@@ -2682,6 +2690,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): ...@@ -2682,6 +2690,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode, border_mode=op.border_mode,
subsample=op.subsample, subsample=op.subsample,
dilation=op.filter_dilation, dilation=op.filter_dilation,
num_groups=op.num_groups,
conv_mode=conv_mode) conv_mode=conv_mode)
return [rval] return [rval]
...@@ -2767,17 +2776,17 @@ def local_abstractconv_gi_cudnn(node): ...@@ -2767,17 +2776,17 @@ def local_abstractconv_gi_cudnn(node):
@inplace_allocempty(GpuDnnConv, 2) @inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs): def local_dnn_conv_inplace(node, inputs):
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConv(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2) @inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs): def local_dnn_convgw_inplace(node, inputs):
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2) @inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs): def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
......
...@@ -54,6 +54,49 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -54,6 +54,49 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
return 0; return 0;
} }
static int
c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, int groups) {
  /* Fill a cuDNN NdTensor descriptor for `var`, describing only one
     group's slice for grouped convolution: the channel dimension
     (dim 1) is divided by `groups`, while strides stay those of the
     full tensor so per-group data can be addressed by pointer offset.
     Returns 0 on success, -1 on error with a Python exception set. */
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    /* Fixed: message previously named c_set_tensorNd (copy-paste). */
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor_for_conv");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  /* Guard the fixed-size stack buffers (cuDNN tensors are at most 8-d)
     and the unconditional accesses to dims[1] and `groups` below. */
  if (nd > 8 || nd < 2) {
    PyErr_Format(PyExc_ValueError,
                 "c_set_tensor_for_conv: tensor must have between 2 and 8 "
                 "dimensions, got %u", nd);
    return -1;
  }
  if (groups < 1) {
    PyErr_Format(PyExc_ValueError,
                 "c_set_tensor_for_conv: groups must be >= 1, got %d", groups);
    return -1;
  }

  /* Element (not byte) strides, computed innermost-first; size-1 or
     0-stride (broadcast) dims get the contiguous default stride. */
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }
  /* Only one group's worth of channels is visible to cuDNN. */
  dims[1] = dims[1] / groups;

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) { static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
cudnnStatus_t err; cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(desc); err = cudnnCreateTensorDescriptor(desc);
...@@ -71,7 +114,7 @@ static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) ...@@ -71,7 +114,7 @@ static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc)
} }
static int static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc, int groups) {
cudnnDataType_t dt; cudnnDataType_t dt;
cudnnStatus_t err; cudnnStatus_t err;
...@@ -111,6 +154,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -111,6 +154,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
/* Filters can't be less than 3d so we pad */ /* Filters can't be less than 3d so we pad */
for (unsigned int i = nd; i < 3; i++) for (unsigned int i = nd; i < 3; i++)
dims[i] = 1; dims[i] = 1;
dims[0]/=groups;
if (nd < 3) if (nd < 3)
nd = 3; nd = 3;
...@@ -135,7 +179,7 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) { ...@@ -135,7 +179,7 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return -1; return -1;
} }
if (c_set_filter(var, *desc) != 0) { if (c_set_filter(var, *desc, 1) != 0) {
cudnnDestroyFilterDescriptor(*desc); cudnnDestroyFilterDescriptor(*desc);
return -1; return -1;
} }
......
...@@ -29,7 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -29,7 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"images and kernel must have the same stack size"); "images and kernel must have the same stack size");
return 1; return 1;
...@@ -72,12 +72,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -72,12 +72,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 0; return 0;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1; return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
...@@ -264,15 +267,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -264,15 +267,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) {
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
params->handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
desc, algo, desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output) + output_offset * g);
}
if (worksize != 0) if (worksize != 0)
gpudata_release(workspace); gpudata_release(workspace);
......
...@@ -32,7 +32,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -32,7 +32,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"stack size"); "stack size");
return 1; return 1;
...@@ -78,12 +78,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -78,12 +78,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1; return 1;
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(*input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo; cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
...@@ -100,7 +103,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -100,7 +103,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
if (PyGpuArray_NDIM(im) == 4) { if (PyGpuArray_NDIM(im) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) || if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) || (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
...@@ -294,14 +297,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -294,14 +297,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++)
{
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
params->handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input) + input_offset * g);
}
if (worksize != 0) if (worksize != 0)
gpudata_release(workspace); gpudata_release(workspace);
......
...@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size"); "GpuDnnConv images and kernel must have the same stack size");
return 1; return 1;
...@@ -71,13 +71,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -71,13 +71,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1; return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1; return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -93,7 +97,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -93,7 +97,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
if (PyGpuArray_NDIM(input) == 4) { if (PyGpuArray_NDIM(input) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) || if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) || (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld"
...@@ -264,14 +268,18 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -264,14 +268,18 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++)
{
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
params->handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g ,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns) + kern_offset * g);
}
if (worksize != 0) if (worksize != 0)
gpudata_release(workspace); gpudata_release(workspace);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论