提交 631d7d12 作者: affanv14

Make GpuCorr3dMM support grouped convolutions

上级 2a7b2c81
......@@ -1074,7 +1074,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
_f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)):
filter_dilation=(1, 1, 1), num_groups=1):
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
......@@ -1093,6 +1093,9 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
raise ValueError("filter_dilation must have three elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
if num_groups < 1:
raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups
CGpuKernelBase.__init__(self, ['c_code/corr3d_gemm.c'])
@property
......@@ -1102,11 +1105,12 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
return (0, 0, 0)
def __str__(self):
return '%s{%s, %s, %s}' % (
return '%s{%s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation))
str(self.filter_dilation),
str(self.num_groups))
def flops(self, inp, outp):
"""
......@@ -1189,6 +1193,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
"""
dH, dW, dD = self.subsample
dilH, dilW, dilD = self.filter_dilation
numgroups = self.num_groups
if self.border_mode == "half":
padH = padW = padD = -1
elif self.border_mode == "full":
......@@ -1249,6 +1254,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
int padH = %(padH)s;
int padW = %(padW)s;
int padD = %(padD)s;
int numgroups = %(numgroups)s;
PyGpuArrayObject * bottom = %(bottom)s;
PyGpuArrayObject * weights = %(weights)s;
......@@ -1372,7 +1378,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
// output is weights: (num_filters, num_channels, height, width, depth)
// height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = PyGpuArray_DIMS(top)[1];
out_dim[1] = PyGpuArray_DIMS(bottom)[1];
out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient
out_dim[4] = kD;
......@@ -1399,7 +1405,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
// output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[1];
out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[4] = (%(depth)s != -1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
......@@ -1448,7 +1454,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
// Call GPU code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
dH, dW, dD, dilH, dilW, dilD, padH, padW, padD);
dH, dW, dD, dilH, dilW, dilD, padH, padW, padD, numgroups);
if (out2==NULL){
%(fail)s
}
......@@ -1503,9 +1509,10 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
"""
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)):
filter_dilation=(1, 1, 1),
num_groups=1):
super(GpuCorr3dMM, self).__init__(border_mode, subsample,
filter_dilation)
filter_dilation, num_groups)
def make_node(self, img, kern):
ctx_name = infer_context_name(img, kern)
......@@ -1534,11 +1541,13 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(
self.filter_dilation,
self.num_groups)(
weights, top, bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(
self.filter_dilation,
self.num_groups)(
bottom, top, weights.shape[-3:])
return d_bottom, d_weights
......@@ -1556,10 +1565,12 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)):
filter_dilation=(1, 1, 1),
num_groups=1):
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation)
filter_dilation,
num_groups)
def make_node(self, img, topgrad, shape=None):
ctx_name = infer_context_name(img, topgrad)
......@@ -1600,11 +1611,13 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
weights = gpu_contiguous(weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights,
top,
bottom.shape[-3:])
self.filter_dilation,
self.num_groups)(weights,
top,
bottom.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
self.border_mode, self.subsample, self.filter_dilation,
self.num_groups)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_bottom, d_top) + d_height_width_depth
......@@ -1629,9 +1642,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)):
filter_dilation=(1, 1, 1),
num_groups=1):
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation)
filter_dilation, num_groups)
def make_node(self, kern, topgrad, shape=None):
ctx_name = infer_context_name(kern, topgrad)
......@@ -1651,8 +1665,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
assert shape[1].ndim == 0
assert shape[2].ndim == 0
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False]
if self.num_groups > 1:
broadcastable = [topgrad.type.broadcastable[0], False,
False, False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False]
return Apply(self, [kern, topgrad] + height_width_depth,
[GpuArrayType(dtype=topgrad.dtype,
context_name=ctx_name,
......@@ -1671,12 +1689,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(bottom,
top,
weights.shape[-3:])
self.filter_dilation,
self.num_groups)(bottom,
top,
weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode,
self.subsample,
self.filter_dilation)(bottom, weights)
self.filter_dilation,
self.num_groups)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth
......
......@@ -411,7 +411,8 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
const size_t dilD = 1,
const size_t padH = 0,
const size_t padW = 0,
const size_t padD = 0)
const size_t padD = 0,
const size_t numgroups = 1)
{
if (PyGpuArray_NDIM(bottom) != 5)
{
......@@ -479,7 +480,7 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
const size_t kH = PyGpuArray_DIMS(weight)[2];
const size_t kW = PyGpuArray_DIMS(weight)[3];
const size_t kD = PyGpuArray_DIMS(weight)[4];
if (nChannels != PyGpuArray_DIMS(weight)[1]) {
if (nChannels != PyGpuArray_DIMS(weight)[1] * numgroups) {
PyErr_SetString(PyExc_ValueError,
"GpuCorr3dMM images and kernel must have the same stack size\n");
return NULL;
......@@ -511,7 +512,7 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
" weight shape: %ld %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
nFilters, nChannels, kH, kW, kD,
nFilters, nChannels / numgroups, kH, kW, kD,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
batchSize, nFilters, topHeight, topWidth, topDepth);
......@@ -542,11 +543,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
}
// Define some useful variables
const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
const size_t K_ = col_dim[0];
const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
const size_t K_ = col_dim[0] / numgroups;
const size_t N_ = col_dim[1];
const size_t M_ = nFilters;
const size_t group_col_stride = (K_ * N_);
const size_t M_ = nFilters / numgroups;
PyGpuArrayObject *output;
if (direction == 0) { // forward pass
......@@ -567,20 +574,22 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
for (size_t n = 0; n < batchSize; n++) {
// First, im3d2col
err = im3d2col(
&bottom->ga, n * bottom_stride, nChannels, bottomHeight,
&bottom->ga, n * batch_bottom_stride, nChannels, bottomHeight,
bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
padH, padW, padD, dH, dW, dD, &col->ga);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
}
// Second, gemm
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
&col->ga, 0, N_,
&weight->ga, 0, K_,
0,
&top->ga, n * top_stride, N_);
for ( size_t g = 0; g < numgroups; ++g){
// Second, gemm
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
&col->ga, g * group_col_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride, N_);
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM forward encountered an error running gemm.");
......@@ -607,7 +616,7 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
for (size_t n = 0; n < batchSize; n++) {
// First, im3d2col
err = im3d2col(
&bottom->ga, n * bottom_stride, nChannels, bottomHeight,
&bottom->ga, n * batch_bottom_stride, nChannels, bottomHeight,
bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
padH, padW, padD, dH, dW, dD, &col->ga);
if (err != GA_NO_ERROR) {
......@@ -618,12 +627,14 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
&col->ga, 0, N_,
&top->ga, n * top_stride, N_,
(n == 0) ? 0 : 1,
&weight->ga, 0, K_);
for ( size_t g = 0; g < numgroups; ++g){
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
&col->ga, g * group_col_stride, N_,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride, K_);
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad weights encountered an error running gemm.");
......@@ -658,12 +669,14 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
// gemm into columns
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
&top->ga, n * top_stride, N_,
&weight->ga, 0, K_,
0,
&col->ga, 0, N_);
for ( size_t g = 0; g < numgroups; ++g){
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&col->ga, g * group_col_stride, N_);
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad inputs encountered an error running gemm.");
......@@ -674,7 +687,7 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
err = col2im3d(&col->ga, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
dH, dW, dD, &bottom->ga, n * bottom_stride);
dH, dW, dD, &bottom->ga, n * batch_bottom_stride);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论