提交 9592125c authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6286 from vikramnitin9/grouped_unshared

Implement Unshared Convolution
......@@ -458,13 +458,15 @@ class BaseGpuCorrMM(CGpuKernelBase):
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately (default : 1).
unshared
Perform unshared correlation (default: False)
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
_f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1):
filter_dilation=(1, 1), num_groups=1, unshared=False):
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode)
if isinstance(border_mode, tuple):
......@@ -487,6 +489,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups
CGpuKernelBase.__init__(self, ['c_code/corr_gemm.c'])
self.unshared = unshared
@property
def pad(self):
......@@ -495,12 +498,13 @@ class BaseGpuCorrMM(CGpuKernelBase):
return (0, 0)
def __str__(self):
return '%s{%s, %s, %s, %s}' % (
return '%s{%s, %s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation),
str(self.num_groups))
str(self.num_groups),
str(self.unshared))
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -533,7 +537,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self):
# Raise this whenever modifying the C code (including the file).
return (10,)
return (11,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
"""
......@@ -581,6 +585,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
dH, dW = self.subsample
dilH, dilW = self.filter_dilation
numgroups = self.num_groups
unshared = int(self.unshared)
if self.border_mode == "half":
padH = padW = -1
elif self.border_mode == "full":
......@@ -633,19 +638,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
int padH = %(padH)s;
int padW = %(padW)s;
int numgroups = %(numgroups)s;
int unshared = %(unshared)s;
PyGpuArrayObject * bottom = %(bottom)s;
PyGpuArrayObject * weights = %(weights)s;
PyGpuArrayObject * top = %(top)s;
PyGpuArrayObject * out2 = NULL;
int wdim, odim;
wdim = unshared ? 6 : 4;
odim = 4; //Can be set to 6 later for unshared backprop wrt weights
// Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding)
size_t kH, kW, dil_kH, dil_kW;
if (direction != 1) {
// weight is an input variable, we can just read its shape
kH = PyGpuArray_DIMS(weights)[2];
kW = PyGpuArray_DIMS(weights)[3];
kH = PyGpuArray_DIMS(weights)[wdim-2];
kW = PyGpuArray_DIMS(weights)[wdim-1];
}
else {
if (%(height)s != -1) {
......@@ -699,8 +709,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
// Infer output shape and type
// The inferred shape can be negative.
long long out_dim[4];
size_t out_dim_size[4];
long long out_dim[6];
size_t out_dim_size[6];
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
out_dim_size[4] = out_dim_size[5] = 0; //Same
int out_typecode;
PyGpuContextObject *out_context;
switch(direction) {
......@@ -709,71 +721,131 @@ class BaseGpuCorrMM(CGpuKernelBase):
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = PyGpuArray_DIMS(bottom)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[0];
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_typecode = bottom->ga.typecode;
out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
if (unshared) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
else {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
}
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// output is weights: (num_filters, num_channels, height, width) or
// (num_filters, top_height, top_width, num_channels, height, width) -> for unshared
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = PyGpuArray_DIMS(top)[1];
out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient
if (unshared){
odim = 6;
out_dim[1] = PyGpuArray_DIMS(top)[2];
out_dim[2] = PyGpuArray_DIMS(top)[3];
}
out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups;
out_dim[wdim-2] = kH; // already inferred further above
out_dim[wdim-1] = kW; // how convenient
out_typecode = top->ga.typecode;
out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
|| out_dim[4] <= 0 || out_dim[5] <= 0){
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
out_dim[4], out_dim[5],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW;
out_typecode = top->ga.typecode;
out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
break;
default:
......@@ -786,12 +858,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
out_dim_size[2] = (size_t)out_dim[2];
out_dim_size[3] = (size_t)out_dim[3];
if (odim == 6) {
out_dim_size[4] = (size_t)out_dim[4];
out_dim_size[5] = (size_t)out_dim[5];
}
// Prepare output array
if (theano_prep_output(&%(out)s, 4, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
if (theano_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
{
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
if (odim == 4) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
}
if (odim == 6) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]);
}
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
......@@ -800,7 +884,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
}
// Call GPU code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups);
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
if (out2==NULL){
%(fail)s
}
......@@ -840,6 +924,8 @@ class GpuCorrMM(BaseGpuCorrMM):
divided into.
should be an int
set to 1 to disable grouped convolution
unshared
Perform unshared correlation (default: False)
Notes
-----
......@@ -859,9 +945,9 @@ class GpuCorrMM(BaseGpuCorrMM):
"""
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1):
filter_dilation=(1, 1), num_groups=1, unshared=False):
super(GpuCorrMM, self).__init__(border_mode, subsample,
filter_dilation, num_groups)
filter_dilation, num_groups, unshared)
def make_node(self, img, kern):
ctx_name = infer_context_name(img, kern)
......@@ -869,8 +955,12 @@ class GpuCorrMM(BaseGpuCorrMM):
kern = as_gpuarray_variable(kern, ctx_name)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
......@@ -891,12 +981,14 @@ class GpuCorrMM(BaseGpuCorrMM):
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(
self.num_groups,
self.unshared)(
weights, top, bottom.shape[-2:])
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(
self.num_groups,
self.unshared)(
bottom, top, weights.shape[-2:])
return d_bottom, d_weights
......@@ -915,10 +1007,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(GpuCorrMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation, num_groups)
filter_dilation, num_groups,
unshared)
def make_node(self, img, topgrad, shape=None):
ctx_name = infer_context_name(img, topgrad)
......@@ -938,8 +1032,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
assert shape[0].ndim == 0
assert shape[1].ndim == 0
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
if self.unshared:
broadcastable = [topgrad.type.broadcastable[0], False, False,
img.type.broadcastable[1], False, False]
else:
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
return Apply(self, [img, topgrad] + height_width, [GpuArrayType(dtype=img.dtype,
context_name=ctx_name,
broadcastable=broadcastable)()])
......@@ -958,11 +1056,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(weights,
top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights,
top,
bottom.shape[-2:])
d_top = GpuCorrMM(
self.border_mode, self.subsample, self.filter_dilation, self.num_groups)(bottom, weights)
self.border_mode, self.subsample, self.filter_dilation, self.num_groups, self.unshared)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -989,16 +1088,22 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, num_groups)
filter_dilation, num_groups,
unshared)
def make_node(self, kern, topgrad, shape=None):
ctx_name = infer_context_name(kern, topgrad)
kern = as_gpuarray_variable(kern, ctx_name)
topgrad = as_gpuarray_variable(topgrad, ctx_name)
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if shape is None:
......@@ -1014,7 +1119,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
broadcastable = [topgrad.type.broadcastable[0], False,
False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
False, False]
return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype,
context_name=ctx_name,
......@@ -1034,13 +1139,15 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom,
top,
weights.shape[-2:])
self.num_groups,
self.unshared)(bottom,
top,
weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -1682,7 +1789,7 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
broadcastable = [topgrad.type.broadcastable[0], False,
False, False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-4],
False, False, False]
return Apply(self, [kern, topgrad] + height_width_depth,
[GpuArrayType(dtype=topgrad.dtype,
......
......@@ -349,7 +349,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dilW = 1,
const size_t padH = 0,
const size_t padW = 0,
const size_t numgroups = 1)
const size_t numgroups = 1,
const size_t unshared = 0)
{
if (PyGpuArray_NDIM(bottom) != 4)
{
......@@ -368,21 +369,35 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL;
}
if (PyGpuArray_NDIM(weight) != 4)
if (PyGpuArray_NDIM(weight) != (unshared ? 6 : 4))
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
PyErr_Format(PyExc_ValueError, "GpuCorrMM requires weight of %dD", unshared ? 6 : 4);
return NULL;
}
if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM requires weight to be C-contiguous, "
"but strides are: %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[2],
PyGpuArray_STRIDES(weight)[3]);
return NULL;
if (unshared) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM requires weight to be C-contiguous, "
"but strides are: %ld %ld %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[2],
PyGpuArray_STRIDES(weight)[3],
PyGpuArray_STRIDES(weight)[4],
PyGpuArray_STRIDES(weight)[5]);
return NULL;
}
else {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM requires weight to be C-contiguous, "
"but strides are: %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[2],
PyGpuArray_STRIDES(weight)[3]);
return NULL;
}
}
if (PyGpuArray_NDIM(top) != 4)
......@@ -409,10 +424,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns)
// or (nFilters, out_rows, out_columns, nChannels, rows, columns) -> for unshared
const size_t nFilters = PyGpuArray_DIMS(weight)[0];
const size_t kH = PyGpuArray_DIMS(weight)[2];
const size_t kW = PyGpuArray_DIMS(weight)[3];
if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) {
const size_t kH = PyGpuArray_DIMS(weight)[unshared ? 4 : 2];
const size_t kW = PyGpuArray_DIMS(weight)[unshared ? 5 : 3];
if (nChannels != PyGpuArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n");
return NULL;
......@@ -435,21 +452,56 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
if (unshared) {
if (topHeight != PyGpuArray_DIMS(weight)[1] ||
topWidth != PyGpuArray_DIMS(weight)[2]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM regions in kernel must match output regions:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld %ld %ld"
" (expected %ld %ld %ld %ld %ld %ld)\n"
" top shape(calculated): %ld %ld %ld %ld\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, PyGpuArray_DIMS(weight)[1],
PyGpuArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
else{
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
int err = gpublas_setup(bottom->context->ctx);
......@@ -512,19 +564,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL;
}
// Second, gemm
for (size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
&col->ga, g * group_col_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride, N_);
if (unshared) {
for (size_t g = 0; g < numgroups; ++g) {
for (size_t reg = 0; reg < N_; ++reg){
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
1, M_, K_, 1,
&col->ga, g * group_col_stride + reg, N_,
&weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride + reg, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
else {
for (size_t g = 0; g < numgroups; ++g){
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
&col->ga, g * group_col_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
}
......@@ -557,19 +627,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
&col->ga, g * group_col_stride, N_,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride, K_);
if (unshared) {
for (size_t g = 0; g < numgroups; ++g) {
for (size_t reg = 0; reg < N_; ++reg){
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, 1, 1,
&col->ga, g * group_col_stride + reg, N_,
&top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride + reg * K_, K_ * N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
else{
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
&col->ga, g * group_col_stride, N_,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride, K_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
}
......@@ -590,19 +678,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
// gemm into columns
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&col->ga, g * group_col_stride, N_);
if (unshared) {
for (size_t g = 0; g < numgroups; ++g){
for (size_t reg = 0; reg < N_; ++reg) {
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
1, K_, M_, 1,
&top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
&weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
0,
&col->ga, g * group_col_stride + reg, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
else {
for (size_t g = 0; g < numgroups; ++g){
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&col->ga, g * group_col_stride, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
// col2im back to the data
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
......
......@@ -3035,6 +3035,9 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
return None
if op.unshared:
return None
inp1 = inputs[0]
inp2 = inputs[1]
......@@ -3129,6 +3132,8 @@ def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
......@@ -3143,6 +3148,8 @@ def local_abstractconv_cudnn_alt(node):
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
return None
if node.op.unshared:
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
......@@ -3349,6 +3356,8 @@ def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
......@@ -3360,6 +3369,8 @@ def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
......
......@@ -1595,12 +1595,17 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim))
if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern[flip]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
kern = kern.dimshuffle(kern_axes)
# call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid',
subsample,
......@@ -1609,13 +1614,14 @@ def local_abstractconv_gemm(node):
else:
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern[flip]
# By default use GpuCorrMM
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation,
node.op.num_groups)(gpu_contiguous(img),
gpu_contiguous(kern))
num_groups,
unshared)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good
# (the latter is faster if batchsize * kernelHeight * kernelWidth
......@@ -1628,11 +1634,12 @@ def local_abstractconv_gemm(node):
(node.op.kshp is not None) and
(None not in node.op.kshp) and
border_mode != "half" and
node.op.num_groups == 1):
num_groups == 1 and
not unshared):
# we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1]
prod1 = node.op.kshp[0] * node.op.kshp[-3]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))
(node.op.imshp[-1] - node.op.kshp[-3] + 1))
if (None not in node.op.imshp[:1]):
# we also know batchsize and input channels
prod1 *= node.op.imshp[0]
......@@ -1666,13 +1673,19 @@ def local_abstractconv_gemm_def(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation,
node.op.num_groups)(gpu_contiguous(img),
gpu_contiguous(kern))
num_groups,
unshared)(gpu_contiguous(img),
gpu_contiguous(kern))
return [rval]
......@@ -1690,8 +1703,9 @@ def local_abstractconv_gemm_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1:
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -1702,7 +1716,7 @@ def local_abstractconv_gemm_alt(node):
gpu_contiguous(kern), gpu_contiguous(img))
elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
num_groups == 1 and not unshared):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -1896,10 +1910,13 @@ def local_abstractconv_gradweights_gemm(node):
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
rval = rval[flip]
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval]
......@@ -1918,9 +1935,10 @@ def local_abstractconv_gemm_gradweights_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
num_groups == 1 and not unshared):
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation)(
......@@ -2001,12 +2019,15 @@ def local_abstractconv_gradinputs_gemm(node):
return None
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
......@@ -2023,8 +2044,9 @@ def local_abstractconv_gradinputs_gemm_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -2117,8 +2139,9 @@ class ConvMetaOptimizer(LocalMetaOptimizer):
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)
convdim = img.ndim - 2
result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[2:])
result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[-convdim:])
for(var, shape) in zip((img, top), (node.op.imshp, tshp)):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
......
......@@ -11,7 +11,7 @@ from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInput
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorrMM(unittest.TestCase):
......@@ -20,9 +20,13 @@ class TestCorrMM(unittest.TestCase):
border_mode='valid',
filter_dilation=(1, 1),
subsample=(1, 1),
unshared=False,
verify_grad=False):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
if unshared:
filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
else:
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = np.random.random(inputs_shape).astype(config.floatX)
filters_val = np.random.random(filters_shape).astype(config.floatX)
......@@ -32,13 +36,15 @@ class TestCorrMM(unittest.TestCase):
conv_ref = CorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(ref_cast(inputs),
ref_cast(filters))
subsample=subsample,
unshared=unshared)(ref_cast(inputs),
ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters)
subsample=subsample,
unshared=unshared)(inputs, filters)
f = theano.function([], conv, mode=mode_with_gpu)
res_ref = f_ref()
......@@ -48,7 +54,8 @@ class TestCorrMM(unittest.TestCase):
if verify_grad:
utt.verify_grad(GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample),
subsample=subsample,
unshared=unshared),
[inputs_val, filters_val], mode=mode_with_gpu)
def test_valid(self):
......@@ -57,12 +64,6 @@ class TestCorrMM(unittest.TestCase):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(3, 3))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(3, 3))
......@@ -117,6 +118,41 @@ class TestCorrMM(unittest.TestCase):
border_mode=border_mode,
verify_grad=True)
def test_unshared(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 8, 1, 6, 12, 1),
subsample=(2, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 3), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
subsample=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode='valid', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 13, 6, 12, 1),
border_mode='half', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 25, 23, 6, 12, 1),
border_mode='full', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode=(0, 0), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 17, 5, 6, 12, 1),
border_mode=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 5, 6, 12, 1),
border_mode=(3, 2), unshared=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1)):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
......@@ -227,3 +263,12 @@ class TestGroupGpuCorr2d(Grouped_conv_noOptim):
conv_op = GpuCorrMM
conv_gradw_op = GpuCorrMM_gradWeights
conv_gradi_op = GpuCorrMM_gradInputs
flip_filter = True
is_dnn = False
class TestUnsharedGpuCorr2d(TestUnsharedConv):
    # Reuse the generic unshared-convolution test suite defined in
    # theano.tensor.nnet.tests.test_abstract_conv, parameterized here to
    # run on the GPU with the GpuCorrMM family of ops.
    mode = mode_with_gpu
    # Ops under test: forward pass and both gradient directions.
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
......@@ -37,7 +37,7 @@ from .abstract_conv import separable_conv2d
def conv2d(input, filters, input_shape=None, filter_shape=None,
border_mode='valid', subsample=(1, 1), filter_flip=True,
image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs):
image_shape=None, filter_dilation=(1, 1), num_groups=1, unshared=False, **kwargs):
"""
This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
......@@ -51,18 +51,22 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
(batch size, input channels, input rows, input columns).
See the optional parameter ``input_shape``.
filters: symbolic 4D tensor
filters: symbolic 4D or 6D tensor
Set of filters used in CNN layer of shape
(output channels, input channels, filter rows, filter columns).
(output channels, input channels, filter rows, filter columns)
for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns)
for unshared convolution.
See the optional parameter ``filter_shape``.
input_shape: None, tuple/list of len 4 of int or Constant variable
input_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the input parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
filter_shape: None, tuple/list of len 4 of int or Constant variable
filter_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
......@@ -105,6 +109,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
kwargs: Any other keyword arguments are accepted for backwards
compatibility, but will be ignored.
......@@ -154,12 +163,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
return abstract_conv2d(input, filters, input_shape, filter_shape,
border_mode, subsample, filter_flip,
filter_dilation, num_groups)
filter_dilation, num_groups, unshared)
def conv2d_transpose(input, filters, output_shape, filter_shape=None,
border_mode='valid', input_dilation=(1, 1),
filter_flip=True, filter_dilation=(1, 1), num_groups=1):
filter_flip=True, filter_dilation=(1, 1), num_groups=1, unshared=False):
"""
This function will build the symbolic graph for applying a transposed
convolution over a mini-batch of a stack of 2D inputs with a set of 2D
......@@ -215,6 +224,12 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
Grouped unshared convolution is supported.
Returns
-------
Symbolic 4D tensor
......@@ -242,4 +257,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
subsample=input_dilation,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
......@@ -44,9 +44,14 @@ def get_conv_output_shape(image_shape, kernel_shape,
to: batch size, number of input channels, height and width (and
possibly depth) of the image. None where undefined.
kernel_shape: tuple of int (symbolic or numeric) corresponding to the
kernel shape. Its four (or five) elements must correspond respectively
to: number of output channels, number of input channels, height and
width (and possibly depth) of the kernel. None where undefined.
kernel shape. For a normal convolution, its four (for 2D convolution)
or five (for 3D convolution) elements must correspond respectively to :
number of output channels, number of input channels, height and width
(and possibly depth) of the kernel.
For an unshared 2D convolution, its six channels must correspond to :
number of output channels, height and width of the output, number of
input channels, height and width of the kernel.
None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
......@@ -56,6 +61,8 @@ def get_conv_output_shape(image_shape, kernel_shape,
possibly depth) axis.
filter_dilation: tuple of int (symbolic or numeric). Its two or three
elements correspond respectively to the dilation on height and width axis.
Note - The shape of the convolution output does not depend on the 'unshared'
or the 'num_groups' parameters.
Returns
-------
......@@ -65,7 +72,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
"""
bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:]
convdim = len(image_shape) - 2
nkern, kshp = kernel_shape[0], kernel_shape[-convdim:]
if filter_dilation is None:
filter_dilation = np.ones(len(subsample), dtype='int')
......@@ -139,7 +149,7 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
def get_conv_gradweights_shape(image_shape, top_shape,
border_mode, subsample,
filter_dilation=None,
num_groups=1):
num_groups=1, unshared=False):
"""
This function tries to compute the kernel shape of convolution gradWeights.
......@@ -169,6 +179,8 @@ def get_conv_gradweights_shape(image_shape, top_shape,
width axis.
num_groups: An int which specifies the number of separate groups to
be divided into.
unshared: Boolean value. If true, unshared convolution will be performed,
where a different filter is applied to each area of the input.
Returns
-------
......@@ -194,7 +206,10 @@ def get_conv_gradweights_shape(image_shape, top_shape,
out_shp = tuple(get_conv_gradweights_shape_1axis(
imshp[i], topshp[i], border_mode,
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
return (nchan, nkern) + out_shp
if unshared:
return (nchan,) + top_shape[2:] + (nkern,) + out_shp
else:
return (nchan, nkern) + out_shp
def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,
......@@ -280,6 +295,8 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
width axis.
num_groups: An int which specifies the number of separate groups to
be divided into.
Note - The shape of the convolution output does not depend on the 'unshared'
parameter.
Returns
-------
......@@ -289,7 +306,9 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
"""
bsize, topshp = top_shape[0], top_shape[2:]
nkern, kshp = kernel_shape[1], kernel_shape[2:]
convdim = len(top_shape) - 2
nkern, kshp = kernel_shape[1], kernel_shape[-convdim:]
if filter_dilation is None:
filter_dilation = np.ones(len(subsample), dtype='int')
......@@ -522,7 +541,8 @@ def conv2d(input,
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
"""This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
after Convolutional Neural Networks (CNN).
......@@ -538,7 +558,8 @@ def conv2d(input,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
return conv_op(input, filters)
......@@ -878,7 +899,6 @@ def conv3d(input,
version until it is released.
"""
input = as_tensor_variable(input)
filters = as_tensor_variable(filters)
conv_op = AbstractConv3d(imshp=input_shape,
......@@ -899,7 +919,8 @@ def conv2d_grad_wrt_inputs(output_grad,
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
"""Compute conv output gradient w.r.t its inputs
This function builds the symbolic graph for getting the
......@@ -916,10 +937,14 @@ def conv2d_grad_wrt_inputs(output_grad,
will be upsampled or the output gradient of the convolution
whose gradient will be taken with respect to the input of the
convolution.
filters : symbolic 4D tensor
set of filters used in CNN layer of shape (output channels,
input channels, filter rows, filter columns). See the
optional parameter ``filter_shape``.
filters: symbolic 4D or 6D tensor
Set of filters used in CNN layer of shape
(output channels, input channels, filter rows, filter columns)
for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns)
for unshared convolution.
See the optional parameter ``filter_shape``.
input_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2
The shape of the input (upsampled) parameter.
A tuple/list of len 4, with the first two dimensions
......@@ -928,8 +953,9 @@ def conv2d_grad_wrt_inputs(output_grad,
Not Optional, since given the output_grad shape
and the subsample values, multiple input_shape may be
plausible.
filter_shape : None or [None/int/Constant] * 4
The shape of the filters parameter. None or a tuple/list of len 4.
filter_shape : None or [None/int/Constant] * (4 or 6)
The shape of the filters parameter. None or a tuple/list of len 4 or a
tuple/list of len 6 (for unshared convolution)
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that
this element is not known at compile time.
......@@ -975,6 +1001,10 @@ def conv2d_grad_wrt_inputs(output_grad,
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
Returns
-------
......@@ -1009,7 +1039,14 @@ def conv2d_grad_wrt_inputs(output_grad,
# checking the type of filter_shape
if filter_shape is not None:
for dim in [0, 1, 2, 3]:
if unshared:
expected_dim = 6
else:
expected_dim = 4
assert len(filter_shape) == expected_dim
for dim in range(expected_dim):
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
......@@ -1026,7 +1063,8 @@ def conv2d_grad_wrt_inputs(output_grad,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
return grad_input_op(filters, output_grad, input_shape[-2:])
......@@ -1179,7 +1217,8 @@ def conv2d_grad_wrt_weights(input,
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
"""Compute conv output gradient w.r.t its weights
This function will build the symbolic graph for getting the
......@@ -1195,10 +1234,10 @@ def conv2d_grad_wrt_weights(input,
mini-batch of feature map stacks, of shape (batch size, input
channels, input rows, input columns). This is the gradient of
the output of convolution.
filter_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2
The shape of the filter parameter. A tuple/list of len 4, with the
first two dimensions being None or int or Constant and the last two
dimensions being Tensor or int or Constant.
filter_shape : [None/int/Constant] * (2 or 4) + [Tensor/int/Constant] * 2
The shape of the filter parameter. A tuple/list of len 4 or 6
(for unshared), with the first two dimensions being None or int or
Constant and the last two dimensions being Tensor or int or Constant.
Not Optional, since given the output_grad shape and
the input_shape, multiple filter_shape may be plausible.
input_shape : None or [None/int/Constant] * 4
......@@ -1247,13 +1286,19 @@ def conv2d_grad_wrt_weights(input,
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
Returns
-------
symbolic 4D tensor
symbolic 4D tensor or 6D tensor
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output rows, output
columns)
columns) for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns) for unshared convolution
Notes
-----
......@@ -1274,7 +1319,11 @@ def conv2d_grad_wrt_weights(input,
for dim in [0, 1]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
for dim in [2, 3]:
if unshared:
for dim in [2, 3]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
for dim in [-2, -1]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorVariable,
theano.tensor.TensorConstant,
integer_types))
......@@ -1288,7 +1337,7 @@ def conv2d_grad_wrt_weights(input,
# setting the last two dimensions of filter_shape to None, if
# the type of these dimensions is TensorVariable.
numerical_filter_shape = list(filter_shape)
for dim in [2, 3]:
for dim in [-2, -1]:
if isinstance(filter_shape[dim], theano.tensor.TensorVariable):
numerical_filter_shape[dim] = None
......@@ -1298,7 +1347,8 @@ def conv2d_grad_wrt_weights(input,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
return gradWeight_op(input, output_grad, filter_shape[-2:])
......@@ -1631,7 +1681,8 @@ class BaseAbstractConv(Op):
element is not known at compile time.
imshp is defined w.r.t the forward conv.
kshp: None, tuple/list of len ``(2 + convdim)`` of int or Constant variable
kshp: None, tuple/list of len ``(2 + convdim)`` or ``(2 + 2 * convdim)``
(for unshared) of int or Constant variable
The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
......@@ -1670,14 +1721,24 @@ class BaseAbstractConv(Op):
filter_dilation: tuple of len ``convdim``
Factor by which to subsample (stride) the input.
Also called dilation factor.
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
"""
check_broadcast = False
__props__ = ('convdim', 'border_mode', 'subsample', 'filter_flip',
'imshp', 'kshp', 'filter_dilation', 'num_groups')
'imshp', 'kshp', 'filter_dilation', 'num_groups', 'unshared')
def __init__(self, convdim,
imshp=None, kshp=None, border_mode="valid",
subsample=None, filter_flip=True, filter_dilation=None, num_groups=1):
subsample=None, filter_flip=True, filter_dilation=None, num_groups=1,
unshared=False):
self.convdim = convdim
if convdim not in (2, 3):
......@@ -1718,7 +1779,10 @@ class BaseAbstractConv(Op):
ValueError("imshp should be None or a tuple of "
"constant int values"),
sys.exc_info()[2])
self.kshp = tuple(kshp) if kshp else (None,) * (2 + convdim)
if kshp:
self.kshp = tuple(kshp)
else:
self.kshp = (None,) * ((2 + 2 * convdim) if unshared else (2 + convdim))
for kshp_i in self.kshp:
if kshp_i is not None:
# Components of kshp should be constant or ints
......@@ -1742,6 +1806,10 @@ class BaseAbstractConv(Op):
if num_groups < 1:
raise ValueError("num_groups must have value greater than zero")
self.num_groups = num_groups
if unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
self.unshared = unshared
def do_constant_folding(self, node):
# Disable constant folding since there is no implementation.
......@@ -1768,11 +1836,10 @@ class BaseAbstractConv(Op):
raise NotImplementedError(
'flops not implemented for convdim={}', self.convdim)
def conv(self, img, kern, mode="valid", dilation=1, num_groups=1):
def conv(self, img, kern, mode="valid", dilation=1, num_groups=1, unshared=False, direction="forward"):
"""
Basic slow Python 2D or 3D convolution for DebugMode
"""
if not imported_scipy_signal:
raise NotImplementedError(
"AbstractConv perform requires the python package"
......@@ -1787,18 +1854,29 @@ class BaseAbstractConv(Op):
raise ValueError(
'invalid dilation {}, expected {} values'.format(dilation,
self.convdim))
if unshared and direction == "backprop weights":
if mode != "valid":
raise ValueError('conv mode for unshared backprop wrt weights must be "valid"')
# To allow the same format for the call to 'unshared2d' for all three directions,
# the out_shape is shuffled here.
# We do a transpose in the 'perform' function to bring it to the required shape
out_shape = (img.shape[0], kern.shape[0],
kern.shape[2], kern.shape[3],
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
else:
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1] * self.convdim, dilation)
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1] * self.convdim, dilation)
out = np.zeros(out_shape, dtype=img.dtype)
dil_kern_shp = kern.shape[:-self.convdim] + tuple(
(kern.shape[-self.convdim + i] - 1) * dilation[i] + 1
for i in range(self.convdim))
dilated_kern = np.zeros(dil_kern_shp, dtype=kern.dtype)
dilated_kern[(slice(None), slice(None)) +
dilated_kern[(slice(None),) * (dilated_kern.ndim - self.convdim) +
tuple(slice(None, None, dilation[i]) for i in range(self.convdim))
] = kern
out = np.zeros(out_shape, dtype=img.dtype)
if img.shape[1] % self.num_groups != 0:
raise ValueError(
......@@ -1823,11 +1901,19 @@ class BaseAbstractConv(Op):
for g in xrange(self.num_groups):
for n in xrange(output_channel_offset):
for im0 in xrange(input_channel_offset):
# some cast generates a warning here
out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n,
im0, ...], 1, val, bval, 0)
if unshared:
out[b, g * output_channel_offset + n, ...] += self.unshared2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n, im0, ...],
out_shape[2:], direction)
else:
# some cast generates a warning here
out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n, im0, ...],
1, val, bval, 0)
elif self.convdim == 3:
if unshared:
raise NotImplementedError('Unshared 3D convolution is not implemented')
for b in xrange(img.shape[0]):
for g in xrange(self.num_groups):
for n in xrange(output_channel_offset):
......@@ -1839,6 +1925,35 @@ class BaseAbstractConv(Op):
raise NotImplementedError('only 2D and 3D convolution are implemented')
return out
def unshared2d(self, inp, kern, out_shape, direction="forward"):
'''
Basic slow Python unshared 2d convolution.
'''
if self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
out = np.zeros(out_shape, dtype=inp.dtype)
if direction == "forward":
for row in xrange(out_shape[0]):
for col in xrange(out_shape[1]):
out[row, col] = np.sum(np.multiply(inp[row:row + kern.shape[2],
col:col + kern.shape[3]],
kern[row, col, ::-1, ::-1]))
elif direction == "backprop weights":
for row in xrange(out_shape[0]):
for col in xrange(out_shape[1]):
out[row, col, ...] = kern[row, col] * \
inp[row:row + out_shape[2], col:col + out_shape[3]]
elif direction == "backprop inputs":
for row in xrange(kern.shape[0]):
for col in xrange(kern.shape[1]):
out[row:row + kern.shape[2], col:col + kern.shape[3]] += inp[row, col] * \
kern[row, col, ::-1, ::-1]
else:
raise ValueError("unshared2d: invalid value '{}' for 'direction'".format(direction))
return out
class AbstractConv(BaseAbstractConv):
""" Abstract Op for the forward convolution.
......@@ -1854,14 +1969,16 @@ class AbstractConv(BaseAbstractConv):
subsample=None,
filter_flip=True,
filter_dilation=None,
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def make_node(self, img, kern):
# Make sure both inputs are Variables with the same Type
......@@ -1875,8 +1992,14 @@ class AbstractConv(BaseAbstractConv):
if img.type.ndim != 2 + self.convdim:
raise TypeError('img must be %dD tensor' % (2 + self.convdim))
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if self.unshared:
if kern.type.ndim != 2 + 2 * self.convdim:
raise TypeError('kern must be %dD tensor for unshared convolution'
% (2 + 2 * self.convdim))
else:
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
img = assert_shape(img, self.imshp,
'AbstractConv shape mismatch: shape of '
......@@ -1894,8 +2017,12 @@ class AbstractConv(BaseAbstractConv):
img, kern = inp
img = np.asarray(img)
kern = np.asarray(kern)
dil_kernshp = tuple((kern.shape[2 + i] - 1) * self.filter_dilation[i] + 1
dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
o, = out_
mode = self.border_mode
......@@ -1922,8 +2049,34 @@ class AbstractConv(BaseAbstractConv):
for i in range(self.convdim))] = img
img = new_img
if not self.filter_flip:
kern = kern[(slice(None), slice(None)) + (slice(None, None, -1),) * self.convdim]
conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups)
kern = kern[(slice(None),) * (kern.ndim - self.convdim) + (slice(None, None, -1),) * self.convdim]
if self.unshared:
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, self.subsample, self.filter_dilation)
if kern.shape[1:1 + self.convdim] != out_shape[2:2 + self.convdim]:
raise ValueError('Kernel shape {} does not match '
'computed output size {}'.format(kern.shape[1:1 + self.convdim],
out_shape[2:2 + self.convdim]))
if any(self.subsample[i] > 1 for i in range(self.convdim)):
# Expand regions in kernel to correct for subsampling
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, (1,) * self.convdim, self.filter_dilation)
exp_kern_shp = kern.shape[:1] + out_shape[2:2 + self.convdim] + \
kern.shape[1 + self.convdim:]
exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
exp_kern[(slice(None),) +
tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
(slice(None),) * (self.convdim + 1)] = kern
kern = exp_kern
# from (nFilters, out_rows, out_cols, nChannels, kH, kW)
# to (nFilters, nChannels, out_rows, out_cols, kH, kW)
axes_order = (0, 1 + self.convdim,) + tuple(range(1, 1 + self.convdim)) + \
tuple(range(2 + self.convdim, kern.ndim))
kern = kern.transpose(axes_order)
conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups,
unshared=self.unshared)
conv_out = conv_out[(slice(None), slice(None)) +
tuple(slice(None, None, self.subsample[i])
for i in range(self.convdim))]
......@@ -1934,6 +2087,8 @@ class AbstractConv(BaseAbstractConv):
if self.num_groups > 1:
raise NotImplementedError(
'Rop not implemented for grouped convolutions')
if self.unshared:
raise NotImplementedError('Rop not implemented for unshared convolution')
rval = None
if eval_points[0] is not None:
rval = self.make_node(eval_points[0], inputs[1]).outputs[0]
......@@ -1953,8 +2108,12 @@ class AbstractConv(BaseAbstractConv):
imshp = [imshp[i] if self.imshp[i] is None else self.imshp[i]
for i in range(2 + self.convdim)]
if self.kshp is not None:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + self.convdim)]
if self.unshared:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + 2 * self.convdim)]
else:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + self.convdim)]
res = get_conv_output_shape(imshp, kshp, self.border_mode,
self.subsample, self.filter_dilation)
return [res]
......@@ -1973,14 +2132,16 @@ class AbstractConv2d(AbstractConv):
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv2d, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads):
bottom, weights = inp
......@@ -1991,14 +2152,16 @@ class AbstractConv2d(AbstractConv):
self.subsample,
self.filter_flip,
self.filter_dilation,
num_groups=self.num_groups)(
num_groups=self.num_groups,
unshared=self.unshared)(
weights, top, bottom.shape[-2:], add_assert_shape=False)
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation,
num_groups=self.num_groups)(
num_groups=self.num_groups,
unshared=self.unshared)(
bottom, top, weights.shape[-2:], add_assert_shape=False)
......@@ -2085,14 +2248,16 @@ class AbstractConv_gradWeights(BaseAbstractConv):
subsample=None,
filter_flip=True,
filter_dilation=None,
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv_gradWeights, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
# Update shape/height_width
def make_node(self, img, topgrad, shape, add_assert_shape=True):
......@@ -2115,8 +2280,12 @@ class AbstractConv_gradWeights(BaseAbstractConv):
'image does not match given imshp.')
shape = as_tensor_variable(shape)
broadcastable = [topgrad.broadcastable[1],
img.broadcastable[1]] + ([False] * self.convdim)
if self.unshared:
broadcastable = [topgrad.broadcastable[1]] + ([False] * self.convdim) + \
[img.broadcastable[1]] + ([False] * self.convdim)
else:
broadcastable = [topgrad.broadcastable[1],
img.broadcastable[1]] + ([False] * self.convdim)
output = img.type.clone(broadcastable=broadcastable)()
return Apply(self, [img, topgrad, shape], [output])
......@@ -2134,6 +2303,9 @@ class AbstractConv_gradWeights(BaseAbstractConv):
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
dil_shape = tuple((shape[i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
......@@ -2166,9 +2338,7 @@ class AbstractConv_gradWeights(BaseAbstractConv):
topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
topgrad = topgrad.transpose(axes_order)[flip_filters]
topgrad = topgrad.transpose(axes_order)
img = img.transpose(axes_order)
def correct_for_groups(mat):
......@@ -2182,15 +2352,36 @@ class AbstractConv_gradWeights(BaseAbstractConv):
if self.num_groups > 1:
img = correct_for_groups(img)
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
if self.unshared:
flip_kern = ((slice(None),) * (2 + self.convdim) +
(slice(None, None, -1),) * self.convdim)
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups,
unshared=True, direction="backprop weights")
if any(self.subsample[i] > 1 for i in range(self.convdim)):
sub_slice = (slice(None),) * 2 + \
tuple(slice(None, None, self.subsample[i]) for i in range(0, self.convdim)) + \
(slice(None),) * self.convdim
kern = kern[sub_slice]
# from (nChannels, nFilters, out_rows, out_cols, kH, kW)
# to (nFilters, out_rows, out_cols, nChannels, kH, kW)
kern_axes = (1,) + tuple(range(2, self.convdim + 2)) + (0,) + \
tuple(range(self.convdim + 2, kern.ndim))
else:
flip_topgrad = flip_kern = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
topgrad = topgrad[flip_topgrad]
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
kern_axes = (1, 0) + tuple(range(2, self.convdim + 2))
kern = kern.transpose(kern_axes)
if any(self.filter_dilation[i] > 1 for i in range(self.convdim)):
kern = kern[(slice(None), slice(None)) +
kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
tuple(slice(None, None, self.filter_dilation[i])
for i in range(self.convdim))]
if self.filter_flip:
kern = kern.transpose(axes_order)[flip_filters]
else:
kern = kern.transpose(axes_order)
kern = kern[flip_kern]
o[0] = node.outputs[0].type.filter(kern)
def connection_pattern(self, node):
......@@ -2203,15 +2394,24 @@ class AbstractConv_gradWeights(BaseAbstractConv):
# from the shapes of inputs.
imshp = input_shapes[0]
topshp = input_shapes[1]
kshp = self.kshp[:] if self.kshp is not None else [None] * (2 + self.convdim)
if self.num_groups > 1:
fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
if self.kshp:
kshp = self.kshp
else:
if self.unshared:
kshp = [None] * (2 + 2 * self.convdim)
else:
kshp = [None] * (2 + self.convdim)
if self.unshared:
fallback_kshp = ([topshp[1], topshp[2], topshp[3], imshp[1] // self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)])
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + 2 * self.convdim)]
else:
fallback_kshp = ([topshp[1], imshp[1]] +
fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)])
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + self.convdim)]
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + self.convdim)]
return [kshp]
......@@ -2232,14 +2432,16 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv2d_gradWeights, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads):
bottom, top = inp[:2]
......@@ -2249,16 +2451,18 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(weights,
top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights,
top,
bottom.shape[-2:])
d_top = AbstractConv2d(self.imshp,
self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......@@ -2350,14 +2554,16 @@ class AbstractConv_gradInputs(BaseAbstractConv):
subsample=None,
filter_flip=True,
filter_dilation=None,
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv_gradInputs, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
# Update shape/height_width
def make_node(self, kern, topgrad, shape, add_assert_shape=True):
......@@ -2370,10 +2576,19 @@ class AbstractConv_gradInputs(BaseAbstractConv):
broadcastable=topgrad.broadcastable)
topgrad = gtype.filter_variable(topgrad)
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if self.unshared:
if self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
elif kern.type.ndim != 2 + 2 * self.convdim:
raise TypeError('kern must be %dD tensor for unshared convolution'
% (2 + 2 * self.convdim))
else:
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if topgrad.type.ndim != 2 + self.convdim:
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
if add_assert_shape:
kern = assert_shape(kern, self.kshp,
......@@ -2386,7 +2601,7 @@ class AbstractConv_gradInputs(BaseAbstractConv):
False] + ([False] * self.convdim)
else:
broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1]] + ([False] * self.convdim)
kern.type.broadcastable[-self.convdim - 1]] + ([False] * self.convdim)
output = kern.type.clone(broadcastable=broadcastable)()
return Apply(self, [kern, topgrad, shape], [output])
......@@ -2403,9 +2618,12 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
fallback_imshp = ([topgrad.shape[0], kern.shape[1]] +
fallback_imshp = ([topgrad.shape[0], kern.shape[-self.convdim - 1]] +
[shape[i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)]
......@@ -2419,8 +2637,9 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'has shape {}'.format(tuple(expected_topgrad_shape),
tuple(topgrad.shape)))
dil_kernshp = tuple((kern.shape[i + 2] - 1) * self.filter_dilation[i] + 1
dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
pad = (0,) * self.convdim
if mode == "full":
pad = tuple(dil_kernshp[i] - 1 for i in range(self.convdim))
......@@ -2438,25 +2657,54 @@ class AbstractConv_gradInputs(BaseAbstractConv):
for i in range(self.convdim))] = topgrad
topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
if self.unshared:
# Expand regions in kernel to correct for subsampling
exp_kern_shp = kern.shape[:1] + topgrad.shape[2:] + kern.shape[1 + self.convdim:]
exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
exp_kern[(slice(None),) +
tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
(slice(None),) * (self.convdim + 1)] = kern
kern = exp_kern
def correct_for_groups(mat):
mshp0 = mat.shape[0] // self.num_groups
mshp1 = mat.shape[1] * self.num_groups
mshp1 = mat.shape[-self.convdim - 1] * self.num_groups
mat = mat.reshape((self.num_groups, mshp0) + mat.shape[1:])
mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))
mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])
if self.unshared:
# for 2D -> (1, 2, 3, 0, 4, 5, 6)
mat = mat.transpose(tuple(range(1, 2 + self.convdim)) + (0,) +
tuple(range(2 + self.convdim, mat.ndim)))
mat = mat.reshape((mshp0,) + mat.shape[1:1 + self.convdim] + (mshp1,) + mat.shape[-self.convdim:])
else:
mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))
mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])
return mat
kern = correct_for_groups(kern)
kern = kern.transpose(axes_order)
if self.filter_flip:
topgrad = topgrad[flip_filters]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation, num_groups=self.num_groups)
if self.filter_flip:
img = img[flip_filters]
if self.unshared:
# from (nFilters, out_rows, out_cols, nChannels, kH, kW)
# to (nChannels, nFilters, out_rows, out_cols, kH, kW)
axes_order = (1 + self.convdim, 0,) + tuple(range(1, 1 + self.convdim)) + \
tuple(range(2 + self.convdim, kern.ndim))
kern = kern.transpose(axes_order)
if not self.filter_flip:
kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
(slice(None, None, -1),) * self.convdim]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
num_groups=self.num_groups, unshared=True, direction="backprop inputs")
else:
axes_order = (1, 0) + tuple(range(2, 2 + self.convdim))
kern = kern.transpose(axes_order)
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
if self.filter_flip:
topgrad = topgrad[flip_filters]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
num_groups=self.num_groups)
if self.filter_flip:
img = img[flip_filters]
if any(p > 0 for p in pad):
img = img[(slice(None), slice(None)) +
tuple(slice(pad[i], img.shape[i + 2] - pad[i])
......@@ -2475,10 +2723,10 @@ class AbstractConv_gradInputs(BaseAbstractConv):
topshp = input_shapes[1]
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
if self.num_groups > 1:
fallback_imshp = ([topshp[0], kshp[1] * self.num_groups] +
fallback_imshp = ([topshp[0], kshp[-self.convdim - 1] * self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)])
else:
fallback_imshp = ([topshp[0], kshp[1]] +
fallback_imshp = ([topshp[0], kshp[-self.convdim - 1]] +
[node.inputs[2][i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)]
......@@ -2503,14 +2751,16 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv2d_gradInputs, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads):
weights, top = inp[:2]
......@@ -2520,7 +2770,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(
self.num_groups,
self.unshared)(
bottom, top,
weights.shape[-2:])
d_top = AbstractConv2d(self.imshp, self.kshp,
......@@ -2528,7 +2779,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......
......@@ -107,7 +107,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int dilW = 1,
const int padH = 0,
const int padW = 0,
const int numgroups = 1)
const int numgroups = 1,
const int unshared = 0)
{
if (PyArray_NDIM(bottom) != 4)
{
......@@ -120,9 +121,9 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
return NULL;
}
if (PyArray_NDIM(weight) != 4)
if (PyArray_NDIM(weight) != (unshared ? 6 : 4))
{
PyErr_SetString(PyExc_ValueError, "CorrMM requires weight of 4D");
PyErr_Format(PyExc_ValueError, "CorrMM requires weight of %%dD", unshared ? 6 : 4);
return NULL;
}
if (PyArray_TYPE(weight) != %(float_typenum)s)
......@@ -152,11 +153,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int nChannels = PyArray_DIMS(bottom)[1];
const int bottomHeight = PyArray_DIMS(bottom)[2];
const int bottomWidth = PyArray_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns)
// normal weights: (nFilters, nChannels, rows, columns)
// unshared weights: (nFilters, topHeight, topWidth, nChannels, rows, columns)
const int nFilters = PyArray_DIMS(weight)[0];
const int kH = PyArray_DIMS(weight)[2];
const int kW = PyArray_DIMS(weight)[3];
if (nChannels != (PyArray_DIMS(weight)[1] * numgroups)) {
const int kH = PyArray_DIMS(weight)[unshared ? 4 : 2];
const int kW = PyArray_DIMS(weight)[unshared ? 5 : 3];
if (nChannels != PyArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
PyErr_SetString(PyExc_ValueError,
"CorrMM images and kernel must have the same stack size\n");
return NULL;
......@@ -179,22 +181,57 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
if (unshared) {
if (topHeight != PyArray_DIMS(weight)[1] ||
topWidth != PyArray_DIMS(weight)[2]) {
PyErr_Format(PyExc_ValueError,
"CorrMM regions in kernel must match output regions:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%ld %%ld %%d %%d %%d"
" (expected %%d %%d %%d %%d %%d %%d)\n"
" top shape(calculated): %%d %%d %%d %%d\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, PyArray_DIMS(weight)[1],
PyArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
else {
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
// Create temporary columns
int max_threads = %(omp_get_max_threads)s;
......@@ -230,8 +267,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int group_col_stride = (K_ * N_);
const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
const int M_ = nFilters / numgroups;
const int one_int = 1;
const %(c_float_type)s one = 1.0;
const %(c_float_type)s zero = 0.0;
const int ldw = (K_ * N_);
char NTrans = 'N';
char Trans = 'T';
PyArrayObject *output;
......@@ -266,15 +305,30 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels,
bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
for ( int g = 0; g < numgroups; ++g){
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
// Second, gemm
if (unshared) {
for (int g = 0; g < numgroups; ++g) {
for (int reg = 0; reg < N_; ++reg) {
%(gemv)s(&Trans, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride + reg, &N_);
}
}
}
else {
for ( int g = 0; g < numgroups; ++g){
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
}
}
}
// Restore to previous blas threads
......@@ -316,7 +370,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
output = weight;
npy_intp weight_dim[2];
weight_dim[0] = (npy_intp)max_threads;
weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
if (unshared)
weight_dim[1] = (npy_intp)(M_ * N_ * K_ * numgroups);
else
weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
weight_dim, PyArray_TYPE(weight), 0);
......@@ -341,26 +398,46 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
for(int g = 0; g < numgroups; ++g){
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &N_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
tid * weight_dim[1], &K_);
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
if (unshared) {
for (int g = 0; g < numgroups; ++g) {
for (int reg = 0; reg < N_; ++reg) {
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &one_int,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + reg * K_ +
tid * weight_dim[1], &ldw);
}
}
}
else {
for(int g = 0; g < numgroups; ++g){
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &N_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
tid * weight_dim[1], &K_);
}
}
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
//aggregate weights
memset((%(float_type)s*)PyArray_DATA(weight), 0, M_ * K_*sizeof(%(float_type)s));
memset((%(float_type)s*)PyArray_DATA(weight), 0, weight_dim[1]*sizeof(%(float_type)s));
/*
* Put index "j" into outer loop to get the
* correct result when openmp is used.
......@@ -416,15 +493,29 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s;
for ( int g = 0;g < numgroups; ++g){
// gemm into columns
%(gemm)s(&NTrans, &Trans,
&N_, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
if (unshared) {
for (int g = 0; g < numgroups; ++g){
for (int reg = 0; reg < N_; ++reg){
%(gemm)s(&NTrans, &Trans,
&one_int, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_);
}
}
}
else {
for (int g = 0; g < numgroups; ++g) {
%(gemm)s(&NTrans, &Trans,
&N_, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
}
}
// col2im back to the data
col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth,
......@@ -474,4 +565,3 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
// in here output is just aliased to one of bottom, weights, or top.
return output;
}
......@@ -8,7 +8,7 @@ import theano
from theano import Apply
from theano import gof
from theano.gof import ParamsType, EnumList
from theano.scalar import int64
from theano.scalar import int64, int8
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor import blas_headers
......@@ -42,9 +42,11 @@ class BaseCorrMM(gof.OpenMPOp):
Perform dilated correlation (default: (1,1))
num_groups
Perform grouped convolutions (default: 1)
unshared
Perform unshared correlation (default: False)
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
_direction = None
......@@ -54,10 +56,10 @@ class BaseCorrMM(gof.OpenMPOp):
dH=int64, dW=int64,
dilH=int64, dilW=int64,
padH=int64, padW=int64,
num_groups=int64)
num_groups=int64, unshared=int8)
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1, openmp=None):
filter_dilation=(1, 1), num_groups=1, unshared=False, openmp=None):
super(BaseCorrMM, self).__init__(openmp=openmp)
if isinstance(border_mode, integer_types):
if border_mode < 0:
......@@ -85,6 +87,7 @@ class BaseCorrMM(gof.OpenMPOp):
raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
self.unshared = unshared
if not theano.config.blas.ldflags:
# Theano will use a NumPy C implementation of [sd]gemm_ instead.
......@@ -130,12 +133,13 @@ class BaseCorrMM(gof.OpenMPOp):
padW = property(lambda self: self.pad[1])
def __str__(self):
return '%s{%s, %s, %s, %s}' % (
return '%s{%s, %s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation),
str(self.num_groups))
str(self.num_groups),
str(self.unshared))
@staticmethod
def as_common_dtype(in1, in2):
......@@ -179,7 +183,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (7, self.openmp, blas_header_version())
return (9, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -189,12 +193,14 @@ class BaseCorrMM(gof.OpenMPOp):
assert dtype in ('float32', 'float64')
if dtype == 'float32':
sub['gemm'] = 'sgemm_'
sub['gemv'] = 'sgemv_'
sub['float_type'] = 'npy_float'
sub['float_typenum'] = 'NPY_FLOAT'
sub['n_bytes'] = 4
sub['c_float_type'] = 'float'
else:
sub['gemm'] = 'dgemm_'
sub['gemv'] = 'dgemv_'
sub['float_type'] = 'npy_double'
sub['float_typenum'] = 'NPY_DOUBLE'
sub['n_bytes'] = 8
......@@ -287,6 +293,7 @@ class BaseCorrMM(gof.OpenMPOp):
int padH = %(params)s->padH;
int padW = %(params)s->padW;
int numgroups = %(params)s->num_groups;
int unshared = %(params)s->unshared;
PyArrayObject * bottom = %(bottom)s;
PyArrayObject * weights = %(weights)s;
......@@ -310,13 +317,17 @@ class BaseCorrMM(gof.OpenMPOp):
break;
}
int wdim, odim;
wdim = unshared ? 6 : 4;
odim = 4; //Can be set to 6 later for unshared backprop wrt weights
// Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding)
int kH, kW, dil_kH, dil_kW;
if (direction != 1) {
// weight is an input variable, we can just read its shape
kH = PyArray_DIMS(weights)[2];
kW = PyArray_DIMS(weights)[3];
kH = PyArray_DIMS(weights)[wdim-2];
kW = PyArray_DIMS(weights)[wdim-1];
}
else {
if (%(height)s != -1) {
......@@ -370,28 +381,45 @@ class BaseCorrMM(gof.OpenMPOp):
}
// Infer output shape
npy_intp out_dim[4];
npy_intp out_dim[6];
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
if (unshared) {
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
}
else {
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
}
%(fail)s
}
break;
......@@ -399,46 +427,90 @@ class BaseCorrMM(gof.OpenMPOp):
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
out_dim[2] = (npy_intp)kH; // already inferred further above
out_dim[3] = (npy_intp)kW; // how convenient
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
if (unshared){
odim = 6;
out_dim[1] = (npy_intp)PyArray_DIMS(top)[2];
out_dim[2] = (npy_intp)PyArray_DIMS(top)[3];
}
out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
out_dim[wdim-2] = (npy_intp)kH; // already inferred further above
out_dim[wdim-1] = (npy_intp)kW; // how convenient
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
|| out_dim[4] <= 0 || out_dim[5] <= 0){
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups;
out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW);
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
break;
default:
......@@ -448,13 +520,19 @@ class BaseCorrMM(gof.OpenMPOp):
// Prepare output array
int typenum;
if ( !(*out
&& PyArray_NDIM(*out)==4
int failure;
failure = !(*out
&& PyArray_NDIM(*out)==odim
&& PyArray_IS_C_CONTIGUOUS(*out)
&& PyArray_DIMS(*out)[0]==out_dim[0]
&& PyArray_DIMS(*out)[1]==out_dim[1]
&& PyArray_DIMS(*out)[2]==out_dim[2]
&& PyArray_DIMS(*out)[3]==out_dim[3]))
&& PyArray_DIMS(*out)[3]==out_dim[3]);
if (odim == 6){
failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4]
&& PyArray_DIMS(*out)[5]==out_dim[5]);
}
if ( failure )
{
Py_XDECREF(*out);
if (direction != 1) {
......@@ -464,21 +542,29 @@ class BaseCorrMM(gof.OpenMPOp):
typenum = PyArray_TYPE(bottom);
}
//Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
*out = (PyArrayObject*)PyArray_ZEROS(4,
*out = (PyArrayObject*)PyArray_ZEROS(odim,
out_dim,
typenum,
0);
if (NULL == *out)
{
PyErr_Format(PyExc_RuntimeError,
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
if (odim == 4) {
PyErr_Format(PyExc_RuntimeError,
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
}
if (odim == 6) {
PyErr_Format(PyExc_RuntimeError,
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3],
(long long)out_dim[4], (long long)out_dim[5]);
}
%(fail)s
}
}
// Call corrMM code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups );
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
if (out2==NULL){
%(fail)s
}
......@@ -514,6 +600,13 @@ class CorrMM(BaseCorrMM):
The filter dilation operation applied to each input image.
Should be a tuple with 2 elements.
Set to `(1, 1)` to disable filter dilation.
num_groups
Divides the image, kernel and output tensors into num_groups
separate groups, each of which carries out convolutions separately.
Should be an integer.
unshared
Boolean value. If true, then a different filter will be applied to
each region of the input image.
"""
......@@ -525,8 +618,12 @@ class CorrMM(BaseCorrMM):
img, kern = self.as_common_dtype(img, kern)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared is True:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
......@@ -555,13 +652,15 @@ class CorrMM(BaseCorrMM):
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(weights, top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights, top,
bottom.shape[-2:])
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, top,
weights.shape[-2:])
self.num_groups,
self.unshared)(bottom, top,
weights.shape[-2:])
return d_bottom, d_weights
......@@ -595,8 +694,12 @@ class CorrMM_gradWeights(BaseCorrMM):
else:
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
if self.unshared is True:
broadcastable = [topgrad.type.broadcastable[0], False, False,
img.type.broadcastable[1], False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], img.type.broadcastable[1],
False, False]
dtype = img.type.dtype
return Apply(self, [img, topgrad] + height_width,
[TensorType(dtype, broadcastable)()])
......@@ -633,7 +736,10 @@ class CorrMM_gradWeights(BaseCorrMM):
kW = 2 - imshp[1] + (topshp[1] - 1) * dW
else:
kW = imshp[1] + 2 * padW - (topshp[1] - 1) * dW
return [(nkern, ssize, kH, kW)]
if self.unshared is True:
return [(nkern, topshp[0], topshp[1], ssize, kH, kW)]
else:
return [(nkern, ssize, kH, kW)]
def c_code(self, node, nodename, inp, out_, sub):
bottom, top = inp[:2]
......@@ -649,12 +755,14 @@ class CorrMM_gradWeights(BaseCorrMM):
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(weights, top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights, top,
bottom.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
if len(inp) == 4 else ())
return (d_bottom, d_top) + d_height_width
......@@ -684,8 +792,12 @@ class CorrMM_gradInputs(BaseCorrMM):
kern = as_tensor_variable(kern)
topgrad = as_tensor_variable(topgrad)
kern, topgrad = self.as_common_dtype(kern, topgrad)
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared is True:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if shape is None:
......@@ -700,7 +812,7 @@ class CorrMM_gradInputs(BaseCorrMM):
broadcastable = [topgrad.type.broadcastable[0], False,
False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
False, False]
dtype = kern.type.dtype
return Apply(self, [kern, topgrad] + height_width,
......@@ -719,7 +831,7 @@ class CorrMM_gradInputs(BaseCorrMM):
dH, dW = self.subsample
kshp = input_shape[0]
topshp = input_shape[1]
ssize, kshp = kshp[1], list(kshp[2:])
ssize, kshp = kshp[-3], list(kshp[-2:])
ssize = ssize * self.num_groups
bsize, topshp = topshp[0], list(topshp[2:])
height_width = node.inputs[-2:]
......@@ -762,13 +874,15 @@ class CorrMM_gradInputs(BaseCorrMM):
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom,
top,
weights.shape[-2:])
self.num_groups,
self.unshared)(bottom,
top,
weights.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) *
2 if len(inp) == 4 else ())
return (d_weights, d_top) + d_height_width
......
......@@ -82,12 +82,14 @@ def local_abstractconv_gemm(node):
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, kern)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, kern)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -134,12 +136,15 @@ def local_abstractconv_gradweight_gemm(node):
rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, topgrad, shape)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
rval = rval[flip]
rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
copy_stack_trace(node.outputs[0], rval)
......@@ -189,12 +194,14 @@ def local_abstractconv_gradinputs_gemm(node):
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(kern, topgrad,
shape)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(kern, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -242,7 +249,7 @@ def local_conv2d_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return None
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
rval = conv2d(img, kern,
......@@ -270,7 +277,7 @@ def local_conv2d_gradweight_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
if node.op.border_mode == 'valid' and \
......@@ -370,7 +377,7 @@ def local_conv2d_gradinputs_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return None
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
# Conv 3d implementation, needed when subsample > 2
......
......@@ -1744,3 +1744,154 @@ class Separable_conv(unittest.TestCase):
fun = theano.function([x_sym, dfilter_sym, pfilter_sym], sep_op, mode='FAST_RUN')
top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter)
utt.assert_allclose(top, precomp_output)
class TestUnsharedConv(unittest.TestCase):
    """Tests for unshared (locally-connected) 2D convolution.

    An unshared convolution applies a *different* filter at every output
    position.  The kernel therefore carries two extra spatial axes; from the
    indexing below (``kern[:, i, j, ...]`` compared against output position
    ``[:, :, i, j]``) its layout is
    ``(nkern, out_rows, out_cols, nchan_per_group, kH, kW)``.

    Each test checks the unshared op against the ordinary shared-kernel op
    applied one output position at a time, and optionally verifies the
    gradient numerically.
    """

    # Op classes under test; subclasses (e.g. the CorrMM-based test case)
    # override the ``*_op`` attributes to assert which op ends up in the
    # compiled graph.
    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs

    # No graph optimization by default, so the abstract ops stay in the graph.
    mode = theano.compile.mode.Mode(optimizer='None')

    def setUp(self):
        """Define one configuration per test iteration (zipped positionally)."""
        self.img_shape = [(2, 2, 4, 4), (3, 2, 4, 2), (3, 3, 5, 3), (3, 4, 4, 4)]
        self.kern_shape = [(2, 2, 2, 2, 3, 3), (2, 4, 2, 2, 4, 2), (3, 2, 1, 1, 3, 3), (4, 3, 3, 2, 4, 2)]
        self.topgrad_shape = [(2, 2, 2, 2), (3, 2, 4, 2), (3, 3, 2, 1), (3, 4, 3, 3)]
        self.border_mode = ['valid', 'full', 'valid', 'full']
        self.subsample = [(1, 1), (2, 2), (2, 1), (3, 2)]
        self.filter_dilation = (1, 1)
        self.num_groups = [1, 1, 3, 2]
        # self.verify_flags = np.random.choice([True, False], 4, [0.5, 0.5])
        # Above line can be used instead if speed is a concern
        self.verify_flags = [True] * 4
        self.ref_mode = 'FAST_RUN'
        if theano.config.cxx == "":
            raise SkipTest("CorrMM needs cxx")

    def test_fwd(self):
        """Forward pass: unshared output must match the shared op applied
        per output position with that position's own filter slice."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        img_sym = theano.tensor.tensor4('img')
        kern_sym = tensor6('kern')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.border_mode,
                                                          self.subsample, self.num_groups, self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            kern = np.random.random(kshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                           filter_dilation=self.filter_dilation,
                                           num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, kern_sym)
            unshared_func = theano.function([img_sym, kern_sym], unshared_out_sym, mode=self.mode)
            # Make sure the expected op is actually the one compiled.
            assert any([isinstance(node.op, self.conv2d_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(img, kern)

            # Shared-kernel shape: drop the two per-position axes.
            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                      filter_dilation=self.filter_dilation,
                                      num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, ref_kern_sym)
            ref_func = theano.function([img_sym, ref_kern_sym], ref_out_sym, mode=self.mode)

            # Compare position (i, j) of the unshared output against a full
            # shared convolution run with that position's filter.
            for i in range(0, kshp[1]):
                for j in range(0, kshp[2]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    ref_val = ref_func(img, single_kern)
                    utt.assert_allclose(ref_val[:, :, i, j], unshared_output[:, :, i, j])

            if verify:
                utt.verify_grad(unshared_conv_op, [img, kern], mode=self.mode, eps=1)

    def test_gradweight(self):
        """Gradient w.r.t. weights: slice (i, j) of the unshared weight
        gradient must equal the shared gradient computed from a topgrad that
        is zero everywhere except at output position (i, j)."""
        img_sym = theano.tensor.tensor4('img')
        top_sym = theano.tensor.tensor4('top')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, top_sym, tensor.as_tensor_variable(kshp[-2:]))
            unshared_func = theano.function([img_sym, top_sym], unshared_out_sym, mode=self.mode)
            assert any([isinstance(node.op, self.conv2d_gradw_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(img, top)

            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, top_sym, tensor.as_tensor_variable(single_kshp[-2:]))
            ref_func = theano.function([img_sym, top_sym], ref_out_sym, mode=self.mode)

            for i in range(0, topshp[2]):
                for j in range(0, topshp[3]):
                    # Isolate the contribution of output position (i, j).
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output = ref_func(img, top_single)
                    utt.assert_allclose(unshared_output[:, i, j, ...], ref_output)

            def conv_gradweight(inputs_val, output_val):
                # Close over the op and kernel shape for verify_grad.
                return unshared_conv_op(inputs_val, output_val, tensor.as_tensor_variable(kshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)

    def test_gradinput(self):
        """Gradient w.r.t. inputs: the unshared input gradient must equal the
        sum over output positions of the shared gradient using, at each
        position, that position's filter slice and isolated topgrad."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        kern_sym = tensor6('kern')
        top_sym = theano.tensor.tensor4('top')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            single_kshp = kshp[:1] + kshp[3:]

            kern = np.random.random(kshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            unshared_func = theano.function([kern_sym, top_sym], unshared_out_sym, mode=self.mode)
            assert any([isinstance(node.op, self.conv2d_gradi_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(kern, top)

            ref_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(ref_kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            ref_func = theano.function([ref_kern_sym, top_sym], ref_out_sym, mode=self.mode)

            # Accumulate per-position shared gradients; their sum is the
            # unshared input gradient.
            ref_output = np.zeros(imshp)
            for i in range(0, topshp[2]):
                for j in range(0, topshp[3]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output += ref_func(single_kern, top_single)
            utt.assert_allclose(ref_output, unshared_output)

            def conv_gradinputs(filters_val, output_val):
                # Close over the op and image shape for verify_grad.
                return unshared_conv_op(filters_val, output_val, tensor.as_tensor_variable(imshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
......@@ -10,7 +10,7 @@ import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorr2D(utt.InferShapeTester):
......@@ -452,6 +452,16 @@ class TestGroupCorr2d(Grouped_conv_noOptim):
utt.assert_allclose(gconv_output, conv_output)
class TestUnsharedCorr2d(TestUnsharedConv):
    """Run the unshared-convolution test suite against the CorrMM CPU ops.

    Unlike the base class, graph optimization is left enabled so the abstract
    conv ops are rewritten to CorrMM; the ``*_op`` attributes below make the
    toposort assertions check for the CorrMM ops instead.
    """

    # Under FAST_COMPILE the conv rewrites would not run; force FAST_RUN
    # (without the GPU backend) in that case.  ``mode=None`` means "use the
    # configured default mode".
    if theano.config.mode == "FAST_COMPILE":
        mode = theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
    else:
        mode = None

    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs
if __name__ == '__main__':
t = TestCorr2D('setUp')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论