提交 58c3d08e authored 作者: Vikram's avatar Vikram

Unshared convolution python code

Errors fixed. Suggestions implemented. Minor changes Minor changes Gradient calculation added. Test for forward added. Errors fixed Grad wrt weights done. Grad wrt inputs incomplete Grad inp Fix typo Tests and bug fix for Grad Inp Modified flops to raise error Mostly convdim agnostic. Cleaner code for gradInput Some corr changes MiChecks for convdim=2 added. Some more misc changes Unshared code moved into one func Re-added unshared flag to get_conv_output_shape Simpler grad inputs. Unshared removed from get_conv_output_shape. C code changes in corr.py wdim bug fix opt and abstract_conv changes CPU code for fwd and gradWeights. Added tests. Some errors gemv increment fixed. Values for fwd still don't match Forward perfect. Gradweights inverts regions; to be corrected. Added grad inputs and tests but allclose error Python gradInputs simplified Grad input fixed gradweights flipping problem solved Weight dimension order changed. C cache version updated. Docstring changes if unshared is True -> if unshared. Specific error messages for unshared in C code. Unshared tests integrated with AbstractConv. Subsampling errors fixed. Allclose errors with optimiser enabled Kern flip in optimiser fixed. Still some errors Errors fixed GPU corr_gemm code (untested) Unnecessary changes rolled back More GPU code but gemm error 11 Fixed mistakes caused while copying from CPU Errors fixed Fixed error with .data for gpuarray GPU tests Suggestions implemented for error messages Jenkins errors fixed Commits squashed Small errors fixed. Tests need to be rewritten Tests moved to separate class. Mistakes fixed Tests sped up Suggestions implemented. Tests modified
上级 078bdfb1
...@@ -458,13 +458,15 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -458,13 +458,15 @@ class BaseGpuCorrMM(CGpuKernelBase):
num_groups : num_groups :
Divides the image, kernel and output tensors into num_groups Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately (default : 1). separate groups. Each which carry out convolutions separately (default : 1).
unshared
Perform unshared correlation (default: False)
""" """
check_broadcast = False check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups') __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
_f16_ok = True _f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1), def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1): filter_dilation=(1, 1), num_groups=1, unshared=False):
if isinstance(border_mode, integer_types): if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode) border_mode = (border_mode, border_mode)
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
...@@ -487,6 +489,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -487,6 +489,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
raise ValueError("Number of groups should be greater than 0") raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups self.num_groups = num_groups
CGpuKernelBase.__init__(self, ['c_code/corr_gemm.c']) CGpuKernelBase.__init__(self, ['c_code/corr_gemm.c'])
self.unshared = unshared
@property @property
def pad(self): def pad(self):
...@@ -495,12 +498,13 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -495,12 +498,13 @@ class BaseGpuCorrMM(CGpuKernelBase):
return (0, 0) return (0, 0)
def __str__(self): def __str__(self):
return '%s{%s, %s, %s, %s}' % ( return '%s{%s, %s, %s, %s, %s}' % (
self.__class__.__name__, self.__class__.__name__,
self.border_mode, self.border_mode,
str(self.subsample), str(self.subsample),
str(self.filter_dilation), str(self.filter_dilation),
str(self.num_groups)) str(self.num_groups),
str(self.unshared))
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -533,7 +537,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -533,7 +537,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self): def c_code_cache_version(self):
# Raise this whenever modifying the C code (including the file). # Raise this whenever modifying the C code (including the file).
return (10,) return (11,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
""" """
...@@ -581,6 +585,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -581,6 +585,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
dH, dW = self.subsample dH, dW = self.subsample
dilH, dilW = self.filter_dilation dilH, dilW = self.filter_dilation
numgroups = self.num_groups numgroups = self.num_groups
unshared = int(self.unshared)
if self.border_mode == "half": if self.border_mode == "half":
padH = padW = -1 padH = padW = -1
elif self.border_mode == "full": elif self.border_mode == "full":
...@@ -633,19 +638,24 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -633,19 +638,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
int padH = %(padH)s; int padH = %(padH)s;
int padW = %(padW)s; int padW = %(padW)s;
int numgroups = %(numgroups)s; int numgroups = %(numgroups)s;
int unshared = %(unshared)s;
PyGpuArrayObject * bottom = %(bottom)s; PyGpuArrayObject * bottom = %(bottom)s;
PyGpuArrayObject * weights = %(weights)s; PyGpuArrayObject * weights = %(weights)s;
PyGpuArrayObject * top = %(top)s; PyGpuArrayObject * top = %(top)s;
PyGpuArrayObject * out2 = NULL; PyGpuArrayObject * out2 = NULL;
int wdim, odim;
wdim = unshared ? 6 : 4;
odim = 4; //Can be set to 6 later for unshared backprop wrt weights
// Obtain or infer kernel width and height // Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
size_t kH, kW, dil_kH, dil_kW; size_t kH, kW, dil_kH, dil_kW;
if (direction != 1) { if (direction != 1) {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
kH = PyGpuArray_DIMS(weights)[2]; kH = PyGpuArray_DIMS(weights)[wdim-2];
kW = PyGpuArray_DIMS(weights)[3]; kW = PyGpuArray_DIMS(weights)[wdim-1];
} }
else { else {
if (%(height)s != -1) { if (%(height)s != -1) {
...@@ -699,8 +709,10 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -699,8 +709,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
// Infer output shape and type // Infer output shape and type
// The inferred shape can be negative. // The inferred shape can be negative.
long long out_dim[4]; long long out_dim[6];
size_t out_dim_size[4]; size_t out_dim_size[6];
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
out_dim_size[4] = out_dim_size[5] = 0; //Same
int out_typecode; int out_typecode;
PyGpuContextObject *out_context; PyGpuContextObject *out_context;
switch(direction) { switch(direction) {
...@@ -709,71 +721,131 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -709,71 +721,131 @@ class BaseGpuCorrMM(CGpuKernelBase):
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1 // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = PyGpuArray_DIMS(bottom)[0]; out_dim[0] = PyGpuArray_DIMS(bottom)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[0]; out_dim[1] = PyGpuArray_DIMS(weights)[0];
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1; out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1; out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_typecode = bottom->ga.typecode; out_typecode = bottom->ga.typecode;
out_context = bottom->context; out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{ {
PyErr_Format(PyExc_ValueError, if (unshared) {
"GpuCorrMM: impossible output shape\\n" PyErr_Format(PyExc_ValueError,
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n" "GpuCorrMM: impossible output shape\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n", " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], " top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]); PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
%(fail)s PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
else {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
} }
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width) or
// (num_filters, top_height, top_width, num_channels, height, width) -> for unshared
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = PyGpuArray_DIMS(top)[1]; out_dim[0] = PyGpuArray_DIMS(top)[1];
out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups; if (unshared){
out_dim[2] = kH; // already inferred further above odim = 6;
out_dim[3] = kW; // how convenient out_dim[1] = PyGpuArray_DIMS(top)[2];
out_dim[2] = PyGpuArray_DIMS(top)[3];
}
out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups;
out_dim[wdim-2] = kH; // already inferred further above
out_dim[wdim-1] = kW; // how convenient
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
out_context = top->context; out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (unshared) {
{ if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
PyErr_Format(PyExc_ValueError, || out_dim[4] <= 0 || out_dim[5] <= 0){
"GpuCorrMM backprop wrt. weights: impossible output shape\\n" PyErr_Format(PyExc_ValueError,
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n" "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n", " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], " top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
out_dim[0], out_dim[1], out_dim[2], out_dim[3], PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]); out_dim[4], out_dim[5],
%(fail)s PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
} }
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width) // output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0]; out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups; out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH; out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW; out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW;
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
out_context = top->context; out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (unshared) {
{ if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
PyErr_Format(PyExc_ValueError, {
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n" PyErr_Format(PyExc_ValueError,
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n" "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n", " weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
out_dim[0], out_dim[1], out_dim[2], out_dim[3], " top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]); PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
%(fail)s PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
} }
break; break;
default: default:
...@@ -786,12 +858,24 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -786,12 +858,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
out_dim_size[2] = (size_t)out_dim[2]; out_dim_size[2] = (size_t)out_dim[2];
out_dim_size[3] = (size_t)out_dim[3]; out_dim_size[3] = (size_t)out_dim[3];
if (odim == 6) {
out_dim_size[4] = (size_t)out_dim[4];
out_dim_size[5] = (size_t)out_dim[5];
}
// Prepare output array // Prepare output array
if (theano_prep_output(&%(out)s, 4, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0) if (theano_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
{ {
PyErr_Format(PyExc_RuntimeError, if (odim == 4) {
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld", PyErr_Format(PyExc_RuntimeError,
out_dim[0], out_dim[1], out_dim[2], out_dim[3]); "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
}
if (odim == 6) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]);
}
%(fail)s %(fail)s
} }
if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
...@@ -800,7 +884,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -800,7 +884,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
} }
// Call GPU code // Call GPU code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups); out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
if (out2==NULL){ if (out2==NULL){
%(fail)s %(fail)s
} }
...@@ -859,9 +943,9 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -859,9 +943,9 @@ class GpuCorrMM(BaseGpuCorrMM):
""" """
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1): filter_dilation=(1, 1), num_groups=1, unshared=False):
super(GpuCorrMM, self).__init__(border_mode, subsample, super(GpuCorrMM, self).__init__(border_mode, subsample,
filter_dilation, num_groups) filter_dilation, num_groups, unshared)
def make_node(self, img, kern): def make_node(self, img, kern):
ctx_name = infer_context_name(img, kern) ctx_name = infer_context_name(img, kern)
...@@ -869,8 +953,12 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -869,8 +953,12 @@ class GpuCorrMM(BaseGpuCorrMM):
kern = as_gpuarray_variable(kern, ctx_name) kern = as_gpuarray_variable(kern, ctx_name)
if img.type.ndim != 4: if img.type.ndim != 4:
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4: if self.unshared:
raise TypeError('kern must be 4D tensor') if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False] False, False]
...@@ -891,12 +979,14 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -891,12 +979,14 @@ class GpuCorrMM(BaseGpuCorrMM):
d_bottom = GpuCorrMM_gradInputs(self.border_mode, d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)( self.num_groups,
self.unshared)(
weights, top, bottom.shape[-2:]) weights, top, bottom.shape[-2:])
d_weights = GpuCorrMM_gradWeights(self.border_mode, d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)( self.num_groups,
self.unshared)(
bottom, top, weights.shape[-2:]) bottom, top, weights.shape[-2:])
return d_bottom, d_weights return d_bottom, d_weights
...@@ -915,10 +1005,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -915,10 +1005,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
super(GpuCorrMM_gradWeights, self).__init__(border_mode, super(GpuCorrMM_gradWeights, self).__init__(border_mode,
subsample, subsample,
filter_dilation, num_groups) filter_dilation, num_groups,
unshared)
def make_node(self, img, topgrad, shape=None): def make_node(self, img, topgrad, shape=None):
ctx_name = infer_context_name(img, topgrad) ctx_name = infer_context_name(img, topgrad)
...@@ -938,8 +1030,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -938,8 +1030,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], if self.unshared:
False, False] broadcastable = [topgrad.type.broadcastable[0], False, False,
img.type.broadcastable[1], False, False]
else:
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
return Apply(self, [img, topgrad] + height_width, [GpuArrayType(dtype=img.dtype, return Apply(self, [img, topgrad] + height_width, [GpuArrayType(dtype=img.dtype,
context_name=ctx_name, context_name=ctx_name,
broadcastable=broadcastable)()]) broadcastable=broadcastable)()])
...@@ -958,11 +1054,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -958,11 +1054,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
d_bottom = GpuCorrMM_gradInputs(self.border_mode, d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(weights, self.num_groups,
top, self.unshared)(weights,
bottom.shape[-2:]) top,
bottom.shape[-2:])
d_top = GpuCorrMM( d_top = GpuCorrMM(
self.border_mode, self.subsample, self.filter_dilation, self.num_groups)(bottom, weights) self.border_mode, self.subsample, self.filter_dilation, self.num_groups, self.unshared)(bottom, weights)
d_height_width = ( d_height_width = (
theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else () ) * 2 if len(inp) == 4 else ()
...@@ -989,16 +1086,22 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -989,16 +1086,22 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, num_groups) filter_dilation, num_groups,
unshared)
def make_node(self, kern, topgrad, shape=None): def make_node(self, kern, topgrad, shape=None):
ctx_name = infer_context_name(kern, topgrad) ctx_name = infer_context_name(kern, topgrad)
kern = as_gpuarray_variable(kern, ctx_name) kern = as_gpuarray_variable(kern, ctx_name)
topgrad = as_gpuarray_variable(topgrad, ctx_name) topgrad = as_gpuarray_variable(topgrad, ctx_name)
if kern.type.ndim != 4: if self.unshared:
raise TypeError('kern must be 4D tensor') if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if shape is None: if shape is None:
...@@ -1014,7 +1117,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1014,7 +1117,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
broadcastable = [topgrad.type.broadcastable[0], False, broadcastable = [topgrad.type.broadcastable[0], False,
False, False] False, False]
else: else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
False, False] False, False]
return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype, return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype,
context_name=ctx_name, context_name=ctx_name,
...@@ -1034,13 +1137,15 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1034,13 +1137,15 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
d_weights = GpuCorrMM_gradWeights(self.border_mode, d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, self.num_groups,
top, self.unshared)(bottom,
weights.shape[-2:]) top,
weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode, d_top = GpuCorrMM(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, weights) self.num_groups,
self.unshared)(bottom, weights)
d_height_width = ( d_height_width = (
theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else () ) * 2 if len(inp) == 4 else ()
...@@ -1682,7 +1787,7 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1682,7 +1787,7 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
broadcastable = [topgrad.type.broadcastable[0], False, broadcastable = [topgrad.type.broadcastable[0], False,
False, False, False] False, False, False]
else: else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
False, False, False] False, False, False]
return Apply(self, [kern, topgrad] + height_width_depth, return Apply(self, [kern, topgrad] + height_width_depth,
[GpuArrayType(dtype=topgrad.dtype, [GpuArrayType(dtype=topgrad.dtype,
......
...@@ -349,7 +349,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -349,7 +349,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dilW = 1, const size_t dilW = 1,
const size_t padH = 0, const size_t padH = 0,
const size_t padW = 0, const size_t padW = 0,
const size_t numgroups = 1) const size_t numgroups = 1,
const size_t unshared = 0)
{ {
if (PyGpuArray_NDIM(bottom) != 4) if (PyGpuArray_NDIM(bottom) != 4)
{ {
...@@ -368,21 +369,35 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -368,21 +369,35 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
if (PyGpuArray_NDIM(weight) != 4) if (PyGpuArray_NDIM(weight) != (unshared ? 6 : 4))
{ {
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D"); PyErr_Format(PyExc_ValueError, "GpuCorrMM requires weight of %dD", unshared ? 6 : 4);
return NULL; return NULL;
} }
if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga)) if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
{ {
PyErr_Format(PyExc_ValueError, if (unshared) {
"GpuCorrMM requires weight to be C-contiguous, " PyErr_Format(PyExc_ValueError,
"but strides are: %ld %ld %ld %ld\n", "GpuCorrMM requires weight to be C-contiguous, "
PyGpuArray_STRIDES(weight)[0], "but strides are: %ld %ld %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[1], PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[2], PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[3]); PyGpuArray_STRIDES(weight)[2],
return NULL; PyGpuArray_STRIDES(weight)[3],
PyGpuArray_STRIDES(weight)[4],
PyGpuArray_STRIDES(weight)[5]);
return NULL;
}
else {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM requires weight to be C-contiguous, "
"but strides are: %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[2],
PyGpuArray_STRIDES(weight)[3]);
return NULL;
}
} }
if (PyGpuArray_NDIM(top) != 4) if (PyGpuArray_NDIM(top) != 4)
...@@ -409,10 +424,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -409,10 +424,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2]; const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3]; const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns) // weights: (nFilters, nChannels, rows, columns)
// or (nFilters, out_rows, out_columns, nChannels, rows, columns) -> for unshared
const size_t nFilters = PyGpuArray_DIMS(weight)[0]; const size_t nFilters = PyGpuArray_DIMS(weight)[0];
const size_t kH = PyGpuArray_DIMS(weight)[2];
const size_t kW = PyGpuArray_DIMS(weight)[3]; const size_t kH = PyGpuArray_DIMS(weight)[unshared ? 4 : 2];
if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) { const size_t kW = PyGpuArray_DIMS(weight)[unshared ? 5 : 3];
if (nChannels != PyGpuArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n"); "GpuCorrMM images and kernel must have the same stack size\n");
return NULL; return NULL;
...@@ -435,21 +452,56 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -435,21 +452,56 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1; const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1; const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV #undef _CONV_FLOORDIV
if (batchSize != PyGpuArray_DIMS(top)[0] || if (unshared) {
nFilters != PyGpuArray_DIMS(top)[1] || if (topHeight != PyGpuArray_DIMS(weight)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] || topWidth != PyGpuArray_DIMS(weight)[2]) {
topWidth != PyGpuArray_DIMS(top)[3]) { PyErr_Format(PyExc_ValueError,
PyErr_Format(PyExc_ValueError, "GpuCorrMM regions in kernel must match output regions:\n"
"GpuCorrMM shape inconsistency:\n" " bottom shape: %ld %ld %ld %ld\n"
" bottom shape: %ld %ld %ld %ld\n" " weight shape: %ld %ld %ld %ld %ld %ld"
" weight shape: %ld %ld %ld %ld\n" " (expected %ld %ld %ld %ld %ld %ld)\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n", " top shape(calculated): %ld %ld %ld %ld\n",
batchSize, nChannels, bottomHeight, bottomWidth, batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW, nFilters, PyGpuArray_DIMS(weight)[1],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], PyGpuArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
batchSize, nFilters, topHeight, topWidth); batchSize, nFilters, topHeight, topWidth);
return NULL; return NULL;
}
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
else{
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
} }
int err = gpublas_setup(bottom->context->ctx); int err = gpublas_setup(bottom->context->ctx);
...@@ -512,19 +564,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -512,19 +564,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
// Second, gemm // Second, gemm
for (size_t g = 0; g < numgroups; g++){ if (unshared) {
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans, for (size_t g = 0; g < numgroups; ++g) {
N_, M_, K_, 1, for (size_t reg = 0; reg < N_; ++reg){
&col->ga, g * group_col_stride, N_, err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
&weight->ga, g * group_weight_stride, K_, 1, M_, K_, 1,
0, &col->ga, g * group_col_stride + reg, N_,
&top->ga, n * batch_top_stride + g * group_top_stride, N_); &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride + reg, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
} }
if (err != GA_NO_ERROR) { else {
PyErr_Format(PyExc_RuntimeError, for (size_t g = 0; g < numgroups; ++g){
"GpuCorrMM forward encountered an error running gemm: %d", err); err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
Py_DECREF(col); N_, M_, K_, 1,
return NULL; &col->ga, g * group_col_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
} }
} }
} }
...@@ -557,19 +627,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -557,19 +627,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0 // Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This // for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.) // is faster than setting weight to all zeros before the loop.)
for(size_t g = 0; g < numgroups; g++){ if (unshared) {
err = rgemm(cb_fortran, cb_trans, cb_no_trans, for (size_t g = 0; g < numgroups; ++g) {
K_, M_, N_, 1, for (size_t reg = 0; reg < N_; ++reg){
&col->ga, g * group_col_stride, N_, err = rgemm(cb_fortran, cb_trans, cb_no_trans,
&top->ga, n * batch_top_stride + g * group_top_stride, N_, K_, M_, 1, 1,
(n == 0) ? 0 : 1, &col->ga, g * group_col_stride + reg, N_,
&weight->ga, g * group_weight_stride, K_); &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride + reg * K_, K_ * N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
} }
if (err != GA_NO_ERROR) { else{
PyErr_Format(PyExc_RuntimeError, for(size_t g = 0; g < numgroups; g++){
"GpuCorrMM grad weights encountered an error running gemm: %d", err); err = rgemm(cb_fortran, cb_trans, cb_no_trans,
Py_DECREF(col); K_, M_, N_, 1,
return NULL; &col->ga, g * group_col_stride, N_,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride, K_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
} }
} }
} }
...@@ -590,19 +678,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -590,19 +678,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
for(size_t g = 0; g < numgroups; g++){ if (unshared) {
err = rgemm(cb_fortran, cb_no_trans, cb_trans, for (size_t g = 0; g < numgroups; ++g){
N_, K_, M_, 1, for (size_t reg = 0; reg < N_; ++reg) {
&top->ga, n * batch_top_stride + g * group_top_stride, N_, err = rgemm(cb_fortran, cb_no_trans, cb_trans,
&weight->ga, g * group_weight_stride, K_, 1, K_, M_, 1,
0, &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
&col->ga, g * group_col_stride, N_); &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
0,
&col->ga, g * group_col_stride + reg, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
} }
if (err != GA_NO_ERROR) { else {
PyErr_Format(PyExc_RuntimeError, for (size_t g = 0; g < numgroups; ++g){
"GpuCorrMM grad inputs encountered an error running gemm: %d", err); err = rgemm(cb_fortran, cb_no_trans, cb_trans,
Py_DECREF(col); N_, K_, M_, 1,
return NULL; &top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&col->ga, g * group_col_stride, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
} }
// col2im back to the data // col2im back to the data
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth, err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
......
...@@ -3035,6 +3035,9 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3035,6 +3035,9 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if version(raises=False) < 6000 and op.filter_dilation != (1, 1): if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
return None return None
if op.unshared:
return None
inp1 = inputs[0] inp1 = inputs[0]
inp2 = inputs[1] inp2 = inputs[1]
...@@ -3129,6 +3132,8 @@ def local_abstractconv_cudnn(node): ...@@ -3129,6 +3132,8 @@ def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType): if not isinstance(node.inputs[0].type, GpuArrayType):
return return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d): if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d): elif isinstance(node.op, AbstractConv3d):
...@@ -3143,6 +3148,8 @@ def local_abstractconv_cudnn_alt(node): ...@@ -3143,6 +3148,8 @@ def local_abstractconv_cudnn_alt(node):
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1): if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
return None return None
if node.op.unshared:
return None
inp1 = node.inputs[0] inp1 = node.inputs[0]
inp2 = node.inputs[1] inp2 = node.inputs[1]
...@@ -3349,6 +3356,8 @@ def local_abstractconv_gw_cudnn(node): ...@@ -3349,6 +3356,8 @@ def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType): if not isinstance(node.inputs[0].type, GpuArrayType):
return return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights): if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights): elif isinstance(node.op, AbstractConv3d_gradWeights):
...@@ -3360,6 +3369,8 @@ def local_abstractconv_gi_cudnn(node): ...@@ -3360,6 +3369,8 @@ def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType): if not isinstance(node.inputs[0].type, GpuArrayType):
return return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs): if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs): elif isinstance(node.op, AbstractConv3d_gradInputs):
......
...@@ -1595,12 +1595,17 @@ def local_abstractconv_gemm(node): ...@@ -1595,12 +1595,17 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
filter_dilation = node.op.filter_dilation filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1): flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim))
if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared):
if not node.op.filter_flip: if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] kern = kern[flip]
# need to dimshuffle the kernel for full convolution # need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(kern_axes)
# call GpuCorrMM_gradInputs # call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', rval = GpuCorrMM_gradInputs('valid',
subsample, subsample,
...@@ -1609,13 +1614,14 @@ def local_abstractconv_gemm(node): ...@@ -1609,13 +1614,14 @@ def local_abstractconv_gemm(node):
else: else:
# need to flip the kernel if necessary # need to flip the kernel if necessary
if node.op.filter_flip: if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] kern = kern[flip]
# By default use GpuCorrMM # By default use GpuCorrMM
rval = GpuCorrMM(border_mode, rval = GpuCorrMM(border_mode,
subsample, subsample,
filter_dilation, filter_dilation,
node.op.num_groups)(gpu_contiguous(img), num_groups,
gpu_contiguous(kern)) unshared)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good # call GpuCorrMM_gradWeights if good
# (the latter is faster if batchsize * kernelHeight * kernelWidth # (the latter is faster if batchsize * kernelHeight * kernelWidth
...@@ -1628,11 +1634,12 @@ def local_abstractconv_gemm(node): ...@@ -1628,11 +1634,12 @@ def local_abstractconv_gemm(node):
(node.op.kshp is not None) and (node.op.kshp is not None) and
(None not in node.op.kshp) and (None not in node.op.kshp) and
border_mode != "half" and border_mode != "half" and
node.op.num_groups == 1): num_groups == 1 and
not unshared):
# we know the kernel and output size # we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1] prod1 = node.op.kshp[0] * node.op.kshp[-3]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) * prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1)) (node.op.imshp[-1] - node.op.kshp[-3] + 1))
if (None not in node.op.imshp[:1]): if (None not in node.op.imshp[:1]):
# we also know batchsize and input channels # we also know batchsize and input channels
prod1 *= node.op.imshp[0] prod1 *= node.op.imshp[0]
...@@ -1641,7 +1648,8 @@ def local_abstractconv_gemm(node): ...@@ -1641,7 +1648,8 @@ def local_abstractconv_gemm(node):
if prod1 > prod2: if prod1 > prod2:
rval = GpuCorrMM_gradWeights(border_mode, rval = GpuCorrMM_gradWeights(border_mode,
subsample, subsample,
filter_dilation)( filter_dilation,
unshared)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)), gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))) gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
# (we need to wrap the result in as_gpuarray_variable, # (we need to wrap the result in as_gpuarray_variable,
...@@ -1690,8 +1698,9 @@ def local_abstractconv_gemm_alt(node): ...@@ -1690,8 +1698,9 @@ def local_abstractconv_gemm_alt(node):
subsample = node.op.subsample subsample = node.op.subsample
filter_dilation = node.op.filter_dilation filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1: if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip: if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
...@@ -1702,7 +1711,7 @@ def local_abstractconv_gemm_alt(node): ...@@ -1702,7 +1711,7 @@ def local_abstractconv_gemm_alt(node):
gpu_contiguous(kern), gpu_contiguous(img)) gpu_contiguous(kern), gpu_contiguous(img))
elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1): num_groups == 1 and not unshared):
if node.op.filter_flip: if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
...@@ -1896,10 +1905,13 @@ def local_abstractconv_gradweights_gemm(node): ...@@ -1896,10 +1905,13 @@ def local_abstractconv_gradweights_gemm(node):
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode, rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation, filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)( num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape) gpu_contiguous(img), gpu_contiguous(topgrad), shape)
flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
if node.op.filter_flip: if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1] rval = rval[flip]
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable) rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx) rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval] return [rval]
...@@ -1918,9 +1930,10 @@ def local_abstractconv_gemm_gradweights_alt(node): ...@@ -1918,9 +1930,10 @@ def local_abstractconv_gemm_gradweights_alt(node):
subsample = node.op.subsample subsample = node.op.subsample
filter_dilation = node.op.filter_dilation filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups num_groups = node.op.num_groups
unshared = node.op.unshared
if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1): num_groups == 1 and not unshared):
rval = GpuCorrMM(border_mode, rval = GpuCorrMM(border_mode,
subsample, subsample,
filter_dilation)( filter_dilation)(
...@@ -2001,12 +2014,15 @@ def local_abstractconv_gradinputs_gemm(node): ...@@ -2001,12 +2014,15 @@ def local_abstractconv_gradinputs_gemm(node):
return None return None
if node.op.filter_flip: if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode, rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation, filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)( num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape) gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval] return [rval]
...@@ -2023,8 +2039,9 @@ def local_abstractconv_gradinputs_gemm_alt(node): ...@@ -2023,8 +2039,9 @@ def local_abstractconv_gradinputs_gemm_alt(node):
subsample = node.op.subsample subsample = node.op.subsample
filter_dilation = node.op.filter_dilation filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1: if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip: if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
......
...@@ -8,10 +8,10 @@ from theano.tests import unittest_tools as utt ...@@ -8,10 +8,10 @@ from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
from ..type import gpuarray_shared_constructor from theano.gpuarray.type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs from theano.gpuarray.blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu, ref_cast from config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorrMM(unittest.TestCase): class TestCorrMM(unittest.TestCase):
...@@ -20,9 +20,13 @@ class TestCorrMM(unittest.TestCase): ...@@ -20,9 +20,13 @@ class TestCorrMM(unittest.TestCase):
border_mode='valid', border_mode='valid',
filter_dilation=(1, 1), filter_dilation=(1, 1),
subsample=(1, 1), subsample=(1, 1),
unshared=False,
verify_grad=False): verify_grad=False):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)] inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)] if unshared:
filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
else:
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = np.random.random(inputs_shape).astype(config.floatX) inputs_val = np.random.random(inputs_shape).astype(config.floatX)
filters_val = np.random.random(filters_shape).astype(config.floatX) filters_val = np.random.random(filters_shape).astype(config.floatX)
...@@ -32,13 +36,15 @@ class TestCorrMM(unittest.TestCase): ...@@ -32,13 +36,15 @@ class TestCorrMM(unittest.TestCase):
conv_ref = CorrMM(border_mode=border_mode, conv_ref = CorrMM(border_mode=border_mode,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
subsample=subsample)(ref_cast(inputs), subsample=subsample,
ref_cast(filters)) unshared=unshared)(ref_cast(inputs),
ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu) f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorrMM(border_mode=border_mode, conv = GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters) subsample=subsample,
unshared=unshared)(inputs, filters)
f = theano.function([], conv, mode=mode_with_gpu) f = theano.function([], conv, mode=mode_with_gpu)
res_ref = f_ref() res_ref = f_ref()
...@@ -48,7 +54,8 @@ class TestCorrMM(unittest.TestCase): ...@@ -48,7 +54,8 @@ class TestCorrMM(unittest.TestCase):
if verify_grad: if verify_grad:
utt.verify_grad(GpuCorrMM(border_mode=border_mode, utt.verify_grad(GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
subsample=subsample), subsample=subsample,
unshared=unshared),
[inputs_val, filters_val], mode=mode_with_gpu) [inputs_val, filters_val], mode=mode_with_gpu)
def test_valid(self): def test_valid(self):
...@@ -57,12 +64,6 @@ class TestCorrMM(unittest.TestCase): ...@@ -57,12 +64,6 @@ class TestCorrMM(unittest.TestCase):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1), self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1), filters_shape=(10, 6, 12, 1),
subsample=(2, 2)) subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(3, 3))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1), self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1), filters_shape=(10, 6, 12, 1),
subsample=(3, 3)) subsample=(3, 3))
...@@ -117,6 +118,41 @@ class TestCorrMM(unittest.TestCase): ...@@ -117,6 +118,41 @@ class TestCorrMM(unittest.TestCase):
border_mode=border_mode, border_mode=border_mode,
verify_grad=True) verify_grad=True)
def test_unshared(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 8, 1, 6, 12, 1),
subsample=(2, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 3), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
subsample=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode='valid', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 13, 6, 12, 1),
border_mode='half', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 25, 23, 6, 12, 1),
border_mode='full', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode=(0, 0), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 17, 5, 6, 12, 1),
border_mode=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 5, 6, 12, 1),
border_mode=(3, 2), unshared=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1)): subsample=(1, 1)):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)] inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
...@@ -227,3 +263,17 @@ class TestGroupGpuCorr2d(Grouped_conv_noOptim): ...@@ -227,3 +263,17 @@ class TestGroupGpuCorr2d(Grouped_conv_noOptim):
conv_op = GpuCorrMM conv_op = GpuCorrMM
conv_gradw_op = GpuCorrMM_gradWeights conv_gradw_op = GpuCorrMM_gradWeights
conv_gradi_op = GpuCorrMM_gradInputs conv_gradi_op = GpuCorrMM_gradInputs
flip_filter = True
is_dnn = False
class TestUnsharedGpuCorr2d(TestUnsharedConv):
mode = theano.compile.get_mode("FAST_RUN")
conv2d = GpuCorrMM
conv2d_gradw = GpuCorrMM_gradWeights
conv2d_gradi = GpuCorrMM_gradInputs
conv2d_op = GpuCorrMM
conv2d_gradw_op = GpuCorrMM_gradWeights
conv2d_gradi_op = GpuCorrMM_gradInputs
flip_filter = True
is_dnn = False
...@@ -37,7 +37,7 @@ from .abstract_conv import separable_conv2d ...@@ -37,7 +37,7 @@ from .abstract_conv import separable_conv2d
def conv2d(input, filters, input_shape=None, filter_shape=None, def conv2d(input, filters, input_shape=None, filter_shape=None,
border_mode='valid', subsample=(1, 1), filter_flip=True, border_mode='valid', subsample=(1, 1), filter_flip=True,
image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs): image_shape=None, filter_dilation=(1, 1), num_groups=1, unshared=False, **kwargs):
""" """
This function will build the symbolic graph for convolving a mini-batch of a This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled stack of 2D inputs with a set of 2D filters. The implementation is modelled
...@@ -51,18 +51,22 @@ def conv2d(input, filters, input_shape=None, filter_shape=None, ...@@ -51,18 +51,22 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
(batch size, input channels, input rows, input columns). (batch size, input channels, input rows, input columns).
See the optional parameter ``input_shape``. See the optional parameter ``input_shape``.
filters: symbolic 4D tensor filters: symbolic 4D or 6D tensor
Set of filters used in CNN layer of shape Set of filters used in CNN layer of shape
(output channels, input channels, filter rows, filter columns). (output channels, input channels, filter rows, filter columns)
for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns)
for unshared convolution.
See the optional parameter ``filter_shape``. See the optional parameter ``filter_shape``.
input_shape: None, tuple/list of len 4 of int or Constant variable input_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the input parameter. The shape of the input parameter.
Optional, possibly used to choose an optimal implementation. Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this You can give ``None`` for any element of the list to specify that this
element is not known at compile time. element is not known at compile time.
filter_shape: None, tuple/list of len 4 of int or Constant variable filter_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the filters parameter. The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation. Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this You can give ``None`` for any element of the list to specify that this
...@@ -105,6 +109,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None, ...@@ -105,6 +109,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
Divides the image, kernel and output tensors into num_groups Divides the image, kernel and output tensors into num_groups
separate groups, each of which carries out convolutions separately. separate groups, each of which carries out convolutions separately.
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different kernel will be used for each region of the
input.
kwargs: Any other keyword arguments are accepted for backwards kwargs: Any other keyword arguments are accepted for backwards
compatibility, but will be ignored. compatibility, but will be ignored.
...@@ -154,12 +163,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None, ...@@ -154,12 +163,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
return abstract_conv2d(input, filters, input_shape, filter_shape, return abstract_conv2d(input, filters, input_shape, filter_shape,
border_mode, subsample, filter_flip, border_mode, subsample, filter_flip,
filter_dilation, num_groups) filter_dilation, num_groups, unshared)
def conv2d_transpose(input, filters, output_shape, filter_shape=None, def conv2d_transpose(input, filters, output_shape, filter_shape=None,
border_mode='valid', input_dilation=(1, 1), border_mode='valid', input_dilation=(1, 1),
filter_flip=True, filter_dilation=(1, 1), num_groups=1): filter_flip=True, filter_dilation=(1, 1), num_groups=1, unshared=False):
""" """
This function will build the symbolic graph for applying a transposed This function will build the symbolic graph for applying a transposed
convolution over a mini-batch of a stack of 2D inputs with a set of 2D convolution over a mini-batch of a stack of 2D inputs with a set of 2D
...@@ -215,6 +224,11 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None, ...@@ -215,6 +224,11 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
Divides the image, kernel and output tensors into num_groups Divides the image, kernel and output tensors into num_groups
separate groups, each of which carries out convolutions separately. separate groups, each of which carries out convolutions separately.
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different kernel will be used for each region of the
input.
Returns Returns
------- -------
Symbolic 4D tensor Symbolic 4D tensor
...@@ -242,4 +256,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None, ...@@ -242,4 +256,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
subsample=input_dilation, subsample=input_dilation,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
...@@ -44,9 +44,13 @@ def get_conv_output_shape(image_shape, kernel_shape, ...@@ -44,9 +44,13 @@ def get_conv_output_shape(image_shape, kernel_shape,
to: batch size, number of input channels, height and width (and to: batch size, number of input channels, height and width (and
possibly depth) of the image. None where undefined. possibly depth) of the image. None where undefined.
kernel_shape: tuple of int (symbolic or numeric) corresponding to the kernel_shape: tuple of int (symbolic or numeric) corresponding to the
kernel shape. Its four (or five) elements must correspond respectively kernel shape. For a normal convolution, its four (or five) elements
to: number of output channels, number of input channels, height and must correspond respectively to : number of output channels, number of
width (and possibly depth) of the kernel. None where undefined. input channels, height and width (and possibly depth) of the kernel.
For an unshared convolution, its six elements must correspond to:
number of output channels, height and width
of the output, number of input channels, height and width of the kernel.
None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'. or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond If it is a tuple, its two (or three) elements respectively correspond
...@@ -65,7 +69,10 @@ def get_conv_output_shape(image_shape, kernel_shape, ...@@ -65,7 +69,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
""" """
bsize, imshp = image_shape[0], image_shape[2:] bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:]
convdim = len(image_shape) - 2
nkern, kshp = kernel_shape[0], kernel_shape[-convdim:]
if filter_dilation is None: if filter_dilation is None:
filter_dilation = np.ones(len(subsample), dtype='int') filter_dilation = np.ones(len(subsample), dtype='int')
...@@ -139,7 +146,7 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode, ...@@ -139,7 +146,7 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
def get_conv_gradweights_shape(image_shape, top_shape, def get_conv_gradweights_shape(image_shape, top_shape,
border_mode, subsample, border_mode, subsample,
filter_dilation=None, filter_dilation=None,
num_groups=1): num_groups=1, unshared=False):
""" """
This function tries to compute the kernel shape of convolution gradWeights. This function tries to compute the kernel shape of convolution gradWeights.
...@@ -194,7 +201,10 @@ def get_conv_gradweights_shape(image_shape, top_shape, ...@@ -194,7 +201,10 @@ def get_conv_gradweights_shape(image_shape, top_shape,
out_shp = tuple(get_conv_gradweights_shape_1axis( out_shp = tuple(get_conv_gradweights_shape_1axis(
imshp[i], topshp[i], border_mode, imshp[i], topshp[i], border_mode,
subsample[i], filter_dilation[i]) for i in range(len(subsample))) subsample[i], filter_dilation[i]) for i in range(len(subsample)))
return (nchan, nkern) + out_shp if unshared:
return (nchan,) + top_shape[2:] + (nkern,) + out_shp
else:
return (nchan, nkern) + out_shp
def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode, def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,
...@@ -289,7 +299,9 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape, ...@@ -289,7 +299,9 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
""" """
bsize, topshp = top_shape[0], top_shape[2:] bsize, topshp = top_shape[0], top_shape[2:]
nkern, kshp = kernel_shape[1], kernel_shape[2:]
convdim = len(topshp) - 2
nkern, kshp = kernel_shape[1], kernel_shape[-convdim:]
if filter_dilation is None: if filter_dilation is None:
filter_dilation = np.ones(len(subsample), dtype='int') filter_dilation = np.ones(len(subsample), dtype='int')
...@@ -522,7 +534,8 @@ def conv2d(input, ...@@ -522,7 +534,8 @@ def conv2d(input,
subsample=(1, 1), subsample=(1, 1),
filter_flip=True, filter_flip=True,
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
"""This function will build the symbolic graph for convolving a mini-batch of a """This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled stack of 2D inputs with a set of 2D filters. The implementation is modelled
after Convolutional Neural Networks (CNN). after Convolutional Neural Networks (CNN).
...@@ -538,7 +551,8 @@ def conv2d(input, ...@@ -538,7 +551,8 @@ def conv2d(input,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
return conv_op(input, filters) return conv_op(input, filters)
...@@ -878,7 +892,6 @@ def conv3d(input, ...@@ -878,7 +892,6 @@ def conv3d(input,
version until it is released. version until it is released.
""" """
input = as_tensor_variable(input) input = as_tensor_variable(input)
filters = as_tensor_variable(filters) filters = as_tensor_variable(filters)
conv_op = AbstractConv3d(imshp=input_shape, conv_op = AbstractConv3d(imshp=input_shape,
...@@ -899,7 +912,8 @@ def conv2d_grad_wrt_inputs(output_grad, ...@@ -899,7 +912,8 @@ def conv2d_grad_wrt_inputs(output_grad,
subsample=(1, 1), subsample=(1, 1),
filter_flip=True, filter_flip=True,
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
"""Compute conv output gradient w.r.t its inputs """Compute conv output gradient w.r.t its inputs
This function builds the symbolic graph for getting the This function builds the symbolic graph for getting the
...@@ -916,10 +930,14 @@ def conv2d_grad_wrt_inputs(output_grad, ...@@ -916,10 +930,14 @@ def conv2d_grad_wrt_inputs(output_grad,
will be upsampled or the output gradient of the convolution will be upsampled or the output gradient of the convolution
whose gradient will be taken with respect to the input of the whose gradient will be taken with respect to the input of the
convolution. convolution.
filters : symbolic 4D tensor filters: symbolic 4D or 6D tensor
set of filters used in CNN layer of shape (output channels, Set of filters used in CNN layer of shape
input channels, filter rows, filter columns). See the (output channels, input channels, filter rows, filter columns)
optional parameter ``filter_shape``. for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns)
for unshared convolution.
See the optional parameter ``filter_shape``.
input_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2 input_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2
The shape of the input (upsampled) parameter. The shape of the input (upsampled) parameter.
A tuple/list of len 4, with the first two dimensions A tuple/list of len 4, with the first two dimensions
...@@ -928,8 +946,9 @@ def conv2d_grad_wrt_inputs(output_grad, ...@@ -928,8 +946,9 @@ def conv2d_grad_wrt_inputs(output_grad,
Not Optional, since given the output_grad shape Not Optional, since given the output_grad shape
and the subsample values, multiple input_shape may be and the subsample values, multiple input_shape may be
plausible. plausible.
filter_shape : None or [None/int/Constant] * 4 filter_shape : None or [None/int/Constant] * (4 or 6)
The shape of the filters parameter. None or a tuple/list of len 4. The shape of the filters parameter. None or a tuple/list of len 4 or a
tuple/list of len 6 (for unshared convolution)
Optional, possibly used to choose an optimal implementation. Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that You can give ``None`` for any element of the list to specify that
this element is not known at compile time. this element is not known at compile time.
...@@ -975,6 +994,10 @@ def conv2d_grad_wrt_inputs(output_grad, ...@@ -975,6 +994,10 @@ def conv2d_grad_wrt_inputs(output_grad,
num_groups : int num_groups : int
Divides the image, kernel and output tensors into num_groups Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different kernel will be used for each region of the
input.
Returns Returns
------- -------
...@@ -1012,6 +1035,10 @@ def conv2d_grad_wrt_inputs(output_grad, ...@@ -1012,6 +1035,10 @@ def conv2d_grad_wrt_inputs(output_grad,
for dim in [0, 1, 2, 3]: for dim in [0, 1, 2, 3]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant, assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None))) integer_types, type(None)))
if unshared:
for dim in [4, 5]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
# setting the last two dimensions of input_shape to None, if # setting the last two dimensions of input_shape to None, if
# the type of these dimensions is TensorVariable. # the type of these dimensions is TensorVariable.
...@@ -1026,7 +1053,8 @@ def conv2d_grad_wrt_inputs(output_grad, ...@@ -1026,7 +1053,8 @@ def conv2d_grad_wrt_inputs(output_grad,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
return grad_input_op(filters, output_grad, input_shape[-2:]) return grad_input_op(filters, output_grad, input_shape[-2:])
...@@ -1179,7 +1207,8 @@ def conv2d_grad_wrt_weights(input, ...@@ -1179,7 +1207,8 @@ def conv2d_grad_wrt_weights(input,
subsample=(1, 1), subsample=(1, 1),
filter_flip=True, filter_flip=True,
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
"""Compute conv output gradient w.r.t its weights """Compute conv output gradient w.r.t its weights
This function will build the symbolic graph for getting the This function will build the symbolic graph for getting the
...@@ -1195,10 +1224,10 @@ def conv2d_grad_wrt_weights(input, ...@@ -1195,10 +1224,10 @@ def conv2d_grad_wrt_weights(input,
mini-batch of feature map stacks, of shape (batch size, input mini-batch of feature map stacks, of shape (batch size, input
channels, input rows, input columns). This is the gradient of channels, input rows, input columns). This is the gradient of
the output of convolution. the output of convolution.
filter_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2 filter_shape : [None/int/Constant] * (2 or 4) + [Tensor/int/Constant] * 2
The shape of the filter parameter. A tuple/list of len 4, with the The shape of the filter parameter. A tuple/list of len 4 or 6
first two dimensions being None or int or Constant and the last two (for unshared), with the first two dimensions being None or int or
dimensions being Tensor or int or Constant. Constant and the last two dimensions being Tensor or int or Constant.
Not Optional, since given the output_grad shape and Not Optional, since given the output_grad shape and
the input_shape, multiple filter_shape may be plausible. the input_shape, multiple filter_shape may be plausible.
input_shape : None or [None/int/Constant] * 4 input_shape : None or [None/int/Constant] * 4
...@@ -1247,13 +1276,19 @@ def conv2d_grad_wrt_weights(input, ...@@ -1247,13 +1276,19 @@ def conv2d_grad_wrt_weights(input,
num_groups : int num_groups : int
Divides the image, kernel and output tensors into num_groups Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different kernel will be used for each region of the
input.
Returns Returns
------- -------
symbolic 4D tensor symbolic 4D tensor or 6D tensor
set of feature maps generated by convolutional layer. Tensor set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output rows, output is of shape (batch size, output channels, output rows, output
columns) columns) for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns) for unshared convolution
Notes Notes
----- -----
...@@ -1274,7 +1309,11 @@ def conv2d_grad_wrt_weights(input, ...@@ -1274,7 +1309,11 @@ def conv2d_grad_wrt_weights(input,
for dim in [0, 1]: for dim in [0, 1]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant, assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None))) integer_types, type(None)))
for dim in [2, 3]: if unshared:
for dim in [2, 3]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
for dim in [-2, -1]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorVariable, assert isinstance(filter_shape[dim], (theano.tensor.TensorVariable,
theano.tensor.TensorConstant, theano.tensor.TensorConstant,
integer_types)) integer_types))
...@@ -1288,7 +1327,7 @@ def conv2d_grad_wrt_weights(input, ...@@ -1288,7 +1327,7 @@ def conv2d_grad_wrt_weights(input,
# setting the last two dimensions of filter_shape to None, if # setting the last two dimensions of filter_shape to None, if
# the type of these dimensions is TensorVariable. # the type of these dimensions is TensorVariable.
numerical_filter_shape = list(filter_shape) numerical_filter_shape = list(filter_shape)
for dim in [2, 3]: for dim in [-2, -1]:
if isinstance(filter_shape[dim], theano.tensor.TensorVariable): if isinstance(filter_shape[dim], theano.tensor.TensorVariable):
numerical_filter_shape[dim] = None numerical_filter_shape[dim] = None
...@@ -1298,7 +1337,8 @@ def conv2d_grad_wrt_weights(input, ...@@ -1298,7 +1337,8 @@ def conv2d_grad_wrt_weights(input,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
return gradWeight_op(input, output_grad, filter_shape[-2:]) return gradWeight_op(input, output_grad, filter_shape[-2:])
...@@ -1631,7 +1671,8 @@ class BaseAbstractConv(Op): ...@@ -1631,7 +1671,8 @@ class BaseAbstractConv(Op):
element is not known at compile time. element is not known at compile time.
imshp is defined w.r.t the forward conv. imshp is defined w.r.t the forward conv.
kshp: None, tuple/list of len ``(2 + convdim)`` of int or Constant variable kshp: None, tuple/list of len ``(2 + convdim)`` or ``(2 + 2 * convdim)``
(for unshared) of int or Constant variable
The shape of the filters parameter. The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation. Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this You can give ``None`` for any element of the list to specify that this
...@@ -1670,14 +1711,20 @@ class BaseAbstractConv(Op): ...@@ -1670,14 +1711,20 @@ class BaseAbstractConv(Op):
filter_dilation: tuple of len ``convdim`` filter_dilation: tuple of len ``convdim``
Factor by which to subsample (stride) the input. Factor by which to subsample (stride) the input.
Also called dilation factor. Also called dilation factor.
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different kernel will be used for each region of the
input.
""" """
check_broadcast = False check_broadcast = False
__props__ = ('convdim', 'border_mode', 'subsample', 'filter_flip', __props__ = ('convdim', 'border_mode', 'subsample', 'filter_flip',
'imshp', 'kshp', 'filter_dilation', 'num_groups') 'imshp', 'kshp', 'filter_dilation', 'num_groups', 'unshared')
def __init__(self, convdim, def __init__(self, convdim,
imshp=None, kshp=None, border_mode="valid", imshp=None, kshp=None, border_mode="valid",
subsample=None, filter_flip=True, filter_dilation=None, num_groups=1): subsample=None, filter_flip=True, filter_dilation=None, num_groups=1,
unshared=False):
self.convdim = convdim self.convdim = convdim
if convdim not in (2, 3): if convdim not in (2, 3):
...@@ -1718,7 +1765,10 @@ class BaseAbstractConv(Op): ...@@ -1718,7 +1765,10 @@ class BaseAbstractConv(Op):
ValueError("imshp should be None or a tuple of " ValueError("imshp should be None or a tuple of "
"constant int values"), "constant int values"),
sys.exc_info()[2]) sys.exc_info()[2])
self.kshp = tuple(kshp) if kshp else (None,) * (2 + convdim) if kshp:
self.kshp = tuple(kshp)
else:
self.kshp = (None,) * ((2 + 2 * convdim) if unshared else (2 + convdim))
for kshp_i in self.kshp: for kshp_i in self.kshp:
if kshp_i is not None: if kshp_i is not None:
# Components of kshp should be constant or ints # Components of kshp should be constant or ints
...@@ -1742,6 +1792,10 @@ class BaseAbstractConv(Op): ...@@ -1742,6 +1792,10 @@ class BaseAbstractConv(Op):
if num_groups < 1: if num_groups < 1:
raise ValueError("num_groups must have value greater than zero") raise ValueError("num_groups must have value greater than zero")
self.num_groups = num_groups self.num_groups = num_groups
if unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
self.unshared = unshared
def do_constant_folding(self, node): def do_constant_folding(self, node):
# Disable constant folding since there is no implementation. # Disable constant folding since there is no implementation.
...@@ -1768,11 +1822,10 @@ class BaseAbstractConv(Op): ...@@ -1768,11 +1822,10 @@ class BaseAbstractConv(Op):
raise NotImplementedError( raise NotImplementedError(
'flops not implemented for convdim={}', self.convdim) 'flops not implemented for convdim={}', self.convdim)
def conv(self, img, kern, mode="valid", dilation=1, num_groups=1): def conv(self, img, kern, mode="valid", dilation=1, num_groups=1, unshared=False, direction="forward"):
""" """
Basic slow Python 2D or 3D convolution for DebugMode Basic slow Python 2D or 3D convolution for DebugMode
""" """
if not imported_scipy_signal: if not imported_scipy_signal:
raise NotImplementedError( raise NotImplementedError(
"AbstractConv perform requires the python package" "AbstractConv perform requires the python package"
...@@ -1787,18 +1840,27 @@ class BaseAbstractConv(Op): ...@@ -1787,18 +1840,27 @@ class BaseAbstractConv(Op):
raise ValueError( raise ValueError(
'invalid dilation {}, expected {} values'.format(dilation, 'invalid dilation {}, expected {} values'.format(dilation,
self.convdim)) self.convdim))
if unshared and direction == "backprop weights":
if mode != "valid":
raise ValueError('conv mode for unshared backprop wrt weights must be "valid"')
# Do a transpose later to bring it to required shape
out_shape = (img.shape[0], kern.shape[0],
kern.shape[2], kern.shape[3],
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
else:
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1] * self.convdim, dilation)
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1] * self.convdim, dilation)
out = np.zeros(out_shape, dtype=img.dtype)
dil_kern_shp = kern.shape[:-self.convdim] + tuple( dil_kern_shp = kern.shape[:-self.convdim] + tuple(
(kern.shape[-self.convdim + i] - 1) * dilation[i] + 1 (kern.shape[-self.convdim + i] - 1) * dilation[i] + 1
for i in range(self.convdim)) for i in range(self.convdim))
dilated_kern = np.zeros(dil_kern_shp, dtype=kern.dtype) dilated_kern = np.zeros(dil_kern_shp, dtype=kern.dtype)
dilated_kern[(slice(None), slice(None)) +
dilated_kern[(slice(None),) * (dilated_kern.ndim - self.convdim) +
tuple(slice(None, None, dilation[i]) for i in range(self.convdim)) tuple(slice(None, None, dilation[i]) for i in range(self.convdim))
] = kern ] = kern
out = np.zeros(out_shape, dtype=img.dtype)
if img.shape[1] % self.num_groups != 0: if img.shape[1] % self.num_groups != 0:
raise ValueError( raise ValueError(
...@@ -1823,11 +1885,19 @@ class BaseAbstractConv(Op): ...@@ -1823,11 +1885,19 @@ class BaseAbstractConv(Op):
for g in xrange(self.num_groups): for g in xrange(self.num_groups):
for n in xrange(output_channel_offset): for n in xrange(output_channel_offset):
for im0 in xrange(input_channel_offset): for im0 in xrange(input_channel_offset):
# some cast generates a warning here if unshared:
out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...], out[b, g * output_channel_offset + n, ...] += self.unshared2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n, dilated_kern[g * output_channel_offset + n, im0, ...],
im0, ...], 1, val, bval, 0) out_shape[2:], direction)
else:
# some cast generates a warning here
out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n, im0, ...],
1, val, bval, 0)
elif self.convdim == 3: elif self.convdim == 3:
if unshared:
raise NotImplementedError('Unshared 3D convolution is not implemented')
for b in xrange(img.shape[0]): for b in xrange(img.shape[0]):
for g in xrange(self.num_groups): for g in xrange(self.num_groups):
for n in xrange(output_channel_offset): for n in xrange(output_channel_offset):
...@@ -1839,6 +1909,35 @@ class BaseAbstractConv(Op): ...@@ -1839,6 +1909,35 @@ class BaseAbstractConv(Op):
raise NotImplementedError('only 2D and 3D convolution are implemented') raise NotImplementedError('only 2D and 3D convolution are implemented')
return out return out
def unshared2d(self, inp, kern, out_shape, direction="forward"):
    '''
    Basic slow Python unshared ('locally connected') 2d convolution.

    Handles a single pair of planes; the caller is expected to loop over
    batches, groups and channels and accumulate the returned arrays.

    Parameters
    ----------
    inp : 2D array
        "forward" / "backprop weights": the plane that is cut into one
        window per output position.
        "backprop inputs": the plane whose element ``inp[row, col]``
        scales the filter of region ``(row, col)``.
    kern : array
        "forward" / "backprop inputs": indexed as
        ``kern[row, col, :, :]``, i.e. one 2D filter per region. The
        filter is reversed along both of its axes before being used.
        "backprop weights": indexed as ``kern[row, col]``, one value per
        region (presumably a 2D plane -- confirm against the caller).
    out_shape : tuple of int
        Shape of the returned array: 2 entries for "forward" and
        "backprop inputs", 4 entries (regions + window size) for
        "backprop weights".
    direction : str
        One of "forward", "backprop weights" or "backprop inputs".

    Returns
    -------
    ndarray of shape ``out_shape`` with the dtype of ``inp``.

    Raises
    ------
    NotImplementedError
        If the op is not a 2D convolution.
    ValueError
        If ``direction`` is not one of the three supported values.
    '''
    if self.convdim != 2:
        raise NotImplementedError('Unshared convolution not implemented for %dD'
                                  % self.convdim)
    # Reject a bad direction before allocating the output buffer.
    if direction not in ("forward", "backprop weights", "backprop inputs"):
        raise ValueError("unshared2d: invalid value '{}' for 'direction'".format(direction))

    out = np.zeros(out_shape, dtype=inp.dtype)

    if direction == "forward":
        k_rows, k_cols = kern.shape[2], kern.shape[3]
        for row in range(out_shape[0]):
            for col in range(out_shape[1]):
                # Each output position has its own filter, applied flipped.
                window = inp[row:row + k_rows, col:col + k_cols]
                out[row, col] = np.sum(np.multiply(window,
                                                   kern[row, col, ::-1, ::-1]))
    elif direction == "backprop weights":
        win_rows, win_cols = out_shape[2], out_shape[3]
        for row in range(out_shape[0]):
            for col in range(out_shape[1]):
                # Gradient of one region's filter: its input window scaled
                # by the value stored for that region.
                out[row, col, ...] = kern[row, col] * \
                    inp[row:row + win_rows, col:col + win_cols]
    else:
        # "backprop inputs": scatter every region's flipped filter back
        # onto the window of the output it covers.
        k_rows, k_cols = kern.shape[2], kern.shape[3]
        for row in range(kern.shape[0]):
            for col in range(kern.shape[1]):
                out[row:row + k_rows, col:col + k_cols] += inp[row, col] * \
                    kern[row, col, ::-1, ::-1]
    return out
class AbstractConv(BaseAbstractConv): class AbstractConv(BaseAbstractConv):
""" Abstract Op for the forward convolution. """ Abstract Op for the forward convolution.
...@@ -1854,14 +1953,16 @@ class AbstractConv(BaseAbstractConv): ...@@ -1854,14 +1953,16 @@ class AbstractConv(BaseAbstractConv):
subsample=None, subsample=None,
filter_flip=True, filter_flip=True,
filter_dilation=None, filter_dilation=None,
num_groups=1): num_groups=1,
unshared=False):
super(AbstractConv, self).__init__(convdim=convdim, super(AbstractConv, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
border_mode=border_mode, border_mode=border_mode,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
def make_node(self, img, kern): def make_node(self, img, kern):
# Make sure both inputs are Variables with the same Type # Make sure both inputs are Variables with the same Type
...@@ -1875,8 +1976,14 @@ class AbstractConv(BaseAbstractConv): ...@@ -1875,8 +1976,14 @@ class AbstractConv(BaseAbstractConv):
if img.type.ndim != 2 + self.convdim: if img.type.ndim != 2 + self.convdim:
raise TypeError('img must be %dD tensor' % (2 + self.convdim)) raise TypeError('img must be %dD tensor' % (2 + self.convdim))
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim)) if self.unshared:
if kern.type.ndim != 2 + 2 * self.convdim:
raise TypeError('kern must be %dD tensor for unshared convolution'
% (2 + 2 * self.convdim))
else:
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
img = assert_shape(img, self.imshp, img = assert_shape(img, self.imshp,
'AbstractConv shape mismatch: shape of ' 'AbstractConv shape mismatch: shape of '
...@@ -1894,8 +2001,12 @@ class AbstractConv(BaseAbstractConv): ...@@ -1894,8 +2001,12 @@ class AbstractConv(BaseAbstractConv):
img, kern = inp img, kern = inp
img = np.asarray(img) img = np.asarray(img)
kern = np.asarray(kern) kern = np.asarray(kern)
dil_kernshp = tuple((kern.shape[2 + i] - 1) * self.filter_dilation[i] + 1
dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim)) for i in range(self.convdim))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
o, = out_ o, = out_
mode = self.border_mode mode = self.border_mode
...@@ -1922,8 +2033,34 @@ class AbstractConv(BaseAbstractConv): ...@@ -1922,8 +2033,34 @@ class AbstractConv(BaseAbstractConv):
for i in range(self.convdim))] = img for i in range(self.convdim))] = img
img = new_img img = new_img
if not self.filter_flip: if not self.filter_flip:
kern = kern[(slice(None), slice(None)) + (slice(None, None, -1),) * self.convdim] kern = kern[(slice(None),) * (kern.ndim - self.convdim) + (slice(None, None, -1),) * self.convdim]
conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups)
if self.unshared:
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, self.subsample, self.filter_dilation)
if kern.shape[1:1 + self.convdim] != out_shape[2:2 + self.convdim]:
raise ValueError('Kernel shape {} does not match '
'computed output size {}'.format(kern.shape[1:1 + self.convdim],
out_shape[2:2 + self.convdim]))
if any(self.subsample[i] > 1 for i in range(self.convdim)):
# Expand regions in kernel to correct for subsampling
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, (1,) * self.convdim, self.filter_dilation)
exp_kern_shp = kern.shape[:1] + out_shape[2:2 + self.convdim] + \
kern.shape[1 + self.convdim:]
exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
exp_kern[(slice(None),) +
tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
(slice(None),) * (self.convdim + 1)] = kern
kern = exp_kern
# from (nFilters, out_rows, out_cols, nChannels, kH, kW)
# to (nFilters, nChannels, out_rows, out_cols, kH, kW)
axes_order = (0, 1 + self.convdim,) + tuple(range(1, 1 + self.convdim)) + \
tuple(range(2 + self.convdim, kern.ndim))
kern = kern.transpose(axes_order)
conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups,
unshared=self.unshared)
conv_out = conv_out[(slice(None), slice(None)) + conv_out = conv_out[(slice(None), slice(None)) +
tuple(slice(None, None, self.subsample[i]) tuple(slice(None, None, self.subsample[i])
for i in range(self.convdim))] for i in range(self.convdim))]
...@@ -1934,6 +2071,8 @@ class AbstractConv(BaseAbstractConv): ...@@ -1934,6 +2071,8 @@ class AbstractConv(BaseAbstractConv):
if self.num_groups > 1: if self.num_groups > 1:
raise NotImplementedError( raise NotImplementedError(
'Rop not implemented for grouped convolutions') 'Rop not implemented for grouped convolutions')
if self.unshared:
raise NotImplementedError('Rop not implemented for unshared convolution')
rval = None rval = None
if eval_points[0] is not None: if eval_points[0] is not None:
rval = self.make_node(eval_points[0], inputs[1]).outputs[0] rval = self.make_node(eval_points[0], inputs[1]).outputs[0]
...@@ -1953,8 +2092,12 @@ class AbstractConv(BaseAbstractConv): ...@@ -1953,8 +2092,12 @@ class AbstractConv(BaseAbstractConv):
imshp = [imshp[i] if self.imshp[i] is None else self.imshp[i] imshp = [imshp[i] if self.imshp[i] is None else self.imshp[i]
for i in range(2 + self.convdim)] for i in range(2 + self.convdim)]
if self.kshp is not None: if self.kshp is not None:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i] if self.unshared:
for i in range(2 + self.convdim)] kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + 2 * self.convdim)]
else:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + self.convdim)]
res = get_conv_output_shape(imshp, kshp, self.border_mode, res = get_conv_output_shape(imshp, kshp, self.border_mode,
self.subsample, self.filter_dilation) self.subsample, self.filter_dilation)
return [res] return [res]
...@@ -1973,14 +2116,16 @@ class AbstractConv2d(AbstractConv): ...@@ -1973,14 +2116,16 @@ class AbstractConv2d(AbstractConv):
subsample=(1, 1), subsample=(1, 1),
filter_flip=True, filter_flip=True,
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
super(AbstractConv2d, self).__init__(convdim=2, super(AbstractConv2d, self).__init__(convdim=2,
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
border_mode=border_mode, border_mode=border_mode,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads): def grad(self, inp, grads):
bottom, weights = inp bottom, weights = inp
...@@ -1991,14 +2136,16 @@ class AbstractConv2d(AbstractConv): ...@@ -1991,14 +2136,16 @@ class AbstractConv2d(AbstractConv):
self.subsample, self.subsample,
self.filter_flip, self.filter_flip,
self.filter_dilation, self.filter_dilation,
num_groups=self.num_groups)( num_groups=self.num_groups,
unshared=self.unshared)(
weights, top, bottom.shape[-2:], add_assert_shape=False) weights, top, bottom.shape[-2:], add_assert_shape=False)
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp, d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.border_mode, self.border_mode,
self.subsample, self.subsample,
self.filter_flip, self.filter_flip,
self.filter_dilation, self.filter_dilation,
num_groups=self.num_groups)( num_groups=self.num_groups,
unshared=self.unshared)(
bottom, top, weights.shape[-2:], add_assert_shape=False) bottom, top, weights.shape[-2:], add_assert_shape=False)
...@@ -2085,14 +2232,16 @@ class AbstractConv_gradWeights(BaseAbstractConv): ...@@ -2085,14 +2232,16 @@ class AbstractConv_gradWeights(BaseAbstractConv):
subsample=None, subsample=None,
filter_flip=True, filter_flip=True,
filter_dilation=None, filter_dilation=None,
num_groups=1): num_groups=1,
unshared=False):
super(AbstractConv_gradWeights, self).__init__(convdim=convdim, super(AbstractConv_gradWeights, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
border_mode=border_mode, border_mode=border_mode,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
# Update shape/height_width # Update shape/height_width
def make_node(self, img, topgrad, shape, add_assert_shape=True): def make_node(self, img, topgrad, shape, add_assert_shape=True):
...@@ -2115,8 +2264,12 @@ class AbstractConv_gradWeights(BaseAbstractConv): ...@@ -2115,8 +2264,12 @@ class AbstractConv_gradWeights(BaseAbstractConv):
'image does not match given imshp.') 'image does not match given imshp.')
shape = as_tensor_variable(shape) shape = as_tensor_variable(shape)
broadcastable = [topgrad.broadcastable[1], if self.unshared:
img.broadcastable[1]] + ([False] * self.convdim) broadcastable = [topgrad.broadcastable[1]] + ([False] * self.convdim) + \
[img.broadcastable[1]] + ([False] * self.convdim)
else:
broadcastable = [topgrad.broadcastable[1],
img.broadcastable[1]] + ([False] * self.convdim)
output = img.type.clone(broadcastable=broadcastable)() output = img.type.clone(broadcastable=broadcastable)()
return Apply(self, [img, topgrad, shape], [output]) return Apply(self, [img, topgrad, shape], [output])
...@@ -2134,6 +2287,9 @@ class AbstractConv_gradWeights(BaseAbstractConv): ...@@ -2134,6 +2287,9 @@ class AbstractConv_gradWeights(BaseAbstractConv):
'invalid border_mode {}, which must be either ' 'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of' '"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode)) ' integers'.format(mode))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
dil_shape = tuple((shape[i] - 1) * self.filter_dilation[i] + 1 dil_shape = tuple((shape[i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim)) for i in range(self.convdim))
...@@ -2166,9 +2322,7 @@ class AbstractConv_gradWeights(BaseAbstractConv): ...@@ -2166,9 +2322,7 @@ class AbstractConv_gradWeights(BaseAbstractConv):
topgrad = new_topgrad topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2)) axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
flip_filters = ((slice(None), slice(None)) + topgrad = topgrad.transpose(axes_order)
(slice(None, None, -1),) * self.convdim)
topgrad = topgrad.transpose(axes_order)[flip_filters]
img = img.transpose(axes_order) img = img.transpose(axes_order)
def correct_for_groups(mat): def correct_for_groups(mat):
...@@ -2182,15 +2336,36 @@ class AbstractConv_gradWeights(BaseAbstractConv): ...@@ -2182,15 +2336,36 @@ class AbstractConv_gradWeights(BaseAbstractConv):
if self.num_groups > 1: if self.num_groups > 1:
img = correct_for_groups(img) img = correct_for_groups(img)
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups) if self.unshared:
flip_kern = ((slice(None),) * (2 + self.convdim) +
(slice(None, None, -1),) * self.convdim)
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups,
unshared=True, direction="backprop weights")
if any(self.subsample[i] > 1 for i in range(self.convdim)):
sub_slice = (slice(None),) * 2 + \
tuple(slice(None, None, self.subsample[i]) for i in range(0, self.convdim)) + \
(slice(None),) * self.convdim
kern = kern[sub_slice]
# from (nChannels, nFilters, out_rows, out_cols, kH, kW)
# to (nFilters, out_rows, out_cols, nChannels, kH, kW)
kern_axes = (1,) + tuple(range(2, self.convdim + 2)) + (0,) + \
tuple(range(self.convdim + 2, kern.ndim))
else:
flip_topgrad = flip_kern = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
topgrad = topgrad[flip_topgrad]
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
kern_axes = (1, 0) + tuple(range(2, self.convdim + 2))
kern = kern.transpose(kern_axes)
if any(self.filter_dilation[i] > 1 for i in range(self.convdim)): if any(self.filter_dilation[i] > 1 for i in range(self.convdim)):
kern = kern[(slice(None), slice(None)) + kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
tuple(slice(None, None, self.filter_dilation[i]) tuple(slice(None, None, self.filter_dilation[i])
for i in range(self.convdim))] for i in range(self.convdim))]
if self.filter_flip: if self.filter_flip:
kern = kern.transpose(axes_order)[flip_filters] kern = kern[flip_kern]
else:
kern = kern.transpose(axes_order)
o[0] = node.outputs[0].type.filter(kern) o[0] = node.outputs[0].type.filter(kern)
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -2203,15 +2378,24 @@ class AbstractConv_gradWeights(BaseAbstractConv): ...@@ -2203,15 +2378,24 @@ class AbstractConv_gradWeights(BaseAbstractConv):
# from the shapes of inputs. # from the shapes of inputs.
imshp = input_shapes[0] imshp = input_shapes[0]
topshp = input_shapes[1] topshp = input_shapes[1]
kshp = self.kshp[:] if self.kshp is not None else [None] * (2 + self.convdim)
if self.num_groups > 1: if self.kshp:
fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] + kshp = self.kshp
else:
if self.unshared:
kshp = [None] * (2 + 2 * self.convdim)
else:
kshp = [None] * (2 + self.convdim)
if self.unshared:
fallback_kshp = ([topshp[1], topshp[2], topshp[3], imshp[1] // self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)]) [node.inputs[2][i] for i in range(self.convdim)])
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + 2 * self.convdim)]
else: else:
fallback_kshp = ([topshp[1], imshp[1]] + fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)]) [node.inputs[2][i] for i in range(self.convdim)])
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i] kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + self.convdim)] for i in range(2 + self.convdim)]
return [kshp] return [kshp]
...@@ -2232,14 +2416,16 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights): ...@@ -2232,14 +2416,16 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
subsample=(1, 1), subsample=(1, 1),
filter_flip=True, filter_flip=True,
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
super(AbstractConv2d_gradWeights, self).__init__(convdim=2, super(AbstractConv2d_gradWeights, self).__init__(convdim=2,
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
border_mode=border_mode, border_mode=border_mode,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads): def grad(self, inp, grads):
bottom, top = inp[:2] bottom, top = inp[:2]
...@@ -2249,16 +2435,18 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights): ...@@ -2249,16 +2435,18 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
self.subsample, self.subsample,
self.filter_flip, self.filter_flip,
self.filter_dilation, self.filter_dilation,
self.num_groups)(weights, self.num_groups,
top, self.unshared)(weights,
bottom.shape[-2:]) top,
bottom.shape[-2:])
d_top = AbstractConv2d(self.imshp, d_top = AbstractConv2d(self.imshp,
self.kshp, self.kshp,
self.border_mode, self.border_mode,
self.subsample, self.subsample,
self.filter_flip, self.filter_flip,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, weights) self.num_groups,
self.unshared)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used # Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer # for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable. # that the dimensions are broadcastable.
...@@ -2350,14 +2538,16 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -2350,14 +2538,16 @@ class AbstractConv_gradInputs(BaseAbstractConv):
subsample=None, subsample=None,
filter_flip=True, filter_flip=True,
filter_dilation=None, filter_dilation=None,
num_groups=1): num_groups=1,
unshared=False):
super(AbstractConv_gradInputs, self).__init__(convdim=convdim, super(AbstractConv_gradInputs, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
border_mode=border_mode, border_mode=border_mode,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
# Update shape/height_width # Update shape/height_width
def make_node(self, kern, topgrad, shape, add_assert_shape=True): def make_node(self, kern, topgrad, shape, add_assert_shape=True):
...@@ -2370,10 +2560,19 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -2370,10 +2560,19 @@ class AbstractConv_gradInputs(BaseAbstractConv):
broadcastable=topgrad.broadcastable) broadcastable=topgrad.broadcastable)
topgrad = gtype.filter_variable(topgrad) topgrad = gtype.filter_variable(topgrad)
if kern.type.ndim != 2 + self.convdim: if self.unshared:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim)) if self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
elif kern.type.ndim != 2 + 2 * self.convdim:
raise TypeError('kern must be %dD tensor for unshared convolution'
% (2 + 2 * self.convdim))
else:
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if topgrad.type.ndim != 2 + self.convdim: if topgrad.type.ndim != 2 + self.convdim:
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim)) raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
if add_assert_shape: if add_assert_shape:
kern = assert_shape(kern, self.kshp, kern = assert_shape(kern, self.kshp,
...@@ -2386,7 +2585,7 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -2386,7 +2585,7 @@ class AbstractConv_gradInputs(BaseAbstractConv):
False] + ([False] * self.convdim) False] + ([False] * self.convdim)
else: else:
broadcastable = [topgrad.type.broadcastable[0], broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1]] + ([False] * self.convdim) kern.type.broadcastable[-self.convdim - 1]] + ([False] * self.convdim)
output = kern.type.clone(broadcastable=broadcastable)() output = kern.type.clone(broadcastable=broadcastable)()
return Apply(self, [kern, topgrad, shape], [output]) return Apply(self, [kern, topgrad, shape], [output])
...@@ -2403,9 +2602,12 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -2403,9 +2602,12 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'invalid border_mode {}, which must be either ' 'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of' '"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode)) ' integers'.format(mode))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim) imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
fallback_imshp = ([topgrad.shape[0], kern.shape[1]] + fallback_imshp = ([topgrad.shape[0], kern.shape[-self.convdim - 1]] +
[shape[i] for i in range(self.convdim)]) [shape[i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i] imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)] for i in range(2 + self.convdim)]
...@@ -2419,8 +2621,9 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -2419,8 +2621,9 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'has shape {}'.format(tuple(expected_topgrad_shape), 'has shape {}'.format(tuple(expected_topgrad_shape),
tuple(topgrad.shape))) tuple(topgrad.shape)))
dil_kernshp = tuple((kern.shape[i + 2] - 1) * self.filter_dilation[i] + 1 dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim)) for i in range(self.convdim))
pad = (0,) * self.convdim pad = (0,) * self.convdim
if mode == "full": if mode == "full":
pad = tuple(dil_kernshp[i] - 1 for i in range(self.convdim)) pad = tuple(dil_kernshp[i] - 1 for i in range(self.convdim))
...@@ -2438,25 +2641,54 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -2438,25 +2641,54 @@ class AbstractConv_gradInputs(BaseAbstractConv):
for i in range(self.convdim))] = topgrad for i in range(self.convdim))] = topgrad
topgrad = new_topgrad topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2)) if self.unshared:
flip_filters = ((slice(None), slice(None)) + # Expand regions in kernel to correct for subsampling
(slice(None, None, -1),) * self.convdim) exp_kern_shp = kern.shape[:1] + topgrad.shape[2:] + kern.shape[1 + self.convdim:]
exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
exp_kern[(slice(None),) +
tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
(slice(None),) * (self.convdim + 1)] = kern
kern = exp_kern
def correct_for_groups(mat): def correct_for_groups(mat):
mshp0 = mat.shape[0] // self.num_groups mshp0 = mat.shape[0] // self.num_groups
mshp1 = mat.shape[1] * self.num_groups mshp1 = mat.shape[-self.convdim - 1] * self.num_groups
mat = mat.reshape((self.num_groups, mshp0) + mat.shape[1:]) mat = mat.reshape((self.num_groups, mshp0) + mat.shape[1:])
mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim))) if self.unshared:
mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:]) # for 2D -> (1, 2, 3, 0, 4, 5, 6)
mat = mat.transpose(tuple(range(1, 2 + self.convdim)) + (0,) +
tuple(range(2 + self.convdim, mat.ndim)))
mat = mat.reshape((mshp0,) + mat.shape[1:1 + self.convdim] + (mshp1,) + mat.shape[-self.convdim:])
else:
mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))
mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])
return mat return mat
kern = correct_for_groups(kern) kern = correct_for_groups(kern)
kern = kern.transpose(axes_order)
if self.filter_flip: if self.unshared:
topgrad = topgrad[flip_filters] # from (nFilters, out_rows, out_cols, nChannels, kH, kW)
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation, num_groups=self.num_groups) # to (nChannels, nFilters, out_rows, out_cols, kH, kW)
if self.filter_flip: axes_order = (1 + self.convdim, 0,) + tuple(range(1, 1 + self.convdim)) + \
img = img[flip_filters] tuple(range(2 + self.convdim, kern.ndim))
kern = kern.transpose(axes_order)
if not self.filter_flip:
kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
(slice(None, None, -1),) * self.convdim]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
num_groups=self.num_groups, unshared=True, direction="backprop inputs")
else:
axes_order = (1, 0) + tuple(range(2, 2 + self.convdim))
kern = kern.transpose(axes_order)
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
if self.filter_flip:
topgrad = topgrad[flip_filters]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
num_groups=self.num_groups)
if self.filter_flip:
img = img[flip_filters]
if any(p > 0 for p in pad): if any(p > 0 for p in pad):
img = img[(slice(None), slice(None)) + img = img[(slice(None), slice(None)) +
tuple(slice(pad[i], img.shape[i + 2] - pad[i]) tuple(slice(pad[i], img.shape[i + 2] - pad[i])
...@@ -2475,10 +2707,10 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -2475,10 +2707,10 @@ class AbstractConv_gradInputs(BaseAbstractConv):
topshp = input_shapes[1] topshp = input_shapes[1]
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim) imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
if self.num_groups > 1: if self.num_groups > 1:
fallback_imshp = ([topshp[0], kshp[1] * self.num_groups] + fallback_imshp = ([topshp[0], kshp[-self.convdim - 1] * self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)]) [node.inputs[2][i] for i in range(self.convdim)])
else: else:
fallback_imshp = ([topshp[0], kshp[1]] + fallback_imshp = ([topshp[0], kshp[-self.convdim - 1]] +
[node.inputs[2][i] for i in range(self.convdim)]) [node.inputs[2][i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i] imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)] for i in range(2 + self.convdim)]
...@@ -2503,14 +2735,16 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs): ...@@ -2503,14 +2735,16 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
subsample=(1, 1), subsample=(1, 1),
filter_flip=True, filter_flip=True,
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1): num_groups=1,
unshared=False):
super(AbstractConv2d_gradInputs, self).__init__(convdim=2, super(AbstractConv2d_gradInputs, self).__init__(convdim=2,
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
border_mode=border_mode, border_mode=border_mode,
subsample=subsample, subsample=subsample,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
num_groups=num_groups) num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads): def grad(self, inp, grads):
weights, top = inp[:2] weights, top = inp[:2]
...@@ -2520,7 +2754,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs): ...@@ -2520,7 +2754,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
self.subsample, self.subsample,
self.filter_flip, self.filter_flip,
self.filter_dilation, self.filter_dilation,
self.num_groups)( self.num_groups,
self.unshared)(
bottom, top, bottom, top,
weights.shape[-2:]) weights.shape[-2:])
d_top = AbstractConv2d(self.imshp, self.kshp, d_top = AbstractConv2d(self.imshp, self.kshp,
...@@ -2528,7 +2763,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs): ...@@ -2528,7 +2763,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
self.subsample, self.subsample,
self.filter_flip, self.filter_flip,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, weights) self.num_groups,
self.unshared)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used # Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer # for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable. # that the dimensions are broadcastable.
......
...@@ -107,7 +107,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -107,7 +107,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int dilW = 1, const int dilW = 1,
const int padH = 0, const int padH = 0,
const int padW = 0, const int padW = 0,
const int numgroups = 1) const int numgroups = 1,
const int unshared = 0)
{ {
if (PyArray_NDIM(bottom) != 4) if (PyArray_NDIM(bottom) != 4)
{ {
...@@ -120,9 +121,9 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -120,9 +121,9 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
return NULL; return NULL;
} }
if (PyArray_NDIM(weight) != 4) if (PyArray_NDIM(weight) != (unshared ? 6 : 4))
{ {
PyErr_SetString(PyExc_ValueError, "CorrMM requires weight of 4D"); PyErr_Format(PyExc_ValueError, "CorrMM requires weight of %%dD", unshared ? 6 : 4);
return NULL; return NULL;
} }
if (PyArray_TYPE(weight) != %(float_typenum)s) if (PyArray_TYPE(weight) != %(float_typenum)s)
...@@ -152,11 +153,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -152,11 +153,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int nChannels = PyArray_DIMS(bottom)[1]; const int nChannels = PyArray_DIMS(bottom)[1];
const int bottomHeight = PyArray_DIMS(bottom)[2]; const int bottomHeight = PyArray_DIMS(bottom)[2];
const int bottomWidth = PyArray_DIMS(bottom)[3]; const int bottomWidth = PyArray_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns) // normal weights: (nFilters, nChannels, rows, columns)
// unshared weights: (nFilters, topHeight, topWidth, nChannels, rows, columns)
const int nFilters = PyArray_DIMS(weight)[0]; const int nFilters = PyArray_DIMS(weight)[0];
const int kH = PyArray_DIMS(weight)[2]; const int kH = PyArray_DIMS(weight)[unshared ? 4 : 2];
const int kW = PyArray_DIMS(weight)[3]; const int kW = PyArray_DIMS(weight)[unshared ? 5 : 3];
if (nChannels != (PyArray_DIMS(weight)[1] * numgroups)) { if (nChannels != PyArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"CorrMM images and kernel must have the same stack size\n"); "CorrMM images and kernel must have the same stack size\n");
return NULL; return NULL;
...@@ -179,22 +181,57 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -179,22 +181,57 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1; const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1; const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV #undef _CONV_FLOORDIV
if (batchSize != PyArray_DIMS(top)[0] || if (unshared) {
nFilters != PyArray_DIMS(top)[1] || if (topHeight != PyArray_DIMS(weight)[1] ||
topHeight != PyArray_DIMS(top)[2] || topWidth != PyArray_DIMS(weight)[2]) {
topWidth != PyArray_DIMS(top)[3]) { PyErr_Format(PyExc_ValueError,
PyErr_Format(PyExc_ValueError, "CorrMM regions in kernel must match output regions:\n"
"CorrMM shape inconsistency:\n" " bottom shape: %%d %%d %%d %%d\n"
" bottom shape: %%d %%d %%d %%d\n" " weight shape: %%d %%ld %%ld %%d %%d %%d"
" weight shape: %%d %%d %%d %%d\n" " (expected %%d %%d %%d %%d %%d %%d)\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n", " top shape(calculated): %%d %%d %%d %%d\n",
batchSize, nChannels, bottomHeight, bottomWidth, batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW, nFilters, PyArray_DIMS(weight)[1],
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1], PyArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3], nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
batchSize, nFilters, topHeight, topWidth); batchSize, nFilters, topHeight, topWidth);
return NULL; return NULL;
}
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
} }
else {
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
// Create temporary columns // Create temporary columns
int max_threads = %(omp_get_max_threads)s; int max_threads = %(omp_get_max_threads)s;
...@@ -230,8 +267,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -230,8 +267,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int group_col_stride = (K_ * N_); const int group_col_stride = (K_ * N_);
const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f; const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
const int M_ = nFilters / numgroups; const int M_ = nFilters / numgroups;
const int one_int = 1;
const %(c_float_type)s one = 1.0; const %(c_float_type)s one = 1.0;
const %(c_float_type)s zero = 0.0; const %(c_float_type)s zero = 0.0;
const int ldw = (K_ * N_);
char NTrans = 'N'; char NTrans = 'N';
char Trans = 'T'; char Trans = 'T';
PyArrayObject *output; PyArrayObject *output;
...@@ -266,15 +305,30 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -266,15 +305,30 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels, im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels,
bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride); (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
for ( int g = 0; g < numgroups; ++g){ // Second, gemm
// Second, gemm if (unshared) {
%(gemm)s(&NTrans, &NTrans, for (int g = 0; g < numgroups; ++g) {
&N_, &M_, &K_, for (int reg = 0; reg < N_; ++reg) {
&one, %(gemv)s(&Trans, &K_, &M_,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_, &one,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_, (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
&zero, (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_); &zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride + reg, &N_);
}
}
}
else {
for ( int g = 0; g < numgroups; ++g){
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
}
} }
} }
// Restore to previous blas threads // Restore to previous blas threads
...@@ -316,7 +370,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -316,7 +370,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
output = weight; output = weight;
npy_intp weight_dim[2]; npy_intp weight_dim[2];
weight_dim[0] = (npy_intp)max_threads; weight_dim[0] = (npy_intp)max_threads;
weight_dim[1] = (npy_intp)(M_ * K_ * numgroups); if (unshared)
weight_dim[1] = (npy_intp)(M_ * N_ * K_ * numgroups);
else
weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2, PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
weight_dim, PyArray_TYPE(weight), 0); weight_dim, PyArray_TYPE(weight), 0);
...@@ -341,26 +398,46 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -341,26 +398,46 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW, nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride); (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
for(int g = 0; g < numgroups; ++g){ // Second, gemm
// Second, gemm // Note that we accumulate into weight. We do so by setting beta = 0
// Note that we accumulate into weight. We do so by setting beta = 0 // for the first iteration and beta = 1 for subsequent ones. (This
// for the first iteration and beta = 1 for subsequent ones. (This // is faster than setting weight to all zeros before the loop.)
// is faster than setting weight to all zeros before the loop.) if (unshared) {
%(gemm)s(&Trans, &NTrans, for (int g = 0; g < numgroups; ++g) {
&K_, &M_, &N_, for (int reg = 0; reg < N_; ++reg) {
&one, %(gemm)s(&Trans, &NTrans,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_, &K_, &M_, &one_int,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_, &one,
(n == 0) ? &zero : &one, (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
tid * weight_dim[1], &K_); (n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + reg * K_ +
tid * weight_dim[1], &ldw);
}
}
}
else {
for(int g = 0; g < numgroups; ++g){
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &N_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
tid * weight_dim[1], &K_);
}
} }
} }
// Restore to previous blas threads // Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved); %(blas_set_num_threads)s(blas_threads_saved);
//aggregate weights //aggregate weights
memset((%(float_type)s*)PyArray_DATA(weight), 0, M_ * K_*sizeof(%(float_type)s)); memset((%(float_type)s*)PyArray_DATA(weight), 0, weight_dim[1]*sizeof(%(float_type)s));
/* /*
* Put index "j" into outer loop to get the * Put index "j" into outer loop to get the
* correct result when openmp is used. * correct result when openmp is used.
...@@ -416,15 +493,29 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -416,15 +493,29 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
%(omp_flags)s %(omp_flags)s
for (int n = 0; n < batchSize; ++n) { for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s; int tid = %(omp_get_thread_num)s;
for ( int g = 0;g < numgroups; ++g){ if (unshared) {
// gemm into columns for (int g = 0; g < numgroups; ++g){
%(gemm)s(&NTrans, &Trans, for (int reg = 0; reg < N_; ++reg){
&N_, &K_, &M_, %(gemm)s(&NTrans, &Trans,
&one, &one_int, &K_, &M_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_, &one,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_, (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
&zero, (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_); &zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_);
}
}
}
else {
for (int g = 0; g < numgroups; ++g) {
%(gemm)s(&NTrans, &Trans,
&N_, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
}
} }
// col2im back to the data // col2im back to the data
col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth, col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth,
...@@ -474,4 +565,3 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -474,4 +565,3 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
// in here output is just aliased to one of bottom, weights, or top. // in here output is just aliased to one of bottom, weights, or top.
return output; return output;
} }
...@@ -8,7 +8,7 @@ import theano ...@@ -8,7 +8,7 @@ import theano
from theano import Apply from theano import Apply
from theano import gof from theano import gof
from theano.gof import ParamsType, EnumList from theano.gof import ParamsType, EnumList
from theano.scalar import int64 from theano.scalar import int64, int8
from theano.tensor import as_tensor_variable, TensorType from theano.tensor import as_tensor_variable, TensorType
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor import blas_headers from theano.tensor import blas_headers
...@@ -42,9 +42,11 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -42,9 +42,11 @@ class BaseCorrMM(gof.OpenMPOp):
Perform dilated correlation (default: (1,1)) Perform dilated correlation (default: (1,1))
num_groups num_groups
Perform grouped convolutions (default: 1) Perform grouped convolutions (default: 1)
unshared
Perform unshared correlation (default: False)
""" """
check_broadcast = False check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups') __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
_direction = None _direction = None
...@@ -54,10 +56,10 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -54,10 +56,10 @@ class BaseCorrMM(gof.OpenMPOp):
dH=int64, dW=int64, dH=int64, dW=int64,
dilH=int64, dilW=int64, dilH=int64, dilW=int64,
padH=int64, padW=int64, padH=int64, padW=int64,
num_groups=int64) num_groups=int64, unshared=int8)
def __init__(self, border_mode="valid", subsample=(1, 1), def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1, openmp=None): filter_dilation=(1, 1), num_groups=1, unshared=False, openmp=None):
super(BaseCorrMM, self).__init__(openmp=openmp) super(BaseCorrMM, self).__init__(openmp=openmp)
if isinstance(border_mode, integer_types): if isinstance(border_mode, integer_types):
if border_mode < 0: if border_mode < 0:
...@@ -85,6 +87,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -85,6 +87,7 @@ class BaseCorrMM(gof.OpenMPOp):
raise ValueError("filter_dilation must have two elements") raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample) self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation) self.filter_dilation = tuple(filter_dilation)
self.unshared = unshared
if not theano.config.blas.ldflags: if not theano.config.blas.ldflags:
# Theano will use a NumPy C implementation of [sd]gemm_ instead. # Theano will use a NumPy C implementation of [sd]gemm_ instead.
...@@ -130,12 +133,13 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -130,12 +133,13 @@ class BaseCorrMM(gof.OpenMPOp):
padW = property(lambda self: self.pad[1]) padW = property(lambda self: self.pad[1])
def __str__(self): def __str__(self):
return '%s{%s, %s, %s, %s}' % ( return '%s{%s, %s, %s, %s %s}' % (
self.__class__.__name__, self.__class__.__name__,
self.border_mode, self.border_mode,
str(self.subsample), str(self.subsample),
str(self.filter_dilation), str(self.filter_dilation),
str(self.num_groups)) str(self.num_groups),
str(self.unshared))
@staticmethod @staticmethod
def as_common_dtype(in1, in2): def as_common_dtype(in1, in2):
...@@ -179,7 +183,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -179,7 +183,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (7, self.openmp, blas_header_version()) return (8, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -189,12 +193,14 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -189,12 +193,14 @@ class BaseCorrMM(gof.OpenMPOp):
assert dtype in ('float32', 'float64') assert dtype in ('float32', 'float64')
if dtype == 'float32': if dtype == 'float32':
sub['gemm'] = 'sgemm_' sub['gemm'] = 'sgemm_'
sub['gemv'] = 'sgemv_'
sub['float_type'] = 'npy_float' sub['float_type'] = 'npy_float'
sub['float_typenum'] = 'NPY_FLOAT' sub['float_typenum'] = 'NPY_FLOAT'
sub['n_bytes'] = 4 sub['n_bytes'] = 4
sub['c_float_type'] = 'float' sub['c_float_type'] = 'float'
else: else:
sub['gemm'] = 'dgemm_' sub['gemm'] = 'dgemm_'
sub['gemv'] = 'dgemv_'
sub['float_type'] = 'npy_double' sub['float_type'] = 'npy_double'
sub['float_typenum'] = 'NPY_DOUBLE' sub['float_typenum'] = 'NPY_DOUBLE'
sub['n_bytes'] = 8 sub['n_bytes'] = 8
...@@ -287,6 +293,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -287,6 +293,7 @@ class BaseCorrMM(gof.OpenMPOp):
int padH = %(params)s->padH; int padH = %(params)s->padH;
int padW = %(params)s->padW; int padW = %(params)s->padW;
int numgroups = %(params)s->num_groups; int numgroups = %(params)s->num_groups;
int unshared = %(params)s->unshared;
PyArrayObject * bottom = %(bottom)s; PyArrayObject * bottom = %(bottom)s;
PyArrayObject * weights = %(weights)s; PyArrayObject * weights = %(weights)s;
...@@ -310,13 +317,17 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -310,13 +317,17 @@ class BaseCorrMM(gof.OpenMPOp):
break; break;
} }
int wdim, odim;
wdim = unshared ? 6 : 4;
odim = 4; //Can be set to 6 later for unshared backprop wrt weights
// Obtain or infer kernel width and height // Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
int kH, kW, dil_kH, dil_kW; int kH, kW, dil_kH, dil_kW;
if (direction != 1) { if (direction != 1) {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
kH = PyArray_DIMS(weights)[2]; kH = PyArray_DIMS(weights)[wdim-2];
kW = PyArray_DIMS(weights)[3]; kW = PyArray_DIMS(weights)[wdim-1];
} }
else { else {
if (%(height)s != -1) { if (%(height)s != -1) {
...@@ -370,28 +381,45 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -370,28 +381,45 @@ class BaseCorrMM(gof.OpenMPOp):
} }
// Infer output shape // Infer output shape
npy_intp out_dim[4]; npy_intp out_dim[6];
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
switch(direction) { switch(direction) {
case 0: // forward pass case 0: // forward pass
// output is top: (batchsize, num_filters, height, width) // output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1 // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0]; out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1); out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1); out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{ {
PyErr_Format(PyExc_ValueError, if (unshared) {
"CorrMM: impossible output shape\\n" PyErr_Format(PyExc_ValueError,
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n" "CorrMM: impossible output shape\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n", " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], " top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[3]); (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
}
else {
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
}
%(fail)s %(fail)s
} }
break; break;
...@@ -399,46 +427,90 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -399,46 +427,90 @@ class BaseCorrMM(gof.OpenMPOp):
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1]; out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups; if (unshared){
out_dim[2] = (npy_intp)kH; // already inferred further above odim = 6;
out_dim[3] = (npy_intp)kW; // how convenient out_dim[1] = (npy_intp)PyArray_DIMS(top)[2];
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) out_dim[2] = (npy_intp)PyArray_DIMS(top)[3];
{ }
PyErr_Format(PyExc_ValueError, out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
"CorrMM backprop wrt. weights: impossible output shape\\n" out_dim[wdim-2] = (npy_intp)kH; // already inferred further above
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n" out_dim[wdim-1] = (npy_intp)kW; // how convenient
" weights shape: %%ld x %%ld x %%ld x %%ld\\n" if (unshared) {
" top shape: %%ld x %%ld x %%ld x %%ld\\n", if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], || out_dim[4] <= 0 || out_dim[5] <= 0){
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], PyErr_Format(PyExc_ValueError,
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], "CorrMM backprop wrt. weights: impossible output shape\\n"
(long int)out_dim[3], " bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); " top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
}
%(fail)s %(fail)s
} }
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width) // output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0]; out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH); out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW); out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (unshared) {
{ if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
PyErr_Format(PyExc_ValueError, {
"CorrMM backprop wrt. inputs: impossible output shape\\n" PyErr_Format(PyExc_ValueError,
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n" "CorrMM backprop wrt. inputs: impossible output shape\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n", " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], " top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[3], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)out_dim[3],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
%(fail)s (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
} }
break; break;
default: default:
...@@ -448,13 +520,19 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -448,13 +520,19 @@ class BaseCorrMM(gof.OpenMPOp):
// Prepare output array // Prepare output array
int typenum; int typenum;
if ( !(*out int failure;
&& PyArray_NDIM(*out)==4 failure = !(*out
&& PyArray_NDIM(*out)==odim
&& PyArray_IS_C_CONTIGUOUS(*out) && PyArray_IS_C_CONTIGUOUS(*out)
&& PyArray_DIMS(*out)[0]==out_dim[0] && PyArray_DIMS(*out)[0]==out_dim[0]
&& PyArray_DIMS(*out)[1]==out_dim[1] && PyArray_DIMS(*out)[1]==out_dim[1]
&& PyArray_DIMS(*out)[2]==out_dim[2] && PyArray_DIMS(*out)[2]==out_dim[2]
&& PyArray_DIMS(*out)[3]==out_dim[3])) && PyArray_DIMS(*out)[3]==out_dim[3]);
if (odim == 6){
failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4]
&& PyArray_DIMS(*out)[5]==out_dim[5]);
}
if ( failure )
{ {
Py_XDECREF(*out); Py_XDECREF(*out);
if (direction != 1) { if (direction != 1) {
...@@ -464,21 +542,29 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -464,21 +542,29 @@ class BaseCorrMM(gof.OpenMPOp):
typenum = PyArray_TYPE(bottom); typenum = PyArray_TYPE(bottom);
} }
//Change to PyArray_ZEROS which is faster than PyArray_EMPTY. //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
*out = (PyArrayObject*)PyArray_ZEROS(4, *out = (PyArrayObject*)PyArray_ZEROS(odim,
out_dim, out_dim,
typenum, typenum,
0); 0);
if (NULL == *out) if (NULL == *out)
{ {
PyErr_Format(PyExc_RuntimeError, if (odim == 4) {
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld", PyErr_Format(PyExc_RuntimeError,
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]); "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
}
if (odim == 6) {
PyErr_Format(PyExc_RuntimeError,
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3],
(long long)out_dim[4], (long long)out_dim[5]);
}
%(fail)s %(fail)s
} }
} }
// Call corrMM code // Call corrMM code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups ); out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
if (out2==NULL){ if (out2==NULL){
%(fail)s %(fail)s
} }
...@@ -514,6 +600,9 @@ class CorrMM(BaseCorrMM): ...@@ -514,6 +600,9 @@ class CorrMM(BaseCorrMM):
The filter dilation operation applied to each input image. The filter dilation operation applied to each input image.
Should be a tuple with 2 elements. Should be a tuple with 2 elements.
Set to `(1, 1)` to disable filter dilation. Set to `(1, 1)` to disable filter dilation.
unshared:
Boolean value. If true, then a different kernel will be applied to
each region of the input image.
""" """
...@@ -525,8 +614,12 @@ class CorrMM(BaseCorrMM): ...@@ -525,8 +614,12 @@ class CorrMM(BaseCorrMM):
img, kern = self.as_common_dtype(img, kern) img, kern = self.as_common_dtype(img, kern)
if img.type.ndim != 4: if img.type.ndim != 4:
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4: if self.unshared is True:
raise TypeError('kern must be 4D tensor') if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False] False, False]
...@@ -555,13 +648,15 @@ class CorrMM(BaseCorrMM): ...@@ -555,13 +648,15 @@ class CorrMM(BaseCorrMM):
d_bottom = CorrMM_gradInputs(self.border_mode, d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(weights, top, self.num_groups,
bottom.shape[-2:]) self.unshared)(weights, top,
bottom.shape[-2:])
d_weights = CorrMM_gradWeights(self.border_mode, d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, top, self.num_groups,
weights.shape[-2:]) self.unshared)(bottom, top,
weights.shape[-2:])
return d_bottom, d_weights return d_bottom, d_weights
...@@ -595,8 +690,12 @@ class CorrMM_gradWeights(BaseCorrMM): ...@@ -595,8 +690,12 @@ class CorrMM_gradWeights(BaseCorrMM):
else: else:
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')] height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], if self.unshared is True:
False, False] broadcastable = [topgrad.type.broadcastable[0], False, False,
img.type.broadcastable[1], False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], img.type.broadcastable[1],
False, False]
dtype = img.type.dtype dtype = img.type.dtype
return Apply(self, [img, topgrad] + height_width, return Apply(self, [img, topgrad] + height_width,
[TensorType(dtype, broadcastable)()]) [TensorType(dtype, broadcastable)()])
...@@ -633,7 +732,10 @@ class CorrMM_gradWeights(BaseCorrMM): ...@@ -633,7 +732,10 @@ class CorrMM_gradWeights(BaseCorrMM):
kW = 2 - imshp[1] + (topshp[1] - 1) * dW kW = 2 - imshp[1] + (topshp[1] - 1) * dW
else: else:
kW = imshp[1] + 2 * padW - (topshp[1] - 1) * dW kW = imshp[1] + 2 * padW - (topshp[1] - 1) * dW
return [(nkern, ssize, kH, kW)] if self.unshared is True:
return [(nkern, topshp[0], topshp[1], ssize, kH, kW)]
else:
return [(nkern, ssize, kH, kW)]
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
bottom, top = inp[:2] bottom, top = inp[:2]
...@@ -649,12 +751,14 @@ class CorrMM_gradWeights(BaseCorrMM): ...@@ -649,12 +751,14 @@ class CorrMM_gradWeights(BaseCorrMM):
d_bottom = CorrMM_gradInputs(self.border_mode, d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(weights, top, self.num_groups,
bottom.shape[-2:]) self.unshared)(weights, top,
bottom.shape[-2:])
d_top = CorrMM(self.border_mode, d_top = CorrMM(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, weights) self.num_groups,
self.unshared)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * 2 d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
if len(inp) == 4 else ()) if len(inp) == 4 else ())
return (d_bottom, d_top) + d_height_width return (d_bottom, d_top) + d_height_width
...@@ -684,8 +788,12 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -684,8 +788,12 @@ class CorrMM_gradInputs(BaseCorrMM):
kern = as_tensor_variable(kern) kern = as_tensor_variable(kern)
topgrad = as_tensor_variable(topgrad) topgrad = as_tensor_variable(topgrad)
kern, topgrad = self.as_common_dtype(kern, topgrad) kern, topgrad = self.as_common_dtype(kern, topgrad)
if kern.type.ndim != 4: if self.unshared is True:
raise TypeError('kern must be 4D tensor') if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if shape is None: if shape is None:
...@@ -700,7 +808,7 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -700,7 +808,7 @@ class CorrMM_gradInputs(BaseCorrMM):
broadcastable = [topgrad.type.broadcastable[0], False, broadcastable = [topgrad.type.broadcastable[0], False,
False, False] False, False]
else: else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
False, False] False, False]
dtype = kern.type.dtype dtype = kern.type.dtype
return Apply(self, [kern, topgrad] + height_width, return Apply(self, [kern, topgrad] + height_width,
...@@ -719,7 +827,7 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -719,7 +827,7 @@ class CorrMM_gradInputs(BaseCorrMM):
dH, dW = self.subsample dH, dW = self.subsample
kshp = input_shape[0] kshp = input_shape[0]
topshp = input_shape[1] topshp = input_shape[1]
ssize, kshp = kshp[1], list(kshp[2:]) ssize, kshp = kshp[-3], list(kshp[-2:])
ssize = ssize * self.num_groups ssize = ssize * self.num_groups
bsize, topshp = topshp[0], list(topshp[2:]) bsize, topshp = topshp[0], list(topshp[2:])
height_width = node.inputs[-2:] height_width = node.inputs[-2:]
...@@ -762,13 +870,15 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -762,13 +870,15 @@ class CorrMM_gradInputs(BaseCorrMM):
d_weights = CorrMM_gradWeights(self.border_mode, d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, self.num_groups,
top, self.unshared)(bottom,
weights.shape[-2:]) top,
weights.shape[-2:])
d_top = CorrMM(self.border_mode, d_top = CorrMM(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation, self.filter_dilation,
self.num_groups)(bottom, weights) self.num_groups,
self.unshared)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * d_height_width = ((theano.gradient.DisconnectedType()(),) *
2 if len(inp) == 4 else ()) 2 if len(inp) == 4 else ())
return (d_weights, d_top) + d_height_width return (d_weights, d_top) + d_height_width
......
...@@ -82,12 +82,14 @@ def local_abstractconv_gemm(node): ...@@ -82,12 +82,14 @@ def local_abstractconv_gemm(node):
# need to flip the kernel if necessary # need to flip the kernel if necessary
if node.op.filter_flip: if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM(border_mode=node.op.border_mode, rval = CorrMM(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation, filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, kern) num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, kern)
copy_stack_trace(node.outputs[0], rval) copy_stack_trace(node.outputs[0], rval)
return [rval] return [rval]
...@@ -134,12 +136,15 @@ def local_abstractconv_gradweight_gemm(node): ...@@ -134,12 +136,15 @@ def local_abstractconv_gradweight_gemm(node):
rval = CorrMM_gradWeights(border_mode=node.op.border_mode, rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation, filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, topgrad, shape) num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval) copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary # need to flip the kernel if necessary
if node.op.filter_flip: if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1] flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
rval = rval[flip]
rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable) rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
copy_stack_trace(node.outputs[0], rval) copy_stack_trace(node.outputs[0], rval)
...@@ -189,12 +194,14 @@ def local_abstractconv_gradinputs_gemm(node): ...@@ -189,12 +194,14 @@ def local_abstractconv_gradinputs_gemm(node):
# need to flip the kernel if necessary # need to flip the kernel if necessary
if node.op.filter_flip: if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM_gradInputs(border_mode=node.op.border_mode, rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation, filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(kern, topgrad, num_groups=node.op.num_groups,
shape) unshared=node.op.unshared)(kern, topgrad, shape)
copy_stack_trace(node.outputs[0], rval) copy_stack_trace(node.outputs[0], rval)
return [rval] return [rval]
...@@ -242,7 +249,7 @@ def local_conv2d_cpu(node): ...@@ -242,7 +249,7 @@ def local_conv2d_cpu(node):
if not node.op.filter_flip: if not node.op.filter_flip:
# Not tested yet # Not tested yet
return None return None
if node.op.num_groups > 1: if node.op.num_groups > 1 or node.op.unshared:
return None return None
rval = conv2d(img, kern, rval = conv2d(img, kern,
...@@ -270,7 +277,7 @@ def local_conv2d_gradweight_cpu(node): ...@@ -270,7 +277,7 @@ def local_conv2d_gradweight_cpu(node):
if not node.op.filter_flip: if not node.op.filter_flip:
# Not tested yet # Not tested yet
return return
if node.op.num_groups > 1: if node.op.num_groups > 1 or node.op.unshared:
return None return None
if node.op.border_mode == 'valid' and \ if node.op.border_mode == 'valid' and \
...@@ -370,7 +377,7 @@ def local_conv2d_gradinputs_cpu(node): ...@@ -370,7 +377,7 @@ def local_conv2d_gradinputs_cpu(node):
if not node.op.filter_flip: if not node.op.filter_flip:
# Not tested yet # Not tested yet
return None return None
if node.op.num_groups > 1: if node.op.num_groups > 1 or node.op.unshared:
return None return None
# Conv 3d implementation, needed when subsample > 2 # Conv 3d implementation, needed when subsample > 2
......
...@@ -1744,3 +1744,146 @@ class Separable_conv(unittest.TestCase): ...@@ -1744,3 +1744,146 @@ class Separable_conv(unittest.TestCase):
fun = theano.function([x_sym, dfilter_sym, pfilter_sym], sep_op, mode='FAST_RUN') fun = theano.function([x_sym, dfilter_sym, pfilter_sym], sep_op, mode='FAST_RUN')
top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter) top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter)
utt.assert_allclose(top, precomp_output) utt.assert_allclose(top, precomp_output)
class TestUnsharedConv(unittest.TestCase):
    """Check unshared 2D convolution ops against their shared counterparts.

    An unshared kernel is 6D; the tests below treat axes 1 and 2 of the
    kernel as the output row/column the filter applies to, and
    ``kshp[:1] + kshp[3:]`` as the shape of the equivalent shared 4D kernel.

    Subclasses swap in concrete implementations by overriding the ``conv2d*``
    class attributes (ops to build) and the ``conv2d*_op`` attributes (op
    classes expected in the compiled graph).
    """
    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs

    # Kept as a class attribute (not assigned in setUp) so that a subclass
    # can supply its own compilation mode without it being overwritten.
    mode = theano.compile.mode.Mode(optimizer='None')

    def setUp(self):
        """Define the parallel lists of test configurations (one entry per case)."""
        self.img_shape = [(2, 1, 4, 4), (1, 2, 4, 2), (1, 3, 5, 3), (1, 4, 4, 4)]
        self.kern_shape = [(2, 2, 2, 1, 3, 3), (2, 4, 2, 2, 4, 2), (3, 2, 1, 1, 3, 3), (4, 3, 3, 2, 4, 2)]
        self.topgrad_shape = [(2, 2, 2, 2), (1, 2, 4, 2), (1, 3, 2, 1), (1, 4, 3, 3)]
        self.border_mode = ['valid', 'full', 'valid', 'full']
        self.subsample = [(1, 1), (2, 2), (2, 1), (3, 2)]
        self.filter_dilation = (1, 1)
        self.num_groups = [1, 1, 3, 2]

        # Always verify gradients. The previous
        # ``np.random.choice([True, False], 4, [1.0, 0.0])`` passed the
        # probabilities positionally as ``replace`` rather than ``p``, so
        # verification was skipped at random.
        self.verify_flags = [True] * len(self.img_shape)

        self.ref_mode = 'FAST_RUN'
        if theano.config.cxx == "":
            raise SkipTest("CorrMM needs cxx")

    def test_fwd(self):
        """Unshared forward output at (i, j) must equal shared conv with kern[:, i, j]."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        img_sym = theano.tensor.tensor4('img')
        kern_sym = tensor6('kern')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.border_mode,
                                                          self.subsample, self.num_groups, self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            kern = np.random.random(kshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                           filter_dilation=self.filter_dilation,
                                           num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, kern_sym)
            unshared_func = theano.function([img_sym, kern_sym], unshared_out_sym, mode=self.mode)
            assert any(isinstance(node.op, self.conv2d_op)
                       for node in unshared_func.maker.fgraph.toposort())
            unshared_output = unshared_func(img, kern)

            # Shape of the shared kernel obtained by fixing one output position.
            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                      filter_dilation=self.filter_dilation,
                                      num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, ref_kern_sym)
            ref_func = theano.function([img_sym, ref_kern_sym], ref_out_sym, mode=self.mode)

            for i in range(kshp[1]):
                for j in range(kshp[2]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    ref_val = ref_func(img, single_kern)
                    utt.assert_allclose(ref_val[:, :, i, j], unshared_output[:, :, i, j])

            if verify:
                utt.verify_grad(unshared_conv_op, [img, kern], mode=self.mode, eps=1)

    def test_gradweight(self):
        """Unshared weight gradient at (i, j) must equal the shared gradient of top masked to (i, j)."""
        img_sym = theano.tensor.tensor4('img')
        top_sym = theano.tensor.tensor4('top')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, top_sym, tensor.as_tensor_variable(kshp[-2:]))
            unshared_func = theano.function([img_sym, top_sym], unshared_out_sym, mode=self.mode)
            assert any(isinstance(node.op, self.conv2d_gradw_op)
                       for node in unshared_func.maker.fgraph.toposort())
            unshared_output = unshared_func(img, top)

            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, top_sym, tensor.as_tensor_variable(single_kshp[-2:]))
            ref_func = theano.function([img_sym, top_sym], ref_out_sym, mode=self.mode)

            for i in range(topshp[2]):
                for j in range(topshp[3]):
                    # Keep only the output gradient at position (i, j).
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output = ref_func(img, top_single)
                    utt.assert_allclose(unshared_output[:, i, j, ...], ref_output)

            def conv_gradweight(inputs_val, output_val):
                return unshared_conv_op(inputs_val, output_val, tensor.as_tensor_variable(kshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)

    def test_gradinput(self):
        """With one kernel tiled over all positions, unshared and shared input gradients must agree."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        kern_sym = tensor6('kern')
        top_sym = theano.tensor.tensor4('top')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            single_kshp = kshp[:1] + kshp[3:]

            single_kern = np.random.random(single_kshp).astype(theano.config.floatX)
            # Replicate the single kernel along kernel axes 1 and 2.
            kern = single_kern.reshape((kshp[:1] + (1, 1) + kshp[3:]))
            kern = np.tile(kern, (1, kshp[1], kshp[2], 1, 1, 1))
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            unshared_func = theano.function([kern_sym, top_sym], unshared_out_sym, mode=self.mode)
            assert any(isinstance(node.op, self.conv2d_gradi_op)
                       for node in unshared_func.maker.fgraph.toposort())
            unshared_output = unshared_func(kern, top)

            ref_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(ref_kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            ref_func = theano.function([ref_kern_sym, top_sym], ref_out_sym, mode=self.mode)
            ref_output = ref_func(single_kern, top)

            utt.assert_allclose(ref_output, unshared_output)

            def conv_gradinputs(filters_val, output_val):
                return unshared_conv_op(filters_val, output_val, tensor.as_tensor_variable(imshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
...@@ -10,7 +10,7 @@ import theano ...@@ -10,7 +10,7 @@ import theano
import theano.tensor as T import theano.tensor as T
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv from theano.tensor.nnet import corr, conv
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorr2D(utt.InferShapeTester): class TestCorr2D(utt.InferShapeTester):
...@@ -452,6 +452,21 @@ class TestGroupCorr2d(Grouped_conv_noOptim): ...@@ -452,6 +452,21 @@ class TestGroupCorr2d(Grouped_conv_noOptim):
utt.assert_allclose(gconv_output, conv_output) utt.assert_allclose(gconv_output, conv_output)
class TestUnsharedCorr2D(TestUnsharedConv):
    """Run the TestUnsharedConv cases with the CorrMM ops substituted in."""
    # When the configured mode is FAST_COMPILE, use FAST_RUN with the
    # 'gpuarray' optimizations excluded; otherwise leave the mode as None.
    mode = (theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
            if theano.config.mode == "FAST_COMPILE" else None)

    # The same CorrMM classes serve both as the ops to build and as the op
    # classes looked up by the ``*_op`` attributes.
    conv2d = conv2d_op = corr.CorrMM
    conv2d_gradw = conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi = conv2d_gradi_op = corr.CorrMM_gradInputs

    flip_filter = True
    is_dnn = False
if __name__ == '__main__': if __name__ == '__main__':
t = TestCorr2D('setUp') t = TestCorr2D('setUp')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论