Commit 691be8f5 authored by Gijs van Tulder

Add GpuCorrMM and GpuCorr3dMM to gpuarray backend.

Parent 146ef971
from __future__ import absolute_import, print_function, division
import os.path
from six import integer_types
import theano
from theano import Apply, config, Op
from theano.compile import optdb
from theano.gof import LocalOptGroup
from theano.tensor.basic import as_tensor_variable
from theano.tensor.opt import in2out
from .basic_ops import (GpuArrayType, CGpuKernelBase,
                        as_gpuarray_variable, gpu_contiguous,
                        infer_context_name)
from .opt_util import inplace_allocempty
try:
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
    """
    Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
    `GpuCorrMM_gradInputs`. Cannot be used directly.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer
        or a pair of integers
    subsample
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1)).

    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')

    def __init__(self, border_mode="valid", subsample=(1, 1),
                 filter_dilation=(1, 1)):
        # A single integer means the same symmetric padding on both axes.
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a pair of'
                ' integers'.format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        if len(filter_dilation) != 2:
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        # The GEMM-based correlation kernel lives in this support-code file.
        CGpuKernelBase.__init__(self, ['corr_gemm.c'])

    @property
    def pad(self):
        # Explicit padding pair as given, or (0, 0) for 'valid' mode.
        if self.border_mode != 'valid':
            return self.border_mode
        return (0, 0)

    def __str__(self):
        return '%s{%s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation))

    def flops(self, inp, outp):
        """
        Useful with the hack in profilemode to print the MFlops.
        """
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops

    def get_params(self, node):
        # Run on the GPU context of the first input.
        return node.inputs[0].type.context

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 1)

    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
        This generates the C code for GpuCorrMM (direction="forward"),
        GpuCorrMM_gradWeights (direction="backprop weights"), and
        GpuCorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        Parameters
        ----------
        bottom
            Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        weights
            Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        top
            Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        direction : {'forward', 'backprop weights', 'backprop inputs'}
            "forward" to correlate bottom with weights and store results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        sub
            Dictionary of substitutions useable to help generating the C code.
        height
            If self.subsample[0] != 1, a variable giving the height of the
            filters for direction="backprop weights" or the height of the input
            images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights".
            Ignored otherwise.
        width
            If self.subsample[1] != 1, a variable giving the width of the
            filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights".
            Ignored otherwise.
        """
        dH, dW = self.subsample
        dilH, dilW = self.filter_dilation
        # Encode the border mode as sentinels understood by the C code:
        # -1 = half padding, -2 = full padding, >= 0 = explicit padding.
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
            padH = padW = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = 0
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when pad="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
        else:
            height = '0'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
        else:
            width = '0'
        sync = ""
        if config.gpuarray.sync:
            # BUG FIX: the template below references %(fail)s, but `fail` was
            # not a local variable, so interpolating with `% locals()` raised
            # a KeyError whenever config.gpuarray.sync was enabled. Bind it
            # from `sub` first; the later sub.update(locals()) writes the
            # same value back, so nothing else changes.
            fail = sub['fail']
            sync = """
            int err = GpuArray_sync(&%(out)s->ga);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorrMM error: gpuarray sync failed.");
                %(fail)s;
            }
            """ % locals()
        sub = sub.copy()
        sub.update(locals())

        return """
    // Mandatory args
    int direction = %(direction)s; // forward, bprop weights, bprop inputs

    // Optional args
    size_t dH = %(dH)s;
    size_t dW = %(dW)s;
    size_t dilH = %(dilH)s;
    size_t dilW = %(dilW)s;
    int padH = %(padH)s;
    int padW = %(padW)s;

    PyGpuArrayObject * bottom = %(bottom)s;
    PyGpuArrayObject * weights = %(weights)s;
    PyGpuArrayObject * top = %(top)s;
    PyGpuArrayObject * out2 = NULL;

    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    size_t kH, kW;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyGpuArray_DIMS(weights)[2];
        kW = PyGpuArray_DIMS(weights)[3];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
    }

    // Implicit dilated kernel size
    size_t dil_kH = (kH - 1) * dilH + 1;
    size_t dil_kW = (kW - 1) * dilW + 1;

    // Auto-padding if requested
    if (padH == -1) { // vertical half padding
        padH = dil_kH / 2;
    }
    else if (padH == -2) { // vertical full padding
        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) { // horizontal half padding
        padW = dil_kW / 2;
    }
    else if (padW == -2) { // horizontal full padding
        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
        %(fail)s
    }

    // Infer output shape and type
    size_t out_dim[4];
    int out_typecode;
    PyGpuContextObject *out_context;
    switch(direction) {
    case 0: // forward pass
        // output is top: (batchsize, num_filters, height, width)
        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = PyGpuArray_DIMS(bottom)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[0];
        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
        out_typecode = bottom->ga.typecode;
        out_context = bottom->context;
        break;
    case 1: // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = PyGpuArray_DIMS(top)[1];
        out_dim[1] = PyGpuArray_DIMS(bottom)[1];
        out_dim[2] = kH; // already inferred further above
        out_dim[3] = kW; // how convenient
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    case 2: // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[1];
        out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
        out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    if (theano_prep_output(&%(out)s, 4, out_dim, out_typecode, GA_C_ORDER, out_context) != 0)
    {
        PyErr_Format(PyExc_RuntimeError,
                     "BaseGpuCorrMM: Failed to allocate output of %%ld x %%ld x %%ld x %%ld",
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
        %(fail)s
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
        PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
        %(fail)s
    }

    // Call GPU code
    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
    if (out2==NULL){
        %(fail)s
    }
    assert (out2 == %(out)s);
    %(sync)s
""" % sub
class GpuCorrMM(BaseGpuCorrMM):
    """
    GPU correlation implementation using Matrix Multiplication.

    Parameters
    ----------
    border_mode
        Implicit zero-padding of the input. Either a pair of integers
        giving the number of rows and columns padded on each side, a
        single integer used for all sides, or one of the shortcuts
        ``'valid'`` (``(0, 0)``, no padding), ``'full'``
        (``(kernel_rows - 1, kernel_columns - 1)``) or ``'half'``
        (``(kernel_rows // 2, kernel_columns // 2)``, a "same"
        convolution for odd-sized kernels). Each width is applied on
        both sides of its dimension.
    subsample
        Pair of output strides. ``(sv, sh)`` is equivalent to slicing
        the full output with ``[:, :, ::sv, ::sh]``, but faster.
        Use ``(1, 1)`` to disable subsampling.
    filter_dilation
        Pair of filter dilation factors applied to each input image.
        Use ``(1, 1)`` to disable dilation.

    Notes
    -----
    The op requires inputs, filters and outputs to be C-contiguous;
    wrap arguments with :func:`gpu_contiguous
    <theano.gpuarray.basic_ops.gpu_contiguous>` if needed.

    Either enable the Theano flag `optimizer_including=conv_gemm` to
    replace convolutions with `GpuCorrMM` automatically, or call
    `GpuCorrMM(subsample=...)(image, filters)` directly as a
    replacement for :func:`conv2d <theano.tensor.nnet.conv.conv2d>`.
    Note that it computes a correlation -- for a true convolution,
    flip the filters as `filters[:,:,::-1,::-1]`.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1)):
        super(GpuCorrMM, self).__init__(border_mode, subsample,
                                        filter_dilation)

    def make_node(self, img, kern):
        # Move both inputs to a common GPU context, then validate rank.
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output: (batch, num_filters, rows, cols); only the first two
        # axes can inherit broadcastability from the inputs.
        out_bcast = [img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        # Forward pass: correlate bottom with weights, write into top.
        return super(GpuCorrMM, self).c_code_helper(
            bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        grad_inputs_op = GpuCorrMM_gradInputs(self.border_mode,
                                              self.subsample,
                                              self.filter_dilation)
        grad_weights_op = GpuCorrMM_gradWeights(self.border_mode,
                                                self.subsample,
                                                self.filter_dilation)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-2:])
        d_weights = grad_weights_op(bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    """
    Gradient wrt. filters for `GpuCorrMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1)):
        super(GpuCorrMM_gradWeights, self).__init__(border_mode,
                                                    subsample,
                                                    filter_dilation)

    def make_node(self, img, topgrad, shape=None):
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # The filter shape cannot be inferred from img/topgrad when
        # subsampling or using half padding, so it must be supplied as
        # two scalar inputs.
        shape_required = (self.subsample != (1, 1) or
                          self.border_mode == "half")
        if shape_required:
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1)'
                                 ' or border_mode == "half"')
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        else:
            height_width = []
        # Output: (num_filters, num_channels, rows, cols).
        out_bcast = [topgrad.type.broadcastable[1],
                     img.type.broadcastable[1],
                     False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width = inp[2:] or (None, None)
        weights, = out_
        return super(GpuCorrMM_gradWeights, self).c_code_helper(
            bottom, weights, top, "backprop weights", sub, height, width)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
                                        self.filter_dilation)(
            weights, top, bottom.shape[-2:])
        d_top = GpuCorrMM(self.border_mode,
                          self.subsample,
                          self.filter_dilation)(bottom, weights)
        if len(inp) == 4:
            # The height/width scalar inputs carry no gradient.
            d_height_width = (theano.gradient.DisconnectedType()(),) * 2
        else:
            d_height_width = ()
        return (d_bottom, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # no connection to height, width
        return [[1], [1], [0], [0]]
class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    """
    Gradient wrt. inputs for `GpuCorrMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1)):
        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
                                                   filter_dilation)

    def make_node(self, kern, topgrad, shape=None):
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # The input image shape is ambiguous only when subsampling
        # (the kernel is an input here, so half padding needs no shape).
        shape_required = self.subsample != (1, 1)
        if shape_required and shape is None:
            raise ValueError('shape must be given if subsample != (1, 1)')
        if shape_required:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        else:
            height_width = []
        # Output: (batch, num_channels, rows, cols).
        out_bcast = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1],
                     False, False]
        out_type = GpuArrayType(dtype=topgrad.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [kern, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width = inp[2:] or (None, None)
        bottom, = out_
        return super(GpuCorrMM_gradInputs, self).c_code_helper(
            bottom, weights, top, "backprop inputs", sub, height, width)

    def grad(self, inp, grads):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
                                          self.filter_dilation)(
            bottom, top, weights.shape[-2:])
        d_top = GpuCorrMM(self.border_mode,
                          self.subsample,
                          self.filter_dilation)(bottom, weights)
        if len(inp) == 4:
            # The height/width scalar inputs carry no gradient.
            d_height_width = (theano.gradient.DisconnectedType()(),) * 2
        else:
            d_height_width = ()
        return (d_weights, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # no connection to height, width
        return [[1], [1], [0], [0]]
class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
    """
    Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
    `GpuCorr3dMM_gradInputs`. Cannot be used directly.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer
        or a triple of integers
    subsample
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).

    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')

    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        # A single integer means the same symmetric padding on all three axes.
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_h, pad_w, pad_d = map(int, border_mode)
            border_mode = (pad_h, pad_w, pad_d)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a tuple of'
                ' three integers'.format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        if len(filter_dilation) != 3:
            raise ValueError("filter_dilation must have three elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        # The GEMM-based 3D correlation kernel lives in this support file.
        CGpuKernelBase.__init__(self, ['corr3d_gemm.c'])

    @property
    def pad(self):
        # Explicit padding triple as given, or (0, 0, 0) for 'valid' mode.
        if self.border_mode != 'valid':
            return self.border_mode
        return (0, 0, 0)

    def __str__(self):
        return '%s{%s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation))

    def flops(self, inp, outp):
        """
        Useful with the hack in profilemode to print the MFlops.
        """
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * filters[4] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3] * outputs[4]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops

    def get_params(self, node):
        # Run on the GPU context of the first input.
        return node.inputs[0].type.context

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 1)

    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):
        """
        This generates the C code for GpuCorr3dMM (direction="forward"),
        GpuCorr3dMM_gradWeights (direction="backprop weights"), and
        GpuCorr3dMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        Parameters
        ----------
        bottom
            Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        weights
            Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        top
            Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        direction : {'forward', 'backprop weights', 'backprop inputs'}
            "forward" to correlate bottom with weights and store results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        sub
            Dictionary of substitutions useable to help generating the C code.
        height
            If self.subsample[0] != 1, a variable giving the height of the
            filters for direction="backprop weights" or the height of the input
            images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights".
            Ignored otherwise.
        width
            If self.subsample[1] != 1, a variable giving the width of the
            filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights".
            Ignored otherwise.
        depth
            If self.subsample[2] != 1, a variable giving the depth of the
            filters for direction="backprop weights" or the depth of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the depth of the
            filters for direction="backprop weights".
            Ignored otherwise.
        """
        dH, dW, dD = self.subsample
        dilH, dilW, dilD = self.filter_dilation
        # Encode the border mode as sentinels understood by the C code:
        # -1 = half padding, -2 = full padding, >= 0 = explicit padding.
        if self.border_mode == "half":
            padH = padW = padD = -1
        elif self.border_mode == "full":
            padH = padW = padD = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW, padD = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = padD = 0
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height, width
        # and depth of bottom and weights from top, so we require them to be
        # given. Similarly, when pad="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
        else:
            height = '0'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
        else:
            width = '0'
        if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
            if not depth:
                # BUG FIX: the message previously said "horizontal sampling",
                # copied from the width branch; this branch is about depth.
                raise ValueError("depth must be given for backprop with depth sampling or pad='half'")
            depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth
        else:
            depth = '0'
        sync = ""
        if config.gpuarray.sync:
            # BUG FIX: the template below references %(fail)s, but `fail` was
            # not a local variable, so interpolating with `% locals()` raised
            # a KeyError whenever config.gpuarray.sync was enabled. Bind it
            # from `sub` first; the later sub.update(locals()) writes the
            # same value back, so nothing else changes.
            fail = sub['fail']
            sync = """
            int err = GpuArray_sync(&%(out)s->ga);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorr3dMM error: gpuarray sync failed.");
                %(fail)s;
            }
            """ % locals()
        sub = sub.copy()
        sub.update(locals())

        return """
    // Mandatory args
    int direction = %(direction)s; // forward, bprop weights, bprop inputs

    // Optional args
    size_t dH = %(dH)s;
    size_t dW = %(dW)s;
    size_t dD = %(dD)s;
    size_t dilH = %(dilH)s;
    size_t dilW = %(dilW)s;
    size_t dilD = %(dilD)s;
    int padH = %(padH)s;
    int padW = %(padW)s;
    int padD = %(padD)s;

    PyGpuArrayObject * bottom = %(bottom)s;
    PyGpuArrayObject * weights = %(weights)s;
    PyGpuArrayObject * top = %(top)s;
    PyGpuArrayObject * out2 = NULL;

    // Obtain or infer kernel height, width and depth
    // (we need to know it early to be able to handle auto-padding)
    size_t kH, kW, kD;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyGpuArray_DIMS(weights)[2];
        kW = PyGpuArray_DIMS(weights)[3];
        kD = PyGpuArray_DIMS(weights)[4];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        if ((dD != 1) || (padD == -1)) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
            kD = (2 - PyGpuArray_DIMS(bottom)[4] + (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
        else {
            kD = (PyGpuArray_DIMS(bottom)[4] + 2*padD - (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
    }

    // Implicit dilated kernel size
    size_t dil_kH = (kH - 1) * dilH + 1;
    size_t dil_kW = (kW - 1) * dilW + 1;
    size_t dil_kD = (kD - 1) * dilD + 1;

    // Auto-padding if requested
    if (padH == -1) { // vertical half padding
        padH = dil_kH / 2;
    }
    else if (padH == -2) { // vertical full padding
        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) { // horizontal half padding
        padW = dil_kW / 2;
    }
    else if (padW == -2) { // horizontal full padding
        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2");
        %(fail)s
    }
    if (padD == -1) { // depth half padding
        padD = dil_kD / 2;
    }
    else if (padD == -2) { // depth full padding
        padD = dil_kD - 1;
    }
    else if (padD < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2");
        %(fail)s
    }

    // Infer output shape and type
    size_t out_dim[5];
    int out_typecode;
    PyGpuContextObject *out_context;
    switch(direction) {
    case 0: // forward pass
        // output is top: (batchsize, num_filters, height, width, depth)
        // height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = PyGpuArray_DIMS(bottom)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[0];
        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
        out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
        out_typecode = bottom->ga.typecode;
        out_context = bottom->context;
        break;
    case 1: // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width, depth)
        // height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = PyGpuArray_DIMS(top)[1];
        out_dim[1] = PyGpuArray_DIMS(bottom)[1];
        out_dim[2] = kH; // already inferred further above
        out_dim[3] = kW; // how convenient
        out_dim[4] = kD;
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    case 2: // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width, depth)
        // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[1];
        out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
        out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        out_dim[4] = (dD != 1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    // (BUG FIX: the error message below previously named BaseGpuCorrMM.)
    if (theano_prep_output(&%(out)s, 5, out_dim, out_typecode, GA_C_ORDER, out_context) != 0)
    {
        PyErr_Format(PyExc_RuntimeError,
                     "BaseGpuCorr3dMM: Failed to allocate output of %%ld x %%ld x %%ld x %%ld x %%ld",
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
        %(fail)s
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
        PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
        %(fail)s
    }

    // Call GPU code
    out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
                    dH, dW, dD, dilH, dilW, dilD, padH, padW, padD);
    if (out2==NULL){
        %(fail)s
    }
    assert (out2 == %(out)s);
    %(sync)s
""" % sub
class GpuCorr3dMM(BaseGpuCorr3dMM):
    """
    GPU 3D correlation implementation using Matrix Multiplication.

    Parameters
    ----------
    border_mode
        Implicit zero-padding of the input. Either a triple of integers
        giving the padding width on each side, a single integer used for
        all sides, or one of the shortcuts ``'valid'`` (``(0, 0, 0)``,
        no padding), ``'full'`` (``(kernel_rows - 1, kernel_columns - 1,
        kernel_depth - 1)``) or ``'half'`` (``(kernel_rows // 2,
        kernel_columns // 2, kernel_depth // 2)``, a "same" convolution
        for odd-sized kernels). Each width is applied on both sides of
        its dimension (left and right, top and bottom, front and back).
    subsample
        Triple of output strides. ``(sv, sh, sl)`` is equivalent to
        slicing the full output with ``[:, :, ::sv, ::sh, ::sl]``, but
        faster. Use ``(1, 1, 1)`` to disable subsampling.
    filter_dilation
        Triple of filter dilation factors applied to each input image.
        Use ``(1, 1, 1)`` to disable dilation.

    Notes
    -----
    The op requires inputs, filters and outputs to be C-contiguous;
    wrap arguments with :func:`gpu_contiguous
    <theano.gpuarray.basic_ops.gpu_contiguous>` if needed.

    Either enable the Theano flag `optimizer_including=conv_gemm` to
    replace convolutions with `GpuCorr3dMM` automatically, or call
    `GpuCorr3dMM(subsample=...)(image, filters)` directly. Note that
    it computes a correlation -- for a true convolution, flip the
    filters as `filters[:,:,::-1,::-1,::-1]`.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        super(GpuCorr3dMM, self).__init__(border_mode, subsample,
                                          filter_dilation)

    def make_node(self, img, kern):
        # Move both inputs to a common GPU context, then validate rank.
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        # Output: (batch, num_filters, rows, cols, slices); only the
        # first two axes can inherit broadcastability from the inputs.
        out_bcast = [img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        # Forward pass: correlate bottom with weights, write into top.
        return super(GpuCorr3dMM, self).c_code_helper(
            bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        grad_inputs_op = GpuCorr3dMM_gradInputs(self.border_mode,
                                                self.subsample,
                                                self.filter_dilation)
        grad_weights_op = GpuCorr3dMM_gradWeights(self.border_mode,
                                                  self.subsample,
                                                  self.filter_dilation)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-3:])
        d_weights = grad_weights_op(bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
    """
    Gradient wrt. filters for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        super(GpuCorr3dMM_gradWeights, self).__init__(
            border_mode, subsample, filter_dilation)

    def make_node(self, img, topgrad, shape=None):
        """Build the Apply node for the filter gradient.

        `shape` (three scalar variables giving the kernel size) is
        mandatory whenever it cannot be inferred from `img` and `topgrad`,
        i.e. for strided correlation or "half" padding.
        """
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) or self.border_mode == "half":
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1, 1)'
                                 ' or border_mode == "half"')
            height_width_depth = [shape[0], shape[1], shape[2]]
            for dim in height_width_depth:
                assert dim.ndim == 0
        else:
            height_width_depth = []
        out_bcast = [topgrad.type.broadcastable[1],
                     img.type.broadcastable[1]] + [False] * 3
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, topgrad] + height_width_depth, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        """Generate C code for the weight-gradient pass via the helper."""
        bottom, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        weights, = out_
        return super(GpuCorr3dMM_gradWeights, self).c_code_helper(
            bottom, weights, top, "backprop weights", sub,
            height, width, depth)

    def grad(self, inp, grads):
        """Gradients wrt. `img` and `topgrad`; shape inputs (if any) are
        disconnected."""
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        grad_inputs_op = GpuCorr3dMM_gradInputs(
            self.border_mode, self.subsample, self.filter_dilation)
        forward_op = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-3:])
        d_top = forward_op(bottom, weights)
        if len(inp) == 5:
            d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3
        else:
            d_height_width_depth = ()
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            # no connection to height, width, depth
            return [[1], [1], [0], [0], [0]]
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
    """
    Gradient wrt. inputs for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        super(GpuCorr3dMM_gradInputs, self).__init__(
            border_mode, subsample, filter_dilation)

    def make_node(self, kern, topgrad, shape=None):
        """Build the Apply node for the input gradient.

        `shape` (three scalar variables giving the image size) is
        mandatory for strided correlation, where the input shape cannot
        be recovered from `kern` and `topgrad` alone.
        """
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1):
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1, 1)')
            height_width_depth = [shape[0], shape[1], shape[2]]
            for dim in height_width_depth:
                assert dim.ndim == 0
        else:
            height_width_depth = []
        out_bcast = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1]] + [False] * 3
        out_type = GpuArrayType(dtype=topgrad.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [kern, topgrad] + height_width_depth, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        """Generate C code for the input-gradient pass via the helper."""
        weights, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        bottom, = out_
        return super(GpuCorr3dMM_gradInputs, self).c_code_helper(
            bottom, weights, top, "backprop inputs", sub,
            height, width, depth)

    def grad(self, inp, grads):
        """Gradients wrt. `kern` and `topgrad`; shape inputs (if any) are
        disconnected."""
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        grad_weights_op = GpuCorr3dMM_gradWeights(
            self.border_mode, self.subsample, self.filter_dilation)
        forward_op = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation)
        d_weights = grad_weights_op(bottom, top, weights.shape[-3:])
        d_top = forward_op(bottom, weights)
        if len(inp) == 5:
            d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3
        else:
            d_height_width_depth = ()
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            # no connection to height, width, depth
            return [[1], [1], [0], [0], [0]]
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Rewrite a GpuGemv node to its in-place variant when the output
    buffer can safely be reused (decided by `inplace_allocempty`)."""
    replacement = gpugemv_inplace(*inputs)
    return [replacement]
......
#section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
// Unfold one (channels, height, width, depth) image into the column matrix
// of shape (channels*kernel_h*kernel_w*kernel_d) x
// (height_col*width_col*depth_col), honouring stride, padding and dilation.
// One logical thread per output column position, grid-stride looped over n.
KERNEL void dilated_im3d2col_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_i0 * data_im,
                                    const ga_size data_im_offset,
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col, d_col).
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    // Corner of the receptive field.  NOTE(review): ga_size appears to be
    // unsigned, so a position inside the padding wraps to a huge value and
    // is rejected by the `< height` style bounds tests; the `>= 0` tests
    // below are then vacuous -- confirm ga_size signedness.
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
      h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
      h_offset * (width * depth) + w_offset * depth + d_offset;
    // Copy one kernel-sized (dilated) patch; padded positions are zero.
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i * dilation_h;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j * dilation_w;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k * dilation_d;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
                          data_im_ptr[i * dilation_h * (width * depth) +
                                      j * dilation_w * depth +
                                      k * dilation_d] : 0;
          // Successive kernel taps are height_col*width_col*depth_col apart
          // in the column buffer (row-major over kernel taps).
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
// Non-dilated variant of dilated_im3d2col_kernel (dilation == 1 fast path):
// unfolds one image into the im2col column matrix with stride and padding.
KERNEL void im3d2col_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_i0 * data_im,
                            const ga_size data_im_offset,
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col, d_col).
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    // Receptive-field corner; may "underflow" into padding (see note in the
    // dilated kernel about unsigned wrap-around of ga_size).
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
      h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
      h_offset * (width * depth) + w_offset * depth + d_offset;
    // Copy one kernel-sized patch; out-of-bounds (padded) taps read as 0.
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
                          data_im_ptr[i * (width * depth) + j * depth + k] : 0;
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// Fold the column matrix back into one image (inverse of im3d2col) with
// dilation support.  One logical thread per *image* element; it sums every
// column entry that the element contributed to, so no atomics are needed.
KERNEL void dilated_col2im3d_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_i0 * data_col,
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size channels,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_i0 * data_im,
                                    const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // Position in the padded image that this thread accumulates.
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    // Effective (dilated) kernel footprint.
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    ga_size kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
    // compute the start and end of the output
    const ga_size d_col_start =
      (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          // Offset of this image element inside the candidate patch; it is
          // a real kernel tap only when divisible by the dilation.
          ga_size h_k = (h_im - h_col * stride_h);
          ga_size w_k = (w_im - w_col * stride_w);
          ga_size d_k = (d_im - d_col * stride_d);
          if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
            h_k /= dilation_h;
            w_k /= dilation_w;
            d_k /= dilation_d;
            ga_size data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
              h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
              w_k * kernel_d * height_col * width_col * depth_col +
              d_k * height_col * width_col * depth_col +
              h_col * width_col * depth_col +
              w_col * depth_col +
              d_col;
            val += data_col[data_col_index];
          }
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// Non-dilated col2im fold (dilation == 1 fast path): one thread per image
// element sums all column entries it contributed to; no atomics required.
KERNEL void col2im3d_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_i0 * data_col,
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size channels,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_i0 * data_im,
                            const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // Position in the padded image that this thread accumulates.
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    // compute the start and end of the output
    const ga_size d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // Precomputed base offset and per-axis coefficients so the inner loop
    // indexes data_col with one multiply-add per axis (Caffe's scheme).
    ga_size offset =
      (c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
       w_im * kernel_d + d_im) * height_col * width_col * depth_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
    ga_size coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
    ga_size coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code_struct
// Host-side launcher: unfold one image (at data_im + data_im_offset) into
// the preallocated column buffer data_col.  Dispatches to the dilated or
// the plain kernel.  Returns a gpuarray error code; on failure a Python
// RuntimeError is already set.
int im3d2col(const size_t max_threads_dim,
             gpudata * data_im, const size_t data_im_offset, const size_t channels,
             const size_t height, const size_t width, const size_t depth,
             const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
             const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
             const size_t pad_h, const size_t pad_w, const size_t pad_d,
             const size_t stride_h, const size_t stride_w, const size_t stride_d,
             gpudata * data_col) {
  // We are going to launch channels * height_col * width_col * depth_col
  // kernels, each kernel responsible for copying a single-channel grid.
  // Output spatial extents follow the standard convolution shape formula
  // with the dilated ("effective") kernel size.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  size_t dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
  size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
  size_t num_kernels = channels * height_col * width_col * depth_col;
  size_t threads_per_block = max_threads_dim;
  // Ceiling division; the kernels grid-stride loop, so any grid size works.
  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  int err;
  GpuKernel *kernel;
  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
    err = dilated_im3d2col_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
      pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
      width_col, depth_col, data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im3d2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im3d2col_kernel, err));
    }
  }
  else{
    err = im3d2col_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
      data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im3d2col_kernel: %s.",
                   GpuKernel_error(&k_im3d2col_kernel, err));
    }
  }
  return err;
}
// Host-side launcher for the inverse fold: accumulate the column buffer
// data_col back into one image (at data_im + data_im_offset).  Dispatches
// to the dilated or the plain kernel.  Returns a gpuarray error code; on
// failure a Python RuntimeError is already set.
int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t channels,
             const size_t height, const size_t width, const size_t depth,
             const size_t patch_h, const size_t patch_w, const size_t patch_d,
             const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
             const size_t pad_h, const size_t pad_w, const size_t pad_d,
             const size_t stride_h, const size_t stride_w, const size_t stride_d,
             gpudata * data_im, const size_t data_im_offset) {
  size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  size_t dil_patch_d = (patch_d - 1) * dilation_d + 1;
  size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  size_t depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
  // One thread per *image* element (not per column element).
  size_t num_kernels = channels * height * width * depth;
  size_t threads_per_block = max_threads_dim;
  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
  int err;
  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
    err = dilated_col2im3d_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
      data_im, data_im_offset);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im3d_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im3d_kernel, err));
    }
  }
  else{
    err = col2im3d_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
      height_col, width_col, depth_col, data_im, data_im_offset);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im3d_kernel: %s.",
                   GpuKernel_error(&k_col2im3d_kernel, err));
    }
  }
  return err;
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
// Compute a 3D correlation (or one of its gradients) as im2col + GEMM.
//
// direction: 0 = forward (bottom, weight -> top),
//            1 = gradient wrt. weights (bottom, top -> weight),
//            2 = gradient wrt. inputs (weight, top -> bottom).
// dH/dW/dD: strides; dilH/dilW/dilD: filter dilation; padH/padW/padD: padding.
//
// All three arrays must be 5D and C-contiguous; the array selected by
// `direction` is overwritten and returned (borrowed, refcount unchanged).
// Returns NULL with a Python exception set on error.
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
  if (PyGpuArray_NDIM(bottom) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0],
                 PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2],
                 PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0],
                 PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2],
                 PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0],
                 PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2],
                 PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }
  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
  const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
  const size_t topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1;
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4]) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 "  bottom shape: %ld %ld %ld %ld %ld\n"
                 "  weight shape: %ld %ld %ld %ld %ld\n"
                 "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }
  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }
  // Get the max threads per blocks
  size_t max_threads_dim;
  err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
  if (err != GA_NO_ERROR){
    PyErr_Format(PyExc_RuntimeError,
                 "Could not fetch max_threads_dim.");
    return NULL;
  }
  // Create temporary columns: one im2col matrix, reused for every batch item.
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                         bottom->ga.typecode,
                                                         GA_C_ORDER,
                                                         bottom->context,
                                                         Py_None);
  if (NULL == col)
  {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }
  // Define some useful variables
  // Per-batch-item element strides (byte strides divided by element size).
  const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  // GEMM dimensions: M_ x K_ (weights) times K_ x N_ (columns).
  const size_t K_ = col_dim[0];
  const size_t N_ = col_dim[1];
  const size_t M_ = nFilters;
  // NOTE(review): the constants and the gpublas_sgemm calls below are
  // single precision; presumably only float32 arrays reach this code
  // path -- confirm against the callers' dtype handling.
  const DTYPE_INPUT_0 one = 1.0f;
  const DTYPE_INPUT_0 zero = 0.0f;
  // NOTE(review): if direction is not 0, 1 or 2, `output` stays
  // uninitialized and is returned below; callers appear to only pass
  // 0-2 -- confirm.
  PyGpuArrayObject *output;
  if (direction == 0) {  // forward pass
    output = top;
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                     bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // (cb_fortran: column-major convention, so operands appear swapped.)
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                          N_, M_, K_, one,
                          col->ga.data, 0, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          top->ga.data, n * top_stride, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 1) {  // backprop wrt. weights
    output = weight;
    // valid convolution: im3col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                     bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                          K_, M_, N_, one,
                          col->ga.data, 0, N_,
                          top->ga.data, n * top_stride, N_,
                          (n == 0) ? zero : one,
                          weight->ga.data, 0, K_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 2) {  // backprop wrt. inputs
    output = bottom;
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                          N_, K_, M_, one,
                          top->ga.data, n * top_stride, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          col->ga.data, 0, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
      // col2im3d back to the data
      err = col2im3d(max_threads_dim,
                     col->ga.data, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                     dH, dW, dD, bottom->ga.data, n * bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  // Free temporary columns
  Py_DECREF(col);
  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
#section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
// 2D analogue of dilated_im3d2col_kernel: unfold one (channels, height,
// width) image into the im2col column matrix with stride, padding and
// dilation.  One logical thread per column position, grid-stride looped.
KERNEL void dilated_im2col_kernel(const ga_size n,
                                  GLOBAL_MEM const DTYPE_i0 * data_im,
                                  const ga_size data_im_offset,
                                  const ga_size height, const ga_size width,
                                  const ga_size kernel_h, const ga_size kernel_w,
                                  const ga_size dilation_h, const ga_size dilation_w,
                                  const ga_size pad_h, const ga_size pad_w,
                                  const ga_size stride_h, const ga_size stride_w,
                                  const ga_size height_col, const ga_size width_col,
                                  GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col).
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w;
    // Receptive-field corner; may land in the padding (unsigned wrap of
    // ga_size is caught by the `< height`/`< width` tests below).
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    // Copy one kernel-sized (dilated) patch; padded taps read as 0.
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i * dilation_h;
        ga_size w_im = w_offset + j * dilation_w;
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
          data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * :
// Non-dilated 2D im2col (dilation == 1 fast path): unfold one image into
// the column matrix with stride and padding.
KERNEL void im2col_kernel(const ga_size n,
                          GLOBAL_MEM const DTYPE_i0 * data_im,
                          const ga_size data_im_offset,
                          const ga_size height, const ga_size width,
                          const ga_size kernel_h, const ga_size kernel_w,
                          const ga_size pad_h, const ga_size pad_w,
                          const ga_size stride_h, const ga_size stride_w,
                          const ga_size height_col, const ga_size width_col,
                          GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col).
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w;
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    // Copy one kernel-sized patch; out-of-bounds (padded) taps read as 0.
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i ;
        ga_size w_im = w_offset + j ;
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
          data_im_ptr[i * width + j] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
/*
 * dilated_col2im_kernel: inverse of dilated_im2col_kernel -- accumulates
 * the column matrix back into the image.  One thread owns exactly one
 * image element (n = channels * height * width) and gathers every column
 * entry that maps onto it; this makes the sum race-free without atomics.
 */
KERNEL void dilated_col2im_kernel(const ga_size n,
                                  GLOBAL_MEM const DTYPE_i0 * data_col,
                                  const ga_size height, const ga_size width, const ga_size channels,
                                  const ga_size kernel_h, const ga_size kernel_w,
                                  const ga_size dilation_h, const ga_size dilation_w,
                                  const ga_size pad_h, const ga_size pad_w,
                                  const ga_size stride_h, const ga_size stride_w,
                                  const ga_size height_col, const ga_size width_col,
                                  GLOBAL_MEM DTYPE_i0 * data_im,
                                  const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // coordinates of this image element in the *padded* image
    const ga_size w_im = index % width + pad_w;
    const ga_size h_im = (index / width) % height + pad_h;
    const ga_size c_im = index / (width * height);
    // effective kernel extent once dilation is applied
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the range of output positions whose receptive field
    // contains (h_im, w_im)
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        // offset of (h_im, w_im) inside this output position's patch
        ga_size h_k = (h_im - h_col * stride_h);
        ga_size w_k = (w_im - w_col * stride_w);
        // only dilation-aligned offsets correspond to a kernel tap
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          ga_size data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
                                    height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size :
/*
 * col2im_kernel: accumulate the column matrix back into the image for the
 * dilation == 1 case.  One thread per image element (gather, no atomics).
 */
KERNEL void col2im_kernel(const ga_size n,
                          GLOBAL_MEM const DTYPE_i0 * data_col,
                          const ga_size height, const ga_size width, const ga_size channels,
                          const ga_size kernel_h, const ga_size kernel_w,
                          const ga_size pad_h, const ga_size pad_w,
                          const ga_size stride_h, const ga_size stride_w,
                          const ga_size height_col, const ga_size width_col,
                          GLOBAL_MEM DTYPE_i0 * data_im,
                          const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // coordinates of this image element in the *padded* image
    const ga_size w_im = index % width + pad_w;
    const ga_size h_im = (index / width) % height + pad_h;
    const ga_size c_im = index / (width * height);
    // compute the range of output positions whose receptive field
    // contains (h_im, w_im)
    const ga_size w_col_start =
      (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // equivalent implementation, no dilation: precompute a base offset
    // and per-step coefficients so the inner loop walks data_col with
    // two multiply-adds instead of recomputing the full 5-term index
    ga_size offset =
      (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    ga_size coeff_w_col = (1 - stride_w * height_col * width_col);
    for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code_struct
/*
 * im2col: host-side dispatcher that launches the appropriate im2col GPU
 * kernel (plain or dilated) over the whole image.
 *
 * data_im + data_im_offset points at one image of shape
 * (channels, height, width); data_col receives the column matrix of shape
 * (channels * kernel_h * kernel_w, height_col * width_col).
 *
 * Returns GA_NO_ERROR on success; on failure, sets a Python RuntimeError
 * and returns the gpuarray error code.
 *
 * Fix: removed the unused local `GpuKernel *kernel;`.
 */
int im2col(const size_t max_threads_dim,
           gpudata * data_im, const size_t data_im_offset, const size_t channels,
           const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_h, const size_t pad_w,
           const size_t stride_h, const size_t stride_w,
           gpudata * data_col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  size_t num_kernels = channels * height_col * width_col;
  size_t threads_per_block = max_threads_dim;
  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  int err;
  if (dilation_h != 1 || dilation_w != 1) {
    err = dilated_im2col_kernel_call(
        1, &threads_per_block, &n_blocks, 0,
        num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im2col_kernel, err));
    }
  }
  else {
    // dilation of 1: use the cheaper specialized kernel
    err = im2col_kernel_call(
        1, &threads_per_block, &n_blocks, 0,
        num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
        pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im2col_kernel: %s.",
                   GpuKernel_error(&k_im2col_kernel, err));
    }
  }
  return err;
}
/*
 * col2im: host-side dispatcher that launches the appropriate col2im GPU
 * kernel (plain or dilated), scattering the column matrix in data_col
 * back into one image at data_im + data_im_offset.
 *
 * Returns GA_NO_ERROR on success; on failure, sets a Python RuntimeError
 * and returns the gpuarray error code.
 */
int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channels,
           const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_h, const size_t pad_w, const size_t stride_h,
           const size_t stride_w, gpudata * data_im, const size_t data_im_offset) {
  /* effective patch extent once dilation is applied */
  const size_t eff_patch_h = (patch_h - 1) * dilation_h + 1;
  const size_t eff_patch_w = (patch_w - 1) * dilation_w + 1;
  const size_t out_h = (height + 2 * pad_h - eff_patch_h) / stride_h + 1;
  const size_t out_w = (width + 2 * pad_w - eff_patch_w) / stride_w + 1;
  /* To avoid involving atomic operations, we launch one thread per
     bottom (image) element and have the kernel sum over the top
     (column) entries that map onto it. */
  const size_t total_threads = channels * height * width;
  size_t block_size = max_threads_dim;
  size_t grid_size = (total_threads + block_size - 1) / block_size;
  int rc;
  if (dilation_h == 1 && dilation_w == 1) {
    rc = col2im_kernel_call(
        1, &block_size, &grid_size, 0,
        total_threads, data_col, height, width, channels, patch_h, patch_w,
        pad_h, pad_w, stride_h, stride_w,
        out_h, out_w, data_im, data_im_offset);
    if (rc != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im_kernel: %s.",
                   GpuKernel_error(&k_col2im_kernel, rc));
    }
  } else {
    rc = dilated_col2im_kernel_call(
        1, &block_size, &grid_size, 0,
        total_threads, data_col, height, width, channels, patch_h, patch_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
        out_h, out_w, data_im, data_im_offset);
    if (rc != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im_kernel, rc));
    }
  }
  return rc;
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
//
// corrMM: 2D correlation implemented as im2col/col2im plus GEMM.
//
// `direction` selects which of the three tensors is computed from the
// other two; that tensor is returned (aliased, refcount unchanged):
//   direction == 0: forward pass -- compute top from bottom and weight
//   direction == 1: backprop wrt. weights -- accumulate weight over the batch
//   direction == 2: backprop wrt. inputs -- compute bottom from weight and top
// dH/dW: subsampling (stride); dilH/dilW: filter dilation; padH/padW: padding.
//
// All three arrays must be 4D and C-contiguous.  On error, returns NULL
// with a Python exception set.
// NOTE(review): always calls gpublas_sgemm while the scalars are typed
// DTYPE_INPUT_0 -- presumably this path only supports float32; confirm.
// NOTE(review): a `direction` value other than 0/1/2 would return an
// uninitialized pointer; callers must only pass 0, 1 or 2.
PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         PyGpuArrayObject *const weight,
                         PyGpuArrayObject *const top,
                         const size_t direction,
                         const size_t dH = 1,
                         const size_t dW = 1,
                         const size_t dilH = 1,
                         const size_t dilW = 1,
                         const size_t padH = 0,
                         const size_t padW = 0)
{
    // Validate rank and contiguity of all three operands
    if (PyGpuArray_NDIM(bottom) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires bottom to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(bottom)[0],
                PyGpuArray_STRIDES(bottom)[1],
                PyGpuArray_STRIDES(bottom)[2],
                PyGpuArray_STRIDES(bottom)[3]);
        return NULL;
    }
    if (PyGpuArray_NDIM(weight) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires weight to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(weight)[0],
                PyGpuArray_STRIDES(weight)[1],
                PyGpuArray_STRIDES(weight)[2],
                PyGpuArray_STRIDES(weight)[3]);
        return NULL;
    }
    if (PyGpuArray_NDIM(top) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires top to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(top)[0],
                PyGpuArray_STRIDES(top)[1],
                PyGpuArray_STRIDES(top)[2],
                PyGpuArray_STRIDES(top)[3]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
    const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
    const size_t topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||
            topWidth != PyGpuArray_DIMS(top)[3]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM shape inconsistency:\n"
                "  bottom shape: %ld %ld %ld %ld\n"
                "  weight shape: %ld %ld %ld %ld\n"
                "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
                batchSize, nChannels, bottomHeight, bottomWidth,
                nFilters, nChannels, kH, kW,
                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                batchSize, nFilters, topHeight, topWidth);
        return NULL;
    }
    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }
    // Get the max threads per blocks
    size_t max_threads_dim;
    err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
    if (err != GA_NO_ERROR){
        PyErr_Format(PyExc_RuntimeError,
                     "Could not fetch max_threads_dim.");
        return NULL;
    }
    // Create temporary columns: one im2col matrix, reused for every image
    // in the batch
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH;
    col_dim[1] = topHeight * topWidth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorrMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables
    // per-image element offsets into the batched bottom/top arrays
    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    // GEMM dims: weight is (M_ x K_), col is (K_ x N_), top[n] is (M_ x N_)
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;
    const DTYPE_INPUT_0 one = 1.0f;
    const DTYPE_INPUT_0 zero = 0.0f;
    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(max_threads_dim,
                         bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm: top[n] = weight . col
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                N_, M_, K_, one,
                                col->ga.data, 0, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                top->ga.data, n * top_stride, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(max_threads_dim,
                         bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                                K_, M_, N_, one,
                                col->ga.data, 0, N_,
                                top->ga.data, n * top_stride, N_,
                                (n == 0) ? zero : one,
                                weight->ga.data, 0, K_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns: col = weight^T . top[n]
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, one,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                col->ga.data, 0, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
            // col2im back to the data
            err = col2im(max_threads_dim,
                         col->ga.data, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
                         dH, dW, bottom->ga.data, n * bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
......@@ -1877,8 +1877,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
return result
@register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights,
......@@ -1922,8 +1920,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
return [rval]
@register_opt2([AbstractConv3d, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv3d,
AbstractConv3d_gradWeights,
......@@ -1967,7 +1963,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
return [rval]
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
......@@ -1979,7 +1974,6 @@ def local_abstractconv_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
......@@ -1991,7 +1985,6 @@ def local_abstractconv_gw_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs)
......
......@@ -22,7 +22,8 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
AbstractConv3d,
......@@ -43,7 +44,9 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace)
gpugemv_no_inplace, gpugemv_inplace,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
gpu_sparse_block_outer,
gpu_sparse_block_outer_inplace,
......@@ -1296,6 +1299,287 @@ def local_inplace_sparseblockouter(node):
return [GpuSparseBlockOuter(inplace=True)(*node.inputs)]
# Move to Gpu optimization
@local_optimizer([GpuFromHost,
                  AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs,
                  AbstractConv3d,
                  AbstractConv3d_gradWeights,
                  AbstractConv3d_gradInputs])
def local_conv_gpu_conv(node):
    """
    Move abstract convolutions across GPU transfers:

    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)

    Returns a one-element replacement list, or None when nothing applies.
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           BaseAbstractConv):
            conv = host_input.owner.op
            inps = list(host_input.owner.inputs)
            ctx = infer_context_name(*inps)
            # move image and kernel to the GPU; any extra shape inputs stay
            inps[0] = as_gpuarray_variable(inps[0], context_name=ctx)
            inps[1] = as_gpuarray_variable(inps[1], context_name=ctx)
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            out = theano.tensor.patternbroadcast(out,
                                                 node.outputs[0].broadcastable)
            return [out]

    if isinstance(node.op, BaseAbstractConv):
        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        inp1 = node.inputs[0]
        inp2 = node.inputs[1]
        if ((isinstance(inp1.type, GpuArrayType) and
             isinstance(inp2.type, GpuArrayType))):
            # Both inputs are already directly on the GPU, nothing to do
            return

        inp1_on_gpu = (isinstance(inp1.type, GpuArrayType) or
                       (inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
        inp2_on_gpu = (isinstance(inp2.type, GpuArrayType) or
                       (inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))

        if inp1_on_gpu or inp2_on_gpu:
            conv = node.op
            inps = list(node.inputs)
            ctx = infer_context_name(*inps)
            inps[0] = as_gpuarray_variable(inps[0], context_name=ctx)
            inps[1] = as_gpuarray_variable(inps[1], context_name=ctx)
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            out = theano.tensor.patternbroadcast(
                out,
                node.outputs[0].broadcastable)
            # If the original output was on CPU, we have to transfer it
            if isinstance(node.outputs[0].type, tensor.TensorType):
                return [tensor.as_tensor_variable(out)]
            else:
                return [out]


register_opt()(local_conv_gpu_conv)
# CorrMM opt
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node):
    """Lower an AbstractConv2d whose inputs are on the GPU to a
    GpuCorrMM-based op.

    Full convolution with unit subsampling is rewritten as
    GpuCorrMM_gradInputs on a transposed kernel; otherwise GpuCorrMM is
    used, optionally replaced by GpuCorrMM_gradWeights when a shape-based
    cost heuristic predicts it is faster.

    Fix: the heuristic reads both ``imshp[0]`` and ``imshp[1]`` but
    previously only checked ``imshp[:1]`` for None, which could multiply
    by None when the number of input channels is unknown; check
    ``imshp[:2]`` instead.
    """
    if not isinstance(node.op, AbstractConv2d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None
    ctx = infer_context_name(img, kern)

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    if ((border_mode == 'full') and (subsample == (1, 1))):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid',
                                    subsample,
                                    filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode,
                         subsample,
                         filter_dilation)(gpu_contiguous(img),
                                          gpu_contiguous(kern))

        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
        # is larger than inputChannels * outputHeight * outputWidth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1, 1)) and (filter_dilation == (1, 1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-2:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half"):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1]
            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[1] + 1))
            if (None not in node.op.imshp[:2]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                rval = GpuCorrMM_gradWeights(border_mode,
                                             subsample,
                                             filter_dilation)(
                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
                # (we need to wrap the result in as_gpuarray_variable,
                # because we are not allowed to replace a GpuArray with
                # a DimShuffle instance in a graph optimization)
                rval = as_gpuarray_variable(
                    rval.dimshuffle(1, 0, 2, 3),
                    context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
    """Lower an AbstractConv3d whose inputs are on the GPU to a
    GpuCorr3dMM-based op.

    Mirrors ``local_abstractconv_gemm`` for the 3D case: full convolution
    with unit subsampling becomes GpuCorr3dMM_gradInputs; otherwise
    GpuCorr3dMM, optionally replaced by GpuCorr3dMM_gradWeights when the
    shape heuristic predicts it is faster.

    Fix: the heuristic reads both ``imshp[0]`` and ``imshp[1]`` but
    previously only checked ``imshp[:1]`` for None; check ``imshp[:2]``.
    """
    if not isinstance(node.op, AbstractConv3d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None
    ctx = infer_context_name(img, kern)

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    if ((border_mode == 'full') and (subsample == (1, 1, 1))):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3, 4)
        # call GpuCorr3dMM_gradInputs
        rval = GpuCorr3dMM_gradInputs('valid',
                                      subsample,
                                      filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # By default use GpuCorr3dMM
        rval = GpuCorr3dMM(border_mode,
                           subsample,
                           filter_dilation)(gpu_contiguous(img),
                                            gpu_contiguous(kern))

        # call GpuCorr3dMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth * kernelDepth
        # is larger than inputChannels * outputHeight * outputWidth * outputDepth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1, 1, 1)) and (filter_dilation == (1, 1, 1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-3:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half"):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1] * node.op.kshp[2]
            prod2 = ((node.op.imshp[-3] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-2] - node.op.kshp[1] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[2] + 1))
            if (None not in node.op.imshp[:2]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                rval = GpuCorr3dMM_gradWeights(border_mode,
                                               subsample,
                                               filter_dilation)(
                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
                # (we need to wrap the result in as_gpuarray_variable,
                # because we are not allowed to replace a GpuArray with
                # a DimShuffle instance in a graph optimization)
                rval = as_gpuarray_variable(
                    rval.dimshuffle(1, 0, 2, 3, 4),
                    context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweights_gemm(node):
    """Lower AbstractConv2d_gradWeights on the GPU to
    GpuCorrMM_gradWeights (flipping the result when filter_flip is set).
    """
    if not isinstance(node.op, AbstractConv2d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)

    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample,
                                 filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1]
    # restore broadcast pattern and make sure the result is a GPU variable
    # (the flip above is a Subtensor, not a GPU op)
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweights_gemm(node):
    """Lower AbstractConv3d_gradWeights on the GPU to
    GpuCorr3dMM_gradWeights (flipping the result when filter_flip is set).
    """
    if not isinstance(node.op, AbstractConv3d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)

    rval = GpuCorr3dMM_gradWeights(border_mode=node.op.border_mode,
                                   subsample=node.op.subsample,
                                   filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1, ::-1]
    # restore broadcast pattern and make sure the result is a GPU variable
    # (the flip above is a Subtensor, not a GPU op)
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
    """Lower AbstractConv2d_gradInputs on the GPU to
    GpuCorrMM_gradInputs (flipping the kernel first when filter_flip is
    set).
    """
    if not isinstance(node.op, AbstractConv2d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1]

    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
                                filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
    """Lower AbstractConv3d_gradInputs on the GPU to
    GpuCorr3dMM_gradInputs (flipping the kernel first when filter_flip is
    set).
    """
    if not isinstance(node.op, AbstractConv3d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1, ::-1]

    rval = GpuCorr3dMM_gradInputs(border_mode=node.op.border_mode,
                                  subsample=node.op.subsample,
                                  filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
......@@ -1493,3 +1777,53 @@ optdb.register('gpua_scanOp_make_inplace',
'gpuarray',
'inplace',
'scan')
# Register GPU convolution implementations.
# They are tried in a specific order (lower position number first) so we
# can control which ones take precedence over others.
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt()(abstractconv_groupopt)

# cuDNN is first (position 20), but only applies if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstractconv_cudnn, local_abstractconv_gw_cudnn,
                  local_abstractconv_gi_cudnn)  # noqa: 402
abstractconv_groupopt.register('local_abstractconv_dnn',
                               local_abstractconv_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstractconv_groupopt.register('local_abstractconv_gw_dnn',
                               local_abstractconv_gw_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstractconv_groupopt.register('local_abstractconv_gi_dnn',
                               local_abstractconv_gi_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last (position 30) to catch all
# remaining cases.
# It can be disabled by excluding 'conv_gemm'.
abstractconv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv_gradweights_gemm',
                               local_abstractconv_gradweights_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gradweights_gemm',
                               local_abstractconv3d_gradweights_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv_gradinputs',
                               local_abstractconv_gradinputs_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
                               local_abstractconv3d_gradinputs_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
......@@ -7,6 +7,9 @@ import numpy
from theano.tensor.nnet.tests import test_abstract_conv
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..dnn import dnn_available, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI
from ..blas import (
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs,
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from .config import mode_with_gpu, test_ctx_name
from pygpu import gpuarray
......@@ -80,6 +83,72 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
filter_flip=flip, target_op=GpuDnnConvGradI)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
    """Run the generic AbstractConv2d test cases against the
    GpuCorrMM ops (cuDNN excluded from the compilation mode)."""

    @classmethod
    def setup_class(cls):
        test_abstract_conv.BaseTestConv2d.setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # exclude cudnn so the conv_gemm optimizations are exercised
        cls.mode = mode_with_gpu.excluding('cudnn')

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        # i/f: input/filter shapes, s: subsample, b: border_mode,
        # fd: filter dilation
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(inputs_shape=i, filters_shape=f,
                     subsample=s, verify_grad=True, mode=mode,
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip, target_op=(GpuCorrMM,
                                                  GpuCorrMM_gradWeights,
                                                  GpuCorrMM_gradInputs),
                     filter_dilation=fd)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode,
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip,
                            target_op=GpuCorrMM_gradWeights,
                            filter_dilation=fd)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip,
                           target_op=GpuCorrMM_gradInputs,
                           filter_dilation=fd)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
    """Run the generic AbstractConv3d test cases against the
    GpuCorr3dMM ops (cuDNN excluded from the compilation mode)."""

    @classmethod
    def setup_class(cls):
        test_abstract_conv.BaseTestConv3d.setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # exclude cudnn so the conv_gemm optimizations are exercised
        cls.mode = mode_with_gpu.excluding('cudnn')

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        # i/f: input/filter shapes, s: subsample, b: border_mode,
        # fd: filter dilation
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(inputs_shape=i, filters_shape=f,
                     subsample=s, verify_grad=True, mode=mode,
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip, target_op=(GpuCorr3dMM,
                                                  GpuCorr3dMM_gradWeights,
                                                  GpuCorr3dMM_gradInputs),
                     filter_dilation=fd)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode,
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip,
                            target_op=GpuCorr3dMM_gradWeights,
                            filter_dilation=fd)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip,
                           target_op=GpuCorr3dMM_gradInputs,
                           filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self):
self.input = gpu_ftensor4()
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
class TestCorrMM(unittest.TestCase):
    """Check the gpuarray GpuCorrMM ops against the CPU CorrMM reference.

    Forward convolution, gradient w.r.t. weights and gradient w.r.t.
    inputs are each computed on random float32 data with both the GPU
    and the CPU op and compared numerically.
    """

    def run_conv_valid(self, inputs_shape, filters_shape,
                       border_mode='valid',
                       filter_dilation=(1, 1),
                       subsample=(1, 1),
                       verify_grad=False):
        """Compare GpuCorrMM with CorrMM on one forward configuration.

        Shapes are given channel-last, (batch, row, col, channel), and
        converted here to the channel-first layout the ops expect.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        conv_ref = CorrMM(border_mode=border_mode,
                          filter_dilation=filter_dilation,
                          subsample=subsample)(inputs, filters)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorrMM(border_mode=border_mode,
                         filter_dilation=filter_dilation,
                         subsample=subsample)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(GpuCorrMM(border_mode=border_mode,
                                      filter_dilation=filter_dilation,
                                      subsample=subsample),
                            [inputs_val, filters_val])

    def test_valid(self):
        # The original ran the (2, 2) and (3, 3) cases twice each;
        # run each distinct subsampling exactly once.
        for subsample in [(1, 1), (2, 2), (3, 3), (3, 2), (1, 2)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                                filters_shape=(10, 6, 12, 1),
                                subsample=subsample)

    def test_border_mode(self):
        # Cover the named modes plus explicit integer padding pairs.
        for border_mode in ['valid', 'half', 'full', (0, 0), (1, 2), (3, 2)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                                filters_shape=(10, 6, 12, 1),
                                border_mode=border_mode)

    def test_filter_dilation(self):
        inputs_shape = [16, 20, 12, 1]
        filters_shape = [10, 6, 5, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ['valid', 'half', 'full']:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode)

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 1]
        filters_shape = [1, 3, 3, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ['valid', 'half', 'full', (2, 1)]:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode,
                                    verify_grad=True)

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1)):
        """Compare GpuCorrMM_gradWeights with the CPU reference op."""
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        # Spatial filter shape; passed explicitly when subsampling makes
        # the weight shape ambiguous.
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
        if subsample == (1, 1):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 10, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            dCdH_shape=(16, 5, 1, 10),
                            subsample=(1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 10, 1),
                            filters_shape=(10, 6, 4, 1),
                            dCdH_shape=(16, 8, 4, 10),
                            subsample=(2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 10, 1),
                            filters_shape=(10, 6, 3, 1),
                            dCdH_shape=(16, 5, 3, 10),
                            subsample=(3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            dCdH_shape=(16, 8, 1, 10),
                            subsample=(2, 1))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1)):
        """Compare GpuCorrMM_gradInputs with the CPU reference op.

        ``inputs_shape`` here is the shape of the top gradient (the op's
        ``topgrad`` argument), not of the original forward inputs.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Reconstruct the forward input's spatial size; needed when
        # subsampling makes it ambiguous.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_shape = gpuarray_shared_constructor(
            numpy.array([bottom_height, bottom_width]))
        if subsample == (1, 1):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        for subsample in [(1, 1), (2, 2), (3, 3), (3, 1)]:
            self.run_gradinput(inputs_shape=(16, 15, 12, 10),
                               filters_shape=(10, 6, 12, 1),
                               subsample=subsample)
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
class TestCorr3dMM(unittest.TestCase):
    """Check the gpuarray GpuCorr3dMM ops against the CPU Corr3dMM reference.

    Forward convolution, gradient w.r.t. weights and gradient w.r.t.
    inputs are each computed on random float32 data with both the GPU
    and the CPU op and compared numerically.
    """

    def run_conv_valid(self, inputs_shape, filters_shape,
                       border_mode='valid',
                       filter_dilation=(1, 1, 1),
                       subsample=(1, 1, 1),
                       verify_grad=False):
        """Compare GpuCorr3dMM with Corr3dMM on one forward configuration.

        Shapes are given channel-last, (batch, d0, d1, d2, channel), and
        converted here to the channel-first layout the ops expect.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        conv_ref = Corr3dMM(border_mode=border_mode,
                            filter_dilation=filter_dilation,
                            subsample=subsample)(inputs, filters)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorr3dMM(border_mode=border_mode,
                           filter_dilation=filter_dilation,
                           subsample=subsample)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
                                        filter_dilation=filter_dilation,
                                        subsample=subsample),
                            [inputs_val, filters_val])

    def test_valid(self):
        self.run_conv_valid(inputs_shape=(16, 20, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1))
        # The original ran the (2, 2, 2) and (3, 3, 3) cases twice each;
        # run each distinct subsampling exactly once.
        for subsample in [(2, 2, 2), (3, 3, 3), (3, 2, 1), (1, 2, 3)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
                                filters_shape=(10, 6, 12, 4, 1),
                                subsample=subsample)

    def test_border_mode(self):
        # Cover the named modes plus explicit integer padding triples.
        for border_mode in ['valid', 'half', 'full',
                            (0, 0, 0), (1, 2, 3), (3, 2, 1)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
                                filters_shape=(10, 6, 12, 4, 1),
                                border_mode=border_mode)

    def test_filter_dilation(self):
        inputs_shape = [16, 20, 12, 15, 1]
        filters_shape = [10, 6, 5, 4, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ['valid', 'half', 'full']:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode)

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 6, 1]
        filters_shape = [1, 3, 3, 2, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ['valid', 'half', 'full', (2, 1, 3)]:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode,
                                    verify_grad=True)

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1, 1)):
        """Compare GpuCorr3dMM_gradWeights with the CPU reference op."""
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        # Spatial filter shape; passed explicitly when subsampling makes
        # the weight shape ambiguous.
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 10, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 5, 1, 13, 10),
                            subsample=(1, 1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 10, 16, 1),
                            filters_shape=(10, 6, 4, 4, 1),
                            dCdH_shape=(16, 8, 4, 7, 10),
                            subsample=(2, 2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 10, 16, 1),
                            filters_shape=(10, 6, 3, 4, 1),
                            dCdH_shape=(16, 5, 3, 5, 10),
                            subsample=(3, 3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 8, 1, 5, 10),
                            subsample=(2, 1, 3))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1, 1)):
        """Compare GpuCorr3dMM_gradInputs with the CPU reference op.

        ``inputs_shape`` here is the shape of the top gradient (the op's
        ``topgrad`` argument), not of the original forward inputs.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Reconstruct the forward input's spatial size; needed when
        # subsampling makes it ambiguous.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_depth = (inputs_shape[4] - 1) * subsample[2] + filters_shape[4]
        bottom_shape = gpuarray_shared_constructor(
            numpy.array([bottom_height, bottom_width, bottom_depth]))
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        for subsample in [(1, 1, 1), (2, 2, 2), (3, 3, 3), (3, 1, 2)]:
            self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
                               filters_shape=(10, 6, 12, 4, 1),
                               subsample=subsample)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论