提交 08957330 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Make GpuDnnConv support inplace operation.

上级 065e0f5e
...@@ -103,11 +103,11 @@ cudnnConvolutionForward_v2( ...@@ -103,11 +103,11 @@ cudnnConvolutionForward_v2(
const cudnnTensorDescriptor_t destDesc, const cudnnTensorDescriptor_t destDesc,
void *destData) { void *destData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 0.0); assert(*(float *)beta == 1.0);
return cudnnConvolutionForward(handle, srcDesc, srcData, return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData, filterDesc, filterData,
convDesc, destDesc, destData, convDesc, destDesc, destData,
CUDNN_RESULT_NO_ACCUMULATE); CUDNN_RESULT_ACCUMULATE);
} }
#define cudnnConvolutionForward cudnnConvolutionForward_v2 #define cudnnConvolutionForward cudnnConvolutionForward_v2
...@@ -124,11 +124,11 @@ cudnnConvolutionBackwardFilter_v2( ...@@ -124,11 +124,11 @@ cudnnConvolutionBackwardFilter_v2(
const cudnnFilterDescriptor_t gradDesc, const cudnnFilterDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 0.0); assert(*(float *)beta == 1.0);
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData, return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData, diffDesc, diffData,
convDesc, gradDesc, gradData, convDesc, gradDesc, gradData,
CUDNN_RESULT_NO_ACCUMULATE); CUDNN_RESULT_ACCUMULATE);
} }
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2 #define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
...@@ -146,7 +146,7 @@ cudnnConvolutionBackwardData_v2( ...@@ -146,7 +146,7 @@ cudnnConvolutionBackwardData_v2(
const cudnnTensorDescriptor_t gradDesc, const cudnnTensorDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 0.0); assert(*(float *)beta == 1.0);
return cudnnConvolutionBackwardData(handle, return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc, (cudnnFilterDescriptor_t)filterDesc,
filterData, filterData,
...@@ -155,7 +155,7 @@ cudnnConvolutionBackwardData_v2( ...@@ -155,7 +155,7 @@ cudnnConvolutionBackwardData_v2(
(cudnnConvolutionDescriptor_t)convDesc, (cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc, (cudnnTensorDescriptor_t)gradDesc,
gradData, gradData,
CUDNN_RESULT_NO_ACCUMULATE); CUDNN_RESULT_ACCUMULATE);
} }
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2 #define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
......
...@@ -2,7 +2,7 @@ import os ...@@ -2,7 +2,7 @@ import os
import numpy import numpy
import theano import theano
from theano import Apply, gof, tensor, config from theano import Apply, gof, tensor, config, Variable
from theano.scalar import as_scalar, constant from theano.scalar import as_scalar, constant
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
...@@ -16,7 +16,8 @@ from theano.sandbox.cuda.type import CudaNdarrayType ...@@ -16,7 +16,8 @@ from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
cp_on_negative_strides) cp_on_negative_strides,
gpu_alloc)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax from theano.sandbox.cuda.nnet import GpuSoftmax
...@@ -344,9 +345,9 @@ _one = constant(numpy.asarray(1.0, dtype='float32')) ...@@ -344,9 +345,9 @@ _one = constant(numpy.asarray(1.0, dtype='float32'))
def ensure_float(val, default, name): def ensure_float(val, default, name):
if val is None: if val is None:
return default.clone() return default.clone()
if not isinstnace(val, Variable): if not isinstance(val, Variable):
val = constant(val) val = constant(val)
if not isisntance(val.type, theano.scalar.Scalar): if not isinstance(val.type, theano.scalar.Scalar):
raise TypeError("%s: expected a scalar value" % (name,)) raise TypeError("%s: expected a scalar value" % (name,))
if not val.type.dtype == 'float32': if not val.type.dtype == 'float32':
raise TypeError("%s: type is not float32" % (name,)) raise TypeError("%s: type is not float32" % (name,))
...@@ -361,9 +362,9 @@ class GpuDnnConv(DnnBase, COp): ...@@ -361,9 +362,9 @@ class GpuDnnConv(DnnBase, COp):
:param kernel: :param kernel:
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
__props__ = ('workmem',) __props__ = ('workmem', 'inplace')
def __init__(self, workmem=None): def __init__(self, workmem=None, inplace=False):
""" """
:param workmem: either 'none', 'small' or 'large'. Default is :param workmem: either 'none', 'small' or 'large'. Default is
the value of :attr:`config.dnn.conv.workmem`. the value of :attr:`config.dnn.conv.workmem`.
...@@ -373,92 +374,105 @@ class GpuDnnConv(DnnBase, COp): ...@@ -373,92 +374,105 @@ class GpuDnnConv(DnnBase, COp):
if workmem is None: if workmem is None:
workmem = config.dnn.conv.workmem workmem = config.dnn.conv.workmem
self.workmem = workmem self.workmem = workmem
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large'] assert self.workmem in ['none', 'small', 'large']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self, 'workmem'): if not hasattr(self, 'workmem'):
self.workmem = 'small' self.workmem = 'none'
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self): def get_op_params(self):
if self.inplace:
inpl_def = [('CONV_INPLACE', '1')]
else:
inpl_def = []
if version() == -1: if version() == -1:
return [('CONV_ALGO', "0")] alg_def = ('CONV_ALGO', "0")
if self.workmem == 'none': else:
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM' if self.workmem == 'none':
elif self.workmem == 'small': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM' elif self.workmem == 'small':
elif self.workmem == 'large': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM' elif self.workmem == 'large':
return [('CONV_ALGO', alg)] alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
alg_def = ('CONV_ALGO', alg)
def make_node(self, img, kern, desc, alpha=None, beta=None): return [alg_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output)
if img.type.ndim != 4: if img.type.ndim != 4:
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4: if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be a 4D tensor')
if not isinstance(desc.type, CDataType) \ if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
broadcastable = (img.type.broadcastable[0], return Apply(self, [img, kern, output, desc, alpha],
kern.type.broadcastable[0], [output.type()])
False, False)
return Apply(self, [img, kern, desc, alpha, beta],
[CudaNdarrayType(broadcastable)()])
def grad(self, inp, grads): def grad(self, inp, grads):
img, kerns, desc, alpha, beta = inp img, kerns, output, desc, alpha = inp
top, = grads top, = grads
top = cp_on_negative_strides(top) top = cp_on_negative_strides(top)
d_img = GpuDnnConvGradI()(kerns, top, desc, d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
img.shape[2], img.shape[3]) d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, desc,
kerns.shape[2], kerns.shape[3])
return [d_img, d_kerns, DisconnectedType()(), DisconnectedType()(), return [d_img, d_kerns, output.zeros_like(),
DisconnectedType()()] DisconnectedType()(), DisconnectedType()()]
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc, alpha, beta # not connected to desc, alpha
return [[1], [1], [0], [0], [0]] return [[1], [1], [1], [0], [0]]
def infer_shape(self, node, shape): @staticmethod
b = shape[0][0] # Number of inputs def get_out_shape(ishape, kshape, border_mode, subsample):
h = shape[0][2] # Height of input feature maps """
w = shape[0][3] # Width of input feature maps This function computes the output shape for a convolution with
nb = shape[1][0] # Number of output feature maps the specified parameters. `ishape` and `kshape` can be symbolic
kh = shape[1][2] # Height of each filter or scalar.
kw = shape[1][3] # Width of each filter """
padh = 0 b = ishape[0] # Number of inputs
padw = 0 h = ishape[2] # Height of input feature maps
if ( w = ishape[3] # Width of input feature maps
not node.inputs[2].owner nb = kshape[0] # Number of output feature maps
or not isinstance(node.inputs[2].owner.op, GpuDnnConvDesc) kh = kshape[2] # Height of each filter
): kw = kshape[3] # Width of each filter
raise theano.tensor.basic.ShareError("case not implemented and probably not needed")
desc = node.inputs[2].owner.op sh, sw = subsample
sh, sw = desc.subsample if border_mode == 'full':
if desc.border_mode == 'full':
padh = kh - 1 padh = kh - 1
padw = kw - 1 padw = kw - 1
elif isinstance(desc.border_mode, tuple): elif isinstance(border_mode, tuple):
padh, padw = desc.border_mode padh, padw = border_mode
else: else:
assert desc.border_mode == 'valid' assert border_mode == 'valid'
padh = 0
padw = 0
return [( return (
b, nb, b, nb,
(h + 2*padh - kh)//sh + 1, (h + 2*padh - kh)//sh + 1,
(w + 2*padw - kw)//sw + 1 (w + 2*padw - kw)//sw + 1
)] )
def infer_shape(self, node, shape):
return [shape[2]]
class GpuDnnConvGradW(DnnBase, COp): class GpuDnnConvGradW(DnnBase, COp):
...@@ -470,62 +484,64 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -470,62 +484,64 @@ class GpuDnnConvGradW(DnnBase, COp):
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
__props__ = () __props__ = ('inplace',)
def __init__(self): def __init__(self, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'inplace'):
self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, desc, h, w, alpha, beta = inp img, top, output, desc, alpha = inp
kerns, = grads kerns, = grads
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, desc, d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
img.shape[2], img.shape[3]) d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, desc)
return (d_img, d_top, DisconnectedType()(), DisconnectedType()(), return (d_img, d_top, output.zeros_like(),
DisconnectedType()(), DiconnnectedType()(), DisconnectedType()(), DiconnnectedType()())
DisconnectedType()())
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc, h, w, alpha, beta # not connected to desc, alpha
return [[1], [1], [0], [0], [0], [0], [0]] return [[1], [1], [1], [0], [0]]
def make_node(self, img, topgrad, desc, h, w, alpha=None, beta=None): def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
else:
return []
def make_node(self, img, topgrad, output, desc, alpha=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
if img.type.ndim != 4: if img.type.ndim != 4:
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be 4D tensor')
if not isinstance(desc.type, CDataType) \ if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
h = as_scalar(h)
w = as_scalar(w)
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
broadcastable = [topgrad.type.broadcastable[1], return Apply(self, [img, topgrad, output, desc, alpha],
img.type.broadcastable[1], [output.type()])
False, False]
return Apply(self, [img, topgrad, desc, h, w, alpha, beta],
[CudaNdarrayType(broadcastable)()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [( return [shape[2]]
shape[1][1],
shape[0][1],
node.inputs[3],
node.inputs[4]
)]
class GpuDnnConvGradI(DnnBase, COp): class GpuDnnConvGradI(DnnBase, COp):
...@@ -537,61 +553,58 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -537,61 +553,58 @@ class GpuDnnConvGradI(DnnBase, COp):
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
__props__ = () __props__ = ('inplace',)
def __init__(self): def __init__(self, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, desc, h, w, alpha, beta = inp kerns, top, output, desc, alpha = inp
img, = grads img, = grads
img = cp_on_negative_strides(img) img = cp_on_negative_strides(img)
d_kerns = GpuDnnConvGradW()(img, top, desc, d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
kerns.shape[2], kerns.shape[3]) d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, desc) return (d_kerns, d_top, output.zeros_like(),
return (d_kerns, d_top, DisconnectedType()(), DisconnectedType()(), DisconnectedType()(), DisconnectedType()())
DisconnectedType()(), DisconnectedType()(),
DisconnectedType()())
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc, h, w, alpha, beta # not connected to desc, alpha
return [[1], [1], [0], [0], [0], [0], [0]] return [[1], [1], [1], [0], [0]]
def make_node(self, kern, topgrad, desc, h, w, alpha=None, beta=None): def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
else:
return []
def make_node(self, kern, topgrad, output, desc, alpha=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
if kern.type.ndim != 4: if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be 4D tensor')
if not isinstance(desc.type, CDataType) \ if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
h = as_scalar(h)
w = as_scalar(w)
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1],
False, False]
return Apply(self, [kern, topgrad, desc, h, w, alpha, beta], return Apply(self, [kern, topgrad, output, desc, alpha],
[CudaNdarrayType(broadcastable)()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [( return [shape[2]]
shape[1][0],
shape[0][1],
node.inputs[3],
node.inputs[4]
)]
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
...@@ -620,32 +633,31 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -620,32 +633,31 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
:param workmem: Specify the amount of working memory allowed. :param workmem: Specify the amount of working memory allowed.
More memory is usually faster. One of 'none', 'small' or More memory is usually faster. One of 'none', 'small' or
'large'. (default is None which takes its value from 'large'. (default is None which takes its value from
config.dnn.conv.workmem) :attr:`config.dnn.conv.workmem`)
:warning: The cuDNN library only works with GPU that have a compute :warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not capability of 3.0 or higer. This means that older GPU will not
work with this Op. work with this Op.
:note: The working memory of the op is influenced by
:attr:`config.dnn.conv.workmem`.
""" """
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None) fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1,1) and if (border_mode == 'valid' and subsample == (1,1) and
direction_hint == 'bprop weights'): direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set # Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for. # up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3)) img = cp_on_negative_strides(img.dimshuffle(1, 0, 2, 3))
if conv_mode == 'conv': if conv_mode == 'conv':
# We need to flip manually. These 'kerns' are not the kernels # We need to flip manually. These 'kerns' are not the kernels
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW. # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1] kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape = theano.tensor.stack(kerns.shape[1], img.shape[1], shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
img.shape[2] - kerns.shape[2] + 1, shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
img.shape[3] - kerns.shape[3] + 1) out = gpu_alloc(_zero.clone(), shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, shape) conv_mode='cross')(img.shape, out.shape)
conv = GpuDnnConvGradW()(img, kerns, desc, shape[2], shape[3]) conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3)) return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
elif (border_mode == 'full' and subsample == (1, 1) and elif (border_mode == 'full' and subsample == (1, 1) and
...@@ -653,17 +665,16 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -653,17 +665,16 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# Special case: We can be faster by using GpuDnnConvGradI to compute # Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution. # the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution. # We just need to set up a suitable 'fake' valid convolution.
img = gpu_contiguous(img) img = cp_on_negative_strides(img)
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
shape = theano.tensor.stack(shape_i(img, 0, fgraph), out = gpu_alloc(_zero.clone(), shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph), shape2, shape3)
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(shape, kerns.shape) conv_mode=conv_mode)(out.shape, kerns.shape)
return GpuDnnConvGradI()(kerns, img, desc, shape2, shape3) return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
# cp_on_negative_strides will return a gpu_contiguous copy # cp_on_negative_strides will return a gpu_contiguous copy
...@@ -678,7 +689,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -678,7 +689,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# algorithm. # algorithm.
if workmem is None or workmem == 'small': if workmem is None or workmem == 'small':
workmem = 'none' workmem = 'none'
return GpuDnnConv(workmem=workmem)(img, kerns, desc) out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, border_mode,
subsample)
out = gpu_alloc(_zero.clone(),
out_shp[0], out_shp[1],
out_shp[2], out_shp[3])
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
class GpuDnnPoolDesc(GpuOp): class GpuDnnPoolDesc(GpuOp):
...@@ -1455,6 +1471,27 @@ if True: ...@@ -1455,6 +1471,27 @@ if True:
rval, node.outputs[0].type.broadcastable) rval, node.outputs[0].type.broadcastable)
return [rval] return [rval]
@register_opt('cudnn')
@local_optimizer([GpuDnnConv], inplace=True)
def local_dnn_conv_inplace(node):
if type(node.op) != GpuDnnConv or node.op.inplace == True:
return
return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*node.inputs)]
@register_opt('cudnn')
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
if type(node.op) != GpuDnnConvGradW or node.op.inplace == True:
return
return [GpuDnnConvGradW(inplace=True)(*node.inputs)]
@register_opt('cudnn')
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
if type(node.op) != GpuDnnConvGradI or node.op.inplace == True:
return
return [GpuDnnConvGradI(inplace=True)(*node.inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax]) @local_optimizer([GpuDownsampleFactorMax])
def local_pool_dnn(node): def local_pool_dnn(node):
......
...@@ -2,9 +2,8 @@ ...@@ -2,9 +2,8 @@
int int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
cudnnConvolutionDescriptor_t desc, CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, float alpha, CudaNdarray **output) {
CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
...@@ -12,23 +11,16 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -12,23 +11,16 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
{ #ifdef CONV_INPLACE
int out_dims[4]; Py_XDECREF(*output);
err = cudnnGetConvolution2dForwardOutputDim( *output = om;
desc, Py_INCREF(*output);
APPLY_SPECIFIC(input), #else
APPLY_SPECIFIC(kerns), if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0)
&out_dims[0], &out_dims[1], &out_dims[2], &out_dims[3]); return 1;
if (err != CUDNN_STATUS_SUCCESS) { if (CudaNdarray_CopyFromCudaNdarray(*output, om))
PyErr_Format(PyExc_RuntimeError, return 1;
"GpuDnnConv: error while computing the output shape: %s", #endif
cudnnGetErrorString(err));
return 1;
}
if (CudaNdarray_prep_output(output, 4, out_dims) != 0) {
return 1;
}
}
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
...@@ -55,6 +47,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -55,6 +47,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
if (workspace == NULL && worksize != 0) if (workspace == NULL && worksize != 0)
return 1; return 1;
const float beta = 1;
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
......
...@@ -2,9 +2,8 @@ ...@@ -2,9 +2,8 @@
int int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
cudnnConvolutionDescriptor_t desc, CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
int h, int w, float alpha, float beta, float alpha, CudaNdarray **input) {
CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
...@@ -12,33 +11,33 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -12,33 +11,33 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
{ #ifdef CONV_INPLACE
int out_dims[4]; Py_XDECREF(*input);
out_dims[0] = CudaNdarray_HOST_DIMS(output)[0]; *input = im;
out_dims[1] = CudaNdarray_HOST_DIMS(kerns)[1]; Py_INCREF(*input);
out_dims[2] = h; #else
out_dims[3] = w; if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
if (CudaNdarray_prep_output(input, 4, out_dims) != 0) { return 1;
return 1; if (CudaNdarray_CopyFromCudaNdarray(*input, im))
} return 1;
} #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
{ const float beta = 1;
err = cudnnConvolutionBackwardData(
_handle, err = cudnnConvolutionBackwardData(
(void *)&alpha, _handle,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns), (void *)&alpha,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output), APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
(void *)&beta, desc,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input)); (void *)&beta,
} APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
return 0; return 0;
......
...@@ -2,9 +2,8 @@ ...@@ -2,9 +2,8 @@
int int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
cudnnConvolutionDescriptor_t desc, CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
int h, int w, float alpha, float beta, float alpha, CudaNdarray **kerns) {
CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
...@@ -12,33 +11,33 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -12,33 +11,33 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
{ #ifdef CONV_INPLACE
int out_dims[4]; Py_XDECREF(*kerns);
out_dims[0] = CudaNdarray_HOST_DIMS(output)[1]; *kerns = km;
out_dims[1] = CudaNdarray_HOST_DIMS(input)[1]; Py_INCREF(*kerns);
out_dims[2] = h; #else
out_dims[3] = w; if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
if (CudaNdarray_prep_output(kerns, 4, out_dims) != 0) { return 1;
return 1; if (CudaNdarray_CopyFromCudaNdarray(*kerns, km))
} return 1;
} #endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
{ const float beta = 1;
err = cudnnConvolutionBackwardFilter(
_handle, err = cudnnConvolutionBackwardFilter(
(void *)&alpha, _handle,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input), (void *)&alpha,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output), APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
desc, APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
(void *)&beta, desc,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns)); (void *)&beta,
} APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
return 0; return 0;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论