Commit 5a656e9b, authored by Frédéric Bastien

Merge pull request #2621 from abergeron/fix_alpha_merge

Fix alpha_merge and cudnn conv gradient
......@@ -645,19 +645,22 @@ if cuda_available:
if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)]
# Should be run before elemwise fusion
@opt.register_opt()
@alpha_merge(SparseBlockOuterSS, alpha_in=5, nd=4)
def local_merge_blocksparse_alpha(node, *inputs):
"""
GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
"""
return [sparse_block_outer_ss(*inputs)]
@opt.register_opt()
@output_merge(SparseBlockOuterSS, alpha_in=5, out_in=0, nd=4)
def local_merge_blocksparse_output(node, *inputs):
return [sparse_block_outer_ss(*inputs)]
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @opt.register_opt()
# @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
#GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
# """
# return [sparse_block_outer_ss(*inputs)]
# @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
......
......@@ -103,11 +103,18 @@ cudnnConvolutionForward_v2(
const cudnnTensorDescriptor_t destDesc,
void *destData) {
assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData,
convDesc, destDesc, destData,
CUDNN_RESULT_ACCUMULATE);
r);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
......@@ -124,11 +131,18 @@ cudnnConvolutionBackwardFilter_v2(
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
CUDNN_RESULT_ACCUMULATE);
r);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
......@@ -146,7 +160,16 @@ cudnnConvolutionBackwardData_v2(
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* This function needs the casting because its params are not
declared as const */
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
......@@ -155,7 +178,7 @@ cudnnConvolutionBackwardData_v2(
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
CUDNN_RESULT_ACCUMULATE);
r);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
......
......@@ -411,7 +411,7 @@ class GpuDnnConv(DnnBase, COp):
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None):
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output)
......@@ -427,12 +427,13 @@ class GpuDnnConv(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, kern, output, desc, alpha],
return Apply(self, [img, kern, output, desc, alpha, beta],
[output.type()])
def grad(self, inp, grads):
img, kerns, output, desc, alpha = inp
img, kerns, output, desc, alpha, beta = inp
top, = grads
top = gpu_contiguous(top)
......@@ -440,12 +441,14 @@ class GpuDnnConv(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return [d_img, d_kerns, top * alpha, DisconnectedType()(), d_alpha]
return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1]]
return [[1], [1], [1], [0], [1], [1]]
@staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample):
......@@ -507,7 +510,7 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = False
def grad(self, inp, grads):
img, top, output, desc, alpha = inp
img, top, output, desc, alpha, beta = inp
kerns, = grads
kerns = gpu_contiguous(kerns)
......@@ -515,12 +518,14 @@ class GpuDnnConvGradW(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_img, d_top, kerns * alpha, DisconnectedType()(), d_alpha)
return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1]]
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
......@@ -528,7 +533,7 @@ class GpuDnnConvGradW(DnnBase, COp):
else:
return []
def make_node(self, img, topgrad, output, desc, alpha=None):
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
......@@ -544,8 +549,9 @@ class GpuDnnConvGradW(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, topgrad, output, desc, alpha],
return Apply(self, [img, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
......@@ -571,7 +577,7 @@ class GpuDnnConvGradI(DnnBase, COp):
self.destroy_map = {0: [2]}
def grad(self, inp, grads):
kerns, top, output, desc, alpha = inp
kerns, top, output, desc, alpha, beta = inp
img, = grads
img = gpu_contiguous(img)
......@@ -579,12 +585,14 @@ class GpuDnnConvGradI(DnnBase, COp):
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_kerns, d_top, img * alpha, DisconnectedType()(), d_alpha)
return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1]]
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
......@@ -592,7 +600,7 @@ class GpuDnnConvGradI(DnnBase, COp):
else:
return []
def make_node(self, kern, topgrad, output, desc, alpha=None):
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
......@@ -608,8 +616,9 @@ class GpuDnnConvGradI(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [kern, topgrad, output, desc, alpha],
return Apply(self, [kern, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
......@@ -1550,47 +1559,41 @@ if True:
70.0, 'fast_run', 'inplace', 'gpu', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, nd=4)
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Rebuild a GpuDnnConv node with a scalar multiplier folded into alpha.

    Used as the @alpha_merge target; `inputs` is the target node's input
    list with the alpha/beta slots already rescaled by the decorator.
    """
    # NOTE(review): version() == -1 appears to denote cuDNN v1, which
    # cannot honour a non-default alpha (see the matching guard in the
    # tests) — so the optimization is disabled there.
    if not dnn_available() or version() == -1:
        return None
    # Preserve the workmem setting of the node being replaced.
    return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, nd=4)
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Rebuild a GpuDnnConvGradW node with a scalar multiplier folded into alpha.

    Used as the @alpha_merge target; `inputs` already carries the rescaled
    alpha/beta values supplied by the decorator.
    """
    # Disabled for cuDNN v1 (version() == -1), which cannot apply alpha.
    if not dnn_available() or version() == -1:
        return None
    return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, nd=4)
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs):
    """Rebuild a GpuDnnConvGradI node with a scalar multiplier folded into alpha.

    Used as the @alpha_merge target; `inputs` already carries the rescaled
    alpha/beta values supplied by the decorator.
    """
    # Disabled for cuDNN v1 (version() == -1), which cannot apply alpha.
    if not dnn_available() or version() == -1:
        return None
    return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, out_in=2, nd=4)
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs):
    """Rebuild a GpuDnnConv node with an added tensor merged into its output slot.

    Used as the @output_merge target; the decorator has already placed the
    merged tensor at index 2 (out_in=2) and set beta accordingly.
    """
    # Disabled for cuDNN v1 (version() == -1), which cannot apply alpha/beta.
    if not dnn_available() or version() == -1:
        return None
    # The merged-in output tensor (input 2) must be contiguous for the
    # cuDNN kernel.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, out_in=2, nd=4)
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs):
    """Rebuild a GpuDnnConvGradW node with an added tensor merged into its output slot.

    Used as the @output_merge target; the decorator has already placed the
    merged tensor at index 2 (out_in=2) and set beta accordingly.
    """
    # Disabled for cuDNN v1 (version() == -1), which cannot apply alpha/beta.
    if not dnn_available() or version() == -1:
        return None
    # The merged-in output tensor (input 2) must be contiguous for the
    # cuDNN kernel.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, out_in=2, nd=4)
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs):
    """Rebuild a GpuDnnConvGradI node with an added tensor merged into its output slot.

    Used as the @output_merge target; the decorator has already placed the
    merged tensor at index 2 (out_in=2) and set beta accordingly.
    """
    # Disabled for cuDNN v1 (version() == -1), which cannot apply alpha/beta.
    if not dnn_available() or version() == -1:
        return None
    # The merged-in output tensor (input 2) must be contiguous for the
    # cuDNN kernel.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradI()(*inputs)]
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **output) {
float alpha, float beta, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
......@@ -18,7 +18,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
#else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0)
return 1;
if (CudaNdarray_CopyFromCudaNdarray(*output, om))
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1;
#endif
......@@ -47,8 +47,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
if (workspace == NULL && worksize != 0)
return 1;
const float beta = 1;
err = cudnnConvolutionForward(
_handle,
(void *)&alpha,
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **input) {
float alpha, float beta, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
......@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
#else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
return 1;
if (CudaNdarray_CopyFromCudaNdarray(*input, im))
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
const float beta = 1;
err = cudnnConvolutionBackwardData(
_handle,
(void *)&alpha,
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **kerns) {
float alpha, float beta, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
......@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
#else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
return 1;
if (CudaNdarray_CopyFromCudaNdarray(*kerns, km))
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
const float beta = 1;
err = cudnnConvolutionBackwardFilter(
_handle,
(void *)&alpha,
......
......@@ -5,11 +5,14 @@ import numpy
import theano
from theano import scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import DimShuffle
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
if v.owner is not None:
n = v.owner
......@@ -28,6 +31,7 @@ def grab_cpu_scalar(v, nd):
v.broadcastable == (True,) * nd):
return v.dimshuffle(())
def find_node(v, cls):
# This digs through possibly redundant transfers to for the node
# that has the op class specified.
......@@ -42,7 +46,17 @@ def find_node(v, cls):
return None
def alpha_merge(cls, alpha_in, nd):
def is_equal(var, val):
    """Return True if `var` is a constant that always equals `val`.

    Returns False otherwise, including when `var` is not a scalar
    constant at all.

    Parameters
    ----------
    var : theano Variable
        The variable to test.
    val : python scalar
        The value to compare the constant against.
    """
    try:
        return get_scalar_constant_value(var) == val
    except NotScalarConstantError:
        # Bug fix: the previous code caught the non-existent name
        # `NotScalarConstantValue`, which itself raised NameError
        # whenever `var` was not a constant.  The exception imported
        # at the top of this module is `NotScalarConstantError`.
        return False
def alpha_merge(cls, alpha_in, beta_in, nd):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......@@ -60,19 +74,19 @@ def alpha_merge(cls, alpha_in, nd):
return None
inputs = list(targ.inputs)
inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
return maker(targ, *inputs)
return opt
return wrapper
def output_merge(cls, alpha_in, out_in, nd):
def output_merge(cls, alpha_in, beta_in, out_in, nd):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
def opt(node):
if (isinstance(node.op, GpuElemwise) and
(node.op.scalar_op == scal.sub or
node.op.scalar_op == scal.add) and
node.op.scalar_op == scal.add and
node.nin == 2):
targ = find_node(node.inputs[0], cls)
W = node.inputs[1]
......@@ -81,15 +95,16 @@ def output_merge(cls, alpha_in, out_in, nd):
W = node.inputs[0]
if targ is None:
return None
if node.op.scalar_op == scal.sub:
alpha = -targ.inputs[alpha_in]
W = W - targ.inputs[out_in]
else:
alpha = targ.inputs[alpha_in]
W = W + targ.inputs[out_in]
if not is_equal(targ.inputs[beta_in], 0.0):
# other cases are too complex for now
return None
if W.broadcastable != targ.inputs[out_in].broadcastable:
# Would need to explicitly tile the output to fill
# the full shape here. Disable for now.
return None
inputs = list(targ.inputs)
inputs[out_in] = W
inputs[alpha_in] = alpha
inputs[beta_in] = _one.clone()
return maker(targ, *inputs)
return opt
return wrapper
......@@ -169,7 +169,10 @@ def test_blocksparse_grad_shape():
assert W_g.shape == W_val.shape
def test_blocksparse_grad_merge():
# This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge():
b = tensor.fmatrix()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
......
......@@ -466,7 +466,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_dnn_conv_merge():
if not cuda.dnn.dnn_available() or cuda.dnn.version() == -1:
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
......@@ -475,13 +475,13 @@ def test_dnn_conv_merge():
b = 1
c = 4
f = 3
ih = 2
ih = 5
iw = 8
kh = 2
kw = 2
kw = 6
img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
out_val = numpy.random.random((b, f, ih-kw+1, iw-kw+1)).astype('float32')
out_val = numpy.random.random((b, f, ih-kh+1, iw-kw+1)).astype('float32')
conv = dnn.dnn_conv(img, kern)
gw = theano.grad(conv.sum(), kern)
......@@ -489,9 +489,15 @@ def test_dnn_conv_merge():
lr = numpy.asarray(0.05, dtype='float32')
fr = out - lr * conv
wr = kern - lr * gw
ir = img - lr * gi
if cuda.dnn.version() == -1:
# Can't merge alpha with cudnn v1
fr = conv + out
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
......@@ -545,17 +551,19 @@ def test_dnn_conv_grad():
def dconv(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConv()(img, kern, out, desc)
return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)
def dconvi(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradI()(kern, out, img, desc)
return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
beta=0.0)
def dconvw(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradW()(img, out, kern, desc)
return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
beta=-1.0)
utt.verify_grad(dconv, [img_val, kern_val, out_val])
utt.verify_grad(dconvi, [img_val, kern_val, out_val])
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment