提交 5a656e9b authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2621 from abergeron/fix_alpha_merge

Fix alpha_merge and cudnn conv gradient
...@@ -645,19 +645,22 @@ if cuda_available: ...@@ -645,19 +645,22 @@ if cuda_available:
if node.op == sparse_block_outer_ss: if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)] return [sparse_block_outer_ss_inplace(*node.inputs)]
# Should be run before elemwise fusion # XXX: these optimisations were badly broken and now require a working
@opt.register_opt() # beta param (could only be a 0/1 thing for outer_merge, but
@alpha_merge(SparseBlockOuterSS, alpha_in=5, nd=4) # alpha_merge needs the full range).
def local_merge_blocksparse_alpha(node, *inputs):
""" # @opt.register_opt()
GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr) # @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
""" # def local_merge_blocksparse_alpha(node, *inputs):
return [sparse_block_outer_ss(*inputs)] # """
#GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
@opt.register_opt() # """
@output_merge(SparseBlockOuterSS, alpha_in=5, out_in=0, nd=4) # return [sparse_block_outer_ss(*inputs)]
def local_merge_blocksparse_output(node, *inputs):
return [sparse_block_outer_ss(*inputs)] # @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx): def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
......
...@@ -103,11 +103,18 @@ cudnnConvolutionForward_v2( ...@@ -103,11 +103,18 @@ cudnnConvolutionForward_v2(
const cudnnTensorDescriptor_t destDesc, const cudnnTensorDescriptor_t destDesc,
void *destData) { void *destData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0); cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData, return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData, filterDesc, filterData,
convDesc, destDesc, destData, convDesc, destDesc, destData,
CUDNN_RESULT_ACCUMULATE); r);
} }
#define cudnnConvolutionForward cudnnConvolutionForward_v2 #define cudnnConvolutionForward cudnnConvolutionForward_v2
...@@ -124,11 +131,18 @@ cudnnConvolutionBackwardFilter_v2( ...@@ -124,11 +131,18 @@ cudnnConvolutionBackwardFilter_v2(
const cudnnFilterDescriptor_t gradDesc, const cudnnFilterDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0); cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData, return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData, diffDesc, diffData,
convDesc, gradDesc, gradData, convDesc, gradDesc, gradData,
CUDNN_RESULT_ACCUMULATE); r);
} }
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2 #define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
...@@ -146,7 +160,16 @@ cudnnConvolutionBackwardData_v2( ...@@ -146,7 +160,16 @@ cudnnConvolutionBackwardData_v2(
const cudnnTensorDescriptor_t gradDesc, const cudnnTensorDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0); cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* This function needs the casting because its params are not
declared as const */
return cudnnConvolutionBackwardData(handle, return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc, (cudnnFilterDescriptor_t)filterDesc,
filterData, filterData,
...@@ -155,7 +178,7 @@ cudnnConvolutionBackwardData_v2( ...@@ -155,7 +178,7 @@ cudnnConvolutionBackwardData_v2(
(cudnnConvolutionDescriptor_t)convDesc, (cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc, (cudnnTensorDescriptor_t)gradDesc,
gradData, gradData,
CUDNN_RESULT_ACCUMULATE); r);
} }
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2 #define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
......
...@@ -411,7 +411,7 @@ class GpuDnnConv(DnnBase, COp): ...@@ -411,7 +411,7 @@ class GpuDnnConv(DnnBase, COp):
alg_def = ('CONV_ALGO', alg) alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def return [alg_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -427,12 +427,13 @@ class GpuDnnConv(DnnBase, COp): ...@@ -427,12 +427,13 @@ class GpuDnnConv(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, kern, output, desc, alpha], return Apply(self, [img, kern, output, desc, alpha, beta],
[output.type()]) [output.type()])
def grad(self, inp, grads): def grad(self, inp, grads):
img, kerns, output, desc, alpha = inp img, kerns, output, desc, alpha, beta = inp
top, = grads top, = grads
top = gpu_contiguous(top) top = gpu_contiguous(top)
...@@ -440,12 +441,14 @@ class GpuDnnConv(DnnBase, COp): ...@@ -440,12 +441,14 @@ class GpuDnnConv(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return [d_img, d_kerns, top * alpha, DisconnectedType()(), d_alpha] return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1]] return [[1], [1], [1], [0], [1], [1]]
@staticmethod @staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample): def get_out_shape(ishape, kshape, border_mode, subsample):
...@@ -507,7 +510,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -507,7 +510,7 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = False self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha = inp img, top, output, desc, alpha, beta = inp
kerns, = grads kerns, = grads
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
...@@ -515,12 +518,14 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -515,12 +518,14 @@ class GpuDnnConvGradW(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_img, d_top, kerns * alpha, DisconnectedType()(), d_alpha) return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -528,7 +533,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -528,7 +533,7 @@ class GpuDnnConvGradW(DnnBase, COp):
else: else:
return [] return []
def make_node(self, img, topgrad, output, desc, alpha=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -544,8 +549,9 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -544,8 +549,9 @@ class GpuDnnConvGradW(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, topgrad, output, desc, alpha], return Apply(self, [img, topgrad, output, desc, alpha, beta],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
...@@ -571,7 +577,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -571,7 +577,7 @@ class GpuDnnConvGradI(DnnBase, COp):
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha = inp kerns, top, output, desc, alpha, beta = inp
img, = grads img, = grads
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -579,12 +585,14 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -579,12 +585,14 @@ class GpuDnnConvGradI(DnnBase, COp):
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_kerns, d_top, img * alpha, DisconnectedType()(), d_alpha) return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -592,7 +600,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -592,7 +600,7 @@ class GpuDnnConvGradI(DnnBase, COp):
else: else:
return [] return []
def make_node(self, kern, topgrad, output, desc, alpha=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -608,8 +616,9 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -608,8 +616,9 @@ class GpuDnnConvGradI(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [kern, topgrad, output, desc, alpha], return Apply(self, [kern, topgrad, output, desc, alpha, beta],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
...@@ -1550,47 +1559,41 @@ if True: ...@@ -1550,47 +1559,41 @@ if True:
70.0, 'fast_run', 'inplace', 'gpu', 'cudnn') 70.0, 'fast_run', 'inplace', 'gpu', 'cudnn')
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, nd=4) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, nd=4) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, nd=4) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI()(*inputs)]
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc, CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **output) { float alpha, float beta, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
...@@ -18,7 +18,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -18,7 +18,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
#else #else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0) if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0)
return 1; return 1;
if (CudaNdarray_CopyFromCudaNdarray(*output, om)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1; return 1;
#endif #endif
...@@ -47,8 +47,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -47,8 +47,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
if (workspace == NULL && worksize != 0) if (workspace == NULL && worksize != 0)
return 1; return 1;
const float beta = 1;
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc, CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **input) { float alpha, float beta, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
#else #else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0) if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
return 1; return 1;
if (CudaNdarray_CopyFromCudaNdarray(*input, im)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
const float beta = 1;
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc, CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **kerns) { float alpha, float beta, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
#else #else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0) if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
return 1; return 1;
if (CudaNdarray_CopyFromCudaNdarray(*kerns, km)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1; return 1;
#endif #endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
const float beta = 1;
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
......
...@@ -5,11 +5,14 @@ import numpy ...@@ -5,11 +5,14 @@ import numpy
import theano import theano
from theano import scalar as scal, Constant from theano import scalar as scal, Constant
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.tensor import DimShuffle from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise) GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd): def grab_cpu_scalar(v, nd):
if v.owner is not None: if v.owner is not None:
n = v.owner n = v.owner
...@@ -28,6 +31,7 @@ def grab_cpu_scalar(v, nd): ...@@ -28,6 +31,7 @@ def grab_cpu_scalar(v, nd):
v.broadcastable == (True,) * nd): v.broadcastable == (True,) * nd):
return v.dimshuffle(()) return v.dimshuffle(())
def find_node(v, cls): def find_node(v, cls):
# This digs through possibly redundant transfers to for the node # This digs through possibly redundant transfers to for the node
# that has the op class specified. # that has the op class specified.
...@@ -42,7 +46,17 @@ def find_node(v, cls): ...@@ -42,7 +46,17 @@ def find_node(v, cls):
return None return None
def alpha_merge(cls, alpha_in, nd): def is_equal(var, val):
# Returns True if var is always equal to val (python value), False
# otherwise (including if var is not constant)
try:
v = get_scalar_constant_value(var)
return v == val
except NotScalarConstantValue:
return False
def alpha_merge(cls, alpha_in, beta_in, nd):
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
...@@ -60,19 +74,19 @@ def alpha_merge(cls, alpha_in, nd): ...@@ -60,19 +74,19 @@ def alpha_merge(cls, alpha_in, nd):
return None return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
inputs[alpha_in] = lr * targ.inputs[alpha_in] inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
return maker(targ, *inputs) return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
def output_merge(cls, alpha_in, out_in, nd): def output_merge(cls, alpha_in, beta_in, out_in, nd):
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
def opt(node): def opt(node):
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
(node.op.scalar_op == scal.sub or node.op.scalar_op == scal.add and
node.op.scalar_op == scal.add) and
node.nin == 2): node.nin == 2):
targ = find_node(node.inputs[0], cls) targ = find_node(node.inputs[0], cls)
W = node.inputs[1] W = node.inputs[1]
...@@ -81,15 +95,16 @@ def output_merge(cls, alpha_in, out_in, nd): ...@@ -81,15 +95,16 @@ def output_merge(cls, alpha_in, out_in, nd):
W = node.inputs[0] W = node.inputs[0]
if targ is None: if targ is None:
return None return None
if node.op.scalar_op == scal.sub: if not is_equal(targ.inputs[beta_in], 0.0):
alpha = -targ.inputs[alpha_in] # other cases are too complex for now
W = W - targ.inputs[out_in] return None
else: if W.broadcastable != targ.inputs[out_in].broadcastable:
alpha = targ.inputs[alpha_in] # Would need to explicitly tile the output to fill
W = W + targ.inputs[out_in] # the full shape here. Disable for now.
return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
inputs[out_in] = W inputs[out_in] = W
inputs[alpha_in] = alpha inputs[beta_in] = _one.clone()
return maker(targ, *inputs) return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
...@@ -169,7 +169,10 @@ def test_blocksparse_grad_shape(): ...@@ -169,7 +169,10 @@ def test_blocksparse_grad_shape():
assert W_g.shape == W_val.shape assert W_g.shape == W_val.shape
def test_blocksparse_grad_merge(): # This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge():
b = tensor.fmatrix() b = tensor.fmatrix()
h = tensor.ftensor3() h = tensor.ftensor3()
iIdx = tensor.lmatrix() iIdx = tensor.lmatrix()
......
...@@ -466,7 +466,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -466,7 +466,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_dnn_conv_merge(): def test_dnn_conv_merge():
if not cuda.dnn.dnn_available() or cuda.dnn.version() == -1: if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
img = T.ftensor4() img = T.ftensor4()
kern = T.ftensor4() kern = T.ftensor4()
...@@ -475,13 +475,13 @@ def test_dnn_conv_merge(): ...@@ -475,13 +475,13 @@ def test_dnn_conv_merge():
b = 1 b = 1
c = 4 c = 4
f = 3 f = 3
ih = 2 ih = 5
iw = 8 iw = 8
kh = 2 kh = 2
kw = 2 kw = 6
img_val = numpy.random.random((b, c, ih, iw)).astype('float32') img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32') kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
out_val = numpy.random.random((b, f, ih-kw+1, iw-kw+1)).astype('float32') out_val = numpy.random.random((b, f, ih-kh+1, iw-kw+1)).astype('float32')
conv = dnn.dnn_conv(img, kern) conv = dnn.dnn_conv(img, kern)
gw = theano.grad(conv.sum(), kern) gw = theano.grad(conv.sum(), kern)
...@@ -489,9 +489,15 @@ def test_dnn_conv_merge(): ...@@ -489,9 +489,15 @@ def test_dnn_conv_merge():
lr = numpy.asarray(0.05, dtype='float32') lr = numpy.asarray(0.05, dtype='float32')
fr = out - lr * conv if cuda.dnn.version() == -1:
wr = kern - lr * gw # Can't merge alpha with cudnn v1
ir = img - lr * gi fr = conv + out
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu) f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op, assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
...@@ -545,17 +551,19 @@ def test_dnn_conv_grad(): ...@@ -545,17 +551,19 @@ def test_dnn_conv_grad():
def dconv(img, kern, out): def dconv(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConv()(img, kern, out, desc) return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)
def dconvi(img, kern, out): def dconvi(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradI()(kern, out, img, desc) return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
beta=0.0)
def dconvw(img, kern, out): def dconvw(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradW()(img, out, kern, desc) return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
beta=-1.0)
utt.verify_grad(dconv, [img_val, kern_val, out_val]) utt.verify_grad(dconv, [img_val, kern_val, out_val])
utt.verify_grad(dconvi, [img_val, kern_val, out_val]) utt.verify_grad(dconvi, [img_val, kern_val, out_val])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论