提交 26477f9f authored 作者: Kyunghyun Cho's avatar Kyunghyun Cho

conflict resolved

......@@ -524,7 +524,7 @@ import theano and print the config variable, as in:
slower otherwise.
This can be any compiler binary (full path or not) but things may
break if the interface if not g++-compatible to some degree.
break if the interface is not g++-compatible to some degree.
.. attribute:: config.nvcc.fastmath
......
......@@ -67,9 +67,9 @@ def add_tag_trace(thing, user_line=1):
tr = simple_extract_stack(limit=limit)[:-1]
# Different python version use different sementic for
# limit. python 2.7 include the call to extrack_stack. The -1 get
# rid of it. We also want to get rid of the add_tag_trace call.
if tr and "add_tag_trace" in tr[-1][-1]:
tr = tr[:-1]
# rid of it.
# Get rid of Theano internal
while tr:
file_path = tr[-1][0]
rm = False
......@@ -87,8 +87,10 @@ def add_tag_trace(thing, user_line=1):
break
if not rm:
break
# Keep only the most recent stack level.
# The order is from the oldest to the newest
if len(tr) > user_line:
tr = tr[:user_line]
tr = tr[-user_line:]
thing.tag.trace = tr
return thing
......
......@@ -217,6 +217,8 @@ class Print(Op):
"""
view_map = {0: [0]}
__props__ = ('message', 'attrs', 'global_fn')
def __init__(self, message="", attrs=("__str__",), global_fn=_print_fn):
self.message = message
self.attrs = tuple(attrs) # attrs should be a hashable iterable
......@@ -238,13 +240,6 @@ class Print(Op):
def R_op(self, inputs, eval_points):
return [x for x in eval_points]
def __eq__(self, other):
return (type(self) == type(other) and self.message == other.message
and self.attrs == other.attrs)
def __hash__(self):
return hash(self.message) ^ hash(self.attrs)
def __setstate__(self, dct):
dct.setdefault('global_fn', _print_fn)
self.__dict__.update(dct)
......
......@@ -645,19 +645,22 @@ if cuda_available:
if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)]
# Should be run before elemwise fusion
@opt.register_opt()
@alpha_merge(SparseBlockOuterSS, alpha_in=5, nd=4)
def local_merge_blocksparse_alpha(node, *inputs):
"""
GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
"""
return [sparse_block_outer_ss(*inputs)]
@opt.register_opt()
@output_merge(SparseBlockOuterSS, alpha_in=5, out_in=0, nd=4)
def local_merge_blocksparse_output(node, *inputs):
return [sparse_block_outer_ss(*inputs)]
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @opt.register_opt()
# @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
#GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
# """
# return [sparse_block_outer_ss(*inputs)]
# @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
......
......@@ -103,11 +103,18 @@ cudnnConvolutionForward_v2(
const cudnnTensorDescriptor_t destDesc,
void *destData) {
assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData,
convDesc, destDesc, destData,
CUDNN_RESULT_ACCUMULATE);
r);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
......@@ -124,11 +131,18 @@ cudnnConvolutionBackwardFilter_v2(
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
CUDNN_RESULT_ACCUMULATE);
r);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
......@@ -146,7 +160,16 @@ cudnnConvolutionBackwardData_v2(
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* This function needs the casting because its params are not
declared as const */
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
......@@ -155,7 +178,7 @@ cudnnConvolutionBackwardData_v2(
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
CUDNN_RESULT_ACCUMULATE);
r);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
......
......@@ -411,7 +411,7 @@ class GpuDnnConv(DnnBase, COp):
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None):
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output)
......@@ -427,12 +427,13 @@ class GpuDnnConv(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, kern, output, desc, alpha],
return Apply(self, [img, kern, output, desc, alpha, beta],
[output.type()])
def grad(self, inp, grads):
img, kerns, output, desc, alpha = inp
img, kerns, output, desc, alpha, beta = inp
top, = grads
top = gpu_contiguous(top)
......@@ -440,12 +441,14 @@ class GpuDnnConv(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return [d_img, d_kerns, top * alpha, DisconnectedType()(), d_alpha]
return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1]]
return [[1], [1], [1], [0], [1], [1]]
@staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample):
......@@ -507,7 +510,7 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = False
def grad(self, inp, grads):
img, top, output, desc, alpha = inp
img, top, output, desc, alpha, beta = inp
kerns, = grads
kerns = gpu_contiguous(kerns)
......@@ -515,12 +518,14 @@ class GpuDnnConvGradW(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_img, d_top, kerns * alpha, DisconnectedType()(), d_alpha)
return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1]]
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
......@@ -528,7 +533,7 @@ class GpuDnnConvGradW(DnnBase, COp):
else:
return []
def make_node(self, img, topgrad, output, desc, alpha=None):
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
......@@ -544,8 +549,9 @@ class GpuDnnConvGradW(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, topgrad, output, desc, alpha],
return Apply(self, [img, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
......@@ -571,7 +577,7 @@ class GpuDnnConvGradI(DnnBase, COp):
self.destroy_map = {0: [2]}
def grad(self, inp, grads):
kerns, top, output, desc, alpha = inp
kerns, top, output, desc, alpha, beta = inp
img, = grads
img = gpu_contiguous(img)
......@@ -579,12 +585,14 @@ class GpuDnnConvGradI(DnnBase, COp):
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_kerns, d_top, img * alpha, DisconnectedType()(), d_alpha)
return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1]]
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
......@@ -592,7 +600,7 @@ class GpuDnnConvGradI(DnnBase, COp):
else:
return []
def make_node(self, kern, topgrad, output, desc, alpha=None):
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
......@@ -608,8 +616,9 @@ class GpuDnnConvGradI(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [kern, topgrad, output, desc, alpha],
return Apply(self, [kern, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
......@@ -1550,47 +1559,41 @@ if True:
70.0, 'fast_run', 'inplace', 'gpu', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, nd=4)
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, nd=4)
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, nd=4)
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, out_in=2, nd=4)
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, out_in=2, nd=4)
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, out_in=2, nd=4)
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)]
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **output) {
float alpha, float beta, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
......@@ -18,7 +18,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
#else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0)
return 1;
if (CudaNdarray_CopyFromCudaNdarray(*output, om))
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1;
#endif
......@@ -47,8 +47,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
if (workspace == NULL && worksize != 0)
return 1;
const float beta = 1;
err = cudnnConvolutionForward(
_handle,
(void *)&alpha,
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **input) {
float alpha, float beta, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
......@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
#else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
return 1;
if (CudaNdarray_CopyFromCudaNdarray(*input, im))
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
const float beta = 1;
err = cudnnConvolutionBackwardData(
_handle,
(void *)&alpha,
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **kerns) {
float alpha, float beta, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
......@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
#else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
return 1;
if (CudaNdarray_CopyFromCudaNdarray(*kerns, km))
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
const float beta = 1;
err = cudnnConvolutionBackwardFilter(
_handle,
(void *)&alpha,
......
......@@ -129,13 +129,13 @@ class InputToGpuOptimizer(Optimizer):
def apply(self, fgraph):
for input in fgraph.inputs:
if isinstance(input.type, CudaNdarrayType):
return
continue
# This happen frequently as we do 2 pass of the gpu optimizations
if (len(input.clients) == 1 and
(input.clients[0][0] == 'output' or
input.clients[0][0].op == gpu_from_host)):
return
continue
try:
new_input = host_from_gpu(gpu_from_host(input))
......
......@@ -5,11 +5,14 @@ import numpy
import theano
from theano import scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import DimShuffle
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
if v.owner is not None:
n = v.owner
......@@ -28,6 +31,7 @@ def grab_cpu_scalar(v, nd):
v.broadcastable == (True,) * nd):
return v.dimshuffle(())
def find_node(v, cls):
# This digs through possibly redundant transfers to for the node
# that has the op class specified.
......@@ -42,7 +46,17 @@ def find_node(v, cls):
return None
def alpha_merge(cls, alpha_in, nd):
def is_equal(var, val):
# Returns True if var is always equal to val (python value), False
# otherwise (including if var is not constant)
try:
v = get_scalar_constant_value(var)
return v == val
except NotScalarConstantValue:
return False
def alpha_merge(cls, alpha_in, beta_in, nd):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......@@ -60,19 +74,19 @@ def alpha_merge(cls, alpha_in, nd):
return None
inputs = list(targ.inputs)
inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
return maker(targ, *inputs)
return opt
return wrapper
def output_merge(cls, alpha_in, out_in, nd):
def output_merge(cls, alpha_in, beta_in, out_in, nd):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
def opt(node):
if (isinstance(node.op, GpuElemwise) and
(node.op.scalar_op == scal.sub or
node.op.scalar_op == scal.add) and
node.op.scalar_op == scal.add and
node.nin == 2):
targ = find_node(node.inputs[0], cls)
W = node.inputs[1]
......@@ -81,15 +95,16 @@ def output_merge(cls, alpha_in, out_in, nd):
W = node.inputs[0]
if targ is None:
return None
if node.op.scalar_op == scal.sub:
alpha = -targ.inputs[alpha_in]
W = W - targ.inputs[out_in]
else:
alpha = targ.inputs[alpha_in]
W = W + targ.inputs[out_in]
if not is_equal(targ.inputs[beta_in], 0.0):
# other cases are too complex for now
return None
if W.broadcastable != targ.inputs[out_in].broadcastable:
# Would need to explicitly tile the output to fill
# the full shape here. Disable for now.
return None
inputs = list(targ.inputs)
inputs[out_in] = W
inputs[alpha_in] = alpha
inputs[beta_in] = _one.clone()
return maker(targ, *inputs)
return opt
return wrapper
......@@ -169,7 +169,10 @@ def test_blocksparse_grad_shape():
assert W_g.shape == W_val.shape
def test_blocksparse_grad_merge():
# This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge():
b = tensor.fmatrix()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
......
......@@ -466,7 +466,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_dnn_conv_merge():
if not cuda.dnn.dnn_available() or cuda.dnn.version() == -1:
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
......@@ -475,13 +475,13 @@ def test_dnn_conv_merge():
b = 1
c = 4
f = 3
ih = 2
ih = 5
iw = 8
kh = 2
kw = 2
kw = 6
img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
out_val = numpy.random.random((b, f, ih-kw+1, iw-kw+1)).astype('float32')
out_val = numpy.random.random((b, f, ih-kh+1, iw-kw+1)).astype('float32')
conv = dnn.dnn_conv(img, kern)
gw = theano.grad(conv.sum(), kern)
......@@ -489,9 +489,15 @@ def test_dnn_conv_merge():
lr = numpy.asarray(0.05, dtype='float32')
fr = out - lr * conv
wr = kern - lr * gw
ir = img - lr * gi
if cuda.dnn.version() == -1:
# Can't merge alpha with cudnn v1
fr = conv + out
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
......@@ -545,17 +551,19 @@ def test_dnn_conv_grad():
def dconv(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConv()(img, kern, out, desc)
return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)
def dconvi(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradI()(kern, out, img, desc)
return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
beta=0.0)
def dconvw(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradW()(img, out, kern, desc)
return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
beta=-1.0)
utt.verify_grad(dconv, [img_val, kern_val, out_val])
utt.verify_grad(dconvi, [img_val, kern_val, out_val])
......
......@@ -1931,9 +1931,10 @@ class Scan(PureOp):
type_outs.append(vl.type.why_null)
# Replace the inner output with a zero tensor of
# the right shape
inner_out_sitsot[_p] = tensor.zeros(
inner_out_nitsot[_p] = tensor.zeros(
diff_inputs[_p].shape,
dtype=theano.config.floatX)
if through_shared:
type_outs.append('through_shared')
elif disconnected_dC_dinps_t[_p]:
......
......@@ -46,11 +46,11 @@ else:
mode_with_gpu = mode_with_opt.including('gpu', 'scan')
type_eps = {'float64': 1e-7,
'float32': 3e-3}
class multiple_outputs_numeric_grad:
"""WRITEME"""
type_eps = {'float64': 1e-7,
'float32': 3e-3}
def __init__(self, f, pt, ndarray_mask=None, eps=None):
"""Return the gradient of f at pt.
......@@ -78,13 +78,12 @@ class multiple_outputs_numeric_grad:
if not ndarray_mask:
ndarray_mask = [True for x in pt]
dtype_eps = multiple_outputs_numeric_grad.type_eps['float64']
dtype_eps = type_eps['float64']
for i, p in enumerate(pt):
if ndarray_mask[i]:
pt[i] = numpy.array(p)
_eps = multiple_outputs_numeric_grad.type_eps[str(
pt[i].dtype)]
_eps = type_eps[str(pt[i].dtype)]
if _eps > dtype_eps:
dtype_eps = _eps
......@@ -836,6 +835,36 @@ class T_Scan(unittest.TestCase):
n_steps=2)
tensor.grad(a[-1], a0)
def test_grad_two_scans(self):
# data input & output
x = tensor.tensor3('x')
t = tensor.imatrix('t')
# forward pass
W = theano.shared(
numpy.random.randn(2, 2).astype('float32'),
name="W", borrow=True)
def forward_scanner(x_t):
a2_t = tensor.dot(x_t, W)
y_t = tensor.nnet.softmax(a2_t)
return y_t
y, _ = theano.scan(fn=forward_scanner, sequences=x,
outputs_info=[None])
# loss function
def error_scanner(y_t, t_t):
return tensor.mean(tensor.nnet.categorical_crossentropy(y_t, t_t))
L, _ = theano.scan(fn=error_scanner, sequences=[y, t],
outputs_info=[None])
L = tensor.mean(L)
# backward pass
gW = tensor.grad(L, [W])
# simple rnn, one input, one state, weights for each; input/state are
# vectors, weights are scalars; using shared variables and past
# taps (sequences and outputs)
......@@ -3867,6 +3896,42 @@ class T_Scan(unittest.TestCase):
f = theano.function([W, n_steps], H)
f(numpy.ones((8,), dtype='float32'), 1)
def test_strict_mode(self):
n = 10
w = numpy.array([[-1,2],[3,-4]]).astype(theano.config.floatX)
w_ = theano.shared(w)
x0 = numpy.array([1,2]).astype(theano.config.floatX)
x0_ = tensor.vector(name='x0', dtype=theano.config.floatX)
def _scan_loose(x):
return tensor.dot(x, w_)
def _scan_strict(x, w_ns):
return tensor.dot(x, w_ns)
ret_loose = theano.scan(_scan_loose,
sequences=[],
outputs_info=[x0_],
n_steps=n,
strict=False)
f_loose = theano.function([x0_], ret_loose[0][-1])
ret_strict = theano.scan(_scan_strict,
sequences=[],
outputs_info=[x0_],
non_sequences=[w_],
n_steps=n,
strict=True)
f_strict = theano.function([x0_], ret_strict[0][-1])
result_loose = f_loose(x0)
result_strict = f_strict(x0)
diff = (abs(result_loose - result_strict)).mean()
assert diff <= type_eps[theano.config.floatX]
def test_speed():
#
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论