提交 26477f9f authored 作者: Kyunghyun Cho's avatar Kyunghyun Cho

conflict resolved

...@@ -524,7 +524,7 @@ import theano and print the config variable, as in: ...@@ -524,7 +524,7 @@ import theano and print the config variable, as in:
slower otherwise. slower otherwise.
This can be any compiler binary (full path or not) but things may This can be any compiler binary (full path or not) but things may
break if the interface if not g++-compatible to some degree. break if the interface is not g++-compatible to some degree.
.. attribute:: config.nvcc.fastmath .. attribute:: config.nvcc.fastmath
......
...@@ -67,9 +67,9 @@ def add_tag_trace(thing, user_line=1): ...@@ -67,9 +67,9 @@ def add_tag_trace(thing, user_line=1):
tr = simple_extract_stack(limit=limit)[:-1] tr = simple_extract_stack(limit=limit)[:-1]
# Different Python versions use different semantics for # Different Python versions use different semantics for
# limit. Python 2.7 includes the call to extract_stack. The -1 gets # limit. Python 2.7 includes the call to extract_stack. The -1 gets
# rid of it. We also want to get rid of the add_tag_trace call. # rid of it.
if tr and "add_tag_trace" in tr[-1][-1]:
tr = tr[:-1] # Get rid of Theano internal
while tr: while tr:
file_path = tr[-1][0] file_path = tr[-1][0]
rm = False rm = False
...@@ -87,8 +87,10 @@ def add_tag_trace(thing, user_line=1): ...@@ -87,8 +87,10 @@ def add_tag_trace(thing, user_line=1):
break break
if not rm: if not rm:
break break
# Keep only the most recent stack level.
# The order is from the oldest to the newest
if len(tr) > user_line: if len(tr) > user_line:
tr = tr[:user_line] tr = tr[-user_line:]
thing.tag.trace = tr thing.tag.trace = tr
return thing return thing
......
...@@ -217,6 +217,8 @@ class Print(Op): ...@@ -217,6 +217,8 @@ class Print(Op):
""" """
view_map = {0: [0]} view_map = {0: [0]}
__props__ = ('message', 'attrs', 'global_fn')
def __init__(self, message="", attrs=("__str__",), global_fn=_print_fn): def __init__(self, message="", attrs=("__str__",), global_fn=_print_fn):
self.message = message self.message = message
self.attrs = tuple(attrs) # attrs should be a hashable iterable self.attrs = tuple(attrs) # attrs should be a hashable iterable
...@@ -238,13 +240,6 @@ class Print(Op): ...@@ -238,13 +240,6 @@ class Print(Op):
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
return [x for x in eval_points] return [x for x in eval_points]
def __eq__(self, other):
return (type(self) == type(other) and self.message == other.message
and self.attrs == other.attrs)
def __hash__(self):
return hash(self.message) ^ hash(self.attrs)
def __setstate__(self, dct): def __setstate__(self, dct):
dct.setdefault('global_fn', _print_fn) dct.setdefault('global_fn', _print_fn)
self.__dict__.update(dct) self.__dict__.update(dct)
......
...@@ -645,19 +645,22 @@ if cuda_available: ...@@ -645,19 +645,22 @@ if cuda_available:
if node.op == sparse_block_outer_ss: if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)] return [sparse_block_outer_ss_inplace(*node.inputs)]
# Should be run before elemwise fusion # XXX: these optimisations were badly broken and now require a working
@opt.register_opt() # beta param (could only be a 0/1 thing for outer_merge, but
@alpha_merge(SparseBlockOuterSS, alpha_in=5, nd=4) # alpha_merge needs the full range).
def local_merge_blocksparse_alpha(node, *inputs):
""" # @opt.register_opt()
GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr) # @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
""" # def local_merge_blocksparse_alpha(node, *inputs):
return [sparse_block_outer_ss(*inputs)] # """
#GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
@opt.register_opt() # """
@output_merge(SparseBlockOuterSS, alpha_in=5, out_in=0, nd=4) # return [sparse_block_outer_ss(*inputs)]
def local_merge_blocksparse_output(node, *inputs):
return [sparse_block_outer_ss(*inputs)] # @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx): def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
......
...@@ -103,11 +103,18 @@ cudnnConvolutionForward_v2( ...@@ -103,11 +103,18 @@ cudnnConvolutionForward_v2(
const cudnnTensorDescriptor_t destDesc, const cudnnTensorDescriptor_t destDesc,
void *destData) { void *destData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0); cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData, return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData, filterDesc, filterData,
convDesc, destDesc, destData, convDesc, destDesc, destData,
CUDNN_RESULT_ACCUMULATE); r);
} }
#define cudnnConvolutionForward cudnnConvolutionForward_v2 #define cudnnConvolutionForward cudnnConvolutionForward_v2
...@@ -124,11 +131,18 @@ cudnnConvolutionBackwardFilter_v2( ...@@ -124,11 +131,18 @@ cudnnConvolutionBackwardFilter_v2(
const cudnnFilterDescriptor_t gradDesc, const cudnnFilterDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0); cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData, return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData, diffDesc, diffData,
convDesc, gradDesc, gradData, convDesc, gradDesc, gradData,
CUDNN_RESULT_ACCUMULATE); r);
} }
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2 #define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
...@@ -146,7 +160,16 @@ cudnnConvolutionBackwardData_v2( ...@@ -146,7 +160,16 @@ cudnnConvolutionBackwardData_v2(
const cudnnTensorDescriptor_t gradDesc, const cudnnTensorDescriptor_t gradDesc,
void *gradData) { void *gradData) {
assert(*(float *)alpha == 1.0); assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 1.0); cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* This function needs the casting because its params are not
declared as const */
return cudnnConvolutionBackwardData(handle, return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc, (cudnnFilterDescriptor_t)filterDesc,
filterData, filterData,
...@@ -155,7 +178,7 @@ cudnnConvolutionBackwardData_v2( ...@@ -155,7 +178,7 @@ cudnnConvolutionBackwardData_v2(
(cudnnConvolutionDescriptor_t)convDesc, (cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc, (cudnnTensorDescriptor_t)gradDesc,
gradData, gradData,
CUDNN_RESULT_ACCUMULATE); r);
} }
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2 #define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
......
...@@ -411,7 +411,7 @@ class GpuDnnConv(DnnBase, COp): ...@@ -411,7 +411,7 @@ class GpuDnnConv(DnnBase, COp):
alg_def = ('CONV_ALGO', alg) alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def return [alg_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -427,12 +427,13 @@ class GpuDnnConv(DnnBase, COp): ...@@ -427,12 +427,13 @@ class GpuDnnConv(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, kern, output, desc, alpha], return Apply(self, [img, kern, output, desc, alpha, beta],
[output.type()]) [output.type()])
def grad(self, inp, grads): def grad(self, inp, grads):
img, kerns, output, desc, alpha = inp img, kerns, output, desc, alpha, beta = inp
top, = grads top, = grads
top = gpu_contiguous(top) top = gpu_contiguous(top)
...@@ -440,12 +441,14 @@ class GpuDnnConv(DnnBase, COp): ...@@ -440,12 +441,14 @@ class GpuDnnConv(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return [d_img, d_kerns, top * alpha, DisconnectedType()(), d_alpha] return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1]] return [[1], [1], [1], [0], [1], [1]]
@staticmethod @staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample): def get_out_shape(ishape, kshape, border_mode, subsample):
...@@ -507,7 +510,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -507,7 +510,7 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = False self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha = inp img, top, output, desc, alpha, beta = inp
kerns, = grads kerns, = grads
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
...@@ -515,12 +518,14 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -515,12 +518,14 @@ class GpuDnnConvGradW(DnnBase, COp):
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_img, d_top, kerns * alpha, DisconnectedType()(), d_alpha) return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -528,7 +533,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -528,7 +533,7 @@ class GpuDnnConvGradW(DnnBase, COp):
else: else:
return [] return []
def make_node(self, img, topgrad, output, desc, alpha=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -544,8 +549,9 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -544,8 +549,9 @@ class GpuDnnConvGradW(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, topgrad, output, desc, alpha], return Apply(self, [img, topgrad, output, desc, alpha, beta],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
...@@ -571,7 +577,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -571,7 +577,7 @@ class GpuDnnConvGradI(DnnBase, COp):
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha = inp kerns, top, output, desc, alpha, beta = inp
img, = grads img, = grads
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -579,12 +585,14 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -579,12 +585,14 @@ class GpuDnnConvGradI(DnnBase, COp):
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_kerns, d_top, img * alpha, DisconnectedType()(), d_alpha) return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -592,7 +600,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -592,7 +600,7 @@ class GpuDnnConvGradI(DnnBase, COp):
else: else:
return [] return []
def make_node(self, kern, topgrad, output, desc, alpha=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -608,8 +616,9 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -608,8 +616,9 @@ class GpuDnnConvGradI(DnnBase, COp):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [kern, topgrad, output, desc, alpha], return Apply(self, [kern, topgrad, output, desc, alpha, beta],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
...@@ -1550,47 +1559,41 @@ if True: ...@@ -1550,47 +1559,41 @@ if True:
70.0, 'fast_run', 'inplace', 'gpu', 'cudnn') 70.0, 'fast_run', 'inplace', 'gpu', 'cudnn')
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, nd=4) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, nd=4) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, nd=4) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI()(*inputs)]
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc, CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **output) { float alpha, float beta, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
...@@ -18,7 +18,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -18,7 +18,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
#else #else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0) if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0)
return 1; return 1;
if (CudaNdarray_CopyFromCudaNdarray(*output, om)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1; return 1;
#endif #endif
...@@ -47,8 +47,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -47,8 +47,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
if (workspace == NULL && worksize != 0) if (workspace == NULL && worksize != 0)
return 1; return 1;
const float beta = 1;
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc, CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **input) { float alpha, float beta, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
#else #else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0) if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
return 1; return 1;
if (CudaNdarray_CopyFromCudaNdarray(*input, im)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
const float beta = 1;
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc, CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, CudaNdarray **kerns) { float alpha, float beta, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -18,15 +18,13 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
#else #else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0) if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
return 1; return 1;
if (CudaNdarray_CopyFromCudaNdarray(*kerns, km)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1; return 1;
#endif #endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
const float beta = 1;
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
......
...@@ -129,13 +129,13 @@ class InputToGpuOptimizer(Optimizer): ...@@ -129,13 +129,13 @@ class InputToGpuOptimizer(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
for input in fgraph.inputs: for input in fgraph.inputs:
if isinstance(input.type, CudaNdarrayType): if isinstance(input.type, CudaNdarrayType):
return continue
# This happens frequently as we do 2 passes of the gpu optimizations # This happens frequently as we do 2 passes of the gpu optimizations
if (len(input.clients) == 1 and if (len(input.clients) == 1 and
(input.clients[0][0] == 'output' or (input.clients[0][0] == 'output' or
input.clients[0][0].op == gpu_from_host)): input.clients[0][0].op == gpu_from_host)):
return continue
try: try:
new_input = host_from_gpu(gpu_from_host(input)) new_input = host_from_gpu(gpu_from_host(input))
......
...@@ -5,11 +5,14 @@ import numpy ...@@ -5,11 +5,14 @@ import numpy
import theano import theano
from theano import scalar as scal, Constant from theano import scalar as scal, Constant
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.tensor import DimShuffle from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise) GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd): def grab_cpu_scalar(v, nd):
if v.owner is not None: if v.owner is not None:
n = v.owner n = v.owner
...@@ -28,6 +31,7 @@ def grab_cpu_scalar(v, nd): ...@@ -28,6 +31,7 @@ def grab_cpu_scalar(v, nd):
v.broadcastable == (True,) * nd): v.broadcastable == (True,) * nd):
return v.dimshuffle(()) return v.dimshuffle(())
def find_node(v, cls): def find_node(v, cls):
# This digs through possibly redundant transfers to look for the node # This digs through possibly redundant transfers to look for the node
# that has the op class specified. # that has the op class specified.
...@@ -42,7 +46,17 @@ def find_node(v, cls): ...@@ -42,7 +46,17 @@ def find_node(v, cls):
return None return None
def is_equal(var, val):
    """Tell whether ``var`` is a constant that is always equal to ``val``.

    :param var: the variable to inspect; it is handed to
        ``get_scalar_constant_value``.
    :param val: the Python value to compare against.
    :return: the result of comparing the extracted constant with
        ``val``, or False when ``var`` is not a scalar constant.
    """
    try:
        v = get_scalar_constant_value(var)
    except NotScalarConstantError:
        # The name imported at the top of this module is
        # NotScalarConstantError; catching any other (undefined) name
        # would turn the "not constant" case into a NameError.
        return False
    return v == val
def alpha_merge(cls, alpha_in, beta_in, nd):
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
...@@ -60,19 +74,19 @@ def alpha_merge(cls, alpha_in, nd): ...@@ -60,19 +74,19 @@ def alpha_merge(cls, alpha_in, nd):
return None return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
inputs[alpha_in] = lr * targ.inputs[alpha_in] inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
return maker(targ, *inputs) return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
def output_merge(cls, alpha_in, out_in, nd): def output_merge(cls, alpha_in, beta_in, out_in, nd):
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
def opt(node): def opt(node):
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
(node.op.scalar_op == scal.sub or node.op.scalar_op == scal.add and
node.op.scalar_op == scal.add) and
node.nin == 2): node.nin == 2):
targ = find_node(node.inputs[0], cls) targ = find_node(node.inputs[0], cls)
W = node.inputs[1] W = node.inputs[1]
...@@ -81,15 +95,16 @@ def output_merge(cls, alpha_in, out_in, nd): ...@@ -81,15 +95,16 @@ def output_merge(cls, alpha_in, out_in, nd):
W = node.inputs[0] W = node.inputs[0]
if targ is None: if targ is None:
return None return None
if node.op.scalar_op == scal.sub: if not is_equal(targ.inputs[beta_in], 0.0):
alpha = -targ.inputs[alpha_in] # other cases are too complex for now
W = W - targ.inputs[out_in] return None
else: if W.broadcastable != targ.inputs[out_in].broadcastable:
alpha = targ.inputs[alpha_in] # Would need to explicitly tile the output to fill
W = W + targ.inputs[out_in] # the full shape here. Disable for now.
return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
inputs[out_in] = W inputs[out_in] = W
inputs[alpha_in] = alpha inputs[beta_in] = _one.clone()
return maker(targ, *inputs) return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
...@@ -169,7 +169,10 @@ def test_blocksparse_grad_shape(): ...@@ -169,7 +169,10 @@ def test_blocksparse_grad_shape():
assert W_g.shape == W_val.shape assert W_g.shape == W_val.shape
def test_blocksparse_grad_merge(): # This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokenness.
# Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge():
b = tensor.fmatrix() b = tensor.fmatrix()
h = tensor.ftensor3() h = tensor.ftensor3()
iIdx = tensor.lmatrix() iIdx = tensor.lmatrix()
......
...@@ -466,7 +466,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -466,7 +466,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_dnn_conv_merge(): def test_dnn_conv_merge():
if not cuda.dnn.dnn_available() or cuda.dnn.version() == -1: if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
img = T.ftensor4() img = T.ftensor4()
kern = T.ftensor4() kern = T.ftensor4()
...@@ -475,13 +475,13 @@ def test_dnn_conv_merge(): ...@@ -475,13 +475,13 @@ def test_dnn_conv_merge():
b = 1 b = 1
c = 4 c = 4
f = 3 f = 3
ih = 2 ih = 5
iw = 8 iw = 8
kh = 2 kh = 2
kw = 2 kw = 6
img_val = numpy.random.random((b, c, ih, iw)).astype('float32') img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32') kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
out_val = numpy.random.random((b, f, ih-kw+1, iw-kw+1)).astype('float32') out_val = numpy.random.random((b, f, ih-kh+1, iw-kw+1)).astype('float32')
conv = dnn.dnn_conv(img, kern) conv = dnn.dnn_conv(img, kern)
gw = theano.grad(conv.sum(), kern) gw = theano.grad(conv.sum(), kern)
...@@ -489,9 +489,15 @@ def test_dnn_conv_merge(): ...@@ -489,9 +489,15 @@ def test_dnn_conv_merge():
lr = numpy.asarray(0.05, dtype='float32') lr = numpy.asarray(0.05, dtype='float32')
fr = out - lr * conv if cuda.dnn.version() == -1:
wr = kern - lr * gw # Can't merge alpha with cudnn v1
ir = img - lr * gi fr = conv + out
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu) f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op, assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
...@@ -545,17 +551,19 @@ def test_dnn_conv_grad(): ...@@ -545,17 +551,19 @@ def test_dnn_conv_grad():
def dconv(img, kern, out): def dconv(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConv()(img, kern, out, desc) return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)
def dconvi(img, kern, out): def dconvi(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradI()(kern, out, img, desc) return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
beta=0.0)
def dconvw(img, kern, out): def dconvw(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape) conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradW()(img, out, kern, desc) return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
beta=-1.0)
utt.verify_grad(dconv, [img_val, kern_val, out_val]) utt.verify_grad(dconv, [img_val, kern_val, out_val])
utt.verify_grad(dconvi, [img_val, kern_val, out_val]) utt.verify_grad(dconvi, [img_val, kern_val, out_val])
......
...@@ -1931,9 +1931,10 @@ class Scan(PureOp): ...@@ -1931,9 +1931,10 @@ class Scan(PureOp):
type_outs.append(vl.type.why_null) type_outs.append(vl.type.why_null)
# Replace the inner output with a zero tensor of # Replace the inner output with a zero tensor of
# the right shape # the right shape
inner_out_sitsot[_p] = tensor.zeros( inner_out_nitsot[_p] = tensor.zeros(
diff_inputs[_p].shape, diff_inputs[_p].shape,
dtype=theano.config.floatX) dtype=theano.config.floatX)
if through_shared: if through_shared:
type_outs.append('through_shared') type_outs.append('through_shared')
elif disconnected_dC_dinps_t[_p]: elif disconnected_dC_dinps_t[_p]:
......
...@@ -46,11 +46,11 @@ else: ...@@ -46,11 +46,11 @@ else:
mode_with_gpu = mode_with_opt.including('gpu', 'scan') mode_with_gpu = mode_with_opt.including('gpu', 'scan')
class multiple_outputs_numeric_grad: type_eps = {'float64': 1e-7,
"""WRITEME"""
type_eps = {'float64': 1e-7,
'float32': 3e-3} 'float32': 3e-3}
class multiple_outputs_numeric_grad:
"""WRITEME"""
def __init__(self, f, pt, ndarray_mask=None, eps=None): def __init__(self, f, pt, ndarray_mask=None, eps=None):
"""Return the gradient of f at pt. """Return the gradient of f at pt.
...@@ -78,13 +78,12 @@ class multiple_outputs_numeric_grad: ...@@ -78,13 +78,12 @@ class multiple_outputs_numeric_grad:
if not ndarray_mask: if not ndarray_mask:
ndarray_mask = [True for x in pt] ndarray_mask = [True for x in pt]
dtype_eps = multiple_outputs_numeric_grad.type_eps['float64'] dtype_eps = type_eps['float64']
for i, p in enumerate(pt): for i, p in enumerate(pt):
if ndarray_mask[i]: if ndarray_mask[i]:
pt[i] = numpy.array(p) pt[i] = numpy.array(p)
_eps = multiple_outputs_numeric_grad.type_eps[str( _eps = type_eps[str(pt[i].dtype)]
pt[i].dtype)]
if _eps > dtype_eps: if _eps > dtype_eps:
dtype_eps = _eps dtype_eps = _eps
...@@ -836,6 +835,36 @@ class T_Scan(unittest.TestCase): ...@@ -836,6 +835,36 @@ class T_Scan(unittest.TestCase):
n_steps=2) n_steps=2)
tensor.grad(a[-1], a0) tensor.grad(a[-1], a0)
def test_grad_two_scans(self):
    """Build the gradient of a cost computed by two chained scans.

    The first scan maps each step of the input through a dot product
    with a shared weight matrix followed by a softmax; the second scan
    turns those outputs and the targets into a per-step mean
    cross-entropy. The method contains no assertion: it only requires
    that the gradient with respect to the shared weights can be built.
    """
    # Symbolic data input and target.
    inputs = tensor.tensor3('x')
    targets = tensor.imatrix('t')

    # Shared weights used inside the forward scan.
    weights = theano.shared(
        numpy.random.randn(2, 2).astype('float32'),
        name="W", borrow=True)

    def forward_scanner(x_t):
        return tensor.nnet.softmax(tensor.dot(x_t, weights))

    predictions, _ = theano.scan(fn=forward_scanner, sequences=inputs,
                                 outputs_info=[None])

    def error_scanner(y_t, t_t):
        step_loss = tensor.nnet.categorical_crossentropy(y_t, t_t)
        return tensor.mean(step_loss)

    step_costs, _ = theano.scan(fn=error_scanner,
                                sequences=[predictions, targets],
                                outputs_info=[None])
    cost = tensor.mean(step_costs)

    # Backward pass through both scans.
    tensor.grad(cost, [weights])
# simple rnn, one input, one state, weights for each; input/state are # simple rnn, one input, one state, weights for each; input/state are
# vectors, weights are scalars; using shared variables and past # vectors, weights are scalars; using shared variables and past
# taps (sequences and outputs) # taps (sequences and outputs)
...@@ -3867,6 +3896,42 @@ class T_Scan(unittest.TestCase): ...@@ -3867,6 +3896,42 @@ class T_Scan(unittest.TestCase):
f = theano.function([W, n_steps], H) f = theano.function([W, n_steps], H)
f(numpy.ones((8,), dtype='float32'), 1) f(numpy.ones((8,), dtype='float32'), 1)
def test_strict_mode(self):
    """Compare a scan run with ``strict=False`` and with ``strict=True``.

    The non-strict step function reads the shared weights from its
    closure, while the strict one receives them through
    ``non_sequences``. The last output of both scans must agree within
    the tolerance stored in ``type_eps`` for ``theano.config.floatX``.
    """
    steps = 10
    float_type = theano.config.floatX

    weights_val = numpy.array([[-1, 2], [3, -4]]).astype(float_type)
    weights = theano.shared(weights_val)
    start_val = numpy.array([1, 2]).astype(float_type)
    start = tensor.vector(name='x0', dtype=float_type)

    def _scan_loose(x):
        # The shared variable is picked up implicitly from the closure.
        return tensor.dot(x, weights)

    def _scan_strict(x, w_ns):
        # The shared variable is passed in explicitly as a non-sequence.
        return tensor.dot(x, w_ns)

    loose_outputs = theano.scan(_scan_loose,
                                sequences=[],
                                outputs_info=[start],
                                n_steps=steps,
                                strict=False)
    run_loose = theano.function([start], loose_outputs[0][-1])

    strict_outputs = theano.scan(_scan_strict,
                                 sequences=[],
                                 outputs_info=[start],
                                 non_sequences=[weights],
                                 n_steps=steps,
                                 strict=True)
    run_strict = theano.function([start], strict_outputs[0][-1])

    loose_result = run_loose(start_val)
    strict_result = run_strict(start_val)

    mean_gap = (abs(loose_result - strict_result)).mean()
    assert mean_gap <= type_eps[float_type]
def test_speed(): def test_speed():
# #
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论