Commit 7a6d676f authored by Frédéric Bastien, committed by GitHub

Merge pull request #5688 from cooijmanstim/gpuarray-stack-trace

gpuarray: keep stack trace
...@@ -4,6 +4,7 @@ Node classes (`Apply`, `Variable`) and expression graph algorithms. ...@@ -4,6 +4,7 @@ Node classes (`Apply`, `Variable`) and expression graph algorithms.
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from collections import deque from collections import deque
import contextlib
from copy import copy from copy import copy
from itertools import count from itertools import count
...@@ -390,6 +391,8 @@ class Variable(Node): ...@@ -390,6 +391,8 @@ class Variable(Node):
self.name = name self.name = name
self.auto_name = 'auto_' + str(next(self.__count__)) self.auto_name = 'auto_' + str(next(self.__count__))
Variable.notify_construction_observers(self)
def __str__(self): def __str__(self):
"""Return a str representation of the Variable. """Return a str representation of the Variable.
...@@ -536,6 +539,22 @@ class Variable(Node): ...@@ -536,6 +539,22 @@ class Variable(Node):
d["tag"] = t d["tag"] = t
return d return d
# Refer to the docstring of `nodes_constructed` for the intended usage.
# Each registered observer is a callable taking one argument: the newly
# created Variable instance.  Observers are invoked from
# `notify_construction_observers`, which is called at the end of
# `Variable.__init__`.
construction_observers = []

@classmethod
def append_construction_observer(cls, observer):
    """Register `observer` to be called with every newly created Variable."""
    cls.construction_observers.append(observer)

@classmethod
def remove_construction_observer(cls, observer):
    """Unregister a previously registered `observer`.

    Raises ValueError if `observer` was never registered (list.remove).
    """
    cls.construction_observers.remove(observer)

@classmethod
def notify_construction_observers(cls, instance):
    """Call every registered observer with the newly created `instance`."""
    for observer in cls.construction_observers:
        observer(instance)
class Constant(Variable): class Constant(Variable):
""" """
...@@ -1426,3 +1445,38 @@ def is_in_ancestors(l_node, f_node): ...@@ -1426,3 +1445,38 @@ def is_in_ancestors(l_node, f_node):
todo.append(cur) todo.append(cur)
todo.extend(i.owner for i in cur.inputs if i.owner) todo.extend(i.owner for i in cur.inputs if i.owner)
return False return False
@contextlib.contextmanager
def nodes_constructed():
    """
    Context manager that keeps track of all `Variable` nodes created in
    its body (used by `inherit_stack_trace`).

    An empty ``new_nodes`` list is yielded immediately and is filled
    lazily: a local ``observer`` function is registered on the `Variable`
    class, and every time a Variable is instantiated,
    ``Variable.notify_construction_observers`` calls the observer, which
    appends the new node to ``new_nodes``.

    Yields
    ------
    new_nodes : list
        A list of all the variable nodes created inside the body of the
        ``with`` statement.
    """
    new_nodes = []

    def observer(node):
        new_nodes.append(node)

    Variable.append_construction_observer(observer)
    try:
        yield new_nodes
    finally:
        # Always detach the observer, even if the body raises.  Without
        # the try/finally, an exception inside an optimization would leave
        # a stale observer registered forever, so every later Variable
        # construction would keep appending to a dead list.
        Variable.remove_construction_observer(observer)
...@@ -6,6 +6,7 @@ amount of useful generic optimization tools. ...@@ -6,6 +6,7 @@ amount of useful generic optimization tools.
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from collections import deque, defaultdict, OrderedDict from collections import deque, defaultdict, OrderedDict
import contextlib
import copy import copy
import inspect import inspect
import logging import logging
...@@ -2902,7 +2903,7 @@ def pre_greedy_local_optimizer(list_optimizations, out): ...@@ -2902,7 +2903,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
def copy_stack_trace(from_var, to_var): def copy_stack_trace(from_var, to_var):
""" """
Copies the stack trace from one or more tensor variables to Copies the stack trace from one or more tensor variables to
one or more tensor variables. one or more tensor variables and returns the destination variables.
Parameters Parameters
---------- ----------
...@@ -2946,6 +2947,25 @@ def copy_stack_trace(from_var, to_var): ...@@ -2946,6 +2947,25 @@ def copy_stack_trace(from_var, to_var):
# Copy over stack traces from from_var to each variable to # Copy over stack traces from from_var to each variable to
# to_var, including the stack_trace of the to_var before # to_var, including the stack_trace of the to_var before
to_var.tag.trace = getattr(to_var.tag, 'trace', []) + tr to_var.tag.trace = getattr(to_var.tag, 'trace', []) + tr
return to_var
@contextlib.contextmanager
def inherit_stack_trace(from_var):
    """
    Context manager that copies the stack trace from one or more variable
    nodes to every variable node constructed in the body.

    The list of newly created variable nodes is collected by
    ``graph.nodes_constructed()``; after the body completes,
    ``copy_stack_trace`` propagates the trace from `from_var` onto all of
    them.

    Note: since this is a generator-based context manager, an exception
    raised in the body propagates out of the ``yield`` and the final
    ``copy_stack_trace`` call is skipped — traces are only copied when the
    body completes normally.

    Parameters
    ----------
    from_var
        Variable node or a list of variable nodes to copy stack traces from.
    """
    with graph.nodes_constructed() as new_nodes:
        yield
    copy_stack_trace(from_var, new_nodes)
def check_stack_trace(f_or_fgraph, ops_to_check='last', bug_print='raise'): def check_stack_trace(f_or_fgraph, ops_to_check='last', bug_print='raise'):
......
...@@ -15,6 +15,7 @@ from theano.tensor.basic import ( ...@@ -15,6 +15,7 @@ from theano.tensor.basic import (
from theano.gof import HideC, COp, ParamsType from theano.gof import HideC, COp, ParamsType
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.gof.opt import copy_stack_trace
from collections import deque from collections import deque
...@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name): ...@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor # If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x) return copy_stack_trace(x, GpuFromHost(context_name)(x))
# Try _as_GpuArrayVariable if possible # Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable(context_name) return copy_stack_trace(x, x._as_GpuArrayVariable(context_name))
# If it didn't work try for a constant # If it didn't work try for a constant
ctx = get_context(context_name) ctx = get_context(context_name)
......
...@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented ...@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.gof.opt import inherit_stack_trace
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i, shape_i_op from theano.compile.ops import shape_i, shape_i_op
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
...@@ -3127,9 +3128,11 @@ def local_abstractconv_cudnn(node): ...@@ -3127,9 +3128,11 @@ def local_abstractconv_cudnn(node):
if node.op.unshared: if node.op.unshared:
return None return None
if isinstance(node.op, AbstractConv2d): if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d): elif isinstance(node.op, AbstractConv3d):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]) @local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
...@@ -3352,9 +3355,11 @@ def local_abstractconv_gw_cudnn(node): ...@@ -3352,9 +3355,11 @@ def local_abstractconv_gw_cudnn(node):
if node.op.unshared: if node.op.unshared:
return None return None
if isinstance(node.op, AbstractConv2d_gradWeights): if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights): elif isinstance(node.op, AbstractConv3d_gradWeights):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs]) @local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
...@@ -3365,9 +3370,11 @@ def local_abstractconv_gi_cudnn(node): ...@@ -3365,9 +3370,11 @@ def local_abstractconv_gi_cudnn(node):
if node.op.unshared: if node.op.unshared:
return None return None
if isinstance(node.op, AbstractConv2d_gradInputs): if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) with inherit_stack_trace(node.outputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs): elif isinstance(node.op, AbstractConv3d_gradInputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) with inherit_stack_trace(node.outputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2) @inplace_allocempty(GpuDnnConv, 2)
...@@ -3384,7 +3391,6 @@ def local_dnn_convgw_inplace(node, inputs): ...@@ -3384,7 +3391,6 @@ def local_dnn_convgw_inplace(node, inputs):
def local_dnn_convgi_inplace(node, inputs): def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace, local_dnn_convgw_inplace,
...@@ -3654,11 +3660,12 @@ def local_dnn_reduction(node): ...@@ -3654,11 +3660,12 @@ def local_dnn_reduction(node):
if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name): if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
return return
return (GpuDnnReduction(node.op.scalar_op.name, with inherit_stack_trace(node.outputs):
node.op.axis, return (GpuDnnReduction(node.op.scalar_op.name,
node.op.acc_dtype, node.op.axis,
node.op.dtype, node.op.acc_dtype,
False)(node.inputs[0]),) node.op.dtype,
False)(node.inputs[0]),)
@register_opt('cudnn') @register_opt('cudnn')
......
...@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i ...@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer, from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB, LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph) SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import (LocalMetaOptimizer, copy_stack_trace,
inherit_stack_trace)
from theano.ifelse import IfElse from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet from theano.misc.ordered_set import OrderedSet
...@@ -252,12 +253,25 @@ def op_lifter(OP, cuda_only=False): ...@@ -252,12 +253,25 @@ def op_lifter(OP, cuda_only=False):
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
return [safe_to_cpu(o) for o in new_outputs = new_op(*node.inputs, return_list=True)
new_op(*node.inputs, return_list=True)] to_cpu_fn = safe_to_cpu
elif isinstance(new_op, (tuple, list)): elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op] new_outputs = new_op
to_cpu_fn = safe_to_cpu
else: # suppose it is a variable on the GPU else: # suppose it is a variable on the GPU
return [new_op.transfer('cpu')] new_outputs = [new_op]
def to_cpu_fn(x):
return x.transfer('cpu')
# copy stack traces onto gpu outputs
# also copy the stack traces onto HostFromGpu outputs
on_cpu = []
for old_output, new_output in zip(node.outputs, new_outputs):
copy_stack_trace(old_output, new_output)
cpu = to_cpu_fn(new_output)
on_cpu.append(cpu)
copy_stack_trace(old_output, cpu)
return on_cpu
return False return False
local_opt.__name__ = maker.__name__ local_opt.__name__ = maker.__name__
return local_optimizer(OP)(local_opt) return local_optimizer(OP)(local_opt)
...@@ -419,6 +433,9 @@ class GraphToGPU(Optimizer): ...@@ -419,6 +433,9 @@ class GraphToGPU(Optimizer):
elif isinstance(new_ops, theano.Variable): elif isinstance(new_ops, theano.Variable):
outputs = [new_ops] outputs = [new_ops]
for old_output, new_output in zip(node.outputs, outputs):
copy_stack_trace(old_output, new_output)
if new_ops: if new_ops:
node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs)) node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None) if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
...@@ -451,7 +468,7 @@ class GraphToGPU(Optimizer): ...@@ -451,7 +468,7 @@ class GraphToGPU(Optimizer):
new_o.owner.inputs[0].type == o.type): new_o.owner.inputs[0].type == o.type):
new_o = new_o.owner.inputs[0] new_o = new_o.owner.inputs[0]
else: else:
new_o = safe_to_cpu(new_o) new_o = copy_stack_trace(o, safe_to_cpu(new_o))
new_nodes.append(new_o) new_nodes.append(new_o)
fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes), fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
reason=self.__class__.__name__) reason=self.__class__.__name__)
...@@ -650,7 +667,8 @@ def local_gpualloc_memset_0(node): ...@@ -650,7 +667,8 @@ def local_gpualloc_memset_0(node):
inp.data.size == 1 and inp.data.size == 1 and
(np.asarray(inp.data) == 0).all()): (np.asarray(inp.data) == 0).all()):
new_op = GpuAlloc(node.op.context_name, memset_0=True) new_op = GpuAlloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)] with inherit_stack_trace(node.outputs):
return new_op(*node.inputs, return_list=True)
# Don't register by default. # Don't register by default.
...@@ -659,10 +677,9 @@ def local_gpua_alloc_empty_to_zeros(node): ...@@ -659,10 +677,9 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty): if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs) context_name = infer_context_name(*node.inputs)
z = np.asarray(0, dtype=node.outputs[0].dtype) z = np.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name), with inherit_stack_trace(node.outputs):
*node.inputs)] return [GpuAlloc(context_name)(
as_gpuarray_variable(z, context_name), *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros', optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros), theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace. # After move to gpu and merge2, before inplace.
...@@ -1206,7 +1223,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1206,7 +1223,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
op.scalar_op, axis=op.axis, op.scalar_op, axis=op.axis,
dtype=odtype, dtype=odtype,
acc_dtype=adtype) acc_dtype=adtype)
gvar = greduce(x) with inherit_stack_trace(outputs):
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op2 is GpuCAReduceCPY or if (op2 is GpuCAReduceCPY or
...@@ -1246,25 +1264,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1246,25 +1264,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
dtype=getattr(op, 'dtype', outputs[0].dtype), dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) with inherit_stack_trace(outputs):
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gvar = greduce(gpu_reshaped_x) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
reshaped_gpu_inputs = [gpu_reshaped_x] gvar = greduce(gpu_reshaped_x)
if greduce.supports_c_code(reshaped_gpu_inputs): reshaped_gpu_inputs = [gpu_reshaped_x]
reduce_reshaped_x = greduce(gpu_reshaped_x) if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = greduce(gpu_reshaped_x)
if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = [] if reduce_reshaped_x.ndim != outputs[0].ndim:
for i in range(x.ndim): out_shp = []
if i not in op.axis: for i in range(x.ndim):
out_shp.append(shape_i(x, i)) if i not in op.axis:
unreshaped_reduce = GpuReshape(len(out_shp))(reduce_reshaped_x, out_shp.append(shape_i(x, i))
tensor.stack(out_shp)) unreshaped_reduce = GpuReshape(len(out_shp))(
else: reduce_reshaped_x,
unreshaped_reduce = reduce_reshaped_x tensor.stack(out_shp))
return [unreshaped_reduce] else:
unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce]
@register_opt('fast_compile') @register_opt('fast_compile')
...@@ -1305,33 +1325,34 @@ def local_gpua_gemm(op, context_name, inputs, outputs): ...@@ -1305,33 +1325,34 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
def local_gpua_gemmbatch(op, context_name, inputs, outputs): def local_gpua_gemmbatch(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32', 'float64']: if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return return
a, b = inputs with inherit_stack_trace(outputs):
# Since GpuGemmBatch only supports 3D inputs and output, a, b = inputs
# we need to add broadcastable dims to the inputs, and drop # Since GpuGemmBatch only supports 3D inputs and output,
# them from outputs # we need to add broadcastable dims to the inputs, and drop
output_dims = [0, 1, 2] # them from outputs
if a.ndim == 2: output_dims = [0, 1, 2]
a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a) if a.ndim == 2:
del output_dims[1] a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a)
if b.ndim == 2: del output_dims[1]
b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b) if b.ndim == 2:
del output_dims[-1] b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b)
# In case of mismatched dtypes, we also have to upcast del output_dims[-1]
out_dtype = outputs[0].dtype # In case of mismatched dtypes, we also have to upcast
if a.dtype != out_dtype or b.dtype != out_dtype: out_dtype = outputs[0].dtype
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype))) if a.dtype != out_dtype or b.dtype != out_dtype:
if a.dtype != out_dtype: gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
a = gpu_cast_op(a) if a.dtype != out_dtype:
if b.dtype != out_dtype: a = gpu_cast_op(a)
b = gpu_cast_op(b) if b.dtype != out_dtype:
b = gpu_cast_op(b)
c = GpuAllocEmpty(out_dtype, context_name)(
a.shape[0], a.shape[1], b.shape[2]) c = GpuAllocEmpty(out_dtype, context_name)(
out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype), a.shape[0], a.shape[1], b.shape[2])
a, b, np.asarray(0.0, dtype=out_dtype)) out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype),
if len(output_dims) != 3: a, b, np.asarray(0.0, dtype=out_dtype))
out = GpuDimShuffle(out.broadcastable, output_dims)(out) if len(output_dims) != 3:
return out out = GpuDimShuffle(out.broadcastable, output_dims)(out)
return out
@register_opt() @register_opt()
...@@ -1378,11 +1399,12 @@ def local_gpua_dot22(op, context_name, inputs, outputs): ...@@ -1378,11 +1399,12 @@ def local_gpua_dot22(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Dot22Scalar]) @op_lifter([tensor.blas.Dot22Scalar])
@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile') @register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
def local_gpua_dot22scalar(op, context_name, inputs, outputs): def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs with inherit_stack_trace(outputs):
x = as_gpuarray_variable(x, context_name) x, y, a = inputs
y = as_gpuarray_variable(y, context_name) x = as_gpuarray_variable(x, context_name)
z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1]) y = as_gpuarray_variable(y, context_name)
return [gpugemm_no_inplace(z, a, x, y, 0)] z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)]
@register_opt('fast_compile') @register_opt('fast_compile')
...@@ -2392,7 +2414,8 @@ def local_gpu_elemwise_careduce(node): ...@@ -2392,7 +2414,8 @@ def local_gpu_elemwise_careduce(node):
props = node.op._props_dict() props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr props["pre_scalar_op"] = scalar.basic.sqr
out = GpuCAReduceCuda(**props)(inp) out = GpuCAReduceCuda(**props)(inp)
return [out] with inherit_stack_trace(node.outputs):
return [out]
@local_optimizer(None) @local_optimizer(None)
...@@ -2583,8 +2606,9 @@ def local_gpu_solve(op, context_name, inputs, outputs): ...@@ -2583,8 +2606,9 @@ def local_gpu_solve(op, context_name, inputs, outputs):
@local_optimizer([GpuCusolverSolve], inplace=True) @local_optimizer([GpuCusolverSolve], inplace=True)
def local_inplace_gpu_solve(node): def local_inplace_gpu_solve(node):
if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace: if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans, with inherit_stack_trace(node.outputs):
inplace=True)(*node.inputs)] return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans,
inplace=True)(*node.inputs)]
# Cholesky decomposition # Cholesky decomposition
...@@ -2622,7 +2646,8 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops ...@@ -2622,7 +2646,8 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops
@local_optimizer([GpuCholesky], inplace=True) @local_optimizer([GpuCholesky], inplace=True)
def local_inplace_gpu_cholesky(node): def local_inplace_gpu_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [node.op.clone_inplace()(*node.inputs)] with inherit_stack_trace(node.outputs):
return [node.op.clone_inplace()(*node.inputs)]
def local_gpu_magma_cholesky(op, context_name, inputs, outputs): def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
...@@ -2705,7 +2730,8 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs): ...@@ -2705,7 +2730,8 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
@local_optimizer([GpuMagmaMatrixInverse]) @local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_gpu_magma_matrix_inverse(node): def local_inplace_gpu_magma_matrix_inverse(node):
if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace: if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
return [node.op.clone_inplace()(*node.inputs)] with inherit_stack_trace(node.outputs):
return [node.op.clone_inplace()(*node.inputs)]
# Eigen decomposition of a symmetric matrix # Eigen decomposition of a symmetric matrix
......
...@@ -5,6 +5,7 @@ import numpy as np ...@@ -5,6 +5,7 @@ import numpy as np
from theano import tensor, scalar as scal, Constant from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.gof.opt import inherit_stack_trace
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
...@@ -184,7 +185,8 @@ def alpha_merge(cls, alpha_in, beta_in): ...@@ -184,7 +185,8 @@ def alpha_merge(cls, alpha_in, beta_in):
except NotScalarConstantError: except NotScalarConstantError:
inputs[alpha_in] = lr * targ.inputs[alpha_in] inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in] inputs[beta_in] = lr * targ.inputs[beta_in]
return maker(targ, *inputs) with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
...@@ -272,7 +274,8 @@ def output_merge(cls, alpha_in, beta_in, out_in): ...@@ -272,7 +274,8 @@ def output_merge(cls, alpha_in, beta_in, out_in):
inputs = list(targ.inputs) inputs = list(targ.inputs)
inputs[out_in] = W inputs[out_in] = W
inputs[beta_in] = _one.clone() inputs[beta_in] = _one.clone()
return maker(targ, *inputs) with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
...@@ -326,7 +329,8 @@ def inplace_allocempty(op, idx): ...@@ -326,7 +329,8 @@ def inplace_allocempty(op, idx):
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc.owner.op.context_name) alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) with inherit_stack_trace(node.outputs):
return maker(node, inputs)
return opt return opt
return wrapper return wrapper
......
...@@ -8,12 +8,13 @@ import theano.tensor.slinalg as slinalg ...@@ -8,12 +8,13 @@ import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt, test_ifelse from theano.tests import unittest_tools as utt, test_ifelse
from theano.tensor.tests import test_basic from theano.tensor.tests import test_basic
from theano.gof.opt import check_stack_trace
import theano.gpuarray import theano.gpuarray
from .. import basic_ops from .. import basic_ops
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import ( from ..basic_ops import (
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu) GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, HostFromGpu, host_from_gpu)
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import ( from ..elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise) GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise)
...@@ -27,6 +28,28 @@ from theano.tensor.nnet import abstract_conv ...@@ -27,6 +28,28 @@ from theano.tensor.nnet import abstract_conv
from theano.gpuarray import dnn, blas from theano.gpuarray import dnn, blas
def _check_stack_trace(thing):
    """Run ``check_stack_trace`` on `thing`, skipping ops that are known
    not to carry a stack trace, and return its boolean result instead of
    raising."""
    # Ops whose outputs legitimately lack a stack trace; nodes applying
    # any of these are excluded from the check.
    ignored_op_types = (
        theano.compile.ops.Shape_i,
        theano.compile.ops.Shape,
        theano.compile.ops.DeepCopyOp,
        theano.tensor.opt.MakeVector,
        theano.tensor.subtensor.Subtensor,
        theano.tensor.elemwise.Elemwise,
        theano.ifelse.IfElse,
        GpuFromHost, HostFromGpu,
        GpuCAReduceCuda,
        basic_ops.GpuContiguous,
        GpuElemwise,
        theano.printing.Print,
        PdbBreakpoint,
    )

    def _ops_to_check(op):
        # `op` may be either an Op or an Apply node; unwrap the latter.
        actual_op = op if isinstance(op, theano.gof.Op) else op.op
        return not isinstance(actual_op, ignored_op_types)

    return check_stack_trace(thing, ops_to_check=_ops_to_check,
                             bug_print="ignore")
def test_local_assert(): def test_local_assert():
x = theano.tensor.fmatrix() x = theano.tensor.fmatrix()
a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any()) a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
...@@ -70,6 +93,8 @@ def test_local_gpu_contiguous_gpu_contiguous(): ...@@ -70,6 +93,8 @@ def test_local_gpu_contiguous_gpu_contiguous():
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
assert 1 == len([node for node in f2.maker.fgraph.toposort() assert 1 == len([node for node in f2.maker.fgraph.toposort()
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
assert _check_stack_trace(f1)
assert _check_stack_trace(f2)
def test_local_gpu_contiguous(): def test_local_gpu_contiguous():
...@@ -79,6 +104,7 @@ def test_local_gpu_contiguous(): ...@@ -79,6 +104,7 @@ def test_local_gpu_contiguous():
assert 1 == len([node for node in f.maker.fgraph.toposort() assert 1 == len([node for node in f.maker.fgraph.toposort()
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
f([[2.]]) f([[2.]])
assert _check_stack_trace(f)
def test_flatten(): def test_flatten():
...@@ -96,6 +122,7 @@ def test_flatten(): ...@@ -96,6 +122,7 @@ def test_flatten():
assert res.shape == val.flatten().shape assert res.shape == val.flatten().shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
assert _check_stack_trace(f)
f = theano.function([m], m.flatten(ndim=2), f = theano.function([m], m.flatten(ndim=2),
mode=mode_with_gpu.excluding("local_useless_reshape")) mode=mode_with_gpu.excluding("local_useless_reshape"))
...@@ -105,6 +132,7 @@ def test_flatten(): ...@@ -105,6 +132,7 @@ def test_flatten():
assert res.shape == val.shape assert res.shape == val.shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
assert _check_stack_trace(f)
m = theano.tensor.tensor3() m = theano.tensor.tensor3()
f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu) f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
...@@ -114,6 +142,7 @@ def test_flatten(): ...@@ -114,6 +142,7 @@ def test_flatten():
assert res.shape == val.reshape(10, -1).shape assert res.shape == val.reshape(10, -1).shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
assert _check_stack_trace(f)
def test_reduce(): def test_reduce():
...@@ -126,6 +155,9 @@ def test_reduce(): ...@@ -126,6 +155,9 @@ def test_reduce():
f = theano.function([m], getattr(m, method)(axis=0, f = theano.function([m], getattr(m, method)(axis=0,
**param), **param),
mode=mode_with_gpu) mode=mode_with_gpu)
# assert _check_stack_trace(f) this op is ok but since
# it is using GpuCAReduceCuda that has an empty stack
# trace, this assertion gives error.
val = np.random.rand(10, 11).astype("float32") val = np.random.rand(10, 11).astype("float32")
res = f(val) res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0)) utt.assert_allclose(res, getattr(val, method)(axis=0))
...@@ -157,6 +189,7 @@ def test_local_gpualloc_memset_0(): ...@@ -157,6 +189,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 1 assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.Alloc) assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (np.asarray(f(6)) == 0).all() assert (np.asarray(f(6)) == 0).all()
assert _check_stack_trace(f)
# Test with 0 from CPU op. # Test with 0 from CPU op.
# Should be transfered as it is used by another op. # Should be transfered as it is used by another op.
...@@ -166,6 +199,7 @@ def test_local_gpualloc_memset_0(): ...@@ -166,6 +199,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 3 assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert (np.asarray(f(6)) == 0).all() assert (np.asarray(f(6)) == 0).all()
assert _check_stack_trace(f)
# Test with 0 # Test with 0
a = GpuAlloc(test_ctx_name)(z, i) a = GpuAlloc(test_ctx_name)(z, i)
...@@ -174,6 +208,7 @@ def test_local_gpualloc_memset_0(): ...@@ -174,6 +208,7 @@ def test_local_gpualloc_memset_0():
assert len(topo) == 1 assert len(topo) == 1
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0 assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
assert (np.asarray(f(6)) == 0).all() assert (np.asarray(f(6)) == 0).all()
assert _check_stack_trace(f)
# Test with 1 # Test with 1
a = GpuAlloc(test_ctx_name)(o, i) a = GpuAlloc(test_ctx_name)(o, i)
...@@ -183,6 +218,7 @@ def test_local_gpualloc_memset_0(): ...@@ -183,6 +218,7 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0 assert not topo[0].op.memset_0
assert (np.asarray(f(6)) == 1).all() assert (np.asarray(f(6)) == 1).all()
assert _check_stack_trace(f)
# Test with 1, 1 # Test with 1, 1
a = GpuAlloc(test_ctx_name)(ones, i) a = GpuAlloc(test_ctx_name)(ones, i)
...@@ -192,6 +228,7 @@ def test_local_gpualloc_memset_0(): ...@@ -192,6 +228,7 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc) assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0 assert not topo[0].op.memset_0
assert (np.asarray(f(2)) == 1).all() assert (np.asarray(f(2)) == 1).all()
assert _check_stack_trace(f)
def test_local_gpualloc_empty(): def test_local_gpualloc_empty():
...@@ -207,6 +244,7 @@ def test_local_gpualloc_empty(): ...@@ -207,6 +244,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, theano.tensor.AllocEmpty) assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
assert _check_stack_trace(f)
# Test with vector # Test with vector
# Should be moved # Should be moved
...@@ -217,6 +255,7 @@ def test_local_gpualloc_empty(): ...@@ -217,6 +255,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
assert _check_stack_trace(f)
# Test with matrix # Test with matrix
a = tensor.AllocEmpty('float32')(i, ii) a = tensor.AllocEmpty('float32')(i, ii)
...@@ -226,6 +265,7 @@ def test_local_gpualloc_empty(): ...@@ -226,6 +265,7 @@ def test_local_gpualloc_empty():
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3, 4).shape == (3, 4) assert f(3, 4).shape == (3, 4)
assert _check_stack_trace(f)
def test_rebroadcast(): def test_rebroadcast():
...@@ -243,6 +283,7 @@ def test_rebroadcast(): ...@@ -243,6 +283,7 @@ def test_rebroadcast():
assert isinstance(rebr.inputs[0].type, GpuArrayType) assert isinstance(rebr.inputs[0].type, GpuArrayType)
assert isinstance(rebr.outputs[0].type, GpuArrayType) assert isinstance(rebr.outputs[0].type, GpuArrayType)
assert _check_stack_trace(f)
class TestSpecifyShape(test_basic.TestSpecifyShape): class TestSpecifyShape(test_basic.TestSpecifyShape):
...@@ -268,6 +309,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse): ...@@ -268,6 +309,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano.ifelse.ifelse(cond, x.mean(), x.sum()), theano.ifelse.ifelse(cond, x.mean(), x.sum()),
mode=mode_with_gpu) mode=mode_with_gpu)
assert f(np.float32([1, 2, 3]), 0) == 6 assert f(np.float32([1, 2, 3]), 0) == 6
assert _check_stack_trace(f)
x = tensor.vector() x = tensor.vector()
cond = tensor.scalar() cond = tensor.scalar()
...@@ -275,6 +317,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse): ...@@ -275,6 +317,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano.ifelse.ifelse(cond, x.mean(), x.sum()), theano.ifelse.ifelse(cond, x.mean(), x.sum()),
mode=mode_with_gpu) mode=mode_with_gpu)
assert f(np.float32([1, 2, 3]), 0) == 6 assert f(np.float32([1, 2, 3]), 0) == 6
assert _check_stack_trace(f)
def test_lifter_with_shared_var(self): def test_lifter_with_shared_var(self):
x = tensor.lscalar('x') x = tensor.lscalar('x')
...@@ -297,6 +340,7 @@ def test_print_op(): ...@@ -297,6 +340,7 @@ def test_print_op():
assert isinstance(topo[1].op, theano.printing.Print) assert isinstance(topo[1].op, theano.printing.Print)
assert isinstance(topo[2].op, GpuElemwise) assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu assert topo[3].op == host_from_gpu
assert _check_stack_trace(f)
f(np.random.random((5, 5)).astype('float32')) f(np.random.random((5, 5)).astype('float32'))
...@@ -317,6 +361,7 @@ def test_pdbbreakpoint_op(): ...@@ -317,6 +361,7 @@ def test_pdbbreakpoint_op():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert isinstance(topo[-2].op, GpuElemwise) assert isinstance(topo[-2].op, GpuElemwise)
assert topo[-1].op == host_from_gpu assert topo[-1].op == host_from_gpu
assert _check_stack_trace(f)
def test_local_gpu_elemwise_careduce(): def test_local_gpu_elemwise_careduce():
...@@ -326,6 +371,7 @@ def test_local_gpu_elemwise_careduce(): ...@@ -326,6 +371,7 @@ def test_local_gpu_elemwise_careduce():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 3 assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX) data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), (data * data).sum()) utt.assert_allclose(f(data), (data * data).sum())
...@@ -334,6 +380,7 @@ def test_local_gpu_elemwise_careduce(): ...@@ -334,6 +380,7 @@ def test_local_gpu_elemwise_careduce():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 3 assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f)
utt.assert_allclose(f(data), (data * data).sum(axis=1)) utt.assert_allclose(f(data), (data * data).sum(axis=1))
...@@ -352,6 +399,7 @@ def test_local_lift_dot22scalar(): ...@@ -352,6 +399,7 @@ def test_local_lift_dot22scalar():
y_val = np.random.random((3, 4)).astype(theano.config.floatX) y_val = np.random.random((3, 4)).astype(theano.config.floatX)
a_val = 0.5 a_val = 0.5
utt.assert_allclose(f_cpu(x_val, y_val, a_val), f_gpu(x_val, y_val, a_val)) utt.assert_allclose(f_cpu(x_val, y_val, a_val), f_gpu(x_val, y_val, a_val))
assert _check_stack_trace(f_gpu)
def test_local_gpu_subtensor(): def test_local_gpu_subtensor():
...@@ -361,6 +409,7 @@ def test_local_gpu_subtensor(): ...@@ -361,6 +409,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test graph input. # Test graph input.
t = tensor.fmatrix() t = tensor.fmatrix()
...@@ -368,6 +417,7 @@ def test_local_gpu_subtensor(): ...@@ -368,6 +417,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test multiple use of the input # Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
...@@ -376,6 +426,7 @@ def test_local_gpu_subtensor(): ...@@ -376,6 +426,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo]) assert any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test multiple use of the input + input as output # Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
...@@ -384,6 +435,7 @@ def test_local_gpu_subtensor(): ...@@ -384,6 +435,7 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo]) assert any([isinstance(node.op, GpuSubtensor) for node in topo])
assert _check_stack_trace(f)
# Test shared forced on CPU end we do computation on the output of # Test shared forced on CPU end we do computation on the output of
# the subtensor. # the subtensor.
...@@ -396,6 +448,7 @@ def test_local_gpu_subtensor(): ...@@ -396,6 +448,7 @@ def test_local_gpu_subtensor():
# If it where just a little bit smarter, it could wrongly move it to the GPU. # If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU. # If it where super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo]) assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
assert _check_stack_trace(f)
def test_local_gpu_elemwise(): def test_local_gpu_elemwise():
...@@ -417,6 +470,7 @@ def test_local_gpu_elemwise(): ...@@ -417,6 +470,7 @@ def test_local_gpu_elemwise():
assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
assert _check_stack_trace(f)
# Now test with the composite already on the cpu before we move it # Now test with the composite already on the cpu before we move it
# to the gpu # to the gpu
...@@ -430,6 +484,7 @@ def test_local_gpu_elemwise(): ...@@ -430,6 +484,7 @@ def test_local_gpu_elemwise():
assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
assert _check_stack_trace(f)
return # Not yet implemeted return # Not yet implemeted
# Test multiple output # Test multiple output
...@@ -447,6 +502,7 @@ def test_local_gpu_elemwise(): ...@@ -447,6 +502,7 @@ def test_local_gpu_elemwise():
utt.assert_allclose(out[0], a_v) utt.assert_allclose(out[0], a_v)
utt.assert_allclose(out[1], c_v) utt.assert_allclose(out[1], c_v)
utt.assert_allclose(out[2], b_v) utt.assert_allclose(out[2], b_v)
assert _check_stack_trace(f)
# Test multiple output # Test multiple output
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s]) out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
...@@ -458,6 +514,7 @@ def test_local_gpu_elemwise(): ...@@ -458,6 +514,7 @@ def test_local_gpu_elemwise():
out = f(a_v, b_v, c_v) out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v + b_v) utt.assert_allclose(out[0], a_v + b_v)
utt.assert_allclose(out[1], a_v * c_v) utt.assert_allclose(out[1], a_v * c_v)
assert _check_stack_trace(f)
# Test non-contiguous input # Test non-contiguous input
c = gpuarray_shared_constructor(np.asarray(c_v, dtype='float32')) c = gpuarray_shared_constructor(np.asarray(c_v, dtype='float32'))
...@@ -466,6 +523,7 @@ def test_local_gpu_elemwise(): ...@@ -466,6 +523,7 @@ def test_local_gpu_elemwise():
out = f(a_v, b_v) out = f(a_v, b_v)
utt.assert_allclose(out[0], a_v[::2] + b_v[::2]) utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
utt.assert_allclose(out[1], a_v[::2] * c_v[::2]) utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
assert _check_stack_trace(f)
def test_many_arg_elemwise(): def test_many_arg_elemwise():
...@@ -541,7 +599,8 @@ def test_local_lift_abstractconv_gpu_shape(): ...@@ -541,7 +599,8 @@ def test_local_lift_abstractconv_gpu_shape():
a = tensor.ftensor4() a = tensor.ftensor4()
b = tensor.ftensor4() b = tensor.ftensor4()
c = tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(a, b, s) c = tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(a, b, s)
theano.function([s, a, b], c, mode=mode_with_gpu) f = theano.function([s, a, b], c, mode=mode_with_gpu)
assert _check_stack_trace(f)
finally: finally:
theano.config.on_opt_error = prev theano.config.on_opt_error = prev
...@@ -571,7 +630,8 @@ def test_local_assert_no_cpu_op(): ...@@ -571,7 +630,8 @@ def test_local_assert_no_cpu_op():
# If the flag is ignore # If the flag is ignore
try: try:
theano.config.assert_no_cpu_op = 'ignore' theano.config.assert_no_cpu_op = 'ignore'
theano.function([], out, mode=mode_local_assert) f = theano.function([], out, mode=mode_local_assert)
assert _check_stack_trace(f)
finally: finally:
theano.config.assert_no_cpu_op = old theano.config.assert_no_cpu_op = old
...@@ -581,8 +641,9 @@ def test_no_complex(): ...@@ -581,8 +641,9 @@ def test_no_complex():
freq_var = tensor.fscalar() freq_var = tensor.fscalar()
signal_var = tensor.fscalar() signal_var = tensor.fscalar()
stft_out = tensor.exp(width_var * freq_var) * signal_var stft_out = tensor.exp(width_var * freq_var) * signal_var
theano.function([width_var, freq_var, signal_var], stft_out, f = theano.function([width_var, freq_var, signal_var], stft_out,
mode=mode_with_gpu) mode=mode_with_gpu)
assert _check_stack_trace(f)
@utt.assertFailure_fast @utt.assertFailure_fast
...@@ -601,6 +662,7 @@ def test_local_lift_solve(): ...@@ -601,6 +662,7 @@ def test_local_lift_solve():
A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32") A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32") b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val)) utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
assert _check_stack_trace(f_gpu)
def test_gpu_solve_not_inplace(): def test_gpu_solve_not_inplace():
...@@ -665,7 +727,8 @@ def test_local_gpua_advanced_incsubtensor(): ...@@ -665,7 +727,8 @@ def test_local_gpua_advanced_incsubtensor():
w = tensor.ones_like(y) w = tensor.ones_like(y)
w = tensor.set_subtensor(w[tensor.eq(y, 1.0).nonzero()], 100) w = tensor.set_subtensor(w[tensor.eq(y, 1.0).nonzero()], 100)
w = tensor.set_subtensor(w[tensor.eq(y, -1.0).nonzero()], 0) w = tensor.set_subtensor(w[tensor.eq(y, -1.0).nonzero()], 0)
theano.function([target], w) f = theano.function([target], w)
assert _check_stack_trace(f)
def test_batched_dot_lifter(): def test_batched_dot_lifter():
...@@ -690,6 +753,7 @@ def test_batched_dot_lifter(): ...@@ -690,6 +753,7 @@ def test_batched_dot_lifter():
z = tensor.batched_dot(x, y) z = tensor.batched_dot(x, y)
f = theano.function([x, y], z, mode=mode_with_gpu) f = theano.function([x, y], z, mode=mode_with_gpu)
f(x_val, y_val) f(x_val, y_val)
assert check_stack_trace(f, ops_to_check='all')
def test_crossentropycategorical1hot_lifter(): def test_crossentropycategorical1hot_lifter():
......
...@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots, ...@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots,
EquilibriumOptimizer, Apply, EquilibriumOptimizer, Apply,
ReplacementDidntRemovedError) ReplacementDidntRemovedError)
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gof.opt import inherit_stack_trace
from theano.printing import pprint, FunctionPrinter, debugprint from theano.printing import pprint, FunctionPrinter, debugprint
from theano.compile.mode import optdb from theano.compile.mode import optdb
import theano.scalar import theano.scalar
...@@ -1625,19 +1626,16 @@ def local_dot_to_dot22(node): ...@@ -1625,19 +1626,16 @@ def local_dot_to_dot22(node):
return return
if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']: if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2: with inherit_stack_trace(node.outputs):
# print "local_dot_to_dot22: MM" if x.ndim == 2 and y.ndim == 2:
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
if x.ndim == 2 and y.ndim == 1: if x.ndim == 2 and y.ndim == 1:
# print "local_dot_to_dot22: MV" return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)]
return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)] if x.ndim == 1 and y.ndim == 2:
if x.ndim == 1 and y.ndim == 2: return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)]
# print "local_dot_to_dot22: VM" if x.ndim == 1 and y.ndim == 1:
return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)] return [_dot22(x.dimshuffle('x', 0),
if x.ndim == 1 and y.ndim == 1: y.dimshuffle(0, 'x')).dimshuffle()]
# print "local_dot_to_dot22: VV"
return [_dot22(x.dimshuffle('x', 0),
y.dimshuffle(0, 'x')).dimshuffle()]
_logger.info('Not optimizing dot with inputs %s %s %s %s', _logger.info('Not optimizing dot with inputs %s %s %s %s',
x, y, x.type, y.type) x, y, x.type, y.type)
...@@ -1646,19 +1644,22 @@ def local_dot_to_dot22(node): ...@@ -1646,19 +1644,22 @@ def local_dot_to_dot22(node):
@local_optimizer([gemm_no_inplace], inplace=True) @local_optimizer([gemm_no_inplace], inplace=True)
def local_inplace_gemm(node): def local_inplace_gemm(node):
if node.op == gemm_no_inplace: if node.op == gemm_no_inplace:
return [gemm_inplace(*node.inputs)] with inherit_stack_trace(node.outputs):
return [gemm_inplace(*node.inputs)]
@local_optimizer([gemv_no_inplace], inplace=True) @local_optimizer([gemv_no_inplace], inplace=True)
def local_inplace_gemv(node): def local_inplace_gemv(node):
if node.op == gemv_no_inplace: if node.op == gemv_no_inplace:
return [gemv_inplace(*node.inputs)] with inherit_stack_trace(node.outputs):
return [gemv_inplace(*node.inputs)]
@local_optimizer([ger], inplace=True) @local_optimizer([ger], inplace=True)
def local_inplace_ger(node): def local_inplace_ger(node):
if node.op == ger: if node.op == ger:
return [ger_destructive(*node.inputs)] with inherit_stack_trace(node.outputs):
return [ger_destructive(*node.inputs)]
@local_optimizer([gemm_no_inplace]) @local_optimizer([gemm_no_inplace])
...@@ -1666,12 +1667,13 @@ def local_gemm_to_gemv(node): ...@@ -1666,12 +1667,13 @@ def local_gemm_to_gemv(node):
"""GEMM acting on row or column matrices -> GEMV.""" """GEMM acting on row or column matrices -> GEMV."""
if node.op == gemm_no_inplace: if node.op == gemm_no_inplace:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
if z.broadcastable == x.broadcastable == (True, False): with inherit_stack_trace(node.outputs):
r = gemv_no_inplace(z.dimshuffle(1), a, y.T, x.dimshuffle(1), b) if z.broadcastable == x.broadcastable == (True, False):
return [r.dimshuffle('x', 0)] r = gemv_no_inplace(z.dimshuffle(1), a, y.T, x.dimshuffle(1), b)
if z.broadcastable == y.broadcastable == (False, True): return [r.dimshuffle('x', 0)]
r = gemv_no_inplace(z.dimshuffle(0), a, x, y.dimshuffle(0), b) if z.broadcastable == y.broadcastable == (False, True):
return [r.dimshuffle(0, 'x')] r = gemv_no_inplace(z.dimshuffle(0), a, x, y.dimshuffle(0), b)
return [r.dimshuffle(0, 'x')]
@local_optimizer([gemm_no_inplace]) @local_optimizer([gemm_no_inplace])
...@@ -1680,26 +1682,27 @@ def local_gemm_to_ger(node): ...@@ -1680,26 +1682,27 @@ def local_gemm_to_ger(node):
if node.op == gemm_no_inplace: if node.op == gemm_no_inplace:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
if x.broadcastable[1] and y.broadcastable[0]: if x.broadcastable[1] and y.broadcastable[0]:
# x and y are both vectors so this might qualifies for a GER with inherit_stack_trace(node.outputs):
xv = x.dimshuffle(0) # x and y are both vectors so this might qualifies for a GER
yv = y.dimshuffle(1) xv = x.dimshuffle(0)
try: yv = y.dimshuffle(1)
bval = T.get_scalar_constant_value(b) try:
except T.NotScalarConstantError: bval = T.get_scalar_constant_value(b)
# b isn't a constant, GEMM is doing useful pre-scaling except T.NotScalarConstantError:
return # b isn't a constant, GEMM is doing useful pre-scaling
return
if bval == 1: # best case a natural GER
rval = ger(z, a, xv, yv) if bval == 1: # best case a natural GER
return [rval] rval = ger(z, a, xv, yv)
elif bval == 0: # GER on zeros_like should be faster than GEMM return [rval]
zeros = T.zeros([x.shape[0], y.shape[1]], x.dtype) elif bval == 0: # GER on zeros_like should be faster than GEMM
rval = ger(zeros, a, xv, yv) zeros = T.zeros([x.shape[0], y.shape[1]], x.dtype)
return [rval] rval = ger(zeros, a, xv, yv)
else: return [rval]
# if bval is another constant, then z is being usefully else:
# pre-scaled and GER isn't really the right tool for the job. # if bval is another constant, then z is being usefully
return # pre-scaled and GER isn't really the right tool for the job.
return
# TODO: delete this optimization when we have the proper dot->gemm->ger pipeline # TODO: delete this optimization when we have the proper dot->gemm->ger pipeline
...@@ -1708,37 +1711,38 @@ def local_gemm_to_ger(node): ...@@ -1708,37 +1711,38 @@ def local_gemm_to_ger(node):
def local_dot22_to_ger_or_gemv(node): def local_dot22_to_ger_or_gemv(node):
"""dot22 computing an outer-product -> GER.""" """dot22 computing an outer-product -> GER."""
if node.op == _dot22: if node.op == _dot22:
x, y = node.inputs with inherit_stack_trace(node.outputs):
xb = x.broadcastable x, y = node.inputs
yb = y.broadcastable xb = x.broadcastable
one = T.as_tensor_variable(np.asarray(1, dtype=x.dtype)) yb = y.broadcastable
zero = T.as_tensor_variable(np.asarray(0, dtype=x.dtype)) one = T.as_tensor_variable(np.asarray(1, dtype=x.dtype))
if xb[1] and yb[0]: zero = T.as_tensor_variable(np.asarray(0, dtype=x.dtype))
# x and y are both vectors so this might qualifies for a GER if xb[1] and yb[0]:
xv = x.dimshuffle(0) # x and y are both vectors so this might qualifies for a GER
yv = y.dimshuffle(1) xv = x.dimshuffle(0)
zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype) yv = y.dimshuffle(1)
rval = ger(zeros, one, xv, yv) zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype)
return [rval] rval = ger(zeros, one, xv, yv)
if xb[0] and yb[1]: return [rval]
# x and y are both vectors so this qualifies for a sdot / ddot if xb[0] and yb[1]:
# TODO: Theano doesn't have a sdot, but gemv is better than _dot22 # x and y are both vectors so this qualifies for a sdot / ddot
xv = x.dimshuffle(1) # TODO: Theano doesn't have a sdot, but gemv is better than _dot22
zeros = T.AllocEmpty(x.dtype)(1) xv = x.dimshuffle(1)
rval = gemv_no_inplace(zeros, one, y.T, xv, zero) zeros = T.AllocEmpty(x.dtype)(1)
return [rval.dimshuffle('x', 0)] rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
if xb[0] and not yb[0] and not yb[1]: return [rval.dimshuffle('x', 0)]
# x is vector, y is matrix so try gemv if xb[0] and not yb[0] and not yb[1]:
xv = x.dimshuffle(1) # x is vector, y is matrix so try gemv
zeros = T.AllocEmpty(x.dtype)(y.shape[1]) xv = x.dimshuffle(1)
rval = gemv_no_inplace(zeros, one, y.T, xv, zero) zeros = T.AllocEmpty(x.dtype)(y.shape[1])
return [rval.dimshuffle('x', 0)] rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
if not xb[0] and not xb[1] and yb[1]: return [rval.dimshuffle('x', 0)]
# x is matrix, y is vector, try gemv if not xb[0] and not xb[1] and yb[1]:
yv = y.dimshuffle(0) # x is matrix, y is vector, try gemv
zeros = T.AllocEmpty(x.dtype)(x.shape[0]) yv = y.dimshuffle(0)
rval = gemv_no_inplace(zeros, one, x, yv, zero) zeros = T.AllocEmpty(x.dtype)(x.shape[0])
return [rval.dimshuffle(0, 'x')] rval = gemv_no_inplace(zeros, one, x, yv, zero)
return [rval.dimshuffle(0, 'x')]
################################# #################################
......
...@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor ...@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor
from theano.tensor.opt import register_uncanonicalize from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal from theano import scalar as scal
from theano.gof.opt import copy_stack_trace
_logger = logging.getLogger('theano.tensor.opt') _logger = logging.getLogger('theano.tensor.opt')
...@@ -57,10 +58,13 @@ def local_max_and_argmax(node): ...@@ -57,10 +58,13 @@ def local_max_and_argmax(node):
axis = node.op.get_params(node) axis = node.op.get_params(node)
if len(node.outputs[1].clients) == 0: if len(node.outputs[1].clients) == 0:
new = CAReduce(scal.maximum, axis)(node.inputs[0]) new = CAReduce(scal.maximum, axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [new, None] return [new, None]
if len(node.outputs[0].clients) == 0: if len(node.outputs[0].clients) == 0:
return [None, T.Argmax(axis)(node.inputs[0])] new = T.Argmax(axis)(node.inputs[0])
copy_stack_trace(node.outputs[0], new)
return [None, new]
@register_uncanonicalize @register_uncanonicalize
...@@ -84,8 +88,8 @@ def local_max_to_min(node): ...@@ -84,8 +88,8 @@ def local_max_to_min(node):
max.owner.op.scalar_op == scal.maximum): max.owner.op.scalar_op == scal.maximum):
neg = max.owner.inputs[0] neg = max.owner.inputs[0]
if neg.owner and neg.owner.op == T.neg: if neg.owner and neg.owner.op == T.neg:
return [CAReduce(scal.minimum, new = CAReduce(scal.minimum, max.owner.op.axis)(neg.owner.inputs[0])
max.owner.op.axis)(neg.owner.inputs[0])] return [copy_stack_trace(node.outputs[0], new)]
return False return False
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论