Commit 7320e1b1 authored by abergeron

Merge pull request #3288 from abergeron/nouiz_mixed

Nouiz mixed
......@@ -9,6 +9,9 @@ All rights reserved.
Contains code from NumPy, Copyright (c) 2005-2011, NumPy Developers.
All rights reserved.
Contain CnMeM under the same license with this copyright:
Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
......
......@@ -21,6 +21,8 @@ Montreal).
News
====
* We added support for :ref:`CuDNN v3 <libdoc_cuda_dnn>`.
* We added support for :attr:`CNMeM <config.lib.cnmem>` to speed up
the GPU memory allocation.
......
......@@ -308,6 +308,18 @@ to your ``Theano`` folder and execute the following command:
You should update frequently, bugs are fixed on a very regular basis.
Specific git commit
~~~~~~~~~~~~~~~~~~~
You can install a specific git commit by using the bleeding edge
instruction and adding @COMMIT_ID to the pip command like:
.. code-block:: bash
pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git@07e9332a0932e90c47ed2a70fc3c7f8a55d2aa23
.. _testing_installation:
Testing your installation
......
......@@ -705,6 +705,25 @@ import theano and print the config variable, as in:
Generate a warning when the destroy_map or view_map tell that an op work
inplace, but the op did not reuse the input for its output.
.. attribute:: config.NanGuardMode.nan_is_error
Bool value, default: True
Controls whether NanGuardMode generates an error when it sees a nan.
.. attribute:: config.NanGuardMode.inf_is_error
Bool value, default: True
Controls whether NanGuardMode generates an error when it sees an inf.
.. attribute:: config.NanGuardMode.big_is_error
Bool value, default: True
Controls whether NanGuardMode generates an error when it sees a
big value (>1e10).
.. attribute:: numpy
This section contains different attributes for configuring numpy's
......
......@@ -500,8 +500,8 @@ It will be used repeatedly.
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
x = T.dmatrix("x")
y = T.dvector("y")
w = theano.shared(rng.randn(feats), name="w")
b = theano.shared(0., name="b")
print "Initial model:"
......
......@@ -171,11 +171,6 @@ class OpFromGraph(gof.Op):
return ret
def grad(self, inputs, output_grads):
# OpFromGraph doesn't implement a connection_pattern, so for
# now we regard all inputs and outputs as connected. This will
# compute the right numerical value for the gradients but
# could fail to raise the disconnected inputs error in some
# cases.
if hasattr(self, "grad_ops"):
grad_ops = self.grad_ops
else:
......
......@@ -387,12 +387,17 @@ def get_mode(orig_string):
default_mode_class):
return instanciated_default_mode
if string in ['Mode', 'ProfileMode', 'DebugMode']:
if string in ['Mode', 'ProfileMode', 'DebugMode', 'NanGuardMode']:
if string == 'DebugMode':
# need to import later to break circular dependency.
from .debugmode import DebugMode
# DebugMode use its own linker.
ret = DebugMode(optimizer=config.optimizer)
elif string == 'NanGuardMode':
# need to import later to break circular dependency.
from .nanguardmode import NanGuardMode
# DebugMode use its own linker.
ret = NanGuardMode(True, True, True, optimizer=config.optimizer)
else:
# This might be required if the string is 'ProfileMode'
from .profilemode import ProfileMode # noqa
......
import logging
import collections
import logging
import numpy as np
import theano
from theano.configparser import config, AddConfigVar, BoolParam
import theano.tensor as T
import theano.sandbox.cuda as cuda
from theano.compile import Mode
# Process-wide defaults for NanGuardMode's three checks.  Each flag is a
# BoolParam defaulting to True and can be overridden per NanGuardMode
# instance (the __init__ below falls back to these when passed None).
# NOTE(review): in_c_key=False presumably keeps the flag out of the C
# compilation cache key since it does not affect generated code —
# confirm against theano.configparser.
AddConfigVar('NanGuardMode.nan_is_error',
"Default value for nan_is_error",
BoolParam(True),
in_c_key=False)
AddConfigVar('NanGuardMode.inf_is_error',
"Default value for inf_is_error",
BoolParam(True),
in_c_key=False)
AddConfigVar('NanGuardMode.big_is_error',
"Default value for big_is_error",
BoolParam(True),
in_c_key=False)
# Module-level logger for this mode's diagnostics.
logger = logging.getLogger("theano.compile.nanguardmode")
......@@ -110,26 +128,60 @@ class NanGuardMode(Mode):
big_is_error : bool
If True, raise an error when a value greater than 1e10 is encountered.
Note
----
We ignore the linker parameter
"""
# We currently loose the 3 first params frequently, when calling
# mode.including() and variant.
def __init__(self, nan_is_error=None, inf_is_error=None, big_is_error=None,
optimizer=None, linker=None):
self.provided_optimizer = optimizer
cuda_compile_failed = False
if nan_is_error is None:
nan_is_error = config.NanGuardMode.nan_is_error
if inf_is_error is None:
inf_is_error = config.NanGuardMode.inf_is_error
if big_is_error is None:
big_is_error = config.NanGuardMode.big_is_error
assert nan_is_error or inf_is_error or big_is_error
def __init__(self, nan_is_error, inf_is_error, big_is_error=True):
if cuda.cuda_available:
self.guard_input = cuda.fvector('nan_guard')
if nan_is_error or inf_is_error:
self.gpumin = theano.function(
[self.guard_input], T.min(self.guard_input),
mode='FAST_RUN'
)
if inf_is_error:
self.gpumax = theano.function(
[self.guard_input], T.max(self.guard_input),
mode='FAST_RUN'
)
if big_is_error:
self.gpuabsmax = theano.function(
[self.guard_input], T.max(T.abs_(self.guard_input)),
mode='FAST_RUN'
)
try:
self.gpumin = theano.function(
[self.guard_input], T.min(self.guard_input),
mode='FAST_RUN'
)
except RuntimeError:
# This can happen if cuda is available, but the
# device is in exclusive mode and used by another
# process.
cuda_compile_failed = True
if inf_is_error and not cuda_compile_failed:
try:
self.gpumax = theano.function(
[self.guard_input], T.max(self.guard_input),
mode='FAST_RUN'
)
except RuntimeError:
# This can happen if cuda is available, but the
# device is in exclusive mode and used by another
# process.
cuda_compile_failed = True
if big_is_error and not cuda_compile_failed:
try:
self.gpuabsmax = theano.function(
[self.guard_input], T.max(T.abs_(self.guard_input)),
mode='FAST_RUN'
)
except RuntimeError:
# This can happen if cuda is available, but the
# device is in exclusive mode and used by another
# process.
cuda_compile_failed = True
def do_check_on(var, nd, f, is_input):
"""
......@@ -154,7 +206,10 @@ class NanGuardMode(Mode):
if nan_is_error:
err = False
if cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
err = np.isnan(self.gpumin(var.reshape(var.size)))
if not isinstance(nd.op,
# It store ints in float container
theano.sandbox.rng_mrg.GPU_mrg_uniform):
err = np.isnan(self.gpumin(var.reshape(var.size)))
else:
err = contains_nan(var)
if err:
......@@ -227,4 +282,4 @@ class NanGuardMode(Mode):
wrap_linker = theano.gof.WrapLinker([theano.gof.OpWiseCLinker()],
nan_check)
super(NanGuardMode, self).__init__(wrap_linker,
optimizer=theano.config.optimizer)
optimizer=self.provided_optimizer)
......@@ -150,6 +150,7 @@ AddConfigVar(
'mode',
"Default compilation mode",
EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
'NanGuardMode',
'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
in_c_key=False)
......
......@@ -290,6 +290,7 @@ class SequenceDB(DB):
# Register `obj` under `name` in the base DB, additionally recording a
# numeric `position` for it (presumably consumed by query() to order
# results — confirm against the rest of SequenceDB).
def register(self, name, obj, position, *tags):
super(SequenceDB, self).register(name, obj, *tags)
# Position must be numeric so entries can be compared/sorted.
assert isinstance(position, (int, float))
self.__position__[name] = position
def query(self, *tags, **kwtags):
......
......@@ -6,7 +6,6 @@ from theano.gof.opt import * # noqa
from theano.gof.fg import FunctionGraph as Env
from theano.gof.toolbox import * # noqa
from theano.tensor.opt import Assert
from theano import tensor as T
......
......@@ -49,7 +49,12 @@ if __name__ == '__main__':
else:
costlySpeed = costlyTimeOpenmp / costlyTime
costlySpeedstring = "slowdown"
print("Timed with vector of %d elements" % options.N)
print("Fast op time without openmp %fs with openmp %fs %s %2.2f" % (
cheapTime, cheapTimeOpenmp, cheapSpeedstring, cheapSpeed))
print("Fast op time without openmp %fs with openmp %fs %s %2.2f" % (cheapTime, cheapTimeOpenmp, cheapSpeedstring, cheapSpeed))
print("Fast op time without openmp %fs with openmp %fs %s %2.2f" % (
cheapTime, cheapTimeOpenmp, cheapSpeedstring, cheapSpeed))
print("Slow op time without openmp %fs with openmp %fs %s %2.2f" % (costlyTime, costlyTimeOpenmp, costlySpeedstring, costlySpeed))
print("Slow op time without openmp %fs with openmp %fs %s %2.2f" % (
costlyTime, costlyTimeOpenmp, costlySpeedstring, costlySpeed))
......@@ -285,6 +285,11 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op):
self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern
# As we have a dict in props, we need to implement __hash__
def __hash__(self):
    """Hash the op by its type, scalar_op and inplace pattern.

    The inplace pattern is a dict (unhashable), so it is folded through
    ``hash_from_dict``.  The three components are combined into a single
    tuple before hashing: the previous code passed three separate
    arguments to ``hash()``, which raises ``TypeError`` because
    ``hash()`` takes exactly one argument.
    """
    return hash((type(self),
                 self.scalar_op,
                 hash_from_dict(self.inplace_pattern)))
def __str__(self):
if self.name is None:
if self.inplace_pattern:
......
......@@ -66,7 +66,7 @@ class NaiveAlgo(object):
def cache_version(self):
ver = self.scalar_op.c_code_cache_version()
if ver:
return (17, self.verbose, self.sync, ver)
return (18, self.verbose, self.sync, ver)
else:
return ver
......@@ -142,6 +142,8 @@ class NaiveAlgo(object):
# perform the scalar operation on the input and output references
# TODO: What if the scalar_op needs support_code??
for ipos, i in enumerate(node.outputs):
print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype=input.type.dtype).make_variable()
......@@ -150,9 +152,11 @@ class NaiveAlgo(object):
for output in node.outputs]),
nodename + '_scalar_',
get_str_list_logical_scalar(node),
['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
sub=dict(fail='return;')) # TODO: set a failure code somehow!!!
print(" ", task_code, file=sio)
for ipos, _ in enumerate(node.outputs):
print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
print(" }", file=sio)
#indent = " "*(4*d+7)
......@@ -477,6 +481,8 @@ class NaiveAlgo(object):
print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
# perform the scalar operation on the input and output references
# TODO: What if the scalar_op needs support_code??
for ipos, i in enumerate(node.outputs):
print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype=input.type.dtype).make_variable()
......@@ -486,9 +492,11 @@ class NaiveAlgo(object):
, nodename + '_scalar_'
#, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
, get_str_list_logical_scalar(node, data_str='i%i_data[i]')
, ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)]
, ['o%i_i'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) # TODO: set a failure code somehow!!!
print(" ", task_code, file=sio)
for ipos, _ in enumerate(node.outputs):
print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
print(" }", file=sio)
print("}", file=sio)
......
......@@ -279,7 +279,8 @@ def local_gpu_elemwise_0(node):
# TODO: change this when fusion makes Elemwise with
# multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs))
for i in node.inputs),
return_list=True)
# case 2 - it is still ok if some inputs were upcast to float32
elif all([i.type.dtype in upcastable
for i in node.inputs]):
......@@ -292,18 +293,19 @@ def local_gpu_elemwise_0(node):
new_inputs = [gpu_from_host(tensor.cast(i, 'float32'))
for i in node.inputs]
gpu_elemwise = new_op(*new_inputs)
gpu_elemwise = new_op(*new_inputs, return_list=True)
else:
return False
else:
return False
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise[0].owner)
if not gpu_elemwise:
return False
if max_inputs_to_GpuElemwise(node) < len(gpu_elemwise.inputs):
if (max_inputs_to_GpuElemwise(node) <
len(gpu_elemwise.inputs)):
return False
return [host_from_gpu(gpu_elemwise.outputs[0])]
return [host_from_gpu(out) for out in gpu_elemwise.outputs]
@register_opt()
......@@ -785,7 +787,7 @@ def local_gpu_careduce(node):
x, = node.inputs
# Otherwise, is some corner case, we will try to move it
# to the GPU later and this cause not wanted user warning.
if x.dtype != 'float32':
if x.dtype != 'float32' or node.outputs[0].dtype != "float32":
return
replace = False
if x.owner and isinstance(x.owner.op, HostFromGpu):
......@@ -1114,6 +1116,13 @@ def local_gpu_incsubtensor(node):
incsubt = host_output.owner.op
x, y = host_output.owner.inputs[0:2]
coords = host_output.owner.inputs[2:]
if x.dtype != "float32":
return
if y.dtype != "float32":
# The IncSubtensor upcast to float32 y, so we do it
# explicitly to move it to the GPU.
y = y.astype('float32')
return [GpuIncSubtensor(
incsubt.idx_list,
inplace=incsubt.inplace,
......@@ -1124,7 +1133,7 @@ def local_gpu_incsubtensor(node):
# Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast
# y to put it on GPU
if type(node.op) == tensor.IncSubtensor and \
elif type(node.op) == tensor.IncSubtensor and \
node.inputs[0].dtype == "float32":
x, y = node.inputs[0:2]
assert isinstance(x.type, tensor.TensorType)
......
......@@ -599,11 +599,11 @@ def test_local_gpu_elemwise_0():
# Due to optimization order, this composite is created when all
# the op are on the gpu.
f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
f(a_v, b_v, c_v)
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
# Now test with the composite already on the cpu before we move it
# to the gpu
......@@ -612,11 +612,46 @@ def test_local_gpu_elemwise_0():
c_s = theano.scalar.float32()
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
out_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
f(a_v, b_v, c_v)
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
# Test multiple output
a_s = theano.scalar.float32()
a = tensor.fmatrix()
from theano.scalar.basic import identity
out_s = theano.scalar.Composite([a_s, b_s, c_s],
[identity(a_s), identity(c_s), identity(b_s)])
outs_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v)
utt.assert_allclose(out[1], c_v)
utt.assert_allclose(out[2], b_v)
# Test multiple output
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
outs_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v + b_v)
utt.assert_allclose(out[1], a_v * c_v)
# Test non-contiguous input
c = cuda.shared_constructor(c_v)
f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
mode=mode_with_gpu)
out = f(a_v, b_v)
utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def test_elemwise_fusion():
......
......@@ -72,6 +72,8 @@ class GpuElemwise(HideC, Elemwise):
res = Elemwise.make_node(self, *inputs)
outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
dtype=o.type.dtype)() for o in res.outputs]
if len(outputs) > 1:
raise NotImplementedError()
inputs = [as_gpuarray_variable(i) for i in inputs]
node = Apply(self, inputs, outputs)
......
......@@ -270,7 +270,8 @@ def local_gpu_elemwise(node):
name = op.name
if name:
name = 'Gpu' + name
if len(node.outputs) > 1:
return
res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern),
nfunc_spec=op.nfunc_spec)
......
......@@ -255,3 +255,73 @@ def test_local_gpu_subtensor():
assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert any([isinstance(node.op, GpuElemwise) for node in topo])
def test_local_gpu_elemwise():
    """
    Test local_gpu_elemwise when there is a dtype upcastable to float32.

    Checks that an elemwise graph mixing int8 and float32 inputs is moved
    to the GPU as a single GpuElemwise (with the int8 input upcast), both
    when the Composite is built on the GPU and when it already exists on
    the CPU.  The multi-output cases at the bottom are skipped via the
    early ``return`` because multi-output GpuElemwise is not implemented
    yet in this backend (see GpuElemwise.make_node).
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    return  # Not yet implemented

    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s), identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple output
    # BUGFIX: the second output was `a_s * b_s`, but the assertions
    # below (and the parallel CUDA-backend test) check a_v * c_v.
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = cuda.shared_constructor(numpy.asarray(c_v, dtype='float32'))
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
......@@ -724,7 +724,7 @@ def same_out_float_only(type):
class transfer_type(gof.utils.object2):
def __init__(self, *transfer):
assert all(type(x) == int for x in transfer)
assert all(type(x) in [int, str] or x is None for x in transfer)
self.transfer = transfer
def __str__(self):
......@@ -736,6 +736,8 @@ class transfer_type(gof.utils.object2):
for i in self.transfer:
if i is None:
retval += [upcast]
elif isinstance(i, str):
retval += [i]
else:
retval += [types[i]]
return retval
......@@ -3410,7 +3412,10 @@ class Composite(ScalarOp):
return lambda inputs: r.data
node = r.owner
producers = [compose_impl(input) for input in node.inputs]
return lambda inputs: node.op.impl(*[p(inputs) for p in producers])
def f(inputs):
return node.op.impl(*[p(inputs) for p in producers])
return f
self._impls = [compose_impl(r) for r in self.fgraph.outputs]
def init_name(self):
......@@ -3467,6 +3472,8 @@ class Composite(ScalarOp):
# that will flatten Composite. We don't need to do this
# recusively, as the way the fusion optimizer work, we have
# only 1 new Composite each time at the output.
for i in inputs:
assert i not in outputs # This isn't supported, use identity
if len(outputs) > 1 or not any([isinstance(var.owner.op, Composite)
for var in outputs]):
# No inner Composite
......@@ -3538,8 +3545,11 @@ class Composite(ScalarOp):
def impl(self, *inputs):
output_storage = [[None] for i in xrange(self.nout)]
self.perform(None, inputs, output_storage)
return utils.to_return_values([storage[0] for storage in
output_storage])
ret = utils.to_return_values([storage[0] for storage in
output_storage])
if self.nout > 1:
ret = tuple(ret)
return ret
def grad(self, inputs, output_grads):
raise NotImplementedError("grad is not implemented for Composite")
......
......@@ -296,6 +296,7 @@ def inplace_elemwise_optimizer_op(OP):
# gpuarray GpuElemwise inherit from Elemwise
if not type(op) == OP:
continue
baseline = op.inplace_pattern
protected_inputs = [
f.protected for f in node.fgraph._features if
......@@ -331,8 +332,8 @@ def inplace_elemwise_optimizer_op(OP):
if hasattr(op.scalar_op, "make_new_inplace"):
new_scal = op.scalar_op.make_new_inplace(
scalar.transfer_type(
*[inplace_pattern.get(i, None)
for i in xrange(len(node.outputs))]))
*[inplace_pattern.get(i, o.dtype)
for i, o in enumerate(node.outputs)]))
else:
new_scal = op.scalar_op.__class__(
scalar.transfer_type(
......@@ -1507,7 +1508,11 @@ def local_subtensor_make_vector(node):
# Python 2.4 wants to index only with Python integers
v = int(v)
# We don't need to copy over any stack traces here
return [x.owner.inputs[v]]
try:
ret = [x.owner.inputs[v]]
except IndexError:
raise NotScalarConstantError("Bad user graph!")
return ret
except NotScalarConstantError:
pass
elif idx.ndim == 1 and isinstance(idx, T.Constant):
......@@ -5867,15 +5872,17 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 32,
tmp_s_input.append(tmp)
tmp_input.append(ii)
tmp_scalar.append(tmp_s_input[-1])
s_op = i.owner.op.scalar_op(*tmp_s_input)
s_op = i.owner.op.scalar_op(*tmp_s_input,
return_list=True)
# if the scalar_op don't have a c implementation,
# we skip its fusion to allow the fusion of the
# other ops.
i.owner.op.scalar_op.c_code(s_op.owner,
i.owner.op.scalar_op.c_code(s_op[0].owner,
"test_presence_of_c_code",
["x" for x in i.owner.inputs],
"z", {})
["z" for z in i.owner.outputs],
{})
except MethodNotDefined:
catch = True
except NotImplementedError:
......@@ -5906,7 +5913,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 32,
new_nb_input = new_nb_input_
inputs.extend(tmp_input)
s_inputs.extend(tmp_scalar)
s_g.append(s_op)
s_g.extend(s_op)
else:
# We must support the case where the same variable appear many
# time in the inputs
......@@ -5934,25 +5941,26 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 32,
fusion optimization. We skip this optimization. You can ignore this message,
your code will run correctly, but may be slower.""")
s_new_out = node.op.scalar_op(*s_g)
s_new_out = node.op.scalar_op(*s_g, return_list=True)
try:
s_new_out.owner.op.c_code(s_new_out.owner,
"test_presence_of_c_code",
["x" for x in s_g],
"z", {})
s_new_out[0].owner.op.c_code(s_new_out[0].owner,
"test_presence_of_c_code",
["x" for x in s_g],
["z" for x in s_new_out], {})
except MethodNotDefined:
_logger.info(("%s does not implement the c_code function."
" As well as being potentially slow, this disables "
"loop fusion of this op.") % str(s_new_out.owner.op))
"loop fusion of this op.") % str(
s_new_out[0].owner.op))
return False
except NotImplementedError:
_logger.info(("%s does not implement the c_code function. As well"
" as being potentially slow, this disables loop"
" fusion of this op.") % str(s_new_out.owner.op))
" fusion of this op.") % str(s_new_out[0].owner.op))
return False
# create the composite op.
C = scalar.Composite(s_inputs, [s_new_out])
C = scalar.Composite(s_inputs, s_new_out)
# create the new node.
# Do not call make_node to have test_value
......
import sys
import numpy
import six.moves.cPickle as pickle
from six.moves import xrange
......@@ -120,4 +119,4 @@ def test_merge_opt_runtime():
dt = time.time() - t
# it should never take longer than 5 seconds to compile this graph
assert dt < 5.0
assert dt < 5.0, dt
......@@ -205,18 +205,6 @@ whitelist_flake8 = [
"sparse/sandbox/sp.py",
"gof/unify.py",
"gof/__init__.py",
"gof/tests/test_cmodule.py",
"gof/tests/test_destroyhandler.py",
"gof/tests/test_opt.py",
"gof/tests/test_lazy.py",
"gof/tests/test_toolbox.py",
"gof/tests/test_link.py",
"gof/tests/test_fg.py",
"gof/tests/test_sched.py",
"gof/tests/test_graph_opt_caching.py",
"gof/tests/test_graph.py",
"gof/tests/test_cc.py",
"gof/tests/test_compute_test_value.py",
"gof/sandbox/equilibrium.py",
]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论