Commit 51b39ada authored by lamblin

Merge pull request #1441 from nouiz/fix_opt_crash

Fix opt crash of local_gpu_lazy_ifelse.
......@@ -1636,7 +1636,8 @@ class _Linker(gof.link.LocalLinker):
if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined()
e = FunctionGraph(*graph.clone(node.inputs, node.outputs))
e.toposort = lambda: e.apply_nodes # WARNING: STOCHASTIC ORDER
# The toposort isn't a stochastic order as it contain only one node.
e.toposort = lambda: list(e.apply_nodes)
# Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs,
......@@ -1679,6 +1680,8 @@ class _Linker(gof.link.LocalLinker):
storage_map,
compute_map,
no_recycling)
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
# Right now there is no op that when called check if
# its ouputs are computed and don't recompute itself.
......
......@@ -1498,6 +1498,9 @@ class OpWiseCLinker(link.LocalLinker):
storage_map,
compute_map,
no_recycling)]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally:
node.op._op_use_c_code = old_value
......
......@@ -112,29 +112,37 @@ def raise_with_op(op, thunk=None, exc_info=None):
if raise_with_op.print_thunk_trace:
log_thunk_trace(exc_value)
if theano.config.exception_verbosity == 'high':
f = StringIO.StringIO()
theano.printing.debugprint(op, file=f, stop_on_name=True)
if thunk is not None:
detailed_err_msg = "\nApply node that caused the error: " + str(op)
if thunk is not None:
if hasattr(thunk, 'inputs'):
shapes = [getattr(ipt[0], 'shape', 'No shapes')
for ipt in thunk.inputs]
strides = [getattr(ipt[0], 'strides', 'No strides')
for ipt in thunk.inputs]
detailed_err_msg = ("\nInputs shapes: %s \n" % shapes +
"Inputs strides: %s \n" % strides +
"Debugprint of the apply node: \n" +
f.getvalue())
else:
detailed_err_msg = "\nDebugprint of the apply node: \n" + f.getvalue()
shapes = "The thunk don't have an inputs attributes."
strides = "So we can't access the storage inputs value"
types = [getattr(ipt, 'type', 'No type')
for ipt in op.inputs]
detailed_err_msg += ("\nInputs shapes: %s" % shapes +
"\nInputs strides: %s" % strides +
"\nInputs types: %s" % types)
else:
detailed_err_msg += ("\nUse another linker then the c linker to"
" have the inputs shapes and strides printed.")
if theano.config.exception_verbosity == 'high':
f = StringIO.StringIO()
theano.printing.debugprint(op, file=f, stop_on_name=True,
print_type=True)
detailed_err_msg += "\nDebugprint of the apply node: \n" + f.getvalue()
else:
detailed_err_msg = ("\nUse the Theano flag"
" 'exception_verbosity=high' for more"
" information on the inputs of this apply"
" node.")
exc_value = exc_type(str(exc_value) +
"\nApply node that caused the error: " + str(op) +
detailed_err_msg)
detailed_err_msg += ("\nUse the Theano flag 'exception_verbosity=high'"
" for a debugprint of this apply node.")
exc_value = exc_type(str(exc_value) + detailed_err_msg)
raise exc_type, exc_value, exc_trace
raise_with_op.print_thunk_trace = False
......@@ -523,6 +531,8 @@ class PerformLinker(LocalLinker):
storage_map,
compute_map,
no_recycling)]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally:
node.op._op_use_c_code = old_value
......
......@@ -431,6 +431,8 @@ class PureOp(object):
# compute output value once with test inputs to validate graph
thunk = node.op.make_thunk(node, storage_map, compute_map,
no_recycling=[])
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
required = thunk()
assert not required # We provided all inputs
......
......@@ -843,6 +843,9 @@ class VM_Linker(link.LocalLinker):
compute_map,
no_recycling)
for node in order]
for node, thunk in zip(order, thunks):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
computed, last_user = link.gc_helper(order)
if self.allow_gc:
......
......@@ -652,7 +652,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self):
    """Return the version tag used to cache the compiled C code.

    Bump this tuple whenever any of the support_code_files (or the
    generated C code) changes, so stale compiled modules are not reused.
    """
    # NOTE: the scraped diff contained both (0, 19) and (0, 20); the
    # first return made the second unreachable.  (0, 20) is the
    # post-change value, matching the new c_code of this commit.
    return (0, 20)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -704,6 +704,7 @@ class GpuConv(GpuOp):
return NULL;
}
// TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
......@@ -711,6 +712,10 @@ class GpuConv(GpuOp):
%(max_threads_dim0)s);
Py_XDECREF(%(out)s);
%(out)s = out2;
if (%(out)s==NULL){
%(fail)s
}
""" % sub
......
......@@ -1811,9 +1811,10 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
{
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s: %s.\n",
"k4",
cudaGetErrorString(err));
"Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
"k5 with loop over k4",
cudaGetErrorString(err),
(long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
Py_XDECREF(new_other);
return -1;
}
......@@ -1831,14 +1832,17 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
);
while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
n_blocks.y /= 2;
while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
n_blocks.z /= 2;
// GTX285(compute capabilities 1.3) don't support n_blocks.z > 1
// (compute capabilities 2.0) support 65535 for n_blocks.z
//while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
// n_blocks.z /= 2;
n_blocks.z = 1;
dim3 n_threads(
std::min(
CudaNdarray_HOST_DIMS(self)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK)
//TODO: DON"T YOU NEED OT PUT DIMS[4] in here???
//TODO: DON"T YOU NEED OT PUT DIMS[5] in here???
//TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
//TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
);
k6<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0],
......@@ -1867,9 +1871,11 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
{
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s: %s.\n",
"k4",
cudaGetErrorString(err));
"Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
"k6",
cudaGetErrorString(err),
(long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
(long) n_threads.x);
Py_XDECREF(new_other);
return -1;
}
......
......@@ -403,7 +403,12 @@ def local_gpu_lazy_ifelse(node):
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, theano.ifelse.IfElse) and
not host_input.owner.op.gpu):
not host_input.owner.op.gpu and
# If there is more then 1 outputs, we can't replace it
# here with a local optimizer as we replace the
# GpuFromHost node and the other output of the if won't be
# replaced.
host_input.owner.op.n_outs == 1):
gpu_ifelse = theano.ifelse.IfElse(host_input.owner.op.n_outs,
gpu=True)
......
import sys, time, unittest
import sys
import numpy
# Skip test if cuda_ndarray is not available.
......@@ -7,7 +7,7 @@ from nose.plugins.skip import SkipTest
import theano
from theano.compile.pfunc import pfunc
from theano import config, tensor
import theano.sandbox.linalg.tests
import theano.sandbox.linalg.tests.test_linalg
from theano.tests import unittest_tools as utt
......@@ -48,28 +48,29 @@ def test_int_pow():
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu)
f = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
#theano.printing.debugprint(f)
def test_gpualloc():
    '''
    This test tries to catch the scenario when, due to infer_shape,
    the input of the alloc changes from a tensor scalar to a constant
    1. In this case the originally constructed broadcastable pattern
    will have a False for that dimension, but the new broadcastable
    pattern that will be inserted by gpualloc will have a True since it
    knows the dimension is 1 and therefore broadcastable.
    '''
    # The diff residue duplicated old/new lines; this is the post-change
    # version: compile with mode_with_gpu and use isinstance (the old
    # line used an undefined name `ininstance`).
    x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
    m = x.dimshuffle(['x', 0])
    v = tensor.alloc(1., *m.shape)
    f = theano.function([], v + x, mode=mode_with_gpu)
    l = f.maker.fgraph.toposort()
    # At least one node of the compiled graph must be a GpuAlloc.
    assert numpy.any([isinstance(node.op, cuda.GpuAlloc) for node in l])
def test_alloc_memset_0():
......
......@@ -159,6 +159,49 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
assert numpy.all(outs_0[2] == 1.)
assert numpy.all(outs_0[3] == 1.)
def test_multiple_out_crash(self):
    """Regression test: compiling a multi-output ifelse with updates.

    This test failed up to commit 2faeb62c38.
    """
    rand = numpy.random.random
    # Four shared variables of differing ranks/shapes.
    p = [self.shared(numpy.asarray(rand(shp), dtype=self.dtype))
         for shp in ([4, 8], 8, [8, 3], 3)]
    # In the original code these variables were the result of applying
    # scan.
    ften0 = tensor.tensor3('ft0', dtype=self.dtype)
    fmat1 = tensor.matrix('fm1', dtype=self.dtype)
    ften2 = tensor.tensor3('ft2', dtype=self.dtype)
    fmat3 = tensor.matrix('fm3', dtype=self.dtype)
    # Keep only the last iteration of each sequence.
    fsub = [v[-1] for v in (ften0, fmat1, ften2, fmat3)]
    acc = theano.tensor.constant(1, 'int8') >= 0
    new_positions = theano.ifelse.ifelse(acc, fsub, p)
    new_updates = [(p[0], new_positions[0])]
    f = theano.function([ften0, fmat1, ften2, fmat3], [],
                        updates=new_updates, mode=self.mode)
    self.assertFunctionContains1(f, self.get_ifelse(4))
    # Drive the compiled function with random inputs of matching shapes.
    inputs = [numpy.asarray(rand(shp), dtype=self.dtype)
              for shp in ([19, 4, 8], [19, 8], [19, 8, 3], [19, 3])]
    f(*inputs)
def test_dtype_mismatch(self):
rng = numpy.random.RandomState(utt.fetch_seed())
data = rng.rand(5).astype(self.dtype)
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment