提交 51b39ada authored 作者: lamblin's avatar lamblin

Merge pull request #1441 from nouiz/fix_opt_crash

Fix opt crash of local_gpu_lazy_ifelse.
...@@ -1636,7 +1636,8 @@ class _Linker(gof.link.LocalLinker): ...@@ -1636,7 +1636,8 @@ class _Linker(gof.link.LocalLinker):
if not isinstance(node.op, gof.op.Op): if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined() raise utils.MethodNotDefined()
e = FunctionGraph(*graph.clone(node.inputs, node.outputs)) e = FunctionGraph(*graph.clone(node.inputs, node.outputs))
e.toposort = lambda: e.apply_nodes # WARNING: STOCHASTIC ORDER # The toposort isn't a stochastic order as it contain only one node.
e.toposort = lambda: list(e.apply_nodes)
# Specifically... e.nodes is a set, but of only 1 element # Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs, cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs,
...@@ -1679,6 +1680,8 @@ class _Linker(gof.link.LocalLinker): ...@@ -1679,6 +1680,8 @@ class _Linker(gof.link.LocalLinker):
storage_map, storage_map,
compute_map, compute_map,
no_recycling) no_recycling)
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
# Right now there is no op that when called check if # Right now there is no op that when called check if
# its ouputs are computed and don't recompute itself. # its ouputs are computed and don't recompute itself.
......
...@@ -1498,6 +1498,9 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -1498,6 +1498,9 @@ class OpWiseCLinker(link.LocalLinker):
storage_map, storage_map,
compute_map, compute_map,
no_recycling)] no_recycling)]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally: finally:
node.op._op_use_c_code = old_value node.op._op_use_c_code = old_value
......
...@@ -112,29 +112,37 @@ def raise_with_op(op, thunk=None, exc_info=None): ...@@ -112,29 +112,37 @@ def raise_with_op(op, thunk=None, exc_info=None):
if raise_with_op.print_thunk_trace: if raise_with_op.print_thunk_trace:
log_thunk_trace(exc_value) log_thunk_trace(exc_value)
if theano.config.exception_verbosity == 'high': detailed_err_msg = "\nApply node that caused the error: " + str(op)
f = StringIO.StringIO()
theano.printing.debugprint(op, file=f, stop_on_name=True) if thunk is not None:
if thunk is not None: if hasattr(thunk, 'inputs'):
shapes = [getattr(ipt[0], 'shape', 'No shapes') shapes = [getattr(ipt[0], 'shape', 'No shapes')
for ipt in thunk.inputs] for ipt in thunk.inputs]
strides = [getattr(ipt[0], 'strides', 'No strides') strides = [getattr(ipt[0], 'strides', 'No strides')
for ipt in thunk.inputs] for ipt in thunk.inputs]
detailed_err_msg = ("\nInputs shapes: %s \n" % shapes +
"Inputs strides: %s \n" % strides +
"Debugprint of the apply node: \n" +
f.getvalue())
else: else:
detailed_err_msg = "\nDebugprint of the apply node: \n" + f.getvalue() shapes = "The thunk don't have an inputs attributes."
strides = "So we can't access the storage inputs value"
types = [getattr(ipt, 'type', 'No type')
for ipt in op.inputs]
detailed_err_msg += ("\nInputs shapes: %s" % shapes +
"\nInputs strides: %s" % strides +
"\nInputs types: %s" % types)
else:
detailed_err_msg += ("\nUse another linker then the c linker to"
" have the inputs shapes and strides printed.")
if theano.config.exception_verbosity == 'high':
f = StringIO.StringIO()
theano.printing.debugprint(op, file=f, stop_on_name=True,
print_type=True)
detailed_err_msg += "\nDebugprint of the apply node: \n" + f.getvalue()
else: else:
detailed_err_msg = ("\nUse the Theano flag" detailed_err_msg += ("\nUse the Theano flag 'exception_verbosity=high'"
" 'exception_verbosity=high' for more" " for a debugprint of this apply node.")
" information on the inputs of this apply"
" node.") exc_value = exc_type(str(exc_value) + detailed_err_msg)
exc_value = exc_type(str(exc_value) +
"\nApply node that caused the error: " + str(op) +
detailed_err_msg)
raise exc_type, exc_value, exc_trace raise exc_type, exc_value, exc_trace
raise_with_op.print_thunk_trace = False raise_with_op.print_thunk_trace = False
...@@ -523,6 +531,8 @@ class PerformLinker(LocalLinker): ...@@ -523,6 +531,8 @@ class PerformLinker(LocalLinker):
storage_map, storage_map,
compute_map, compute_map,
no_recycling)] no_recycling)]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally: finally:
node.op._op_use_c_code = old_value node.op._op_use_c_code = old_value
......
...@@ -431,6 +431,8 @@ class PureOp(object): ...@@ -431,6 +431,8 @@ class PureOp(object):
# compute output value once with test inputs to validate graph # compute output value once with test inputs to validate graph
thunk = node.op.make_thunk(node, storage_map, compute_map, thunk = node.op.make_thunk(node, storage_map, compute_map,
no_recycling=[]) no_recycling=[])
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
required = thunk() required = thunk()
assert not required # We provided all inputs assert not required # We provided all inputs
......
...@@ -843,6 +843,9 @@ class VM_Linker(link.LocalLinker): ...@@ -843,6 +843,9 @@ class VM_Linker(link.LocalLinker):
compute_map, compute_map,
no_recycling) no_recycling)
for node in order] for node in order]
for node, thunk in zip(order, thunks):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
computed, last_user = link.gc_helper(order) computed, last_user = link.gc_helper(order)
if self.allow_gc: if self.allow_gc:
......
...@@ -652,7 +652,7 @@ class GpuConv(GpuOp): ...@@ -652,7 +652,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 19) return (0, 20)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -704,6 +704,7 @@ class GpuConv(GpuOp): ...@@ -704,6 +704,7 @@ class GpuConv(GpuOp):
return NULL; return NULL;
} }
// TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
...@@ -711,6 +712,10 @@ class GpuConv(GpuOp): ...@@ -711,6 +712,10 @@ class GpuConv(GpuOp):
%(max_threads_dim0)s); %(max_threads_dim0)s);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = out2; %(out)s = out2;
if (%(out)s==NULL){
%(fail)s
}
""" % sub """ % sub
......
...@@ -1811,9 +1811,10 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1811,9 +1811,10 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
{ {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
"Cuda error: %s: %s.\n", "Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
"k4", "k5 with loop over k4",
cudaGetErrorString(err)); cudaGetErrorString(err),
(long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
Py_XDECREF(new_other); Py_XDECREF(new_other);
return -1; return -1;
} }
...@@ -1831,14 +1832,17 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1831,14 +1832,17 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
); );
while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS) while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
n_blocks.y /= 2; n_blocks.y /= 2;
while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS) // GTX285(compute capabilities 1.3) don't support n_blocks.z > 1
n_blocks.z /= 2; // (compute capabilities 2.0) support 65535 for n_blocks.z
//while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
// n_blocks.z /= 2;
n_blocks.z = 1;
dim3 n_threads( dim3 n_threads(
std::min( std::min(
CudaNdarray_HOST_DIMS(self)[3], CudaNdarray_HOST_DIMS(self)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK) NUM_VECTOR_OP_THREADS_PER_BLOCK)
//TODO: DON"T YOU NEED OT PUT DIMS[4] in here??? //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
//TODO: DON"T YOU NEED OT PUT DIMS[5] in here??? //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
); );
k6<<<n_blocks, n_threads>>>( k6<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0], CudaNdarray_HOST_DIMS(self)[0],
...@@ -1867,9 +1871,11 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1867,9 +1871,11 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
{ {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
"Cuda error: %s: %s.\n", "Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
"k4", "k6",
cudaGetErrorString(err)); cudaGetErrorString(err),
(long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
(long) n_threads.x);
Py_XDECREF(new_other); Py_XDECREF(new_other);
return -1; return -1;
} }
......
...@@ -403,7 +403,12 @@ def local_gpu_lazy_ifelse(node): ...@@ -403,7 +403,12 @@ def local_gpu_lazy_ifelse(node):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, theano.ifelse.IfElse) and isinstance(host_input.owner.op, theano.ifelse.IfElse) and
not host_input.owner.op.gpu): not host_input.owner.op.gpu and
# If there is more then 1 outputs, we can't replace it
# here with a local optimizer as we replace the
# GpuFromHost node and the other output of the if won't be
# replaced.
host_input.owner.op.n_outs == 1):
gpu_ifelse = theano.ifelse.IfElse(host_input.owner.op.n_outs, gpu_ifelse = theano.ifelse.IfElse(host_input.owner.op.n_outs,
gpu=True) gpu=True)
......
import sys, time, unittest import sys
import numpy import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
...@@ -7,7 +7,7 @@ from nose.plugins.skip import SkipTest ...@@ -7,7 +7,7 @@ from nose.plugins.skip import SkipTest
import theano import theano
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import config, tensor from theano import config, tensor
import theano.sandbox.linalg.tests import theano.sandbox.linalg.tests.test_linalg
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -48,28 +48,29 @@ def test_int_pow(): ...@@ -48,28 +48,29 @@ def test_int_pow():
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()] op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu'] assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu) f = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()] op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu'] assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
def test_gpualloc(): def test_gpualloc():
''' '''
This tests tries to catch the scenario when, due to infer_shape, This tests tries to catch the scenario when, due to infer_shape,
the input of the alloc changes from tesnor scalar to a constant the input of the alloc changes from tensor scalar to a constant
1. In this case the original constracted broadcastable pattern will 1. In this case the original constracted broadcastable pattern will
have a False for that dimension, but the new broadcastable pattern have a False for that dimension, but the new broadcastable pattern
that will be inserted by gpualloc will have a True since it knows the that will be inserted by gpualloc will have a True since it knows the
dimension is 1 and therefore broadcastable. dimension is 1 and therefore broadcastable.
''' '''
x = theano.shared(numpy.ones(3,dtype='float32'), 'x') x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
m = (x).dimshuffle(['x',0]) m = (x).dimshuffle(['x', 0])
v = tensor.alloc(1., *m.shape) v = tensor.alloc(1., *m.shape)
f = theano.function([], v+x) f = theano.function([], v + x, mode=mode_with_gpu)
l = f.maker.fgraph.toposort() l = f.maker.fgraph.toposort()
assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l ) assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
def test_alloc_memset_0(): def test_alloc_memset_0():
......
...@@ -159,6 +159,49 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -159,6 +159,49 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
assert numpy.all(outs_0[2] == 1.) assert numpy.all(outs_0[2] == 1.)
assert numpy.all(outs_0[3] == 1.) assert numpy.all(outs_0[3] == 1.)
def test_multiple_out_crash(self):
    """Regression test for an optimizer crash with a multi-output ifelse.

    This scenario failed up to commit 2faeb62c38: building and running a
    function whose updates come from an ``ifelse`` with 4 outputs.  The
    test only checks that compilation succeeds, that the graph contains
    the expected ifelse node, and that calling the function doesn't crash.
    """
    rand = numpy.random.random
    shapes = [[4, 8], [8], [8, 3], [3]]

    # One shared "position" variable per shape.
    p = [self.shared(numpy.asarray(rand(shp), dtype=self.dtype))
         for shp in shapes]

    # Symbolic sequences standing in for the outputs of a scan.
    ften0 = tensor.tensor3('ft0', dtype=self.dtype)
    fmat1 = tensor.matrix('fm1', dtype=self.dtype)
    ften2 = tensor.tensor3('ft2', dtype=self.dtype)
    fmat3 = tensor.matrix('fm3', dtype=self.dtype)

    # Keep only the last step of each sequence.
    fsub = [seq[-1] for seq in (ften0, fmat1, ften2, fmat3)]

    acc = theano.tensor.constant(1, 'int8') >= 0
    new_positions = theano.ifelse.ifelse(acc, fsub, p)
    new_updates = [(p[0], new_positions[0])]
    f = theano.function([ften0, fmat1, ften2, fmat3], [],
                        updates=new_updates, mode=self.mode)
    self.assertFunctionContains1(f, self.get_ifelse(4))

    # Concrete inputs: each sequence has 19 steps of the matching shape.
    inputs = [numpy.asarray(rand([19] + shp), dtype=self.dtype)
              for shp in shapes]
    f(*inputs)
def test_dtype_mismatch(self): def test_dtype_mismatch(self):
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
data = rng.rand(5).astype(self.dtype) data = rng.rand(5).astype(self.dtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论