提交 26254645 作者: Frédéric Bastien 提交者: GitHub

Merge pull request #5902 from lamblin/fix_debugmode

[BUG, CRASH] Fixes in DebugMode for GPU
...@@ -9,6 +9,40 @@ export PATH=/usr/local/cuda/bin:$PATH ...@@ -9,6 +9,40 @@ export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
GPUARRAY_CONFIG="Release"
DEVICE=cuda0
LIBDIR=${WORKSPACE}/local
# Make fresh clones of libgpuarray (with no history since we don't need it)
rm -rf libgpuarray
git clone --depth 1 "https://github.com/Theano/libgpuarray.git"
# Clean up previous installs (to make sure no old files are left)
rm -rf $LIBDIR
mkdir $LIBDIR
# Build libgpuarray
mkdir libgpuarray/build
(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)
# Finally install
(cd libgpuarray/build && make install)
# Export paths
export CPATH=$CPATH:$LIBDIR/include
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib
# Build the pygpu modules
(cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
ls $LIBDIR
mkdir $LIBDIR/lib/python
export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
# Then install
(cd libgpuarray && python setup.py install --home=$LIBDIR)
python -c 'import pygpu; print(pygpu.__file__)'
# nosetests xunit for test profiling # nosetests xunit for test profiling
XUNIT="--with-xunit --xunit-file=" XUNIT="--with-xunit --xunit-file="
......
...@@ -73,8 +73,12 @@ class GpuGemv(BlasOp): ...@@ -73,8 +73,12 @@ class GpuGemv(BlasOp):
inplace = self.inplace inplace = self.inplace
if inplace and y.strides[0] < 0: if inplace and y.strides[0] < 0:
inplace = False inplace = False
out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, if A.shape[1] == 0:
overwrite_y=inplace) out_storage[0][0] = pygpu.zeros(y.shape, dtype=y.dtype,
context=y.context)
else:
out_storage[0][0] = blas.gemv(alpha, A, x, beta, y,
overwrite_y=inplace)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3], vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
...@@ -119,11 +123,11 @@ class GpuGemv(BlasOp): ...@@ -119,11 +123,11 @@ class GpuGemv(BlasOp):
if (%(A)s->ga.flags & GA_C_CONTIGUOUS) { if (%(A)s->ga.flags & GA_C_CONTIGUOUS) {
ssize_t a_stride0 = %(A)s->ga.strides[0]; ssize_t a_stride0 = %(A)s->ga.strides[0];
%(A)s->ga.strides[0] = %(A)s->ga.strides[1]; %(A)s->ga.strides[0] = %(A)s->ga.strides[1];
if (pygpu_blas_rdot(%(x)s, %(A)s, %(y)s, 0) == -1) { if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
%(A)s->ga.strides[0] = a_stride0; %(A)s->ga.strides[0] = a_stride0;
} else if (pygpu_blas_rdot(%(x)s, %(A)s, %(y)s, 0) == -1) { } else if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
%(out)s->ga.nd = 1; %(out)s->ga.nd = 1;
...@@ -145,7 +149,7 @@ class GpuGemv(BlasOp): ...@@ -145,7 +149,7 @@ class GpuGemv(BlasOp):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
gpugemv_no_inplace = GpuGemv(inplace=False) gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True) gpugemv_inplace = GpuGemv(inplace=True)
......
...@@ -309,39 +309,29 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -309,39 +309,29 @@ KERNEL void k_multi_warp_multinomial_wor(
if (n < nb_multi) if (n < nb_multi)
{ {
// Sum of the remaining p_vals in global_pvals_copy[n]
float pvals_sum = 1.;
for (int c = 0; c < n_samples; ++c) for (int c = 0; c < n_samples; ++c)
{ {
float cummul = 0.; float cummul = 0.;
bool done = false; const float unis_n = global_unis[(c * nb_multi + n)*unis_stride] * pvals_sum;
const float unis_n = global_unis[(c * nb_multi + n)*unis_stride];
for (ga_size m = 0; m < nb_outcomes; ++m) for (ga_size m = 0; m < nb_outcomes; ++m)
{ {
float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride]; float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride];
cummul += pvals_nm; cummul += pvals_nm;
if (!done && unis_n < cummul) if (unis_n < cummul)
{ {
//write out transposed for speed. // write out transposed for speed.
global_outs[n * outs_col_stride + global_outs[n * outs_col_stride +
c * outs_row_stride] = m; c * outs_row_stride] = m;
if (! %(replace)s ) if (! %(replace)s )
{ {
global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0; global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0;
cummul -= pvals_nm; pvals_sum -= pvals_nm;
} }
done = true; break;
}
}
// No need to renormalize after the last samples.
if (c == (n_samples - 1))
break;
if (! %(replace)s )
{
// parallel renormalize the multinomial
for (ga_int k = LID_1; k < nb_outcomes; k+=LDIM_1)
{
global_pvals_copy[k * pvals_col_stride + n * pvals_row_stride] /= cummul;
} }
} }
} }
...@@ -402,9 +392,12 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -402,9 +392,12 @@ KERNEL void k_multi_warp_multinomial_wor(
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n");
%(fail)s %(fail)s
} }
if (! %(replace)s) {
pvals_copy = pygpu_copy(pvals, GA_C_ORDER); pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
} else {
pvals_copy = pvals;
Py_INCREF(pvals_copy);
}
dims[0] = n_samples; dims[0] = n_samples;
dims[1] = PyGpuArray_DIMS(pvals)[0]; dims[1] = PyGpuArray_DIMS(pvals)[0];
...@@ -466,18 +459,7 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -466,18 +459,7 @@ KERNEL void k_multi_warp_multinomial_wor(
args[9] = (void*)&strides[3]; args[9] = (void*)&strides[3];
args[10] = (void*)&strides[4]; args[10] = (void*)&strides[4];
size_t nb_threads2[2], nb_blocks2[2]; err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
nb_threads2[0] = nb_threads;
nb_threads2[1] = 1;
// If we can't schedule enough threads parallelize the renormalization.
// I do this because we don't always use those extra threads.
if ((nb_threads * nb_blocks < 2048) && ! %(replace)d )
nb_threads2[1] = 1024 / nb_threads;
nb_blocks2[0] = nb_blocks;
nb_blocks2[1] = 1;
err = GpuKernel_call(&%(kname)s, 2, nb_blocks2, nb_threads2, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -495,7 +477,7 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -495,7 +477,7 @@ KERNEL void k_multi_warp_multinomial_wor(
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (7,)
@register_opt('fast_compile') @register_opt('fast_compile')
...@@ -528,7 +510,7 @@ def local_gpua_multinomial_wor(op, context_name, inputs, outputs): ...@@ -528,7 +510,7 @@ def local_gpua_multinomial_wor(op, context_name, inputs, outputs):
p, u, n = inputs p, u, n = inputs
m, = outputs m, = outputs
if ((p.dtype == u.dtype == 'float32') and (m.dtype == 'int64')): if ((p.dtype == u.dtype == 'float32') and (m.dtype == 'int64')):
gpu_op = GPUAChoiceFromUniform(op.odtype) gpu_op = GPUAChoiceFromUniform(**op._props_dict())
return GpuDimShuffle([False, False], [1, 0])( return GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u, n)) gpu_op(p, u, n))
......
...@@ -481,13 +481,12 @@ def test_hostfromgpu_shape_i(): ...@@ -481,13 +481,12 @@ def test_hostfromgpu_shape_i():
def test_Gpujoin_inplace(): def test_Gpujoin_inplace():
"""Test Gpujoin to work inplace. # Test Gpujoin to work inplace.
#
This function tests the case when several elements are passed to the # This function tests the case when several elements are passed to the
Gpujoin function but all except one of them are empty. In this case # Gpujoin function but all except one of them are empty. In this case
Gpujoin should work inplace and the output should be the view of the # Gpujoin should work inplace and the output should be the view of the
non-empty element. # non-empty element.
"""
s = T.lscalar() s = T.lscalar()
data = np.array([3, 4, 5], dtype=theano.config.floatX) data = np.array([3, 4, 5], dtype=theano.config.floatX)
x = gpuarray_shared_constructor(data, borrow=True) x = gpuarray_shared_constructor(data, borrow=True)
...@@ -497,5 +496,6 @@ def test_Gpujoin_inplace(): ...@@ -497,5 +496,6 @@ def test_Gpujoin_inplace():
c = join(0, x, z) c = join(0, x, z)
f = theano.function([s], theano.Out(c, borrow=True)) f = theano.function([s], theano.Out(c, borrow=True))
assert x.get_value(borrow=True, return_internal_type=True) is f(0) if not isinstance(mode_with_gpu, theano.compile.DebugMode):
assert x.get_value(borrow=True, return_internal_type=True) is f(0)
assert np.allclose(f(0), [3, 4, 5]) assert np.allclose(f(0), [3, 4, 5])
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from copy import copy
from unittest import TestCase
import numpy as np import numpy as np
import scipy.special import scipy.special
import theano import theano
from theano import scalar, gof, tensor from theano import scalar, gof, tensor
from unittest import TestCase from theano.compile import DebugMode
from theano.tests.unittest_tools import SkipTest, assert_allclose from theano.tests.unittest_tools import SkipTest, assert_allclose
from theano.tensor.tests import test_elemwise from theano.tensor.tests import test_elemwise
...@@ -66,18 +69,32 @@ class TestMathErrorFunctions(TestCase): ...@@ -66,18 +69,32 @@ class TestMathErrorFunctions(TestCase):
expected_erfinv_outputs = {} expected_erfinv_outputs = {}
expected_erfcinv_outputs = {} expected_erfcinv_outputs = {}
def setUp(self): @classmethod
def setUpClass(cls):
# NB: erfinv is defined in ]-1;1[, and erfcinv is defined in ]0;2[, # NB: erfinv is defined in ]-1;1[, and erfcinv is defined in ]0;2[,
# so we just take some values in an interval that covers both domains # so we just take some values in an interval that covers both domains
# (this will also allow to test some values outside the domains). # (this will also allow to test some values outside the domains).
# We take [-5;5[ by default and we concatenate it 1000 times # We take [-5;5[ by default and we concatenate it 1000 times
# to have the GPU ops run on large data. # to have the GPU ops run on large data.
default_array = [x / 10.0 for x in range(-50, 50)] * 1000 default_array = [x / 10.0 for x in range(-50, 50)] * 1000
for dtype in self.dtypes: for dtype in cls.dtypes:
numpy_array = np.asarray(default_array, dtype=dtype) numpy_array = np.asarray(default_array, dtype=dtype)
self.default_arrays[dtype] = numpy_array cls.default_arrays[dtype] = numpy_array
self.expected_erfinv_outputs[dtype] = scipy.special.erfinv(numpy_array) cls.expected_erfinv_outputs[dtype] = scipy.special.erfinv(numpy_array)
self.expected_erfcinv_outputs[dtype] = scipy.special.erfcinv(numpy_array) cls.expected_erfcinv_outputs[dtype] = scipy.special.erfcinv(numpy_array)
# Since there are infinite values, we need to disable that check
# in DebugMode if needed
if isinstance(mode_with_gpu, DebugMode):
cls.mode_with_gpu = copy(mode_with_gpu)
cls.mode_with_gpu.check_isfinite = False
else:
cls.mode_with_gpu = mode_with_gpu
if isinstance(mode_without_gpu, DebugMode):
cls.mode_without_gpu = copy(mode_without_gpu)
cls.mode_without_gpu.check_isfinite = False
else:
cls.mode_without_gpu = mode_without_gpu
def check_gpu_scalar_op(self, theano_function, scalar_optype): def check_gpu_scalar_op(self, theano_function, scalar_optype):
for node in theano_function.maker.fgraph.apply_nodes: for node in theano_function.maker.fgraph.apply_nodes:
...@@ -90,8 +107,8 @@ class TestMathErrorFunctions(TestCase): ...@@ -90,8 +107,8 @@ class TestMathErrorFunctions(TestCase):
for dtype in self.dtypes: for dtype in self.dtypes:
vector = theano.tensor.vector(dtype=dtype) vector = theano.tensor.vector(dtype=dtype)
output = theano.tensor.erfinv(vector) output = theano.tensor.erfinv(vector)
f_host = theano.function([vector], output, name='HOST/erfinv/' + dtype, mode=mode_without_gpu) f_host = theano.function([vector], output, name='HOST/erfinv/' + dtype, mode=self.mode_without_gpu)
f_gpu = theano.function([vector], output, name='GPU/erfinv/' + dtype, mode=mode_with_gpu) f_gpu = theano.function([vector], output, name='GPU/erfinv/' + dtype, mode=self.mode_with_gpu)
assert len([n for n in f_host.maker.fgraph.apply_nodes if isinstance(n.op, GpuElemwise)]) == 0 assert len([n for n in f_host.maker.fgraph.apply_nodes if isinstance(n.op, GpuElemwise)]) == 0
if not theano.config.device.startswith('opencl'): if not theano.config.device.startswith('opencl'):
assert self.check_gpu_scalar_op(f_gpu, GpuErfinv), \ assert self.check_gpu_scalar_op(f_gpu, GpuErfinv), \
...@@ -108,8 +125,8 @@ class TestMathErrorFunctions(TestCase): ...@@ -108,8 +125,8 @@ class TestMathErrorFunctions(TestCase):
for dtype in self.dtypes: for dtype in self.dtypes:
vector = theano.tensor.vector(dtype=dtype) vector = theano.tensor.vector(dtype=dtype)
output = theano.tensor.erfcinv(vector) output = theano.tensor.erfcinv(vector)
f_host = theano.function([vector], output, name='HOST/erfcinv/' + dtype, mode=mode_without_gpu) f_host = theano.function([vector], output, name='HOST/erfcinv/' + dtype, mode=self.mode_without_gpu)
f_gpu = theano.function([vector], output, name='GPU/erfcinv/' + dtype, mode=mode_with_gpu) f_gpu = theano.function([vector], output, name='GPU/erfcinv/' + dtype, mode=self.mode_with_gpu)
assert len([n for n in f_host.maker.fgraph.apply_nodes if isinstance(n.op, GpuElemwise)]) == 0 assert len([n for n in f_host.maker.fgraph.apply_nodes if isinstance(n.op, GpuElemwise)]) == 0
if not theano.config.device.startswith('opencl'): if not theano.config.device.startswith('opencl'):
assert self.check_gpu_scalar_op(f_gpu, GpuErfcinv), \ assert self.check_gpu_scalar_op(f_gpu, GpuErfcinv), \
......
...@@ -32,6 +32,17 @@ class TestGpuCumOp(theano.tensor.tests.test_extra_ops.TestCumOp): ...@@ -32,6 +32,17 @@ class TestGpuCumOp(theano.tensor.tests.test_extra_ops.TestCumOp):
self.max_grid_size1 = test_ctx.maxgsize2 self.max_grid_size1 = test_ctx.maxgsize2
self.op_class = CumOp self.op_class = CumOp
# The CPU implementation is not so accurate, which throws out DebugMode.
# Since propagating .tag.values_eq_approx to the output of every
# GpuFromHost seems overkill, we just relax the rtol for these tests
self.old_rtol = theano.tensor.float32_rtol
theano.tensor.basic.float32_rtol *= 2
def tearDown(self):
super(TestGpuCumOp, self).tearDown()
# Restore rtol
theano.tensor.basic.float32_rtol = self.old_rtol
@cum_modes @cum_modes
def test_infer_shape(self, mode): def test_infer_shape(self, mode):
# GpuCumOp is only defined for float32 for now, so we skip it # GpuCumOp is only defined for float32 for now, so we skip it
......
...@@ -327,30 +327,34 @@ def test_gpu_opt_wor(): ...@@ -327,30 +327,34 @@ def test_gpu_opt_wor():
p = tensor.fmatrix() p = tensor.fmatrix()
u = tensor.fvector() u = tensor.fvector()
n = tensor.iscalar() n = tensor.iscalar()
m = multinomial.ChoiceFromUniform(odtype='auto')(p, u, n) for replace in [False, True]:
assert m.dtype == 'int64', m.dtype m = multinomial.ChoiceFromUniform(odtype='auto',
replace=replace)(p, u, n)
f = function([p, u, n], m, allow_input_downcast=True, mode=mode_with_gpu) assert m.dtype == 'int64', m.dtype
assert any([type(node.op) is GPUAChoiceFromUniform
for node in f.maker.fgraph.toposort()]) f = function([p, u, n], m, allow_input_downcast=True,
n_samples = 3 mode=mode_with_gpu)
pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1 assert any([type(node.op) is GPUAChoiceFromUniform
pval = pval / pval.sum(axis=1)[:, None] for node in f.maker.fgraph.toposort()])
uval = np.ones(pval.shape[0] * n_samples) * 0.5 n_samples = 3
f(pval, uval, n_samples) pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
pval = pval / pval.sum(axis=1)[:, None]
uval = np.ones(pval.shape[0] * n_samples) * 0.5
f(pval, uval, n_samples)
# Test with a row, it was failing in the past. # Test with a row, it was failing in the past.
r = tensor.frow() r = tensor.frow()
m = multinomial.ChoiceFromUniform('auto')(r, u, n) m = multinomial.ChoiceFromUniform('auto', replace=replace)(r, u, n)
assert m.dtype == 'int64', m.dtype assert m.dtype == 'int64', m.dtype
f = function([r, u, n], m, allow_input_downcast=True, mode=mode_with_gpu) f = function([r, u, n], m, allow_input_downcast=True,
assert any([type(node.op) is GPUAChoiceFromUniform mode=mode_with_gpu)
for node in f.maker.fgraph.toposort()]) assert any([type(node.op) is GPUAChoiceFromUniform
pval = np.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1 for node in f.maker.fgraph.toposort()])
pval = pval / pval.sum(axis=1)[:, None] pval = np.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
uval = np.ones_like(pval[:, 0]) * 0.5 pval = pval / pval.sum(axis=1)[:, None]
f(pval, uval, 1) uval = np.ones_like(pval[:, 0]) * 0.5
f(pval, uval, 1)
def test_unpickle_legacy_op(): def test_unpickle_legacy_op():
......
...@@ -212,7 +212,7 @@ class ChoiceFromUniform(MultinomialFromUniform): ...@@ -212,7 +212,7 @@ class ChoiceFromUniform(MultinomialFromUniform):
""" """
__props__ = ("replace",) __props__ = ("odtype", "replace",)
def __init__(self, odtype, replace=False, *args, **kwargs): def __init__(self, odtype, replace=False, *args, **kwargs):
self.replace = replace self.replace = replace
......
...@@ -35,14 +35,14 @@ class TestScanCheckpoint(unittest.TestCase): ...@@ -35,14 +35,14 @@ class TestScanCheckpoint(unittest.TestCase):
self.grad_A_check = T.grad(self.result_check.sum(), self.A) self.grad_A_check = T.grad(self.result_check.sum(), self.A)
def test_forward_pass(self): def test_forward_pass(self):
"""Test forward computation of A**k.""" # Test forward computation of A**k.
f = theano.function(inputs=[self.A, self.k], f = theano.function(inputs=[self.A, self.k],
outputs=[self.result, self.result_check]) outputs=[self.result, self.result_check])
out, out_check = f(range(10), 101) out, out_check = f(range(10), 101)
assert np.allclose(out, out_check) assert np.allclose(out, out_check)
def test_backward_pass(self): def test_backward_pass(self):
"""Test gradient computation of A**k.""" # Test gradient computation of A**k.
f = theano.function(inputs=[self.A, self.k], f = theano.function(inputs=[self.A, self.k],
outputs=[self.grad_A, self.grad_A_check]) outputs=[self.grad_A, self.grad_A_check])
out, out_check = f(range(10), 101) out, out_check = f(range(10), 101)
...@@ -50,7 +50,7 @@ class TestScanCheckpoint(unittest.TestCase): ...@@ -50,7 +50,7 @@ class TestScanCheckpoint(unittest.TestCase):
@unittest.skipUnless(PYGPU_AVAILABLE, 'Requires pygpu.') @unittest.skipUnless(PYGPU_AVAILABLE, 'Requires pygpu.')
def test_memory(self): def test_memory(self):
"""Test that scan_checkpoint reduces memory usage.""" # Test that scan_checkpoint reduces memory usage.
if None not in theano.gpuarray.type.list_contexts(): if None not in theano.gpuarray.type.list_contexts():
return unittest.SkipTest('Requires gpuarray backend.') return unittest.SkipTest('Requires gpuarray backend.')
from theano.gpuarray.tests.config import mode_with_gpu # noqa from theano.gpuarray.tests.config import mode_with_gpu # noqa
...@@ -63,9 +63,11 @@ class TestScanCheckpoint(unittest.TestCase): ...@@ -63,9 +63,11 @@ class TestScanCheckpoint(unittest.TestCase):
# Check that it works with the checkpoints # Check that it works with the checkpoints
f_check(data, 1000) f_check(data, 1000)
# Check that the basic scan fails in that case # Check that the basic scan fails in that case
self.assertRaises(GpuArrayException, f, data, 1000) # Skip that check in DebugMode, as it can fail in different ways
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
self.assertRaises(GpuArrayException, f, data, 1000)
def test_taps_error(self): def test_taps_error(self):
"""Test that an error rises if we use taps in outputs_info.""" # Test that an error rises if we use taps in outputs_info.
self.assertRaises(RuntimeError, theano.scan_checkpoints, self.assertRaises(RuntimeError, theano.scan_checkpoints,
lambda: None, [], {'initial': self.A, 'taps': [-2]}) lambda: None, [], {'initial': self.A, 'taps': [-2]})
...@@ -2001,7 +2001,7 @@ class AdvancedIncSubtensor1(Op): ...@@ -2001,7 +2001,7 @@ class AdvancedIncSubtensor1(Op):
if self.set_instead_of_inc: if self.set_instead_of_inc:
x[idx] = y x[idx] = y
else: else:
if config.cxx: if config.cxx and node.inputs[0].dtype != 'float16':
increment = inplace_increment increment = inplace_increment
else: else:
increment = self.inplace_increment1d_slow increment = self.inplace_increment1d_slow
......
...@@ -889,8 +889,10 @@ class T_reduce_dtype(unittest.TestCase): ...@@ -889,8 +889,10 @@ class T_reduce_dtype(unittest.TestCase):
(topo, output_dtype) (topo, output_dtype)
data = np.random.rand(3, 4) * 10 data = np.random.rand(3, 4) * 10
data = data.astype(input_dtype) data = data.astype(input_dtype)
if output_dtype == 'float16' and method == 'prod': if (method == 'prod' and output_dtype in
['float16', 'int8', 'uint8', 'int16', 'uint16']):
# We will likely get something infinite, # We will likely get something infinite,
# or the overflow will be different between CPU and GPU,
# and DebugMode will complain. # and DebugMode will complain.
data = data[0:1] data = data[0:1]
f(data) f(data)
......
Markdown 格式
0%
您即将向此讨论添加 0 人。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论