提交 5890e98e authored 作者: lamblin's avatar lamblin

Merge pull request #580 from nouiz/gpu_setsubtensor

Gpu setsubtensor
......@@ -9,6 +9,13 @@ Bug fixes
(both in Python and Cython) since April 2011. (Pascal L.)
* In Sparse sandbox, fix the grad of theano.sparse.sandbox.sp.row_scale.
It did not return the right number of elements. (Frederic B.)
* set_subtensor(x[int vector], new_value), when moved to the GPU,
was transformed into inc_subtensor on the GPU. Now we have a slow
GPU implementation.
Note: set_subtensor(x[slice[,...]], new_value) was working correctly
in all cases, as well as inc_subtensor(*, *).
Note2: If your code has this behavior, we print a warning by default.
(Frederic B.)
Documentation
* Added in the tutorial documentation on how to extend Theano.
......@@ -81,6 +88,8 @@ Crash Fix
element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the GPU).
(Frederic B., reported by Sander Dieleman)
* GpuReshape in some particular case when the input is not contiguous
(Frederic B., reported by Sander Dieleman)
* GpuSoftmaxWithBias with shape (0, N) with N > 1.
(Frédéric B., reported by Razvan P.)
* Fix crash under 64-bit Windows, when taking subtensors of the form a[n:]
......@@ -89,6 +98,7 @@ Crash Fix
dimensions, which could typically result in optimization crashes (Olivier D.)
* Fixed crash when concatenating some arrays with specific broadcasting
patterns (Olivier D.)
* Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylor)
=============
Release Notes
......
......@@ -315,6 +315,13 @@ AddConfigVar('warn.subtensor_merge_bug',
BoolParam(warn_default('0.5')),
in_c_key=False)
AddConfigVar('warn.gpu_set_subtensor1',
"Warn if previous versions of Theano (before 0.6) could have given "
"incorrect results when moving to the gpu "
"set_subtensor(x[int vector], new_value)",
BoolParam(warn_default('0.6')),
in_c_key=False)
AddConfigVar('compute_test_value',
("If 'True', Theano will run each op at graph build time, using "
"Constants, SharedVariables and the tag 'test_value' as inputs "
......
......@@ -1002,21 +1002,40 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
self->nd, other->nd);
}
//standard elemwise size checks
if (self->nd != other->nd)
//standard elemwise nb dim checks
if (self->nd < other->nd)
{
PyErr_Format(
PyExc_TypeError,
"CudaNdarray_inplace_elemwise: need same number of dims. Got %d and %d",
"CudaNdarray_inplace_elemwise: The destination needs more or the"
" same number of dimensions than the source. Got %d and %d.",
self->nd, other->nd);
return -1;
}
//broadcast to the same number of dimensions.
int other_dims[self->nd];
int other_strides[self->nd];
int added_dims = self->nd - other->nd;
// Add the added broadcasted dimensions
for (int i = 0; i< added_dims; ++i)
{
other_dims[i] = 1;
other_strides[i] = 0;
}
// Copy the existing dimensions
for (int i = 0; i< other->nd; ++i)
{
other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
}
//standard elemwise dim checks
unsigned int size = 1;
for (int i = 0; i< self->nd; ++i)
{
if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
&& (CudaNdarray_HOST_DIMS(other)[i] != 1))
if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
&& (other_dims[i] != 1))
{
PyErr_SetString(
PyExc_ValueError,
......@@ -1024,8 +1043,8 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
return -1;
}
// if we're broadcasting other, then make sure it has stride 0
assert ((CudaNdarray_HOST_DIMS(self)[i] == CudaNdarray_HOST_DIMS(other)[i])
|| (CudaNdarray_HOST_STRIDES(other)[i] == 0));
assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
|| (other_strides[i] == 0));
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
}
......@@ -1090,7 +1109,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_DEV_DATA(other),
1, //strides
1,
CudaNdarray_HOST_STRIDES(other)[0]);
other_strides[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1126,8 +1145,8 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_DEV_DATA(other),
1,
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1]);
other_strides[0],
other_strides[1]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1165,9 +1184,9 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_DEV_DATA(other),
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2]);
other_strides[0],
other_strides[1],
other_strides[2]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1208,10 +1227,10 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3],
CudaNdarray_DEV_DATA(other),
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3]);
other_strides[0],
other_strides[1],
other_strides[2],
other_strides[3]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1252,11 +1271,11 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3],
CudaNdarray_HOST_STRIDES(self)[4],
CudaNdarray_DEV_DATA(other) + i * CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3],
CudaNdarray_HOST_STRIDES(other)[4]);
CudaNdarray_DEV_DATA(other) + i * other_strides[0],
other_strides[1],
other_strides[2],
other_strides[3],
other_strides[4]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -1280,6 +1299,8 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
return -1;
}
}
if (verbose)
fprintf(stderr, "INPLACE ADD/DIV end\n");
return 0;
}
......@@ -2746,12 +2767,17 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
//standard elemwise size checks
if (self->nd == -1)
{
PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized CudaNdarray");
PyErr_SetString(PyExc_TypeError,
"can't copy into un-initialized CudaNdarray");
return -1;
}
if (self->nd != other->nd)
{
PyErr_Format(PyExc_NotImplementedError, "CudaNdarray_CopyFromCudaNdarray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", self->nd, other->nd);
PyErr_Format(PyExc_NotImplementedError,
"CudaNdarray_CopyFromCudaNdarray: need same number of"
" dims. destination nd=%d, source nd=%d."
" No broadcasting implemented.",
self->nd, other->nd);
return -1;
}
//standard elemwise dim checks (also compute total size)
......@@ -2762,8 +2788,11 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
&& (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
{
PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i], CudaNdarray_HOST_DIMS(other)[i]);
PyErr_Format(PyExc_ValueError,
"need same dimensions for dim %d,"
" destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i],
CudaNdarray_HOST_DIMS(other)[i]);
return -1;
}
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
......@@ -2773,12 +2802,15 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
{
return 0; //nothing to copy, we're done.
}
if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source)
if (CudaNdarray_is_c_contiguous(self) &&
CudaNdarray_is_c_contiguous(other) &&
size == size_source)
{
if (verbose)
fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1,
CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
......@@ -2800,23 +2832,33 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
{
if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
if (verbose) fprint_CudaNdarray(stderr, other);
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(size,
(unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
(unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
k_copy_1d<<<n_blocks, n_threads>>>(size,
CudaNdarray_DEV_DATA(other), CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_DEV_DATA(self), CudaNdarray_HOST_STRIDES(self)[0]);
CudaNdarray_DEV_DATA(other),
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_DEV_DATA(self),
CudaNdarray_HOST_STRIDES(self)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads);
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s. (n_blocks=%i,"
" n_threads_per_block=%i)\n", "k_copy_1d",
cudaGetErrorString(err), n_blocks, n_threads);
return -1;
}
}; break;
default:
{
assert (cudaSuccess == cudaGetLastError());
if (verbose) fprintf(stderr, "Copying with default version unbroadcast=%d\n", unbroadcast);
if (verbose)
fprintf(stderr,
"Copying with default version unbroadcast=%d\n",
unbroadcast);
// call worker routine
unsigned int threads_per_block = std::min(size,
(unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
......@@ -2830,18 +2872,27 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
size,
(unsigned int)other->nd,
(const int *)CudaNdarray_DEV_DIMS(cuda_dims),
(const float*)CudaNdarray_DEV_DATA(other), (const int *)CudaNdarray_DEV_STRIDES(other),
CudaNdarray_DEV_DATA(self), (const int *)CudaNdarray_DEV_STRIDES(self));
(const float*)CudaNdarray_DEV_DATA(other),
(const int *)CudaNdarray_DEV_STRIDES(other),
CudaNdarray_DEV_DATA(self),
(const int *)CudaNdarray_DEV_STRIDES(self));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if(verbose>1)
fprintf(stderr, "INFO k_elemwise_unary_rowmaj (n_blocks=%i, n_threads_per_block=%i)\n",
fprintf(stderr,
"INFO k_elemwise_unary_rowmaj (n_blocks=%i,"
" n_threads_per_block=%i)\n",
n_blocks, threads_per_block);
if( cudaSuccess != err)
{
//fprint_CudaNdarray(stderr, self);
//fprint_CudaNdarray(stderr, other);
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_elemwise_unary_rowmajor_copy", cudaGetErrorString(err), n_blocks, threads_per_block);
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s. (n_blocks=%i,"
" n_threads_per_block=%i)\n",
"k_elemwise_unary_rowmajor_copy",
cudaGetErrorString(err), n_blocks,
threads_per_block);
return -1;
}
}
......@@ -3832,7 +3883,7 @@ cnda_mark_dev_structure_dirty(CudaNdarray * self)
int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
{
int verbose = 1;
int verbose = 0;
if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
{
......
......@@ -82,17 +82,24 @@ class InputToGpuOptimizer(Optimizer):
def apply(self, env):
for input in env.inputs:
if not isinstance(input.type, CudaNdarrayType):
try:
new_input = host_from_gpu(gpu_from_host(input))
if isinstance(input.type, CudaNdarrayType):
return
if new_input.type == input.type:
env.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError, e:
#as we currently only support float32, this can fail.
#Using try except make that we won't need
pass
# This happens frequently as we do 2 passes of the gpu optimizations
if (len(input.clients) == 1 and
input.clients[0][0].op == gpu_from_host):
return
try:
new_input = host_from_gpu(gpu_from_host(input))
if new_input.type == input.type:
env.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError, e:
#as we currently only support float32, this can fail.
#Using try except make that we won't need
pass
# we register it before all other gpu optimizer to be sure that the input
# are on the gpu.
......@@ -753,11 +760,11 @@ def local_gpu_advanced_incsubtensor1(node):
warnings.warn(
'Although your current code is fine, please note that '
'Theano versions prior to 0.6 (more specifically, '
'prior to commit XXXX on DATE) may have '
'prior to commit d2240bddd on March 29, 2012) may have '
'yielded an incorrect result. To remove this warning, '
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.')
'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
......@@ -787,7 +794,7 @@ def local_gpu_advanced_incsubtensor1(node):
warnings.warn(
'Although your current code is fine, please note that '
'Theano versions prior to 0.6 (more specifically, '
'prior to commit XXXX on DATE) may have '
'prior to commit d2240bddd on March 29, 2012) may have '
'yielded an incorrect result. To remove this warning, '
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
......
......@@ -2100,23 +2100,20 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
return super(T_subtensor, self).__init__(name)
def function(self, inputs, outputs, accept_inplace=False,
op=None, mode=None, N=1):
op=None, mode=None, N=1, N_fast=None):
""" wrapper around theano.function that also check the output
:param N: the number of op expected in the toposort
if tuple of length 2, (expected if fast_compile,
if not fast_compile)
"""
if isinstance(N, tuple):
assert len(N) == 2
if self.fast_compile:
N = N[0]
else:
N = N[1]
if self.fast_compile and N_fast is not None:
N = N_fast
if mode is None:
mode = self.mode
if op is None:
op = self.sub
f = theano.function(inputs, outputs, mode=mode,
accept_inplace=accept_inplace)
self.assertFunctionContainsClassN(f, op, N)
......@@ -2694,7 +2691,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
if idx is idxs[0]:
f = self.function([], [gn.shape, n[idx_].shape],
op=ops,
N=(2, 0))
N=0, N_fast=2)
f()
def test_wrong_exception_regression(self):
......@@ -2747,7 +2744,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
data = numpy.asarray(data, dtype=self.dtype)
n = self.shared(data)
t = n[idx]
f = self.function([], t.shape, op=self.ops, N=(1, 0))
f = self.function([], t.shape, op=self.ops, N=0, N_fast=1)
val = f()
self.assertTrue(numpy.allclose(val, data[idx].shape))
......@@ -2850,6 +2847,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
data_copy[idx] = inc_num
else:
data_copy[idx] += inc_num
data_var = theano.In(data_var, mutable=True)
# Remember data for the Theano function (see below).
all_inputs_var += [data_var, idx_var, inc_var]
all_inputs_num += [data_num, idx_num, inc_num]
......@@ -2869,9 +2868,16 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
assert (data_num == data_num_init).all()
# Actual test (we compile a single Theano function to make it faster).
f = self.function(all_inputs_var, all_outputs_var,
accept_inplace=True, op=self.adv_incsub1,
N=len(all_outputs_var))
orig_warn = theano.config.warn.gpu_set_subtensor1
try:
theano.config.warn.gpu_set_subtensor1 = False
f = self.function(all_inputs_var, all_outputs_var,
accept_inplace=True,
op=self.adv_incsub1,
N=len(all_outputs_var))
finally:
theano.config.warn.gpu_set_subtensor1 = orig_warn
f_outs = f(*all_inputs_num)
assert len(f_outs) == len(all_outputs_num)
for f_out, output_num in izip(f_outs, all_outputs_num):
......
......@@ -93,7 +93,8 @@ class TestOptimizationMixin(object):
def assertFunctionContains(self, f, op, min=1, max=sys.maxint):
toposort = f.maker.env.toposort()
matches = [node for node in toposort if node.op == op]
assert (min <= len(matches) <= max), (toposort, matches, str(op), min, max)
assert (min <= len(matches) <= max), (toposort, matches,
str(op), len(matches), min, max)
def assertFunctionContains0(self, f, op):
return self.assertFunctionContains(f, op, min=0, max=0)
......@@ -104,6 +105,15 @@ class TestOptimizationMixin(object):
def assertFunctionContainsN(self, f, op, N):
return self.assertFunctionContains(f, op, min=N, max=N)
def assertFunctionContainsClass(self, f, op, min=1, max=sys.maxint):
toposort = f.maker.env.toposort()
matches = [node for node in toposort if isinstance(node.op, op)]
assert (min <= len(matches) <= max), (toposort, matches,
str(op), len(matches), min, max)
def assertFunctionContainsClassN(self, f, op, N):
return self.assertFunctionContainsClass(f, op, min=N, max=N)
def SkipTest(self, msg='Skip this test'):
raise SkipTest(msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论