提交 59a5dfbb authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4556 from abergeron/faster_incsub

Don't rebuild inplace add kernels all the time for GpuIncSubtensor.
...@@ -50,6 +50,15 @@ def init_dev(dev, name=None): ...@@ -50,6 +50,15 @@ def init_dev(dev, name=None):
if v[1] < 0: if v[1] < 0:
raise RuntimeError("Wrong minor API version for gpuarray:", v[1], raise RuntimeError("Wrong minor API version for gpuarray:", v[1],
"Please update libgpuarray/pygpu.") "Please update libgpuarray/pygpu.")
if len(v) < 3:
vpy = -1
else:
vpy = v[2]
vpye = 0
if vpy < vpye:
print("Wrong python API version for gpuarray:", vpy, "expected:", vpye,
"Some python ops may not work correctly and/or crash. "
"Consider updating pygpu.", file=sys.stderr)
global pygpu_activated global pygpu_activated
if dev not in init_dev.devmap: if dev not in init_dev.devmap:
ctx = pygpu.init(dev, ctx = pygpu.init(dev,
......
...@@ -6,7 +6,7 @@ import numpy ...@@ -6,7 +6,7 @@ import numpy
from six import integer_types from six import integer_types
from six.moves import StringIO from six.moves import StringIO
from theano import tensor, gof from theano import tensor, gof, Op
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
try: try:
...@@ -20,6 +20,19 @@ from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel, ...@@ -20,6 +20,19 @@ from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
iadd_reg = {}


def get_iadd(a, b):
    """Return a (cached) GpuElemwise that computes ``a = a + b`` in place.

    Kernels are cached per (a dtype, b dtype, context) in ``iadd_reg`` so
    they are not rebuilt on every call.
    """
    cache_key = (a.type.dtype, b.type.dtype, a.type.context)
    kern = iadd_reg.get(cache_key)
    if kern is None:
        dst = pygpu.elemwise.arg('a', a.type.dtype, read=True, write=True)
        src = pygpu.elemwise.arg('b', b.type.dtype, read=True)
        kern = pygpu.elemwise.GpuElemwise(a.type.context, "a = a + b",
                                          [dst, src], convert_f16=True)
        iadd_reg[cache_key] = kern
    return kern
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
""" """
Subtensor on the GPU. Subtensor on the GPU.
...@@ -217,9 +230,10 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -217,9 +230,10 @@ class GpuIncSubtensor(IncSubtensor):
# we've sliced out an N-D tensor with N > 0 # we've sliced out an N-D tensor with N > 0
if not self.set_instead_of_inc: if not self.set_instead_of_inc:
# sub_x += y # sub_x += y
pygpu.elemwise.ielemwise2(sub_x, '+', y, broadcast=False) iadd = get_iadd(node.inputs[0], node.inputs[1])
iadd(sub_x, y, broadcast=False)
else: else:
# sub_x += -sub_x + y # sub_x[...] = y
x.__setitem__(cdata, y) x.__setitem__(cdata, y)
else: else:
# scalar case # scalar case
...@@ -341,7 +355,7 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -341,7 +355,7 @@ class GpuIncSubtensor(IncSubtensor):
args[1].typecode = %(type2)s; args[1].typecode = %(type2)s;
args[1].flags = GE_READ; args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b", iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
2, args, %(nd)s, 0); 2, args, %(nd)s, GE_CONVERT_F16);
if (iadd == NULL) { if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support"); PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)s %(fail)s
...@@ -369,7 +383,7 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -369,7 +383,7 @@ class GpuIncSubtensor(IncSubtensor):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version() parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
if not parent_version: if not parent_version:
return return
return parent_version + (6,) return parent_version + (7,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
...@@ -447,11 +461,25 @@ if (err != GA_NO_ERROR) { ...@@ -447,11 +461,25 @@ if (err != GA_NO_ERROR) {
return (0,) return (0,)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1(Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
""" """
# Advertise float16 support to Theano's graph machinery — TODO confirm
# this matches the _f16_ok convention used elsewhere in the file.
_f16_ok = True
__props__ = ('inplace', 'set_instead_of_inc')
# Op params: presumably the GPU context the output lives on (get_params
# returns node.outputs[0].type.context).
params_type = gpu_context_type
def __init__(self, inplace=False, set_instead_of_inc=False):
    """
    Parameters
    ----------
    inplace : bool
        If True, the op writes its result into the first input
        (declared through ``destroy_map``).
    set_instead_of_inc : bool
        If True, indexed rows are overwritten with the update value
        instead of being incremented by it.
    """
    self.inplace = inplace
    self.set_instead_of_inc = set_instead_of_inc
    if inplace:
        # Output 0 reuses (destroys) input 0.
        self.destroy_map = {0: [0]}
def clone_inplace(self):
    """Return a copy of this op that overwrites its first input."""
    cls = self.__class__
    return cls(inplace=True, set_instead_of_inc=self.set_instead_of_inc)
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
ctx_name = infer_context_name(x, y) ctx_name = infer_context_name(x, y)
...@@ -480,21 +508,13 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -480,21 +508,13 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def getInplElemwiseAdditionKernel(self, a, b): def get_params(self, node):
if a.dtype == 'float16' or b.dtype == 'float16': return node.outputs[0].type.context
raise NotImplementedError('float16 is not supported by pygpu '
'elemwise')
a_arg = pygpu.tools.as_argument(a, 'a')
b_arg = pygpu.tools.as_argument(b, 'b')
args = [a_arg, b_arg]
oper = "a[i] = a[i] + %(b)s" % {'b': b_arg.expr()}
k = pygpu.elemwise.ElemwiseKernel(a.context, args, oper)
return k
# We can't use the parent version that loops on each index # We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the # as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case. # parent doesn't loop in that case.
def perform(self, node, inp, out_): def perform(self, node, inp, out_, ctx=None):
# TODO opt to make this inplace # TODO opt to make this inplace
x, y, idx = inp x, y, idx = inp
out, = out_ out, = out_
...@@ -507,8 +527,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -507,8 +527,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
if len(idx) == 0: if len(idx) == 0:
return return
# Make sure idx is not a GpuArray otherwise we cannot use its content # Make sure idx is not a GpuArray otherwise we cannot use its
# to index x and y # content to index x and y (This is because we serve as
# fallback for _dev20).
if isinstance(idx, gpuarray.GpuArray): if isinstance(idx, gpuarray.GpuArray):
idx = numpy.asarray(idx) idx = numpy.asarray(idx)
...@@ -521,7 +542,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -521,7 +542,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
for (j, i) in enumerate(idx): for (j, i) in enumerate(idx):
x[i] = y[j] x[i] = y[j]
else: else:
k = self.getInplElemwiseAdditionKernel(x[0], y[0]) k = get_iadd(node.inputs[0], node.inputs[1])
for (j, i) in enumerate(idx): for (j, i) in enumerate(idx):
k(x[i], y[j], broadcast=True) k(x[i], y[j], broadcast=True)
else: else:
...@@ -536,12 +557,119 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -536,12 +557,119 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
for i in idx: for i in idx:
x[i] = reshaped_y x[i] = reshaped_y
else: else:
k = self.getInplElemwiseAdditionKernel(x[0], reshaped_y) k = get_iadd(node.inputs[0], node.inputs[1])
for i in idx: for i in idx:
k(x[i], reshaped_y, broadcast=True) k(x[i], reshaped_y, broadcast=True)
def c_headers(self):
    """C headers needed by the generated code: numpy compatibility, the
    gpuarray error/array/elemwise APIs and the local helper header."""
    headers = ['<numpy_compat.h>', '<gpuarray/error.h>']
    headers.extend(['<gpuarray/array.h>', '<gpuarray/elemwise.h>'])
    headers.append('gpuarray_helper.h')
    return headers
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_support_code_struct(self, node, nodename):
    """Declare the per-node struct member holding the cached in-place-add
    GpuElemwise (initialized in c_init_code_struct)."""
    return '\nGpuElemwise *iadd;\n'
def c_init_code_struct(self, node, name, sub):
    """Return C code, emitted into the node-struct initialization, that
    builds the cached ``a += b`` GpuElemwise (``iadd``) used by c_code.

    The kernel is typed from the two inputs' typecodes, sized for the
    rank of the increment value (``node.inputs[1].ndim``) and created
    with GE_CONVERT_F16 so float16 data is handled.

    Fix: error message typo "intialize" -> "initialize".
    """
    return """
gpuelemwise_arg args[2] = {{0}};
args[0].name = "a";
args[0].typecode = %(type1)s;
args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b";
args[1].typecode = %(type2)s;
args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
                       2, args, %(nd)s, GE_CONVERT_F16);
if (iadd == NULL) {
  PyErr_SetString(PyExc_RuntimeError, "Could not initialize inplace add support");
  %(fail)s
}
""" % dict(ctx=sub['params'], fail=sub['fail'],
           type1=node.inputs[0].type.typecode,
           type2=node.inputs[1].type.typecode,
           nd=node.inputs[1].ndim)
def c_code(self, node, name, inputs, outputs, sub):
    """Generate C code for advanced inc/set-subtensor along axis 0.

    The emitted code copies x into the output (or aliases it when the op
    is inplace), then for every index j in the index vector takes a view
    of the output row and either sets it from y (GpuArray_setarray) or
    adds y's row with the struct-cached ``iadd`` GpuElemwise.  A y with
    leading dimension 1 is broadcast to every indexed row.
    """
    # Only the same-rank case is generated in C.
    # NOTE(review): rank-mismatched x/y presumably falls back to the
    # Python perform() implementation — confirm.
    if (node.inputs[0].ndim != node.inputs[1].ndim):
        raise NotImplementedError("This case does not have C code yet.")
    # NOTE(review): in the loop below, a set/inc failure sets a Python
    # error via PyErr_SetString but does not jump to %(fail)s, so the
    # loop keeps running — confirm this is intended.
    return """
PyGpuArrayObject *row_x, *row_y;
ssize_t start[%(nd)s], step[%(nd)s];
size_t num_indices, j;
int ret;
int broadcast_y;
for (j = 0; j < %(nd)s; j++) {
start[j] = 0;
step[j] = 1;
}
step[0] = 0;
num_indices = PyArray_SIZE(%(ind)s);
if ((num_indices - 1) > LONG_MAX) {
PyErr_Format(PyExc_AssertionError,
"num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
%(fail)s
}
if (!%(inplace)s) {
%(out)s = theano_try_copy(%(out)s, %(x)s);
if (%(out)s == NULL)
%(fail)s
} else {
Py_XDECREF(%(out)s);
%(out)s = %(x)s;
Py_INCREF(%(out)s);
}
broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
for (j = 0; j < num_indices; j++) {
start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
if (start[0] < 0)
start[0] += PyGpuArray_DIM(%(out)s, 0);
if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) {
PyErr_SetString(PyExc_IndexError, "index out of bounds");
%(fail)s;
}
row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step);
if (row_x == NULL)
%(fail)s;
if (broadcast_y)
start[0] = 0;
else
start[0] = j;
row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step);
if (row_y == NULL) {
Py_DECREF(row_x);
%(fail)s;
}
if (%(set_instead_of_inc)s) {
ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
} else {
void *args[2];
args[0] = (void *)&row_x->ga;
args[1] = (void *)&row_y->ga;
ret = GpuElemwise_call(iadd, args, GE_BROADCAST);
}
Py_DECREF(row_x);
Py_DECREF(row_y);
if (ret != GA_NO_ERROR)
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
}
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
           fail=sub['fail'], inplace=int(self.inplace),
           nd=node.inputs[0].ndim,
           set_instead_of_inc=int(self.set_instead_of_inc))
def c_code_cache_version(self):
    """Version tag for the compiled-code cache of this op's C code."""
    version = (0,)
    return version
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
GpuAdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu, but use function Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent. only avail on compute capability 2.0 and more recent.
...@@ -588,7 +716,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -588,7 +716,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out) return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (9,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>', return ['<numpy_compat.h>', '<gpuarray_helper.h>',
...@@ -601,10 +729,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -601,10 +729,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
ctx = self.get_params(node) ctx = self.get_params(node)
if ctx.kind != b'cuda': if ctx.kind != b'cuda':
raise NotImplementedError("cuda only") raise NotImplementedError("cuda only")
if (self.set_instead_of_inc or if (node.inputs[0].ndim != node.inputs[1].ndim or
node.inputs[0].ndim != node.inputs[1].ndim or
node.inputs[0].ndim != 2 or node.inputs[0].ndim != 2 or
ctx.bin_id[-2] < b'2'): int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
x = inputs[0] x = inputs[0]
...@@ -612,6 +739,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -612,6 +739,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
ind = inputs[2] ind = inputs[2]
out = outputs[0] out = outputs[0]
fail = sub['fail'] fail = sub['fail']
set_instead_of_inc = int(self.set_instead_of_inc)
inplace = int(self.inplace) inplace = int(self.inplace)
return """ return """
int err; int err;
...@@ -625,7 +753,7 @@ if (%(inplace)s) { ...@@ -625,7 +753,7 @@ if (%(inplace)s) {
if (!%(out)s) { if (!%(out)s) {
%(fail)s %(fail)s
} }
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s)) { if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(set_instead_of_inc)s)) {
%(fail)s %(fail)s
} }
""" % locals() """ % locals()
...@@ -651,7 +779,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s)) { ...@@ -651,7 +779,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s)) {
* This is an atomicAdd that works for doubles since that is not provided * This is an atomicAdd that works for doubles since that is not provided
* natively by cuda. * natively by cuda.
*/ */
__device__ double atomicAdd(ga_double* address, ga_double val) { __device__ ga_double atomicAdd(ga_double* address, ga_double val) {
unsigned long long int* address_as_ull = unsigned long long int* address_as_ull =
(unsigned long long int*)address; (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed; unsigned long long int old = *address_as_ull, assumed;
...@@ -664,6 +792,11 @@ __device__ double atomicAdd(ga_double* address, ga_double val) { ...@@ -664,6 +792,11 @@ __device__ double atomicAdd(ga_double* address, ga_double val) {
return __longlong_as_double(old); return __longlong_as_double(old);
} }
/*
 * atomicExch for doubles: CUDA only provides the unsigned long long
 * overload, so exchange the raw 64-bit pattern and reinterpret the
 * previous value back to a double before returning it (a plain return
 * would numerically convert the ull bit pattern instead).
 */
__device__ ga_double atomicExch(ga_double *address, ga_double val) {
  return __longlong_as_double(
      atomicExch((unsigned long long int *)address,
                 __double_as_longlong(val)));
}
/* /*
* This is a version of atomicAdd that works for half-floats. It may * This is a version of atomicAdd that works for half-floats. It may
* read and write 2 bytes more than the size of the array if the array * read and write 2 bytes more than the size of the array if the array
...@@ -688,6 +821,19 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -688,6 +821,19 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
((ga_size)addr & 2) ? 0x4432 : 0x4410); ((ga_size)addr & 2) ? 0x4432 : 0x4410);
} }
/*
 * atomicExch for 16-bit halfs.  There is no native 16-bit atomic, so we
 * CAS on the aligned 32-bit word containing the half and splice the new
 * 16 bits in with __byte_perm.  Like the half atomicAdd above, this may
 * read/write 2 bytes past the array when the element sits in the low
 * half of the last 32-bit word.
 */
__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, new_;
old = *base;
do {
assumed = old;
/* 0x5410: keep old's low 16 bits, write val into the high 16 bits;
   0x3254: keep old's high 16 bits, write val into the low 16 bits. */
new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
/* Extract and return the previous 16-bit value from the old word. */
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
KERNEL void k_vector_add_fast(const ga_size numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX, const ga_size numColsX,
const ga_ssize stridesX0, const ga_ssize stridesX0,
...@@ -704,6 +850,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -704,6 +850,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
const ga_ssize stridesIndices, const ga_ssize stridesIndices,
%(type_ind)s *indices_arr, %(type_ind)s *indices_arr,
const ga_size offset_indices_arr, const ga_size offset_indices_arr,
const int set_instead_of_inc,
ga_int *err) ga_int *err)
{ {
X = (%(type_x)s *)(((char *)X)+offset_X); X = (%(type_x)s *)(((char *)X)+offset_X);
...@@ -718,7 +865,13 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -718,7 +865,13 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
x_row += numRowsX; x_row += numRowsX;
ga_ssize y_row = i; ga_ssize y_row = i;
if (x_row < numRowsX && x_row >= 0) { if (x_row < numRowsX && x_row >= 0) {
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); if (set_instead_of_inc) {
atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
Y[(y_row * stridesY0) + (j * stridesY1)]);
}
} else { } else {
*err = 1; *err = 1;
} }
...@@ -730,7 +883,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -730,7 +883,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
params = [ params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp', gpuarray.GpuArray] 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'int',
gpuarray.GpuArray]
return [Kernel(code=code, name=kname, params=params, return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
...@@ -748,7 +902,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -748,7 +902,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """ return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self, int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr) PyGpuArrayObject *indices_arr,
const int set_instead_of_inc)
{ {
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1}; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
...@@ -784,6 +939,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -784,6 +939,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
(void *)&stride_ind, (void *)&stride_ind,
(void *)indices_arr->ga.data, (void *)indices_arr->ga.data,
(void *)&indices_arr->ga.offset, (void *)&indices_arr->ga.offset,
(void *)&set_instead_of_inc,
(void *)errbuf}; (void *)errbuf};
err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
......
...@@ -56,3 +56,32 @@ def test_advinc_subtensor1(): ...@@ -56,3 +56,32 @@ def test_advinc_subtensor1():
rep = xval.copy() rep = xval.copy()
rep[[0, 2]] += yval rep[[0, 2]] += yval
assert numpy.allclose(rval, rep) assert numpy.allclose(rval, rep)
def test_incsub_f16():
    """GpuAdvancedIncSubtensor1 and GpuIncSubtensor must handle float16.

    Builds a float16 shared input, increments it through both the
    advanced (list-of-indices) and basic (slice) inc_subtensor paths,
    checks the GPU op is used exactly once and compares against the
    numpy reference.
    """
    shp = (3, 3)
    shared = gpuarray_shared_constructor
    xval = numpy.arange(numpy.prod(shp), dtype='float16').reshape(shp) + 1
    # numpy.full replaces the empty-then-fill idiom.
    yval = numpy.full((2,) + shp[1:], 2, dtype='float16')
    x = shared(xval, name='x')
    y = tensor.tensor(dtype='float16',
                      broadcastable=(False,) * len(shp),
                      name='y')

    # Advanced (list-of-indices) increment must run on the GPU.
    expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
    f = theano.function([y], expr, mode=mode_with_gpu)
    assert sum(isinstance(node.op, GpuAdvancedIncSubtensor1)
               for node in f.maker.fgraph.toposort()) == 1
    rval = f(yval)
    rep = xval.copy()
    rep[[0, 2]] += yval
    assert numpy.allclose(rval, rep)

    # Basic slice increment must also run on the GPU.
    expr = tensor.inc_subtensor(x[1:], y)
    f = theano.function([y], expr, mode=mode_with_gpu)
    assert sum(isinstance(node.op, GpuIncSubtensor)
               for node in f.maker.fgraph.toposort()) == 1
    rval = f(yval)
    rep = xval.copy()
    rep[1:] += yval
    assert numpy.allclose(rval, rep)
...@@ -301,20 +301,14 @@ class GpuArrayType(Type): ...@@ -301,20 +301,14 @@ class GpuArrayType(Type):
raise NotImplementedError( raise NotImplementedError(
"GpuArrayType.values_eq_approx() don't implemented the" "GpuArrayType.values_eq_approx() don't implemented the"
" allow_remove_inf and allow_remove_nan parameter") " allow_remove_inf and allow_remove_nan parameter")
if a.dtype == 'float16' or b.dtype == 'float16':
an = numpy.asarray(a)
bn = numpy.asarray(b)
return tensor.TensorType.values_eq_approx(
an, bn, allow_remove_inf=allow_remove_inf,
allow_remove_nan=allow_remove_nan, rtol=rtol, atol=atol)
atol_, rtol_ = theano.tensor.basic._get_atol_rtol(a, b) atol_, rtol_ = theano.tensor.basic._get_atol_rtol(a, b)
if rtol is not None: if rtol is not None:
rtol_ = rtol rtol_ = rtol
if atol is not None: if atol is not None:
atol_ = atol atol_ = atol
res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'), res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
op_tmpl="res[i] = (fabs(%%(a)s - %%(b)s) <" op_tmpl="res = (fabs(a - b) <"
"(%(atol_)s + %(rtol_)s * fabs(%%(b)s)))" % "(%(atol_)s + %(rtol_)s * fabs(b)))" %
locals()) locals())
ret = numpy.asarray(res).all() ret = numpy.asarray(res).all()
if ret: if ret:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论