提交 bc8c9cb6 authored 作者: Frédéric Bastien

Merge pull request #2894 from abergeron/fix_advsub

Fix problems with advsub in the gpuarray backend
...@@ -406,49 +406,47 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -406,49 +406,47 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
x, y, idx = inp x, y, idx = inp
out, = out_ out, = out_
if not self.inplace:
x = x.copy()
out[0] = x
if len(idx) == 0:
return
# Make sure idx is not a GpuArray otherwise we cannot use its content # Make sure idx is not a GpuArray otherwise we cannot use its content
# to index x and y # to index x and y
if isinstance(idx, gpuarray.GpuArray): if isinstance(idx, gpuarray.GpuArray):
idx = numpy.asarray(idx) idx = numpy.asarray(idx)
if not self.inplace:
x = x.copy()
if self.set_instead_of_inc:
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] = y[j]
else:
for i in idx:
x[i] = y
else:
# If `y` has as many dimensions as `x`, then we want to iterate # If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be # jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`. # broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node` if y.ndim == x.ndim and y.shape[0] != 1:
if len(idx) == 0:
pass
# if len(y) == 1, we need to broadcast it.
elif y.ndim == x.ndim and len(y) != 1:
assert len(y) == len(idx) assert len(y) == len(idx)
if self.set_instead_of_inc:
for (j, i) in enumerate(idx):
x[i] = y[j]
else:
k = self.getInplElemwiseAdditionKernel(x[0], y[0]) k = self.getInplElemwiseAdditionKernel(x[0], y[0])
for (j, i) in enumerate(idx): for (j, i) in enumerate(idx):
k(x[i], y[j], broadcast=False) k(x[i], y[j], broadcast=True)
else:
if y.ndim == x.ndim:
# First dim is always 1 in this case.
reshaped_y = y.reshape(y.shape[1:])
else: else:
nb_dims_to_add = (x.ndim - 1) - y.ndim nb_dims_to_add = (x.ndim - 1) - y.ndim
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape) reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
k = self.getInplElemwiseAdditionKernel(x[0],
reshaped_y)
if self.set_instead_of_inc:
for i in idx:
x[i] = reshaped_y
else:
k = self.getInplElemwiseAdditionKernel(x[0], reshaped_y)
for i in idx: for i in idx:
k(x[i], reshaped_y, broadcast=True) k(x[i], reshaped_y, broadcast=True)
out[0] = x
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function """Implement AdvancedIncSubtensor1 on the gpu, but use function
...@@ -489,7 +487,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -489,7 +487,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
...@@ -583,17 +581,17 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -583,17 +581,17 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
dim3 n_threads(num_threads_per_block); dim3 n_threads(num_threads_per_block);
k_vector_add_fast<<<n_blocks, n_threads>>>( k_vector_add_fast<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(py_self)[0], PyGpuArray_DIM(py_self, 0),
PyGpuArray_DIMS(py_self)[1], PyGpuArray_DIM(py_self, 1),
PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s, PyGpuArray_STRIDE(py_self, 0) / %(itemsize_x)s,
PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s, PyGpuArray_STRIDE(py_self, 1) / %(itemsize_x)s,
(npy_%(dtype_x)s*)( (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(py_self->ga.data)) + ((char *)cuda_get_ptr(py_self->ga.data)) +
py_self->ga.offset), py_self->ga.offset),
PyGpuArray_DIMS(py_other)[0], PyGpuArray_DIM(py_other, 0),
PyGpuArray_DIMS(py_other)[1], PyGpuArray_DIM(py_other, 1),
PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s, PyGpuArray_DIM(py_other, 0) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 0) / %(itemsize_y)s,
PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s, PyGpuArray_DIM(py_other, 1) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 1) / %(itemsize_y)s,
(npy_%(dtype_x)s*)( (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(py_other->ga.data)) + ((char *)cuda_get_ptr(py_other->ga.data)) +
py_other->ga.offset), py_other->ga.offset),
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论