提交 975e0d2b authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3227 from abergeron/gpua_advsub1

Implement GpuAdvancedSubtensor1 for gpuarray
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <string.h> #include <string.h>
#include <gpuarray_api.h> #include <gpuarray_api.h>
#include <numpy_compat.h> #include <numpy_compat.h>
#include <gpuarray/util.h>
static int theano_size_check(PyGpuArrayObject *a, unsigned int nd, static int theano_size_check(PyGpuArrayObject *a, unsigned int nd,
const size_t *dims, int typecode) { const size_t *dims, int typecode) {
...@@ -42,9 +44,14 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out, ...@@ -42,9 +44,14 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
return out; return out;
} }
/* This is guaranteed to work and return the raw CUDA/OpenCL object on static inline void *PyGpuArray_DEV_DATA(PyGpuArrayObject *a) {
/* This is guaranteed to work and return the raw CUDA/OpenCL object on
* all recent (as of June 2015) version of libgpuarray. This is also * all recent (as of June 2015) version of libgpuarray. This is also
* promised to keep working in future versions. */ * promised to keep working in future versions. */
#define PyGpuArray_DEV_DATA(ary) (*(void **)((ary)->ga.data)) char * p = *((char **)a->ga.data);
/* This only works on cuda since we have a real pointer. */
return (void *)(p + a->ga.offset);
}
#endif #endif
...@@ -35,6 +35,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, ...@@ -35,6 +35,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY) GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -488,6 +489,12 @@ def local_gpua_incsubtensor(node): ...@@ -488,6 +489,12 @@ def local_gpua_incsubtensor(node):
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node):
return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node): def local_gpua_advanced_incsubtensor(node):
...@@ -496,7 +503,16 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -496,7 +503,16 @@ def local_gpua_advanced_incsubtensor(node):
if pygpu.get_default_context().kind != "cuda": if pygpu.get_default_context().kind != "cuda":
return None return None
x, y = node.inputs[0:2] x, y, ilist = node.inputs
# Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype):
dtype = scalar.upcast(x.type.dtype, y.type.dtype)
if x.type.dtype != dtype:
x = tensor.cast(x, dtype)
if y.type.dtype != dtype:
y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties device_properties = theano.sandbox.cuda.device_properties
...@@ -504,11 +520,11 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -504,11 +520,11 @@ def local_gpua_advanced_incsubtensor(node):
compute_capability = device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2): if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1( return [GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
else: else:
return GpuAdvancedIncSubtensor1_dev20( return [GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
@register_opt('fast_compile') @register_opt('fast_compile')
......
from __future__ import print_function from __future__ import print_function
import copy import copy
import numpy
import os import os
import numpy
import theano import theano
from theano import tensor, gof, Op, config from theano import tensor, gof, config
from theano.gof.utils import MethodNotDefined
from six.moves import StringIO from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace import theano.tensor.inplace
...@@ -18,7 +21,6 @@ except ImportError: ...@@ -18,7 +21,6 @@ except ImportError:
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel) from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .comp import NVCC_compiler
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
...@@ -243,15 +245,15 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -243,15 +245,15 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
if sub_x.shape: if sub_x.shape:
# we've sliced out an N-D tensor with N > 0 # we've sliced out an N-D tensor with N > 0
if not self.set_instead_of_inc: if not self.set_instead_of_inc:
#sub_x += y # sub_x += y
pygpu.elemwise.ielemwise2(sub_x, '+', y, broadcast=False) pygpu.elemwise.ielemwise2(sub_x, '+', y, broadcast=False)
else: else:
#sub_x += -sub_x + y # sub_x += -sub_x + y
x.__setitem__(cdata, y) x.__setitem__(cdata, y)
else: else:
# scalar case # scalar case
if not self.set_instead_of_inc: if not self.set_instead_of_inc:
#x.__setitem__(cdata, sub_x + y) # x.__setitem__(cdata, sub_x + y)
tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x, tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x,
broadcast=False) broadcast=False)
x.__setitem__(cdata, tmp) x.__setitem__(cdata, tmp)
...@@ -261,9 +263,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -261,9 +263,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
owner = getattr(self.__dict__, "owner", None) owner = getattr(self, "owner", None)
if owner: if owner:
op.create_iadd_node(owner) self.create_iadd_node(owner)
def __getstate__(self): def __getstate__(self):
d = copy.copy(self.__dict__) d = copy.copy(self.__dict__)
...@@ -405,6 +407,77 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -405,6 +407,77 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
return parent_version + elemwise_version + (2,) return parent_version + elemwise_version + (2,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
def make_node(self, x, ilist):
x_ = as_gpuarray_variable(x)
ilist__ = tensor.as_tensor_variable(ilist)
if ilist__.type.dtype[:3] not in ('int', 'uin'):
raise TypeError('index must be integers')
if ilist__.type.dtype != 'int64':
ilist__ = tensor.cast(ilist__, 'int64')
ilist_ = as_gpuarray_variable(ilist__)
if ilist_.type.dtype != 'int64':
raise TypeError('index must be int64')
if ilist_.type.ndim != 1:
raise TypeError('index must be a vector')
if x_.type.ndim == 0:
raise TypeError('cannot index into a scalar')
bcast = ilist_.broadcastable + x_.broadcastable[1:]
return gof.Apply(self, [x_, ilist_],
[GpuArrayType(dtype=x.dtype,
broadcastable=bcast)()])
def perform(self, node, inp, out_):
raise NotImplementedError()
def c_support_code(self):
return """
int take1_match_dims(GpuArray *a, GpuArray *v) {
if (a->nd != v->nd) return 0;
for (unsigned int i = 1; i < v->nd; i++) {
if (a->dimensions[i] != v->dimensions[i]) return 0;
}
return 1;
}
"""
def c_code(self, node, name, inputs, outputs, sub):
return """
int err;
if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
%(out)s->ga.dimensions[0] != %(idx)s->ga.dimensions[0] ||
!take1_match_dims(&%(out)s->ga, &%(v)s->ga)) {
size_t tmp;
Py_XDECREF(%(out)s);
/* This is a dirty hack to avoid an extra alloc */
tmp = %(v)s->ga.dimensions[0];
%(v)s->ga.dimensions[0] = %(idx)s->ga.dimensions[0];
%(out)s = pygpu_empty(%(v)s->ga.nd, %(v)s->ga.dimensions, %(v)s->ga.typecode,
GA_C_ORDER, %(v)s->context, Py_None);
%(v)s->ga.dimensions[0] = tmp; // Don't remove this line
}
err = GpuArray_take1(&%(out)s->ga, &%(v)s->ga, &%(idx)s->ga, 1);
if (err != GA_NO_ERROR) {
if (err == GA_VALUE_ERROR) {
PyErr_SetString(PyExc_IndexError, "Index out of bounds.");
} else {
PyErr_SetString(PyExc_RuntimeError, Gpu_error(%(v)s->context->ops,
%(v)s->context->ctx, err));
}
%(fail)s
}
""" % dict(out=outputs[0], v=inputs[0], idx=inputs[1], fail=sub['fail'])
def c_code_cache_version(self):
return (0,)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
...@@ -487,7 +560,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -487,7 +560,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
reshaped_y = y.reshape(y.shape[1:]) reshaped_y = y.reshape(y.shape[1:])
else: else:
nb_dims_to_add = (x.ndim - 1) - y.ndim nb_dims_to_add = (x.ndim - 1) - y.ndim
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape) reshaped_y = y.reshape((1,) * nb_dims_to_add + y.shape)
if self.set_instead_of_inc: if self.set_instead_of_inc:
for i in idx: for i in idx:
...@@ -537,25 +610,22 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -537,25 +610,22 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<numpy_compat.h>', '<gpuarray_helper.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>'] '<gpuarray/types.h>']
def c_header_dirs(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root cuda_root = config.cuda.root
res = [os.path.dirname(__file__)]
if cuda_root: if cuda_root:
return [os.path.join(cuda_root, 'include')] res.append(os.path.join(cuda_root, 'include'))
return res
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
...@@ -574,19 +644,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -574,19 +644,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
fail = sub['fail'] fail = sub['fail']
inplace = int(self.inplace) inplace = int(self.inplace)
return """ return """
int err;
if (%(inplace)s) {
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
if (!%(inplace)s) {
%(out)s = (PyGpuArrayObject*)pygpu_copy(%(x)s, GA_C_ORDER);
} else {
%(out)s = %(x)s; %(out)s = %(x)s;
Py_XINCREF(%(out)s); Py_INCREF(%(out)s);
} } else {
%(out)s = theano_try_copy(%(out)s, %(x)s);
GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s); }
if (!%(out)s) {
if (!%(out)s) {
%(fail)s %(fail)s
} }
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s)) {
%(fail)s
}
""" % locals() """ % locals()
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
...@@ -598,7 +669,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -598,7 +669,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
itemsize_y = numpy.dtype(dtype_y).itemsize itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize
flags=Kernel.get_flags(dtype_x, dtype_y, dtype_ind) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x) type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y) type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind) type_ind = gpuarray.dtype_to_ctype(dtype_ind)
...@@ -606,6 +677,22 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -606,6 +677,22 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
kname = "k_vector_add_fast" kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
code = """ code = """
/*
* This is an atomicAdd that works for doubles since that is not provided
* natively by cuda.
*/
__device__ double atomicAdd(ga_double* address, ga_double val) {
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
/* /*
* This is a version of atomicAdd that works for half-floats. It may * This is a version of atomicAdd that works for half-floats. It may
...@@ -646,7 +733,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -646,7 +733,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
const ga_size numIndices, const ga_size numIndices,
const ga_ssize stridesIndices, const ga_ssize stridesIndices,
%(type_ind)s *indices_arr, %(type_ind)s *indices_arr,
const ga_size offset_indices_arr) const ga_size offset_indices_arr,
ga_int *err)
{ {
X = (%(type_x)s *)(((char *)X)+offset_X); X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y); Y = (%(type_y)s *)(((char *)Y)+offset_Y);
...@@ -655,11 +743,15 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -655,11 +743,15 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
{ {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{ {
int x_row = indices_arr[i * stridesIndices]; ga_ssize x_row = indices_arr[i * stridesIndices];
if(x_row < 0) if (x_row < 0)
x_row += numRowsX; x_row += numRowsX;
int y_row = i; ga_ssize y_row = i;
if (x_row < numRowsX && x_row >= 0) {
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
} else {
*err = 1;
}
} }
} }
return; return;
...@@ -668,8 +760,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -668,8 +760,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
params = [ params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp' 'uintp', 'intp', gpuarray.GpuArray, 'uintp', gpuarray.GpuArray]
]
return [Kernel(code=code, name=kname, params=params, return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
...@@ -683,27 +774,25 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -683,27 +774,25 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
itemsize_ind = numpy.dtype(dtype_ind).itemsize itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """ return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
void GpuArray_vector_add_fast(PyGpuArrayObject* py_self, int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr) PyGpuArrayObject *indices_arr)
{ {
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1}; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf;
int err, kerr = 0;
if (threads_per_block[0] > 0 && n_blocks[0] > 0) { if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
err = py_self->ga.ops->property(NULL, py_self->ga.data, NULL,
GA_CTX_PROP_ERRBUF, &errbuf);
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Can't fetch error buffer");
return 1;
}
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s; ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s; ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s; ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
...@@ -724,11 +813,27 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -724,11 +813,27 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
(void *)&PyGpuArray_DIMS(indices_arr)[0], (void *)&PyGpuArray_DIMS(indices_arr)[0],
(void *)&stride_ind, (void *)&stride_ind,
(void *)indices_arr->ga.data, (void *)indices_arr->ga.data,
(void *)&indices_arr->ga.offset}; (void *)&indices_arr->ga.offset,
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); (void *)errbuf};
%(err_check)s err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(sync)s if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
return 1;
}
err = py_self->ga.ops->buffer_read(&kerr, errbuf, 0, sizeof(int));
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Can't read error buffer");
return 1;
} }
if (kerr != 0) {
PyErr_SetString(PyExc_IndexError, "Index out of bounds");
kerr = 0;
py_self->ga.ops->buffer_write(errbuf, 0, &kerr, sizeof(int));
return 1;
}
}
return 0;
} }
""" % locals() """ % locals()
...@@ -7,6 +7,7 @@ from theano.tensor.tests import test_subtensor ...@@ -7,6 +7,7 @@ from theano.tensor.tests import test_subtensor
from ..basic_ops import HostFromGpu, GpuFromHost from ..basic_ops import HostFromGpu, GpuFromHost
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1) GpuAdvancedIncSubtensor1)
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
...@@ -24,6 +25,7 @@ class G_subtensor(test_subtensor.T_subtensor): ...@@ -24,6 +25,7 @@ class G_subtensor(test_subtensor.T_subtensor):
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor, inc_sub=GpuIncSubtensor,
adv_sub1=GpuAdvancedSubtensor1,
adv_incsub1=GpuAdvancedIncSubtensor1, adv_incsub1=GpuAdvancedIncSubtensor1,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
......
...@@ -515,8 +515,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -515,8 +515,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertRaises(IndexError, g, shp) self.assertRaises(IndexError, g, shp)
def test_adv_sub1_broadcast(self): def test_adv_sub1_broadcast(self):
ones = numpy.ones((1, 3), dtype=self.dtype) v = numpy.arange(3, dtype=self.dtype).reshape((1, 3))
n = self.shared(ones * 5, broadcastable=(True, False)) n = self.shared(v*5, broadcastable=(True, False))
idx = tensor.lvector() idx = tensor.lvector()
t = n[idx] t = n[idx]
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1)) self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
...@@ -529,10 +529,10 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -529,10 +529,10 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(isinstance(topo_[0].op, self.adv_sub1)) self.assertTrue(isinstance(topo_[0].op, self.adv_sub1))
f_0 = f([0]) f_0 = f([0])
self.assertTrue(f_0.shape == (1, 3)) self.assertTrue(f_0.shape == (1, 3))
self.assertTrue(numpy.allclose(f_0, ones[0] * 5)) self.assertTrue(numpy.allclose(f_0, v*5))
f_00 = f([0, 0]) f_00 = f([0, 0])
self.assertTrue(f_00.shape == (2, 3)) self.assertTrue(f_00.shape == (2, 3))
self.assertTrue(numpy.allclose(f_00, 5)) self.assertTrue(numpy.allclose(f_00, v*5))
self.assertRaises(IndexError, f, [0, 1]) self.assertRaises(IndexError, f, [0, 1])
# Test the gradient # Test the gradient
......
...@@ -160,7 +160,6 @@ whitelist_flake8 = [ ...@@ -160,7 +160,6 @@ whitelist_flake8 = [
"sandbox/linalg/tests/test_linalg.py", "sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py", "sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py", "sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/subtensor.py",
"sandbox/gpuarray/elemwise.py", "sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py", "sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论