提交 fff9c1f7 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Mostly adapted Op and tests to new backend. TODO: Remove faulty python…

Mostly adapted Op and tests to new backend. TODO: Remove faulty python implementation from _dev20 version of op
上级 6936dd28
...@@ -10,6 +10,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ...@@ -10,6 +10,7 @@ from theano.gof import (local_optimizer, EquilibriumDB,
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.cuda.basic_ops import device_properties
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host, gpu_from_host,
...@@ -25,7 +26,9 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi ...@@ -25,7 +26,9 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi
GpuSoftmax) GpuSoftmax)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda) GpuDimShuffle, GpuCAReduceCuda)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from theano.sandbox.gpuarray.type import GpuArrayConstant from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
...@@ -241,6 +244,23 @@ def local_gpua_incsubtensor(node): ...@@ -241,6 +244,23 @@ def local_gpua_incsubtensor(node):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace, return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc, node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
@register_opt()
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node):
    """Lift ``AdvancedIncSubtensor1`` to its GPU counterpart.

    Selects ``GpuAdvancedIncSubtensor1_dev20`` -- which per its class
    docstring is only available on compute capability 2.0 and newer --
    when the device supports it and both operands are matrices;
    otherwise falls back to the generic ``GpuAdvancedIncSubtensor1``.
    """
    x, y = node.inputs[0:2]
    set_instead_of_inc = node.op.set_instead_of_inc

    # NOTE(review): this queries the old ``sandbox.cuda`` backend for the
    # active device even though the optimizer targets the new gpuarray
    # backend -- presumably a transitional shim; confirm.
    active_device_no = theano.sandbox.cuda.active_device_number()
    compute_capability = device_properties(active_device_no)['major']

    if compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
        return GpuAdvancedIncSubtensor1(
            set_instead_of_inc=set_instead_of_inc)
    return GpuAdvancedIncSubtensor1_dev20(
        set_instead_of_inc=set_instead_of_inc)
@register_opt() @register_opt()
......
...@@ -4,9 +4,10 @@ import StringIO ...@@ -4,9 +4,10 @@ import StringIO
import numpy import numpy
import theano import theano
from theano import tensor, gof from theano import tensor, gof, Op
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.sandbox.cuda.basic_ops import device_properties
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
...@@ -358,13 +359,13 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -358,13 +359,13 @@ class GpuIncSubtensor(IncSubtensor):
return parent_version + elemwise_version + (0,) return parent_version + elemwise_version + (0,)
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
""" """
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
x_ = as_cuda_ndarray_variable(x) x_ = as_gpuarray_variable(x)
y_ = as_cuda_ndarray_variable(y) y_ = as_gpuarray_variable(y)
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
...@@ -380,7 +381,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -380,7 +381,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
# the caller should have made a copy of x len(ilist) times # the caller should have made a copy of x len(ilist) times
raise TypeError('cannot index into a broadcastable dimension') raise TypeError('cannot index into a broadcastable dimension')
return Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
# CudaNdarray_Subscript() doesn't support Advanced slicing. # CudaNdarray_Subscript() doesn't support Advanced slicing.
# But we can't use the parent version that loops on each index # But we can't use the parent version that loops on each index
...@@ -393,13 +394,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -393,13 +394,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
if not self.inplace: if not self.inplace:
x = x.copy() x = x.copy()
if self.set_instead_of_inc: if self.set_instead_of_inc:
# CudaNdarray __setitem__ doesn't do broadcast nor support
# list of index.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node` assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim: if y.ndim == x.ndim:
assert len(y) == len(idx) assert len(y) == len(idx)
for (j, i) in enumerate(idx): for (j, i) in enumerate(idx):
x[i] = y[j] try:
x[i] = y[j]
except:
import pdb
pdb.set_trace()
else: else:
for i in idx: for i in idx:
x[i] = y x[i] = y
...@@ -410,131 +413,36 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -410,131 +413,36 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
assert y.ndim <= x.ndim # Should be guaranteed by `make_node` assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim: if y.ndim == x.ndim:
assert len(y) == len(idx) assert len(y) == len(idx)
for (j, i) in enumerate(idx): for (j, i) in enumerate(idx):
x[i] += y[j] #x[i] += y[j]
pygpu.elemwise.ielemwise2(x[i], '+', y[j], broadcast=False)
else: else:
for i in idx: for i in idx:
x[i] += y #x[i] += y
out[0] = x nb_dims_to_add = (x[i].ndim - y.ndim)
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
def c_code_cache_version(self): pygpu.elemwise.ielemwise2(x[i], '+', reshaped_y,
return (3,) broadcast=True)
def c_code(self, node, name, inputs, outputs, sub):
if (self.set_instead_of_inc) or \
(node.inputs[0].ndim != node.inputs[1].ndim):
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
y = inputs[1]
ind = inputs[2]
out = outputs[0]
fail = sub['fail']
inplace = int(self.inplace)
return """ out[0] = x
PyObject *x_obj, *y_obj, *row_x, *row_y;
PyObject *x_rowind_obj, *y_rowind_obj;
dtype_%(ind)s *p_index;
int num_indices, j;
int ret;
num_indices = PyArray_SIZE(%(ind)s);
if ((num_indices - 1) > LONG_MAX) {
PyErr_Format(PyExc_AssertionError,
"num_indices %%d exceeds LONG_MAX + 1", num_indices);
%(fail)s;
}
Py_XDECREF(%(out)s);
if (!%(inplace)s) {
%(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
} else {
%(out)s = %(x)s;
Py_XINCREF(%(out)s);
}
x_obj = (PyObject*)CudaNdarray_View(%(out)s);
y_obj = (PyObject*)CudaNdarray_View(%(y)s);
for (j = 0;j < num_indices; j++) {
p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
x_rowind_obj = PyInt_FromLong(*p_index);
if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
PyErr_Format(PyExc_AssertionError,
"Error in converting row index to integer from long");
// Dec Ref what ever we have increfed or allocated so far
// We deallocate objects exactly in the reverse order they were allocated.
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
y_rowind_obj = PyInt_FromLong(j);
row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
if ((row_x == NULL) || (row_y == NULL)) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
if (ret != 0) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
}
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
if (!%(out)s) {
%(fail)s
}
""" % locals()
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function """Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent. only avail on compute capability 2.0 and more recent.
""" """
def __init__(self, inplace=False, set_instead_of_inc=False):
    # The python implementation in the parent class is not applicable here
    # (this subclass relies on its C/CUDA code path instead), so the
    # constructor only forwards the flags to the parent.
    GpuAdvancedIncSubtensor1.__init__(self, inplace, set_instead_of_inc)
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure """It defer from GpuAdvancedIncSubtensor1 in that it make sure
the index are of type long. the index are of type long.
""" """
x_ = as_cuda_ndarray_variable(x) x_ = as_gpuarray_variable(x)
y_ = as_cuda_ndarray_variable(y) y_ = as_gpuarray_variable(y)
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = as_gpuarray_variable(ilist)
convert_map = {8: tensor.basic._convert_to_int8,
16: tensor.basic._convert_to_int16,
32: tensor.basic._convert_to_int32,
64: tensor.basic._convert_to_int64
}
intwidth = theano.gof.compiledir.python_int_bitwidth()
ilist_ = convert_map[intwidth](ilist_)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim assert x_.type.ndim >= y_.type.ndim
...@@ -549,10 +457,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -549,10 +457,20 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
# the caller should have made a copy of x len(ilist) times # the caller should have made a copy of x len(ilist) times
raise TypeError('cannot index into a broadcastable dimension') raise TypeError('cannot index into a broadcastable dimension')
return Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (2,)
def c_headers(self):
    # Headers required by the CUDA kernel emitted in
    # c_support_code_apply: cuda.h for the runtime API, the compyte
    # headers for gpuarray glue (e.g. cuda_get_ptr), and numpy_compat.h
    # for the npy_* dtype aliases used in the kernel signature.
    return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
            '<compyte/ext_cuda.h>']

def c_compiler(self):
    # The generated support code contains a __global__ kernel, so it
    # must be built with nvcc rather than the host C compiler.
    return NVCC_compiler

def c_init_code(self):
    # Presumably initialises the CUDA extension hooks once at module
    # load -- declared by the compyte ext_cuda headers above.
    return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
...@@ -572,13 +490,13 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -572,13 +490,13 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return """ return """
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
if (!%(inplace)s) { if (!%(inplace)s) {
%(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s); %(out)s = (PyGpuArrayObject*)pygpu_copy(%(x)s, GA_C_ORDER);
} else { } else {
%(out)s = %(x)s; %(out)s = %(x)s;
Py_XINCREF(%(out)s); Py_XINCREF(%(out)s);
} }
CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s); GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
if (!%(out)s) { if (!%(out)s) {
%(fail)s %(fail)s
...@@ -586,26 +504,35 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -586,26 +504,35 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
return """ return """
__global__ void k_vector_add_fast(int numRowsX, __global__ void k_vector_add_fast(int numRowsX,
int numColsX, int numColsX,
int stridesX0, int stridesX0,
int stridesX1, int stridesX1,
float *X, npy_%(dtype_x)s *X,
int numRowsY, int numRowsY,
int numColsY, int numColsY,
int stridesY0, int stridesY0,
int stridesY1, int stridesY1,
float *Y , npy_%(dtype_y)s *Y,
long *d_indices_arr, int numIndices,
int num) int stridesIndices,
npy_%(dtype_ind)s *indices_arr)
{ {
for (int i = (blockIdx.x); i < num; i += gridDim.x) for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{ {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{ {
int x_row = d_indices_arr[i]; int x_row = indices_arr[i * stridesIndices];
int y_row = i; int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
} }
...@@ -613,49 +540,39 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -613,49 +540,39 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return; return;
} }
void CudaNdarray_vector_add_fast(CudaNdarray* py_self, CudaNdarray* py_other, PyArrayObject *indices_arr) void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr)
{ {
const int *shapeX = CudaNdarray_HOST_DIMS(py_self); int num_threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1],
const int *shapeY = CudaNdarray_HOST_DIMS(py_other); (size_t)256);
const int *strX = CudaNdarray_HOST_STRIDES(py_self); int num_blocks = std::min(PyGpuArray_SIZE(indices_arr),
const int *strY = CudaNdarray_HOST_STRIDES(py_other); (size_t)4096);
unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
unsigned int numcolsX = shapeX[1];
unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int num_blocks = std::min(size ,(unsigned int)NUM_VECTOR_OP_BLOCKS);
dim3 n_blocks(num_blocks); dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block); dim3 n_threads(num_threads_per_block);
long *d_indices_arr = NULL;
k_vector_add_fast<<<n_blocks, n_threads>>>(
PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr); PyGpuArray_DIMS(py_self)[0],
PyGpuArray_DIMS(py_self)[1],
d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr)); PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s,
assert(d_indices_arr); PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s,
(npy_%(dtype_x)s*)(
cudaError_t err = cudaMemcpy(d_indices_arr, ((char *)cuda_get_ptr(py_self->ga.data)) +
PyArray_DATA(cpu_indices_arr), py_self->ga.offset),
PyArray_NBYTES(cpu_indices_arr), PyGpuArray_DIMS(py_other)[0],
cudaMemcpyHostToDevice); PyGpuArray_DIMS(py_other)[1],
PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s,
assert(err == cudaSuccess); PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s,
(npy_%(dtype_x)s*)(
k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0], ((char *)cuda_get_ptr(py_other->ga.data)) +
shapeX[1], py_other->ga.offset),
strX[0], PyGpuArray_DIMS(indices_arr)[0],
strX[1], PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s,
CudaNdarray_DEV_DATA(py_self), (npy_%(dtype_ind)s*)(
shapeY[0], ((char *)cuda_get_ptr(indices_arr->ga.data)) +
shapeY[1], indices_arr->ga.offset)
strY[0], );
strY[1],
CudaNdarray_DEV_DATA(py_other),
d_indices_arr,
PyArray_SIZE(indices_arr)
);
device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr);
return; return;
} }
......
import numpy
import theano
from theano.tensor.tests.test_subtensor import T_subtensor from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost) from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1)
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
...@@ -21,6 +25,7 @@ class G_subtensor(T_subtensor): ...@@ -21,6 +25,7 @@ class G_subtensor(T_subtensor):
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor, inc_sub=GpuIncSubtensor,
adv_incsub1 = GpuAdvancedIncSubtensor1,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', dtype='float32',
...@@ -34,17 +39,17 @@ class G_subtensor(T_subtensor): ...@@ -34,17 +39,17 @@ class G_subtensor(T_subtensor):
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
for shp in [(3, 3), (3, 3, 3)]: for shp in [(3, 3), (3, 3, 3)]:
shared = cuda.shared_constructor shared = gpuarray_shared_constructor
xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1 xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
yval = numpy.empty((2,) + shp[1:], dtype='float32') yval = numpy.empty((2,) + shp[1:], dtype='float32')
yval[:] = 10 yval[:] = 10
x = shared(xval, name='x') x = shared(xval, name='x')
y = T.tensor(dtype='float32', y = tensor.tensor(dtype='float32',
broadcastable=(False,) * len(shp), broadcastable=(False,) * len(shp),
name='y') name='y')
expr = T.advanced_inc_subtensor1(x, y, [0, 2]) expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
f = theano.function([y], expr, mode=mode_with_gpu) f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1) assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1)
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval) rval = f(yval)
rep = xval.copy() rep = xval.copy()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论