提交 a1796d2d authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1828 from nouiz/carriepl-subtensor

gh-1816 PR with small fixes
...@@ -154,6 +154,12 @@ class GpuElemwise(HideC, Elemwise): ...@@ -154,6 +154,12 @@ class GpuElemwise(HideC, Elemwise):
#define ga_half uint16_t #define ga_half uint16_t
""" """
try:
#We accept only some c_support_code().
#This filter is done in the make_node()
support_code += self.scalar_op.c_support_code()
except MethodNotDefined:
pass
for npy, ga in [("npy_uint8", "ga_ubyte"), for npy, ga in [("npy_uint8", "ga_ubyte"),
("npy_uint16", "ga_ushort"), ("npy_uint16", "ga_ushort"),
("npy_uin32", "ga_uint"), ("npy_uin32", "ga_uint"),
...@@ -179,6 +185,9 @@ class GpuElemwise(HideC, Elemwise): ...@@ -179,6 +185,9 @@ class GpuElemwise(HideC, Elemwise):
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
return NVCC_compiler return NVCC_compiler
def c_support_code(self):
return self.scalar_op.c_support_code()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
......
import copy import copy
import theano import theano
import numpy import numpy
try:
import pygpu
except ImportError:
pass
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
...@@ -26,7 +32,9 @@ from theano.sandbox.gpuarray.nnet import ( ...@@ -26,7 +32,9 @@ from theano.sandbox.gpuarray.nnet import (
) )
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda) GpuDimShuffle, GpuCAReduceCuda)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from theano.sandbox.gpuarray.type import GpuArrayConstant from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
...@@ -273,6 +281,30 @@ def local_gpua_incsubtensor(node): ...@@ -273,6 +281,30 @@ def local_gpua_incsubtensor(node):
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
@register_opt()
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node):
    """Lift ``AdvancedIncSubtensor1`` to its GPU implementation.

    Returns the compute-capability>=2.0 variant when the active device
    supports it and both ``x`` and ``y`` are matrices; otherwise falls
    back to the generic GPU op.  Returns ``None`` (no replacement) when
    the backend is not CUDA.
    """
    # This optimization is disabled if cuda is not active
    if pygpu.get_default_context().kind != "cuda":
        return None

    x, y = node.inputs[0:2]
    set_instead_of_inc = node.op.set_instead_of_inc

    # Query the compute capability of the active device to decide
    # which implementation can be used.
    active_device_no = theano.sandbox.cuda.active_device_number()
    device_properties = theano.sandbox.cuda.device_properties
    compute_capability = device_properties(active_device_no)['major']

    if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
        # Generic GPU implementation: any device, any rank.
        return GpuAdvancedIncSubtensor1(
            set_instead_of_inc=set_instead_of_inc)
    else:
        # Fast kernel: relies on float atomicAdd, which requires
        # compute capability >= 2.0, and only handles 2d x and y.
        return GpuAdvancedIncSubtensor1_dev20(
            set_instead_of_inc=set_instead_of_inc)
@register_opt() @register_opt()
@op_lifter([tensor.CAReduce, tensor.Sum]) @op_lifter([tensor.CAReduce, tensor.Sum])
def local_gpua_careduce(node): def local_gpua_careduce(node):
......
import copy import copy
import StringIO import StringIO
import numpy import numpy
import theano import theano
from theano import tensor, gof from theano import tensor, gof, Op
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace import theano.tensor.inplace
...@@ -357,3 +356,239 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -357,3 +356,239 @@ class GpuIncSubtensor(IncSubtensor):
if not parent_version or not elemwise_version: if not parent_version or not elemwise_version:
return return
return parent_version + elemwise_version + (0,) return parent_version + elemwise_version + (0,)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
    """
    Implement AdvancedIncSubtensor1 on the gpu.

    Computes ``x[idx] += y`` (or ``x[idx] = y`` when
    ``set_instead_of_inc`` is True) where ``idx`` is a vector of
    integer row indices.
    """
    def make_node(self, x, y, ilist):
        # x and y live on the GPU; the index vector is kept as a host
        # tensor so perform() can iterate over its values in Python.
        x_ = as_gpuarray_variable(x)
        y_ = as_gpuarray_variable(y)
        ilist_ = tensor.as_tensor_variable(ilist)
        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim

        # Validate the index vector and the indexed dimension.
        if ilist_.type.dtype[:3] not in ('int', 'uin'):
            raise TypeError('index must be integers')
        if ilist_.type.broadcastable != (False,):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if x_.type.broadcastable[0]:
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')

        # Output has the same type as x (the updated copy of x).
        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

    def getInplElemwiseAdditionKernel(self, a, b):
        # Build a pygpu elementwise kernel performing ``a[i] += b[i]``
        # in place, specialized to the dtypes/context of the sample
        # arrays a and b.
        a_arg = pygpu.tools.as_argument(a, 'a')
        b_arg = pygpu.tools.as_argument(b, 'b')
        args = [a_arg, b_arg]
        oper = "a[i] = a[i] + %(b)s" % {'b': b_arg.expr()}
        k = pygpu.elemwise.ElemwiseKernel(a.context, args, oper)
        return k

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_):
        # TODO opt to make this inplace
        x, y, idx = inp
        out, = out_
        # Make sure idx is not a GpuArray otherwise we cannot use its content
        # to index x and y
        if isinstance(idx, gpuarray.GpuArray):
            idx = numpy.asarray(idx)
        if not self.inplace:
            # Work on a copy so the input is left untouched.
            x = x.copy()
        if self.set_instead_of_inc:
            # Assignment mode: each indexed row of x is overwritten.
            assert y.ndim <= x.ndim  # Should be guaranteed by `make_node`
            if y.ndim == x.ndim:
                # One row of y per index.
                assert len(y) == len(idx)
                for (j, i) in enumerate(idx):
                    x[i] = y[j]
            else:
                # y is broadcast over every indexed row.
                for i in idx:
                    x[i] = y
        else:
            # If `y` has as many dimensions as `x`, then we want to iterate
            # jointly on `x` and `y`. Otherwise, it means `y` should be
            # broadcasted to fill all relevant rows of `x`.
            assert y.ndim <= x.ndim  # Should be guaranteed by `make_node`
            if len(idx) == 0:
                pass
            elif y.ndim == x.ndim:
                # One addition kernel launch per index pair (i, j).
                assert len(y) == len(idx)
                k = self.getInplElemwiseAdditionKernel(x[0], y[0])
                for (j, i) in enumerate(idx):
                    k(x[i], y[j], broadcast=False)
            else:
                # Reshape y with leading length-1 dims so the kernel can
                # broadcast it against each row x[i].
                nb_dims_to_add = (x.ndim - 1) - y.ndim
                reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
                k = self.getInplElemwiseAdditionKernel(x[0],
                                                       reshaped_y)
                for i in idx:
                    k(x[i], reshaped_y, broadcast=True)
        out[0] = x
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
    """Implement AdvancedIncSubtensor1 on the gpu, but use functions
    only available on devices of compute capability 2.0 and more recent
    (float atomicAdd).
    """
    def __init__(self, inplace=False, set_instead_of_inc=False):
        # The python implementation in the parent class is not applicable here
        GpuAdvancedIncSubtensor1.__init__(self, inplace, set_instead_of_inc)

    def make_node(self, x, y, ilist):
        """Differs from GpuAdvancedIncSubtensor1 in that it keeps the
        index vector on the GPU so the CUDA kernel can read it directly.
        """
        x_ = as_gpuarray_variable(x)
        y_ = as_gpuarray_variable(y)
        ilist_ = as_gpuarray_variable(ilist)
        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim

        # Same validation as the parent op.
        if ilist_.type.dtype[:3] not in ('int', 'uin'):
            raise TypeError('index must be integers')
        if ilist_.type.broadcastable != (False,):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if x_.type.broadcastable[0]:
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')

        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

    def c_code_cache_version(self):
        # Bumped to 3: the generated kernel now casts the Y pointer with
        # the dtype of y (it previously used the dtype of x), and the
        # NULL check on the output happens before the kernel launch.
        return (3,)

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
                '<compyte/ext_cuda.h>']

    def c_compiler(self):
        return NVCC_compiler

    def c_init_code(self):
        return ['setup_ext_cuda();']

    def c_code(self, node, name, inputs, outputs, sub):
        # The C implementation only covers the increment (not set) case
        # on 2d x and y, on a device with compute capability >= 2.
        active_device_no = theano.sandbox.cuda.active_device_number()
        device_properties = theano.sandbox.cuda.device_properties
        compute_capability = device_properties(active_device_no)['major']
        if ((self.set_instead_of_inc) or
            (node.inputs[0].ndim != node.inputs[1].ndim) or
            (node.inputs[0].ndim != 2) or
            (compute_capability < 2)):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
        y = inputs[1]
        ind = inputs[2]
        out = outputs[0]
        fail = sub['fail']
        inplace = int(self.inplace)
        # NOTE: check %(out)s for NULL *before* launching the kernel:
        # pygpu_copy can fail, and GpuArray_vector_add_fast would then
        # dereference a NULL pointer.
        return """
        Py_XDECREF(%(out)s);
        if (!%(inplace)s) {
            %(out)s = (PyGpuArrayObject*)pygpu_copy(%(x)s, GA_C_ORDER);
        } else {
            %(out)s = %(x)s;
            Py_XINCREF(%(out)s);
        }
        if (!%(out)s) {
            %(fail)s
        }
        GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        # Generate the CUDA kernel and its host-side launcher,
        # specialized to the dtypes of this apply node.  Strides are
        # passed in units of elements (byte strides / itemsize).
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        itemsize_x = numpy.dtype(dtype_x).itemsize
        itemsize_y = numpy.dtype(dtype_y).itemsize
        itemsize_ind = numpy.dtype(dtype_ind).itemsize
        return """
        __global__ void k_vector_add_fast(int numRowsX,
                                          int numColsX,
                                          int stridesX0,
                                          int stridesX1,
                                          npy_%(dtype_x)s *X,
                                          int numRowsY,
                                          int numColsY,
                                          int stridesY0,
                                          int stridesY1,
                                          npy_%(dtype_y)s *Y,
                                          int numIndices,
                                          int stridesIndices,
                                          npy_%(dtype_ind)s *indices_arr)
        {
             // One block per index, one thread per column; atomicAdd
             // makes concurrent updates of the same x row safe.
             for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
             {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
                      int x_row = indices_arr[i * stridesIndices];
                      int y_row = i;
                      atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
                  }
             }
             return;
        }

        void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                      PyGpuArrayObject* py_other,
                                      PyGpuArrayObject *indices_arr)
        {
            // Cap the launch configuration at hardware-friendly sizes;
            // the kernel grid/block-strides over the remainder.
            int num_threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1],
                                                 (size_t)256);
            int num_blocks = std::min(PyGpuArray_SIZE(indices_arr),
                                      (size_t)4096);
            dim3 n_blocks(num_blocks);
            dim3 n_threads(num_threads_per_block);
            k_vector_add_fast<<<n_blocks, n_threads>>>(
                PyGpuArray_DIMS(py_self)[0],
                PyGpuArray_DIMS(py_self)[1],
                PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s,
                PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s,
                (npy_%(dtype_x)s*)(
                        ((char *)cuda_get_ptr(py_self->ga.data)) +
                        py_self->ga.offset),
                PyGpuArray_DIMS(py_other)[0],
                PyGpuArray_DIMS(py_other)[1],
                PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s,
                PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s,
                (npy_%(dtype_y)s*)(
                        ((char *)cuda_get_ptr(py_other->ga.data)) +
                        py_other->ga.offset),
                PyGpuArray_DIMS(indices_arr)[0],
                PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s,
                (npy_%(dtype_ind)s*)(
                        ((char *)cuda_get_ptr(indices_arr->ga.data)) +
                        indices_arr->ga.offset)
            );
            return;
        }
        """ % locals()
import numpy
import theano
from theano.tensor.tests.test_subtensor import T_subtensor from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost) from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1)
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
...@@ -21,6 +25,7 @@ class G_subtensor(T_subtensor): ...@@ -21,6 +25,7 @@ class G_subtensor(T_subtensor):
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor, inc_sub=GpuIncSubtensor,
adv_incsub1=GpuAdvancedIncSubtensor1,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', dtype='float32',
...@@ -29,3 +34,24 @@ class G_subtensor(T_subtensor): ...@@ -29,3 +34,24 @@ class G_subtensor(T_subtensor):
# GPU opt can't run in fast_compile only. # GPU opt can't run in fast_compile only.
self.fast_compile = False self.fast_compile = False
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
def test_advinc_subtensor1():
    """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
    for shp in [(3, 3), (3, 3, 3)]:
        # Base array that will receive the increments (values 1..N).
        base_val = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
        # Increment: two rows (matching indices [0, 2]) filled with 10.
        inc_val = numpy.empty((2,) + shp[1:], dtype='float32')
        inc_val.fill(10)
        x = gpuarray_shared_constructor(base_val, name='x')
        y = tensor.tensor(dtype='float32',
                          broadcastable=(False,) * len(shp),
                          name='y')
        expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
        f = theano.function([y], expr, mode=mode_with_gpu)
        # Exactly one GPU advanced-inc-subtensor node must be in the graph.
        gpu_nodes = [node for node in f.maker.fgraph.toposort()
                     if isinstance(node.op, GpuAdvancedIncSubtensor1)]
        assert len(gpu_nodes) == 1
        result = f(inc_val)
        expected = base_val.copy()
        expected[[0, 2]] += inc_val
        assert numpy.allclose(result, expected)
...@@ -29,6 +29,7 @@ if cuda_available: ...@@ -29,6 +29,7 @@ if cuda_available:
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
def matVecModM(A, s, m): def matVecModM(A, s, m):
assert A.dtype == 'int64' assert A.dtype == 'int64'
return numpy.int32(numpy.sum((A*s) % m, 1) % m) return numpy.int32(numpy.sum((A*s) % m, 1) % m)
......
...@@ -431,7 +431,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -431,7 +431,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.allclose(val, good), (val, good)) self.assertTrue(numpy.allclose(val, good), (val, good))
# Test reuse of output memory # Test reuse of output memory
if isinstance(self.adv_sub1, tensor.AdvancedSubtensor1): if type(self.adv_sub1) == tensor.AdvancedSubtensor1:
op = self.adv_sub1() op = self.adv_sub1()
# When idx is a TensorConstant. # When idx is a TensorConstant.
if hasattr(idx, "data"): if hasattr(idx, "data"):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论