Commit 4ba74e22 authored by Frédéric Bastien

Merge pull request #1316 from viveksck/try_nouiz

WIP: Speeding up GpuAdvancedIncSubtensor1 by writing fast CUDA code
@@ -2444,7 +2444,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        out[0] = x

    def c_code_cache_version(self):
-        return (1,)
+        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        if (self.set_instead_of_inc) or \
@@ -2467,7 +2467,8 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        num_indices = PyArray_SIZE(%(ind)s);
        if ((num_indices - 1) > LONG_MAX) {
-            PyErr_Format(PyExc_AssertionError, "num_indices %%d exceeds LONG_MAX + 1", num_indices);
+            PyErr_Format(PyExc_AssertionError,
+                         "num_indices %%d exceeds LONG_MAX + 1", num_indices);
            %(fail)s;
        }
@@ -2489,7 +2490,8 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
            x_rowind_obj = PyInt_FromLong(*p_index);
            if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
-                PyErr_Format(PyExc_AssertionError, "Error in converting row index to integer from long");
+                PyErr_Format(PyExc_AssertionError,
+                             "Error in converting row index to integer from long");
                // Dec Ref what ever we have increfed or allocated so far
                // We deallocate objects exactly in the reverse order they were allocated.
                Py_XDECREF(x_rowind_obj);
@@ -2536,6 +2538,153 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        if (!%(out)s) {
            %(fail)s
        }
        """ % locals()

class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
"""
def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
the index are of type long.
"""
        x_ = as_cuda_ndarray_variable(x)
        y_ = as_cuda_ndarray_variable(y)
        ilist_ = tensor.as_tensor_variable(ilist)

        convert_map = {8: tensor.basic._convert_to_int8,
                       16: tensor.basic._convert_to_int16,
                       32: tensor.basic._convert_to_int32,
                       64: tensor.basic._convert_to_int64}
        intwidth = theano.gof.compiledir.python_int_bitwidth()
        ilist_ = convert_map[intwidth](ilist_)

        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim

        if ilist_.type.dtype[:3] not in ('int', 'uin'):
            raise TypeError('index must be integers')
        if ilist_.type.broadcastable != (False,):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if x_.type.broadcastable[0]:
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')

        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def c_code_cache_version(self):
        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if ((self.set_instead_of_inc) or
            (node.inputs[0].ndim != node.inputs[1].ndim) or
            (node.inputs[0].ndim != 2) or
            (compute_capability < 2)):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
        y = inputs[1]
        ind = inputs[2]
        out = outputs[0]
        fail = sub['fail']
        inplace = int(self.inplace)
        return """
        Py_XDECREF(%(out)s);
        if (!%(inplace)s) {
            %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
        } else {
            %(out)s = %(x)s;
            Py_XINCREF(%(out)s);
        }
        CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
        if (!%(out)s) {
            %(fail)s
        }
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        return """
        __global__ void k_vector_add_fast(int numRowsX,
                                          int numColsX,
                                          int stridesX0,
                                          int stridesX1,
                                          float *X,
                                          int numRowsY,
                                          int numColsY,
                                          int stridesY0,
                                          int stridesY1,
                                          float *Y,
                                          long *d_indices_arr,
                                          int num)
        {
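            // Parallelisation: blocks stride over the rows of Y (one row of Y
            // per block iteration), threads stride over the columns.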
            for (int i = (blockIdx.x); i < num; i += gridDim.x)
            {
                for (int j = (threadIdx.x); j < numColsX; j += blockDim.x)
                {
                    int x_row = d_indices_arr[i];
                    int y_row = i;
                    atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
                              Y[(y_row * stridesY0) + (j * stridesY1)]);
                }
            }
            return;
        }

        void CudaNdarray_vector_add_fast(CudaNdarray* py_self,
                                         CudaNdarray* py_other,
                                         PyArrayObject *indices_arr)
        {
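            // Scatter-add the rows of py_other into py_self at the row
            // positions listed in indices_arr.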
            const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
            const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
            const int *strX = CudaNdarray_HOST_STRIDES(py_self);
            const int *strY = CudaNdarray_HOST_STRIDES(py_other);
            unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
            unsigned int numcolsX = shapeX[1];
            unsigned int num_threads_per_block = std::min(
                numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            unsigned int num_blocks = std::min(
                size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
            dim3 n_blocks(num_blocks);
            dim3 n_threads(num_threads_per_block);

            long *d_indices_arr = NULL;
            PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
            d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
            assert(d_indices_arr);
            cudaError_t err = cudaMemcpy(d_indices_arr,
                                         PyArray_DATA(cpu_indices_arr),
                                         PyArray_NBYTES(cpu_indices_arr),
                                         cudaMemcpyHostToDevice);
            assert(err == cudaSuccess);

            k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
                                                       shapeX[1],
                                                       strX[0],
                                                       strX[1],
                                                       CudaNdarray_DEV_DATA(py_self),
                                                       shapeY[0],
                                                       shapeY[1],
                                                       strY[0],
                                                       strY[1],
                                                       CudaNdarray_DEV_DATA(py_other),
                                                       d_indices_arr,
                                                       PyArray_SIZE(indices_arr));

            device_free(d_indices_arr);
            Py_XDECREF(cpu_indices_arr);
            return;
        }
        """ % locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
...
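The kernel above implements a scatter-add: row i of Y is added into row d_indices_arr[i] of X, with blocks striding over the rows and threads over the columns. atomicAdd is what lets duplicate row indices accumulate correctly instead of racing, and single-precision atomicAdd on global memory only exists on compute capability 2.0 and newer, hence the _dev20 suffix. Below is a self-contained sketch of the same pattern; the names are hypothetical and it assumes C-contiguous storage instead of the explicit strides the real kernel takes:

    // scatter_add.cu -- standalone sketch of the k_vector_add_fast pattern
    // (hypothetical example, not part of this commit).
    // Build: nvcc -arch=sm_20 scatter_add.cu
    #include <cstdio>

    // X[idx[i]][j] += Y[i][j]; both matrices assumed C-contiguous here.
    __global__ void scatter_add_rows(float *X, const float *Y,
                                     const long *idx, int nrows, int ncols)
    {
        for (int i = blockIdx.x; i < nrows; i += gridDim.x)        // blocks stride over rows of Y
            for (int j = threadIdx.x; j < ncols; j += blockDim.x)  // threads stride over columns
                // Float atomicAdd (compute capability >= 2.0) makes
                // duplicate entries in idx accumulate instead of racing.
                atomicAdd(&X[idx[i] * ncols + j], Y[i * ncols + j]);
    }

    int main()
    {
        const int ncols = 3, nx = 3, ny = 2;
        float hX[nx * ncols] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
        float hY[ny * ncols] = {10, 10, 10, 10, 10, 10};
        long  hI[ny] = {0, 2};            // increment rows 0 and 2 of X

        float *dX, *dY;
        long *dI;
        cudaMalloc((void**)&dX, sizeof(hX));
        cudaMalloc((void**)&dY, sizeof(hY));
        cudaMalloc((void**)&dI, sizeof(hI));
        cudaMemcpy(dX, hX, sizeof(hX), cudaMemcpyHostToDevice);
        cudaMemcpy(dY, hY, sizeof(hY), cudaMemcpyHostToDevice);
        cudaMemcpy(dI, hI, sizeof(hI), cudaMemcpyHostToDevice);

        scatter_add_rows<<<ny, ncols>>>(dX, dY, dI, ny, ncols);
        cudaMemcpy(hX, dX, sizeof(hX), cudaMemcpyDeviceToHost);

        for (int r = 0; r < nx; ++r)      // expect rows 0 and 2 raised by 10
            printf("%g %g %g\n", hX[r*ncols], hX[r*ncols+1], hX[r*ncols+2]);

        cudaFree(dX); cudaFree(dY); cudaFree(dI);
        return 0;
    }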
@@ -522,7 +522,6 @@ DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);

// Ensures that *arr is a pointer to a contiguous ndarray of the specified
// dimensions.
// *arr may initially be NULL, a pointer to an ndarray of the wrong size,
...
@@ -781,9 +781,16 @@ def local_gpu_advanced_incsubtensor1(node):
                'either set the `warn.gpu_set_subtensor1` config '
                'option to False, or `warn.ignore_bug_before` to at '
                'least \'0.6\'.', stacklevel=1)
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if (compute_capability < 2 or
                x.ndim != 2 or
                y.ndim != 2):
            gpu_op = GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
        else:
            gpu_op = GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)
        return [gpu_op(gpu_from_host(x), gpu_from_host(y), *coords)]
    # Should not execute for GpuAdvancedIncSubtensor1
@@ -814,8 +821,16 @@ def local_gpu_advanced_incsubtensor1(node):
                'option to False, or `warn.ignore_bug_before` to at '
                'least \'0.6\'.', stacklevel=1)
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if (compute_capability < 2 or
                x.ndim != 2 or
                y.ndim != 2):
            gpu_op = GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
        else:
            gpu_op = GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)
        return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
    return False
...
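With this change the optimizer picks the op per device: operands must be 2-D and the device's compute capability major number must be at least 2 for GpuAdvancedIncSubtensor1_dev20; everything else falls back to the generic GpuAdvancedIncSubtensor1. For reference, the same capability check written directly against the CUDA runtime API looks roughly like this (a sketch, not code from this commit):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        int dev = 0;
        cudaGetDevice(&dev);                  // the active device, as in active_device_number()
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);  // prop.major ~ device_properties(...)['major']
        if (prop.major < 2)
            printf("compute capability %d.%d: no float atomicAdd, use the generic op\n",
                   prop.major, prop.minor);
        else
            printf("compute capability %d.%d: fast atomic kernel available\n",
                   prop.major, prop.minor);
        return 0;
    }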
@@ -1005,20 +1005,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):

def test_advinc_subtensor1():
    """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
-    shared = cuda.shared_constructor
-    #shared = tensor.shared
-    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                         dtype='float32')
-    yval = numpy.asarray([[10, 10, 10], [10, 10, 10]],
-                         dtype='float32')
-    x = shared(xval, name='x')
-    y = T.fmatrices('y')
-    expr = T.advanced_inc_subtensor1(x, y, [0, 2])
-    f = theano.function([y], expr, mode=mode_with_gpu)
-    assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
-                for node in f.maker.fgraph.toposort()]) == 1
-    assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.],
-                                    [17., 18., 19.]])
+    for shp in [(3, 3), (3, 3, 3)]:
+        shared = cuda.shared_constructor
+        xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
+        yval = numpy.empty((2,) + shp[1:], dtype='float32')
+        yval[:] = 10
+        x = shared(xval, name='x')
+        y = T.tensor(dtype='float32',
+                     broadcastable=(False,) * len(shp),
+                     name='y')
+        expr = T.advanced_inc_subtensor1(x, y, [0, 2])
+        f = theano.function([y], expr, mode=mode_with_gpu)
+        assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
+                    for node in f.maker.fgraph.toposort()]) == 1
+        rval = f(yval)
+        rep = xval.copy()
+        rep[[0, 2]] += yval
+        assert numpy.allclose(rval, rep)


def test_inc_subtensor():
...