提交 305b11ae authored 作者: Vivek Kulkarni's avatar Vivek Kulkarni

Merge pull request #2 from nouiz/viveksck-try_nouiz

Bug fix; reuse the old code path when the new one cannot be used
...@@ -2308,6 +2308,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2308,6 +2308,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
return () return ()
return (3, hv) return (3, hv)
class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
...@@ -2391,14 +2392,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2391,14 +2392,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
x_ = as_cuda_ndarray_variable(x) x_ = as_cuda_ndarray_variable(x)
y_ = as_cuda_ndarray_variable(y) y_ = as_cuda_ndarray_variable(y)
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
convert_map = { 8:tensor.basic._convert_to_int8,
16:tensor.basic._convert_to_int16,
32:tensor.basic._convert_to_int32,
64:tensor.basic._convert_to_int64
}
intwidth = theano.gof.compiledir.python_int_bitwidth()
ilist_ = convert_map[intwidth](ilist_)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim assert x_.type.ndim >= y_.type.ndim
...@@ -2451,15 +2444,12 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2451,15 +2444,12 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
out[0] = x out[0] = x
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (3,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = theano.sandbox.cuda.device_properties(active_device_no)['major']
if (self.set_instead_of_inc) or \ if (self.set_instead_of_inc) or \
(node.inputs[0].ndim != node.inputs[1].ndim) or \ (node.inputs[0].ndim != node.inputs[1].ndim):
(compute_capability < 2): raise NotImplementedError("This case does not have C code yet.")
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0] x = inputs[0]
y = inputs[1] y = inputs[1]
...@@ -2469,6 +2459,19 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2469,6 +2459,19 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
inplace = int(self.inplace) inplace = int(self.inplace)
return """ return """
PyObject *x_obj, *y_obj, *row_x, *row_y;
PyObject *x_rowind_obj, *y_rowind_obj;
dtype_%(ind)s *p_index;
int num_indices, j;
int ret;
num_indices = PyArray_SIZE(%(ind)s);
if ((num_indices - 1) > LONG_MAX) {
PyErr_Format(PyExc_AssertionError,
"num_indices %%d exceeds LONG_MAX + 1", num_indices);
%(fail)s;
}
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
if (!%(inplace)s) { if (!%(inplace)s) {
%(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s); %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
...@@ -2477,12 +2480,136 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2477,12 +2480,136 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
Py_XINCREF(%(out)s); Py_XINCREF(%(out)s);
} }
CudaNdarray_vector_add_fast(%(x)s, %(y)s, %(ind)s); x_obj = (PyObject*)CudaNdarray_View(%(out)s);
y_obj = (PyObject*)CudaNdarray_View(%(y)s);
for (j = 0;j < num_indices; j++) {
p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
x_rowind_obj = PyInt_FromLong(*p_index);
if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
PyErr_Format(PyExc_AssertionError,
"Error in converting row index to integer from long");
// Dec Ref what ever we have increfed or allocated so far
// We deallocate objects exactly in the reverse order they were allocated.
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
y_rowind_obj = PyInt_FromLong(j);
row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
if ((row_x == NULL) || (row_y == NULL)) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
if (ret != 0) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
}
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
if (!%(out)s) { if (!%(out)s) {
%(fail)s %(fail)s
} }
""" %locals() """ % locals()
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
    """Implement AdvancedIncSubtensor1 on the GPU using a fast kernel
    (``CudaNdarray_vector_add_fast``) that is only available on devices
    of compute capability 2.0 and more recent.
    """
    def make_node(self, x, y, ilist):
        """Differs from ``GpuAdvancedIncSubtensor1.make_node`` in that it
        makes sure the indices are cast to the native Python int width
        (C ``long``), as the fast C kernel reads them at that width.

        ``x`` and ``y`` are moved to the GPU; ``ilist`` is a vector of
        integer row indices into ``x``.
        """
        x_ = as_cuda_ndarray_variable(x)
        y_ = as_cuda_ndarray_variable(y)
        ilist_ = tensor.as_tensor_variable(ilist)
        # Map the Python int bit width to the matching integer cast so
        # the indices have exactly the width the C code expects.
        convert_map = {8: tensor.basic._convert_to_int8,
                       16: tensor.basic._convert_to_int16,
                       32: tensor.basic._convert_to_int32,
                       64: tensor.basic._convert_to_int64
                       }
        intwidth = theano.gof.compiledir.python_int_bitwidth()
        ilist_ = convert_map[intwidth](ilist_)
        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim
        if ilist_.type.dtype[:3] not in ('int', 'uin'):
            raise TypeError('index must be integers')
        if ilist_.type.broadcastable != (False,):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if x_.type.broadcastable[0]:
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')
        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def c_code_cache_version(self):
        # Bump this tuple whenever the generated C code below changes.
        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        # The fast kernel only handles the increment (not set) case on
        # same-rank 2-d inputs, and only on devices of compute
        # capability >= 2.0; everything else falls back to the Python
        # implementation inherited from GpuAdvancedIncSubtensor1.
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if ((self.set_instead_of_inc) or
            (node.inputs[0].ndim != node.inputs[1].ndim) or
            (node.inputs[0].ndim != 2) or
            (compute_capability < 2)):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
        y = inputs[1]
        ind = inputs[2]
        out = outputs[0]
        fail = sub['fail']
        inplace = int(self.inplace)
        # NOTE(review): the generated C either aliases x (inplace) or
        # copies it, then delegates the indexed row additions to
        # CudaNdarray_vector_add_fast.
        return """
        Py_XDECREF(%(out)s);
        if (!%(inplace)s) {
            %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
        } else {
            %(out)s = %(x)s;
            Py_XINCREF(%(out)s);
        }
        CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
        if (!%(out)s) {
            %(fail)s
        }
        """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
......
...@@ -776,9 +776,16 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -776,9 +776,16 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config ' 'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
active_device_no = theano.sandbox.cuda.active_device_number()
gpu_op = GpuAdvancedIncSubtensor1( compute_capability = device_properties(active_device_no)['major']
set_instead_of_inc=set_instead_of_inc) if (compute_capability < 2 or
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
else:
gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
return [gpu_op(gpu_from_host(x), gpu_from_host(y), *coords)] return [gpu_op(gpu_from_host(x), gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
...@@ -809,8 +816,16 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -809,8 +816,16 @@ def local_gpu_advanced_incsubtensor1(node):
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
gpu_op = GpuAdvancedIncSubtensor1( active_device_no = theano.sandbox.cuda.active_device_number()
set_instead_of_inc=set_instead_of_inc) compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
else:
gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))] return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
return False return False
......
...@@ -999,20 +999,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -999,20 +999,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor for shp in [(3, 3), (3, 3, 3)]:
#shared = tensor.shared shared = cuda.shared_constructor
xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
dtype='float32') yval = numpy.empty((2,) + shp[1:], dtype='float32')
yval = numpy.asarray([[10, 10, 10], [10, 10, 10]], yval[:] = 10
dtype='float32') x = shared(xval, name='x')
x = shared(xval, name='x') y = T.tensor(dtype='float32',
y = T.fmatrices('y') broadcastable=(False,) * len(shp),
expr = T.advanced_inc_subtensor1(x, y, [0, 2]) name='y')
f = theano.function([y], expr, mode=mode_with_gpu) expr = T.advanced_inc_subtensor1(x, y, [0, 2])
assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1) f = theano.function([y], expr, mode=mode_with_gpu)
for node in f.maker.fgraph.toposort()]) == 1 assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.], for node in f.maker.fgraph.toposort()]) == 1
[17., 18., 19.]]) rval = f(yval)
rep = xval.copy()
rep[[0, 2]] += yval
assert numpy.allclose(rval, rep)
def test_inc_subtensor(): def test_inc_subtensor():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论