Commit daf196e6 authored by Frederic

Make GpuCAReduceCuda support pre_scalar_op

Parent 961c15e0
......@@ -570,10 +570,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
GPUs are not especially well-suited to reduction operations so it is
quite possible that the GPU might be slower for some cases.
pre_scalar_op: if present, must be a scalar op with only 1
input. We will execute it on the input value before reduction.
"""
def __init__(self, scalar_op, axis=None,
reduce_mask=None, dtype=None, acc_dtype=None):
reduce_mask=None, dtype=None, acc_dtype=None,
pre_scalar_op=None):
if reduce_mask is not None:
reduce_mask = tuple(reduce_mask)
self.reduce_mask = reduce_mask
......@@ -583,6 +587,9 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
self._n_scalar_op_calls = 0
CAReduceDtype.__init__(self, scalar_op, axis=axis,
dtype=dtype, acc_dtype=acc_dtype)
self.pre_scalar_op = pre_scalar_op
if pre_scalar_op:
assert pre_scalar_op.nin == 1
def __eq__(self, other):
return (type(self) == type(other) and
......@@ -590,7 +597,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
self.reduce_mask == other.reduce_mask and
self.dtype == other.dtype and
self.acc_dtype == other.acc_dtype and
self.scalar_op == other.scalar_op)
self.scalar_op == other.scalar_op and
self.pre_scalar_op == other.pre_scalar_op)
def __hash__(self):
return (hash(type(self)) ^
......@@ -598,19 +606,35 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
hash(self.reduce_mask) ^
hash(self.dtype) ^
hash(self.acc_dtype) ^
hash(type(self.scalar_op)))
hash(type(self.scalar_op)) ^
hash(type(self.pre_scalar_op)))
def __str__(self):
    """Return a printable name for this op.

    Includes the reduction scalar op, the ``pre_scalar_op`` prefix when
    one is set (``pre=<op>,red=<op>``), and the reduced axes when they
    are explicit (e.g. ``{0, 2}``).
    """
    pre = ""
    if self.pre_scalar_op:
        pre = "pre=%s,red=" % str(self.pre_scalar_op)
    ax = ''
    if self.axis is not None:
        ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
    # Fix: the diff left the pre-change return statement in place, making
    # the new one dead code; keep only the updated format (and PEP 8
    # spacing after the comma).
    return "GpuCAReduceCuda{%s%s}%s" % (pre, str(self.scalar_op), ax)
def __setstate__(self, d):
    """Restore pickled state.

    Ops pickled before ``pre_scalar_op`` existed carry no such entry in
    their state dict; give those a neutral ``None`` so the rest of the
    op's code can test the attribute unconditionally.
    """
    self.__dict__.update(d)
    # getattr with a default covers both old pickles (attribute absent
    # -> None) and new ones (value preserved unchanged).
    self.pre_scalar_op = getattr(self, "pre_scalar_op", None)
def make_node(self, x):
x = as_gpuarray_variable(x)
ret = super(GpuCAReduceCuda, self).make_node(x)
self = copy.copy(self)
self.axis = ret.op.axis
if self.pre_scalar_op:
# Currently we only tested pre_scalar_op that don't cause
# upcast.
d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
assert d1.dtype == ret.outputs[0].dtype
assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
if self.reduce_mask is None:
if self.axis is None:
reduce_mask = [1] * x.type.ndim
......@@ -1010,15 +1034,33 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
else:
assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum))
if self.pre_scalar_op: # TODO, multi_dtype!
#dtype = node.inputs[0].dtype
dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)()
dummy_node = self.pre_scalar_op.make_node(dummy_var)
dummy_name = 'assign_init_pre_scalar_op' + str(self._n_scalar_op_calls)
self._n_scalar_op_calls += 1
t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
(first_item,), ("",), {})
assert t.startswith(' = ')
first_item = t[3:]
if first_item[-1] == ';':
first_item = first_item[:-1]
return first_item
def _assign_reduce(self, node, name, left, right, sub):
def _assign_reduce(self, node, name, left, right, sub, pre):
"""
node: the node argument to this op's c_code
name: the name argument to this op's c_code
left: a C code string identifying an lvalue
right: a C code string identifying an expression
sub: the sub argument to this op's c_code
pre: If True, we will add the pre_scalar_op.c_code
returns C code to reduce left and right, assigning the
result to left."""
......@@ -1035,6 +1077,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
self._n_scalar_op_calls += 1
if pre and self.pre_scalar_op:
assert left == "myresult"
dummy_node = self.pre_scalar_op.make_node(dummy_left)
dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
self._n_scalar_op_calls += 1
t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
(right,), ("",), sub)
assert t.startswith(' = ')
right = t[3:]
if right[-1] == ';':
right = right[:-1]
return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
(left,), sub)
......@@ -1064,7 +1118,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version += self._assign_reduce(node, name, 'buf[idx]',
'buf[threadNum]', sub)
'buf[threadNum]', sub, False)
new_version += """
}
......@@ -1084,7 +1138,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
"""
new_version += self._assign_reduce(node, name,
'buf[threadNum]', 'temp', sub)
'buf[threadNum]', 'temp', sub, False)
new_version += """
}
......@@ -1115,7 +1169,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
"""
current_version += self._assign_reduce(node, name,
'myresult', 'buf[i]', sub) + """
'myresult', 'buf[i]',
sub, False) + """
}
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
......@@ -1127,7 +1182,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
current_version += self._assign_reduce(node, name,
'buf[threadNum]',
'buf[threadNum+%d]' % num,
sub)
sub, False)
current_version += """
"""
current_version += """
......@@ -1146,7 +1201,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
this_if = "if (threadNum + %d < threadCount) " % num + \
self._assign_reduce(node, name,
'buf[threadNum]','buf[threadNum+%d]' % num,
sub)
sub, False)
current_version += this_if
current_version += """
"""
......@@ -1166,7 +1221,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
#Threads must be organized as: threadNum%nb_reduce correspond to the same sum
#nb_reduce<=warpSize
def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {})
reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {}, False)
return """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
......@@ -1767,7 +1822,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0]",
{})
{}, True)
reduce_init = self._assign_init("A[0]")
print >> sio, """
static __global__ void kernel_reduce_ccontig_%(nodename)s(
......@@ -1798,7 +1853,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0]",
{})
{}, True)
reduce_init = self._assign_init("A[0]")
print >> sio, """
static __global__ void kernel_reduce_1_%(nodename)s(
......@@ -1829,7 +1884,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1]",
{})
{}, True)
reduce_init = self._assign_init("A[0]")
print >> sio, """
......@@ -1913,7 +1968,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(
node, nodename, "myresult",
"A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
{})
{}, True)
print >> sio, """
%(decl)s{
%(init)s
......@@ -1941,7 +1996,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
{})
{}, True)
reduce_init = self._assign_init("A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2]")
print >> sio, """
static __global__ void kernel_reduce_010_%(nodename)s(
......@@ -1980,7 +2035,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
if self.reduce_mask == (0, 1, 0):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"X[a * sX0 + b * sX1 + c * sX2]",
{})
{}, True)
reduce_init = self._assign_init("X[a * sX0 + 0 * sX1 + c * sX2]")
print >> sio, """
static __global__ void kernel_reduce_010_AD_%(nodename)s(
......@@ -2040,7 +2095,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
'blockDim.x')
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
{})
{}, True)
reduce_init = self._assign_init("A[i0 * sA0 + 0 * sA1 + i2 * sA2]")
print >> sio, """
%(decl)s
......@@ -2076,7 +2131,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub = {})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]",
{})
{}, True)
reduce_init = self._assign_init("A[blockIdx.x * sA2]")
print >> sio, """
static __global__ void kernel_reduce_110_%(nodename)s(
......@@ -2117,7 +2172,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
{})
{}, True)
reduce_init = self._assign_init("A[i1 * sA1 + i2 * sA2]")
print >> sio, """
%(decl)s
......@@ -2144,7 +2199,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
{})
{}, True)
reduce_init = self._assign_init("A[0]")
print >> sio, """
%(decl)s
......@@ -2171,7 +2226,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
{})
{}, True)
reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
print >> sio, """
static __global__ void kernel_reduce_001_%(nodename)s(
......@@ -2214,7 +2269,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
{})
{}, True)
reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
print >> sio, """
%(decl)s
......@@ -2247,7 +2302,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
{})
{}, True)
reduce_init = self._assign_init("A[i0 * sA0 + i2 * sA2]")
print >> sio, """
%(decl)s
......@@ -2278,7 +2333,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
{})
{}, True)
reduce_init = self._assign_init("A[0]")
print >> sio, """
%(decl)s
......@@ -2304,7 +2359,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]",
{})
{}, True)
reduce_init = self._assign_init("A[blockIdx.x * sA1]")
print >> sio, """
static __global__ void kernel_reduce_1011_%(nodename)s(
......
......@@ -40,11 +40,13 @@ class test_GpuCAReduceCPY(test_CAReduce):
bin_dtypes = ["uint8", "int8"]
op = GpuCAReduceCPY
reds = [scalar.add, scalar.mul]
pre_scalar_op = None
def test_perform(self):
for dtype in self.dtypes + self.bin_dtypes:
for op in self.reds:
self.with_linker(gof.PerformLinker(), op, dtype=dtype)
self.with_linker(gof.PerformLinker(), op, dtype=dtype,
pre_scalar_op=self.pre_scalar_op)
def test_perform_nan(self):
for dtype in self.dtypes:
......@@ -52,12 +54,14 @@ class test_GpuCAReduceCPY(test_CAReduce):
continue
for op in self.reds:
self.with_linker(gof.PerformLinker(), op, dtype=dtype,
test_nan=True)
test_nan=True,
pre_scalar_op=self.pre_scalar_op)
def test_c(self):
for dtype in self.dtypes + self.bin_dtypes:
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype)
self.with_linker(gof.CLinker(), op, dtype=dtype,
pre_scalar_op=self.pre_scalar_op)
def test_c_nan(self):
for dtype in self.dtypes:
......@@ -65,7 +69,8 @@ class test_GpuCAReduceCPY(test_CAReduce):
continue
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype,
test_nan=True)
test_nan=True,
pre_scalar_op=self.pre_scalar_op)
def test_infer_shape(self):
for dtype in self.dtypes:
......@@ -148,6 +153,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum]
pre_scalar_op = scalar.sqr
def test_perform(self):
return
......
......@@ -308,15 +308,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
]
def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
pre_scalar_op=None,
test_nan=False, tensor_op=None):
for xsh, tosum in self.cases:
if dtype == "floatX":
dtype = theano.config.floatX
x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
d = {}
if pre_scalar_op is not None:
d = {"pre_scalar_op": pre_scalar_op}
if tensor_op is None:
e = as_tensor_variable(self.op(scalar_op, axis=tosum)(x))
e = as_tensor_variable(self.op(scalar_op, axis=tosum, **d)(x))
else:
e = as_tensor_variable(tensor_op(x, axis=tosum))
e = as_tensor_variable(tensor_op(x, axis=tosum, **d))
if tosum is None:
tosum = range(len(xsh))
......@@ -337,6 +341,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
else:
xv = numpy.asarray(numpy.nan, dtype=dtype)
zv = xv
if pre_scalar_op is not None:
zv = Elemwise(scalar_op=pre_scalar_op)(x).eval({x: xv})
numpy_raised = False
if len(tosum) > 1 and any([a < 0 for a in tosum]):
#In that case, we need to use the good order of axis
......@@ -505,16 +511,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
self.with_linker(gof.CLinker(), scalar.maximum, dtype=dtype,
test_nan=True)
def test_infer_shape(self, dtype=None, pre_scalar_op=None):
    """Check shape inference of the reduction op over all ``self.cases``.

    :param dtype: element dtype to test; defaults to
        ``theano.config.floatX``.
    :param pre_scalar_op: optional scalar op applied to the input before
        the reduction; when given it is also forwarded to ``self.op``.
    """
    if dtype is None:
        dtype = theano.config.floatX
    for xsh, tosum in self.cases:
        x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
        if pre_scalar_op is not None:
            x = pre_scalar_op(x)
        if tosum is None:
            tosum = range(len(xsh))
        xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
        d = {}
        if pre_scalar_op is not None:
            xv = x.eval({x.owner.inputs[0]: xv})
            # BUG FIX: the key must be the *string* "pre_scalar_op" so the
            # dict can serve as keyword arguments; the original used the op
            # object itself as the key.
            d = {"pre_scalar_op": pre_scalar_op}
            # NOTE(review): x was already transformed by pre_scalar_op
            # above AND the op receives pre_scalar_op via ``d`` — this
            # looks like a double application; confirm intended semantics.
        self._compile_and_check([x],
                                # BUG FIX: ``*d`` unpacked the dict's KEYS
                                # as positional arguments; ``**d`` passes
                                # them as keyword arguments as intended.
                                [self.op(scalar.add, axis=tosum, **d)(x)],
                                [xv], self.op,
                                ["local_cut_useless_reduce"],
                                warn=0 not in xsh)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论