Commit 0d3dffac, authored by abergeron

Merge pull request #1888 from nouiz/gpu_sqr_sum_ax0

Add GpuSqrSumAx0 to lower the memory usage on the GPU.
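The effect of this change: a pattern like (x * x).sum() on the GPU no longer materializes the squared temporary, because the reduction kernel applies the scalar sqr to each element as it reads it. A minimal sketch of the user-visible behaviour (assumes a GPU-enabled Theano build; the debugprint output named in the comment is illustrative):

    import numpy
    import theano
    import theano.tensor as tensor

    x = tensor.fmatrix('x')
    # Before this change: GpuElemwise{sqr} writes x*x to a temporary GPU
    # buffer, then GpuCAReduce{add} reduces it. After: a single fused
    # GpuCAReduce{pre=sqr,red=add} kernel, so no temporary is allocated.
    f = theano.function([x], (x * x).sum())
    theano.printing.debugprint(f)
    print f(numpy.random.rand(3, 4).astype('float32'))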
@@ -503,31 +503,49 @@ class GpuCAReduce(GpuOp):
     GPUs are not especially well-suited to reduction operations so it is
     quite possible that the GPU might be slower for some cases.

+    pre_scalar_op: if present, must be a scalar op with only 1
+    input. We will execute it on the input value before reduction.
     """
-    def __init__(self, reduce_mask, scalar_op):
+    def __init__(self, reduce_mask, scalar_op, pre_scalar_op=None):
         self.reduce_mask = tuple(reduce_mask)
         self.scalar_op = scalar_op
         # used to make sure that calls to scalar op
         # have unique name arguments
         self._n_scalar_op_calls = 0
+        self.pre_scalar_op = pre_scalar_op
+        if pre_scalar_op:
+            assert pre_scalar_op.nin == 1

     def __eq__(self, other):
         return (type(self) == type(other) and
                 self.reduce_mask == other.reduce_mask and
-                self.scalar_op == other.scalar_op)
+                self.scalar_op == other.scalar_op and
+                self.pre_scalar_op == other.pre_scalar_op)

     def __hash__(self):
         return (hash(type(self)) ^
                 hash(self.reduce_mask) ^
-                hash(type(self.scalar_op)))
+                hash(type(self.scalar_op)) ^
+                hash(type(self.pre_scalar_op)))

     def __str__(self):
-        return "GpuCAReduce{%s}{%s}" % (
+        pre = ""
+        if self.pre_scalar_op:
+            pre = "pre=%s,red=" % str(self.pre_scalar_op)
+        return "GpuCAReduce{%s%s}{%s}" % (
+            pre,
             str(self.scalar_op),
             ','.join(str(i) for i in self.reduce_mask)
         )

+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        # For unpickling of old ops.
+        if not hasattr(self, "pre_scalar_op"):
+            self.pre_scalar_op = None
+
     def make_node(self, x):
         if (x.type.ndim != len(self.reduce_mask)):
             raise TypeError("x must have rank %i" % len(self.reduce_mask))
@@ -889,15 +907,33 @@ class GpuCAReduce(GpuOp):
         else:
             assert isinstance(self.scalar_op, (scal.Maximum,
                                                scal.Minimum))
+        if self.pre_scalar_op:
+            #dtype = node.inputs[0].dtype
+            dtype = 'float32'
+            dummy_var = scal.Scalar(dtype=dtype)()
+            dummy_node = self.pre_scalar_op.make_node(dummy_var)
+            dummy_name = 'assign_init_pre_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (first_item,), ("",), {})
+            assert t.startswith(' = ')
+            first_item = t[3:]
+            if first_item[-1] == ';':
+                first_item = first_item[:-1]
         return first_item

-    def _assign_reduce(self, node, name, left, right, sub):
+    def _assign_reduce(self, node, name, left, right, sub, pre):
         """
         node: the node argument to this op's c_code
         name: the name argument to this op's c_code
         left: a C code string identifying an lvalue
         right: a C code string identifying an expression
         sub: the sub argument to this op's c_code
+        pre: if True, we will add the pre_scalar_op.c_code

         returns C code to reduce left and right, assigning the
         result to left."""
@@ -913,7 +949,17 @@ class GpuCAReduce(GpuOp):
         dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
         self._n_scalar_op_calls += 1
+        if pre and self.pre_scalar_op:
+            assert left == "myresult"
+            dummy_node = self.pre_scalar_op.make_node(dummy_left)
+            dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (right,), ("",), sub)
+            assert t.startswith(' = ')
+            right = t[3:]
+            if right[-1] == ';':
+                right = right[:-1]
         return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
                                      (left,), sub)
@@ -939,7 +985,8 @@ class GpuCAReduce(GpuOp):
            {
                int idx = threadNum - (threadCount >> 1) * 2;"""
-        new_version += self._assign_reduce(node, name, 'buf[idx]','buf[threadNum]', sub)
+        new_version += self._assign_reduce(node, name, 'buf[idx]',
+                                           'buf[threadNum]', sub, False)
         new_version += """
            }
@@ -958,8 +1005,8 @@ class GpuCAReduce(GpuOp):
                    float temp = buf[threadNum + halfPoint];
                    """
-        new_version += self._assign_reduce(node, name,
-                                           'buf[threadNum]', 'temp', sub)
+        new_version += self._assign_reduce(node, name, 'buf[threadNum]',
+                                           'temp', sub, False)
         new_version += """
                }
@@ -989,8 +1036,8 @@ class GpuCAReduce(GpuOp):
            for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
            {
            """
-        current_version += self._assign_reduce(node, name,
-                                               'myresult', 'buf[i]', sub) + """
+        current_version += self._assign_reduce(node, name, 'myresult',
+                                               'buf[i]', sub, False) + """
            }
            buf[threadNum] = myresult;
        /*Comment this optimization as it don't work on Fermi GPU.
@@ -1002,7 +1049,7 @@ class GpuCAReduce(GpuOp):
             current_version += self._assign_reduce(node, name,
                                                    'buf[threadNum]',
                                                    'buf[threadNum+%d]' % num,
-                                                   sub)
+                                                   sub, False)
             current_version += """
            if (threadNum == 0)
            {
@@ -1019,7 +1066,7 @@ class GpuCAReduce(GpuOp):
             this_if = "if (threadNum + %d < threadCount) " % num + \
                 self._assign_reduce(node, name,
                                     'buf[threadNum]', 'buf[threadNum+%d]' % num,
-                                    sub)
+                                    sub, False)
             current_version += this_if
             current_version += """
            if (threadNum == 0)
@@ -1037,7 +1084,8 @@ class GpuCAReduce(GpuOp):
     #Threads must be organized as: threadNum%nb_reduce correspond to the same sum
     #nb_reduce<=warpSize
     def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
-        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {})
+        reduce_fct = self._assign_reduce(node, name, 'myresult',
+                                         'buf[i]', {}, True)
         return """
         __syncthreads(); // some kernel do multiple reduction.
         buf[threadNum] = myresult;
@@ -1609,7 +1657,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_ccontig_%(nodename)s(
@@ -1640,7 +1688,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_1_%(nodename)s(
@@ -1671,7 +1719,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
@@ -1755,7 +1803,8 @@ class GpuCAReduce(GpuOp):
         reduce_fct = self._assign_reduce(
             node, nodename, "myresult",
             "A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
-            {})
+            {}, True)
         print >> sio, """
        %(decl)s{
            %(init)s
@@ -1783,7 +1832,7 @@ class GpuCAReduce(GpuOp):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_010_%(nodename)s(
@@ -1822,7 +1871,7 @@ class GpuCAReduce(GpuOp):
         if self.reduce_mask == (0, 1, 0):
             reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                              "X[a * sX0 + b * sX1 + c * sX2]",
-                                             {})
+                                             {}, True)
             reduce_init = self._assign_init("X[a * sX0 + 0 * sX1 + c * sX2]")
             print >> sio, """
        static __global__ void kernel_reduce_010_AD_%(nodename)s(
@@ -1882,7 +1931,7 @@ class GpuCAReduce(GpuOp):
                                                 'blockDim.x')
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + 0 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -1918,7 +1967,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub = {})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_110_%(nodename)s(
@@ -1959,7 +2008,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i1 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -1986,7 +2035,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2013,7 +2062,7 @@ class GpuCAReduce(GpuOp):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_001_%(nodename)s(
@@ -2056,7 +2105,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        %(decl)s
@@ -2089,7 +2138,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2120,7 +2169,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2146,7 +2195,7 @@ class GpuCAReduce(GpuOp):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_1011_%(nodename)s(
...
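The pre_scalar_op code generation above leans on a detail of Theano's scalar Op interface: calling c_code with an empty output name makes the op emit a fragment of the form " = <expr>;", and stripping the leading " = " and the trailing ";" leaves a bare C expression that can be substituted as the reduction's right operand. A toy illustration of that string surgery (plain Python with sqr's output hard-coded, not the real Op API):

    def fuse_pre_scalar_op(right):
        # What sqr's c_code would emit for input `right` and an
        # empty output name:
        t = " = %s * %s;" % (right, right)
        assert t.startswith(' = ')
        expr = t[3:]
        if expr[-1] == ';':
            expr = expr[:-1]
        return expr

    print fuse_pre_scalar_op("A[i0]")   # prints: A[i0] * A[i0]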
 from theano import Op, Apply
 from theano.compat.six import StringIO
-from theano.sandbox.cuda import GpuOp
+from theano.sandbox.cuda import GpuOp, as_cuda_ndarray_variable
 from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
                                                 inline_softmax,
                                                 inline_softmax_fixed_shared)

-class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
+class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
     """
     Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
     """
@@ -216,7 +216,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()

-class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
+class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
     """
     Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
     """
@@ -364,7 +364,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
 gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()

-class GpuSoftmax (GpuOp):
+class GpuSoftmax(GpuOp):
     """
     Implement Softmax on the gpu.
     """
@@ -483,8 +483,8 @@ class GpuSoftmax (GpuOp):
     def c_support_code_apply(self, node, nodename):
         ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
                 params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                        'const float * x', 'const int sx0', 'const int sx1',
+                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
                 body=[
                     "extern __shared__ float buf[]",
                     "float * buf2 = buf + N",
@@ -506,8 +506,8 @@ class GpuSoftmax (GpuOp):
                     ])
         ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
                 params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                        'const float * x', 'const int sx0', 'const int sx1',
+                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
                 body=[
                     "extern __shared__ float buf[]",
                     "for (int blockIDX = blockIdx.x; blockIDX < M;"
@@ -525,7 +525,7 @@ class GpuSoftmax (GpuOp):
 gpu_softmax = GpuSoftmax()

-class GpuSoftmaxWithBias (GpuOp):
+class GpuSoftmaxWithBias(GpuOp):
     """
     Implement SoftmaxWithBias on the gpu.
     """
@@ -545,7 +545,7 @@ class GpuSoftmaxWithBias (GpuOp):
         return Apply(self, [x, b], [x.type()])

     def infer_shape(self, node, shape):
         return [shape[0]]

     def c_code_cache_version(self):
         #return ()
@@ -660,12 +660,13 @@ class GpuSoftmaxWithBias (GpuOp):
         """ % locals()

     def c_support_code_apply(self, node, nodename):
-        ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
-                params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'const float * b', 'const int sb0',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                body=[
+        ret1 = nvcc_kernel(
+            "kSoftmaxWithBias_%s" % nodename,
+            params=['int M', 'int N',
+                    'const float * x', 'const int sx0', 'const int sx1',
+                    'const float * b', 'const int sb0',
+                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
+            body=[
                 "extern __shared__ float buf[]",
                 "float * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
@@ -683,7 +684,7 @@ class GpuSoftmaxWithBias (GpuOp):
                 "}",
                 "__syncthreads()",
                 "}",
-                ])
+            ])
         ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                 params=['int M', 'int N',
                         'const float * x',
...
@@ -684,6 +684,24 @@ def local_gpu_careduce(node):
     return False

+@register_opt("low_memory")
+@local_optimizer([GpuCAReduce])
+def local_gpu_elemwise_careduce(node):
+    if (isinstance(node.op, GpuCAReduce) and
+        node.op.pre_scalar_op is None and
+        node.inputs[0].owner and
+        isinstance(node.inputs[0].owner.op, GpuElemwise) and
+        # The op supports all scalar ops with 1 input. We don't
+        # automatically add more cases, as some, like a trigonometric
+        # operation combined with some reduction patterns, would
+        # probably cause a slowdown.
+        isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)
+       ):
+        op = node.op
+        inp = node.inputs[0].owner.inputs[0]
+        return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)]
+
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Reshape])
 def local_gpu_reshape(node):
...
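The rewrite above replaces the two-node pattern GpuCAReduce(GpuElemwise{sqr}(x)) with a single GpuCAReduce carrying pre_scalar_op=sqr, so the intermediate buffer holding sqr(x) is never allocated. A hedged sketch of how one could observe it firing (mode_with_gpu is assumed to be the GPU-enabled mode used by the CUDA test suite; it is not defined here):

    import numpy
    import theano
    import theano.tensor as tensor

    x = tensor.fmatrix('x')
    f = theano.function([x], (x * x).sum(), mode=mode_with_gpu)
    # After local_gpu_elemwise_careduce fires, no GpuElemwise{sqr} node
    # remains; the reduction node itself carries the pre_scalar_op.
    assert any(getattr(n.op, 'pre_scalar_op', None) is not None
               for n in f.maker.fgraph.toposort())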
@@ -60,6 +60,10 @@ def test_careduce():
                  1110,1101,1011
    TODO: test with broadcast

+    We test with the pre_scalar_op sqr in all cases. This covers all
+    the code paths, with and without the pre_scalar_op.
    """
    for scalar_op, careduce_op in [
            (theano.scalar.mul, tensor.elemwise.CAReduceDtype),
@@ -132,7 +136,7 @@ def test_careduce():
        pat = tensor_pattern_to_gpu_pattern(shape, pattern)

        a = tensor.TensorType('float32', (False,) * len(shape))()
-        b = op(a)
+        b = op(a*a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#        val = numpy.ones(shape)
#        val = numpy.arange(numpy.prod(shape)).reshape(shape)
@@ -142,6 +146,10 @@ def test_careduce():
        assert tcn.GpuCAReduce in [x.op.__class__
                                   for x in f.maker.fgraph.toposort()], (
                                       scalar_op, shape, pattern)
+        if tcn.GpuElemwise in [x.op.__class__
+                               for x in f.maker.fgraph.toposort()]:
+            assert tcn.GpuReshape in [x.op.__class__
+                                      for x in f.maker.fgraph.toposort()]
        assert op.__class__ in [x.op.__class__
                                for x in f2.maker.fgraph.toposort()], (
                                    scalar_op, shape, pattern)
@@ -210,7 +218,7 @@ def test_careduce():
            dim_pattern[0] = 1
            dim_pattern[1] = 0
            a = a.dimshuffle(dim_pattern)
-            b = op(a)
+            b = op(a*a)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#            val = numpy.ones(shape)
#            val = numpy.arange(numpy.prod(shape)).reshape(shape)
@@ -220,6 +228,8 @@ def test_careduce():
            assert tcn.GpuCAReduce in [x.op.__class__
                                       for x in f.maker.fgraph.toposort()], (
                                           scalar_op, shape, pattern)
+            assert tcn.GpuElemwise not in [x.op.__class__
+                                           for x in f.maker.fgraph.toposort()]
            assert op.__class__ in [x.op.__class__
                                    for x in f2.maker.fgraph.toposort()], (
                                        scalar_op, shape, pattern)
@@ -242,8 +252,8 @@ def test_careduce():
            shape = numpy.asarray(shape) * 2
            a = tensor.TensorType('float32', (False,) * len(shape))()
            a2 = tcn.CudaNdarrayType((False,) * len(shape))()
-            b = op(a)
-            b2 = op(a2)
+            b = op(a*a)
+            b2 = op(a2*a2)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#            val = numpy.ones(shape)
#            val = numpy.arange(numpy.prod(shape)).reshape(shape)
@@ -266,6 +276,8 @@ def test_careduce():
            assert tcn.GpuCAReduce in [x.op.__class__
                                       for x in f2.maker.fgraph.toposort()], (
                                           scalar_op, shape, pattern)
+            assert tcn.GpuElemwise not in [x.op.__class__
+                                           for x in f.maker.fgraph.toposort()]
            assert op.__class__ in [x.op.__class__
                                    for x in f.maker.fgraph.toposort()], (
                                        scalar_op, shape, pattern)
...
@@ -22,6 +22,15 @@ from type import GpuArrayType

 def as_gpuarray_variable(x):
+    # This is needed to lower the number of useless transfers
+    # introduced during optimization. It speeds up optimization and
+    # "canonicalizes" the graph, which makes some optimizations
+    # easier to apply.
+    if (hasattr(x, 'fgraph') and
+        len(x.clients) == 1 and
+        x.owner and
+        isinstance(x.owner.op, HostFromGpu)):
+        return x.owner.inputs[0]
     if hasattr(x, '_as_GpuArrayVariable'):
         return x._as_GpuArrayVariable()
     # TODO we need to have the cuda -> gpu path taken care of.
...
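The as_gpuarray_variable hunk removes a host round trip: when a variable is a host_from_gpu copy whose only client is this transfer back to the GPU, the original GPU variable is reused directly. A toy model of the rule in plain Python (illustrative classes, not the Theano API):

    class Apply(object):
        def __init__(self, op, inputs):
            self.op, self.inputs = op, inputs

    class HostFromGpu(object):
        pass

    class Var(object):
        def __init__(self, owner=None, clients=None):
            self.owner = owner
            self.clients = clients if clients is not None else []

    def as_gpu(x):
        # Only elide when x was produced by a GPU->host copy and this
        # transfer is its sole client, so shared host uses are kept.
        if (x.owner is not None and
                isinstance(x.owner.op, HostFromGpu) and
                len(x.clients) == 1):
            return x.owner.inputs[0]
        return x

    gpu_y = Var()
    host_y = Var(owner=Apply(HostFromGpu(), [gpu_y]), clients=['one use'])
    assert as_gpu(host_y) is gpu_y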
@@ -570,10 +570,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
     GPUs are not especially well-suited to reduction operations so it is
     quite possible that the GPU might be slower for some cases.

+    pre_scalar_op: if present, must be a scalar op with only 1
+    input. We will execute it on the input value before reduction.
     """
     def __init__(self, scalar_op, axis=None,
-                 reduce_mask=None, dtype=None, acc_dtype=None):
+                 reduce_mask=None, dtype=None, acc_dtype=None,
+                 pre_scalar_op=None):
         if reduce_mask is not None:
             reduce_mask = tuple(reduce_mask)
         self.reduce_mask = reduce_mask
@@ -583,6 +587,9 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         self._n_scalar_op_calls = 0
         CAReduceDtype.__init__(self, scalar_op, axis=axis,
                                dtype=dtype, acc_dtype=acc_dtype)
+        self.pre_scalar_op = pre_scalar_op
+        if pre_scalar_op:
+            assert pre_scalar_op.nin == 1

     def __eq__(self, other):
         return (type(self) == type(other) and
@@ -590,7 +597,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                 self.reduce_mask == other.reduce_mask and
                 self.dtype == other.dtype and
                 self.acc_dtype == other.acc_dtype and
-                self.scalar_op == other.scalar_op)
+                self.scalar_op == other.scalar_op and
+                self.pre_scalar_op == other.pre_scalar_op)

     def __hash__(self):
         return (hash(type(self)) ^
@@ -598,19 +606,35 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                 hash(self.reduce_mask) ^
                 hash(self.dtype) ^
                 hash(self.acc_dtype) ^
-                hash(type(self.scalar_op)))
+                hash(type(self.scalar_op)) ^
+                hash(type(self.pre_scalar_op)))

     def __str__(self):
+        pre = ""
+        if self.pre_scalar_op:
+            pre = "pre=%s,red=" % str(self.pre_scalar_op)
         ax = ''
         if self.axis is not None:
             ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
-        return "GpuCAReduceCuda{%s}%s" % (str(self.scalar_op), ax)
+        return "GpuCAReduceCuda{%s%s}%s" % (pre, str(self.scalar_op), ax)

+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        # For unpickling of old ops.
+        if not hasattr(self, "pre_scalar_op"):
+            self.pre_scalar_op = None
+
     def make_node(self, x):
         x = as_gpuarray_variable(x)
         ret = super(GpuCAReduceCuda, self).make_node(x)
         self = copy.copy(self)
         self.axis = ret.op.axis
+        if self.pre_scalar_op:
+            # Currently we have only tested pre_scalar_ops that don't
+            # cause an upcast.
+            d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
+            assert d1.dtype == ret.outputs[0].dtype
+            assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
         if self.reduce_mask is None:
             if self.axis is None:
                 reduce_mask = [1] * x.type.ndim
@@ -1010,15 +1034,33 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         else:
             assert isinstance(self.scalar_op, (scal.Maximum,
                                                scal.Minimum))
+        if self.pre_scalar_op:  # TODO, multi_dtype!
+            #dtype = node.inputs[0].dtype
+            dtype = 'float32'
+            dummy_var = scal.Scalar(dtype=dtype)()
+            dummy_node = self.pre_scalar_op.make_node(dummy_var)
+            dummy_name = 'assign_init_pre_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (first_item,), ("",), {})
+            assert t.startswith(' = ')
+            first_item = t[3:]
+            if first_item[-1] == ';':
+                first_item = first_item[:-1]
         return first_item

-    def _assign_reduce(self, node, name, left, right, sub):
+    def _assign_reduce(self, node, name, left, right, sub, pre):
         """
         node: the node argument to this op's c_code
         name: the name argument to this op's c_code
         left: a C code string identifying an lvalue
         right: a C code string identifying an expression
         sub: the sub argument to this op's c_code
+        pre: if True, we will add the pre_scalar_op.c_code

         returns C code to reduce left and right, assigning the
         result to left."""
@@ -1035,6 +1077,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
         self._n_scalar_op_calls += 1
+        if pre and self.pre_scalar_op:
+            assert left == "myresult"
+            dummy_node = self.pre_scalar_op.make_node(dummy_left)
+            dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (right,), ("",), sub)
+            assert t.startswith(' = ')
+            right = t[3:]
+            if right[-1] == ';':
+                right = right[:-1]
         return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
                                      (left,), sub)
@@ -1064,7 +1118,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                int idx = threadNum - (threadCount >> 1) * 2;"""
         new_version += self._assign_reduce(node, name, 'buf[idx]',
-                                           'buf[threadNum]', sub)
+                                           'buf[threadNum]', sub, False)
         new_version += """
        }
@@ -1084,7 +1138,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                    """
         new_version += self._assign_reduce(node, name,
-                                           'buf[threadNum]', 'temp', sub)
+                                           'buf[threadNum]', 'temp', sub, False)
         new_version += """
            }
@@ -1115,7 +1169,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
           {
           """
         current_version += self._assign_reduce(node, name,
-                                               'myresult', 'buf[i]', sub) + """
+                                               'myresult', 'buf[i]',
+                                               sub, False) + """
           }
           buf[threadNum] = myresult;
        /*Comment this optimization as it don't work on Fermi GPU.
@@ -1127,7 +1182,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
             current_version += self._assign_reduce(node, name,
                                                    'buf[threadNum]',
                                                    'buf[threadNum+%d]' % num,
-                                                   sub)
+                                                   sub, False)
             current_version += """
             """
             current_version += """
@@ -1146,7 +1201,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
             this_if = "if (threadNum + %d < threadCount) " % num + \
                 self._assign_reduce(node, name,
                                     'buf[threadNum]', 'buf[threadNum+%d]' % num,
-                                    sub)
+                                    sub, False)
             current_version += this_if
             current_version += """
             """
@@ -1166,7 +1221,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
     #Threads must be organized as: threadNum%nb_reduce correspond to the same sum
     #nb_reduce<=warpSize
     def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
-        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {})
+        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {}, False)
         return """
         __syncthreads(); // some kernel do multiple reduction.
         buf[threadNum] = myresult;
@@ -1767,7 +1822,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_ccontig_%(nodename)s(
@@ -1798,7 +1853,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_1_%(nodename)s(
@@ -1829,7 +1884,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
@@ -1913,7 +1968,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reduce_fct = self._assign_reduce(
             node, nodename, "myresult",
             "A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
-            {})
+            {}, True)
         print >> sio, """
        %(decl)s{
            %(init)s
@@ -1941,7 +1996,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_010_%(nodename)s(
@@ -1980,7 +2035,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         if self.reduce_mask == (0, 1, 0):
             reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                              "X[a * sX0 + b * sX1 + c * sX2]",
-                                             {})
+                                             {}, True)
             reduce_init = self._assign_init("X[a * sX0 + 0 * sX1 + c * sX2]")
             print >> sio, """
        static __global__ void kernel_reduce_010_AD_%(nodename)s(
@@ -2040,7 +2095,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                                 'blockDim.x')
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + 0 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2076,7 +2131,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub = {})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_110_%(nodename)s(
@@ -2117,7 +2172,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i1 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2144,7 +2199,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2171,7 +2226,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_001_%(nodename)s(
@@ -2214,7 +2269,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        %(decl)s
@@ -2247,7 +2302,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2278,7 +2333,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2304,7 +2359,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_1011_%(nodename)s(
...
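Note that the make_node hunk above only asserts, rather than handles, dtype stability: a fused pre_scalar_op must leave the input dtype unchanged, which holds for sqr but not for ops that upcast. A minimal numpy illustration of the invariant being checked:

    import numpy
    x = numpy.random.rand(4).astype('float32')
    # sqr keeps float32, so fusing it ahead of the reduction is safe:
    assert (x * x).dtype == numpy.dtype('float32')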
@@ -563,6 +563,27 @@ def local_gpu_conv(node):
         return [out]

+@register_opt("low_memory")
+@local_optimizer([GpuCAReduceCuda])
+def local_gpu_elemwise_careduce(node):
+    """Merge some GpuCAReduceCuda and GpuElemwise nodes."""
+    if (isinstance(node.op, GpuCAReduceCuda) and
+        node.op.pre_scalar_op is None and
+        node.inputs[0].owner and
+        isinstance(node.inputs[0].owner.op, GpuElemwise) and
+        # The op supports all scalar ops with 1 input. We don't
+        # automatically add more cases, as some, like a trigonometric
+        # operation combined with some reduction patterns, would
+        # probably cause a slowdown.
+        isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)
+       ):
+        op = node.op
+        inp = node.inputs[0].owner.inputs[0]
+        return [GpuCAReduceCuda(scalar_op=op.scalar_op,
+                                reduce_mask=op.reduce_mask,
+                                pre_scalar_op=scalar.basic.sqr)(inp)]
+
 def tensor_to_gpu(x):
     if isinstance(x.type, tensor.TensorType):
         y = GpuArrayType(broadcastable=x.type.broadcastable,
...
@@ -40,11 +40,13 @@ class test_GpuCAReduceCPY(test_CAReduce):
     bin_dtypes = ["uint8", "int8"]
     op = GpuCAReduceCPY
     reds = [scalar.add, scalar.mul]
+    pre_scalar_op = None

     def test_perform(self):
         for dtype in self.dtypes + self.bin_dtypes:
             for op in self.reds:
-                self.with_linker(gof.PerformLinker(), op, dtype=dtype)
+                self.with_linker(gof.PerformLinker(), op, dtype=dtype,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_perform_nan(self):
         for dtype in self.dtypes:
@@ -52,12 +54,14 @@ class test_GpuCAReduceCPY(test_CAReduce):
                 continue
             for op in self.reds:
                 self.with_linker(gof.PerformLinker(), op, dtype=dtype,
-                                 test_nan=True)
+                                 test_nan=True,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_c(self):
         for dtype in self.dtypes + self.bin_dtypes:
             for op in self.reds:
-                self.with_linker(gof.CLinker(), op, dtype=dtype)
+                self.with_linker(gof.CLinker(), op, dtype=dtype,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_c_nan(self):
         for dtype in self.dtypes:
@@ -65,7 +69,8 @@ class test_GpuCAReduceCPY(test_CAReduce):
                 continue
             for op in self.reds:
                 self.with_linker(gof.CLinker(), op, dtype=dtype,
-                                 test_nan=True)
+                                 test_nan=True,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_infer_shape(self):
         for dtype in self.dtypes:
@@ -148,6 +153,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
     op = GpuCAReduceCuda
     reds = [scalar.add, scalar.mul,
             scalar.maximum, scalar.minimum]
+    pre_scalar_op = scalar.sqr

     def test_perform(self):
         return
...
@@ -133,3 +133,13 @@ def test_print_op():
     assert isinstance(topo[2].op, GpuElemwise)
     assert topo[3].op == host_from_gpu
     f(numpy.random.random((5, 5)).astype('float32'))
+
+
+def test_local_gpu_elemwise_careduce():
+    x = theano.tensor.matrix()
+    o = (x*x).sum()
+    f = theano.function([x], o, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert len(topo) == 3
+    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
+    f(numpy.random.rand(3, 4).astype(theano.config.floatX))
@@ -308,15 +308,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
         ]

     def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
+                    pre_scalar_op=None,
                     test_nan=False, tensor_op=None):
         for xsh, tosum in self.cases:
             if dtype == "floatX":
                 dtype = theano.config.floatX
             x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
+            d = {}
+            if pre_scalar_op is not None:
+                d = {"pre_scalar_op": pre_scalar_op}
             if tensor_op is None:
-                e = as_tensor_variable(self.op(scalar_op, axis=tosum)(x))
+                e = as_tensor_variable(self.op(scalar_op, axis=tosum, **d)(x))
             else:
-                e = as_tensor_variable(tensor_op(x, axis=tosum))
+                e = as_tensor_variable(tensor_op(x, axis=tosum, **d))
             if tosum is None:
                 tosum = range(len(xsh))
@@ -337,6 +341,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
             else:
                 xv = numpy.asarray(numpy.nan, dtype=dtype)
             zv = xv
+            if pre_scalar_op is not None:
+                zv = Elemwise(scalar_op=pre_scalar_op)(x).eval({x: xv})
             numpy_raised = False
             if len(tosum) > 1 and any([a < 0 for a in tosum]):
                 #In that case, we need to use the good order of axis
@@ -505,16 +511,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
                 self.with_linker(gof.CLinker(), scalar.maximum, dtype=dtype,
                                  test_nan=True)

-    def test_infer_shape(self, dtype=None):
+    def test_infer_shape(self, dtype=None, pre_scalar_op=None):
         if dtype is None:
             dtype = theano.config.floatX
         for xsh, tosum in self.cases:
             x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
+            if pre_scalar_op is not None:
+                x = pre_scalar_op(x)
             if tosum is None:
                 tosum = range(len(xsh))
             xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
+            d = {}
+            if pre_scalar_op is not None:
+                xv = x.eval({x.owner.inputs[0]: xv})
+                d = {"pre_scalar_op": pre_scalar_op}
             self._compile_and_check([x],
-                                    [self.op(scalar.add, axis=tosum)(x)],
+                                    [self.op(scalar.add, axis=tosum, **d)(x)],
                                     [xv], self.op,
                                     ["local_cut_useless_reduce"],
                                     warn=0 not in xsh)
...