提交 0d3dffac authored 作者: abergeron's avatar abergeron

Merge pull request #1888 from nouiz/gpu_sqr_sum_ax0

Add GpuSqrSumAx0 to lower the memory usage on the GPU.
from theano import Op, Apply
from theano.compat.six import StringIO
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda import GpuOp, as_cuda_ndarray_variable
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
"""
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
"""
......@@ -216,7 +216,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
"""
......@@ -364,7 +364,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (GpuOp):
class GpuSoftmax(GpuOp):
"""
Implement Softmax on the gpu.
"""
......@@ -483,8 +483,8 @@ class GpuSoftmax (GpuOp):
def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
......@@ -506,8 +506,8 @@ class GpuSoftmax (GpuOp):
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
......@@ -525,7 +525,7 @@ class GpuSoftmax (GpuOp):
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (GpuOp):
class GpuSoftmaxWithBias(GpuOp):
"""
Implement SoftmaxWithBias on the gpu.
"""
......@@ -545,7 +545,7 @@ class GpuSoftmaxWithBias (GpuOp):
return Apply(self, [x, b], [x.type()])
def infer_shape(self, node, shape):
return [shape[0]]
return [shape[0]]
def c_code_cache_version(self):
#return ()
......@@ -660,12 +660,13 @@ class GpuSoftmaxWithBias (GpuOp):
""" % locals()
def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
ret1 = nvcc_kernel(
"kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
......@@ -683,7 +684,7 @@ class GpuSoftmaxWithBias (GpuOp):
"}",
"__syncthreads()",
"}",
])
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x',
......
......@@ -684,6 +684,24 @@ def local_gpu_careduce(node):
return False
@register_opt("low_memory")
@local_optimizer([GpuCAReduce])
def local_gpu_elemwise_careduce(node):
    """Fuse a preceding GpuElemwise(Sqr) into a GpuCAReduce.

    Rewrites ``GpuCAReduce(GpuElemwise{sqr}(x))`` into a single
    ``GpuCAReduce`` whose ``pre_scalar_op`` is ``sqr``, so the squared
    intermediate tensor is never materialized on the GPU.
    """
    if not isinstance(node.op, GpuCAReduce):
        return
    if node.op.pre_scalar_op is not None:
        return
    producer = node.inputs[0].owner
    # The reduction supports any 1-input scalar op as pre_scalar_op, but
    # only Sqr is fused automatically: fusing e.g. trigonometric ops with
    # some reduction patterns would probably cause a slowdown.
    if (producer and
            isinstance(producer.op, GpuElemwise) and
            isinstance(producer.op.scalar_op, scal.basic.Sqr)):
        red = node.op
        return [GpuCAReduce(red.reduce_mask, red.scalar_op,
                            scal.basic.sqr)(producer.inputs[0])]
@register_opt()
@local_optimizer([gpu_from_host, tensor.Reshape])
def local_gpu_reshape(node):
......
......@@ -60,6 +60,10 @@ def test_careduce():
1110,1101,1011
TODO: test with broadcast
We test with the pre_scalar_op sqr in all cases. This covers all the
code paths, both with and without the pre_scalar_op.
"""
for scalar_op, careduce_op in [
(theano.scalar.mul, tensor.elemwise.CAReduceDtype),
......@@ -132,7 +136,7 @@ def test_careduce():
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
a = tensor.TensorType('float32', (False,) * len(shape))()
b = op(a)
b = op(a*a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
......@@ -142,6 +146,10 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
if tcn.GpuElemwise in [x.op.__class__
for x in f.maker.fgraph.toposort()]:
assert tcn.GpuReshape in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
......@@ -210,7 +218,7 @@ def test_careduce():
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = op(a)
b = op(a*a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
......@@ -220,6 +228,8 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
......@@ -242,8 +252,8 @@ def test_careduce():
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = op(a)
b2 = op(a2)
b = op(a*a)
b2 = op(a2*a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
......@@ -266,6 +276,8 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__
for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
......
......@@ -22,6 +22,15 @@ from type import GpuArrayType
def as_gpuarray_variable(x):
# This is needed to lower the number of useless transfers
# introduced during optimization. It speeds up optimization and
# "canonicalizes" the graph, which makes some later optimizations
# easier to apply.
if (hasattr(x, 'fgraph') and
len(x.clients) == 1 and
x.owner and
isinstance(x.owner.op, HostFromGpu)):
return x.owner.inputs[0]
if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable()
# TODO we need to have the cuda -> gpu path taken care of.
......
......@@ -563,6 +563,27 @@ def local_gpu_conv(node):
return [out]
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node):
    """Merge a GpuElemwise(Sqr) into the following GpuCAReduceCuda.

    The elemwise square is folded into the reduction as its
    ``pre_scalar_op``, lowering GPU memory usage.
    """
    red = node.op
    if not isinstance(red, GpuCAReduceCuda) or red.pre_scalar_op is not None:
        return
    producer = node.inputs[0].owner
    if producer is None or not isinstance(producer.op, GpuElemwise):
        return
    # The reduction supports any 1-input scalar op as pre_scalar_op, but
    # only Sqr is fused automatically: fusing e.g. trigonometric ops with
    # some reduction patterns would probably cause a slowdown.
    if isinstance(producer.op.scalar_op, scalar.basic.Sqr):
        fused = GpuCAReduceCuda(scalar_op=red.scalar_op,
                                reduce_mask=red.reduce_mask,
                                pre_scalar_op=scalar.basic.sqr)
        return [fused(producer.inputs[0])]
def tensor_to_gpu(x):
if isinstance(x.type, tensor.TensorType):
y = GpuArrayType(broadcastable=x.type.broadcastable,
......
......@@ -40,11 +40,13 @@ class test_GpuCAReduceCPY(test_CAReduce):
bin_dtypes = ["uint8", "int8"]
op = GpuCAReduceCPY
reds = [scalar.add, scalar.mul]
pre_scalar_op = None
def test_perform(self):
    """Exercise every reduction in self.reds over all supported dtypes
    with the Python (perform) linker, forwarding the class-level
    pre_scalar_op so the fused code path is also covered."""
    for dt in self.dtypes + self.bin_dtypes:
        for red in self.reds:
            self.with_linker(gof.PerformLinker(), red, dtype=dt,
                             pre_scalar_op=self.pre_scalar_op)
def test_perform_nan(self):
for dtype in self.dtypes:
......@@ -52,12 +54,14 @@ class test_GpuCAReduceCPY(test_CAReduce):
continue
for op in self.reds:
self.with_linker(gof.PerformLinker(), op, dtype=dtype,
test_nan=True)
test_nan=True,
pre_scalar_op=self.pre_scalar_op)
def test_c(self):
    """Exercise every reduction in self.reds over all supported dtypes
    with the C linker, forwarding the class-level pre_scalar_op so the
    fused code path is also covered."""
    for dt in self.dtypes + self.bin_dtypes:
        for red in self.reds:
            self.with_linker(gof.CLinker(), red, dtype=dt,
                             pre_scalar_op=self.pre_scalar_op)
def test_c_nan(self):
for dtype in self.dtypes:
......@@ -65,7 +69,8 @@ class test_GpuCAReduceCPY(test_CAReduce):
continue
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype,
test_nan=True)
test_nan=True,
pre_scalar_op=self.pre_scalar_op)
def test_infer_shape(self):
for dtype in self.dtypes:
......@@ -148,6 +153,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum]
pre_scalar_op = scalar.sqr
def test_perform(self):
return
......
......@@ -133,3 +133,13 @@ def test_print_op():
assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu
f(numpy.random.random((5, 5)).astype('float32'))
def test_local_gpu_elemwise_careduce():
    """Check that sum(x*x) is compiled into a single fused GPU reduction
    whose pre_scalar_op is sqr (the local_gpu_elemwise_careduce opt)."""
    inp = theano.tensor.matrix()
    fn = theano.function([inp], (inp * inp).sum(), mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    # transfer-in, fused reduction, transfer-out: no separate elemwise node
    assert len(nodes) == 3
    assert nodes[1].op.pre_scalar_op == theano.scalar.sqr
    # And it must actually run.
    fn(numpy.random.rand(3, 4).astype(theano.config.floatX))
......@@ -308,15 +308,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
]
def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
pre_scalar_op=None,
test_nan=False, tensor_op=None):
for xsh, tosum in self.cases:
if dtype == "floatX":
dtype = theano.config.floatX
x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
d = {}
if pre_scalar_op is not None:
d = {"pre_scalar_op": pre_scalar_op}
if tensor_op is None:
e = as_tensor_variable(self.op(scalar_op, axis=tosum)(x))
e = as_tensor_variable(self.op(scalar_op, axis=tosum, **d)(x))
else:
e = as_tensor_variable(tensor_op(x, axis=tosum))
e = as_tensor_variable(tensor_op(x, axis=tosum, **d))
if tosum is None:
tosum = range(len(xsh))
......@@ -337,6 +341,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
else:
xv = numpy.asarray(numpy.nan, dtype=dtype)
zv = xv
if pre_scalar_op is not None:
zv = Elemwise(scalar_op=pre_scalar_op)(x).eval({x: xv})
numpy_raised = False
if len(tosum) > 1 and any([a < 0 for a in tosum]):
#In that case, we need to use the good order of axis
......@@ -505,16 +511,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
self.with_linker(gof.CLinker(), scalar.maximum, dtype=dtype,
test_nan=True)
def test_infer_shape(self, dtype=None, pre_scalar_op=None):
    """Check self.op's infer_shape against the actual output shapes.

    If `pre_scalar_op` is given (e.g. scalar.sqr), it is applied to the
    symbolic input and also passed to the reduction Op, exercising the
    fused pre_scalar_op code path.
    """
    if dtype is None:
        dtype = theano.config.floatX
    for xsh, tosum in self.cases:
        x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
        if pre_scalar_op is not None:
            x = pre_scalar_op(x)
        if tosum is None:
            tosum = range(len(xsh))
        xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
        d = {}
        if pre_scalar_op is not None:
            xv = x.eval({x.owner.inputs[0]: xv})
            # BUG FIX: the kwargs dict must map the parameter *name*
            # (string) to the value — not the op object to itself.
            d = {"pre_scalar_op": pre_scalar_op}
        self._compile_and_check([x],
                                # BUG FIX: use ** (keyword unpacking);
                                # *d would unpack the dict's keys as
                                # positional arguments and raise.
                                [self.op(scalar.add, axis=tosum, **d)(x)],
                                [xv], self.op,
                                ["local_cut_useless_reduce"],
                                warn=0 not in xsh)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论