提交 0d3dffac authored 作者: abergeron's avatar abergeron

Merge pull request #1888 from nouiz/gpu_sqr_sum_ax0

Add GpuSqrSumAx0 to lower the memory usage on the GPU.
from theano import Op, Apply from theano import Op, Apply
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp, as_cuda_ndarray_variable
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax, inline_softmax,
inline_softmax_fixed_shared) inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
""" """
...@@ -216,7 +216,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -216,7 +216,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
""" """
...@@ -364,7 +364,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -364,7 +364,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (GpuOp): class GpuSoftmax(GpuOp):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
...@@ -483,8 +483,8 @@ class GpuSoftmax (GpuOp): ...@@ -483,8 +483,8 @@ class GpuSoftmax (GpuOp):
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ float buf[]", "extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
...@@ -506,8 +506,8 @@ class GpuSoftmax (GpuOp): ...@@ -506,8 +506,8 @@ class GpuSoftmax (GpuOp):
]) ])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename, ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ float buf[]", "extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
...@@ -525,7 +525,7 @@ class GpuSoftmax (GpuOp): ...@@ -525,7 +525,7 @@ class GpuSoftmax (GpuOp):
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (GpuOp): class GpuSoftmaxWithBias(GpuOp):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
...@@ -545,7 +545,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -545,7 +545,7 @@ class GpuSoftmaxWithBias (GpuOp):
return Apply(self, [x, b], [x.type()]) return Apply(self, [x, b], [x.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
...@@ -660,12 +660,13 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -660,12 +660,13 @@ class GpuSoftmaxWithBias (GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, ret1 = nvcc_kernel(
params=['int M', 'int N', "kSoftmaxWithBias_%s" % nodename,
'const float * x', 'const int sx0', 'const int sx1', params=['int M', 'int N',
'const float * b', 'const int sb0', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'const float * b', 'const int sb0',
body=[ 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]", "extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
...@@ -683,7 +684,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -683,7 +684,7 @@ class GpuSoftmaxWithBias (GpuOp):
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const float * x',
......
...@@ -684,6 +684,24 @@ def local_gpu_careduce(node): ...@@ -684,6 +684,24 @@ def local_gpu_careduce(node):
return False return False
@register_opt("low_memory")
@local_optimizer([GpuCAReduce])
def local_gpu_elemwise_careduce(node):
    """Fold a squaring GpuElemwise into the GpuCAReduce that consumes it.

    The rewrite applies when ``node`` is a ``GpuCAReduce`` without a
    ``pre_scalar_op`` whose first input is produced by a ``GpuElemwise``
    whose scalar op is ``Sqr``.  The replacement is a ``GpuCAReduce`` with
    the same ``reduce_mask`` and ``scalar_op`` plus ``pre_scalar_op=sqr``,
    applied directly to the input of the elemwise.

    The optimization is registered under the "low_memory" tag; presumably
    the gain is that the squared intermediate no longer has to be stored.

    :param node: the Apply node being considered by the optimizer.
    :return: a one-element list holding the replacement output, or None
        when the pattern does not match.
    """
    reduce_op = node.op
    if not isinstance(reduce_op, GpuCAReduce):
        return None
    if reduce_op.pre_scalar_op is not None:
        return None

    producer = node.inputs[0].owner
    if not producer:
        return None
    if not isinstance(producer.op, GpuElemwise):
        return None

    # The reduction Op supports any scalar op with a single input as
    # pre_scalar_op, but only Sqr is fused here: other ops (for example
    # trigonometric ones) would probably cause a slowdown with some
    # reduction patterns, so they are not added automatically.
    if not isinstance(producer.op.scalar_op, scal.basic.Sqr):
        return None

    fused_reduce = GpuCAReduce(reduce_op.reduce_mask,
                               reduce_op.scalar_op,
                               scal.basic.sqr)
    return [fused_reduce(producer.inputs[0])]
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.Reshape]) @local_optimizer([gpu_from_host, tensor.Reshape])
def local_gpu_reshape(node): def local_gpu_reshape(node):
......
...@@ -60,6 +60,10 @@ def test_careduce(): ...@@ -60,6 +60,10 @@ def test_careduce():
1110,1101,1011 1110,1101,1011
TODO: test with broadcast TODO: test with broadcast
We test with the pre_scalar_op sqr in all cases. This covers all
the code, both with and without the pre_scalar_op.
""" """
for scalar_op, careduce_op in [ for scalar_op, careduce_op in [
(theano.scalar.mul, tensor.elemwise.CAReduceDtype), (theano.scalar.mul, tensor.elemwise.CAReduceDtype),
...@@ -132,7 +136,7 @@ def test_careduce(): ...@@ -132,7 +136,7 @@ def test_careduce():
pat = tensor_pattern_to_gpu_pattern(shape, pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
b = op(a) b = op(a*a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
...@@ -142,6 +146,10 @@ def test_careduce(): ...@@ -142,6 +146,10 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__ assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()], ( for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
if tcn.GpuElemwise in [x.op.__class__
for x in f.maker.fgraph.toposort()]:
assert tcn.GpuReshape in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__ assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()], ( for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
...@@ -210,7 +218,7 @@ def test_careduce(): ...@@ -210,7 +218,7 @@ def test_careduce():
dim_pattern[0] = 1 dim_pattern[0] = 1
dim_pattern[1] = 0 dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern) a = a.dimshuffle(dim_pattern)
b = op(a) b = op(a*a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
...@@ -220,6 +228,8 @@ def test_careduce(): ...@@ -220,6 +228,8 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__ assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()], ( for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__ assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()], ( for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
...@@ -242,8 +252,8 @@ def test_careduce(): ...@@ -242,8 +252,8 @@ def test_careduce():
shape = numpy.asarray(shape) * 2 shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))() a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = op(a) b = op(a*a)
b2 = op(a2) b2 = op(a2*a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
...@@ -266,6 +276,8 @@ def test_careduce(): ...@@ -266,6 +276,8 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__ assert tcn.GpuCAReduce in [x.op.__class__
for x in f2.maker.fgraph.toposort()], ( for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__ assert op.__class__ in [x.op.__class__
for x in f.maker.fgraph.toposort()], ( for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
......
...@@ -22,6 +22,15 @@ from type import GpuArrayType ...@@ -22,6 +22,15 @@ from type import GpuArrayType
def as_gpuarray_variable(x): def as_gpuarray_variable(x):
# This is needed to lower the number of useless transfers
# introduced during optimization. This speeds up optimization and
# "canonicalizes" the graph, which makes some optimizations easier
# to write.
if (hasattr(x, 'fgraph') and
len(x.clients) == 1 and
x.owner and
isinstance(x.owner.op, HostFromGpu)):
return x.owner.inputs[0]
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable() return x._as_GpuArrayVariable()
# TODO we need to have the cuda -> gpu path taken care of. # TODO we need to have the cuda -> gpu path taken care of.
......
...@@ -563,6 +563,27 @@ def local_gpu_conv(node): ...@@ -563,6 +563,27 @@ def local_gpu_conv(node):
return [out] return [out]
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node):
    """Merge a squaring GpuElemwise into the GpuCAReduceCuda that follows it.

    The pattern matched is a ``GpuCAReduceCuda`` with no ``pre_scalar_op``
    whose first input comes from a ``GpuElemwise`` with a ``Sqr`` scalar
    op.  It is replaced by a ``GpuCAReduceCuda`` built with the same
    ``scalar_op`` and ``reduce_mask`` and with ``pre_scalar_op`` set to
    ``sqr``, applied to the input of the elemwise.

    :param node: the Apply node being considered by the optimizer.
    :return: a one-element list holding the replacement output, or None
        when the pattern does not match.
    """
    reduce_op = node.op
    if not isinstance(reduce_op, GpuCAReduceCuda):
        return None
    if reduce_op.pre_scalar_op is not None:
        return None

    producer = node.inputs[0].owner
    if not producer:
        return None
    if not isinstance(producer.op, GpuElemwise):
        return None

    # The reduction Op supports any scalar op with a single input as
    # pre_scalar_op, but only Sqr is fused here: other ops (for example
    # trigonometric ones) would probably cause a slowdown with some
    # reduction patterns, so they are not added automatically.
    if not isinstance(producer.op.scalar_op, scalar.basic.Sqr):
        return None

    fused_reduce = GpuCAReduceCuda(scalar_op=reduce_op.scalar_op,
                                   reduce_mask=reduce_op.reduce_mask,
                                   pre_scalar_op=scalar.basic.sqr)
    return [fused_reduce(producer.inputs[0])]
def tensor_to_gpu(x): def tensor_to_gpu(x):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
y = GpuArrayType(broadcastable=x.type.broadcastable, y = GpuArrayType(broadcastable=x.type.broadcastable,
......
...@@ -40,11 +40,13 @@ class test_GpuCAReduceCPY(test_CAReduce): ...@@ -40,11 +40,13 @@ class test_GpuCAReduceCPY(test_CAReduce):
bin_dtypes = ["uint8", "int8"] bin_dtypes = ["uint8", "int8"]
op = GpuCAReduceCPY op = GpuCAReduceCPY
reds = [scalar.add, scalar.mul] reds = [scalar.add, scalar.mul]
pre_scalar_op = None
def test_perform(self): def test_perform(self):
for dtype in self.dtypes + self.bin_dtypes: for dtype in self.dtypes + self.bin_dtypes:
for op in self.reds: for op in self.reds:
self.with_linker(gof.PerformLinker(), op, dtype=dtype) self.with_linker(gof.PerformLinker(), op, dtype=dtype,
pre_scalar_op=self.pre_scalar_op)
def test_perform_nan(self): def test_perform_nan(self):
for dtype in self.dtypes: for dtype in self.dtypes:
...@@ -52,12 +54,14 @@ class test_GpuCAReduceCPY(test_CAReduce): ...@@ -52,12 +54,14 @@ class test_GpuCAReduceCPY(test_CAReduce):
continue continue
for op in self.reds: for op in self.reds:
self.with_linker(gof.PerformLinker(), op, dtype=dtype, self.with_linker(gof.PerformLinker(), op, dtype=dtype,
test_nan=True) test_nan=True,
pre_scalar_op=self.pre_scalar_op)
def test_c(self): def test_c(self):
for dtype in self.dtypes + self.bin_dtypes: for dtype in self.dtypes + self.bin_dtypes:
for op in self.reds: for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype) self.with_linker(gof.CLinker(), op, dtype=dtype,
pre_scalar_op=self.pre_scalar_op)
def test_c_nan(self): def test_c_nan(self):
for dtype in self.dtypes: for dtype in self.dtypes:
...@@ -65,7 +69,8 @@ class test_GpuCAReduceCPY(test_CAReduce): ...@@ -65,7 +69,8 @@ class test_GpuCAReduceCPY(test_CAReduce):
continue continue
for op in self.reds: for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype, self.with_linker(gof.CLinker(), op, dtype=dtype,
test_nan=True) test_nan=True,
pre_scalar_op=self.pre_scalar_op)
def test_infer_shape(self): def test_infer_shape(self):
for dtype in self.dtypes: for dtype in self.dtypes:
...@@ -148,6 +153,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -148,6 +153,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
op = GpuCAReduceCuda op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul, reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum] scalar.maximum, scalar.minimum]
pre_scalar_op = scalar.sqr
def test_perform(self): def test_perform(self):
return return
......
...@@ -133,3 +133,13 @@ def test_print_op(): ...@@ -133,3 +133,13 @@ def test_print_op():
assert isinstance(topo[2].op, GpuElemwise) assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu assert topo[3].op == host_from_gpu
f(numpy.random.random((5, 5)).astype('float32')) f(numpy.random.random((5, 5)).astype('float32'))
def test_local_gpu_elemwise_careduce():
    """Check that ``(x * x).sum()`` is compiled to a reduction with a fused sqr.

    The graph is compiled with ``mode_with_gpu`` and must contain exactly
    three nodes, the middle one being a reduction whose ``pre_scalar_op``
    is ``theano.scalar.sqr`` (i.e. the elemwise square was merged into the
    reduction).  The compiled function is then run and its result compared
    with the same computation done in NumPy.
    """
    x = theano.tensor.matrix()
    o = (x * x).sum()
    f = theano.function([x], o, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    # Presumably: transfer to the GPU, fused reduction, transfer back.
    assert len(topo) == 3
    assert topo[1].op.pre_scalar_op == theano.scalar.sqr

    data = numpy.random.rand(3, 4).astype(theano.config.floatX)
    # Running the function only proves it does not crash; also make sure
    # the fused reduction computes the sum of squares.
    assert numpy.allclose(f(data), (data * data).sum())
...@@ -308,15 +308,19 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -308,15 +308,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
] ]
def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX", def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
pre_scalar_op=None,
test_nan=False, tensor_op=None): test_nan=False, tensor_op=None):
for xsh, tosum in self.cases: for xsh, tosum in self.cases:
if dtype == "floatX": if dtype == "floatX":
dtype = theano.config.floatX dtype = theano.config.floatX
x = TensorType(dtype, [(entry == 1) for entry in xsh])('x') x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
d = {}
if pre_scalar_op is not None:
d = {"pre_scalar_op": pre_scalar_op}
if tensor_op is None: if tensor_op is None:
e = as_tensor_variable(self.op(scalar_op, axis=tosum)(x)) e = as_tensor_variable(self.op(scalar_op, axis=tosum, **d)(x))
else: else:
e = as_tensor_variable(tensor_op(x, axis=tosum)) e = as_tensor_variable(tensor_op(x, axis=tosum, **d))
if tosum is None: if tosum is None:
tosum = range(len(xsh)) tosum = range(len(xsh))
...@@ -337,6 +341,8 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -337,6 +341,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
else: else:
xv = numpy.asarray(numpy.nan, dtype=dtype) xv = numpy.asarray(numpy.nan, dtype=dtype)
zv = xv zv = xv
if pre_scalar_op is not None:
zv = Elemwise(scalar_op=pre_scalar_op)(x).eval({x: xv})
numpy_raised = False numpy_raised = False
if len(tosum) > 1 and any([a < 0 for a in tosum]): if len(tosum) > 1 and any([a < 0 for a in tosum]):
#In that case, we need to use the good order of axis #In that case, we need to use the good order of axis
...@@ -505,16 +511,22 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -505,16 +511,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
self.with_linker(gof.CLinker(), scalar.maximum, dtype=dtype, self.with_linker(gof.CLinker(), scalar.maximum, dtype=dtype,
test_nan=True) test_nan=True)
def test_infer_shape(self, dtype=None, pre_scalar_op=None):
    """Run the shape-inference check of ``self.op`` on every case.

    :param dtype: dtype of the symbolic input and of the test value;
        defaults to ``theano.config.floatX`` when None.
    :param pre_scalar_op: optional scalar op.  When given, it is applied
        to the symbolic input, the test value is obtained by evaluating
        that expression, and the op is also forwarded to ``self.op`` as
        the ``pre_scalar_op`` keyword argument.
    """
    if dtype is None:
        dtype = theano.config.floatX
    for xsh, tosum in self.cases:
        x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
        if pre_scalar_op is not None:
            x = pre_scalar_op(x)
        if tosum is None:
            tosum = range(len(xsh))
        xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
        d = {}
        if pre_scalar_op is not None:
            xv = x.eval({x.owner.inputs[0]: xv})
            # The key must be the keyword name (a string) and the dict
            # must be expanded with ``**``, as with_linker does; using the
            # op object as key and ``*d`` would pass the op positionally
            # and clash with ``axis=``.
            d = {"pre_scalar_op": pre_scalar_op}
        self._compile_and_check([x],
                                [self.op(scalar.add, axis=tosum, **d)(x)],
                                [xv], self.op,
                                ["local_cut_useless_reduce"],
                                warn=0 not in xsh)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论