提交 750d7815 authored 作者: Frederic's avatar Frederic

Allow GpuCAReduce to apply a unary elemwise operation to its input.

The optimization that merges the Elemwise into the reduction is limited to sqr; any other scalar op would first need to be timed, as merging it could slow things down.
上级 1cd49b15
......@@ -35,7 +35,7 @@ from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias, GpuSqrSumAx0)
GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu
......@@ -685,17 +685,22 @@ def local_gpu_careduce(node):
return False
@register_opt()#"fast_compile")
@register_opt("low_memory")
@local_optimizer([GpuCAReduce])
def local_gpu_sqr_sum_ax0(node):
def local_gpu_elemwise_careduce(node):
if (isinstance(node.op, GpuCAReduce) and
isinstance(node.op.scalar_op, theano.scalar.basic.Add) and
node.op.reduce_mask == (1, 0) and
node.op.pre_scalar_op is None and
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuElemwise) and
isinstance(node.inputs[0].owner.op.scalar_op, theano.scalar.basic.Sqr)
# The Op supports all scalar ops with 1 input. We don't
# automatically add more cases, as some, like trigonometric
# operations with some reduction patterns, will probably result
# in a slowdown.
isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)
):
return [GpuSqrSumAx0()(node.inputs[0].owner.inputs[0])]
op = node.op
inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)]
@register_opt()
......
......@@ -60,6 +60,10 @@ def test_careduce():
1110,1101,1011
TODO: test with broadcast
We test with the pre_scalar_op sqr in all cases. This covers all
code paths, both with and without the pre_scalar_op.
"""
for scalar_op, careduce_op in [
(theano.scalar.mul, tensor.elemwise.CAReduceDtype),
......@@ -132,7 +136,7 @@ def test_careduce():
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
a = tensor.TensorType('float32', (False,) * len(shape))()
b = op(a)
b = op(a*a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
......@@ -142,6 +146,10 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
if tcn.GpuElemwise in [x.op.__class__
for x in f.maker.fgraph.toposort()]:
assert tcn.GpuReshape in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
......@@ -210,7 +218,7 @@ def test_careduce():
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = op(a)
b = op(a*a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
......@@ -220,6 +228,8 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
......@@ -242,8 +252,8 @@ def test_careduce():
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = op(a)
b2 = op(a2)
b = op(a*a)
b2 = op(a2*a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
......@@ -266,6 +276,8 @@ def test_careduce():
assert tcn.GpuCAReduce in [x.op.__class__
for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern)
......
......@@ -264,10 +264,24 @@ def test_sqr_sum_ax0():
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
cmp(10, 15)
cmp(120000, 15)
cmp(15, 120000)
cmp(4000, 4000)
cmp(0, 15)
cmp(10, 0)
cmp(0, 0)
#cmp(10, 15)
#cmp(120000, 15)
#cmp(15, 120000)
#cmp(4000, 4000)
#cmp(0, 15)
#cmp(10, 0)
#cmp(0, 0)
m = mode_with_gpu.excluding("local_gpu_sqr_sum_ax0")
f_gpu2 = theano.function([x], z, mode=m)
n, m = 4000, 4000
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
import time
t0 = time.time()
for i in range(1000):
f_gpu(data)
t1 = time.time()
for i in range(1000):
f_gpu2(data)
t2 = time.time()
print t1 - t0, t2 - t1
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论