提交 0ac9ec62 作者: Frederic

[ENH] Make opt use the opencl version of the op for opencl device

上级 58e9b7fb
...@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import ( ...@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import (
GpuSoftmaxWithBias, GpuSoftmax GpuSoftmaxWithBias, GpuSoftmax
) )
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda) GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor, from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node):
def local_gpua_careduce(node): def local_gpua_careduce(node):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device
if dev.startswith('opencl'):
op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code.
return
else:
op = GpuCAReduceCuda
x, = node.inputs x, = node.inputs
greduce = GpuCAReduceCuda(
greduce = op(
node.op.scalar_op, axis=node.op.axis, node.op.scalar_op, axis=node.op.axis,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if gvar.owner.op.supports_c_code([gpu_from_host(x)]): if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
return greduce return greduce
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
...@@ -407,7 +418,7 @@ def local_gpua_careduce(node): ...@@ -407,7 +418,7 @@ def local_gpua_careduce(node):
for idx, m in enumerate(new_mask): for idx, m in enumerate(new_mask):
if m == 1: if m == 1:
new_axis.append(idx) new_axis.append(idx)
greduce = GpuCAReduceCuda( greduce = op(
node.op.scalar_op, node.op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
......
...@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray ...@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import ( from theano.sandbox.gpuarray.basic_ops import (
GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu) GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise from theano.sandbox.gpuarray.elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
from theano.sandbox.gpuarray.tests.test_basic_ops import ( from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu rand_gpuarray, mode_with_gpu, mode_without_gpu
) )
...@@ -50,17 +51,26 @@ def test_flatten(): ...@@ -50,17 +51,26 @@ def test_flatten():
def test_reduce(): def test_reduce():
for method in ['sum', 'prod', 'max', 'min']: dev = theano.sandbox.gpuarray.init_dev.device
for method, param in [('sum', dict(acc_dtype='float32')),
('prod', dict(acc_dtype='float32')),
('max', {}), ('min', {})]:
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], getattr(m, method)(axis=0), f = theano.function([m], getattr(m, method)(axis=0,
**param),
mode=mode_with_gpu) mode=mode_with_gpu)
val = numpy.random.rand(10, 11).astype("float32") val = numpy.random.rand(10, 11).astype("float32")
res = f(val) res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0)) utt.assert_allclose(res, getattr(val, method)(axis=0))
assert res.shape == (11,) assert res.shape == (11,)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert GpuCAReduceCuda in [type(node.op) ops = [type(node.op) for node in topo]
for node in topo], topo
if dev.startswith('opencl') and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
def test_local_gpualloc_memset_0(): def test_local_gpualloc_memset_0():
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论