提交 c930d96a authored 作者: Frederic's avatar Frederic

Fix the optimization local_gpu_dot_to_dot22 to move more dot products to the GPU in fast_compile mode.
上级 f8a19f5b
...@@ -350,7 +350,7 @@ def local_gpu_specifyShape_0(node): ...@@ -350,7 +350,7 @@ def local_gpu_specifyShape_0(node):
@register_opt() @register_opt()
@local_optimizer([gpu_from_host]) # XXX: broken: tensor.basic.dot is not an op @local_optimizer([GpuFromHost, tensor.basic.Dot])
def local_gpu_dot_to_dot22(node): def local_gpu_dot_to_dot22(node):
""" """
gpu_from_host(dot) -> gpudot(gpu_from_host) gpu_from_host(dot) -> gpudot(gpu_from_host)
...@@ -361,6 +361,8 @@ def local_gpu_dot_to_dot22(node): ...@@ -361,6 +361,8 @@ def local_gpu_dot_to_dot22(node):
the output. the output.
A more suitable solution would be to use the right cublas call A more suitable solution would be to use the right cublas call
This is needed in fast_compile
""" """
# In case the got do input upcast, we much check that we can # In case the got do input upcast, we much check that we can
...@@ -369,17 +371,18 @@ def local_gpu_dot_to_dot22(node): ...@@ -369,17 +371,18 @@ def local_gpu_dot_to_dot22(node):
if node.outputs[0].type.dtype != 'float32': if node.outputs[0].type.dtype != 'float32':
return False return False
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op == tensor.basic.dot: if host_input.owner and isinstance(host_input.owner.op,
tensor.basic.Dot):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
# case one: vector X matrix # case one: vector X matrix
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ['x', 0]) new_op = GpuDimShuffle((False,), ('x', 0))
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(gpu_from_host(x))
gpu_y = gpu_from_host(y) gpu_y = gpu_from_host(y)
# case two: matrix X vector # case two: matrix X vector
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), [0, 'x']) new_op = GpuDimShuffle((False,), (0, 'x'))
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(gpu_from_host(y))
...@@ -387,20 +390,20 @@ def local_gpu_dot_to_dot22(node): ...@@ -387,20 +390,20 @@ def local_gpu_dot_to_dot22(node):
return False return False
return [GpuReshape(1)(gpu_dot22(gpu_x, gpu_y), shape_out)] return [GpuReshape(1)(gpu_dot22(gpu_x, gpu_y), shape_out)]
if node.op == tensor.basic.dot: if isinstance(node.op, tensor.basic.Dot):
if node.outputs[0].type.dtype != 'float32': if node.outputs[0].type.dtype != 'float32':
return False return False
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]): for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ['x', 0]) new_op = GpuDimShuffle((False,), ('x', 0))
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(gpu_from_host(x))
gpu_y = gpu_from_host(y) gpu_y = gpu_from_host(y)
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), [0, 'x']) new_op = GpuDimShuffle((False,), (0, 'x'))
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(gpu_from_host(y))
......
...@@ -404,6 +404,32 @@ def test_erfinvgpu(): ...@@ -404,6 +404,32 @@ def test_erfinvgpu():
assert numpy.allclose(f(xv),f2(xv)) assert numpy.allclose(f(xv),f2(xv))
def test_local_gpu_dot_to_dot22dot():
    """Check that local_gpu_dot_to_dot22 lifts vector/matrix dot to the GPU.

    For both orientations (vector x matrix and matrix x vector) this
    builds ``tensor.dot(a, b)`` on GPU shared variables, asserts that the
    optimizer transforms the dot node, and checks the compiled result
    against ``numpy.dot``.  It then re-runs the function on a reversed
    view of ``a`` to exercise negatively-strided input.
    """
    def cmp(a_shp, b_shp):
        a0 = numpy.random.rand(*a_shp).astype('float32')
        a = cuda.shared_constructor(a0, 'a')
        b0 = numpy.random.rand(*b_shp).astype('float32')
        # Fixed: this shared variable was mistakenly also named 'a'
        # (copy-paste error); name it 'b' to match the value it holds.
        b = cuda.shared_constructor(b0, 'b')
        f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
        # The local optimizer must fire on the un-optimized dot node.
        assert cuda.opt.local_gpu_dot_to_dot22.transform(
            tensor.dot(a, b).owner)
        out = f()
        assert numpy.allclose(numpy.dot(a0, b0), out)

        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(
            a.get_value(borrow=True,
                        return_internal_type=True)[::-1],
            borrow=True)
        f()

    cmp((4,), (4, 5))
    cmp((3, 4), (4,))
class test_diag(theano.tensor.tests.test_nlinalg.test_diag): class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
mode = mode_with_gpu mode = mode_with_gpu
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论