提交 1bf7ea39 作者: Frédéric Bastien

Merge pull request #2972 from t13m/opt_as_cuda_ndarray_variable

Remove useless gpu_from_host(host_from_gpu(x)) op.
...@@ -34,6 +34,14 @@ _logger = logging.getLogger(_logger_name) ...@@ -34,6 +34,14 @@ _logger = logging.getLogger(_logger_name)
def as_cuda_ndarray_variable(x): def as_cuda_ndarray_variable(x):
if x.owner:
if isinstance(x.owner.op, HostFromGpu):
return x.owner.inputs[0]
elif \
isinstance(x.owner.op, GpuFromHost) and \
x.owner.inputs[0].owner and \
isinstance(x.owner.inputs[0].owner.op, HostFromGpu):
return x.owner.inputs[0].owner.inputs[0]
if hasattr(x, '_as_CudaNdarrayVariable'): if hasattr(x, '_as_CudaNdarrayVariable'):
return x._as_CudaNdarrayVariable() return x._as_CudaNdarrayVariable()
tensor_x = tensor.as_tensor_variable(x) tensor_x = tensor.as_tensor_variable(x)
......
...@@ -19,6 +19,7 @@ from theano.compile import optdb ...@@ -19,6 +19,7 @@ from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, toolbox) Optimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import LocalMetaOptimizer
from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
gpu_eye, gpu_contiguous, gpu_eye, gpu_contiguous,
gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu, gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
...@@ -314,7 +315,7 @@ def local_gpu_elemwise_1(node): ...@@ -314,7 +315,7 @@ def local_gpu_elemwise_1(node):
return False return False
if all([i.dtype == 'float32' for i in elemwise_node.inputs]): if all([i.dtype == 'float32' for i in elemwise_node.inputs]):
gpu_elemwise = new_op(*[gpu_from_host(i) gpu_elemwise = new_op(*[as_cuda_ndarray_variable(i)
for i in elemwise_node.inputs]) for i in elemwise_node.inputs])
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner) gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
if not gpu_elemwise: if not gpu_elemwise:
...@@ -334,7 +335,7 @@ def local_gpu_split(node): ...@@ -334,7 +335,7 @@ def local_gpu_split(node):
any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
in outs_clients])): in outs_clients])):
new_op = GpuSplit(node.op.len_splits) new_op = GpuSplit(node.op.len_splits)
split_res = new_op(gpu_from_host(input), *node.inputs[1:], split_res = new_op(as_cuda_ndarray_variable(input), *node.inputs[1:],
return_list=True) return_list=True)
return [host_from_gpu(o) for o in split_res] return [host_from_gpu(o) for o in split_res]
return False return False
...@@ -353,7 +354,7 @@ def local_gpu_dimshuffle_0(node): ...@@ -353,7 +354,7 @@ def local_gpu_dimshuffle_0(node):
# move the add to a GpuAdd # move the add to a GpuAdd
new_op = GpuDimShuffle(node.op.input_broadcastable, new_op = GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
return [host_from_gpu(new_op(gpu_from_host(input)))] return [host_from_gpu(new_op(as_cuda_ndarray_variable(input)))]
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
...@@ -361,7 +362,7 @@ def local_gpu_dimshuffle_0(node): ...@@ -361,7 +362,7 @@ def local_gpu_dimshuffle_0(node):
dimshuffle_node = host_input.owner dimshuffle_node = host_input.owner
new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable, new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
dimshuffle_node.op.new_order) dimshuffle_node.op.new_order)
return [new_op(gpu_from_host(dimshuffle_node.inputs[0]))] return [new_op(as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
return False return False
...@@ -375,7 +376,7 @@ def local_gpu_specifyShape_0(node): ...@@ -375,7 +376,7 @@ def local_gpu_specifyShape_0(node):
if isinstance(node.op, tensor.SpecifyShape): if isinstance(node.op, tensor.SpecifyShape):
input = node.inputs[0] input = node.inputs[0]
if input.owner and isinstance(input.owner.op, HostFromGpu): if input.owner and isinstance(input.owner.op, HostFromGpu):
return [host_from_gpu(tensor.specify_shape(gpu_from_host(input), return [host_from_gpu(tensor.specify_shape(as_cuda_ndarray_variable(input),
*node.inputs[1:]))] *node.inputs[1:]))]
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -383,7 +384,7 @@ def local_gpu_specifyShape_0(node): ...@@ -383,7 +384,7 @@ def local_gpu_specifyShape_0(node):
tensor.SpecifyShape): tensor.SpecifyShape):
specifyshape_node = host_input.owner specifyshape_node = host_input.owner
return [tensor.specify_shape( return [tensor.specify_shape(
gpu_from_host(specifyshape_node.inputs[0]), as_cuda_ndarray_variable(specifyshape_node.inputs[0]),
*specifyshape_node.inputs[1:])] *specifyshape_node.inputs[1:])]
return False return False
...@@ -417,14 +418,14 @@ def local_gpu_dot_to_dot22(node): ...@@ -417,14 +418,14 @@ def local_gpu_dot_to_dot22(node):
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ('x', 0)) new_op = GpuDimShuffle((False,), ('x', 0))
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(as_cuda_ndarray_variable(x))
gpu_y = gpu_from_host(y) gpu_y = as_cuda_ndarray_variable(y)
# case two: matrix X vector # case two: matrix X vector
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), (0, 'x')) new_op = GpuDimShuffle((False,), (0, 'x'))
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = as_cuda_ndarray_variable(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(as_cuda_ndarray_variable(y))
else: else:
return False return False
...@@ -438,14 +439,14 @@ def local_gpu_dot_to_dot22(node): ...@@ -438,14 +439,14 @@ def local_gpu_dot_to_dot22(node):
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ('x', 0)) new_op = GpuDimShuffle((False,), ('x', 0))
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(as_cuda_ndarray_variable(x))
gpu_y = gpu_from_host(y) gpu_y = as_cuda_ndarray_variable(y)
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), (0, 'x')) new_op = GpuDimShuffle((False,), (0, 'x'))
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = as_cuda_ndarray_variable(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(as_cuda_ndarray_variable(y))
else: else:
return False return False
...@@ -504,7 +505,7 @@ def local_gpu_lazy_ifelse(node): ...@@ -504,7 +505,7 @@ def local_gpu_lazy_ifelse(node):
for i in range(len(outs)): for i in range(len(outs)):
if (not isinstance(outs[i].type, CudaNdarrayType) and if (not isinstance(outs[i].type, CudaNdarrayType) and
outs[i].dtype == 'float32'): outs[i].dtype == 'float32'):
outs[i] = gpu_from_host(outs[i]) outs[i] = as_cuda_ndarray_variable(outs[i])
outs = gpu_ifelse(c, *outs, return_list=True) outs = gpu_ifelse(c, *outs, return_list=True)
for i in range(len(outs)): for i in range(len(outs)):
if isinstance(outs[i].type, CudaNdarrayType): if isinstance(outs[i].type, CudaNdarrayType):
...@@ -536,7 +537,7 @@ def local_gpu_lazy_ifelse(node): ...@@ -536,7 +537,7 @@ def local_gpu_lazy_ifelse(node):
for i in range(len(outs)): for i in range(len(outs)):
if (not isinstance(outs[i].type, CudaNdarrayType) and if (not isinstance(outs[i].type, CudaNdarrayType) and
outs[i].dtype == 'float32'): outs[i].dtype == 'float32'):
outs[i] = gpu_from_host(outs[i]) outs[i] = as_cuda_ndarray_variable(outs[i])
outs = gpu_ifelse.make_node(c, *outs).outputs outs = gpu_ifelse.make_node(c, *outs).outputs
return outs return outs
...@@ -556,13 +557,13 @@ def local_gpu_dot22(node): ...@@ -556,13 +557,13 @@ def local_gpu_dot22(node):
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Dot22): tensor.blas.Dot22):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))] return [gpu_dot22(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))]
if isinstance(node.op, tensor.blas.Dot22): if isinstance(node.op, tensor.blas.Dot22):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
return [host_from_gpu(gpu_dot22(gpu_from_host(x), return [host_from_gpu(gpu_dot22(as_cuda_ndarray_variable(x),
gpu_from_host(y)))] as_cuda_ndarray_variable(y)))]
return False return False
...@@ -580,15 +581,15 @@ def local_gpu_dot22scalar(node): ...@@ -580,15 +581,15 @@ def local_gpu_dot22scalar(node):
isinstance(host_input.owner.op, isinstance(host_input.owner.op,
tensor.blas.Dot22Scalar)): tensor.blas.Dot22Scalar)):
x, y, scalar = host_input.owner.inputs x, y, scalar = host_input.owner.inputs
return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y), return [gpu_dot22scalar(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y),
tensor.blas._as_scalar(scalar))] tensor.blas._as_scalar(scalar))]
if isinstance(node.op, tensor.blas.Dot22Scalar): if isinstance(node.op, tensor.blas.Dot22Scalar):
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]): for i in node.inputs]):
x, y, scalar = node.inputs x, y, scalar = node.inputs
return [host_from_gpu( return [host_from_gpu(
gpu_dot22scalar(gpu_from_host(x), gpu_dot22scalar(as_cuda_ndarray_variable(x),
gpu_from_host(y), as_cuda_ndarray_variable(y),
tensor.blas._as_scalar(scalar)))] tensor.blas._as_scalar(scalar)))]
return False return False
...@@ -606,15 +607,15 @@ def local_gpu_solve(node): ...@@ -606,15 +607,15 @@ def local_gpu_solve(node):
isinstance(host_input.owner.op, isinstance(host_input.owner.op,
slinalg.Solve)): slinalg.Solve)):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_solve(gpu_from_host(x), gpu_from_host(y))] return [gpu_solve(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))]
if isinstance(node.op, slinalg.Solve): if isinstance(node.op, slinalg.Solve):
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]): for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
return [host_from_gpu( return [host_from_gpu(
gpu_solve(gpu_from_host(x), gpu_solve(as_cuda_ndarray_variable(x),
gpu_from_host(y)))] as_cuda_ndarray_variable(y)))]
return False return False
...@@ -634,10 +635,10 @@ def local_gpu_gemv(node): ...@@ -634,10 +635,10 @@ def local_gpu_gemv(node):
if host_input.owner and isinstance(host_input.owner.op, gemvs): if host_input.owner and isinstance(host_input.owner.op, gemvs):
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [gpu_gemv_no_inplace( return [gpu_gemv_no_inplace(
gpu_from_host(z), as_cuda_ndarray_variable(z),
a, a,
gpu_from_host(x), as_cuda_ndarray_variable(x),
gpu_from_host(y), as_cuda_ndarray_variable(y),
b)] b)]
if isinstance(node.op, gemvs): if isinstance(node.op, gemvs):
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
...@@ -647,10 +648,10 @@ def local_gpu_gemv(node): ...@@ -647,10 +648,10 @@ def local_gpu_gemv(node):
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu( return [host_from_gpu(
gpu_gemv_no_inplace( gpu_gemv_no_inplace(
gpu_from_host(z), as_cuda_ndarray_variable(z),
a, a,
gpu_from_host(x), as_cuda_ndarray_variable(x),
gpu_from_host(y), as_cuda_ndarray_variable(y),
b))] b))]
return False return False
...@@ -674,10 +675,10 @@ def local_gpu_ger(node): ...@@ -674,10 +675,10 @@ def local_gpu_ger(node):
if host_input.owner and isinstance(host_input.owner.op, gers): if host_input.owner and isinstance(host_input.owner.op, gers):
z, a, x, y = host_input.owner.inputs z, a, x, y = host_input.owner.inputs
return [gpu_ger_no_inplace( return [gpu_ger_no_inplace(
gpu_from_host(z), as_cuda_ndarray_variable(z),
a, a,
gpu_from_host(x), as_cuda_ndarray_variable(x),
gpu_from_host(y) as_cuda_ndarray_variable(y)
)] )]
if isinstance(node.op, gers): if isinstance(node.op, gers):
z, a, x, y = node.inputs z, a, x, y = node.inputs
...@@ -687,10 +688,10 @@ def local_gpu_ger(node): ...@@ -687,10 +688,10 @@ def local_gpu_ger(node):
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu( return [host_from_gpu(
gpu_ger_no_inplace( gpu_ger_no_inplace(
gpu_from_host(z), as_cuda_ndarray_variable(z),
a, a,
gpu_from_host(x), as_cuda_ndarray_variable(x),
gpu_from_host(y) as_cuda_ndarray_variable(y)
))] ))]
return False return False
...@@ -708,10 +709,10 @@ def local_gpu_gemm(node): ...@@ -708,10 +709,10 @@ def local_gpu_gemm(node):
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Gemm): tensor.blas.Gemm):
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [gpu_gemm_no_inplace(gpu_from_host(z), return [gpu_gemm_no_inplace(as_cuda_ndarray_variable(z),
a, a,
gpu_from_host(x), as_cuda_ndarray_variable(x),
gpu_from_host(y), as_cuda_ndarray_variable(y),
b)] b)]
if isinstance(node.op, tensor.blas.Gemm): if isinstance(node.op, tensor.blas.Gemm):
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
...@@ -719,10 +720,10 @@ def local_gpu_gemm(node): ...@@ -719,10 +720,10 @@ def local_gpu_gemm(node):
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu)) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu)) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z), return [host_from_gpu(gpu_gemm_no_inplace(as_cuda_ndarray_variable(z),
a, a,
gpu_from_host(x), as_cuda_ndarray_variable(x),
gpu_from_host(y), as_cuda_ndarray_variable(y),
b))] b))]
return False return False
...@@ -783,8 +784,8 @@ def local_gpu_careduce(node): ...@@ -783,8 +784,8 @@ def local_gpu_careduce(node):
reduce_mask[a] = 1 reduce_mask[a] = 1
greduce = GpuCAReduce(reduce_mask, scalar_op) greduce = GpuCAReduce(reduce_mask, scalar_op)
out = node.outputs[0] out = node.outputs[0]
if greduce.supports_c_code([gpu_from_host(x)]): if greduce.supports_c_code([as_cuda_ndarray_variable(x)]):
rval = host_from_gpu(greduce(gpu_from_host(x))) rval = host_from_gpu(greduce(as_cuda_ndarray_variable(x)))
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have # The principle is that if two adjacent dimensions have
...@@ -807,7 +808,7 @@ def local_gpu_careduce(node): ...@@ -807,7 +808,7 @@ def local_gpu_careduce(node):
new_greduce = GpuCAReduce(new_mask, scalar_op) new_greduce = GpuCAReduce(new_mask, scalar_op)
reshaped_x = x.reshape(tensor.stack(*new_in_shp)) reshaped_x = x.reshape(tensor.stack(*new_in_shp))
gpu_reshaped_x = gpu_from_host(reshaped_x) gpu_reshaped_x = as_cuda_ndarray_variable(reshaped_x)
reshaped_gpu_inputs = [gpu_reshaped_x] reshaped_gpu_inputs = [gpu_reshaped_x]
if new_greduce.supports_c_code(reshaped_gpu_inputs): if new_greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu( reduce_reshaped_x = host_from_gpu(
...@@ -876,7 +877,7 @@ def local_gpu_reshape(node): ...@@ -876,7 +877,7 @@ def local_gpu_reshape(node):
isinstance(host_input.owner.op, tensor.Reshape): isinstance(host_input.owner.op, tensor.Reshape):
rshp = host_input.owner.op rshp = host_input.owner.op
x, shp = host_input.owner.inputs x, shp = host_input.owner.inputs
gpu_reshape = GpuReshape(rshp.ndim)(gpu_from_host(x), shp) gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x), shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable: if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
# this can happen as we always return False for all broadcast # this can happen as we always return False for all broadcast
# dim in GpuReshape but not for Reshape # dim in GpuReshape but not for Reshape
...@@ -910,7 +911,7 @@ def local_gpu_flatten(node): ...@@ -910,7 +911,7 @@ def local_gpu_flatten(node):
isinstance(host_input.owner.op, tensor.Flatten): isinstance(host_input.owner.op, tensor.Flatten):
outdim = host_input.owner.op.outdim outdim = host_input.owner.op.outdim
return [GpuFlatten(outdim)( return [GpuFlatten(outdim)(
gpu_from_host(host_input.owner.inputs[0]))] as_cuda_ndarray_variable(host_input.owner.inputs[0]))]
if isinstance(node.op, tensor.Flatten): if isinstance(node.op, tensor.Flatten):
x, = node.inputs x, = node.inputs
outdim = node.op.outdim outdim = node.op.outdim
...@@ -935,7 +936,7 @@ def local_gpu_subtensor(node): ...@@ -935,7 +936,7 @@ def local_gpu_subtensor(node):
# to the GPU in that case. # to the GPU in that case.
return return
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)] return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x), *coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and if (x.owner and
...@@ -951,7 +952,7 @@ def local_gpu_subtensor(node): ...@@ -951,7 +952,7 @@ def local_gpu_subtensor(node):
for n, _ in node.outputs[0].clients]): for n, _ in node.outputs[0].clients]):
return return
else: else:
return [host_from_gpu(gpu_from_host(node.outputs[0]))] return [host_from_gpu(as_cuda_ndarray_variable(node.outputs[0]))]
return return
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
...@@ -970,7 +971,7 @@ def local_gpu_advanced_subtensor1(node): ...@@ -970,7 +971,7 @@ def local_gpu_advanced_subtensor1(node):
host_input.owner.op.__class__ is tensor.AdvancedSubtensor1: host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
x = host_input.owner.inputs[0] x = host_input.owner.inputs[0]
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuAdvancedSubtensor1()(gpu_from_host(x), *coords)] return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x), *coords)]
if node.op.__class__ is tensor.AdvancedSubtensor1: if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
...@@ -1010,7 +1011,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1010,7 +1011,7 @@ def local_gpu_advanced_incsubtensor1(node):
else: else:
gpu_op = GpuAdvancedIncSubtensor1_dev20( gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
return [gpu_op(gpu_from_host(x), gpu_from_host(y), *coords)] return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \ if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \
...@@ -1022,12 +1023,12 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1022,12 +1023,12 @@ def local_gpu_advanced_incsubtensor1(node):
go_gpu = True go_gpu = True
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
else: else:
gpu_x = gpu_from_host(x) gpu_x = as_cuda_ndarray_variable(x)
if y.owner and isinstance(y.owner.op, HostFromGpu): if y.owner and isinstance(y.owner.op, HostFromGpu):
go_gpu = True go_gpu = True
gpu_y, = y.owner.inputs gpu_y, = y.owner.inputs
else: else:
gpu_y = gpu_from_host(y) gpu_y = as_cuda_ndarray_variable(y)
if go_gpu: if go_gpu:
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
if set_instead_of_inc and config.warn.gpu_set_subtensor1: if set_instead_of_inc and config.warn.gpu_set_subtensor1:
...@@ -1068,8 +1069,8 @@ def local_gpu_incsubtensor(node): ...@@ -1068,8 +1069,8 @@ def local_gpu_incsubtensor(node):
incsubt.idx_list, incsubt.idx_list,
inplace=incsubt.inplace, inplace=incsubt.inplace,
set_instead_of_inc=incsubt.set_instead_of_inc)( set_instead_of_inc=incsubt.set_instead_of_inc)(
gpu_from_host(x), as_cuda_ndarray_variable(x),
gpu_from_host(y), as_cuda_ndarray_variable(y),
*coords)] *coords)]
# Incrementing a float32 x results in a float32 # Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast # output even if y is float64, so we can downcast
...@@ -1085,14 +1086,14 @@ def local_gpu_incsubtensor(node): ...@@ -1085,14 +1086,14 @@ def local_gpu_incsubtensor(node):
go_gpu = True go_gpu = True
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
else: else:
gpu_x = gpu_from_host(x) gpu_x = as_cuda_ndarray_variable(x)
if y.owner and isinstance(y.owner.op, HostFromGpu): if y.owner and isinstance(y.owner.op, HostFromGpu):
go_gpu = True go_gpu = True
gpu_y, = y.owner.inputs gpu_y, = y.owner.inputs
else: else:
if y.dtype != 'float32': if y.dtype != 'float32':
y = tensor.cast(y, 'float32') y = tensor.cast(y, 'float32')
gpu_y = gpu_from_host(y) gpu_y = as_cuda_ndarray_variable(y)
if go_gpu: if go_gpu:
return [host_from_gpu(GpuIncSubtensor( return [host_from_gpu(GpuIncSubtensor(
node.op.idx_list, inplace=node.op.inplace, node.op.idx_list, inplace=node.op.inplace,
...@@ -1169,8 +1170,8 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -1169,8 +1170,8 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
gpu_nll, gpu_sm, gpu_am = \ gpu_nll, gpu_sm, gpu_am = \
GpuCrossentropySoftmaxArgmax1HotWithBias()( GpuCrossentropySoftmaxArgmax1HotWithBias()(
gpu_x, gpu_x,
gpu_from_host(b), as_cuda_ndarray_variable(b),
gpu_from_host(cast(y, 'float32'))) as_cuda_ndarray_variable(cast(y, 'float32')))
am_dtype = node.outputs[2].type.dtype am_dtype = node.outputs[2].type.dtype
return [host_from_gpu(gpu_nll), return [host_from_gpu(gpu_nll),
host_from_gpu(gpu_sm), host_from_gpu(gpu_sm),
...@@ -1186,9 +1187,9 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node): ...@@ -1186,9 +1187,9 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
if sm.owner and isinstance(sm.owner.op, HostFromGpu): if sm.owner and isinstance(sm.owner.op, HostFromGpu):
gpu_sm, = sm.owner.inputs gpu_sm, = sm.owner.inputs
gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()( gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
gpu_from_host(dnll), as_cuda_ndarray_variable(dnll),
gpu_sm, gpu_sm,
gpu_from_host(cast(yidx, 'float32'))) as_cuda_ndarray_variable(cast(yidx, 'float32')))
return [host_from_gpu(gpu_dx)] return [host_from_gpu(gpu_dx)]
return False return False
...@@ -1213,7 +1214,7 @@ def local_gpu_softmax_with_bias(node): ...@@ -1213,7 +1214,7 @@ def local_gpu_softmax_with_bias(node):
x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu) x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu) b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
if x_on_gpu or b_on_gpu: if x_on_gpu or b_on_gpu:
gpu_sm = GpuSoftmaxWithBias()(gpu_from_host(x), gpu_from_host(b)) gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(b))
return [host_from_gpu(gpu_sm)] return [host_from_gpu(gpu_sm)]
return False return False
...@@ -1711,8 +1712,8 @@ def local_gpu_downsample_factor_max_grad(node): ...@@ -1711,8 +1712,8 @@ def local_gpu_downsample_factor_max_grad(node):
gpu_ds_grad = GpuDownsampleFactorMaxGrad(node.op.ds, gpu_ds_grad = GpuDownsampleFactorMaxGrad(node.op.ds,
node.op.ignore_border) node.op.ignore_border)
return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0], return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
gpu_from_host(z), as_cuda_ndarray_variable(z),
gpu_from_host(gz)))] as_cuda_ndarray_variable(gz)))]
@register_opt() @register_opt()
...@@ -1726,8 +1727,8 @@ def local_gpu_downsample_factor_max_grad_grad(node): ...@@ -1726,8 +1727,8 @@ def local_gpu_downsample_factor_max_grad_grad(node):
op = GpuDownsampleFactorMaxGradGrad(node.op.ds, op = GpuDownsampleFactorMaxGradGrad(node.op.ds,
node.op.ignore_border) node.op.ignore_border)
return [host_from_gpu(op(x.owner.inputs[0], return [host_from_gpu(op(x.owner.inputs[0],
gpu_from_host(z), as_cuda_ndarray_variable(z),
gpu_from_host(gx)))] as_cuda_ndarray_variable(gx)))]
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
...@@ -1782,7 +1783,7 @@ def local_gpu_join(node): ...@@ -1782,7 +1783,7 @@ def local_gpu_join(node):
if all(matches): if all(matches):
# the extra gpu_from_host introduced here will # the extra gpu_from_host introduced here will
# be removed by further optimizations # be removed by further optimizations
new_tensors = [gpu_from_host(t) for t in axis_and_tensors[1:]] new_tensors = [as_cuda_ndarray_variable(t) for t in axis_and_tensors[1:]]
new_a_and_t = [axis_and_tensors[0]] + new_tensors new_a_and_t = [axis_and_tensors[0]] + new_tensors
replacement_node = host_from_gpu(gpu_join(*new_a_and_t)) replacement_node = host_from_gpu(gpu_join(*new_a_and_t))
...@@ -2079,7 +2080,7 @@ def local_gpu_eye(node): ...@@ -2079,7 +2080,7 @@ def local_gpu_eye(node):
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
return gpu_from_host(x) return as_cuda_ndarray_variable(x)
else: else:
return x return x
...@@ -2151,7 +2152,7 @@ def local_gpu_extract_diagonal(node): ...@@ -2151,7 +2152,7 @@ def local_gpu_extract_diagonal(node):
theano.tensor.TensorType)): theano.tensor.TensorType)):
inp = node.inputs[0] inp = node.inputs[0]
if inp.owner and isinstance(inp.owner.op, HostFromGpu): if inp.owner and isinstance(inp.owner.op, HostFromGpu):
return [host_from_gpu(nlinalg.extract_diag(gpu_from_host(inp)))] return [host_from_gpu(nlinalg.extract_diag(as_cuda_ndarray_variable(inp)))]
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
...@@ -2160,7 +2161,7 @@ def local_gpu_extract_diagonal(node): ...@@ -2160,7 +2161,7 @@ def local_gpu_extract_diagonal(node):
theano.tensor.TensorType)): theano.tensor.TensorType)):
diag_node = host_input.owner diag_node = host_input.owner
return [nlinalg.extract_diag( return [nlinalg.extract_diag(
gpu_from_host(diag_node.inputs[0]))] as_cuda_ndarray_variable(diag_node.inputs[0]))]
return False return False
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论