提交 64a907e9 authored 作者: nouiz's avatar nouiz

Merge pull request #528 from pascanur/gpu_specify_shape

Gpu specify shape
...@@ -52,6 +52,7 @@ optdb.register('gpu_after_fusion', ...@@ -52,6 +52,7 @@ optdb.register('gpu_after_fusion',
optdb.__position__.get('elemwise_fusion', 71) + .1, optdb.__position__.get('elemwise_fusion', 71) + .1,
'gpu') 'gpu')
def register_opt(*tags, **kwargs): def register_opt(*tags, **kwargs):
def f(local_opt): def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__ name = (kwargs and kwargs.pop('name')) or local_opt.__name__
...@@ -63,6 +64,7 @@ def register_opt(*tags, **kwargs): ...@@ -63,6 +64,7 @@ def register_opt(*tags, **kwargs):
#to make multi-level lift of shape work. #to make multi-level lift of shape work.
register_opt()(theano.tensor.opt.local_track_shape_i) register_opt()(theano.tensor.opt.local_track_shape_i)
class InputToGpuOptimizer(Optimizer): class InputToGpuOptimizer(Optimizer):
"""Transfert the input of a graph to the gpu if needed """Transfert the input of a graph to the gpu if needed
It should make this part of the optimizer faster we will will need only 1 It should make this part of the optimizer faster we will will need only 1
...@@ -81,16 +83,22 @@ class InputToGpuOptimizer(Optimizer): ...@@ -81,16 +83,22 @@ class InputToGpuOptimizer(Optimizer):
try: try:
new_input = host_from_gpu(gpu_from_host(input)) new_input = host_from_gpu(gpu_from_host(input))
if new_input.type==input.type: if new_input.type == input.type:
env.replace_validate(input, new_input, "InputToGpuOptimizer") env.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError, e: except TypeError, e:
#as we currently only support float32, this can fail. #as we currently only support float32, this can fail.
#Using try except make that we won't need #Using try except make that we won't need
pass pass
#we register it before all other gpu optimizer to be sure that the input are on the gpu. # we register it before all other gpu optimizer to be sure that the input
# are on the gpu.
gpu_seqopt.register('InputToGpuOptimizer', InputToGpuOptimizer(), gpu_seqopt.register('InputToGpuOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')#TODO: how to make it mandatory for gpu_seqopt? 0,
'fast_run',
'fast_compile',
'merge') # TODO: how to make it mandatory for gpu_seqopt?
@local_optimizer([]) @local_optimizer([])
def local_cut_gpu_host_gpu(node): def local_cut_gpu_host_gpu(node):
...@@ -109,9 +117,11 @@ gpu_cut_copies.register('cut_gpu_constant_transfers', ...@@ -109,9 +117,11 @@ gpu_cut_copies.register('cut_gpu_constant_transfers',
optdb['canonicalize'].register('local_cut_gpu_host_gpu', optdb['canonicalize'].register('local_cut_gpu_host_gpu',
local_cut_gpu_host_gpu, 'fast_run', 'gpu') local_cut_gpu_host_gpu, 'fast_run', 'gpu')
#'float64', 'complex128' and 'complex64' are not supported in elemwise on the gpu. # 'float64', 'complex128' and 'complex64' are not supported in elemwise
elemwise_cuda_dtype_supported=['float32','uint8','int8','uint16','int16', # on the gpu.
'uint32','int32','uint64','int64'] elemwise_cuda_dtype_supported = ['float32', 'uint8', 'int8', 'uint16', 'int16',
'uint32', 'int32', 'uint64', 'int64']
def dtype_in_elemwise_supported(op): def dtype_in_elemwise_supported(op):
""" """
...@@ -121,7 +131,7 @@ def dtype_in_elemwise_supported(op): ...@@ -121,7 +131,7 @@ def dtype_in_elemwise_supported(op):
:note: We need to check inside the Composite op. :note: We need to check inside the Composite op.
""" """
def get_all_basic_scalar(composite_op): def get_all_basic_scalar(composite_op):
l=[] l = []
for i in composite_op.env.toposort(): for i in composite_op.env.toposort():
if isinstance(i, theano.scalar.Composite): if isinstance(i, theano.scalar.Composite):
l += get_all_basic_scalar(i) l += get_all_basic_scalar(i)
...@@ -133,21 +143,22 @@ def dtype_in_elemwise_supported(op): ...@@ -133,21 +143,22 @@ def dtype_in_elemwise_supported(op):
scals = get_all_basic_scalar(op.scalar_op) scals = get_all_basic_scalar(op.scalar_op)
for s in scals: for s in scals:
if any([i.type.dtype not in elemwise_cuda_dtype_supported if any([i.type.dtype not in elemwise_cuda_dtype_supported
for i in s.inputs+s.outputs]): for i in s.inputs + s.outputs]):
return False return False
return True return True
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_elemwise_0(node): def local_gpu_elemwise_0(node):
"""elemwise(..., host_from_gpu, ...) """elemwise(..., host_from_gpu, ...)
-> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host) -> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host)
""" """
if isinstance(node.op, tensor.Elemwise) and dtype_in_elemwise_supported(node.op): if (isinstance(node.op, tensor.Elemwise) and
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]): dtype_in_elemwise_supported(node.op)):
if numpy.any([i.owner and
isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]): if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
# Don't set any inplace pattern. # Don't set any inplace pattern.
# gpu_inplace_elemwise_optimizer will do it later # gpu_inplace_elemwise_optimizer will do it later
...@@ -158,19 +169,26 @@ def local_gpu_elemwise_0(node): ...@@ -158,19 +169,26 @@ def local_gpu_elemwise_0(node):
return False return False
# first establish that float32 can store all inputs # first establish that float32 can store all inputs
upcastable = set(['float32', 'int8', 'int16', 'uint8', 'uint16']) upcastable = set(['float32', 'int8', 'int16', 'uint8',
'uint16'])
# case 1 - all inputs are already float32 # case 1 - all inputs are already float32
if numpy.all([i.type.dtype == 'float32' for i in node.inputs]): if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
#TODO: change this when fusion makes Elemwise with multiple outputs #TODO: change this when fusion makes Elemwise with multiple
gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs)) # outputs
gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs))
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
elif numpy.all([i.type.dtype in upcastable for i in node.inputs]): elif numpy.all([i.type.dtype in upcastable
# second - establish that a new node with upcasted inputs has the same outputs for i in node.inputs]):
# types as the original node # second - establish that a new node with upcasted inputs
upcasted = node.op.make_node(*[tensor.cast(i, 'float32') for i in node.inputs]) # has the same outputs types as the original node
if [o.type for o in upcasted.outputs] == [o.type for o in node.outputs]: upcasted = node.op.make_node(*[tensor.cast(i, 'float32')
for i in node.inputs])
new_inputs = [gpu_from_host(tensor.cast(i, 'float32')) for i in node.inputs] if [o.type for o in upcasted.outputs] ==\
[o.type for o in node.outputs]:
new_inputs = [gpu_from_host(tensor.cast(i, 'float32'))
for i in node.inputs]
gpu_elemwise = new_op(*new_inputs) gpu_elemwise = new_op(*new_inputs)
else: else:
return False return False
...@@ -180,9 +198,11 @@ def local_gpu_elemwise_0(node): ...@@ -180,9 +198,11 @@ def local_gpu_elemwise_0(node):
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner) gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
if not gpu_elemwise: if not gpu_elemwise:
return False return False
if max_inputs_to_GpuElemwise(node)<len(gpu_elemwise.inputs): if max_inputs_to_GpuElemwise(node) < len(gpu_elemwise.inputs):
return False return False
return [host_from_gpu(gpu_elemwise.outputs[0])] return [host_from_gpu(gpu_elemwise.outputs[0])]
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_elemwise_1(node): def local_gpu_elemwise_1(node):
...@@ -193,7 +213,7 @@ def local_gpu_elemwise_1(node): ...@@ -193,7 +213,7 @@ def local_gpu_elemwise_1(node):
host_i, = node.inputs host_i, = node.inputs
if (host_i.owner and if (host_i.owner and
isinstance(host_i.owner.op, tensor.Elemwise) and isinstance(host_i.owner.op, tensor.Elemwise) and
len(host_i.clients)==1 and len(host_i.clients) == 1 and
dtype_in_elemwise_supported(node.op)): dtype_in_elemwise_supported(node.op)):
elemwise_node = host_i.owner elemwise_node = host_i.owner
...@@ -204,14 +224,16 @@ def local_gpu_elemwise_1(node): ...@@ -204,14 +224,16 @@ def local_gpu_elemwise_1(node):
except SupportCodeError: except SupportCodeError:
# This happens when scalar_op requires support code # This happens when scalar_op requires support code
return False return False
if all([i.dtype=='float32' for i in elemwise_node.inputs]): if all([i.dtype == 'float32' for i in elemwise_node.inputs]):
gpu_elemwise = new_op(*[gpu_from_host(i) for i in elemwise_node.inputs]) gpu_elemwise = new_op(*[gpu_from_host(i)
for i in elemwise_node.inputs])
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner) gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
if not gpu_elemwise: if not gpu_elemwise:
return False return False
return [gpu_elemwise.outputs[0]] return [gpu_elemwise.outputs[0]]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_dimshuffle_0(node): def local_gpu_dimshuffle_0(node):
...@@ -228,7 +250,8 @@ def local_gpu_dimshuffle_0(node): ...@@ -228,7 +250,8 @@ def local_gpu_dimshuffle_0(node):
return [host_from_gpu(new_op(gpu_from_host(input)))] return [host_from_gpu(new_op(gpu_from_host(input)))]
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, tensor.DimShuffle): if host_input.owner and isinstance(host_input.owner.op,
tensor.DimShuffle):
dimshuffle_node = host_input.owner dimshuffle_node = host_input.owner
new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable, new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
dimshuffle_node.op.new_order) dimshuffle_node.op.new_order)
...@@ -236,6 +259,29 @@ def local_gpu_dimshuffle_0(node): ...@@ -236,6 +259,29 @@ def local_gpu_dimshuffle_0(node):
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_specifyShape_0(node):
"""
specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape)
gpu_from_host(specify_shape) -> specifyshape(gpu_from_host)
"""
if isinstance(node.op, tensor.SpecifyShape):
input = node.inputs[0]
if input.owner and isinstance(input.owner.op, HostFromGpu):
return [host_from_gpu(tensor.specify_shape(gpu_from_host(input),
*node.inputs[1:]))]
if node.op == gpu_from_host:
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op,
tensor.SpecifyShape):
specifyshape_node = host_input.owner
return [tensor.specify_shape(
gpu_from_host(specifyshape_node.inputs[0]),
*specifyshape_node.inputs[1:])]
return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_dot_to_dot22(node): def local_gpu_dot_to_dot22(node):
...@@ -260,13 +306,13 @@ def local_gpu_dot_to_dot22(node): ...@@ -260,13 +306,13 @@ def local_gpu_dot_to_dot22(node):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
# case one: vector X matrix # case one: vector X matrix
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ['x',0]) new_op = GpuDimShuffle((False,), ['x', 0])
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(gpu_from_host(x))
gpu_y = gpu_from_host(y) gpu_y = gpu_from_host(y)
# case two: matrix X vector # case two: matrix X vector
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), [0,'x']) new_op = GpuDimShuffle((False,), [0, 'x'])
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(gpu_from_host(y))
...@@ -277,16 +323,17 @@ def local_gpu_dot_to_dot22(node): ...@@ -277,16 +323,17 @@ def local_gpu_dot_to_dot22(node):
if node.op == tensor.basic.dot: if node.op == tensor.basic.dot:
if node.outputs[0].type.dtype != 'float32': if node.outputs[0].type.dtype != 'float32':
return False return False
if numpy.any([(i.owner and i.owner.op == host_from_gpu) for i in node.inputs]): if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
if _is_real_vector(x) and _is_real_matrix(y): if _is_real_vector(x) and _is_real_matrix(y):
new_op = GpuDimShuffle((False,), ['x',0]) new_op = GpuDimShuffle((False,), ['x', 0])
shape_out = y.shape[1].dimshuffle(['x']) shape_out = y.shape[1].dimshuffle(['x'])
gpu_x = new_op(gpu_from_host(x)) gpu_x = new_op(gpu_from_host(x))
gpu_y = gpu_from_host(y) gpu_y = gpu_from_host(y)
elif _is_real_matrix(x) and _is_real_vector(y): elif _is_real_matrix(x) and _is_real_vector(y):
new_op = GpuDimShuffle((False,), [0,'x']) new_op = GpuDimShuffle((False,), [0, 'x'])
shape_out = x.shape[0].dimshuffle(['x']) shape_out = x.shape[0].dimshuffle(['x'])
gpu_x = gpu_from_host(x) gpu_x = gpu_from_host(x)
gpu_y = new_op(gpu_from_host(y)) gpu_y = new_op(gpu_from_host(y))
...@@ -297,6 +344,7 @@ def local_gpu_dot_to_dot22(node): ...@@ -297,6 +344,7 @@ def local_gpu_dot_to_dot22(node):
shape_out))] shape_out))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_lazy_ifelse(node): def local_gpu_lazy_ifelse(node):
...@@ -306,31 +354,32 @@ def local_gpu_lazy_ifelse(node): ...@@ -306,31 +354,32 @@ def local_gpu_lazy_ifelse(node):
ifelse(host_from_gpu) -> host_from_gpu(ifelse) ifelse(host_from_gpu) -> host_from_gpu(ifelse)
""" """
if hasattr(theano, "lazycond"): if hasattr(theano, "lazycond"):
gpu_ifelse = theano.lazycond.IfElse(gpu = True) gpu_ifelse = theano.lazycond.IfElse(gpu=True)
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner if (host_input.owner
and host_input.owner.op == theano.lazycond.ifelse): and host_input.owner.op == theano.lazycond.ifelse):
c, t, f = host_input.owner.inputs c, t, f = host_input.owner.inputs
if not isinstance(f.type,CudaNdarrayType): if not isinstance(f.type, CudaNdarrayType):
f = gpu_from_host(f) f = gpu_from_host(f)
if not isinstance(t.type,CudaNdarrayType): if not isinstance(t.type, CudaNdarrayType):
t = gpu_from_host(t) t = gpu_from_host(t)
if isinstance(c.type,CudaNdarrayType): if isinstance(c.type, CudaNdarrayType):
c = host_from_gpu(c) c = host_from_gpu(c)
return [gpu_ifelse(c, t, f)] return [gpu_ifelse(c, t, f)]
if node.op == theano.lazycond.ifelse: if node.op == theano.lazycond.ifelse:
if numpy.any([(i.owner and i.owner.op == host_from_gpu) for i in node.inputs]): if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
c, t, f = node.inputs c, t, f = node.inputs
if not isinstance(f.type,CudaNdarrayType): if not isinstance(f.type, CudaNdarrayType):
f = gpu_from_host(f) f = gpu_from_host(f)
if not isinstance(t.type,CudaNdarrayType): if not isinstance(t.type, CudaNdarrayType):
t = gpu_from_host(t) t = gpu_from_host(t)
if isinstance(c.type,CudaNdarrayType): if isinstance(c.type, CudaNdarrayType):
c = host_from_gpu(c) c = host_from_gpu(c)
return [host_from_gpu(gpu_ifelse(c, t, f))] return [host_from_gpu(gpu_ifelse(c, t, f))]
...@@ -352,11 +401,14 @@ def local_gpu_dot22(node): ...@@ -352,11 +401,14 @@ def local_gpu_dot22(node):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))] return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))]
if node.op == tensor.blas._dot22: if node.op == tensor.blas._dot22:
if numpy.any([(i.owner and i.owner.op == host_from_gpu) for i in node.inputs]): if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
x, y = node.inputs x, y = node.inputs
return [host_from_gpu(gpu_dot22(gpu_from_host(x), gpu_from_host(y)))] return [host_from_gpu(gpu_dot22(gpu_from_host(x),
gpu_from_host(y)))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_dot22scalar(node): def local_gpu_dot22scalar(node):
...@@ -367,13 +419,19 @@ def local_gpu_dot22scalar(node): ...@@ -367,13 +419,19 @@ def local_gpu_dot22scalar(node):
""" """
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op == tensor.blas._dot22scalar: if (host_input.owner and
host_input.owner.op == tensor.blas._dot22scalar):
x, y, scalar = host_input.owner.inputs x, y, scalar = host_input.owner.inputs
return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y), tensor.blas._as_scalar(scalar))] return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y),
tensor.blas._as_scalar(scalar))]
if node.op == tensor.blas._dot22scalar: if node.op == tensor.blas._dot22scalar:
if numpy.any([(i.owner and i.owner.op == host_from_gpu) for i in node.inputs]): if numpy.any([(i.owner and i.owner.op == host_from_gpu)
for i in node.inputs]):
x, y, scalar = node.inputs x, y, scalar = node.inputs
return [host_from_gpu(gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y),tensor.blas._as_scalar(scalar)))] return [host_from_gpu(
gpu_dot22scalar(gpu_from_host(x),
gpu_from_host(y),
tensor.blas._as_scalar(scalar)))]
return False return False
...@@ -397,11 +455,11 @@ def local_gpu_gemv(node): ...@@ -397,11 +455,11 @@ def local_gpu_gemv(node):
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [gemvs[op]( return [gemvs[op](
gpu_from_host(z) gpu_from_host(z),
, a a,
, gpu_from_host(x) gpu_from_host(x),
, gpu_from_host(y) gpu_from_host(y),
, b)] b)]
if node.op in gemvs: if node.op in gemvs:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
...@@ -410,11 +468,11 @@ def local_gpu_gemv(node): ...@@ -410,11 +468,11 @@ def local_gpu_gemv(node):
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu( return [host_from_gpu(
gemvs[node.op]( gemvs[node.op](
gpu_from_host(z) gpu_from_host(z),
, a a,
, gpu_from_host(x) gpu_from_host(x),
, gpu_from_host(y) gpu_from_host(y),
, b))] b))]
return False return False
...@@ -438,10 +496,10 @@ def local_gpu_ger(node): ...@@ -438,10 +496,10 @@ def local_gpu_ger(node):
op = host_input.owner.op op = host_input.owner.op
z, a, x, y = host_input.owner.inputs z, a, x, y = host_input.owner.inputs
return [gers[op]( return [gers[op](
gpu_from_host(z) gpu_from_host(z),
, a a,
, gpu_from_host(x) gpu_from_host(x),
, gpu_from_host(y) gpu_from_host(y)
)] )]
if node.op in gers: if node.op in gers:
z, a, x, y = node.inputs z, a, x, y = node.inputs
...@@ -451,13 +509,14 @@ def local_gpu_ger(node): ...@@ -451,13 +509,14 @@ def local_gpu_ger(node):
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu( return [host_from_gpu(
gers[node.op]( gers[node.op](
gpu_from_host(z) gpu_from_host(z),
, a a,
, gpu_from_host(x) gpu_from_host(x),
, gpu_from_host(y) gpu_from_host(y)
))] ))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemm(node): def local_gpu_gemm(node):
...@@ -474,16 +533,25 @@ def local_gpu_gemm(node): ...@@ -474,16 +533,25 @@ def local_gpu_gemm(node):
if host_input.owner and host_input.owner.op in gemms: if host_input.owner and host_input.owner.op in gemms:
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [gemms[op](gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)] return [gemms[op](gpu_from_host(z),
a,
gpu_from_host(x),
gpu_from_host(y),
b)]
if node.op in gemms: if node.op in gemms:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gemms[node.op](gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))] return [host_from_gpu(gemms[node.op](gpu_from_host(z),
a,
gpu_from_host(x),
gpu_from_host(y),
b))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_outer(node): def local_gpu_outer(node):
...@@ -493,16 +561,17 @@ def local_gpu_outer(node): ...@@ -493,16 +561,17 @@ def local_gpu_outer(node):
if node.op == gpu_dot22: if node.op == gpu_dot22:
l, r = node.inputs l, r = node.inputs
if l.type.broadcastable[1] and r.type.broadcastable[0]: if l.type.broadcastable[1] and r.type.broadcastable[0]:
# TODO: we would like to remove the double-dimshuffle when l or r is # TODO: we would like to remove the double-dimshuffle when
# already the output of a GpuDimshuffle. To do this, refactor the # l or r is already the output of a GpuDimshuffle. To do
# logic in tensor/opt.py that collapses dimshuffle chains so that we # this, refactor the logic in tensor/opt.py that collapses
# can call it from here. # dimshuffle chains so that we can call it from here.
lvec = GpuDimShuffle(l.broadcastable, [0])(l) lvec = GpuDimShuffle(l.broadcastable, [0])(l)
rvec = GpuDimShuffle(r.broadcastable, [1])(r) rvec = GpuDimShuffle(r.broadcastable, [1])(r)
return [gpu_outer(lvec, rvec)] return [gpu_outer(lvec, rvec)]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_sum(node): def local_gpu_sum(node):
...@@ -517,21 +586,23 @@ def local_gpu_sum(node): ...@@ -517,21 +586,23 @@ def local_gpu_sum(node):
for a in node.op.axis: for a in node.op.axis:
assert reduce_mask[a] == 0 assert reduce_mask[a] == 0
reduce_mask[a] = 1 reduce_mask[a] = 1
gsum=GpuSum(reduce_mask) gsum = GpuSum(reduce_mask)
pattern=(''.join(str(i) for i in reduce_mask)) pattern = (''.join(str(i) for i in reduce_mask))
if hasattr(gsum, 'c_code_reduce_%s'%pattern): if hasattr(gsum, 'c_code_reduce_%s' % pattern):
rval = host_from_gpu(gsum(gpu_from_host(x))) rval = host_from_gpu(gsum(gpu_from_host(x)))
if rval.type == node.outputs[0].type: if rval.type == node.outputs[0].type:
return [rval] return [rval]
else: else:
print >> sys.stderr, "WARNING: local_gpu_sum got type wrong" print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
return None return None
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have the same value in # The principle is that if two adjacent dimensions have
# the reduce_mask, then we can reshape to make them a single dimension, do # the same value in the reduce_mask, then we can reshape
# the sum, and then reshape to get them back. # to make them a single dimension, do the sum, and then
# reshape to get them back.
shape_of = node.env.shape_feature.shape_of shape_of = node.env.shape_feature.shape_of
...@@ -540,44 +611,54 @@ def local_gpu_sum(node): ...@@ -540,44 +611,54 @@ def local_gpu_sum(node):
new_in_shp = [x_shape[0]] new_in_shp = [x_shape[0]]
new_mask = [reduce_mask[0]] new_mask = [reduce_mask[0]]
for i in xrange(1, x.type.ndim): for i in xrange(1, x.type.ndim):
if reduce_mask[i] == reduce_mask[i-1]: if reduce_mask[i] == reduce_mask[i - 1]:
new_in_shp[-1] *= x_shape[i] new_in_shp[-1] *= x_shape[i]
else: else:
new_mask.append(reduce_mask[i]) new_mask.append(reduce_mask[i])
new_in_shp.append(x_shape[i]) new_in_shp.append(x_shape[i])
pattern=(''.join(str(i) for i in new_mask)) pattern = (''.join(str(i) for i in new_mask))
new_gsum = GpuSum(new_mask) new_gsum = GpuSum(new_mask)
if hasattr(new_gsum, 'c_code_reduce_%s'%pattern): if hasattr(new_gsum, 'c_code_reduce_%s' % pattern):
reshaped_x = x.reshape(tensor.stack(*new_in_shp)) reshaped_x = x.reshape(tensor.stack(*new_in_shp))
sum_reshaped_x = host_from_gpu(new_gsum(gpu_from_host(reshaped_x))) sum_reshaped_x = host_from_gpu(
new_gsum(gpu_from_host(reshaped_x)))
if sum_reshaped_x.ndim != node.outputs[0].ndim: if sum_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_sum = sum_reshaped_x.reshape(tensor.stack(*shape_of[node.outputs[0]])) unreshaped_sum = sum_reshaped_x.reshape(
tensor.stack(*shape_of[node.outputs[0]]))
else: else:
unreshaped_sum = sum_reshaped_x unreshaped_sum = sum_reshaped_x
if unreshaped_sum.type == node.outputs[0].type: if unreshaped_sum.type == node.outputs[0].type:
return [unreshaped_sum] return [unreshaped_sum]
else: else:
print >> sys.stderr, "WARNING: local_gpu_sum got type wrong" print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
return None return None
raise Exception("GpuSum don't have implemented the pattern",pattern) raise Exception(
"GpuSum don't have implemented the pattern",
pattern)
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_reshape(node): def local_gpu_reshape(node):
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, tensor.Reshape): if host_input.owner and \
isinstance(host_input.owner.op, tensor.Reshape):
rshp = host_input.owner.op rshp = host_input.owner.op
x, shp = host_input.owner.inputs x, shp = host_input.owner.inputs
gpu_reshape = GpuReshape(rshp.ndim)(gpu_from_host(x), shp) gpu_reshape = GpuReshape(rshp.ndim)(gpu_from_host(x), shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable: if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
#this can happen as we always return False for all broadcast dim in GpuReshape but not for Reshape # this can happen as we always return False for all broadcast
#Event if we did the same think, with the constant optimization that could happen. # dim in GpuReshape but not for Reshape
gpu_reshape = theano.tensor.patternbroadcast(gpu_reshape,node.outputs[0].broadcastable) # Event if we did the same think, with the constant
# optimization that could happen.
gpu_reshape = theano.tensor.patternbroadcast(
gpu_reshape, node.outputs[0].broadcastable)
return [gpu_reshape] return [gpu_reshape]
if isinstance(node.op, tensor.Reshape): if isinstance(node.op, tensor.Reshape):
x, shp = node.inputs x, shp = node.inputs
...@@ -585,20 +666,26 @@ def local_gpu_reshape(node): ...@@ -585,20 +666,26 @@ def local_gpu_reshape(node):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp) gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable: if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
#this can happen as we always return False for all broadcast dim in GpuReshape but not for Reshape # this can happen as we always return False for all broadcast
#Event if we did the same think, with the constant optimization that could happen. # dim in GpuReshape but not for Reshape
gpu_reshape = theano.tensor.patternbroadcast(gpu_reshape,node.outputs[0].broadcastable) # Event if we did the same think, with the constant
# optimization that could happen.
gpu_reshape = theano.tensor.patternbroadcast(
gpu_reshape, node.outputs[0].broadcastable)
return [host_from_gpu(gpu_reshape)] return [host_from_gpu(gpu_reshape)]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_flatten(node): def local_gpu_flatten(node):
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, tensor.Flatten): if host_input.owner and \
isinstance(host_input.owner.op, tensor.Flatten):
outdim = host_input.owner.op.outdim outdim = host_input.owner.op.outdim
return [GpuFlatten(outdim)(gpu_from_host(host_input.owner.inputs[0]))] return [GpuFlatten(outdim)(
gpu_from_host(host_input.owner.inputs[0]))]
if isinstance(node.op, tensor.Flatten): if isinstance(node.op, tensor.Flatten):
x, = node.inputs x, = node.inputs
outdim = node.op.outdim outdim = node.op.outdim
...@@ -607,56 +694,64 @@ def local_gpu_flatten(node): ...@@ -607,56 +694,64 @@ def local_gpu_flatten(node):
return [host_from_gpu(GpuFlatten(outdim)(gpu_x))] return [host_from_gpu(GpuFlatten(outdim)(gpu_x))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_subtensor(node): def local_gpu_subtensor(node):
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, tensor.Subtensor): if host_input.owner and \
isinstance(host_input.owner.op, tensor.Subtensor):
subt = host_input.owner.op subt = host_input.owner.op
x = host_input.owner.inputs[0] x = host_input.owner.inputs[0]
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)] return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32": if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuSubtensor(node.op.idx_list)(gpu_x, *coords))] return [host_from_gpu(GpuSubtensor(
node.op.idx_list)(gpu_x, *coords))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_advanced_subtensor1(node): def local_gpu_advanced_subtensor1(node):
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op.__class__ is tensor.AdvancedSubtensor1: if host_input.owner and \
host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
x = host_input.owner.inputs[0] x = host_input.owner.inputs[0]
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuAdvancedSubtensor1()(gpu_from_host(x), *coords)] return [GpuAdvancedSubtensor1()(gpu_from_host(x), *coords)]
if node.op.__class__ is tensor.AdvancedSubtensor1: if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32": if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))] return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_advanced_incsubtensor1(node): def local_gpu_advanced_incsubtensor1(node):
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if host_input.owner and host_input.owner.op.__class__ is tensor.AdvancedIncSubtensor1: if host_input.owner and \
host_input.owner.op.__class__ is tensor.AdvancedIncSubtensor1:
x, y = host_input.owner.inputs[0:2] x, y = host_input.owner.inputs[0:2]
coords = host_input.owner.inputs[2:] coords = host_input.owner.inputs[2:]
return [GpuAdvancedIncSubtensor1()(gpu_from_host(x), return [GpuAdvancedIncSubtensor1()(gpu_from_host(x),
gpu_from_host(y), *coords)] gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and node.inputs[0].dtype=="float32": if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \
x, y = node.inputs[0:2] node.inputs[0].dtype == "float32":
x, y = node.inputs[0:2]
coords = node.inputs[2:] coords = node.inputs[2:]
go_gpu = False go_gpu = False
if x.owner and x.owner.op == host_from_gpu: if x.owner and x.owner.op == host_from_gpu:
...@@ -670,24 +765,30 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -670,24 +765,30 @@ def local_gpu_advanced_incsubtensor1(node):
else: else:
gpu_y = gpu_from_host(y) gpu_y = gpu_from_host(y)
if go_gpu: if go_gpu:
return [host_from_gpu(GpuAdvancedIncSubtensor1()(gpu_x, gpu_y, *coords))] return [host_from_gpu(GpuAdvancedIncSubtensor1()(
gpu_x, gpu_y, *coords))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_incsubtensor(node): def local_gpu_incsubtensor(node):
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_output = node.inputs[0] host_output = node.inputs[0]
if host_output.owner and type(host_output.owner.op) == tensor.IncSubtensor: if host_output.owner and \
type(host_output.owner.op) == tensor.IncSubtensor:
incsubt = host_output.owner.op incsubt = host_output.owner.op
x, y = host_output.owner.inputs[0:2] x, y = host_output.owner.inputs[0:2]
coords = host_output.owner.inputs[2:] coords = host_output.owner.inputs[2:]
return [GpuIncSubtensor(incsubt.idx_list, inplace=incsubt.inplace, return [GpuIncSubtensor(
set_instead_of_inc=incsubt.set_instead_of_inc)( incsubt.idx_list,
gpu_from_host(x), inplace=incsubt.inplace,
gpu_from_host(y), set_instead_of_inc=incsubt.set_instead_of_inc)(
*coords)] gpu_from_host(x),
if type(node.op) == tensor.IncSubtensor and node.inputs[0].dtype=="float32": gpu_from_host(y),
*coords)]
if type(node.op) == tensor.IncSubtensor and \
node.inputs[0].dtype == "float32":
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
assert isinstance(x.type, tensor.TensorType) assert isinstance(x.type, tensor.TensorType)
assert isinstance(y.type, tensor.TensorType) assert isinstance(y.type, tensor.TensorType)
...@@ -710,6 +811,7 @@ def local_gpu_incsubtensor(node): ...@@ -710,6 +811,7 @@ def local_gpu_incsubtensor(node):
gpu_x, gpu_y, *coords))] gpu_x, gpu_y, *coords))]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_shape(node): def local_gpu_shape(node):
...@@ -720,6 +822,7 @@ def local_gpu_shape(node): ...@@ -720,6 +822,7 @@ def local_gpu_shape(node):
return [gpu_shape(gpu_x)] return [gpu_shape(gpu_x)]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_rebroadcast(node): def local_gpu_rebroadcast(node):
...@@ -734,6 +837,7 @@ def local_gpu_rebroadcast(node): ...@@ -734,6 +837,7 @@ def local_gpu_rebroadcast(node):
def gpu_print_wrapper(op, cnda):
    """Bridge a gpu Print op back to the wrapped cpu Print callback.

    Converts the CudaNdarray ``cnda`` to a numpy array and forwards it,
    together with the original op, to the original op's ``global_fn``.
    """
    original = op.old_op
    original.global_fn(original, numpy.asarray(cnda))
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_print_op(node): def local_gpu_print_op(node):
...@@ -766,10 +870,10 @@ def local_gpu_tensordot(node): ...@@ -766,10 +870,10 @@ def local_gpu_tensordot(node):
x, y = node.inputs x, y = node.inputs
if ((x.owner and if ((x.owner and
x.owner.op == host_from_gpu and x.owner.op == host_from_gpu and
y.dtype=='float32') or y.dtype == 'float32') or
(y.owner and (y.owner and
y.owner.op == host_from_gpu and y.owner.op == host_from_gpu and
x.dtype=='float32')): x.dtype == 'float32')):
axes = node.op.axes axes = node.op.axes
out = tensordot(x, y, axes=axes) out = tensordot(x, y, axes=axes)
...@@ -782,15 +886,18 @@ def cast(x, dtype): ...@@ -782,15 +886,18 @@ def cast(x, dtype):
return cast_op(x) return cast_op(x)
import theano.tensor.nnet import theano.tensor.nnet
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias): if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
x,b,y = node.inputs x, b, y = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and x.owner.op == host_from_gpu:
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
# if y is a cast to integers, we can go to the underlying thing if we want, # if y is a cast to integers, we can go to the underlying
# since this gpu op will cast to integers internally anyway # thing if we want, since this gpu op will cast to integers
# internally anyway
int_cast_ops = ( int_cast_ops = (
tensor.basic._convert_to_int32, tensor.basic._convert_to_int32,
tensor.basic._convert_to_int8, tensor.basic._convert_to_int8,
...@@ -799,21 +906,23 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -799,21 +906,23 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
) )
while y.owner and y.owner.op in int_cast_ops: while y.owner and y.owner.op in int_cast_ops:
y = y.owner.inputs[0] y = y.owner.inputs[0]
gpu_nll, gpu_sm, gpu_am = GpuCrossentropySoftmaxArgmax1HotWithBias()( gpu_nll, gpu_sm, gpu_am = \
gpu_x, GpuCrossentropySoftmaxArgmax1HotWithBias()(
gpu_from_host(b), gpu_x,
gpu_from_host(cast(y, 'float32'))) gpu_from_host(b),
gpu_from_host(cast(y, 'float32')))
am_dtype = node.outputs[2].type.dtype am_dtype = node.outputs[2].type.dtype
return [host_from_gpu(gpu_nll), return [host_from_gpu(gpu_nll),
host_from_gpu(gpu_sm), host_from_gpu(gpu_sm),
cast(host_from_gpu(gpu_am), am_dtype)] cast(host_from_gpu(gpu_am), am_dtype)]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node): def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx): if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
dnll,sm,yidx = node.inputs dnll, sm, yidx = node.inputs
if sm.owner and sm.owner.op == host_from_gpu: if sm.owner and sm.owner.op == host_from_gpu:
gpu_sm, = sm.owner.inputs gpu_sm, = sm.owner.inputs
gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()( gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
...@@ -823,6 +932,7 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node): ...@@ -823,6 +932,7 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
return [host_from_gpu(gpu_dx)] return [host_from_gpu(gpu_dx)]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_softmax(node): def local_gpu_softmax(node):
...@@ -834,6 +944,7 @@ def local_gpu_softmax(node): ...@@ -834,6 +944,7 @@ def local_gpu_softmax(node):
return [host_from_gpu(gpu_sm)] return [host_from_gpu(gpu_sm)]
return False return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_softmax_with_bias(node): def local_gpu_softmax_with_bias(node):
...@@ -848,6 +959,8 @@ def local_gpu_softmax_with_bias(node): ...@@ -848,6 +959,8 @@ def local_gpu_softmax_with_bias(node):
#### Convolution, maxpooling #### Convolution, maxpooling
from theano.tensor.nnet import conv from theano.tensor.nnet import conv
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_conv(node): def local_gpu_conv(node):
...@@ -857,9 +970,9 @@ def local_gpu_conv(node): ...@@ -857,9 +970,9 @@ def local_gpu_conv(node):
conv(host_from_gpu) -> host_from_gpu(gpu_conv) conv(host_from_gpu) -> host_from_gpu(gpu_conv)
""" """
def GpuConvOp_from_ConvOp(op): def GpuConvOp_from_ConvOp(op):
logical_img_hw=None logical_img_hw = None
if op.imshp_logical is not None: if op.imshp_logical is not None:
logical_img_hw=op.imshp_logical[1:3] logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]: if logical_img_hw != op.imshp[1:3]:
# this case is not implemented # this case is not implemented
return None return None
...@@ -878,11 +991,10 @@ def local_gpu_conv(node): ...@@ -878,11 +991,10 @@ def local_gpu_conv(node):
imshp=op.imshp, imshp=op.imshp,
) )
#HACK to print the number of MFlops in the profiler output. #HACK to print the number of MFlops in the profiler output.
if hasattr(op,'flops'): if hasattr(op, 'flops'):
ret.flops=op.flops ret.flops = op.flops
return ret return ret
if node.op == gpu_from_host: if node.op == gpu_from_host:
#gpu_from_host(conv) -> gpu_conv(gpu_from_host) #gpu_from_host(conv) -> gpu_conv(gpu_from_host)
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -891,9 +1003,12 @@ def local_gpu_conv(node): ...@@ -891,9 +1003,12 @@ def local_gpu_conv(node):
if gpu_conv is None: if gpu_conv is None:
return return
img, kern = host_input.owner.inputs img, kern = host_input.owner.inputs
#in some case the ConvOp broadcast the last 2 dimensions differently then the gpu ConvOp # in some case the ConvOp broadcast the last 2 dimensions
return [tensor.patternbroadcast(gpu_conv(gpu_from_host(img), gpu_from_host(kern)), # differently then the gpu ConvOp
node.outputs[0].broadcastable)] return [tensor.patternbroadcast(
gpu_conv(gpu_from_host(img),
gpu_from_host(kern)),
node.outputs[0].broadcastable)]
if isinstance(node.op, conv.ConvOp): if isinstance(node.op, conv.ConvOp):
#conv(host_from_gpu) -> host_from_gpu(gpu_conv) #conv(host_from_gpu) -> host_from_gpu(gpu_conv)
...@@ -904,12 +1019,16 @@ def local_gpu_conv(node): ...@@ -904,12 +1019,16 @@ def local_gpu_conv(node):
gpu_conv = GpuConvOp_from_ConvOp(node.op) gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None: if gpu_conv is None:
return return
#in some case the ConvOp broadcast the last 2 dimensions differently then the gpu ConvOp # in some case the ConvOp broadcast the last 2 dimensions
return [tensor.patternbroadcast(host_from_gpu(gpu_conv(gpu_from_host(img), # differently then the gpu ConvOp
gpu_from_host(kern))), return [tensor.patternbroadcast(
node.outputs[0].broadcastable)] host_from_gpu(gpu_conv(gpu_from_host(img),
gpu_from_host(kern))),
node.outputs[0].broadcastable)]
import theano.tensor.signal.downsample as downsample import theano.tensor.signal.downsample as downsample
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
...@@ -919,19 +1038,23 @@ def local_gpu_downsample_factor_max(node): ...@@ -919,19 +1038,23 @@ def local_gpu_downsample_factor_max(node):
gpu_ds = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border) gpu_ds = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border)
return [host_from_gpu(gpu_ds(x.owner.inputs[0]))] return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
@register_opt()
@local_optimizer([])
def local_gpu_downsample_factor_max_grad(node):
    """Move DownsampleFactorMaxGrad to the gpu when its first input
    already lives there:

      DownsampleFactorMaxGrad(host_from_gpu(x), z, gz) ->
          host_from_gpu(GpuDownsampleFactorMaxGrad(x, z, gz))
    """
    if not isinstance(node.op, downsample.DownsampleFactorMaxGrad):
        return
    x, z, gz = node.inputs
    if x.owner and x.owner.op == host_from_gpu:
        grad_op = GpuDownsampleFactorMaxGrad(node.op.ds,
                                             node.op.ignore_border)
        return [host_from_gpu(grad_op(x.owner.inputs[0],
                                      gpu_from_host(z),
                                      gpu_from_host(gz)))]
from theano.sandbox.cuda.basic_ops import gpu_join from theano.sandbox.cuda.basic_ops import gpu_join
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_join(node): def local_gpu_join(node):
...@@ -955,7 +1078,8 @@ def local_gpu_join(node): ...@@ -955,7 +1078,8 @@ def local_gpu_join(node):
host_from_gpu(gpu_join) host_from_gpu(gpu_join)
For intermediate places in the graph not covered by the first opt, the following could be useful: For intermediate places in the graph not covered by the first opt, the
following could be useful:
gpu_from_host(join) -> gpu_join(gpu_from_host) gpu_from_host(join) -> gpu_join(gpu_from_host)
...@@ -981,7 +1105,7 @@ def local_gpu_join(node): ...@@ -981,7 +1105,7 @@ def local_gpu_join(node):
# the extra gpu_from_host introduced here will # the extra gpu_from_host introduced here will
# be removed by further optimizations # be removed by further optimizations
new_tensors = [gpu_from_host(t) for t in axis_and_tensors[1:]] new_tensors = [gpu_from_host(t) for t in axis_and_tensors[1:]]
new_a_and_t = [axis_and_tensors[0]]+new_tensors new_a_and_t = [axis_and_tensors[0]] + new_tensors
replacement_node = host_from_gpu(gpu_join(*new_a_and_t)) replacement_node = host_from_gpu(gpu_join(*new_a_and_t))
...@@ -990,8 +1114,10 @@ def local_gpu_join(node): ...@@ -990,8 +1114,10 @@ def local_gpu_join(node):
return [replacement_node] return [replacement_node]
#Commented out because it can result in shared = dimshuffle(gemm_inplace(dimshuffle(shared))) # Commented out because it can result in
#which causes memory leaks (long term fix is to make the above not leak memory) # shared = dimshuffle(gemm_inplace(dimshuffle(shared)))
# which causes memory leaks (long term fix is to make the above not leak
# memory)
@local_optimizer([gpu_gemm_no_inplace]) @local_optimizer([gpu_gemm_no_inplace])
def local_inplace_gemm(node): def local_inplace_gemm(node):
if node.op == gpu_gemm_no_inplace: if node.op == gpu_gemm_no_inplace:
...@@ -1049,6 +1175,7 @@ def get_device_type_sizes(): ...@@ -1049,6 +1175,7 @@ def get_device_type_sizes():
rval = get_device_type_sizes.rval = locals() rval = get_device_type_sizes.rval = locals()
return rval return rval
def max_inputs_to_GpuElemwise(node): def max_inputs_to_GpuElemwise(node):
""" """
return the maximum number of inputs this GpuElemwise Apply node can return the maximum number of inputs this GpuElemwise Apply node can
...@@ -1067,10 +1194,11 @@ def max_inputs_to_GpuElemwise(node): ...@@ -1067,10 +1194,11 @@ def max_inputs_to_GpuElemwise(node):
int_size = type_sizes['int_size'] int_size = type_sizes['int_size']
gpu_ptr_size = type_sizes['gpu_ptr_size'] gpu_ptr_size = type_sizes['gpu_ptr_size']
argument_limit = 232 # some bytes are used for block and thread coords etc. # some bytes are used for block and thread coords etc.
argument_limit = 232
ndim = node.inputs[0].type.ndim ndim = node.inputs[0].type.ndim
size_param_mandatory = int_size #for numels size_param_mandatory = int_size # for numels
size_param_mandatory += int_size * ndim # for the shape size_param_mandatory += int_size * ndim # for the shape
size_param_mandatory += sum((gpu_ptr_size + int_size * ndim) size_param_mandatory += sum((gpu_ptr_size + int_size * ndim)
for i in node.outputs) for i in node.outputs)
...@@ -1085,6 +1213,7 @@ def max_inputs_to_GpuElemwise(node): ...@@ -1085,6 +1213,7 @@ def max_inputs_to_GpuElemwise(node):
return max_nb_inputs return max_nb_inputs
def split_huge_add_or_mul(node): def split_huge_add_or_mul(node):
""" """
For add and mul, it can happen that we have too much input For add and mul, it can happen that we have too much input
...@@ -1097,12 +1226,14 @@ def split_huge_add_or_mul(node): ...@@ -1097,12 +1226,14 @@ def split_huge_add_or_mul(node):
""" """
if node.op.scalar_op in (scal.add, scal.mul): if node.op.scalar_op in (scal.add, scal.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(node) max_nb_inputs = max_inputs_to_GpuElemwise(node)
if max_nb_inputs<=1 and len(node.inputs)>1: if max_nb_inputs <= 1 and len(node.inputs) > 1:
return False return False
while len(node.inputs)>max_nb_inputs: while len(node.inputs) > max_nb_inputs:
inner_op = [] inner_op = []
for i in xrange(0,len(node.inputs),max_nb_inputs): for i in xrange(0,
inner_op.append(node.op(*node.inputs[i:i+max_nb_inputs])) len(node.inputs),
max_nb_inputs):
inner_op.append(node.op(*node.inputs[i: i + max_nb_inputs]))
node = node.op(*inner_op).owner node = node.op(*inner_op).owner
return node return node
...@@ -1115,9 +1246,10 @@ if config.gpu.local_elemwise_fusion: ...@@ -1115,9 +1246,10 @@ if config.gpu.local_elemwise_fusion:
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
71.00, 'fast_run', 'fusion', 71.00, 'fast_run', 'fusion',
'local_elemwise_fusion','gpu') 'local_elemwise_fusion', 'gpu')
else: else:
_logger.debug("not enabling optimization fusion of gpu elemwise in fast_run") _logger.debug(("not enabling optimization fusion of gpu elemwise in "
"fast_run"))
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
71.00, 'fusion', 'local_elemwise_fusion') 71.00, 'fusion', 'local_elemwise_fusion')
...@@ -1126,19 +1258,29 @@ else: ...@@ -1126,19 +1258,29 @@ else:
gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op( gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
GpuElemwise) GpuElemwise)
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75, optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
'fast_run', 'inplace','gpu_inplace', 'gpu') 'fast_run', 'inplace', 'gpu_inplace', 'gpu')
@register_opt() @register_opt()
@local_optimizer([tensor.Alloc]) @local_optimizer([tensor.Alloc])
def local_gpualloc(node): def local_gpualloc(node):
replace=False replace = False
if node.op == tensor.alloc: if node.op == tensor.alloc:
if node.inputs[0].owner and node.inputs[0].owner.op==host_from_gpu:#if the input was on the gpu if node.inputs[0].owner and \
node.inputs[0].owner.op == host_from_gpu:
replace = True
if all([c != 'output' and c.op == gpu_from_host
for c, idx in node.outputs[0].clients]):
# if all clients are on gpu
replace = True
if all([c != 'output' and
c.op == tensor.join and
all([i.owner and
i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]])
for c, idx in node.outputs[0].clients]):
# if the client is a subtensor with input on gpu or alloc
replace = True replace = True
if all([c!='output' and c.op == gpu_from_host for c,idx in node.outputs[0].clients]):#if all clients are on gpu
replace=True
if all([c!='output' and c.op == tensor.join and all([i.owner and i.owner.op in [host_from_gpu,tensor.alloc] for i in c.inputs[1:]]) for c,idx in node.outputs[0].clients]):#if the client is a subtensor with input on gpu or alloc
replace=True
if replace: if replace:
val = node.inputs[0] val = node.inputs[0]
shp = node.inputs[1:] shp = node.inputs[1:]
...@@ -1155,7 +1297,8 @@ def local_gpualloc(node): ...@@ -1155,7 +1297,8 @@ def local_gpualloc(node):
assert new_out.type.dtype == old_out.type.dtype assert new_out.type.dtype == old_out.type.dtype
# it seems to have happened that new_out has some broadcastable # it seems to have happened that new_out has some broadcastable
# dimensions that old_out did not have # dimensions that old_out did not have
for b_old,b_new in zip(old_out.type.broadcastable, new_out.type.broadcastable): for b_old, b_new in zip(old_out.type.broadcastable,
new_out.type.broadcastable):
assert b_new or (not b_old) assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out, old_out.broadcastable) new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
#if old_out.type != new_out.type: #if old_out.type != new_out.type:
...@@ -1163,8 +1306,6 @@ def local_gpualloc(node): ...@@ -1163,8 +1306,6 @@ def local_gpualloc(node):
return [new_out] return [new_out]
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
...@@ -1172,6 +1313,7 @@ def safe_to_gpu(x): ...@@ -1172,6 +1313,7 @@ def safe_to_gpu(x):
else: else:
return x return x
def safe_to_cpu(x): def safe_to_cpu(x):
if isinstance(x.type, CudaNdarrayType): if isinstance(x.type, CudaNdarrayType):
return host_from_gpu(x) return host_from_gpu(x)
...@@ -1179,8 +1321,7 @@ def safe_to_cpu(x): ...@@ -1179,8 +1321,7 @@ def safe_to_cpu(x):
return x return x
def gpu_safe_new(x, tag=''):
def gpu_safe_new(x, tag = ''):
""" """
Internal function that constructs a new variable from x with the same Internal function that constructs a new variable from x with the same
type, but with a different name ( old name + tag). This function is used type, but with a different name ( old name + tag). This function is used
...@@ -1199,7 +1340,8 @@ def gpu_safe_new(x, tag = ''): ...@@ -1199,7 +1340,8 @@ def gpu_safe_new(x, tag = ''):
nw_x.name = nw_name nw_x.name = nw_name
return nw_x return nw_x
def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """Clone ``outputs`` with every variable in ``inputs`` replaced by a
    fresh variable of the same type (renamed with ``tag`` appended, via
    ``gpu_safe_new``).

    Unlike plain ``clone``, the inputs are always substituted, never
    reused.  Returns the pair ``(new_inputs, new_outputs)``.
    """
    if tag is None:
        tag = ''
    fresh_inputs = [gpu_safe_new(var, tag) for var in inputs]
    replacements = dict(zip(inputs, fresh_inputs))
    fresh_outputs = scan_utils.clone(outputs, replace=replacements)
    return (fresh_inputs, fresh_outputs)
def tensor_to_cuda(x):
    """Return a CudaNdarray variable mirroring ``x`` (same broadcastable
    pattern) when ``x`` is a float32 TensorType variable; any other
    variable is returned unchanged."""
    if not (isinstance(x.type, tensor.TensorType) and
            x.type.dtype == 'float32'):
        return x
    cuda_var = CudaNdarrayType(broadcastable=x.type.broadcastable)()
    if x.name:
        cuda_var.name = x.name + '[cuda]'
    return cuda_var
...@@ -1241,7 +1383,7 @@ def gpuScanOptimization(node): ...@@ -1241,7 +1383,7 @@ def gpuScanOptimization(node):
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu'] and not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1 ): len(host_input.owner.outputs) == 1):
# Note that we are not doing the right thing here !! # Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one # This is because the local optimizer expects only one
# output that corresponds to the input of ``node`` # output that corresponds to the input of ``node``
...@@ -1257,39 +1399,40 @@ def gpuScanOptimization(node): ...@@ -1257,39 +1399,40 @@ def gpuScanOptimization(node):
info = thescan.info.copy() info = thescan.info.copy()
info['gpu'] = True info['gpu'] = True
inputs = host_input.owner.inputs inputs = host_input.owner.inputs
nw_ins = [ inputs[0]] nw_ins = [inputs[0]]
e = ( 1+ thescan.n_seqs e = (1 +
+ thescan.n_mit_mot thescan.n_seqs +
+ thescan.n_mit_sot thescan.n_mit_mot +
+ thescan.n_sit_sot thescan.n_mit_sot +
+ thescan.n_shared_outs) thescan.n_sit_sot +
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ] thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e]]
b = e b = e
e = e + thescan.n_nit_sot e = e + thescan.n_nit_sot
nw_ins += inputs[b:e] nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ] nw_ins += [safe_to_gpu(x) for x in inputs[e:]]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs] scan_ins = [tensor_to_cuda(x) for x in thescan.inputs]
scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ] scan_outs = [safe_to_gpu(x) for x in thescan.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs scan_outs,
, replace = zip(thescan.inputs, replace=zip(thescan.inputs,
[safe_to_cpu(x) for x in scan_ins])) [safe_to_cpu(x) for x in scan_ins]))
# We need to construct the hash here, because scan # We need to construct the hash here, because scan
# __init__ does not know about cuda ndarray and can not # __init__ does not know about cuda ndarray and can not
# handle graphs with inputs being Cuda Ndarrays # handle graphs with inputs being Cuda Ndarrays
tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins,
scan_outs) scan_outs)
local_env = gof.Env(tmp_in, tmp_out) local_env = gof.Env(tmp_in, tmp_out)
_cmodule_key = gof.CLinker.cmodule_key_(local_env,[]) _cmodule_key = gof.CLinker.cmodule_key_(local_env, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType( typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable = broadcastable) broadcastable=broadcastable)
nw_op = scan_op.Scan( scan_ins nw_op = scan_op.Scan(scan_ins,
, scan_outs scan_outs,
, info info,
, typeConstructor = typeConstructor typeConstructor=typeConstructor).make_node(
).make_node(*nw_ins) *nw_ins)
_outputs = nw_op.outputs _outputs = nw_op.outputs
return _outputs return _outputs
...@@ -1303,24 +1446,25 @@ def gpuScanOptimization(node): ...@@ -1303,24 +1446,25 @@ def gpuScanOptimization(node):
info = thescan.info.copy() info = thescan.info.copy()
info['gpu'] = True info['gpu'] = True
inputs = node.inputs inputs = node.inputs
nw_ins = [ inputs[0]] nw_ins = [inputs[0]]
e = ( 1+ thescan.n_seqs e = (1 +
+ thescan.n_mit_mot thescan.n_seqs +
+ thescan.n_mit_sot thescan.n_mit_mot +
+ thescan.n_sit_sot thescan.n_mit_sot +
+ thescan.n_shared_outs) thescan.n_sit_sot +
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ] thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e]]
b = e b = e
e = e + thescan.n_nit_sot e = e + thescan.n_nit_sot
nw_ins += inputs[b:e] nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ] nw_ins += [safe_to_gpu(x) for x in inputs[e:]]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs] scan_ins = [tensor_to_cuda(x) for x in thescan.inputs]
scan_outs = [ safe_to_gpu(x) for x in thescan.outputs ] scan_outs = [safe_to_gpu(x) for x in thescan.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs scan_outs,
, replace = zip(thescan.inputs replace=zip(thescan.inputs,
,[safe_to_cpu(x) for x in scan_ins])) [safe_to_cpu(x) for x in scan_ins]))
# We need to construct the hash here, because scan # We need to construct the hash here, because scan
# __init__ does not know about cuda ndarray and can not # __init__ does not know about cuda ndarray and can not
...@@ -1328,18 +1472,18 @@ def gpuScanOptimization(node): ...@@ -1328,18 +1472,18 @@ def gpuScanOptimization(node):
tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins,
scan_outs) scan_outs)
local_env = gof.Env(tmp_in, tmp_out) local_env = gof.Env(tmp_in, tmp_out)
_cmodule_key = gof.CLinker.cmodule_key_(local_env,[]) _cmodule_key = gof.CLinker.cmodule_key_(local_env, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType( typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable = broadcastable) broadcastable=broadcastable)
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins scan_ins,
, scan_outs scan_outs,
, info info,
, typeConstructor = typeConstructor typeConstructor=typeConstructor).make_node(
).make_node(*nw_ins).outputs *nw_ins).outputs
outputs = [] outputs = []
for x,y in zip(_outputs, node.outputs): for x, y in zip(_outputs, node.outputs):
if isinstance(y.type, CudaNdarrayType): if isinstance(y.type, CudaNdarrayType):
outputs += [x] outputs += [x]
else: else:
...@@ -1347,20 +1491,21 @@ def gpuScanOptimization(node): ...@@ -1347,20 +1491,21 @@ def gpuScanOptimization(node):
return outputs return outputs
return False return False
@gof.local_optimizer([None]) @gof.local_optimizer([None])
def gpu_scan_make_inplace(node): def gpu_scan_make_inplace(node):
op = node.op op = node.op
if ( isinstance(op, scan_op.Scan) and if (isinstance(op, scan_op.Scan) and
(not op.info['inplace']) and (not op.info['inplace']) and
(op.info['gpu'])): (op.info['gpu'])):
info = op.info.copy() info = op.info.copy()
info['inplace'] = True info['inplace'] = True
# inputs corresponding to sequences and n_steps # inputs corresponding to sequences and n_steps
ls_begin = node.inputs[:1+op.n_seqs] ls_begin = node.inputs[:1 + op.n_seqs]
ls = op.outer_mitmot(node) ls = op.outer_mitmot(node)
ls += op.outer_mitsot(node) ls += op.outer_mitsot(node)
ls += op.outer_sitsot(node) ls += op.outer_sitsot(node)
ls_end = op.outer_shared(node) ls_end = op.outer_shared(node)
ls_end += op.outer_nitsot(node) ls_end += op.outer_nitsot(node)
ls_end += op.outer_non_seqs(node) ls_end += op.outer_non_seqs(node)
n_outs = len(ls) n_outs = len(ls)
...@@ -1371,19 +1516,19 @@ def gpu_scan_make_inplace(node): ...@@ -1371,19 +1516,19 @@ def gpu_scan_make_inplace(node):
inputs = ls_begin + ls + ls_end inputs = ls_begin + ls + ls_end
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType( typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable = broadcastable) broadcastable=broadcastable)
new_op = scan_op.Scan( op.inputs new_op = scan_op.Scan(op.inputs,
, op.outputs op.outputs,
, info info,
, typeConstructor = typeConstructor typeConstructor=typeConstructor)
)
return new_op.make_node(*inputs).outputs return new_op.make_node(*inputs).outputs
return False return False
# Register the in-place rewrite of gpu scan ops at priority 75, the same
# stage as the other inplace optimizations.
optdb.register('gpu_scanOp_make_inplace',
               theano.tensor.opt.in2out(gpu_scan_make_inplace,
                                        ignore_newtrees=True),
               75, 'gpu', 'fast_run', 'inplace', 'scan')
...@@ -68,6 +68,15 @@ def test_gpualloc(): ...@@ -68,6 +68,15 @@ def test_gpualloc():
assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l ) assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l )
def test_gpuspecifyshape():
    """SpecifyShape applied to a gpu shared variable must stay on the
    gpu: the compiled update graph should contain no HostFromGpu node."""
    x = cuda.shared_constructor(numpy.ones(3, dtype='float32'), 'x')
    m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
    f = theano.function([], updates={x: m * numpy.float32(2)},
                        mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    assert not numpy.any([isinstance(apply_node.op, cuda.HostFromGpu)
                          for apply_node in topo])
def test_softmax(): def test_softmax():
x = tensor.fmatrix() x = tensor.fmatrix()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论