Refactor CTC GPU wrapper to make use of L_op and allocate costs and gradients in make_node

Parent commit: ba6d2de5
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import theano import theano
from theano import Op from theano import (config, gof)
from theano import config
import theano.tensor as T import theano.tensor as T
from .basic_ops import (gpu_contiguous, as_gpuarray_variable, from .basic_ops import (gpu_contiguous, as_gpuarray_variable, infer_context_name)
infer_context_name, CGpuKernelBase)
import theano.tensor.nnet.ctc import theano.tensor.nnet.ctc
from .type import GpuArrayType from .type import (GpuArrayType, gpu_context_type)
from .elemwise import GpuDimShuffle from .elemwise import GpuDimShuffle
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.gof import local_optimizer from theano.gof import local_optimizer
...@@ -20,32 +18,44 @@ import pygpu ...@@ -20,32 +18,44 @@ import pygpu
# Cache the Theano config flag for the optional Baidu warp-ctc binding once at
# import time; the Ops below refuse to be constructed when this is False.
ctc_enabled = config.ctc.enabled
class GpuConnectionistTemporalClassification(CGpuKernelBase, Op): class GpuConnectionistTemporalClassification(gof.COp):
""" """
GPU wrapper for Baidu CTC loss function. GPU wrapper for Baidu CTC loss function.
Parameters
----------
activations
Three-dimensional tensor, which has a shape of (t, m, p), where
t is the time index, m is the minibatch index, and p is the index
over the probabilities of each symbol in the alphabet. The memory
layout is assumed to be in C-order, which consists in the slowest
to the fastest changing dimension, from left to right. In this case,
p is the fastest changing dimension.
labels
A 1-D tensor of all the labels for the minibatch.
input_lengths
A 1-D tensor with the number of time steps for each sequence in
the minibatch.
Returns
-------
3D tensor
Cost of each example in the minibatch. Tensor is of shape
(time index, minibatch index, probabilities).
""" """
__props__ = ('context_name', 'compute_grad',) __props__ = ('compute_grad',)
func_file = "./ctc_wrapper.c" func_file = "./ctc_wrapper.c"
func_name = "APPLY_SPECIFIC(ctc_cost_gpu)" func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"
def __init__(self, compute_grad=True, context_name=None): params_type = gpu_context_type
def __init__(self, compute_grad=True):
if not compute_grad: if not compute_grad:
self.func_name = "APPLY_SPECIFIC(ctc_cost_gpu_no_grad)" self.func_name = "APPLY_SPECIFIC(ctc_cost_gpu_no_grad)"
self.compute_grad = compute_grad self.compute_grad = compute_grad
self.context_name = context_name
Op.__init__(self)
CGpuKernelBase.__init__(self, self.func_file, self.func_name)
self.costs = GpuArrayType(dtype='float32', gof.COp.__init__(self, self.func_file, self.func_name)
broadcastable=(False,),
context_name=self.context_name)
if self.compute_grad:
self.gradients = GpuArrayType(dtype='float32',
broadcastable=(False, False, False,),
context_name=self.context_name)
if config.ctc.root == "": if config.ctc.root == "":
raise ValueError('ctc.root variable is not set, please set it ' raise ValueError('ctc.root variable is not set, please set it '
...@@ -75,35 +85,19 @@ class GpuConnectionistTemporalClassification(CGpuKernelBase, Op): ...@@ -75,35 +85,19 @@ class GpuConnectionistTemporalClassification(CGpuKernelBase, Op):
return ['ctc.h', 'numpy_compat.h', 'gpuarray_helper.h', 'gpuarray/types.h', return ['ctc.h', 'numpy_compat.h', 'gpuarray_helper.h', 'gpuarray/types.h',
'gpuarray_api.h', 'gpuarray/array.h', 'gpuarray/util.h'] 'gpuarray_api.h', 'gpuarray/array.h', 'gpuarray/util.h']
def get_params(self, node):
    """Return the GPU context this apply node's kernel should run on.

    The context is read from the first input (the activations), which
    ``make_node`` has already converted to a GPU array variable, so it is
    guaranteed to carry a context.
    """
    activations = node.inputs[0]
    return activations.type.context
Parameters
----------
activations
Three-dimensional tensor, which has a shape of (t, m, p), where
t is the time index, m is the minibatch index, and p is the index
over the probabilities of each symbol in the alphabet. The memory
layout is assumed to be in C-order, which consists in the slowest
to the fastest changing dimension, from left to right. In this case,
p is the fastest changing dimension.
labels
A 1-D tensor of all the labels for the minibatch.
input_lengths
A 1-D tensor with the number of time steps for each sequence in
the minibatch.
"""
def make_node(self, activations, labels, input_lengths):
if not ctc_enabled: if not ctc_enabled:
raise RuntimeError('Baidu CTC is not enabled and ' raise RuntimeError('Baidu CTC is not enabled and '
'GpuConnectionistTemporalClassification Op ' 'GpuConnectionistTemporalClassification Op '
'can not be constructed.') 'can not be constructed.')
context = infer_context_name(activations, labels, input_lengths) context = infer_context_name(activations, labels, input_lengths)
assert context == self.context_name
t_activations = as_gpuarray_variable(activations, t_activations = as_gpuarray_variable(activations,
context_name=self.context_name) context_name=context)
# Ensure activations array is C-contiguous # Ensure activations array is C-contiguous
t_activations = gpu_contiguous(t_activations) t_activations = gpu_contiguous(t_activations)
...@@ -123,24 +117,31 @@ class GpuConnectionistTemporalClassification(CGpuKernelBase, Op): ...@@ -123,24 +117,31 @@ class GpuConnectionistTemporalClassification(CGpuKernelBase, Op):
# Return only the cost. Gradient will be returned by grad() # Return only the cost. Gradient will be returned by grad()
self.default_output = 0 self.default_output = 0
out_params = [as_gpuarray_variable(self.costs(), context_name=self.context_name)] costs = GpuArrayType(dtype='float32',
if self.gradients is not None: broadcastable=(False,),
out_params.append(as_gpuarray_variable(self.gradients(), context_name=context)()
context_name=self.context_name))
if self.compute_grad:
gradients = GpuArrayType(dtype='float32',
broadcastable=(False, False, False,),
context_name=context)()
return theano.Apply(self, inputs=[t_activations, t_labels, t_input_lengths], return theano.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
outputs=out_params) outputs=[costs, gradients])
def grad(self, inputs, output_grads): def L_op(self, inputs, outputs, output_grads):
if not ctc_enabled: if not ctc_enabled:
raise RuntimeError('Baidu CTC is not enabled and ' raise RuntimeError('Baidu CTC is not enabled and '
'GpuConnectionistTemporalClassification Op ' 'GpuConnectionistTemporalClassification Op '
'can not be constructed.') 'can not be constructed.')
z = output_grads[0] # Gradients computed by Op
grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False), gradients = outputs[1]
new_order=(1, 0, 2))(self.gradients()) # Gradients of original function, to compose chain rule
grad_bdot = T.basic.batched_dot(z, grad_shuffle) grad_op = output_grads[0]
grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False), grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False,),
new_order=(1, 0, 2))(gradients)
grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False,),
new_order=(1, 0, 2))(grad_bdot) new_order=(1, 0, 2))(grad_bdot)
return [grad_shuffle_reverse, return [grad_shuffle_reverse,
grad_undefined(self, 1, inputs[1]), grad_undefined(self, 1, inputs[1]),
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment