提交 dae1f236 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5949 from joaovictortr/warp_ctc_wrapper

Baidu CTC wrapper
......@@ -759,6 +759,14 @@ import theano and print the config variable, as in:
Location of the magma library.
.. attribute:: config.ctc.root
Default: ``''``
Location of the warp-ctc folder. The folder should contain a ``build``,
``lib`` or ``lib64`` subfolder with the shared library (``libwarpctc.so``),
and an ``include`` subfolder with the CTC library header.
.. attribute:: config.gcc.cxxflags
Default: ``""``
......
.. _libdoc_gpuarray_ctc:
================================================================================
:mod:`theano.gpuarray.ctc` -- Connectionist Temporal Classification (CTC) loss
================================================================================
.. note::
Usage of the connectionist temporal classification (CTC) loss Op requires that
the `warp-ctc <https://github.com/baidu-research/warp-ctc>`_ library is
available. In case the warp-ctc library is not in your compiler's library path,
the ``config.ctc.root`` configuration option must be appropriately set to the
directory containing the warp-ctc library files.
.. note::
Unfortunately, Windows platforms are not yet supported by the underlying
library.
.. module:: theano.gpuarray.ctc
:platform: Unix
:synopsis: Connectionist temporal classification (CTC) loss Op, using the warp-ctc library
.. moduleauthor:: `João Victor Risso <https://github.com/joaovictortr>`_
.. autofunction:: theano.gpuarray.ctc.gpu_ctc
.. autoclass:: theano.gpuarray.ctc.GpuConnectionistTemporalClassification
......@@ -18,3 +18,4 @@
fft
type
extra
ctc
.. _libdoc_tensor_nnet_ctc:
==================================================================================
:mod:`theano.tensor.nnet.ctc` -- Connectionist Temporal Classification (CTC) loss
==================================================================================
.. note::
Usage of the connectionist temporal classification (CTC) loss Op requires that
the `warp-ctc <https://github.com/baidu-research/warp-ctc>`_ library is
available. In case the warp-ctc library is not in your compiler's library path,
the ``config.ctc.root`` configuration option must be appropriately set to the
directory containing the warp-ctc library files.
.. note::
Unfortunately, Windows platforms are not yet supported by the underlying
library.
.. module:: theano.tensor.nnet.ctc
:platform: Unix
:synopsis: Connectionist temporal classification (CTC) loss Op, using the warp-ctc library
.. moduleauthor:: `João Victor Risso <https://github.com/joaovictortr>`_
.. autofunction:: theano.tensor.nnet.ctc.ctc
.. autoclass:: theano.tensor.nnet.ctc.ConnectionistTemporalClassification
......@@ -21,3 +21,4 @@ and ops which are particular to neural networks and deep learning.
neighbours
bn
blocksparse
ctc
......@@ -1839,6 +1839,13 @@ AddConfigVar(
allow_override=False),
in_c_key=False)
AddConfigVar(
'ctc.root',
'Directory which contains the root of Baidu CTC library. It is assumed \
that the compiled library is either inside the build, lib or lib64 \
subdirectory, and the header inside the include directory.',
StrParam('', allow_override=False),
in_c_key=False)
# Check if there are remaining flags provided by the user through THEANO_FLAGS.
for key in THEANO_FLAGS_DICT.keys():
......
......@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg, ctc
def transfer(x, target):
......
#section init_code
setup_ext_cuda();
#section support_code
typedef struct ctc_context {
struct ctcOptions options;
gpudata * workspace;
int * input_lengths;
int * flat_labels;
int * label_lengths;
} ctc_context_t;
// Initialize a CTC context for the GPU code path: zero the warp-ctc options,
// attach the CUDA stream of the given gpuarray context, and NULL all lazily
// allocated buffers so ctc_context_destroy() can free them unconditionally.
void ctc_context_init(ctc_context_t * context, PyGpuContextObject * gpu_context)
{
    memset(&(context->options), 0, sizeof(struct ctcOptions));
    context->options.loc = CTC_GPU;

    // Get CUDA function pointer to obtain stream
    // NOTE(review): the extension lookup result is not checked for NULL --
    // assumes "cuda_get_stream" is always available in a CUDA build; confirm.
    CUstream (*getstream_func_ptr)(void *) = (CUstream (*)(void *)) gpuarray_get_extension( "cuda_get_stream" );
    context->options.stream = getstream_func_ptr(gpu_context->ctx);

    // NULL marks "not allocated yet" for the cleanup path.
    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
// Release every resource owned by the context. Safe to call after a partial
// initialization: free(NULL) is a no-op for the host buffers.
// NOTE(review): workspace may still be NULL on early error paths; this
// assumes gpudata_release() tolerates NULL -- confirm against libgpuarray.
void ctc_context_destroy(ctc_context_t * context)
{
    gpudata_release( context->workspace );

    free( context->input_lengths );

    free( context->flat_labels );

    free( context->label_lengths );
}
// Translate a warp-ctc status code into a Python RuntimeError.
// Returns 1 (and sets the exception) on failure, 0 on CTC_STATUS_SUCCESS,
// so callers can write: if ( ctc_check_result(...) ) { cleanup; return 1; }
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if( CTC_STATUS_SUCCESS != retcode )
    {
        // Get error message from underlying library
        const char * ctc_msg = ctcGetStatusString( retcode );
        PyErr_Format( PyExc_RuntimeError,
                      "GpuConnectionistTemporalClassification: %s CTC error: %s",
                      msg,
                      ctc_msg );
        return 1;
    }
    return 0;
}
// Copy the (possibly strided) NumPy vector of per-sequence lengths into a
// plain contiguous C int array, as required by warp-ctc. On allocation
// failure *input_lengths stays NULL and the caller raises MemoryError.
// NOTE(review): elements are read as npy_int -- assumes the array is int32,
// which the Python-side make_node() type check enforces.
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
    int ** input_lengths )
{
    npy_int num_elements = PyArray_DIMS( input_lengths_arr )[0];

    *input_lengths = (int *) malloc( num_elements * sizeof(int) );

    if ( NULL == (*input_lengths) )
        return;

    // Element-wise copy via GETPTR1 handles non-contiguous input arrays.
    for( npy_int elem_idx = 0; elem_idx < num_elements; ++elem_idx )
    {
        (*input_lengths)[elem_idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, elem_idx ) );
    }
}
// Flatten a 2-D label matrix (one row of padded labels per minibatch entry)
// into the layout warp-ctc expects: *flat_labels receives all non-negative
// labels concatenated row by row, and *label_lengths receives the count of
// real (non-padding) labels in each row.
// On allocation failure both outputs are left/reset to NULL so the caller
// can detect the error and raise MemoryError.
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
    int ** label_lengths )
{
    npy_int rows = PyArray_DIMS( label_matrix )[0];
    npy_int cols = PyArray_DIMS( label_matrix )[1];

    // calloc gives zeroed storage; at most rows*cols labels can be stored.
    *flat_labels = (int *) calloc( rows * cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;

    *label_lengths = (int *) calloc( rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        // Keep the pair consistent: release the first buffer on failure.
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }

    npy_int label_index = 0;
    for( npy_int row_idx = 0; row_idx < rows; ++row_idx )
    {
        npy_int label_length = 0;
        for( npy_int col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 )  // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
/* Compute the warp-ctc CTC loss (and optionally its gradient) on the GPU.
 *
 * in_activations   : float32 GPU array of shape (time, minibatch, alphabet)
 *                    (dims read below; dtype enforced by the GA_FLOAT switch).
 * in_labels        : host int32 label matrix; negatives mean padding.
 * in_input_lengths : host int32 vector of per-sequence lengths.
 * out_costs        : output, 1-D float32 GPU array of per-example costs.
 * out_gradients    : output, GPU array shaped like the activations, or the
 *                    pointer is NULL when gradient computation is disabled.
 * gpu_context      : gpuarray context (the Op's params).
 *
 * Returns 0 on success, 1 on failure with a Python exception set. Every
 * error path destroys the CTC context and leaves the CUDA context before
 * returning.
 */
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject   *  in_activations,
                                 PyArrayObject      *  in_labels,
                                 PyArrayObject      *  in_input_lengths,
                                 PyGpuArrayObject   ** out_costs,
                                 PyGpuArrayObject   ** out_gradients,
                                 PyGpuContextObject *  gpu_context)
{
    // Stack-allocated context; all heap/GPU buffers inside it are freed by
    // ctc_context_destroy() on every exit path.
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;

    size_t gpu_workspace_size;
    int ctc_error = 0;

    const size_t num_activations = PyGpuArray_DIMS( in_activations )[0];
    const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1];
    const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2];
    const size_t cost_size = minibatch_size;

    const size_t grad_dims[3] = { num_activations, minibatch_size, alphabet_size };

    float * costs = NULL,
          * activations = NULL,
          * gradients = NULL;

    cuda_enter( gpu_context->ctx );

    ctc_context_init( context, gpu_context );

    // Only float32 activations are supported by this wrapper.
    switch (in_activations->ga.typecode)
    {
    case GA_FLOAT:
        activations = (float *) PyGpuArray_DEV_DATA( in_activations );
        break;
    default:
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_SetString( PyExc_TypeError,
            "GpuConnectionistTemporalClassification: Unsupported type for activations." );

        return 1;
    }

    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );

    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for input lengths." );
        return 1;
    }

    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );

    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for labels and their lengths." );
        return 1;
    }

    // Allocate (or reuse) the cost output and zero it.
    if ( theano_prep_output( out_costs, 1, &cost_size, in_activations->ga.typecode,
                             GA_C_ORDER, gpu_context ) != 0 )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    GpuArray_memset( &((*out_costs)->ga), 0 );

    costs = (float *) PyGpuArray_DEV_DATA( *out_costs );

    if ( NULL != out_gradients )  // if gradient computation is not disabled
    {
        if ( theano_prep_output( out_gradients, 3, grad_dims, in_activations->ga.typecode,
                                 GA_C_ORDER, gpu_context ) != 0 )
        {
            ctc_context_destroy( context );
            cuda_exit( gpu_context->ctx );

            return 1;
        }

        GpuArray_memset( &((*out_gradients)->ga), 0 );

        gradients = (float *) PyGpuArray_DEV_DATA( *out_gradients );
    }

    // Ask warp-ctc how much scratch memory it needs for this problem size.
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &gpu_workspace_size ),
        "Failed to obtain CTC workspace size." );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    context->workspace = gpudata_alloc( gpu_context->ctx, gpu_workspace_size, NULL, 0, NULL );

    if ( NULL == context->workspace )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }

    // Synchronize with any pending gpuarray operations on the buffers
    // before handing the raw pointers to warp-ctc.
    cuda_wait( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_wait( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    // NOTE(review): "*(void **)context->workspace" assumes the raw device
    // pointer is the first field of the opaque gpudata struct -- confirm
    // against the libgpuarray version in use.
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, *(void **)context->workspace,
        context->options ), "Failed to compute CTC loss function." );

    // Record the work so later gpuarray operations order correctly after it.
    cuda_record( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_record( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    ctc_context_destroy( context );
    cuda_exit( gpu_context->ctx );

    return 0;
}
from __future__ import absolute_import, print_function, division
import theano
from theano import (config, gof)
import theano.tensor as T
from .basic_ops import (gpu_contiguous, as_gpuarray_variable, infer_context_name)
import theano.tensor.nnet.ctc
from .type import (GpuArrayType, gpu_context_type)
from .elemwise import GpuDimShuffle
from theano.gradient import grad_undefined
from theano.gof import local_optimizer
from theano.tensor.opt import register_canonicalize
from theano.tensor.nnet.ctc import ctc_available
import os
import os.path
from . import pygpu
class GpuConnectionistTemporalClassification(gof.COp):
    """
    GPU wrapper for Baidu CTC loss function.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """
    __props__ = ('compute_grad',)

    # COp bookkeeping: the C function takes 3 inputs and produces up to
    # 2 outputs (cost, and the gradient when compute_grad is True).
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # External C implementation loaded by gof.COp.
    func_file = "./c_code/ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

    # The Op's params object is the GPU context the kernel runs on.
    params_type = gpu_context_type

    def __init__(self, compute_grad=True):
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'GpuConnectionistTemporalClassification Op '
                               'can not be constructed.')

        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

        gof.COp.__init__(self, self.func_file, self.func_name)

    def c_lib_dirs(self):
        """Directories searched at link time for the warp-ctc library."""
        lib_dirs = []
        if ctc_available.path is not None:
            # ctc_available() cached the directory containing libwarpctc.so.
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_libraries(self):
        """Libraries linked into the generated module."""
        return ["warpctc", "gpuarray"]

    def c_header_dirs(self):
        """Include dirs: this package's helper headers, pygpu, and warp-ctc."""
        dirs = [os.path.dirname(__file__), pygpu.get_include()]
        if config.ctc.root != '':
            # The warp-ctc header is assumed to live in <ctc.root>/include.
            dirs.append(os.path.join(config.ctc.root, "include"))
        return dirs

    def c_headers(self):
        """Headers required by the C implementation (warp-ctc + gpuarray glue)."""
        return ['ctc.h', 'numpy_compat.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'gpuarray/types.h', 'gpuarray_api.h',
                'gpuarray/array.h', 'gpuarray/util.h', 'gpuarray/extension.h']

    def get_params(self, node):
        # The params passed to the C code: the GPU context of the activations.
        return node.inputs[0].type.context

    def make_node(self, activations, labels, input_lengths):
        """
        Build the Apply node, validating dtype/ndim of the three inputs.

        ``activations`` is moved to the GPU and made C-contiguous; labels and
        input lengths stay on the host as int32 tensors.
        """
        context_name = infer_context_name(activations)
        t_activations = as_gpuarray_variable(activations,
                                             context_name=context_name)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)

        # Labels and input lengths are always on the CPU
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type.')

        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')

        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type.')

        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')

        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type.')

        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        # Per-example cost vector, always produced.
        costs = GpuArrayType(dtype='float32',
                             broadcastable=(False,),
                             context_name=context_name)()
        outputs = [costs]

        if self.compute_grad:
            # Gradient w.r.t. activations, same 3-D layout as the input.
            gradients = GpuArrayType(dtype='float32',
                                     broadcastable=(False, False, False,),
                                     context_name=context_name)()
            outputs += [gradients]

        return theano.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                            outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        # Gradients computed by Op
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None

        # Gradients of original function, to compose chain rule
        grad_op = output_grads[0]
        # Move the minibatch axis first so batched_dot contracts per example,
        # then restore the original axis order afterwards.
        grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                     new_order=(1, 0, 2))(gradients)
        grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
        grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                             new_order=(1, 0, 2))(grad_bdot)
        # No gradients flow to the integer label / length inputs.
        return [grad_shuffle_reverse,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]
def gpu_ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function on the GPU.

    Parameters
    ----------
    activations
        Three-dimensional float32 tensor of shape (t, m, p): t is the time
        step, m the minibatch entry and p the index into the alphabet
        probabilities. C-order memory layout is assumed, so p is the
        fastest-changing dimension.
    labels
        2-D int32 tensor with one row of target labels per minibatch entry.
        Negative entries are treated as padding and ignored. The blank
        symbol is assumed to be index 0 of the alphabet.
    input_lengths
        1-D int32 tensor giving the number of time steps of each sequence
        in the minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_op = GpuConnectionistTemporalClassification()
    return ctc_op(activations, labels, input_lengths)
# Disable gradient computation if not needed
@register_canonicalize
@local_optimizer([GpuConnectionistTemporalClassification])
def local_gpu_ctc_no_grad(node):
    """Swap in a gradient-free GPU CTC Op when the gradient output is unused."""
    if not isinstance(node.op, GpuConnectionistTemporalClassification):
        return False
    if len(node.outputs) <= 1:
        return False
    if len(node.outputs[1].clients) > 0:
        # The gradient output has consumers; keep the Op as-is.
        return False
    no_grad_op = GpuConnectionistTemporalClassification(compute_grad=False)
    return [no_grad_op(*node.inputs), None]
......@@ -34,6 +34,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs)
from theano.tensor.nnet.neighbours import Images2Neibs
from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg
......@@ -80,6 +81,7 @@ from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh,
GpuCublasTriangularSolve, cublas_available)
from .neighbours import GpuImages2Neibs
from .ctc import GpuConnectionistTemporalClassification
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -162,6 +164,7 @@ def register_inplace(*tags, **kwargs):
return local_opt
return f
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name='gpua_constant_folding')(
tensor.opt.constant_folding)
......@@ -582,6 +585,7 @@ def local_cut_gpu_transfers(node):
else:
return [node.op(n2.inputs[0])]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers',
......@@ -652,6 +656,8 @@ def local_gpua_alloc_empty_to_zeros(node):
z = np.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
......@@ -1540,6 +1546,8 @@ def local_conv_gpu_conv(node):
return [tensor.as_tensor_variable(out)]
else:
return [out]
register_opt()(local_conv_gpu_conv)
......@@ -1812,6 +1820,8 @@ def local_gpu_pool(op, ctx_name, inputs, outputs):
inp_padded = pad_dims(inp, 2, nd)
ret_padded = op(inp_padded, ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd)
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
......@@ -1849,6 +1859,8 @@ def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs):
ret_padded = op(inp_padded, out_padded, out_grad_padded,
ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd)
lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad)
pool_db.register("local_gpu_max_pool_grad", lifter,
'gpuarray', 'fast_compile', 'fast_run',
......@@ -1879,6 +1891,8 @@ def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs):
ret_padded = op(inp_padded, out_grad_padded,
ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd)
lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad)
pool_db.register("local_gpu_average_pool_grad", lifter,
'gpuarray', 'fast_compile', 'fast_run',
......@@ -1976,6 +1990,7 @@ def local_assert_no_cpu_op(node):
elif config.assert_no_cpu_op == "pdb":
pdb.set_trace()
# Register the local_assert_no_cpu_op:
assert_no_cpu_op = theano.tensor.opt.in2out(local_assert_no_cpu_op,
name='assert_no_cpu_op')
......@@ -2308,6 +2323,15 @@ def local_gpu_magma_svd(op, context_name, inputs, outputs):
out = [out.astype('float16')]
return out
@register_opt('ctc', 'fast_compile')
@op_lifter([theano.tensor.nnet.ctc.ConnectionistTemporalClassification])
@register_opt2([ConnectionistTemporalClassification], 'ctc', 'fast_compile')
def local_gpu_ctc(op, context_name, inputs, outputs):
    """Lift the CPU CTC Op to its GPU counterpart, preserving compute_grad."""
    op = GpuConnectionistTemporalClassification(compute_grad=op.compute_grad)
    return op.make_node(*inputs).outputs
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import (division, absolute_import, print_function)
import unittest
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
import theano.gpuarray
from theano.gpuarray.ctc import (gpu_ctc, GpuConnectionistTemporalClassification)
from theano.tensor.nnet.ctc import (ctc, ctc_available, ConnectionistTemporalClassification)
from theano.tensor.nnet.tests.test_ctc import (setup_torch_case, setup_ctc_case, setup_grad_case)
from .config import (mode_with_gpu, mode_without_gpu)
class TestCTC(unittest.TestCase):
    """
    Tests of the GPU CTC Op: expected values, CPU/GPU parity, the
    gradient-disabling optimization, and CPU-to-GPU lifting.
    """

    def setUp(self):
        # Every test needs the optional warp-ctc library.
        if not ctc_available():
            self.skipTest('Optional library warp-ctc not available')

    def check_ctc(self, activations, labels, input_length, expected_costs, expected_grads):
        """Run the full battery of checks for one CTC test case."""
        # Create symbolic variables
        t_activations = theano.shared(activations, name="activations")
        t_activation_times = theano.shared(input_length, name="activation_times")
        t_labels = theano.shared(labels, name="labels")

        inputs = [t_activations, t_labels, t_activation_times]

        # Execute several tests for each test case
        self.check_expected_values(t_activations, t_labels, t_activation_times, expected_costs, expected_grads)
        self.compare_gpu_and_cpu_values(*inputs)
        self.check_grads_disabled(*inputs)
        self.run_gpu_optimization_with_grad(*inputs)
        self.run_gpu_optimization_no_grad(*inputs)

    def setup_cpu_op(self, activations, labels, input_length, compute_grad=True, mode=mode_without_gpu):
        """Compile a function returning the CPU CTC cost (and optionally grad)."""
        cpu_ctc_cost = ctc(activations, labels, input_length)
        outputs = [cpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            cpu_ctc_grad = T.grad(T.mean(cpu_ctc_cost), activations)
            outputs += [cpu_ctc_grad]
        return theano.function([], outputs, mode=mode)

    def setup_gpu_op(self, activations, labels, input_length, compute_grad=True):
        """Compile a function returning the GPU CTC cost (and optionally grad)."""
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        outputs = [gpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            gpu_ctc_grad = T.grad(T.mean(gpu_ctc_cost), activations)
            outputs += [gpu_ctc_grad]
        return theano.function([], outputs)

    def check_expected_values(self, activations, labels, input_length, expected_costs, expected_grads):
        """Compare GPU results against the reference costs/gradients."""
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values.
        # The reference grads are divided by the batch size because the
        # compiled graph differentiates T.mean of the costs.
        utt.assert_allclose(expected_grads / cost_from_gpu.shape[0], grad_from_gpu)
        utt.assert_allclose(expected_costs, cost_from_gpu)

    def compare_gpu_and_cpu_values(self, activations, labels, input_length):
        """Check that CPU and GPU Ops agree on costs and gradients."""
        cpu_train = self.setup_cpu_op(activations, labels, input_length)
        cpu_cost, cpu_grad = cpu_train()

        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()

        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)

        # Check that results are in conformance with expected values
        utt.assert_allclose(cpu_grad, grad_from_gpu)
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        gpu_ctc_function = theano.function([], [gpu_ctc_cost])
        for node in gpu_ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                assert (node.op.compute_grad is False)

    def run_gpu_optimization_with_grad(self, activations, labels, input_length):
        """Verify the CPU Op (with grad) is lifted to the GPU by mode_with_gpu."""
        # Compile CPU function with optimization
        cpu_lifted_train = self.setup_cpu_op(activations, labels, input_length, mode=mode_with_gpu)
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_train)

    def run_gpu_optimization_no_grad(self, activations, labels, input_length):
        """Verify lifting without grad and that lifted costs match the CPU ones."""
        cpu_train = self.setup_cpu_op(activations, labels, input_length, compute_grad=False)
        cpu_cost = cpu_train()
        # Compile CPU function with optimization
        cpu_lifted_test = self.setup_cpu_op(activations, labels, input_length, compute_grad=False, mode=mode_with_gpu)
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_test)
        gpu_cost = cpu_lifted_test()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Compare values from CPU and GPU Ops
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def has_only_gpu_op(self, function):
        """Return True iff the compiled graph contains the GPU Op and no CPU Op."""
        has_cpu_instance = False
        has_gpu_instance = False
        for node in function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                has_cpu_instance = True

            if isinstance(node.op, GpuConnectionistTemporalClassification):
                has_gpu_instance = True
        return has_gpu_instance and (not has_cpu_instance)

    # Test obtained from Torch tutorial at:
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    def test_torch_case(self):
        activations, labels, activation_times, expected_costs, expected_grads = setup_torch_case()
        self.check_ctc(activations, labels, activation_times, expected_costs, expected_grads)

    def test_ctc(self):
        activations, labels, input_length, expected_costs, expected_grads = setup_ctc_case()
        self.check_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_verify_grad(self):
        """Numerically verify the GPU CTC gradient via utt.verify_grad."""
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = theano.shared(in_lengths, name="activation_times")
                t_labels = theano.shared(labels, name="labels")
                return gpu_ctc(acts, t_labels, t_activation_times)
            return wrapper

        activations, labels, activation_times = setup_grad_case()

        ctc_op = ctc_op_functor(labels, activation_times)

        utt.verify_grad(ctc_op, [activations])
from __future__ import (division, absolute_import, print_function)
import os
import os.path
import theano.tensor as T
from theano import config
from theano import gof
from theano.gof import local_optimizer
from theano.gof.cmodule import GCC_compiler
from theano.tensor.opt import register_canonicalize
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import grad_undefined
def _ctc_find_lib():
    """
    Locate the directory containing the warp-ctc shared library.

    Returns
    -------
    str or None
        The first of the ``build``, ``lib`` or ``lib64`` subdirectories of
        ``config.ctc.root`` that contains ``libwarpctc.so``, or None when
        it cannot be found (including when ``config.ctc.root`` is unset).
    """
    if config.ctc.root != '':
        for lib_dir in ("build", "lib", "lib64"):
            lib_path = os.path.join(config.ctc.root, lib_dir)
            # os.path.isdir() already implies existence, so the previous
            # extra os.path.exists(lib_path) check was redundant.
            if os.path.isdir(lib_path) and \
                    os.path.exists(os.path.join(lib_path, "libwarpctc.so")):
                return lib_path
    return None
def _ctc_check_compile(ctc_lib_path):
    """
    Try compiling a minimal program against the warp-ctc header and library.

    Parameters
    ----------
    ctc_lib_path : str or None
        Directory containing libwarpctc.so, or None when it should be found
        on the compiler's default library path.

    Returns
    -------
    (bool, str or None)
        (True, None) when compilation succeeds, otherwise (False, message).
    """
    preambule = """
#include <string.h>
#include "ctc.h"
"""

    body = """
ctcOptions options;
memset(&options, 0, sizeof(ctcOptions));
options.loc = CTC_CPU;
options.num_threads = 1;
"""

    params = ['-I%s' % (os.path.dirname(__file__))]
    if ctc_lib_path is not None:
        # Header assumed under <ctc.root>/include; library next to ctc_lib_path.
        params.extend(["-I%s" % (os.path.join(config.ctc.root, "include"))])
        params.extend(["-L%s" % (ctc_lib_path)])
    # gcc accepts "-l name" as two separate arguments.
    params.extend(["-l", "warpctc"])
    compiler_res = GCC_compiler.try_flags(
        params, preambule=preambule, body=body,
        try_run=False, output=True)

    # try_flags may return a bare bool or a (avail, stdout, stderr) tuple.
    avail, out, err = compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None)

    if not avail:
        return False, ("cannot compile with warp-ctc. "
                       "We got this error:\n" + str(err))
    return True, None
def ctc_present():
    """
    Lazily probe whether warp-ctc can be located and compiled against.

    The result (plus the library path and any error message) is cached on
    the function object, so the compile probe runs at most once.
    """
    cached = ctc_present.avail
    if cached is not None:
        # The probe already ran; serve the memoized answer.
        return cached

    ctc_present.path = _ctc_find_lib()
    avail, msg = _ctc_check_compile(ctc_present.path)
    ctc_present.avail = avail
    ctc_present.msg = msg
    return avail

ctc_present.avail = None
ctc_present.msg = None
ctc_present.path = None
def ctc_available():
    """
    Check whether the warp-ctc library is usable on this platform.

    On failure, a human-readable reason is stored in ``ctc_available.msg``;
    on success, ``ctc_available.path`` holds the library directory (or None
    when it is on the default search path).
    """
    if os.name == 'nt':
        # BUG FIX: the previous code ended the assignment with a trailing
        # comma, which made msg a one-element tuple and left the second
        # string as a dead expression statement. Store one full string.
        ctc_available.msg = ('Windows platforms are currently not supported '
                            'by underlying CTC library (warp-ctc).')
        return False
    elif not ctc_present():
        ctc_available.msg = ctc_present.msg
        return False

    ctc_available.path = ctc_present.path
    return True

ctc_available.msg = None
ctc_available.path = None
class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
    """
    CTC loss function wrapper.

    Notes
    -----
    Using the wrapper requires that Baidu's warp-ctc library is installed.
    If the warp-ctc library is not on your compiler's default library path,
    you must set the configuration variable ``config.ctc.root`` appropriately.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """
    __props__ = ('compute_grad',)

    # COp bookkeeping: 3 inputs, up to 2 outputs (cost, optional gradient).
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # External C implementation loaded by gof.COp.
    func_file = "./ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_cpu)"

    def __init__(self, compute_grad=True, openmp=None):
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'ConnectionistTemporalClassification Op '
                               'can not be constructed.')

        gof.COp.__init__(self, self.func_file, self.func_name)
        gof.OpenMPOp.__init__(self, openmp=openmp)

        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

    def c_lib_dirs(self):
        """Directories searched at link time for libwarpctc."""
        lib_dirs = []
        if ctc_available.path is not None:
            # ctc_available() cached the directory containing libwarpctc.so.
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_libraries(self):
        """Libraries linked into the generated module."""
        return ["warpctc"]

    def c_header_dirs(self):
        header_dirs = []
        if config.ctc.root != '':
            # We assume here that the header is available at the include directory
            # of the CTC root directory.
            header_dirs += [os.path.join(config.ctc.root, "include")]
        return header_dirs

    def c_headers(self):
        # warp-ctc header plus whatever headers OpenMP support requires.
        return ["ctc.h"] + gof.OpenMPOp.c_headers(self)

    def make_node(self, activations, labels, input_lengths):
        """
        Build the Apply node, validating dtype/ndim of the three inputs.
        """
        t_activations = T.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type!')

        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')

        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type!')

        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')

        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type!')

        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        # Per-example cost vector; gradient tensor only when requested.
        costs = T.fvector(name="ctc_cost")
        outputs = [costs]

        if self.compute_grad:
            gradients = T.ftensor3(name="ctc_grad")
            outputs += [gradients]

        return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                         outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        # The Op itself computed the per-example gradient as output 1.
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None

        # Chain rule: contract the incoming cost gradient with the stored
        # gradient; dimshuffles move the batch axis first for batched_dot
        # and restore the (time, batch, alphabet) order afterwards.
        grad_op = output_grads[0]
        total_grad = T.basic.batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(1, 0, 2)
        # No gradients flow to the integer label / length inputs.
        return [total_grad,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]
def ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function.

    Notes
    -----
    Using the loss function requires that Baidu's warp-ctc library be
    installed. If warp-ctc is not on the compiler's default library path,
    the configuration variable ``config.ctc.root`` must be properly set.

    Parameters
    ----------
    activations
        Three-dimensional float32 tensor of shape (t, m, p): t is the time
        step, m the minibatch entry and p the index into the alphabet
        probabilities. C-order memory layout is assumed, so p is the
        fastest-changing dimension.
    labels
        2-D int32 tensor with one row of target labels per minibatch entry.
        Negative entries are treated as padding and ignored. The blank
        symbol is assumed to be index 0 of the alphabet.
    input_lengths
        1-D int32 tensor giving the number of time steps of each sequence
        in the minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_op = ConnectionistTemporalClassification()
    return ctc_op(activations, labels, input_lengths)
# Disable gradient computation if not needed
@register_canonicalize
@local_optimizer([ConnectionistTemporalClassification])
def local_ctc_no_grad(node):
    """Swap in a gradient-free CTC Op when the gradient output is unused."""
    if not isinstance(node.op, ConnectionistTemporalClassification):
        return False
    if len(node.outputs) <= 1:
        return False
    if len(node.outputs[1].clients) > 0:
        # The gradient output has consumers; keep the Op as-is.
        return False
    no_grad_op = ConnectionistTemporalClassification(compute_grad=False)
    return [no_grad_op(*node.inputs), None]
#section support_code

// Bundles everything one warp-ctc invocation needs: the library options,
// the scratch workspace, and contiguous copies of the integer inputs.
// All pointer members are heap buffers owned by the context.
typedef struct ctc_context {
    struct ctcOptions options;   // execution options (device, thread count)
    void * workspace;            // scratch buffer, sized by get_workspace_size()
    int * input_lengths;         // per-sequence valid time-step counts
    int * flat_labels;           // all labels concatenated, padding removed
    int * label_lengths;         // number of labels kept per sequence
} ctc_context_t;
/* Initialize a context for CPU execution: zero the options, pick the
 * OpenMP thread count (1 when OpenMP is unavailable) and null out every
 * buffer pointer so ctc_context_destroy is safe at any point. */
void ctc_context_init(ctc_context_t * context)
{
    memset(&(context->options), 0, sizeof(struct ctcOptions));
    context->options.loc = CTC_CPU;
#if defined(_OPENMP)
    context->options.num_threads = omp_get_num_threads();
#else
    context->options.num_threads = 1;
#endif

    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
/* Release every heap buffer owned by the context.  free(NULL) is a
 * no-op, so this is safe on partially-initialized contexts. */
void ctc_context_destroy(ctc_context_t * context)
{
    free(context->workspace);
    free(context->input_lengths);
    free(context->flat_labels);
    free(context->label_lengths);
}
/* Translate a warp-ctc status code into a Python RuntimeError.
 * Returns 0 on success; on failure sets the exception (prefixed with
 * the caller-supplied msg) and returns 1. */
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if (retcode == CTC_STATUS_SUCCESS)
        return 0;

    // Get error message from underlying library
    PyErr_Format(PyExc_RuntimeError,
                 "ConnectionistTemporalClassification: %s CTC error: %s",
                 msg,
                 ctcGetStatusString(retcode));
    return 1;
}
/* Copy the (possibly strided) NumPy int vector of sequence lengths into
 * a freshly calloc'd contiguous int array, as warp-ctc requires.  On
 * allocation failure *input_lengths is left NULL; caller must check. */
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
    int ** input_lengths )
{
    npy_int count = PyArray_DIMS( input_lengths_arr )[0];

    int * buffer = (int *) calloc( count, sizeof(int) );
    *input_lengths = buffer;
    if ( NULL == buffer )
        return;

    for ( npy_int i = 0; i < count; ++i )
        buffer[i] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, i ) );
}
/* Flatten the 2-D label matrix into the packed layout warp-ctc expects:
 * *flat_labels receives all non-negative labels row by row, and
 * *label_lengths the number of labels kept per row.  Negative entries
 * are padding and are skipped.  On allocation failure both outputs are
 * left NULL; caller must check. */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
    int ** label_lengths )
{
    npy_int rows = PyArray_DIMS( label_matrix )[0];
    npy_int cols = PyArray_DIMS( label_matrix )[1];

    int * labels_out = (int *) calloc( rows * cols, sizeof(int) );
    *flat_labels = labels_out;
    if ( NULL == labels_out )
        return;

    int * lengths_out = (int *) calloc( rows, sizeof(int) );
    *label_lengths = lengths_out;
    if ( NULL == lengths_out )
    {
        // Undo the first allocation so the caller sees a consistent failure.
        free( labels_out );
        *flat_labels = NULL;
        return;
    }

    npy_int out_idx = 0;
    for ( npy_int row = 0; row < rows; ++row )
    {
        npy_int kept = 0;
        for ( npy_int col = 0; col < cols; ++col )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row, col ) );
            if ( label >= 0 )  // negative values are assumed to be padding
            {
                labels_out[ out_idx++ ] = label;
                ++kept;
            }
        }
        lengths_out[ row ] = kept;
    }
}
#section support_code_apply

/* Compute the CTC cost, and optionally its gradient w.r.t. the
 * activations, by calling into Baidu's warp-ctc library on the CPU.
 *
 * in_activations   : float32 (time, batch, alphabet) array; must be
 *                    C-contiguous since the library indexes it directly.
 * in_labels        : int (batch, max_label_len) matrix; negative entries
 *                    are padding.
 * in_input_lengths : int vector of valid time steps per sequence.
 * out_costs        : float32 (batch,) per-example costs; (re)allocated
 *                    here when missing or mis-shaped.
 * out_gradients    : float32 array shaped like in_activations, or NULL
 *                    when the Op was built with compute_grad=False.
 *
 * Returns 0 on success, 1 with a Python exception set on failure.  All
 * heap buffers are owned by the stack-local context and freed by
 * ctc_context_destroy on every exit path. */
int APPLY_SPECIFIC(ctc_cost_cpu)(PyArrayObject *   in_activations,
                                 PyArrayObject *   in_labels,
                                 PyArrayObject *   in_input_lengths,
                                 PyArrayObject **  out_costs,
                                 PyArrayObject **  out_gradients)
{
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;

    ctc_context_init( context );

    if ( !PyArray_IS_C_CONTIGUOUS( in_activations ) )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "ConnectionistTemporalClassification: activations array must be C-contiguous." );
        return 1;
    }

    npy_float32 * activations = (npy_float32 *) PyArray_DATA( in_activations );

    // Copy the sequence lengths into a contiguous int buffer for the library.
    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );

    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for input lengths" );
        return 1;
    }

    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );

    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for labels and their lengths" );
        return 1;
    }

    npy_int minibatch_size = PyArray_DIMS( in_activations )[1];
    npy_int alphabet_size = PyArray_DIMS( in_activations )[2];

    npy_float32 * costs = NULL;
    npy_intp cost_size = minibatch_size;

    // Reuse the existing cost buffer only when it is a vector of the
    // right length; otherwise drop it and allocate a fresh one.
    if ( (*out_costs) == NULL ||               // Symbolic variable has no memory backing
         PyArray_NDIM( *out_costs ) != 1 ||    // or, matrix has the wrong size
         PyArray_DIMS( *out_costs )[0] != cost_size )
    {
        Py_XDECREF( *out_costs );
        // Allocate new matrix
        *out_costs = (PyArrayObject *) PyArray_ZEROS( 1, &cost_size, NPY_FLOAT32, 0 );

        if ( NULL == (*out_costs) )
        {
            // Destroy previous CTC context before returning exception
            ctc_context_destroy( context );

            PyErr_Format( PyExc_MemoryError,
                "ConnectionistTemporalClassification: Could not allocate memory for CTC costs" );
            return 1;
        }
    }

    costs = (npy_float32 *) PyArray_DATA( *out_costs );

    npy_float32 * gradients = NULL;

    if ( NULL != out_gradients )  // If gradient computation is not disabled
    {
        // Reuse the gradient buffer only when it exactly matches the
        // activations' 3-D shape.
        if ( NULL == (*out_gradients) ||  // Symbolic variable has no real backing
             PyArray_NDIM( *out_gradients ) != 3 ||
             PyArray_DIMS( *out_gradients )[0] != PyArray_DIMS( in_activations )[0] ||
             PyArray_DIMS( *out_gradients )[1] != PyArray_DIMS( in_activations )[1] ||
             PyArray_DIMS( *out_gradients )[2] != PyArray_DIMS( in_activations )[2] )
        {
            // Existing matrix is the wrong size. Make a new one.
            // Decrement ref counter to existing array
            Py_XDECREF( *out_gradients );
            // Allocate new array
            *out_gradients = (PyArrayObject *) PyArray_ZEROS(3, PyArray_DIMS( in_activations ),
                NPY_FLOAT32, 0);

            if ( NULL == (*out_gradients) )
            {
                // Destroy previous CTC context before returning exception
                ctc_context_destroy( context );

                PyErr_Format( PyExc_MemoryError,
                    "ConnectionistTemporalClassification: Could not allocate memory for CTC gradients!" );
                return 1;
            }
        }
        gradients = (npy_float32 *) PyArray_DATA( *out_gradients );
    }

    size_t cpu_workspace_size;
    int ctc_error;

    // Ask the library how much scratch memory this problem requires.
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &cpu_workspace_size ),
        "Failed to obtain CTC workspace size." );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        return 1;
    }

    context->workspace = malloc( cpu_workspace_size );

    if ( NULL == context->workspace )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }

    // Forward pass; also fills `gradients` in-place when it is non-NULL.
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, context->workspace,
        context->options ), "Failed to compute CTC loss function." );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        return 1;
    }

    ctc_context_destroy( context );

    return 0;
}
from __future__ import (division, absolute_import, print_function)
import unittest
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.ctc import (ctc_available, ctc, ConnectionistTemporalClassification)
def setup_torch_case():
    """Fixture from the warp-ctc Torch tutorial.

    See https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md

    Layout of the activations, from slowest to fastest changing
    dimension, is (time, batchSize, inputLayerSize).

    Returns ``[activations, labels, activation_times, expected_costs,
    expected_gradients]``.
    """
    acts = np.asarray(
        [[[0, 0, 0, 0, 0],
          [1, 2, 3, 4, 5],
          [-5, -4, -3, -2, -1]],
         [[0, 0, 0, 0, 0],
          [6, 7, 8, 9, 10],
          [-10, -9, -8, -7, -6]],
         [[0, 0, 0, 0, 0],
          [11, 12, 13, 14, 15],
          [-15, -14, -13, -12, -11]]],
        dtype=np.float32)
    # Duration (valid time steps) of each sequence.
    durations = np.asarray([1, 3, 3], dtype=np.int32)
    # Target labels per sequence; -1 marks padding.
    targets = np.asarray([[1, -1],
                          [3, 3],
                          [2, 3]], dtype=np.int32)
    ref_costs = np.asarray([1.609437943, 7.355742931, 4.938849926],
                           dtype=np.float32)
    ref_grads = np.asarray(
        [[[0.2, -0.8, 0.2, 0.2, 0.2],
          [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
          [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]],
         [[0, 0, 0, 0, 0],
          [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627],
          [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]],
         [[0, 0, 0, 0, 0],
          [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
          [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]]],
        dtype=np.float32)
    return [acts, targets, durations, ref_costs, ref_grads]
def setup_ctc_case():
    """Small two-sequence fixture with known costs and gradients.

    Expected values come from an external C program that calls warp-ctc
    directly.  Returns ``[activations, labels, activation_times,
    expected_costs, expected_gradients]``.
    """
    acts = np.asarray(
        [[[0.1, 0.6, 0.1, 0.1, 0.1],
          [0.1, 0.1, 0.6, 0.1, 0.1]],
         [[0.6, 0.1, 0.1, 0.1, 0.1],
          [0.1, 0.1, 0.5, 0.2, 0.1]]],
        dtype=np.float32)
    durations = np.asarray([2, 2], dtype=np.int32)
    targets = np.asarray([[1, 2], [1, 2]], dtype=np.int32)
    ref_costs = np.asarray([2.962858438, 3.053659201], dtype=np.float32)
    ref_grads = np.asarray(
        [[[0.177031219, -0.7081246376, 0.177031219, 0.177031219, 0.177031219],
          [0.177031219, -0.8229685426, 0.291875124, 0.177031219, 0.177031219]],
         [[0.291875124, 0.177031219, -0.8229685426, 0.177031219, 0.177031219],
          [0.1786672771, 0.1786672771, -0.7334594727, 0.1974578798, 0.1786672771]]],
        dtype=np.float32)
    return [acts, targets, durations, ref_costs, ref_grads]
def setup_grad_case():
    """Inputs only (no reference values) for numeric gradient checking.

    Returns ``[activations, labels, activation_times]`` with the same
    data as ``setup_ctc_case``.
    """
    acts = np.asarray(
        [[[0.1, 0.6, 0.1, 0.1, 0.1],
          [0.1, 0.1, 0.6, 0.1, 0.1]],
         [[0.6, 0.1, 0.1, 0.1, 0.1],
          [0.1, 0.1, 0.5, 0.2, 0.1]]],
        dtype=np.float32)
    durations = np.asarray([2, 2], dtype=np.int32)
    targets = np.asarray([[1, 2], [1, 2]], dtype=np.int32)
    return [acts, targets, durations]
class TestCTC(unittest.TestCase):
    """
    Test Baidu CTC wrapper implementation.

    Expected values for costs and gradients are obtained through an external
    C implementation, that uses the library directly.
    """
    def setUp(self):
        # The Op requires the optional warp-ctc shared library; skip every
        # test in this class when it is not available.
        if not ctc_available():
            self.skipTest('Optional library warp-ctc not available')

    def run_ctc(self, activations, labels, input_length, expected_costs, expected_grads):
        """Compile and run the CTC graph, then compare costs/gradients
        against the expected reference values."""
        # Create symbolic variables
        t_activations = theano.shared(activations, name="activations")
        t_activation_times = theano.shared(input_length, name="activation_times")
        t_labels = theano.shared(labels, name="labels")

        t_cost = ctc(t_activations, t_labels, t_activation_times)
        # Symbolic gradient of CTC cost
        t_grad = T.grad(T.mean(t_cost), t_activations)
        # Compile symbolic functions
        train = theano.function([], [t_cost, t_grad])

        cost, grad = train()

        # T.mean introduces a 1/batch_size factor, so scale the reference
        # gradients by the number of examples before comparing.
        utt.assert_allclose(expected_grads / cost.shape[0], grad)
        utt.assert_allclose(expected_costs, cost)

        self.check_grads_disabled(t_activations, t_labels, t_activation_times)

    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        ctc_cost = ctc(activations, labels, input_length)
        ctc_function = theano.function([], [ctc_cost])
        # With no gradient requested, canonicalization should have swapped
        # in a compute_grad=False instance of the Op.
        for node in ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                assert (node.op.compute_grad is False)

    def test_torch_case(self):
        # Fixture taken from the warp-ctc Torch tutorial.
        activations, labels, input_length, expected_costs, expected_grads = setup_torch_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_ctc(self):
        activations, labels, input_length, expected_costs, expected_grads = setup_ctc_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_verify_grad(self):
        # utt.verify_grad expects a callable of only the differentiable
        # inputs, so close over the labels and sequence lengths.
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = theano.shared(in_lengths, name="activation_times")
                t_labels = theano.shared(labels, name="labels")
                return ctc(acts, t_labels, t_activation_times)
            return wrapper

        activations, labels, activation_times = setup_grad_case()

        ctc_op = ctc_op_functor(labels, activation_times)

        utt.verify_grad(ctc_op, [activations])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论