提交 dae1f236 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5949 from joaovictortr/warp_ctc_wrapper

Baidu CTC wrapper
...@@ -759,6 +759,14 @@ import theano and print the config variable, as in: ...@@ -759,6 +759,14 @@ import theano and print the config variable, as in:
Location of the magma library. Location of the magma library.
.. attribute:: config.ctc.root
Default: ``''``
Location of the warp-ctc folder. The folder should contain either a build,
lib or lib64 subfolder with the shared library (libwarpctc.so), and another
subfolder called include, with the CTC library header.
.. attribute:: config.gcc.cxxflags .. attribute:: config.gcc.cxxflags
Default: ``""`` Default: ``""``
......
.. _libdoc_gpuarray_ctc:
================================================================================
:mod:`theano.gpuarray.ctc` -- Connectionist Temporal Classification (CTC) loss
================================================================================
.. note::
Usage of the connectionist temporal classification (CTC) loss Op requires that
the `warp-ctc <https://github.com/baidu-research/warp-ctc>`_ library is
available. In case the warp-ctc library is not in your compiler's library path,
the ``config.ctc.root`` configuration option must be appropriately set to the
directory containing the warp-ctc library files.
.. note::
Unfortunately, Windows platforms are not yet supported by the underlying
library.
.. module:: theano.gpuarray.ctc
:platform: Unix
:synopsis: Connectionist temporal classification (CTC) loss Op, using the warp-ctc library
.. moduleauthor:: `João Victor Risso <https://github.com/joaovictortr>`_
.. autofunction:: theano.gpuarray.ctc.gpu_ctc
.. autoclass:: theano.gpuarray.ctc.GpuConnectionistTemporalClassification
...@@ -18,3 +18,4 @@ ...@@ -18,3 +18,4 @@
fft fft
type type
extra extra
ctc
.. _libdoc_tensor_nnet_ctc:
==================================================================================
:mod:`theano.tensor.nnet.ctc` -- Connectionist Temporal Classification (CTC) loss
==================================================================================
.. note::
Usage of the connectionist temporal classification (CTC) loss Op requires that
the `warp-ctc <https://github.com/baidu-research/warp-ctc>`_ library is
available. In case the warp-ctc library is not in your compiler's library path,
the ``config.ctc.root`` configuration option must be appropriately set to the
directory containing the warp-ctc library files.
.. note::
Unfortunately, Windows platforms are not yet supported by the underlying
library.
.. module:: theano.tensor.nnet.ctc
:platform: Unix
:synopsis: Connectionist temporal classification (CTC) loss Op, using the warp-ctc library
.. moduleauthor:: `João Victor Risso <https://github.com/joaovictortr>`_
.. autofunction:: theano.tensor.nnet.ctc.ctc
.. autoclass:: theano.tensor.nnet.ctc.ConnectionistTemporalClassification
...@@ -21,3 +21,4 @@ and ops which are particular to neural networks and deep learning. ...@@ -21,3 +21,4 @@ and ops which are particular to neural networks and deep learning.
neighbours neighbours
bn bn
blocksparse blocksparse
ctc
...@@ -1839,6 +1839,13 @@ AddConfigVar( ...@@ -1839,6 +1839,13 @@ AddConfigVar(
allow_override=False), allow_override=False),
in_c_key=False) in_c_key=False)
AddConfigVar(
'ctc.root',
'Directory which contains the root of Baidu CTC library. It is assumed \
that the compiled library is either inside the build, lib or lib64 \
subdirectory, and the header inside the include directory.',
StrParam('', allow_override=False),
in_c_key=False)
# Check if there are remaining flags provided by the user through THEANO_FLAGS. # Check if there are remaining flags provided by the user through THEANO_FLAGS.
for key in THEANO_FLAGS_DICT.keys(): for key in THEANO_FLAGS_DICT.keys():
......
...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, ...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor, GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined) reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg, ctc
def transfer(x, target): def transfer(x, target):
......
#section init_code

setup_ext_cuda();

#section support_code

/* Per-call bundle of state needed by the warp-ctc kernels. */
typedef struct ctc_context {
    struct ctcOptions options;   /* warp-ctc configuration (backend + CUDA stream) */
    gpudata * workspace;         /* device scratch buffer, sized via get_workspace_size() */
    int * input_lengths;         /* host array: time steps per minibatch example */
    int * flat_labels;           /* host array: all label sequences concatenated */
    int * label_lengths;         /* host array: label count per minibatch example */
} ctc_context_t;
/* Initialize *context for a GPU run: zero the warp-ctc options, select the
 * GPU backend and bind warp-ctc to the CUDA stream of `gpu_context`.
 * All buffer pointers start out NULL. */
void ctc_context_init(ctc_context_t * context, PyGpuContextObject * gpu_context)
{
    memset(&(context->options), 0, sizeof(struct ctcOptions));
    context->options.loc = CTC_GPU;

    // Get CUDA function pointer to obtain stream
    CUstream (*getstream_func_ptr)(void *) = (CUstream (*)(void *)) gpuarray_get_extension( "cuda_get_stream" );
    context->options.stream = getstream_func_ptr(gpu_context->ctx);

    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
/* Release every resource owned by the context.  The host arrays may be
 * NULL (free(NULL) is a no-op).
 * NOTE(review): this assumes gpudata_release() tolerates a NULL handle,
 * since the context can be destroyed before the workspace is allocated
 * — confirm against libgpuarray. */
void ctc_context_destroy(ctc_context_t * ctx)
{
    free( ctx->label_lengths );
    free( ctx->flat_labels );
    free( ctx->input_lengths );

    gpudata_release( ctx->workspace );
}
/* Translate a warp-ctc status code into a Python RuntimeError.
 * Returns 0 on success; returns 1 and sets the exception on failure. */
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if ( retcode == CTC_STATUS_SUCCESS )
        return 0;

    // Ask the library for a human-readable description of the failure
    const char * ctc_msg = ctcGetStatusString( retcode );
    PyErr_Format( PyExc_RuntimeError,
                  "GpuConnectionistTemporalClassification: %s CTC error: %s",
                  msg,
                  ctc_msg );
    return 1;
}
/* Copy the NumPy vector of input lengths into a freshly malloc'ed plain
 * int array.  On allocation failure *input_lengths is left NULL; the
 * caller is responsible for detecting that. */
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
                                      int ** input_lengths )
{
    const npy_int count = PyArray_DIMS( input_lengths_arr )[0];

    *input_lengths = (int *) malloc( count * sizeof(int) );
    if ( (*input_lengths) == NULL )
        return;

    for ( npy_int idx = 0; idx < count; ++idx )
    {
        (*input_lengths)[idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, idx ) );
    }
}
/* Flatten the padded label matrix into the layout warp-ctc expects:
 * *flat_labels receives all non-padding labels concatenated row by row,
 * *label_lengths the number of labels in each row.  On allocation
 * failure both output pointers are left NULL (all-or-nothing). */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
                         int ** label_lengths )
{
    const npy_int n_rows = PyArray_DIMS( label_matrix )[0];
    const npy_int n_cols = PyArray_DIMS( label_matrix )[1];

    /* Worst case: every entry of the matrix is a real label. */
    *flat_labels = (int *) calloc( n_rows * n_cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;

    *label_lengths = (int *) calloc( n_rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        /* Roll back the first allocation so the caller sees a
           consistent failure state. */
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }

    npy_int write_pos = 0;
    for ( npy_int r = 0; r < n_rows; ++r )
    {
        npy_int count = 0;
        for ( npy_int c = 0; c < n_cols; ++c )
        {
            const npy_int value = *( (npy_int *) PyArray_GETPTR2( label_matrix, r, c ) );
            /* Negative entries are padding and are skipped. */
            if ( value >= 0 )
            {
                (*flat_labels)[ write_pos++ ] = value;
                ++count;
            }
        }
        (*label_lengths)[ r ] = count;
    }
}
#section support_code_apply
/* Compute the CTC loss (and optionally its gradient) on the GPU.
 *
 * in_activations   : float32 GPU array, C-order; indexed [time][minibatch][alphabet].
 * in_labels        : host int32 matrix of padded label sequences.
 * in_input_lengths : host int32 vector of per-example time lengths.
 * out_costs        : output, per-example losses (length = minibatch size).
 * out_gradients    : output gradient wrt activations; NULL disables
 *                    gradient computation.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
                                 PyArrayObject * in_labels,
                                 PyArrayObject * in_input_lengths,
                                 PyGpuArrayObject ** out_costs,
                                 PyGpuArrayObject ** out_gradients,
                                 PyGpuContextObject * gpu_context)
{
    // Stack-allocated context; destroyed on every exit path below.
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;

    size_t gpu_workspace_size;
    int ctc_error = 0;

    const size_t num_activations = PyGpuArray_DIMS( in_activations )[0];
    const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1];
    const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2];
    const size_t cost_size = minibatch_size;

    // Gradient has the same shape as the activations.
    const size_t grad_dims[3] = { num_activations, minibatch_size, alphabet_size };

    float * costs = NULL,
          * activations = NULL,
          * gradients = NULL;

    cuda_enter( gpu_context->ctx );

    ctc_context_init( context, gpu_context );

    // Only float32 activations are supported.
    switch (in_activations->ga.typecode)
    {
    case GA_FLOAT:
        activations = (float *) PyGpuArray_DEV_DATA( in_activations );
        break;
    default:
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_SetString( PyExc_TypeError,
            "GpuConnectionistTemporalClassification: Unsupported type for activations." );

        return 1;
    }

    // Host-side copy of the input lengths, in the layout warp-ctc expects.
    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );

    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for input lengths." );
        return 1;
    }

    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );

    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for labels and their lengths." );
        return 1;
    }

    // Allocate (or reuse) the cost output on the GPU and zero it.
    if ( theano_prep_output( out_costs, 1, &cost_size, in_activations->ga.typecode,
                             GA_C_ORDER, gpu_context ) != 0 )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    GpuArray_memset( &((*out_costs)->ga), 0 );

    costs = (float *) PyGpuArray_DEV_DATA( *out_costs );

    if ( NULL != out_gradients )  // if gradient computation is not disabled
    {
        if ( theano_prep_output( out_gradients, 3, grad_dims, in_activations->ga.typecode,
                                 GA_C_ORDER, gpu_context ) != 0 )
        {
            ctc_context_destroy( context );
            cuda_exit( gpu_context->ctx );

            return 1;
        }

        GpuArray_memset( &((*out_gradients)->ga), 0 );

        gradients = (float *) PyGpuArray_DEV_DATA( *out_gradients );
    }

    // Ask warp-ctc how much device scratch space this problem needs.
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &gpu_workspace_size ),
        "Failed to obtain CTC workspace size." );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    context->workspace = gpudata_alloc( gpu_context->ctx, gpu_workspace_size, NULL, 0, NULL );

    if ( NULL == context->workspace )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }

    // Synchronize with other pending GPU work before launching the kernels.
    cuda_wait( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_wait( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    // NOTE(review): *(void **)context->workspace assumes the raw device
    // pointer is the first member of the opaque gpudata struct — confirm
    // against the libgpuarray version in use.
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, *(void **)context->workspace,
        context->options ), "Failed to compute CTC loss function." );

    // Record the accesses so later operations order correctly after us.
    cuda_record( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_record( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    ctc_context_destroy( context );
    cuda_exit( gpu_context->ctx );

    return 0;
}
from __future__ import absolute_import, print_function, division
import theano
from theano import (config, gof)
import theano.tensor as T
from .basic_ops import (gpu_contiguous, as_gpuarray_variable, infer_context_name)
import theano.tensor.nnet.ctc
from .type import (GpuArrayType, gpu_context_type)
from .elemwise import GpuDimShuffle
from theano.gradient import grad_undefined
from theano.gof import local_optimizer
from theano.tensor.opt import register_canonicalize
from theano.tensor.nnet.ctc import ctc_available
import os
import os.path
from . import pygpu
class GpuConnectionistTemporalClassification(gof.COp):
    """
    GPU wrapper for Baidu CTC loss function.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """
    __props__ = ('compute_grad',)

    # COp bookkeeping: 3 inputs (activations, labels, input lengths) and
    # up to 2 outputs (costs and, optionally, gradients).
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # C implementation loaded by gof.COp.__init__.
    func_file = "./c_code/ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

    # The Op params passed to the C code is the GPU context (see get_params).
    params_type = gpu_context_type

    def __init__(self, compute_grad=True):
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'GpuConnectionistTemporalClassification Op '
                               'can not be constructed.')

        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

        gof.COp.__init__(self, self.func_file, self.func_name)

    def c_lib_dirs(self):
        # Linker search path: directory holding libwarpctc.so, when known.
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_libraries(self):
        return ["warpctc", "gpuarray"]

    def c_header_dirs(self):
        # Include paths: this module's directory (helper headers), pygpu's
        # headers and, when configured, warp-ctc's include directory.
        dirs = [os.path.dirname(__file__), pygpu.get_include()]
        if config.ctc.root != '':
            dirs.append(os.path.join(config.ctc.root, "include"))
        return dirs

    def c_headers(self):
        return ['ctc.h', 'numpy_compat.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'gpuarray/types.h', 'gpuarray_api.h',
                'gpuarray/array.h', 'gpuarray/util.h', 'gpuarray/extension.h']

    def get_params(self, node):
        # Hand the GPU context of the first input to the C implementation.
        return node.inputs[0].type.context

    def make_node(self, activations, labels, input_lengths):
        """Validate dtypes/ranks of the inputs and build the Apply node."""
        context_name = infer_context_name(activations)

        t_activations = as_gpuarray_variable(activations,
                                             context_name=context_name)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)

        # Labels and input lengths are always on the CPU
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type.')
        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')
        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type.')
        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')
        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type.')
        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        costs = GpuArrayType(dtype='float32',
                             broadcastable=(False,),
                             context_name=context_name)()
        outputs = [costs]

        if self.compute_grad:
            gradients = GpuArrayType(dtype='float32',
                                     broadcastable=(False, False, False,),
                                     context_name=context_name)()
            outputs += [gradients]

        return theano.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                            outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        """Chain-rule composition of the CTC gradient computed by the Op."""
        # Gradients computed by Op
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None

        # Gradients of original function, to compose chain rule
        grad_op = output_grads[0]
        # NOTE(review): shuffle (time, batch, alphabet) -> (batch, time,
        # alphabet) for batched_dot against the incoming cost gradient,
        # then shuffle back — confirm shapes against batched_dot semantics.
        grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                     new_order=(1, 0, 2))(gradients)
        grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
        grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                             new_order=(1, 0, 2))(grad_bdot)
        return [grad_shuffle_reverse,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]
def gpu_ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function on the GPU.

    Parameters
    ----------
    activations
        Three-dimensional tensor of shape (t, m, p), where t is the time
        index, m the minibatch index and p indexes the probabilities of
        each symbol in the alphabet.  C-order memory layout is assumed,
        so p is the fastest changing dimension.
    labels
        Two-dimensional tensor holding one sequence of target labels per
        minibatch row.  Negative values are treated as padding and
        ignored.  The blank symbol is assumed to have index 0 in the
        alphabet.
    input_lengths
        One-dimensional tensor with the number of time steps of each
        sequence in the minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_cost_op = GpuConnectionistTemporalClassification()
    return ctc_cost_op(activations, labels, input_lengths)
# Disable gradient computation if not needed
@register_canonicalize
@local_optimizer([GpuConnectionistTemporalClassification])
def local_gpu_ctc_no_grad(node):
    """Swap in a gradient-free CTC Op when the gradient output is unused."""
    if not isinstance(node.op, GpuConnectionistTemporalClassification):
        return False
    if len(node.outputs) > 1 and len(node.outputs[1].clients) == 0:
        # gradient is not used
        no_grad_op = GpuConnectionistTemporalClassification(compute_grad=False)
        return [no_grad_op(*node.inputs), None]
    return False
...@@ -34,6 +34,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv, ...@@ -34,6 +34,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs) AbstractConv3d_gradInputs)
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
import theano.tensor.nlinalg as nlinalg import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
...@@ -80,6 +81,7 @@ from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, ...@@ -80,6 +81,7 @@ from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh, GpuMagmaCholesky, gpu_qr, GpuMagmaEigh,
GpuCublasTriangularSolve, cublas_available) GpuCublasTriangularSolve, cublas_available)
from .neighbours import GpuImages2Neibs from .neighbours import GpuImages2Neibs
from .ctc import GpuConnectionistTemporalClassification
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -162,6 +164,7 @@ def register_inplace(*tags, **kwargs): ...@@ -162,6 +164,7 @@ def register_inplace(*tags, **kwargs):
return local_opt return local_opt
return f return f
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name='gpua_constant_folding')( register_opt(final_opt=True, name='gpua_constant_folding')(
tensor.opt.constant_folding) tensor.opt.constant_folding)
...@@ -582,6 +585,7 @@ def local_cut_gpu_transfers(node): ...@@ -582,6 +585,7 @@ def local_cut_gpu_transfers(node):
else: else:
return [node.op(n2.inputs[0])] return [node.op(n2.inputs[0])]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers, gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers', gpu_cut_copies.register('cut_gpua_constant_transfers',
...@@ -652,6 +656,8 @@ def local_gpua_alloc_empty_to_zeros(node): ...@@ -652,6 +656,8 @@ def local_gpua_alloc_empty_to_zeros(node):
z = np.asarray(0, dtype=node.outputs[0].dtype) z = np.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name), return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)] *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros', optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros), theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace. # After move to gpu and merge2, before inplace.
...@@ -1540,6 +1546,8 @@ def local_conv_gpu_conv(node): ...@@ -1540,6 +1546,8 @@ def local_conv_gpu_conv(node):
return [tensor.as_tensor_variable(out)] return [tensor.as_tensor_variable(out)]
else: else:
return [out] return [out]
register_opt()(local_conv_gpu_conv) register_opt()(local_conv_gpu_conv)
...@@ -1812,6 +1820,8 @@ def local_gpu_pool(op, ctx_name, inputs, outputs): ...@@ -1812,6 +1820,8 @@ def local_gpu_pool(op, ctx_name, inputs, outputs):
inp_padded = pad_dims(inp, 2, nd) inp_padded = pad_dims(inp, 2, nd)
ret_padded = op(inp_padded, ws, stride, pad) ret_padded = op(inp_padded, ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
pool_db = LocalGroupDB() pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup) pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2" pool_db2.__name__ = "pool_db2"
...@@ -1849,6 +1859,8 @@ def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs): ...@@ -1849,6 +1859,8 @@ def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs):
ret_padded = op(inp_padded, out_padded, out_grad_padded, ret_padded = op(inp_padded, out_padded, out_grad_padded,
ws, stride, pad) ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad) lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad)
pool_db.register("local_gpu_max_pool_grad", lifter, pool_db.register("local_gpu_max_pool_grad", lifter,
'gpuarray', 'fast_compile', 'fast_run', 'gpuarray', 'fast_compile', 'fast_run',
...@@ -1879,6 +1891,8 @@ def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs): ...@@ -1879,6 +1891,8 @@ def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs):
ret_padded = op(inp_padded, out_grad_padded, ret_padded = op(inp_padded, out_grad_padded,
ws, stride, pad) ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad) lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad)
pool_db.register("local_gpu_average_pool_grad", lifter, pool_db.register("local_gpu_average_pool_grad", lifter,
'gpuarray', 'fast_compile', 'fast_run', 'gpuarray', 'fast_compile', 'fast_run',
...@@ -1976,6 +1990,7 @@ def local_assert_no_cpu_op(node): ...@@ -1976,6 +1990,7 @@ def local_assert_no_cpu_op(node):
elif config.assert_no_cpu_op == "pdb": elif config.assert_no_cpu_op == "pdb":
pdb.set_trace() pdb.set_trace()
# Register the local_assert_no_cpu_op: # Register the local_assert_no_cpu_op:
assert_no_cpu_op = theano.tensor.opt.in2out(local_assert_no_cpu_op, assert_no_cpu_op = theano.tensor.opt.in2out(local_assert_no_cpu_op,
name='assert_no_cpu_op') name='assert_no_cpu_op')
...@@ -2308,6 +2323,15 @@ def local_gpu_magma_svd(op, context_name, inputs, outputs): ...@@ -2308,6 +2323,15 @@ def local_gpu_magma_svd(op, context_name, inputs, outputs):
out = [out.astype('float16')] out = [out.astype('float16')]
return out return out
@register_opt('ctc', 'fast_compile')
@op_lifter([theano.tensor.nnet.ctc.ConnectionistTemporalClassification])
@register_opt2([ConnectionistTemporalClassification], 'ctc', 'fast_compile')
def local_gpu_ctc(op, context_name, inputs, outputs):
op = GpuConnectionistTemporalClassification(compute_grad=op.compute_grad)
return op.make_node(*inputs).outputs
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import (division, absolute_import, print_function)
import unittest
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
import theano.gpuarray
from theano.gpuarray.ctc import (gpu_ctc, GpuConnectionistTemporalClassification)
from theano.tensor.nnet.ctc import (ctc, ctc_available, ConnectionistTemporalClassification)
from theano.tensor.nnet.tests.test_ctc import (setup_torch_case, setup_ctc_case, setup_grad_case)
from .config import (mode_with_gpu, mode_without_gpu)
class TestCTC(unittest.TestCase):
    """
    Tests for the GPU CTC Op: expected values, CPU/GPU agreement, the
    gradient-disabling optimization and lifting of the CPU Op to the GPU.
    """

    def setUp(self):
        # Every test requires the optional warp-ctc library.
        if not ctc_available():
            self.skipTest('Optional library warp-ctc not available')

    def check_ctc(self, activations, labels, input_length, expected_costs, expected_grads):
        """Run the full battery of checks for one test case."""
        # Create symbolic variables
        t_activations = theano.shared(activations, name="activations")
        t_activation_times = theano.shared(input_length, name="activation_times")
        t_labels = theano.shared(labels, name="labels")

        inputs = [t_activations, t_labels, t_activation_times]

        # Execute several tests for each test case
        self.check_expected_values(t_activations, t_labels, t_activation_times, expected_costs, expected_grads)
        self.compare_gpu_and_cpu_values(*inputs)
        self.check_grads_disabled(*inputs)
        self.run_gpu_optimization_with_grad(*inputs)
        self.run_gpu_optimization_no_grad(*inputs)

    def setup_cpu_op(self, activations, labels, input_length, compute_grad=True, mode=mode_without_gpu):
        # Compile a function returning the CPU CTC cost and, optionally,
        # its gradient wrt the activations.
        cpu_ctc_cost = ctc(activations, labels, input_length)
        outputs = [cpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            cpu_ctc_grad = T.grad(T.mean(cpu_ctc_cost), activations)
            outputs += [cpu_ctc_grad]
        return theano.function([], outputs, mode=mode)

    def setup_gpu_op(self, activations, labels, input_length, compute_grad=True):
        # GPU counterpart of setup_cpu_op.
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        outputs = [gpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            gpu_ctc_grad = T.grad(T.mean(gpu_ctc_cost), activations)
            outputs += [gpu_ctc_grad]
        return theano.function([], outputs)

    def check_expected_values(self, activations, labels, input_length, expected_costs, expected_grads):
        """Compare GPU results against reference costs/gradients."""
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values.
        # The gradient is scaled by 1/minibatch because the compiled
        # graph differentiates the *mean* of the costs.
        utt.assert_allclose(expected_grads / cost_from_gpu.shape[0], grad_from_gpu)
        utt.assert_allclose(expected_costs, cost_from_gpu)

    def compare_gpu_and_cpu_values(self, activations, labels, input_length):
        """Check that the CPU and GPU Ops agree on costs and gradients."""
        cpu_train = self.setup_cpu_op(activations, labels, input_length)
        cpu_cost, cpu_grad = cpu_train()

        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values
        utt.assert_allclose(cpu_grad, grad_from_gpu)
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        gpu_ctc_function = theano.function([], [gpu_ctc_cost])
        # After canonicalization the surviving node must have been
        # rebuilt with compute_grad=False.
        for node in gpu_ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                assert (node.op.compute_grad is False)

    def run_gpu_optimization_with_grad(self, activations, labels, input_length):
        # Compile CPU function with optimization
        cpu_lifted_train = self.setup_cpu_op(activations, labels, input_length, mode=mode_with_gpu)
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_train)

    def run_gpu_optimization_no_grad(self, activations, labels, input_length):
        cpu_train = self.setup_cpu_op(activations, labels, input_length, compute_grad=False)
        cpu_cost = cpu_train()
        # Compile CPU function with optimization
        cpu_lifted_test = self.setup_cpu_op(activations, labels, input_length, compute_grad=False, mode=mode_with_gpu)
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_test)
        gpu_cost = cpu_lifted_test()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Compare values from CPU and GPU Ops
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def has_only_gpu_op(self, function):
        # True when the compiled graph contains the GPU Op and no CPU Op.
        has_cpu_instance = False
        has_gpu_instance = False
        for node in function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                has_cpu_instance = True

            if isinstance(node.op, GpuConnectionistTemporalClassification):
                has_gpu_instance = True
        return has_gpu_instance and (not has_cpu_instance)

    # Test obtained from Torch tutorial at:
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    def test_torch_case(self):
        activations, labels, activation_times, expected_costs, expected_grads = setup_torch_case()
        self.check_ctc(activations, labels, activation_times, expected_costs, expected_grads)

    def test_ctc(self):
        activations, labels, input_length, expected_costs, expected_grads = setup_ctc_case()
        self.check_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_verify_grad(self):
        # Numeric gradient check via utt.verify_grad; labels and lengths
        # are closed over since only activations are differentiated.
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = theano.shared(in_lengths, name="activation_times")
                t_labels = theano.shared(labels, name="labels")
                return gpu_ctc(acts, t_labels, t_activation_times)
            return wrapper

        activations, labels, activation_times = setup_grad_case()

        ctc_op = ctc_op_functor(labels, activation_times)

        utt.verify_grad(ctc_op, [activations])
from __future__ import (division, absolute_import, print_function)
import os
import os.path
import theano.tensor as T
from theano import config
from theano import gof
from theano.gof import local_optimizer
from theano.gof.cmodule import GCC_compiler
from theano.tensor.opt import register_canonicalize
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import grad_undefined
def _ctc_find_lib():
    """
    Find the directory that contains libwarpctc.so.

    Searches the ``build``, ``lib`` and ``lib64`` subdirectories of
    ``config.ctc.root``, in that order.

    Returns
    -------
    str or None
        First subdirectory that contains ``libwarpctc.so``, or None when
        ``config.ctc.root`` is unset or the library cannot be found.
    """
    if config.ctc.root != '':
        for lib_dir in ["build", "lib", "lib64"]:
            lib_path = os.path.join(config.ctc.root, lib_dir)
            # os.path.isdir already implies existence, so the original
            # extra os.path.exists(lib_path) check was redundant.
            if os.path.isdir(lib_path) and \
                    os.path.exists(os.path.join(lib_path, "libwarpctc.so")):
                return lib_path
    return None
def _ctc_check_compile(ctc_lib_path):
    """
    Check that a minimal program compiles (and links) against warp-ctc.

    Parameters
    ----------
    ctc_lib_path : str or None
        Directory holding ``libwarpctc.so``, or None when the library is
        expected on the compiler's default search path.

    Returns
    -------
    (bool, str or None)
        ``(True, None)`` on success, otherwise ``(False, message)``.
    """
    preambule = """
#include <string.h>
#include "ctc.h"
"""

    body = """
ctcOptions options;
memset(&options, 0, sizeof(ctcOptions));
options.loc = CTC_CPU;
options.num_threads = 1;
"""

    # Headers from this directory plus, when configured, warp-ctc's own
    # include and library directories.
    params = ['-I%s' % (os.path.dirname(__file__))]
    if ctc_lib_path is not None:
        params.extend(["-I%s" % (os.path.join(config.ctc.root, "include"))])
        params.extend(["-L%s" % (ctc_lib_path)])
    params.extend(["-l", "warpctc"])
    compiler_res = GCC_compiler.try_flags(
        params, preambule=preambule, body=body,
        try_run=False, output=True)

    # try_flags may return a bare bool or an (avail, out, err) triple.
    avail, out, err = compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None)

    if not avail:
        return False, ("cannot compile with warp-ctc. "
                       "We got this error:\n" + str(err))
    return True, None
def ctc_present():
    """
    Detect (once) whether warp-ctc can be located and compiled against.

    The result is memoized on the function object: ``ctc_present.avail``
    caches the boolean, ``ctc_present.path`` the library directory and
    ``ctc_present.msg`` the failure reason, if any.
    """
    if ctc_present.avail is None:
        ctc_present.path = _ctc_find_lib()
        ctc_present.avail, ctc_present.msg = _ctc_check_compile(ctc_present.path)
    return ctc_present.avail

ctc_present.avail = None
ctc_present.msg = None
ctc_present.path = None
def ctc_available():
    """
    Check whether the warp-ctc loss Op can be used on this platform.

    On failure ``ctc_available.msg`` is set to a string explaining why;
    on success ``ctc_available.path`` is set to the directory holding
    ``libwarpctc.so`` (or None when it is on the default search path).

    Returns
    -------
    bool
        True when the underlying warp-ctc library is usable.
    """
    if os.name == 'nt':
        # Bug fix: the original statement ended with a stray comma, which
        # made `msg` a 1-tuple and left the second string literal as a
        # dead expression.  Use implicit string concatenation instead.
        ctc_available.msg = ('Windows platforms are currently not supported '
                            'by underlying CTC library (warp-ctc).')
        return False
    elif not ctc_present():
        ctc_available.msg = ctc_present.msg
        return False

    ctc_available.path = ctc_present.path
    return True

ctc_available.msg = None
ctc_available.path = None
class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
    """
    CTC loss function wrapper.

    Notes
    -----
    Using the wrapper requires that Baidu's warp-ctc library is installed.
    If the warp-ctc library is not on your compiler's default library path,
    you must set the configuration variable ``config.ctc.root`` appropriately.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """
    __props__ = ('compute_grad',)

    # Number of inputs/outputs handled by the external C implementation.
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # External C implementation, resolved relative to this module.
    func_file = "./ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_cpu)"

    def __init__(self, compute_grad=True, openmp=None):
        # Fail fast at Op construction time when warp-ctc is unusable.
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'ConnectionistTemporalClassification Op '
                               'can not be constructed.')
        gof.COp.__init__(self, self.func_file, self.func_name)
        gof.OpenMPOp.__init__(self, openmp=openmp)
        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

    def c_lib_dirs(self):
        # Directory containing libwarpctc.so, discovered by ctc_available().
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_libraries(self):
        # Link against warp-ctc (-lwarpctc).
        return ["warpctc"]

    def c_header_dirs(self):
        header_dirs = []
        if config.ctc.root != '':
            # We assume here that the header is available at the include directory
            # of the CTC root directory.
            header_dirs += [os.path.join(config.ctc.root, "include")]
        return header_dirs

    def c_headers(self):
        return ["ctc.h"] + gof.OpenMPOp.c_headers(self)

    def make_node(self, activations, labels, input_lengths):
        """
        Build the Apply node; validates dtypes/ranks required by warp-ctc:
        float32 activations of rank 3, int32 labels of rank 2, int32
        input_lengths of rank 1.
        """
        t_activations = T.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)
        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type!')
        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')
        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type!')
        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')
        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type!')
        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')
        costs = T.fvector(name="ctc_cost")
        outputs = [costs]
        # The gradient output only exists when compute_grad is enabled;
        # the C code then fills it with d(cost)/d(activations).
        if self.compute_grad:
            gradients = T.ftensor3(name="ctc_grad")
            outputs += [gradients]
        return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                         outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        # The Op itself produced the gradient w.r.t. the activations as its
        # second output; chain it with the incoming cost gradient.
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None
        grad_op = output_grads[0]
        # batched_dot works on the batch-first layout, hence the dimshuffles
        # moving the minibatch axis to the front and back again.
        total_grad = T.basic.batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(1, 0, 2)
        # Labels and input lengths are integer inputs: gradient undefined.
        return [total_grad,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]
def ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function.

    Notes
    -----
    Requires Baidu's warp-ctc library to be installed. When the library is
    not on the compiler's default library path, set the configuration
    variable ``config.ctc.root`` to the warp-ctc installation directory.

    Parameters
    ----------
    activations
        Three-dimensional tensor of shape (t, m, p): t is the time index,
        m the minibatch index, and p indexes the per-symbol probabilities
        of the alphabet. Memory layout is assumed C-ordered (slowest to
        fastest changing dimension from left to right, so p changes
        fastest).
    labels
        A 2-D tensor holding one target label sequence per row. Negative
        values are treated as padding and ignored. The blank symbol is
        assumed to have index 0 in the alphabet.
    input_lengths
        A 1-D tensor giving the number of time steps of each minibatch
        sequence.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_op = ConnectionistTemporalClassification()
    return ctc_op(activations, labels, input_lengths)
# Graph optimization: drop gradient computation when the gradient output
# of a CTC node is never consumed.
@register_canonicalize
@local_optimizer([ConnectionistTemporalClassification])
def local_ctc_no_grad(node):
    """Rebuild a CTC node with compute_grad=False when its gradient output has no clients."""
    if not isinstance(node.op, ConnectionistTemporalClassification):
        return False
    if len(node.outputs) <= 1:
        return False
    if len(node.outputs[1].clients) != 0:
        # The gradient output is used somewhere; keep the node as-is.
        return False
    no_grad_op = ConnectionistTemporalClassification(compute_grad=False)
    # The replacement has a single output, so the gradient slot maps to None.
    return [no_grad_op(*node.inputs), None]
#section support_code
/* Per-call state bundling warp-ctc options with the heap buffers the
 * wrapper allocates; released as a unit by ctc_context_destroy(). */
typedef struct ctc_context {
    struct ctcOptions options;  /* warp-ctc configuration (backend, thread count) */
    void * workspace;           /* scratch buffer sized by get_workspace_size() */
    int * input_lengths;        /* contiguous copy of per-sequence time steps */
    int * flat_labels;          /* labels flattened row-wise, padding removed */
    int * label_lengths;        /* number of (non-padding) labels per sequence */
} ctc_context_t;
/* Initialize a CTC context: zero the warp-ctc options, select the CPU
 * backend and thread count, and NULL all lazily-allocated buffers so a
 * later ctc_context_destroy() is safe at any point. */
void ctc_context_init(ctc_context_t * context)
{
    struct ctcOptions * options = &(context->options);
    memset(options, 0, sizeof(struct ctcOptions));
    options->loc = CTC_CPU;
#if defined(_OPENMP)
    /* Use as many threads as the current OpenMP team provides. */
    options->num_threads = omp_get_num_threads();
#else
    options->num_threads = 1;
#endif
    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
/* Release every heap buffer owned by the context. free(NULL) is a no-op,
 * so this is safe on partially-initialized contexts. */
void ctc_context_destroy(ctc_context_t * context)
{
    free( context->workspace );
    free( context->input_lengths );
    free( context->flat_labels );
    free( context->label_lengths );
}
/* Translate a warp-ctc status code into a Python RuntimeError.
 * Returns 1 (with the Python exception set) on failure, 0 on success,
 * so callers can do `if (ctc_check_result(...)) { cleanup; return 1; }`. */
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if( CTC_STATUS_SUCCESS != retcode )
    {
        // Get error message from underlying library
        const char * ctc_msg = ctcGetStatusString( retcode );
        PyErr_Format( PyExc_RuntimeError,
            "ConnectionistTemporalClassification: %s CTC error: %s",
            msg,
            ctc_msg );
        return 1;
    }
    return 0;
}
/* Copy the per-sequence lengths from a NumPy int array into a freshly
 * calloc'ed contiguous C int buffer, as required by warp-ctc.
 * On allocation failure *input_lengths is left NULL; the caller must
 * check for that and owns the buffer (freed via ctc_context_destroy). */
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
    int ** input_lengths )
{
    npy_int num_elements = PyArray_DIMS( input_lengths_arr )[0];
    *input_lengths = (int *) calloc( num_elements, sizeof(int) );
    if ( NULL == (*input_lengths) )
        return;
    for( npy_int elem_idx = 0; elem_idx < num_elements; ++elem_idx )
    {
        (*input_lengths)[elem_idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, elem_idx ) );
    }
}
/* Flatten a 2-D label matrix into the memory layout warp-ctc expects:
 * *flat_labels receives all non-negative labels row by row (negative
 * entries are padding and skipped), and *label_lengths receives the
 * number of kept labels per row.
 * On allocation failure both outputs are left NULL (the partially
 * allocated buffer is freed here); the caller must check for NULL. */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
    int ** label_lengths )
{
    npy_int rows = PyArray_DIMS( label_matrix )[0];
    npy_int cols = PyArray_DIMS( label_matrix )[1];
    /* rows * cols is an upper bound; padding entries leave unused slots. */
    *flat_labels = (int *) calloc( rows * cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;
    *label_lengths = (int *) calloc( rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }
    npy_int label_index = 0;
    for( npy_int row_idx = 0; row_idx < rows; ++row_idx )
    {
        npy_int label_length = 0;
        for( npy_int col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 ) // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
/* COp entry point: compute the CTC cost (and, when out_gradients is
 * non-NULL, the gradient w.r.t. the activations) on CPU via warp-ctc.
 *
 *   in_activations    float32, C-contiguous, shape (time, batch, alphabet)
 *   in_labels         int32 matrix, one label row per sequence; negative
 *                     entries are padding
 *   in_input_lengths  int32 vector of per-sequence time steps
 *   out_costs         float32 vector of per-example costs (reused or
 *                     (re)allocated here)
 *   out_gradients     float32 tensor with the activations' shape, or NULL
 *                     when the Op was built with compute_grad=False
 *
 * Returns 0 on success, 1 with a Python exception set on failure. */
int APPLY_SPECIFIC(ctc_cost_cpu)(PyArrayObject * in_activations,
                                 PyArrayObject * in_labels,
                                 PyArrayObject * in_input_lengths,
                                 PyArrayObject ** out_costs,
                                 PyArrayObject ** out_gradients)
{
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;
    ctc_context_init( context );

    /* Defensive check; the Python graph wraps activations in
     * cpu_contiguous, but warp-ctc strictly needs C order. */
    if ( !PyArray_IS_C_CONTIGUOUS( in_activations ) )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "ConnectionistTemporalClassification: activations array must be C-contiguous." );
        return 1;
    }

    npy_float32 * activations = (npy_float32 *) PyArray_DATA( in_activations );

    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );
    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for input lengths" );
        return 1;
    }

    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );
    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for labels and their lengths" );
        return 1;
    }

    npy_int minibatch_size = PyArray_DIMS( in_activations )[1];
    npy_int alphabet_size = PyArray_DIMS( in_activations )[2];

    /* Reuse the existing cost array when its shape matches, otherwise
     * allocate a fresh zeroed vector of one cost per example. */
    npy_float32 * costs = NULL;
    npy_intp cost_size = minibatch_size;
    if ( (*out_costs) == NULL ||                       // Symbolic variable has no memory backing
         PyArray_NDIM( *out_costs ) != 1 ||            // or, matrix has the wrong size
         PyArray_DIMS( *out_costs )[0] != cost_size )
    {
        Py_XDECREF( *out_costs );
        // Allocate new matrix
        *out_costs = (PyArrayObject *) PyArray_ZEROS( 1, &cost_size, NPY_FLOAT32, 0 );
        if ( NULL == (*out_costs) )
        {
            // Destroy previous CTC context before returning exception
            ctc_context_destroy( context );
            PyErr_Format( PyExc_MemoryError,
                "ConnectionistTemporalClassification: Could not allocate memory for CTC costs" );
            return 1;
        }
    }
    costs = (npy_float32 *) PyArray_DATA( *out_costs );

    /* Gradient buffer mirrors the activations' shape; only prepared when
     * gradient computation was not optimized away. */
    npy_float32 * gradients = NULL;
    if ( NULL != out_gradients )  // If gradient computation is not disabled
    {
        if ( NULL == (*out_gradients) ||  // Symbolic variable has no real backing
             PyArray_NDIM( *out_gradients ) != 3 ||
             PyArray_DIMS( *out_gradients )[0] != PyArray_DIMS( in_activations )[0] ||
             PyArray_DIMS( *out_gradients )[1] != PyArray_DIMS( in_activations )[1] ||
             PyArray_DIMS( *out_gradients )[2] != PyArray_DIMS( in_activations )[2] )
        {
            // Existing matrix is the wrong size. Make a new one.
            // Decrement ref counter to existing array
            Py_XDECREF( *out_gradients );
            // Allocate new array
            *out_gradients = (PyArrayObject *) PyArray_ZEROS(3, PyArray_DIMS( in_activations ),
                NPY_FLOAT32, 0);
            if ( NULL == (*out_gradients) )
            {
                // Destroy previous CTC context before returning exception
                ctc_context_destroy( context );
                PyErr_Format( PyExc_MemoryError,
                    "ConnectionistTemporalClassification: Could not allocate memory for CTC gradients!" );
                return 1;
            }
        }
        gradients = (npy_float32 *) PyArray_DATA( *out_gradients );
    }

    /* Ask warp-ctc how much scratch memory this problem needs. */
    size_t cpu_workspace_size;
    int ctc_error;
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &cpu_workspace_size ),
        "Failed to obtain CTC workspace size." );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        return 1;
    }

    context->workspace = malloc( cpu_workspace_size );
    if ( NULL == context->workspace )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }

    /* Run the actual loss (and gradient, if `gradients` != NULL) computation. */
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, context->workspace,
        context->options ), "Failed to compute CTC loss function." );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        return 1;
    }

    ctc_context_destroy( context );
    return 0;
}
from __future__ import (division, absolute_import, print_function)
import unittest
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.ctc import (ctc_available, ctc, ConnectionistTemporalClassification)
def setup_torch_case():
    """Return [activations, labels, durations, expected costs, expected grads]
    for the example from the warp-ctc Torch tutorial:
    https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    Activation layout, slowest to fastest dimension: (time, batchSize, inputLayerSize)."""
    acts = np.asarray(
        [[[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]],
         [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]],
         [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]]],
        dtype=np.float32)
    # Number of time steps of each sequence in the minibatch
    seq_durations = np.asarray([1, 3, 3], dtype=np.int32)
    # One target label sequence per row; -1 entries are padding
    target_labels = np.asarray([[1, -1],
                                [3, 3],
                                [2, 3]], dtype=np.int32)
    ref_costs = np.asarray([1.609437943, 7.355742931, 4.938849926],
                           dtype=np.float32)
    ref_grads = np.asarray(
        [[[0.2, -0.8, 0.2, 0.2, 0.2],
          [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
          [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]],
         [[0, 0, 0, 0, 0],
          [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627],
          [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]],
         [[0, 0, 0, 0, 0],
          [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
          [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]]],
        dtype=np.float32)
    return [acts, target_labels, seq_durations, ref_costs, ref_grads]
def setup_ctc_case():
    """Return [activations, labels, durations, expected costs, expected grads]
    for a small hand-checked batch: 2 time steps, 2 sequences, alphabet of 5."""
    acts = np.asarray(
        [[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]],
        dtype=np.float32)
    seq_durations = np.asarray([2, 2], dtype=np.int32)
    target_labels = np.asarray([[1, 2], [1, 2]], dtype=np.int32)
    # Reference outputs obtained through an external C program that calls
    # the warp-ctc library directly.
    ref_costs = np.asarray([2.962858438, 3.053659201], dtype=np.float32)
    ref_grads = np.asarray(
        [[[0.177031219, -0.7081246376, 0.177031219, 0.177031219, 0.177031219],
          [0.177031219, -0.8229685426, 0.291875124, 0.177031219, 0.177031219]],
         [[0.291875124, 0.177031219, -0.8229685426, 0.177031219, 0.177031219],
          [0.1786672771, 0.1786672771, -0.7334594727, 0.1974578798, 0.1786672771]]],
        dtype=np.float32)
    return [acts, target_labels, seq_durations, ref_costs, ref_grads]
def setup_grad_case():
    """Return [activations, labels, durations] — the same inputs as
    setup_ctc_case but without reference outputs; used for numeric
    gradient verification."""
    acts = np.asarray(
        [[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]],
        dtype=np.float32)
    seq_durations = np.asarray([2, 2], dtype=np.int32)
    target_labels = np.asarray([[1, 2], [1, 2]], dtype=np.int32)
    return [acts, target_labels, seq_durations]
class TestCTC(unittest.TestCase):
    """
    Test Baidu CTC wrapper implementation.

    Expected values for costs and gradients are obtained through an external
    C implementation, that uses the library directly.
    """
    def setUp(self):
        # All tests need a working warp-ctc installation; skip otherwise.
        if not ctc_available():
            self.skipTest('Optional library warp-ctc not available')

    def run_ctc(self, activations, labels, input_length, expected_costs, expected_grads):
        """Build the CTC graph, compile, and compare cost and gradient
        against the reference values."""
        # Create symbolic variables
        t_activations = theano.shared(activations, name="activations")
        t_activation_times = theano.shared(input_length, name="activation_times")
        t_labels = theano.shared(labels, name="labels")
        t_cost = ctc(t_activations, t_labels, t_activation_times)
        # Symbolic gradient of CTC cost
        t_grad = T.grad(T.mean(t_cost), t_activations)
        # Compile symbolic functions
        train = theano.function([], [t_cost, t_grad])
        cost, grad = train()
        # T.mean divides by the batch size, so the per-example reference
        # gradients are scaled down accordingly before comparison.
        utt.assert_allclose(expected_grads / cost.shape[0], grad)
        utt.assert_allclose(expected_costs, cost)
        self.check_grads_disabled(t_activations, t_labels, t_activation_times)

    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        # When only the cost is requested, local_ctc_no_grad should have
        # rewritten the node with compute_grad=False.
        ctc_cost = ctc(activations, labels, input_length)
        ctc_function = theano.function([], [ctc_cost])
        for node in ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                assert (node.op.compute_grad is False)

    def test_torch_case(self):
        # Reference case from the warp-ctc Torch tutorial.
        activations, labels, input_length, expected_costs, expected_grads = setup_torch_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_ctc(self):
        activations, labels, input_length, expected_costs, expected_grads = setup_ctc_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_verify_grad(self):
        # Numeric gradient check of the Op via utt.verify_grad.
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = theano.shared(in_lengths, name="activation_times")
                t_labels = theano.shared(labels, name="labels")
                return ctc(acts, t_labels, t_activation_times)
            return wrapper

        activations, labels, activation_times = setup_grad_case()
        ctc_op = ctc_op_functor(labels, activation_times)
        utt.verify_grad(ctc_op, [activations])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论