提交 dae1f236 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5949 from joaovictortr/warp_ctc_wrapper

Baidu CTC wrapper
...@@ -759,6 +759,14 @@ import theano and print the config variable, as in: ...@@ -759,6 +759,14 @@ import theano and print the config variable, as in:
Location of the magma library. Location of the magma library.
.. attribute:: config.ctc.root
Default: ``''``
Location of the warp-ctc folder. The folder should contain either a build,
lib or lib64 subfolder with the shared library (libwarpctc.so), and another
subfolder called include, with the CTC library header.
.. attribute:: config.gcc.cxxflags .. attribute:: config.gcc.cxxflags
Default: ``""`` Default: ``""``
......
.. _libdoc_gpuarray_ctc:
================================================================================
:mod:`theano.gpuarray.ctc` -- Connectionist Temporal Classification (CTC) loss
================================================================================
.. note::
Usage of the connectionist temporal classification (CTC) loss Op requires that
the `warp-ctc <https://github.com/baidu-research/warp-ctc>`_ library is
available. In case the warp-ctc library is not in your compiler's library path,
the ``config.ctc.root`` configuration option must be appropriately set to the
directory containing the warp-ctc library files.
.. note::
Unfortunately, Windows platforms are not yet supported by the underlying
library.
.. module:: theano.gpuarray.ctc
:platform: Unix
:synopsis: Connectionist temporal classification (CTC) loss Op, using the warp-ctc library
.. moduleauthor:: `João Victor Risso <https://github.com/joaovictortr>`_
.. autofunction:: theano.gpuarray.ctc.gpu_ctc
.. autoclass:: theano.gpuarray.ctc.GpuConnectionistTemporalClassification
...@@ -18,3 +18,4 @@ ...@@ -18,3 +18,4 @@
fft fft
type type
extra extra
ctc
.. _libdoc_tensor_nnet_ctc:
==================================================================================
:mod:`theano.tensor.nnet.ctc` -- Connectionist Temporal Classification (CTC) loss
==================================================================================
.. note::
Usage of the connectionist temporal classification (CTC) loss Op requires that
the `warp-ctc <https://github.com/baidu-research/warp-ctc>`_ library is
available. In case the warp-ctc library is not in your compiler's library path,
the ``config.ctc.root`` configuration option must be appropriately set to the
directory containing the warp-ctc library files.
.. note::
Unfortunately, Windows platforms are not yet supported by the underlying
library.
.. module:: theano.tensor.nnet.ctc
:platform: Unix
:synopsis: Connectionist temporal classification (CTC) loss Op, using the warp-ctc library
.. moduleauthor:: `João Victor Risso <https://github.com/joaovictortr>`_
.. autofunction:: theano.tensor.nnet.ctc.ctc
.. autoclass:: theano.tensor.nnet.ctc.ConnectionistTemporalClassification
...@@ -21,3 +21,4 @@ and ops which are particular to neural networks and deep learning. ...@@ -21,3 +21,4 @@ and ops which are particular to neural networks and deep learning.
neighbours neighbours
bn bn
blocksparse blocksparse
ctc
...@@ -1839,6 +1839,13 @@ AddConfigVar( ...@@ -1839,6 +1839,13 @@ AddConfigVar(
allow_override=False), allow_override=False),
in_c_key=False) in_c_key=False)
AddConfigVar(
'ctc.root',
'Directory which contains the root of Baidu CTC library. It is assumed \
that the compiled library is either inside the build, lib or lib64 \
subdirectory, and the header inside the include directory.',
StrParam('', allow_override=False),
in_c_key=False)
# Check if there are remaining flags provided by the user through THEANO_FLAGS. # Check if there are remaining flags provided by the user through THEANO_FLAGS.
for key in THEANO_FLAGS_DICT.keys(): for key in THEANO_FLAGS_DICT.keys():
......
...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, ...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor, GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined) reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg, ctc
def transfer(x, target): def transfer(x, target):
......
#section init_code

setup_ext_cuda();

#section support_code

/* Per-call bundle of state needed by the warp-ctc kernels. */
typedef struct ctc_context {
    struct ctcOptions options;   /* warp-ctc configuration (backend + CUDA stream) */
    gpudata * workspace;         /* device scratch buffer, sized via get_workspace_size() */
    int * input_lengths;         /* host array: time steps per minibatch example */
    int * flat_labels;           /* host array: all label sequences concatenated */
    int * label_lengths;         /* host array: label count per minibatch example */
} ctc_context_t;
/* Initialize *context for a GPU run: zero the warp-ctc options, select the
 * GPU backend and bind warp-ctc to the CUDA stream of `gpu_context`.
 * All buffer pointers start out NULL. */
void ctc_context_init(ctc_context_t * context, PyGpuContextObject * gpu_context)
{
    memset(&(context->options), 0, sizeof(struct ctcOptions));
    context->options.loc = CTC_GPU;

    // Get CUDA function pointer to obtain stream
    CUstream (*getstream_func_ptr)(void *) = (CUstream (*)(void *)) gpuarray_get_extension( "cuda_get_stream" );
    context->options.stream = getstream_func_ptr(gpu_context->ctx);

    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
/* Release every resource owned by the context.  The host arrays may be
 * NULL (free(NULL) is a no-op).
 * NOTE(review): this assumes gpudata_release() tolerates a NULL handle,
 * since the context can be destroyed before the workspace is allocated
 * — confirm against libgpuarray. */
void ctc_context_destroy(ctc_context_t * ctx)
{
    free( ctx->label_lengths );
    free( ctx->flat_labels );
    free( ctx->input_lengths );

    gpudata_release( ctx->workspace );
}
/* Translate a warp-ctc status code into a Python RuntimeError.
 * Returns 0 on success; returns 1 and sets the exception on failure. */
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if ( retcode == CTC_STATUS_SUCCESS )
        return 0;

    // Ask the library for a human-readable description of the failure
    const char * ctc_msg = ctcGetStatusString( retcode );
    PyErr_Format( PyExc_RuntimeError,
                  "GpuConnectionistTemporalClassification: %s CTC error: %s",
                  msg,
                  ctc_msg );
    return 1;
}
/* Copy the NumPy vector of input lengths into a freshly malloc'ed plain
 * int array.  On allocation failure *input_lengths is left NULL; the
 * caller is responsible for detecting that. */
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
                                      int ** input_lengths )
{
    const npy_int count = PyArray_DIMS( input_lengths_arr )[0];

    *input_lengths = (int *) malloc( count * sizeof(int) );
    if ( (*input_lengths) == NULL )
        return;

    for ( npy_int idx = 0; idx < count; ++idx )
    {
        (*input_lengths)[idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, idx ) );
    }
}
/* Flatten the padded label matrix into the layout warp-ctc expects:
 * *flat_labels receives all non-padding labels concatenated row by row,
 * *label_lengths the number of labels in each row.  On allocation
 * failure both output pointers are left NULL (all-or-nothing). */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
                         int ** label_lengths )
{
    const npy_int n_rows = PyArray_DIMS( label_matrix )[0];
    const npy_int n_cols = PyArray_DIMS( label_matrix )[1];

    /* Worst case: every entry of the matrix is a real label. */
    *flat_labels = (int *) calloc( n_rows * n_cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;

    *label_lengths = (int *) calloc( n_rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        /* Roll back the first allocation so the caller sees a
           consistent failure state. */
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }

    npy_int write_pos = 0;
    for ( npy_int r = 0; r < n_rows; ++r )
    {
        npy_int count = 0;
        for ( npy_int c = 0; c < n_cols; ++c )
        {
            const npy_int value = *( (npy_int *) PyArray_GETPTR2( label_matrix, r, c ) );
            /* Negative entries are padding and are skipped. */
            if ( value >= 0 )
            {
                (*flat_labels)[ write_pos++ ] = value;
                ++count;
            }
        }
        (*label_lengths)[ r ] = count;
    }
}
#section support_code_apply
/* Compute the CTC loss (and optionally its gradient) on the GPU.
 *
 * in_activations   : float32 GPU array, C-order; indexed [time][minibatch][alphabet].
 * in_labels        : host int32 matrix of padded label sequences.
 * in_input_lengths : host int32 vector of per-example time lengths.
 * out_costs        : output, per-example losses (length = minibatch size).
 * out_gradients    : output gradient wrt activations; NULL disables
 *                    gradient computation.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
                                 PyArrayObject * in_labels,
                                 PyArrayObject * in_input_lengths,
                                 PyGpuArrayObject ** out_costs,
                                 PyGpuArrayObject ** out_gradients,
                                 PyGpuContextObject * gpu_context)
{
    // Stack-allocated context; destroyed on every exit path below.
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;

    size_t gpu_workspace_size;
    int ctc_error = 0;

    const size_t num_activations = PyGpuArray_DIMS( in_activations )[0];
    const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1];
    const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2];
    const size_t cost_size = minibatch_size;

    // Gradient has the same shape as the activations.
    const size_t grad_dims[3] = { num_activations, minibatch_size, alphabet_size };

    float * costs = NULL,
          * activations = NULL,
          * gradients = NULL;

    cuda_enter( gpu_context->ctx );

    ctc_context_init( context, gpu_context );

    // Only float32 activations are supported.
    switch (in_activations->ga.typecode)
    {
    case GA_FLOAT:
        activations = (float *) PyGpuArray_DEV_DATA( in_activations );
        break;
    default:
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_SetString( PyExc_TypeError,
            "GpuConnectionistTemporalClassification: Unsupported type for activations." );

        return 1;
    }

    // Host-side copy of the input lengths, in the layout warp-ctc expects.
    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );

    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for input lengths." );
        return 1;
    }

    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );

    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for labels and their lengths." );
        return 1;
    }

    // Allocate (or reuse) the cost output on the GPU and zero it.
    if ( theano_prep_output( out_costs, 1, &cost_size, in_activations->ga.typecode,
                             GA_C_ORDER, gpu_context ) != 0 )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    GpuArray_memset( &((*out_costs)->ga), 0 );

    costs = (float *) PyGpuArray_DEV_DATA( *out_costs );

    if ( NULL != out_gradients )  // if gradient computation is not disabled
    {
        if ( theano_prep_output( out_gradients, 3, grad_dims, in_activations->ga.typecode,
                                 GA_C_ORDER, gpu_context ) != 0 )
        {
            ctc_context_destroy( context );
            cuda_exit( gpu_context->ctx );

            return 1;
        }

        GpuArray_memset( &((*out_gradients)->ga), 0 );

        gradients = (float *) PyGpuArray_DEV_DATA( *out_gradients );
    }

    // Ask warp-ctc how much device scratch space this problem needs.
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &gpu_workspace_size ),
        "Failed to obtain CTC workspace size." );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    context->workspace = gpudata_alloc( gpu_context->ctx, gpu_workspace_size, NULL, 0, NULL );

    if ( NULL == context->workspace )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }

    // Synchronize with other pending GPU work before launching the kernels.
    cuda_wait( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_wait( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    // NOTE(review): *(void **)context->workspace assumes the raw device
    // pointer is the first member of the opaque gpudata struct — confirm
    // against the libgpuarray version in use.
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, *(void **)context->workspace,
        context->options ), "Failed to compute CTC loss function." );

    // Record the accesses so later operations order correctly after us.
    cuda_record( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_record( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );

        return 1;
    }

    ctc_context_destroy( context );
    cuda_exit( gpu_context->ctx );

    return 0;
}
from __future__ import absolute_import, print_function, division
import theano
from theano import (config, gof)
import theano.tensor as T
from .basic_ops import (gpu_contiguous, as_gpuarray_variable, infer_context_name)
import theano.tensor.nnet.ctc
from .type import (GpuArrayType, gpu_context_type)
from .elemwise import GpuDimShuffle
from theano.gradient import grad_undefined
from theano.gof import local_optimizer
from theano.tensor.opt import register_canonicalize
from theano.tensor.nnet.ctc import ctc_available
import os
import os.path
from . import pygpu
class GpuConnectionistTemporalClassification(gof.COp):
    """
    GPU wrapper for Baidu CTC loss function.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """
    __props__ = ('compute_grad',)

    # COp bookkeeping: 3 inputs (activations, labels, input lengths) and
    # up to 2 outputs (costs and, optionally, gradients).
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # C implementation loaded by gof.COp.__init__.
    func_file = "./c_code/ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

    # The Op params passed to the C code is the GPU context (see get_params).
    params_type = gpu_context_type

    def __init__(self, compute_grad=True):
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'GpuConnectionistTemporalClassification Op '
                               'can not be constructed.')

        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

        gof.COp.__init__(self, self.func_file, self.func_name)

    def c_lib_dirs(self):
        # Linker search path: directory holding libwarpctc.so, when known.
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_libraries(self):
        return ["warpctc", "gpuarray"]

    def c_header_dirs(self):
        # Include paths: this module's directory (helper headers), pygpu's
        # headers and, when configured, warp-ctc's include directory.
        dirs = [os.path.dirname(__file__), pygpu.get_include()]
        if config.ctc.root != '':
            dirs.append(os.path.join(config.ctc.root, "include"))
        return dirs

    def c_headers(self):
        return ['ctc.h', 'numpy_compat.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'gpuarray/types.h', 'gpuarray_api.h',
                'gpuarray/array.h', 'gpuarray/util.h', 'gpuarray/extension.h']

    def get_params(self, node):
        # Hand the GPU context of the first input to the C implementation.
        return node.inputs[0].type.context

    def make_node(self, activations, labels, input_lengths):
        """Validate dtypes/ranks of the inputs and build the Apply node."""
        context_name = infer_context_name(activations)

        t_activations = as_gpuarray_variable(activations,
                                             context_name=context_name)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)

        # Labels and input lengths are always on the CPU
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type.')
        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')
        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type.')
        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')
        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type.')
        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        costs = GpuArrayType(dtype='float32',
                             broadcastable=(False,),
                             context_name=context_name)()
        outputs = [costs]

        if self.compute_grad:
            gradients = GpuArrayType(dtype='float32',
                                     broadcastable=(False, False, False,),
                                     context_name=context_name)()
            outputs += [gradients]

        return theano.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                            outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        """Chain-rule composition of the CTC gradient computed by the Op."""
        # Gradients computed by Op
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None

        # Gradients of original function, to compose chain rule
        grad_op = output_grads[0]
        # NOTE(review): shuffle (time, batch, alphabet) -> (batch, time,
        # alphabet) for batched_dot against the incoming cost gradient,
        # then shuffle back — confirm shapes against batched_dot semantics.
        grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                     new_order=(1, 0, 2))(gradients)
        grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
        grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                             new_order=(1, 0, 2))(grad_bdot)
        return [grad_shuffle_reverse,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]
def gpu_ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function on the GPU.

    Parameters
    ----------
    activations
        Three-dimensional tensor of shape (t, m, p), where t is the time
        index, m the minibatch index and p indexes the probabilities of
        each symbol in the alphabet.  C-order memory layout is assumed,
        so p is the fastest changing dimension.
    labels
        Two-dimensional tensor holding one sequence of target labels per
        minibatch row.  Negative values are treated as padding and
        ignored.  The blank symbol is assumed to have index 0 in the
        alphabet.
    input_lengths
        One-dimensional tensor with the number of time steps of each
        sequence in the minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_cost_op = GpuConnectionistTemporalClassification()
    return ctc_cost_op(activations, labels, input_lengths)
# Disable gradient computation if not needed
@register_canonicalize
@local_optimizer([GpuConnectionistTemporalClassification])
def local_gpu_ctc_no_grad(node):
    """Swap in a gradient-free CTC Op when the gradient output is unused."""
    if not isinstance(node.op, GpuConnectionistTemporalClassification):
        return False
    if len(node.outputs) > 1 and len(node.outputs[1].clients) == 0:
        # gradient is not used
        no_grad_op = GpuConnectionistTemporalClassification(compute_grad=False)
        return [no_grad_op(*node.inputs), None]
    return False
...@@ -34,6 +34,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv, ...@@ -34,6 +34,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs) AbstractConv3d_gradInputs)
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
import theano.tensor.nlinalg as nlinalg import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
...@@ -80,6 +81,7 @@ from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, ...@@ -80,6 +81,7 @@ from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh, GpuMagmaCholesky, gpu_qr, GpuMagmaEigh,
GpuCublasTriangularSolve, cublas_available) GpuCublasTriangularSolve, cublas_available)
from .neighbours import GpuImages2Neibs from .neighbours import GpuImages2Neibs
from .ctc import GpuConnectionistTemporalClassification
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -162,6 +164,7 @@ def register_inplace(*tags, **kwargs): ...@@ -162,6 +164,7 @@ def register_inplace(*tags, **kwargs):
return local_opt return local_opt
return f return f
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name='gpua_constant_folding')( register_opt(final_opt=True, name='gpua_constant_folding')(
tensor.opt.constant_folding) tensor.opt.constant_folding)
...@@ -582,6 +585,7 @@ def local_cut_gpu_transfers(node): ...@@ -582,6 +585,7 @@ def local_cut_gpu_transfers(node):
else: else:
return [node.op(n2.inputs[0])] return [node.op(n2.inputs[0])]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers, gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers', gpu_cut_copies.register('cut_gpua_constant_transfers',
...@@ -652,6 +656,8 @@ def local_gpua_alloc_empty_to_zeros(node): ...@@ -652,6 +656,8 @@ def local_gpua_alloc_empty_to_zeros(node):
z = np.asarray(0, dtype=node.outputs[0].dtype) z = np.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name), return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)] *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros', optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros), theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace. # After move to gpu and merge2, before inplace.
...@@ -1540,6 +1546,8 @@ def local_conv_gpu_conv(node): ...@@ -1540,6 +1546,8 @@ def local_conv_gpu_conv(node):
return [tensor.as_tensor_variable(out)] return [tensor.as_tensor_variable(out)]
else: else:
return [out] return [out]
register_opt()(local_conv_gpu_conv) register_opt()(local_conv_gpu_conv)
...@@ -1812,6 +1820,8 @@ def local_gpu_pool(op, ctx_name, inputs, outputs): ...@@ -1812,6 +1820,8 @@ def local_gpu_pool(op, ctx_name, inputs, outputs):
inp_padded = pad_dims(inp, 2, nd) inp_padded = pad_dims(inp, 2, nd)
ret_padded = op(inp_padded, ws, stride, pad) ret_padded = op(inp_padded, ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
pool_db = LocalGroupDB() pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup) pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2" pool_db2.__name__ = "pool_db2"
...@@ -1849,6 +1859,8 @@ def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs): ...@@ -1849,6 +1859,8 @@ def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs):
ret_padded = op(inp_padded, out_padded, out_grad_padded, ret_padded = op(inp_padded, out_padded, out_grad_padded,
ws, stride, pad) ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad) lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad)
pool_db.register("local_gpu_max_pool_grad", lifter, pool_db.register("local_gpu_max_pool_grad", lifter,
'gpuarray', 'fast_compile', 'fast_run', 'gpuarray', 'fast_compile', 'fast_run',
...@@ -1879,6 +1891,8 @@ def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs): ...@@ -1879,6 +1891,8 @@ def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs):
ret_padded = op(inp_padded, out_grad_padded, ret_padded = op(inp_padded, out_grad_padded,
ws, stride, pad) ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad) lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad)
pool_db.register("local_gpu_average_pool_grad", lifter, pool_db.register("local_gpu_average_pool_grad", lifter,
'gpuarray', 'fast_compile', 'fast_run', 'gpuarray', 'fast_compile', 'fast_run',
...@@ -1976,6 +1990,7 @@ def local_assert_no_cpu_op(node): ...@@ -1976,6 +1990,7 @@ def local_assert_no_cpu_op(node):
elif config.assert_no_cpu_op == "pdb": elif config.assert_no_cpu_op == "pdb":
pdb.set_trace() pdb.set_trace()
# Register the local_assert_no_cpu_op: # Register the local_assert_no_cpu_op:
assert_no_cpu_op = theano.tensor.opt.in2out(local_assert_no_cpu_op, assert_no_cpu_op = theano.tensor.opt.in2out(local_assert_no_cpu_op,
name='assert_no_cpu_op') name='assert_no_cpu_op')
...@@ -2308,6 +2323,15 @@ def local_gpu_magma_svd(op, context_name, inputs, outputs): ...@@ -2308,6 +2323,15 @@ def local_gpu_magma_svd(op, context_name, inputs, outputs):
out = [out.astype('float16')] out = [out.astype('float16')]
return out return out
@register_opt('ctc', 'fast_compile')
@op_lifter([theano.tensor.nnet.ctc.ConnectionistTemporalClassification])
@register_opt2([ConnectionistTemporalClassification], 'ctc', 'fast_compile')
def local_gpu_ctc(op, context_name, inputs, outputs):
op = GpuConnectionistTemporalClassification(compute_grad=op.compute_grad)
return op.make_node(*inputs).outputs
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import (division, absolute_import, print_function)
import unittest
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
import theano.gpuarray
from theano.gpuarray.ctc import (gpu_ctc, GpuConnectionistTemporalClassification)
from theano.tensor.nnet.ctc import (ctc, ctc_available, ConnectionistTemporalClassification)
from theano.tensor.nnet.tests.test_ctc import (setup_torch_case, setup_ctc_case, setup_grad_case)
from .config import (mode_with_gpu, mode_without_gpu)
class TestCTC(unittest.TestCase):
    """
    Tests for the GPU CTC Op: expected values, CPU/GPU agreement, the
    gradient-disabling optimization and lifting of the CPU Op to the GPU.
    """

    def setUp(self):
        # Every test requires the optional warp-ctc library.
        if not ctc_available():
            self.skipTest('Optional library warp-ctc not available')

    def check_ctc(self, activations, labels, input_length, expected_costs, expected_grads):
        """Run the full battery of checks for one test case."""
        # Create symbolic variables
        t_activations = theano.shared(activations, name="activations")
        t_activation_times = theano.shared(input_length, name="activation_times")
        t_labels = theano.shared(labels, name="labels")

        inputs = [t_activations, t_labels, t_activation_times]

        # Execute several tests for each test case
        self.check_expected_values(t_activations, t_labels, t_activation_times, expected_costs, expected_grads)
        self.compare_gpu_and_cpu_values(*inputs)
        self.check_grads_disabled(*inputs)
        self.run_gpu_optimization_with_grad(*inputs)
        self.run_gpu_optimization_no_grad(*inputs)

    def setup_cpu_op(self, activations, labels, input_length, compute_grad=True, mode=mode_without_gpu):
        # Compile a function returning the CPU CTC cost and, optionally,
        # its gradient wrt the activations.
        cpu_ctc_cost = ctc(activations, labels, input_length)
        outputs = [cpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            cpu_ctc_grad = T.grad(T.mean(cpu_ctc_cost), activations)
            outputs += [cpu_ctc_grad]
        return theano.function([], outputs, mode=mode)

    def setup_gpu_op(self, activations, labels, input_length, compute_grad=True):
        # GPU counterpart of setup_cpu_op.
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        outputs = [gpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            gpu_ctc_grad = T.grad(T.mean(gpu_ctc_cost), activations)
            outputs += [gpu_ctc_grad]
        return theano.function([], outputs)

    def check_expected_values(self, activations, labels, input_length, expected_costs, expected_grads):
        """Compare GPU results against reference costs/gradients."""
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values.
        # The gradient is scaled by 1/minibatch because the compiled
        # graph differentiates the *mean* of the costs.
        utt.assert_allclose(expected_grads / cost_from_gpu.shape[0], grad_from_gpu)
        utt.assert_allclose(expected_costs, cost_from_gpu)

    def compare_gpu_and_cpu_values(self, activations, labels, input_length):
        """Check that the CPU and GPU Ops agree on costs and gradients."""
        cpu_train = self.setup_cpu_op(activations, labels, input_length)
        cpu_cost, cpu_grad = cpu_train()

        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values
        utt.assert_allclose(cpu_grad, grad_from_gpu)
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        gpu_ctc_function = theano.function([], [gpu_ctc_cost])
        # After canonicalization the surviving node must have been
        # rebuilt with compute_grad=False.
        for node in gpu_ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                assert (node.op.compute_grad is False)

    def run_gpu_optimization_with_grad(self, activations, labels, input_length):
        # Compile CPU function with optimization
        cpu_lifted_train = self.setup_cpu_op(activations, labels, input_length, mode=mode_with_gpu)
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_train)

    def run_gpu_optimization_no_grad(self, activations, labels, input_length):
        cpu_train = self.setup_cpu_op(activations, labels, input_length, compute_grad=False)
        cpu_cost = cpu_train()
        # Compile CPU function with optimization
        cpu_lifted_test = self.setup_cpu_op(activations, labels, input_length, compute_grad=False, mode=mode_with_gpu)
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_test)
        gpu_cost = cpu_lifted_test()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Compare values from CPU and GPU Ops
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def has_only_gpu_op(self, function):
        # True when the compiled graph contains the GPU Op and no CPU Op.
        has_cpu_instance = False
        has_gpu_instance = False
        for node in function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                has_cpu_instance = True

            if isinstance(node.op, GpuConnectionistTemporalClassification):
                has_gpu_instance = True
        return has_gpu_instance and (not has_cpu_instance)

    # Test obtained from Torch tutorial at:
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    def test_torch_case(self):
        activations, labels, activation_times, expected_costs, expected_grads = setup_torch_case()
        self.check_ctc(activations, labels, activation_times, expected_costs, expected_grads)

    def test_ctc(self):
        activations, labels, input_length, expected_costs, expected_grads = setup_ctc_case()
        self.check_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_verify_grad(self):
        # Numeric gradient check via utt.verify_grad; labels and lengths
        # are closed over since only activations are differentiated.
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = theano.shared(in_lengths, name="activation_times")
                t_labels = theano.shared(labels, name="labels")
                return gpu_ctc(acts, t_labels, t_activation_times)
            return wrapper

        activations, labels, activation_times = setup_grad_case()

        ctc_op = ctc_op_functor(labels, activation_times)

        utt.verify_grad(ctc_op, [activations])
from __future__ import (division, absolute_import, print_function)
import os
import os.path
import theano.tensor as T
from theano import config
from theano import gof
from theano.gof import local_optimizer
from theano.gof.cmodule import GCC_compiler
from theano.tensor.opt import register_canonicalize
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import grad_undefined
def _ctc_find_lib():
    """
    Find the directory that contains libwarpctc.so.

    Searches the ``build``, ``lib`` and ``lib64`` subdirectories of
    ``config.ctc.root``, in that order.

    Returns
    -------
    str or None
        First subdirectory that contains ``libwarpctc.so``, or None when
        ``config.ctc.root`` is unset or the library cannot be found.
    """
    if config.ctc.root != '':
        for lib_dir in ["build", "lib", "lib64"]:
            lib_path = os.path.join(config.ctc.root, lib_dir)
            # os.path.isdir already implies existence, so the original
            # extra os.path.exists(lib_path) check was redundant.
            if os.path.isdir(lib_path) and \
                    os.path.exists(os.path.join(lib_path, "libwarpctc.so")):
                return lib_path
    return None
def _ctc_check_compile(ctc_lib_path):
    """
    Check that a minimal program compiles (and links) against warp-ctc.

    Parameters
    ----------
    ctc_lib_path : str or None
        Directory holding ``libwarpctc.so``, or None when the library is
        expected on the compiler's default search path.

    Returns
    -------
    (bool, str or None)
        ``(True, None)`` on success, otherwise ``(False, message)``.
    """
    preambule = """
#include <string.h>
#include "ctc.h"
"""

    body = """
ctcOptions options;
memset(&options, 0, sizeof(ctcOptions));
options.loc = CTC_CPU;
options.num_threads = 1;
"""

    # Headers from this directory plus, when configured, warp-ctc's own
    # include and library directories.
    params = ['-I%s' % (os.path.dirname(__file__))]
    if ctc_lib_path is not None:
        params.extend(["-I%s" % (os.path.join(config.ctc.root, "include"))])
        params.extend(["-L%s" % (ctc_lib_path)])
    params.extend(["-l", "warpctc"])
    compiler_res = GCC_compiler.try_flags(
        params, preambule=preambule, body=body,
        try_run=False, output=True)

    # try_flags may return a bare bool or an (avail, out, err) triple.
    avail, out, err = compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None)

    if not avail:
        return False, ("cannot compile with warp-ctc. "
                       "We got this error:\n" + str(err))
    return True, None
def ctc_present():
    """
    Detect (once) whether warp-ctc can be located and compiled against.

    The result is memoized on the function object: ``ctc_present.avail``
    caches the boolean, ``ctc_present.path`` the library directory and
    ``ctc_present.msg`` the failure reason, if any.
    """
    if ctc_present.avail is None:
        ctc_present.path = _ctc_find_lib()
        ctc_present.avail, ctc_present.msg = _ctc_check_compile(ctc_present.path)
    return ctc_present.avail

ctc_present.avail = None
ctc_present.msg = None
ctc_present.path = None
def ctc_available():
    """
    Check whether the warp-ctc loss Op can be used on this platform.

    On failure ``ctc_available.msg`` is set to a string explaining why;
    on success ``ctc_available.path`` is set to the directory holding
    ``libwarpctc.so`` (or None when it is on the default search path).

    Returns
    -------
    bool
        True when the underlying warp-ctc library is usable.
    """
    if os.name == 'nt':
        # Bug fix: the original statement ended with a stray comma, which
        # made `msg` a 1-tuple and left the second string literal as a
        # dead expression.  Use implicit string concatenation instead.
        ctc_available.msg = ('Windows platforms are currently not supported '
                            'by underlying CTC library (warp-ctc).')
        return False
    elif not ctc_present():
        ctc_available.msg = ctc_present.msg
        return False

    ctc_available.path = ctc_present.path
    return True

ctc_available.msg = None
ctc_available.path = None
class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
    """
    CTC loss function wrapper.

    Notes
    -----
    Using the wrapper requires that Baidu's warp-ctc library is installed.
    If the warp-ctc library is not on your compiler's default library path,
    you must set the configuration variable ``config.ctc.root`` appropriately.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """
    __props__ = ('compute_grad',)

    # Number of inputs/outputs handled by the external C implementation.
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # External C implementation, resolved relative to this module.
    func_file = "./ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_cpu)"

    def __init__(self, compute_grad=True, openmp=None):
        # Fail fast at Op construction time when warp-ctc is unusable.
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'ConnectionistTemporalClassification Op '
                               'can not be constructed.')
        gof.COp.__init__(self, self.func_file, self.func_name)
        gof.OpenMPOp.__init__(self, openmp=openmp)
        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

    def c_lib_dirs(self):
        # Directory containing libwarpctc.so, discovered by ctc_available().
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_libraries(self):
        # Link against warp-ctc (-lwarpctc).
        return ["warpctc"]

    def c_header_dirs(self):
        header_dirs = []
        if config.ctc.root != '':
            # We assume here that the header is available at the include directory
            # of the CTC root directory.
            header_dirs += [os.path.join(config.ctc.root, "include")]
        return header_dirs

    def c_headers(self):
        return ["ctc.h"] + gof.OpenMPOp.c_headers(self)

    def make_node(self, activations, labels, input_lengths):
        """
        Build the Apply node; validates dtypes/ranks required by warp-ctc:
        float32 activations of rank 3, int32 labels of rank 2, int32
        input_lengths of rank 1.
        """
        t_activations = T.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)
        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type!')
        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')
        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type!')
        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')
        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type!')
        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')
        costs = T.fvector(name="ctc_cost")
        outputs = [costs]
        # The gradient output only exists when compute_grad is enabled;
        # the C code then fills it with d(cost)/d(activations).
        if self.compute_grad:
            gradients = T.ftensor3(name="ctc_grad")
            outputs += [gradients]
        return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                         outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        # The Op itself produced the gradient w.r.t. the activations as its
        # second output; chain it with the incoming cost gradient.
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None
        grad_op = output_grads[0]
        # batched_dot works on the batch-first layout, hence the dimshuffles
        # moving the minibatch axis to the front and back again.
        total_grad = T.basic.batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(1, 0, 2)
        # Labels and input lengths are integer inputs: gradient undefined.
        return [total_grad,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]
def ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function.

    Notes
    -----
    Requires Baidu's warp-ctc library to be installed. When the library is
    not on the compiler's default library path, set the configuration
    variable ``config.ctc.root`` to the warp-ctc installation directory.

    Parameters
    ----------
    activations
        Three-dimensional tensor of shape (t, m, p): t is the time index,
        m the minibatch index, and p indexes the per-symbol probabilities
        of the alphabet. Memory layout is assumed C-ordered (slowest to
        fastest changing dimension from left to right, so p changes
        fastest).
    labels
        A 2-D tensor holding one target label sequence per row. Negative
        values are treated as padding and ignored. The blank symbol is
        assumed to have index 0 in the alphabet.
    input_lengths
        A 1-D tensor giving the number of time steps of each minibatch
        sequence.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_op = ConnectionistTemporalClassification()
    return ctc_op(activations, labels, input_lengths)
# Graph optimization: drop gradient computation when the gradient output
# of a CTC node is never consumed.
@register_canonicalize
@local_optimizer([ConnectionistTemporalClassification])
def local_ctc_no_grad(node):
    """Rebuild a CTC node with compute_grad=False when its gradient output has no clients."""
    if not isinstance(node.op, ConnectionistTemporalClassification):
        return False
    if len(node.outputs) <= 1:
        return False
    if len(node.outputs[1].clients) != 0:
        # The gradient output is used somewhere; keep the node as-is.
        return False
    no_grad_op = ConnectionistTemporalClassification(compute_grad=False)
    # The replacement has a single output, so the gradient slot maps to None.
    return [no_grad_op(*node.inputs), None]
#section support_code
/* Per-call state bundling warp-ctc options with the heap buffers the
 * wrapper allocates; released as a unit by ctc_context_destroy(). */
typedef struct ctc_context {
    struct ctcOptions options;  /* warp-ctc configuration (backend, thread count) */
    void * workspace;           /* scratch buffer sized by get_workspace_size() */
    int * input_lengths;        /* contiguous copy of per-sequence time steps */
    int * flat_labels;          /* labels flattened row-wise, padding removed */
    int * label_lengths;        /* number of (non-padding) labels per sequence */
} ctc_context_t;
/* Initialize a CTC context: zero the warp-ctc options, select the CPU
 * backend and thread count, and NULL all lazily-allocated buffers so a
 * later ctc_context_destroy() is safe at any point. */
void ctc_context_init(ctc_context_t * context)
{
    struct ctcOptions * options = &(context->options);
    memset(options, 0, sizeof(struct ctcOptions));
    options->loc = CTC_CPU;
#if defined(_OPENMP)
    /* Use as many threads as the current OpenMP team provides. */
    options->num_threads = omp_get_num_threads();
#else
    options->num_threads = 1;
#endif
    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
/* Release every heap buffer owned by the context. free(NULL) is a no-op,
 * so this is safe on partially-initialized contexts. */
void ctc_context_destroy(ctc_context_t * context)
{
    free( context->workspace );
    free( context->input_lengths );
    free( context->flat_labels );
    free( context->label_lengths );
}
/* Translate a warp-ctc status code into a Python RuntimeError.
 * Returns 1 (with the Python exception set) on failure, 0 on success,
 * so callers can do `if (ctc_check_result(...)) { cleanup; return 1; }`. */
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if( CTC_STATUS_SUCCESS != retcode )
    {
        // Get error message from underlying library
        const char * ctc_msg = ctcGetStatusString( retcode );
        PyErr_Format( PyExc_RuntimeError,
            "ConnectionistTemporalClassification: %s CTC error: %s",
            msg,
            ctc_msg );
        return 1;
    }
    return 0;
}
/* Copy the per-sequence lengths from a NumPy int array into a freshly
 * calloc'ed contiguous C int buffer, as required by warp-ctc.
 * On allocation failure *input_lengths is left NULL; the caller must
 * check for that and owns the buffer (freed via ctc_context_destroy). */
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
    int ** input_lengths )
{
    npy_int num_elements = PyArray_DIMS( input_lengths_arr )[0];
    *input_lengths = (int *) calloc( num_elements, sizeof(int) );
    if ( NULL == (*input_lengths) )
        return;
    for( npy_int elem_idx = 0; elem_idx < num_elements; ++elem_idx )
    {
        (*input_lengths)[elem_idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, elem_idx ) );
    }
}
/* Flatten a 2-D label matrix into the memory layout warp-ctc expects:
 * *flat_labels receives all non-negative labels row by row (negative
 * entries are padding and skipped), and *label_lengths receives the
 * number of kept labels per row.
 * On allocation failure both outputs are left NULL (the partially
 * allocated buffer is freed here); the caller must check for NULL. */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
    int ** label_lengths )
{
    npy_int rows = PyArray_DIMS( label_matrix )[0];
    npy_int cols = PyArray_DIMS( label_matrix )[1];
    /* rows * cols is an upper bound; padding entries leave unused slots. */
    *flat_labels = (int *) calloc( rows * cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;
    *label_lengths = (int *) calloc( rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }
    npy_int label_index = 0;
    for( npy_int row_idx = 0; row_idx < rows; ++row_idx )
    {
        npy_int label_length = 0;
        for( npy_int col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 ) // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
/* COp entry point: compute the CTC cost (and, when out_gradients is
 * non-NULL, the gradient w.r.t. the activations) on CPU via warp-ctc.
 *
 *   in_activations    float32, C-contiguous, shape (time, batch, alphabet)
 *   in_labels         int32 matrix, one label row per sequence; negative
 *                     entries are padding
 *   in_input_lengths  int32 vector of per-sequence time steps
 *   out_costs         float32 vector of per-example costs (reused or
 *                     (re)allocated here)
 *   out_gradients     float32 tensor with the activations' shape, or NULL
 *                     when the Op was built with compute_grad=False
 *
 * Returns 0 on success, 1 with a Python exception set on failure. */
int APPLY_SPECIFIC(ctc_cost_cpu)(PyArrayObject * in_activations,
                                 PyArrayObject * in_labels,
                                 PyArrayObject * in_input_lengths,
                                 PyArrayObject ** out_costs,
                                 PyArrayObject ** out_gradients)
{
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;
    ctc_context_init( context );

    /* Defensive check; the Python graph wraps activations in
     * cpu_contiguous, but warp-ctc strictly needs C order. */
    if ( !PyArray_IS_C_CONTIGUOUS( in_activations ) )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "ConnectionistTemporalClassification: activations array must be C-contiguous." );
        return 1;
    }

    npy_float32 * activations = (npy_float32 *) PyArray_DATA( in_activations );

    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );
    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for input lengths" );
        return 1;
    }

    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );
    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for labels and their lengths" );
        return 1;
    }

    npy_int minibatch_size = PyArray_DIMS( in_activations )[1];
    npy_int alphabet_size = PyArray_DIMS( in_activations )[2];

    /* Reuse the existing cost array when its shape matches, otherwise
     * allocate a fresh zeroed vector of one cost per example. */
    npy_float32 * costs = NULL;
    npy_intp cost_size = minibatch_size;
    if ( (*out_costs) == NULL ||                       // Symbolic variable has no memory backing
         PyArray_NDIM( *out_costs ) != 1 ||            // or, matrix has the wrong size
         PyArray_DIMS( *out_costs )[0] != cost_size )
    {
        Py_XDECREF( *out_costs );
        // Allocate new matrix
        *out_costs = (PyArrayObject *) PyArray_ZEROS( 1, &cost_size, NPY_FLOAT32, 0 );
        if ( NULL == (*out_costs) )
        {
            // Destroy previous CTC context before returning exception
            ctc_context_destroy( context );
            PyErr_Format( PyExc_MemoryError,
                "ConnectionistTemporalClassification: Could not allocate memory for CTC costs" );
            return 1;
        }
    }
    costs = (npy_float32 *) PyArray_DATA( *out_costs );

    /* Gradient buffer mirrors the activations' shape; only prepared when
     * gradient computation was not optimized away. */
    npy_float32 * gradients = NULL;
    if ( NULL != out_gradients )  // If gradient computation is not disabled
    {
        if ( NULL == (*out_gradients) ||  // Symbolic variable has no real backing
             PyArray_NDIM( *out_gradients ) != 3 ||
             PyArray_DIMS( *out_gradients )[0] != PyArray_DIMS( in_activations )[0] ||
             PyArray_DIMS( *out_gradients )[1] != PyArray_DIMS( in_activations )[1] ||
             PyArray_DIMS( *out_gradients )[2] != PyArray_DIMS( in_activations )[2] )
        {
            // Existing matrix is the wrong size. Make a new one.
            // Decrement ref counter to existing array
            Py_XDECREF( *out_gradients );
            // Allocate new array
            *out_gradients = (PyArrayObject *) PyArray_ZEROS(3, PyArray_DIMS( in_activations ),
                NPY_FLOAT32, 0);
            if ( NULL == (*out_gradients) )
            {
                // Destroy previous CTC context before returning exception
                ctc_context_destroy( context );
                PyErr_Format( PyExc_MemoryError,
                    "ConnectionistTemporalClassification: Could not allocate memory for CTC gradients!" );
                return 1;
            }
        }
        gradients = (npy_float32 *) PyArray_DATA( *out_gradients );
    }

    /* Ask warp-ctc how much scratch memory this problem needs. */
    size_t cpu_workspace_size;
    int ctc_error;
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &cpu_workspace_size ),
        "Failed to obtain CTC workspace size." );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        return 1;
    }

    context->workspace = malloc( cpu_workspace_size );
    if ( NULL == context->workspace )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }

    /* Run the actual loss (and gradient, if `gradients` != NULL) computation. */
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, context->workspace,
        context->options ), "Failed to compute CTC loss function." );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        return 1;
    }

    ctc_context_destroy( context );
    return 0;
}
from __future__ import (division, absolute_import, print_function)
import unittest
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.ctc import (ctc_available, ctc, ConnectionistTemporalClassification)
def setup_torch_case():
    """Return [activations, labels, durations, expected costs, expected grads]
    for the example from the warp-ctc Torch tutorial:
    https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    Activation layout, slowest to fastest dimension: (time, batchSize, inputLayerSize)."""
    acts = np.asarray(
        [[[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]],
         [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]],
         [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]]],
        dtype=np.float32)
    # Number of time steps of each sequence in the minibatch
    seq_durations = np.asarray([1, 3, 3], dtype=np.int32)
    # One target label sequence per row; -1 entries are padding
    target_labels = np.asarray([[1, -1],
                                [3, 3],
                                [2, 3]], dtype=np.int32)
    ref_costs = np.asarray([1.609437943, 7.355742931, 4.938849926],
                           dtype=np.float32)
    ref_grads = np.asarray(
        [[[0.2, -0.8, 0.2, 0.2, 0.2],
          [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
          [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]],
         [[0, 0, 0, 0, 0],
          [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627],
          [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]],
         [[0, 0, 0, 0, 0],
          [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
          [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]]],
        dtype=np.float32)
    return [acts, target_labels, seq_durations, ref_costs, ref_grads]
def setup_ctc_case():
    """Return [activations, labels, durations, expected costs, expected grads]
    for a small hand-checked batch: 2 time steps, 2 sequences, alphabet of 5."""
    acts = np.asarray(
        [[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]],
        dtype=np.float32)
    seq_durations = np.asarray([2, 2], dtype=np.int32)
    target_labels = np.asarray([[1, 2], [1, 2]], dtype=np.int32)
    # Reference outputs obtained through an external C program that calls
    # the warp-ctc library directly.
    ref_costs = np.asarray([2.962858438, 3.053659201], dtype=np.float32)
    ref_grads = np.asarray(
        [[[0.177031219, -0.7081246376, 0.177031219, 0.177031219, 0.177031219],
          [0.177031219, -0.8229685426, 0.291875124, 0.177031219, 0.177031219]],
         [[0.291875124, 0.177031219, -0.8229685426, 0.177031219, 0.177031219],
          [0.1786672771, 0.1786672771, -0.7334594727, 0.1974578798, 0.1786672771]]],
        dtype=np.float32)
    return [acts, target_labels, seq_durations, ref_costs, ref_grads]
def setup_grad_case():
    """Return [activations, labels, durations] — the same inputs as
    setup_ctc_case but without reference outputs; used for numeric
    gradient verification."""
    acts = np.asarray(
        [[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]],
        dtype=np.float32)
    seq_durations = np.asarray([2, 2], dtype=np.int32)
    target_labels = np.asarray([[1, 2], [1, 2]], dtype=np.int32)
    return [acts, target_labels, seq_durations]
class TestCTC(unittest.TestCase):
    """
    Test Baidu CTC wrapper implementation.

    Expected values for costs and gradients are obtained through an external
    C implementation, that uses the library directly.
    """
    def setUp(self):
        # All tests need a working warp-ctc installation; skip otherwise.
        if not ctc_available():
            self.skipTest('Optional library warp-ctc not available')

    def run_ctc(self, activations, labels, input_length, expected_costs, expected_grads):
        """Build the CTC graph, compile, and compare cost and gradient
        against the reference values."""
        # Create symbolic variables
        t_activations = theano.shared(activations, name="activations")
        t_activation_times = theano.shared(input_length, name="activation_times")
        t_labels = theano.shared(labels, name="labels")
        t_cost = ctc(t_activations, t_labels, t_activation_times)
        # Symbolic gradient of CTC cost
        t_grad = T.grad(T.mean(t_cost), t_activations)
        # Compile symbolic functions
        train = theano.function([], [t_cost, t_grad])
        cost, grad = train()
        # T.mean divides by the batch size, so the per-example reference
        # gradients are scaled down accordingly before comparison.
        utt.assert_allclose(expected_grads / cost.shape[0], grad)
        utt.assert_allclose(expected_costs, cost)
        self.check_grads_disabled(t_activations, t_labels, t_activation_times)

    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        # When only the cost is requested, local_ctc_no_grad should have
        # rewritten the node with compute_grad=False.
        ctc_cost = ctc(activations, labels, input_length)
        ctc_function = theano.function([], [ctc_cost])
        for node in ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                assert (node.op.compute_grad is False)

    def test_torch_case(self):
        # Reference case from the warp-ctc Torch tutorial.
        activations, labels, input_length, expected_costs, expected_grads = setup_torch_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_ctc(self):
        activations, labels, input_length, expected_costs, expected_grads = setup_ctc_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_verify_grad(self):
        # Numeric gradient check of the Op via utt.verify_grad.
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = theano.shared(in_lengths, name="activation_times")
                t_labels = theano.shared(labels, name="labels")
                return ctc(acts, t_labels, t_activation_times)
            return wrapper

        activations, labels, activation_times = setup_grad_case()
        ctc_op = ctc_op_functor(labels, activation_times)
        utt.verify_grad(ctc_op, [activations])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论