Fix pygpu memory allocations and flattening of lengths and labels

上级 808c8c64
#section kernels
#section support_code #section support_code
typedef struct ctc_context { typedef struct ctc_context {
...@@ -23,7 +25,7 @@ void ctc_context_init(ctc_context_t * context) ...@@ -23,7 +25,7 @@ void ctc_context_init(ctc_context_t * context)
void ctc_context_destroy(ctc_context_t * context) void ctc_context_destroy(ctc_context_t * context)
{ {
if ( NULL != context->workspace ) if ( NULL != context->workspace )
free( context->workspace ); cudaFree( context->workspace );
if ( NULL != context->input_lengths ) if ( NULL != context->input_lengths )
free( context->input_lengths ); free( context->input_lengths );
...@@ -51,11 +53,62 @@ int ctc_check_result(ctcStatus_t retcode, const char * msg) ...@@ -51,11 +53,62 @@ int ctc_check_result(ctcStatus_t retcode, const char * msg)
return 0; return 0;
} }
#section support_code_struct void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
int ** input_lengths )
{
npy_int num_elements = PyArray_DIMS( input_lengths_arr )[0];
*input_lengths = (int *) malloc( num_elements * sizeof(int) );
if ( NULL == (*input_lengths) )
return;
for( npy_int elem_idx = 0; elem_idx < num_elements; ++elem_idx )
{
(*input_lengths)[elem_idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, elem_idx ) );
}
}
/* Flatten a 2-D label matrix into the packed (ragged) representation
 * expected by warp-ctc: one contiguous array containing only the real
 * labels of every row, plus a per-row count of how many labels that row
 * contributed.  Negative entries are treated as padding and skipped.
 *
 * On success *flat_labels and *label_lengths are malloc'd buffers the
 * caller must free().  On allocation failure both are left/reset to NULL
 * (the caller raises the Python MemoryError).
 * NOTE(review): element reads assume the matrix dtype is 32-bit int
 * (npy_int) — confirm against the Python-side cast of the labels input.
 */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
    int ** label_lengths )
{
    npy_int rows = PyArray_DIMS( label_matrix )[0];
    npy_int cols = PyArray_DIMS( label_matrix )[1];

    /* Cast to size_t BEFORE multiplying: `rows * cols` evaluated in
       (32-bit) npy_int can overflow for large label matrices, producing
       an undersized allocation and a heap overflow in the fill loop. */
    *flat_labels = (int *) malloc( (size_t) rows * (size_t) cols * sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;

    *label_lengths = (int *) malloc( (size_t) rows * sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        /* Partial failure: release the first buffer and reset it so the
           caller sees a consistent "nothing allocated" state. */
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }

    npy_int label_index = 0;
    for( npy_int row_idx = 0; row_idx < rows; ++row_idx )
    {
        npy_int label_length = 0;
        for( npy_int col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 ) // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
PyGpuArrayObject * in_labels, PyArrayObject * in_labels,
PyGpuArrayObject * in_input_lengths, PyArrayObject * in_input_lengths,
PyGpuArrayObject ** out_costs, PyGpuArrayObject ** out_costs,
PyGpuArrayObject ** out_gradients, PyGpuArrayObject ** out_gradients,
PyGpuContextObject * ctx) PyGpuContextObject * ctx)
...@@ -64,34 +117,44 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -64,34 +117,44 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
ctc_context_t * context = &ctc_object; ctc_context_t * context = &ctc_object;
ctc_context_init( context ); ctc_context_init( context );
if ( !PyArray_IS_C_CONTIGUOUS( in_activations ) ) npy_float32 * activations = (npy_float32 *) PyGpuArray_DEV_DATA( in_activations );
create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );
if ( NULL == context->input_lengths )
{ {
PyErr_SetString( PyExc_RuntimeError, PyErr_Format( PyExc_MemoryError,
"activations array must be C-contiguous." ); "Could not allocate storage for input lengths" );
return 1; return 1;
} }
npy_float32 * activations = (npy_float32 *) PyArray_DATA( in_activations ); // flatten labels to conform with library memory layout
create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );
// TODO: flatten input_lengths to conform with underlying library memory layout
// TODO: flatten labels to conform with underlying library memory layout if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
{
// Destroy previous CTC context before returning exception
ctc_context_destroy( context );
PyErr_Format( PyExc_MemoryError,
"Could not allocate storage for labels and their lengths" );
return 1;
}
const npy_int minibatch_size = PyArray_DIMS( in_activations )[1]; const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1];
const npy_int alphabet_size = PyArray_DIMS( in_activations )[2]; const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2];
npy_float32 * costs = NULL; npy_float32 * costs = NULL;
const npy_intp cost_size = minibatch_size; const size_t cost_size = minibatch_size;
if (NULL == *out_costs || // symbolic variable has no real backing if (NULL == *out_costs || // symbolic variable has no real backing
PyArray_NDIM( *out_costs ) != 1 || PyGpuArray_NDIM( *out_costs ) != 1 ||
PyArray_DIMS( *out_costs )[0] != cost_size) PyGpuArray_DIMS( *out_costs )[0] != cost_size)
{ {
PY_XDECREF( *out_costs ); Py_XDECREF( *out_costs );
*out_costs = pygpu_zeros(1, cost_size, GA_FLOAT, GA_C_ORDER, *out_costs = pygpu_zeros( 1, &cost_size, GA_FLOAT, GA_C_ORDER,
ctx, Py_None); ctx, Py_None );
if ( NULL == *out_costs ) if ( NULL == *out_costs )
{ {
...@@ -104,21 +167,23 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -104,21 +167,23 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
} }
} }
costs = (npy_float32 *) PyArray_DATA( *out_costs ); costs = (npy_float32 *) PyGpuArray_DEV_DATA( *out_costs );
npy_float32 * gradients = NULL; npy_float32 * gradients = NULL;
if ( NULL != out_gradients ) // if gradient computation is not disabled if ( NULL != out_gradients ) // if gradient computation is not disabled
{ {
if ( NULL == *out_gradients || if ( NULL == *out_gradients ||
PyArray_NDIM( *out_gradients ) != 3 || PyGpuArray_NDIM( *out_gradients ) != 3 ||
PyArray_DIMS( *out_gradients )[0] != PyArray_DIMS( in_activations )[0] || PyGpuArray_DIMS( *out_gradients )[0] != PyGpuArray_DIMS( in_activations )[0] ||
PyArray_DIMS( *out_gradients )[1] != PyArray_DIMS( in_activations )[1] || PyGpuArray_DIMS( *out_gradients )[1] != PyGpuArray_DIMS( in_activations )[1] ||
PyArray_DIMS( *out_gradients )[2] != PyArray_DIMS( in_activations )[2] ) PyGpuArray_DIMS( *out_gradients )[2] != PyGpuArray_DIMS( in_activations )[2] )
{ {
Py_XDECREF( *out_gradients ); Py_XDECREF( *out_gradients );
*out_gradients = pygpu_zeros( 3, PyArray_DIMS( in_activations ), GA_FLOAT, 0 ); const size_t * activation_dims = PyGpuArray_DIMS( in_activations );
*out_gradients = pygpu_zeros( 3, activation_dims, GA_FLOAT, GA_C_ORDER,
ctx, Py_None );
if ( NULL == *out_gradients ) if ( NULL == *out_gradients )
{ {
...@@ -130,7 +195,32 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -130,7 +195,32 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
} }
} }
gradients = (npy_float32 *) PyArray_DATA( *out_gradients ); gradients = (npy_float32 *) PyGpuArray_DEV_DATA( *out_gradients );
}
size_t gpu_workspace_size;
int ctc_error = 0;
ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
context->input_lengths, alphabet_size, minibatch_size, context->options,
&gpu_workspace_size ),
"Failed to obtain CTC workspace size!" );
if ( ctc_error ) // Exception is set by ctc_check_result, return error here
{
// Destroy previous CTC context before returning exception
ctc_context_destroy( context );
return 1;
}
if ( cudaSuccess != cudaMalloc( &(context->workspace), gpu_workspace_size ) )
{
ctc_context_destroy( context );
PyErr_Format( PyExc_MemoryError,
"Failed to allocate memory for CTC workspace!" );
return 1;
} }
ctc_context_destroy( context ); ctc_context_destroy( context );
...@@ -139,8 +229,8 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -139,8 +229,8 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
} }
int APPLY_SPECIFIC(ctc_cost_gpu_no_grad)(PyGpuArrayObject * in_activations, int APPLY_SPECIFIC(ctc_cost_gpu_no_grad)(PyGpuArrayObject * in_activations,
PyGpuArrayObject * in_labels, PyArrayObject * in_labels,
PyGpuArrayObject * in_input_lengths, PyArrayObject * in_input_lengths,
PyGpuArrayObject ** out_costs, PyGpuArrayObject ** out_costs,
PyGpuContextObject * ctx) PyGpuContextObject * ctx)
{ {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论