Fix pygpu memory allocations and flattening of lengths and labels

上级 808c8c64
#section kernels
#section support_code #section support_code
typedef struct ctc_context { typedef struct ctc_context {
...@@ -23,7 +25,7 @@ void ctc_context_init(ctc_context_t * context) ...@@ -23,7 +25,7 @@ void ctc_context_init(ctc_context_t * context)
void ctc_context_destroy(ctc_context_t * context) void ctc_context_destroy(ctc_context_t * context)
{ {
if ( NULL != context->workspace ) if ( NULL != context->workspace )
free( context->workspace ); cudaFree( context->workspace );
if ( NULL != context->input_lengths ) if ( NULL != context->input_lengths )
free( context->input_lengths ); free( context->input_lengths );
...@@ -51,11 +53,62 @@ int ctc_check_result(ctcStatus_t retcode, const char * msg) ...@@ -51,11 +53,62 @@ int ctc_check_result(ctcStatus_t retcode, const char * msg)
return 0; return 0;
} }
#section support_code_struct void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
int ** input_lengths )
{
npy_int num_elements = PyArray_DIMS( input_lengths_arr )[0];
*input_lengths = (int *) malloc( num_elements * sizeof(int) );
if ( NULL == (*input_lengths) )
return;
for( npy_int elem_idx = 0; elem_idx < num_elements; ++elem_idx )
{
(*input_lengths)[elem_idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, elem_idx ) );
}
}
/* Flatten a 2-D label matrix into the packed (ragged) representation
 * expected by warp-ctc: one contiguous array containing only the real
 * labels of every row, plus a per-row count of how many labels that row
 * contributed.  Negative entries are treated as padding and skipped.
 *
 * On success *flat_labels and *label_lengths are malloc'd buffers the
 * caller must free().  On allocation failure both are left/reset to NULL
 * (the caller raises the Python MemoryError).
 * NOTE(review): element reads assume the matrix dtype is 32-bit int
 * (npy_int) — confirm against the Python-side cast of the labels input.
 */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
    int ** label_lengths )
{
    npy_int rows = PyArray_DIMS( label_matrix )[0];
    npy_int cols = PyArray_DIMS( label_matrix )[1];

    /* Cast to size_t BEFORE multiplying: `rows * cols` evaluated in
       (32-bit) npy_int can overflow for large label matrices, producing
       an undersized allocation and a heap overflow in the fill loop. */
    *flat_labels = (int *) malloc( (size_t) rows * (size_t) cols * sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;

    *label_lengths = (int *) malloc( (size_t) rows * sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        /* Partial failure: release the first buffer and reset it so the
           caller sees a consistent "nothing allocated" state. */
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }

    npy_int label_index = 0;
    for( npy_int row_idx = 0; row_idx < rows; ++row_idx )
    {
        npy_int label_length = 0;
        for( npy_int col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 ) // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
PyGpuArrayObject * in_labels, PyArrayObject * in_labels,
PyGpuArrayObject * in_input_lengths, PyArrayObject * in_input_lengths,
PyGpuArrayObject ** out_costs, PyGpuArrayObject ** out_costs,
PyGpuArrayObject ** out_gradients, PyGpuArrayObject ** out_gradients,
PyGpuContextObject * ctx) PyGpuContextObject * ctx)
...@@ -64,34 +117,44 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -64,34 +117,44 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
ctc_context_t * context = &ctc_object; ctc_context_t * context = &ctc_object;
ctc_context_init( context ); ctc_context_init( context );
if ( !PyArray_IS_C_CONTIGUOUS( in_activations ) ) npy_float32 * activations = (npy_float32 *) PyGpuArray_DEV_DATA( in_activations );
create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );
if ( NULL == context->input_lengths )
{ {
PyErr_SetString( PyExc_RuntimeError, PyErr_Format( PyExc_MemoryError,
"activations array must be C-contiguous." ); "Could not allocate storage for input lengths" );
return 1; return 1;
} }
npy_float32 * activations = (npy_float32 *) PyArray_DATA( in_activations ); // flatten labels to conform with library memory layout
create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );
// TODO: flatten input_lengths to conform with underlying library memory layout
// TODO: flatten labels to conform with underlying library memory layout if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
{
// Destroy previous CTC context before returning exception
ctc_context_destroy( context );
PyErr_Format( PyExc_MemoryError,
"Could not allocate storage for labels and their lengths" );
return 1;
}
const npy_int minibatch_size = PyArray_DIMS( in_activations )[1]; const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1];
const npy_int alphabet_size = PyArray_DIMS( in_activations )[2]; const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2];
npy_float32 * costs = NULL; npy_float32 * costs = NULL;
const npy_intp cost_size = minibatch_size; const size_t cost_size = minibatch_size;
if (NULL == *out_costs || // symbolic variable has no real backing if (NULL == *out_costs || // symbolic variable has no real backing
PyArray_NDIM( *out_costs ) != 1 || PyGpuArray_NDIM( *out_costs ) != 1 ||
PyArray_DIMS( *out_costs )[0] != cost_size) PyGpuArray_DIMS( *out_costs )[0] != cost_size)
{ {
PY_XDECREF( *out_costs ); Py_XDECREF( *out_costs );
*out_costs = pygpu_zeros(1, cost_size, GA_FLOAT, GA_C_ORDER, *out_costs = pygpu_zeros( 1, &cost_size, GA_FLOAT, GA_C_ORDER,
ctx, Py_None); ctx, Py_None );
if ( NULL == *out_costs ) if ( NULL == *out_costs )
{ {
...@@ -104,21 +167,23 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -104,21 +167,23 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
} }
} }
costs = (npy_float32 *) PyArray_DATA( *out_costs ); costs = (npy_float32 *) PyGpuArray_DEV_DATA( *out_costs );
npy_float32 * gradients = NULL; npy_float32 * gradients = NULL;
if ( NULL != out_gradients ) // if gradient computation is not disabled if ( NULL != out_gradients ) // if gradient computation is not disabled
{ {
if ( NULL == *out_gradients || if ( NULL == *out_gradients ||
PyArray_NDIM( *out_gradients ) != 3 || PyGpuArray_NDIM( *out_gradients ) != 3 ||
PyArray_DIMS( *out_gradients )[0] != PyArray_DIMS( in_activations )[0] || PyGpuArray_DIMS( *out_gradients )[0] != PyGpuArray_DIMS( in_activations )[0] ||
PyArray_DIMS( *out_gradients )[1] != PyArray_DIMS( in_activations )[1] || PyGpuArray_DIMS( *out_gradients )[1] != PyGpuArray_DIMS( in_activations )[1] ||
PyArray_DIMS( *out_gradients )[2] != PyArray_DIMS( in_activations )[2] ) PyGpuArray_DIMS( *out_gradients )[2] != PyGpuArray_DIMS( in_activations )[2] )
{ {
Py_XDECREF( *out_gradients ); Py_XDECREF( *out_gradients );
*out_gradients = pygpu_zeros( 3, PyArray_DIMS( in_activations ), GA_FLOAT, 0 ); const size_t * activation_dims = PyGpuArray_DIMS( in_activations );
*out_gradients = pygpu_zeros( 3, activation_dims, GA_FLOAT, GA_C_ORDER,
ctx, Py_None );
if ( NULL == *out_gradients ) if ( NULL == *out_gradients )
{ {
...@@ -130,7 +195,32 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -130,7 +195,32 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
} }
} }
gradients = (npy_float32 *) PyArray_DATA( *out_gradients ); gradients = (npy_float32 *) PyGpuArray_DEV_DATA( *out_gradients );
}
size_t gpu_workspace_size;
int ctc_error = 0;
ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
context->input_lengths, alphabet_size, minibatch_size, context->options,
&gpu_workspace_size ),
"Failed to obtain CTC workspace size!" );
if ( ctc_error ) // Exception is set by ctc_check_result, return error here
{
// Destroy previous CTC context before returning exception
ctc_context_destroy( context );
return 1;
}
if ( cudaSuccess != cudaMalloc( &(context->workspace), gpu_workspace_size ) )
{
ctc_context_destroy( context );
PyErr_Format( PyExc_MemoryError,
"Failed to allocate memory for CTC workspace!" );
return 1;
} }
ctc_context_destroy( context ); ctc_context_destroy( context );
...@@ -139,8 +229,8 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, ...@@ -139,8 +229,8 @@ int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations,
} }
int APPLY_SPECIFIC(ctc_cost_gpu_no_grad)(PyGpuArrayObject * in_activations, int APPLY_SPECIFIC(ctc_cost_gpu_no_grad)(PyGpuArrayObject * in_activations,
PyGpuArrayObject * in_labels, PyArrayObject * in_labels,
PyGpuArrayObject * in_input_lengths, PyArrayObject * in_input_lengths,
PyGpuArrayObject ** out_costs, PyGpuArrayObject ** out_costs,
PyGpuContextObject * ctx) PyGpuContextObject * ctx)
{ {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论