Refactor CTC GPU wrapper to make use of L_op and allocate costs and gradients in make_node

Signed-off-by: João Victor Tozatti Risso <joaovictor.risso@gmail.com>

Refactor CTC GPU wrapper to make use of L_op and allocate costs and gradients in make_node
80d8bf27 · João Victor Tozatti Risso · ba6d2de5 · 80d8bf27
--- a/theano/gpuarray/ctc.py
+++ b/theano/gpuarray/ctc.py
 from __future__ import absolute_import, print_function, division

 import theano
-from theano import Op
-from theano import config
+from theano import (config, gof)
 import theano.tensor as T
-from .basic_ops import (gpu_contiguous, as_gpuarray_variable,
-                        infer_context_name, CGpuKernelBase)
+from .basic_ops import (gpu_contiguous, as_gpuarray_variable, infer_context_name)
 import theano.tensor.nnet.ctc
-from .type import GpuArrayType
+from .type import (GpuArrayType, gpu_context_type)
 from .elemwise import GpuDimShuffle
 from theano.gradient import grad_undefined
 from theano.gof import local_optimizer
@@ -20,32 +18,44 @@ import pygpu
 ctc_enabled = config.ctc.enabled


-class GpuConnectionistTemporalClassification(CGpuKernelBase, Op):
+class GpuConnectionistTemporalClassification(gof.COp):
    """
    GPU wrapper for Baidu CTC loss function.
+
+    Parameters
+    ----------
+    activations
+        Three-dimensional tensor, which has a shape of (t, m, p), where
+        t is the time index, m is the minibatch index, and p is the index
+        over the probabilities of each symbol in the alphabet. The memory
+        layout is assumed to be in C-order, which consists in the slowest
+        to the fastest changing dimension, from left to right. In this case,
+        p is the fastest changing dimension.
+    labels
+        A 1-D tensor of all the labels for the minibatch.
+    input_lengths
+        A 1-D tensor with the number of time steps for each sequence in
+        the minibatch.
+
+    Returns
+    -------
+    1-D tensor
+        Cost of each example in the minibatch; the tensor has one
+        entry per minibatch example.
    """
-    __props__ = ('context_name', 'compute_grad',)
+    __props__ = ('compute_grad',)

    func_file = "./ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

-    def __init__(self, compute_grad=True, context_name=None):
+    params_type = gpu_context_type
+
+    def __init__(self, compute_grad=True):
        if not compute_grad:
            self.func_name = "APPLY_SPECIFIC(ctc_cost_gpu_no_grad)"
        self.compute_grad = compute_grad
-        self.context_name = context_name
-
-        Op.__init__(self)
-        CGpuKernelBase.__init__(self, self.func_file, self.func_name)

-        self.costs = GpuArrayType(dtype='float32',
-                                  broadcastable=(False,),
-                                  context_name=self.context_name)
-
-        if self.compute_grad:
-            self.gradients = GpuArrayType(dtype='float32',
-                                          broadcastable=(False, False, False,),
-                                          context_name=self.context_name)
+        gof.COp.__init__(self, self.func_file, self.func_name)

        if config.ctc.root == "":
            raise ValueError('ctc.root variable is not set, please set it '
@@ -75,35 +85,19 @@ class GpuConnectionistTemporalClassification(CGpuKernelBase, Op):
        return ['ctc.h', 'numpy_compat.h', 'gpuarray_helper.h', 'gpuarray/types.h',
                'gpuarray_api.h', 'gpuarray/array.h', 'gpuarray/util.h']

-    def make_node(self, activations, labels, input_lengths):
-        """
-        Parameters
-        ----------
-        activations
-            Three-dimensional tensor, which has a shape of (t, m, p), where
-            t is the time index, m is the minibatch index, and p is the index
-            over the probabilities of each symbol in the alphabet. The memory
-            layout is assumed to be in C-order, which consists in the slowest
-            to the fastest changing dimension, from left to right. In this case,
-            p is the fastest changing dimension.
-        labels
-            A 1-D tensor of all the labels for the minibatch.
-        input_lengths
-            A 1-D tensor with the number of time steps for each sequence in
-            the minibatch.
-
-        """
+    def get_params(self, node):
+        return node.inputs[0].type.context

+    def make_node(self, activations, labels, input_lengths):
        if not ctc_enabled:
            raise RuntimeError('Baidu CTC is not enabled and '
                               'GpuConnectionistTemporalClassification Op '
                               'can not be constructed.')

        context = infer_context_name(activations, labels, input_lengths)
-        assert context == self.context_name

        t_activations = as_gpuarray_variable(activations,
-                                             context_name=self.context_name)
+                                             context_name=context)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)

@@ -123,24 +117,31 @@ class GpuConnectionistTemporalClassification(CGpuKernelBase, Op):
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

-        out_params = [as_gpuarray_variable(self.costs(), context_name=self.context_name)]
-        if self.gradients is not None:
-            out_params.append(as_gpuarray_variable(self.gradients(),
-                                                   context_name=self.context_name))
+        costs = GpuArrayType(dtype='float32',
+                             broadcastable=(False,),
+                             context_name=context)()
+
+        if self.compute_grad:
+            gradients = GpuArrayType(dtype='float32',
+                                     broadcastable=(False, False, False,),
+                                     context_name=context)()

        return theano.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
-                            outputs=out_params)
+                            outputs=[costs, gradients])

-    def grad(self, inputs, output_grads):
+    def L_op(self, inputs, outputs, output_grads):
        if not ctc_enabled:
            raise RuntimeError('Baidu CTC is not enabled and '
                               'GpuConnectionistTemporalClassification Op '
                               'can not be constructed.')
-        z = output_grads[0]
-        grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False),
-                                     new_order=(1, 0, 2))(self.gradients())
-        grad_bdot = T.basic.batched_dot(z, grad_shuffle)
-        grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False),
+        # Gradients computed by Op
+        gradients = outputs[1]
+        # Gradients of original function, to compose chain rule
+        grad_op = output_grads[0]
+        grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False,),
+                                     new_order=(1, 0, 2))(gradients)
+        grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
+        grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                             new_order=(1, 0, 2))(grad_bdot)
        return [grad_shuffle_reverse,
                grad_undefined(self, 1, inputs[1]),