提交 de4be8be authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Document sandbox/gpuarray/opt_util.py

上级 0ad41ce5
...@@ -1264,39 +1264,39 @@ optdb.register('local_dnna_conv_inplace', ...@@ -1264,39 +1264,39 @@ optdb.register('local_dnna_conv_inplace',
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
......
...@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node): ...@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt() @opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2) @alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs): def local_gemm16_alpha_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)] return [Gemm16(relu=node.op.relu)(*inputs)]
@opt.register_opt() @opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2) @output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs): def local_gemm16_output_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)] return [Gemm16(relu=node.op.relu)(*inputs)]
......
...@@ -14,16 +14,28 @@ _one = scal.constant(numpy.asarray(1.0, dtype='float64')) ...@@ -14,16 +14,28 @@ _one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd): def grab_cpu_scalar(v, nd):
"""
Get a scalar variable value from the tree at `v`.
This function will dig through transfers and dimshuffles to get
the constant value. If no such constant is found, it returns None.
Parameters
----------
v : variable
Theano variable to extract the constant value from.
nd : int
Expected number of dimensions for the variable (for
broadcasted constants).
"""
if v.owner is not None: if v.owner is not None:
n = v.owner n = v.owner
if (isinstance(n.op, GpuDimShuffle) and elif (isinstance(n.op, (GpuDimShuffle, DimShuffle)) and
n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd): n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0]) return grab_cpu_scalar(n.inputs[0], n.inputs[0].ndim)
elif isinstance(n.op, GpuFromHost): elif isinstance(n.op, (GpuFromHost, HostFromGpu)):
return grab_cpu_scalar(n.inputs[0], nd=nd) return grab_cpu_scalar(n.inputs[0], nd)
else: else:
return None return None
else: else:
...@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd): ...@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
def find_node(v, cls, ignore_clients=False): def find_node(v, cls, ignore_clients=False):
# This digs through possibly redundant transfers to find the node """
# that has the op class specified. If ignore_clients is False (the Find the node that has an op of type `cls` in `v`.
# default) it will only dig through nodes that have a single
# client. This digs through possibly redundant transfers to find the node
that has the type `cls`. If `ignore_clients` is False (the
default) it will only dig through nodes that have a single client
to avoid duplicating computations.
Parameters
----------
v : variable
The variable to dig through
cls : Op class
The type of the node we are looking for
ignore_clients : bool, optional
Whether to ignore multiple clients or not.
"""
if v.owner is not None and (ignore_clients or len(v.clients) == 1): if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls): if isinstance(v.owner.op, cls):
return v.owner return v.owner
...@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False): ...@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
def is_equal(var, val): def is_equal(var, val):
# Returns True if var is always equal to val (python value), False """
# otherwise (including if var is not constant) Returns True if `var` is always equal to `val`.
This will only return True if the variable will always be equal to
the value. If it might not be true in some cases then it returns False.
Parameters
----------
var : variable
Variable to compare
val : value
Python value
"""
try: try:
v = get_scalar_constant_value(var) v = get_scalar_constant_value(var)
return v == val return v == val
...@@ -59,7 +97,57 @@ def is_equal(var, val): ...@@ -59,7 +97,57 @@ def is_equal(var, val):
return False return False
def alpha_merge(cls, alpha_in, beta_in, nd): def alpha_merge(cls, alpha_in, beta_in):
"""
Decorator to merge multiplication by a scalar on the output.
This will find a pattern of scal * <yourop>(some, params, alpha,
beta) and update it so that the scalar multiplication happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
"""
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
...@@ -70,11 +158,12 @@ def alpha_merge(cls, alpha_in, beta_in, nd): ...@@ -70,11 +158,12 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
targ = find_node(node.inputs[0], cls) targ = find_node(node.inputs[0], cls)
if targ is None: if targ is None:
targ = find_node(node.inputs[1], cls) targ = find_node(node.inputs[1], cls)
lr = grab_cpu_scalar(node.inputs[0], nd=nd) if targ is None:
return
lr = grab_cpu_scalar(node.inputs[0], nd=targ.ndim)
else: else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd) lr = grab_cpu_scalar(node.inputs[1], nd=targ.ndim)
if (lr is None or targ is None or if lr is None or lr.dtype != targ.outputs[0].dtype:
lr.dtype != targ.outputs[0].dtype):
return None return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
try: try:
...@@ -96,7 +185,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd): ...@@ -96,7 +185,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
return wrapper return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd): def output_merge(cls, alpha_in, beta_in, out_in):
"""
Decorator to merge addition by a value on the output.
This will find a pattern of val + <yourop>(some, params, alpha,
beta, out_like) and update it so that the addition happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
out_in : int
The input index for the out_like input for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
This also correctly handles the case where the added value is
broadcasted (by not performing the replacement).
"""
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
...@@ -129,6 +273,40 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd): ...@@ -129,6 +273,40 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
def inplace_allocempty(op, idx): def inplace_allocempty(op, idx):
"""
Wrapper to make an inplace optimization that deals with AllocEmpty
This will duplicate the alloc input if it has more than one client
to allow the op to work on it inplace.
The decorated function must have this signature:
maker(node, inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
You should also switch the op to work inplace. The `*inputs`
parameters contains the new inputs for your op. You MUST use
those inputs instead of the ones on `node`. Note that this
function can be as simple as:
def maker(node, inputs):
return node.op.__class__(inplace=True)(*inputs)
Parameters
----------
op : op class
The op class to look for to make inplace
idx : int
The index of the (possibly) AllocEmpty input (in node.inputs).
Returns
-------
This returns an unregistered inplace local optimizer that has the
same name as the decorated function.
"""
def wrapper(maker): def wrapper(maker):
@local_optimizer([op], inplace=True) @local_optimizer([op], inplace=True)
@wraps(maker) @wraps(maker)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论