Commit 065e0f5e authored by Arnaud Bergeron

Factor out and generalize grab_ger and grab_lr.

Parent 1586d6d8
import numpy
import theano
from theano import Apply, tensor, scalar, Constant
from theano.tensor import DimShuffle, discrete_dtypes
from theano import Apply, tensor, scalar
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
......@@ -645,19 +645,6 @@ if cuda_available:
if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)]
def grab_ger(v):
    """Return the apply node that produced *v* if its op is SparseBlockOuterSS.

    Digs through a redundant GpuFromHost(HostFromGpu(...)) transfer pair,
    because the cut_transfers optimization may not have run before this one.
    Returns None when no SparseBlockOuterSS node is found.
    """
    node = v.owner
    if node is None:
        return None
    if isinstance(node.op, SparseBlockOuterSS):
        return node
    if isinstance(node.op, GpuFromHost):
        inner = node.inputs[0].owner
        if inner is not None and isinstance(inner.op, HostFromGpu):
            # Skip the round-trip transfer and keep digging.
            return grab_ger(inner.inputs[0])
    return None
# Should be run before elemwise fusion
@opt.register_opt()
@opt.local_optimizer([GpuElemwise])
......@@ -665,33 +652,15 @@ if cuda_available:
"""
GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
"""
def grab_lr(v, nd=4):
    """Extract a CPU scalar (e.g. a learning rate) from *v*, if possible.

    Recognizes a scalar broadcast to *nd* dimensions via DimShuffle /
    GpuDimShuffle (digging through GpuFromHost transfers), or an
    all-broadcastable Constant.

    Parameters
    ----------
    v : Variable
        The variable suspected of being a broadcasted scalar.
    nd : int, optional
        Number of broadcast dimensions to match (default 4, the original
        hard-coded pattern, so existing callers are unaffected).

    Returns
    -------
    Variable or None
        The underlying scalar variable, or None when *v* does not match.
    """
    pattern = ('x',) * nd
    if v.owner is not None:
        n = v.owner
        if (isinstance(n.op, GpuDimShuffle) and
                n.op.new_order == pattern):
            # Scalar lives on the GPU; bring it back to the host.
            return host_from_gpu(n.inputs[0])
        elif (isinstance(n.op, DimShuffle) and
                n.op.new_order == pattern):
            return n.inputs[0]
        elif isinstance(n.op, GpuFromHost):
            # Look through the transfer op.
            return grab_lr(n.inputs[0], nd=nd)
        else:
            return None
    else:
        if (isinstance(v, Constant) and
                v.broadcastable == (True,) * nd):
            return v.dimshuffle(())
        # Explicit, rather than falling off the end of the function.
        return None
if (isinstance(node.op, GpuElemwise) and
node.op.scalar_op == scalar.mul and
node.nin == 2):
ger = grab_ger(node.inputs[0])
ger = opt.find_node(node.inputs[0], SparseBlockOuterSS)
if ger is None:
ger = grab_ger(node.inputs[1])
lr = grab_lr(node.inputs[0])
ger = opt.find_node(node.inputs[1], SparseBlockOuterSS)
lr = opt.grab_cpu_scalar(node.inputs[0], nd=4)
else:
lr = grab_lr(node.inputs[1])
lr = opt.grab_cpu_scalar(node.inputs[1], nd=4)
if lr is None or ger is None:
return None
alpha = lr * ger.inputs[5]
......@@ -704,10 +673,10 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
(node.op.scalar_op == scalar.sub or
node.op.scalar_op == scalar.add) and
node.nin == 2):
ger = grab_ger(node.inputs[0])
ger = opt.find_node(node.inputs[0], SparseBlockOuterSS)
W = node.inputs[1]
if ger is None:
ger = grab_ger(node.inputs[1])
ger = opt.find_node(node.inputs[1], SparseBlockOuterSS)
W = node.inputs[0]
if ger is None:
return None
......
......@@ -10,7 +10,7 @@ import numpy
import theano
from theano import scalar as scal
from theano import config, tensor, gof
from theano import config, tensor, gof, Constant
import theano.ifelse
from theano.compile import optdb
......@@ -47,7 +47,7 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import nlinalg, DimShuffle
from theano.tensor.nnet.Conv3D import Conv3D
try:
......@@ -88,6 +88,38 @@ register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding)
def grab_cpu_scalar(v, nd):
    """Return the CPU scalar underlying *v*, a scalar broadcast to *nd* dims.

    Matches a (Gpu)DimShuffle whose new_order is ('x',) * nd, digging
    through GpuFromHost transfers, or an all-broadcastable Constant.
    Returns None when *v* does not look like a broadcasted scalar.
    """
    pattern = ('x',) * nd
    node = v.owner
    if node is None:
        # No owner: only an all-broadcastable constant qualifies.
        if (isinstance(v, Constant) and
                v.broadcastable == (True,) * nd):
            return v.dimshuffle(())
        return None
    if isinstance(node.op, GpuDimShuffle) and node.op.new_order == pattern:
        # The scalar is on the GPU; move it back to the host.
        return host_from_gpu(node.inputs[0])
    if isinstance(node.op, DimShuffle) and node.op.new_order == pattern:
        return node.inputs[0]
    if isinstance(node.op, GpuFromHost):
        # Look through the transfer op.
        return grab_cpu_scalar(node.inputs[0], nd=nd)
    return None
def find_node(v, cls):
    """Find the apply node producing *v* whose op is an instance of *cls*.

    Digs through possibly redundant GpuFromHost(HostFromGpu(...))
    transfer pairs to reach the node of the requested op class.
    Returns None when no such node is found.
    """
    node = v.owner
    while node is not None:
        if isinstance(node.op, cls):
            return node
        if isinstance(node.op, GpuFromHost):
            inner = node.inputs[0].owner
            if inner is not None and isinstance(inner.op, HostFromGpu):
                # Skip the GPU->host->GPU round trip and keep looking.
                node = inner.inputs[0].owner
                continue
        return None
    return None
# This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date.
......
......@@ -18,7 +18,8 @@ from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
sparse_block_gemv_ss,
sparse_block_outer_ss,
sparse_block_outer_ss_inplace)
sparse_block_outer_ss_inplace,
SparseBlockOuterSS)
from theano.sandbox.cuda.var import float32_shared_constructor
......@@ -186,13 +187,20 @@ def test_blocksparse_grad_merge():
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu)
# not running with mode=gpu ensures that the elemwise is not merged in
mode = None
if theano.config.mode == 'FAST_COMPILE':
mode = theano.compile.mode.get_mode('FAST_RUN')
# Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op, SparseBlockOuterSS)
# Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output')
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
# Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
SparseBlockOuterSS)
f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value()
......
Markdown format supported
0%
You are about to add 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment