提交 658bf2ef authored 作者: Xavier Bouthillier's avatar Xavier Bouthillier

Add optimizations and relative tests

上级 ed4e0095
...@@ -21,6 +21,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB, ...@@ -21,6 +21,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, toolbox) Optimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import LocalMetaOptimizer
from theano.sandbox.cuda import as_cuda_ndarray_variable from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.opt import register_meta_opt
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
gpu_eye, gpu_contiguous, gpu_eye, gpu_contiguous,
gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu, gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
...@@ -32,9 +33,9 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -32,9 +33,9 @@ from theano.sandbox.cuda.basic_ops import (
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve from theano.sandbox.cuda.cula import gpu_solve
...@@ -43,7 +44,10 @@ from theano.sandbox.cuda.blas import gpu_gemv_no_inplace ...@@ -43,7 +44,10 @@ from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad) GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
...@@ -266,8 +270,8 @@ def local_gpu_elemwise_0(node): ...@@ -266,8 +270,8 @@ def local_gpu_elemwise_0(node):
'uint16']) 'uint16'])
# case 1 - all inputs are already float32 # case 1 - all inputs are already float32
if all([i.type.dtype == 'float32' for i in node.inputs]): if all([i.type.dtype == 'float32' for i in node.inputs]):
# TODO: change this when fusion makes Elemwise with multiple # TODO: change this when fusion makes Elemwise with
# outputs # multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs)) for i in node.inputs))
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
...@@ -648,7 +652,7 @@ def local_gpu_gemv(node): ...@@ -648,7 +652,7 @@ def local_gpu_gemv(node):
""" """
gemvs = (tensor.blas.Gemv, gemvs = (tensor.blas.Gemv,
tensor.blas_c.CGemv, tensor.blas_c.CGemv,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, gemvs): if host_input.owner and isinstance(host_input.owner.op, gemvs):
...@@ -688,7 +692,7 @@ def local_gpu_ger(node): ...@@ -688,7 +692,7 @@ def local_gpu_ger(node):
gers = (tensor.blas_c.CGer, gers = (tensor.blas_c.CGer,
tensor.blas.Ger, tensor.blas.Ger,
tensor.blas_scipy.ScipyGer, tensor.blas_scipy.ScipyGer,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -741,11 +745,11 @@ def local_gpu_gemm(node): ...@@ -741,11 +745,11 @@ def local_gpu_gemm(node):
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu)) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu)) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm_no_inplace(as_cuda_ndarray_variable(z), return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
a, a,
as_cuda_ndarray_variable(x), gpu_from_host(x),
as_cuda_ndarray_variable(y), gpu_from_host(y),
b))] b))]
return False return False
...@@ -996,7 +1000,8 @@ def local_gpu_advanced_subtensor1(node): ...@@ -996,7 +1000,8 @@ def local_gpu_advanced_subtensor1(node):
if node.op.__class__ is tensor.AdvancedSubtensor1: if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32": if (x.owner and isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))] return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
return False return False
...@@ -1396,19 +1401,19 @@ def local_gpu_conv(node): ...@@ -1396,19 +1401,19 @@ def local_gpu_conv(node):
# print op.kshp, op.imshp[1:3] # print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw # print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode, ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy), subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw, logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical, logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned, logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp, kshp=op.kshp,
version=op.version, version=op.version,
direction_hint=op.direction_hint, direction_hint=op.direction_hint,
verbose=op.verbose, verbose=op.verbose,
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern, nkern=op.nkern,
bsize=op.bsize, bsize=op.bsize,
fft_opt=op.fft_opt fft_opt=op.fft_opt
) )
if op.imshp_logical is not None: if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3] logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]: if logical_img_hw != op.imshp[1:3]:
...@@ -2420,4 +2425,101 @@ optdb.register('gpu_scanOp_make_inplace', ...@@ -2420,4 +2425,101 @@ optdb.register('gpu_scanOp_make_inplace',
'inplace', 'inplace',
'scan') 'scan')
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @register_opt()
# @alpha_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, GpuSparseBlockOuter) -> GpuSparseBlockOuter(..., alpha=lr)
# """
# return [gpu_sparse_block_outer(*inputs)]
# @register_opt()
# @output_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [gpu_sparse_block_outer(*inputs)]
def _owner_isinstance(inp, test_class):
    """
    Return True when `inp` is produced by an apply node whose op is an
    instance of `test_class`, False otherwise (including free inputs
    with no owner).
    """
    producer = inp.owner
    if not producer:
        return False
    return isinstance(producer.op, test_class)
def _clear_host_from_gpu(inputs):
    """
    Return a copy of `inputs` where every variable produced by a
    HostFromGpu transfer is replaced by the transfer's input (i.e. the
    variable that already lives on the GPU). Other inputs pass through
    unchanged.
    """
    return [inp.owner.inputs[0] if _owner_isinstance(inp, HostFromGpu)
            else inp
            for inp in inputs]
@register_meta_opt(SparseBlockGemv, ["gpu_opt", "gpu_local_optimizations"],
                   0., 'fast_run', 'fast_compile', 'gpu')
@local_optimizer([SparseBlockGemv, GpuFromHost])
def gpu_sparse_block_gemv_opt(node):
    """
    Move SparseBlockGemv computations to the GPU.

    SparseBlockGemv(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockGemv(input))
    or
    GpuFromHost(SparseBlockGemv) -> GpuSparseBlockGemv

    Returns a one-element replacement list, or None (implicitly) when
    neither pattern matches.
    """
    # Case 1: the CPU op consumes at least one value that already lives
    # on the GPU.  Lift the op to the GPU and transfer its result back,
    # so downstream CPU consumers are unaffected.
    if isinstance(node.op, SparseBlockGemv) and \
            any(_owner_isinstance(inp, HostFromGpu) for inp in node.inputs):
        inputs = _clear_host_from_gpu(node.inputs)
        # The CPU op's inplace flag is preserved on the GPU op.
        return [host_from_gpu(GpuSparseBlockGemv(node.op.inplace)(*inputs))]
    # Case 2: the op's output is transferred to the GPU.  Replace the
    # transfer + CPU-op pair by the GPU op directly.
    elif isinstance(node.op, GpuFromHost) and \
            _owner_isinstance(node.inputs[0], SparseBlockGemv):
        meta_node = node.inputs[0].owner
        inputs = _clear_host_from_gpu(meta_node.inputs)
        return [GpuSparseBlockGemv(meta_node.op.inplace)(*inputs)]
@register_meta_opt(SparseBlockOuter, ["gpu_opt", "gpu_local_optimizations"],
                   0., 'fast_run', 'fast_compile', 'gpu')
@local_optimizer([SparseBlockOuter, GpuFromHost])
def gpu_sparse_block_outer_opt(node):
    """
    Move SparseBlockOuter computations to the GPU.

    SparseBlockOuter(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockOuter(input))
    or
    GpuFromHost(SparseBlockOuter) -> GpuSparseBlockOuter

    Returns a one-element replacement list, or None (implicitly) when
    neither pattern matches.
    """
    # Case 1: at least one input already lives on the GPU -- lift the op
    # and transfer the result back for CPU consumers.
    if isinstance(node.op, SparseBlockOuter) and \
            any(_owner_isinstance(inp, HostFromGpu) for inp in node.inputs):
        inputs = _clear_host_from_gpu(node.inputs)
        # The CPU op's inplace flag is preserved on the GPU op.
        return [host_from_gpu(GpuSparseBlockOuter(node.op.inplace)(*inputs))]
    # Case 2: the op's output is immediately moved to the GPU -- replace
    # the transfer + CPU-op pair by the GPU op directly.
    elif isinstance(node.op, GpuFromHost) and \
            _owner_isinstance(node.inputs[0], SparseBlockOuter):
        meta_node = node.inputs[0].owner
        inputs = _clear_host_from_gpu(meta_node.inputs)
        return [GpuSparseBlockOuter(meta_node.op.inplace)(*inputs)]
import theano.sandbox.cuda.extra_ops import theano.sandbox.cuda.extra_ops
...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops ...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
...@@ -740,6 +743,37 @@ def test_local_gpu_dot_to_dot22dot(): ...@@ -740,6 +743,37 @@ def test_local_gpu_dot_to_dot22dot():
cmp((3, 4), (4,)) cmp((3, 4), (4,))
def test_blocksparse_gpu_gemv_opt():
    """The GPU optimizer must lift sparse_block_dot to GpuSparseBlockGemv."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fct = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)
    # The last node is the HostFromGpu transfer; the gemv sits right
    # before it in topological order.
    assert isinstance(fct.maker.fgraph.toposort()[-2].op, GpuSparseBlockGemv)
def test_blocksparse_gpu_outer_opt():
    """
    The gradient of sparse_block_dot wrt W must be lifted to
    GpuSparseBlockOuter by the GPU optimizer.
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)
    # Build the gradient once and reuse it; the previous version also
    # dumped the graph with theano.printing.debugprint, a leftover debug
    # call that polluted the test output.
    grad_W = tensor.grad(o.sum(), wrt=W)
    f = theano.function([W, h, iIdx, b, oIdx], [o, grad_W],
                        mode=mode_with_gpu)
    # The last node is the HostFromGpu transfer; the outer op sits right
    # before it in topological order.
    assert isinstance(f.maker.fgraph.toposort()[-2].op, GpuSparseBlockOuter)
class test_diag(theano.tensor.tests.test_nlinalg.test_diag): class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
mode = mode_with_gpu mode = mode_with_gpu
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
...@@ -751,6 +785,7 @@ class test_diag(theano.tensor.tests.test_nlinalg.test_diag): ...@@ -751,6 +785,7 @@ class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
self).__init__(name) self).__init__(name)
if __name__ == '__main__': if __name__ == '__main__':
test_gpualloc() test_gpualloc()
test_opt_gpujoin_onlyajoin() test_opt_gpujoin_onlyajoin()
......
"""
Optimizations addressing the ops in sandbox root directory
"""
import bisect
import logging
from theano.compile import optdb
from theano.gof import local_optimizer, EquilibriumDB
from theano.tensor.opt import register_specialize
from theano.sandbox.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv,
sparse_block_outer,
sparse_block_gemv_inplace,
sparse_block_outer_inplace,
CpuSparseBlockGemv,
CpuSparseBlockOuter)
_logger = logging.getLogger('theano.sandbox.opt')
def _db_exists(db, db_name):
    """
    Test whether the full path from `db_name[0]` down to
    `db_name[-1]` exists inside `db`.

    Parameters
    ----------
    db: `theano.gof.optdb.DB`
        A database of optimizations or sub-databases.
    db_name: list or tuple of strings
        Names of nested databases, outermost first, e.g.
        ['level_1_dataset', 'level_2_dataset'].
    """
    head, tail = db_name[0], db_name[1:]
    if head not in db._names:
        return False
    if not tail:
        return True
    # Descend one level and keep checking the remainder of the path.
    return _db_exists(db[head], tail)
def _db_register(db, db_name, *args):
    """
    Register an object in the database reached by walking `db_name`
    down from `db` (`db` itself when `db_name` is empty).

    Parameters
    ----------
    db: `theano.gof.optdb.DB`
        A database of optimizations or sub-databases.
    db_name: list or tuple of strings
        Names of nested databases, outermost first, e.g.
        ['level_1_dataset', 'level_2_dataset'].
    *args
        Arguments forwarded to `DB.register`.
    """
    target = db
    for name in db_name:
        target = target[name]
    return target.register(*args)
def _db_positions(db, db_name, positions=()):
    """
    Return the tuple of positions of every database along the path
    `db_name`, walked down from `db` (`db_name[0]` is looked up in
    `db`, `db_name[1]` in `db[db_name[0]]`, and so on).

    Parameters
    ----------
    db: `theano.gof.optdb.DB`
        A database of optimizations or sub-databases.
    db_name: list or tuple of strings
        Names of nested databases, outermost first, e.g.
        ['level_1_dataset', 'level_2_dataset'].
    positions: tuple
        Accumulated positions prepended to the result.
    """
    result = positions
    current = db
    for name in db_name:
        # A database missing from __position__ defaults to position 0.
        result = result + (current.__position__.get(name, 0.),)
        current = current[name]
    return result
def register_meta_opt(op_class, db_name, position, *args):
    """
    Register a local optimization for the meta Op `op_class` and record
    it in `op_class.registered_opts`, kept sorted by database position.

    Parameters
    ----------
    op_class: `theano.gof.Op`
        A meta Op which has multiple implementations available for
        optimization.  Must expose a `registered_opts` list.
    db_name: string, list or tuple of strings
        A string if the optimization is inserted in
        `theano.compile.optdb` directly.  A list/tuple names a path of
        nested optimization databases, outermost first.
    position: int or float
        Position of the optimization in the target database.  When the
        target database does not exist yet, it is created (as an
        EquilibriumDB) at this position in its parent.
    *args
        Extra arguments (tags) used to register the optimization.

    Returns
    -------
    callable
        A decorator that registers the given local optimizer and
        returns it unchanged.
    """
    if isinstance(db_name, str):
        db_name = [db_name]

    def call(local_meta_opt):
        if not _db_exists(optdb, db_name):
            # Create the missing target database in its *direct* parent
            # (db_name[:-1]); the previous db_name[:-2] skipped a level
            # and registered it in the grandparent.
            # TODO: Would another default DB be better?
            _db_register(optdb, db_name[:-1],
                         db_name[-1], EquilibriumDB(), position, *args)
        _db_register(optdb, db_name,
                     local_meta_opt.__name__, local_meta_opt, *args)
        positions = _db_positions(optdb, db_name)
        # Keep registered_opts sorted by (positions, name).  Note
        # bisect_left(a, x) takes the sorted list first and the probe
        # item second; the probe must also match what is inserted.
        entry = (positions, local_meta_opt.__name__)
        idx = bisect.bisect_left(op_class.registered_opts, entry)
        op_class.registered_opts.insert(idx, entry)
        return local_meta_opt
    return call
@register_meta_opt(SparseBlockGemv, ["meta_cpu"], 51.0,
                   "fast_run", "fast_compile")
@local_optimizer([SparseBlockGemv])
def cpu_sparse_block_gemv_opt(node):
    """
    Replace the abstract SparseBlockGemv by its CPU implementation,
    preserving the inplace flag.
    """
    cpu_op = CpuSparseBlockGemv(node.op.inplace)
    return [cpu_op(*node.inputs)]
@register_meta_opt(SparseBlockOuter, ["meta_cpu"], 51.0,
                   "fast_run", "fast_compile")
@local_optimizer([SparseBlockOuter])
def cpu_sparse_block_outer_opt(node):
    """
    Replace the abstract SparseBlockOuter by its CPU implementation,
    preserving the inplace flag.
    """
    cpu_op = CpuSparseBlockOuter(node.op.inplace)
    return [cpu_op(*node.inputs)]
@register_specialize
@local_optimizer([sparse_block_gemv], inplace=True)
def local_inplace_block_sparse_gemv(node):
    """
    Swap the non-inplace SparseBlockGemv for its inplace variant.
    """
    replacement = sparse_block_gemv_inplace(*node.inputs)
    return [replacement]
@register_specialize
@local_optimizer([sparse_block_outer], inplace=True)
def local_inplace_block_sparse_outer(node):
    """
    Swap the non-inplace SparseBlockOuter for its inplace variant.
    """
    replacement = sparse_block_outer_inplace(*node.inputs)
    return [replacement]
import theano
from theano import tensor
from theano.sandbox.blocksparse import CpuSparseBlockGemv, CpuSparseBlockOuter, sparse_block_dot
def test_blocksparse_cpu_gemv_opt():
    """sparse_block_dot must be specialized to CpuSparseBlockGemv."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fct = theano.function([W, h, iIdx, b, oIdx], out)
    # With no GPU transfer involved, the gemv is the final node.
    assert isinstance(fct.maker.fgraph.toposort()[-1].op, CpuSparseBlockGemv)
def test_blocksparse_cpu_outer_opt():
    """
    The gradient of sparse_block_dot wrt W must be specialized to
    CpuSparseBlockOuter.
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)
    # Build the gradient once and reuse it; the previous version also
    # dumped the graph with theano.printing.debugprint, a leftover debug
    # call that polluted the test output.
    grad_W = tensor.grad(o.sum(), wrt=W)
    f = theano.function([W, h, iIdx, b, oIdx], [o, grad_W])
    # With no GPU transfer involved, the outer op is the final node.
    assert isinstance(f.maker.fgraph.toposort()[-1].op, CpuSparseBlockOuter)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论