Commit 9afcee26 authored by Frederic Bastien

Created a fusion optimizer for the GPU.

Parent f1a3bae9
@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion',
         "Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
         BoolParam(True))
+AddConfigVar('gpu.local_elemwise_fusion',
+        "Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
+        BoolParam(True))
+
+#http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
 AddConfigVar('lib.amdlibm',
         "Use amd's amdlibm numerical library",
         BoolParam(False))
...
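The new flag mirrors the existing tensor.local_elemwise_fusion option. Like any Theano config variable it can be overridden through the THEANO_FLAGS environment variable; a minimal usage sketch (my_script.py is an illustrative name):

    # Disable the GPU fusion pass from the environment, before Python starts:
    #   THEANO_FLAGS='gpu.local_elemwise_fusion=False' python my_script.py
    import theano
    print theano.config.gpu.local_elemwise_fusion   # False with the flag above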
@@ -1,3 +1,6 @@
+import logging
+_logger = logging.getLogger('theano.sandbox.cuda.opt')
+
 import sys
 import theano
 import numpy
@@ -569,3 +572,21 @@ def local_gpu_join(node):
+
+# After destroyhandler is in but before we try to make elemwise things inplace
+# Try to make gpu gemm inplace
+# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
+#optdb.register('InplaceGpuBlasOpt',
+#               EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace,
+#                                    max_use_ratio=5),
+#               70.0, 'fast_run', 'inplace')
+
+#GpuElemwise fusion
+gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise)
+if config.gpu.local_elemwise_fusion:
+    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
+    compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
+else:
+    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
+    compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
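Note that when the flag is off, the optimizer is still registered, just without the 'fast_run' tag, so it never runs by default but stays reachable by tag. A sketch of opting back in explicitly, assuming the standard Mode.including mechanism:

    import theano
    from theano import tensor

    # Pull the fusion pass back in by tag, even when
    # gpu.local_elemwise_fusion=False keeps it out of fast_run.
    mode = theano.compile.mode.get_default_mode().including('local_elemwise_fusion')
    x = tensor.fmatrix('x')
    f = theano.function([x], tensor.exp(x) + tensor.sin(x), mode=mode)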
@@ -7,8 +7,8 @@ import numpy
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+import theano.sandbox.cuda as cuda
+if cuda.cuda_available == False:
     raise SkipTest('Optional package cuda disabled')
 import theano.compile.mode
@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin():
     # from a bug in normal sampling
     _a = numpy.asarray([[1,2],[3,4]],dtype='float32')
     _b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32')
-    a = cuda_ndarray.shared_constructor(_a)
-    b = cuda_ndarray.shared_constructor(_b)
+    a = cuda.shared_constructor(_a)
+    b = cuda.shared_constructor(_b)
     c = tensor.join(1,a,b)
@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
     # from a bug in gpu normal sampling
     _a = numpy.asarray([1,2,3,4],dtype='float32')
     _b = numpy.asarray([5,6,7,8],dtype='float32')
-    a = cuda_ndarray.shared_constructor(_a)
-    b = cuda_ndarray.shared_constructor(_b)
+    a = cuda.shared_constructor(_a)
+    b = cuda.shared_constructor(_b)
     a_prime = tensor.cos(a)
     b_prime = tensor.sin(b)
@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
     assert numpy.allclose(numpy.asarray(f()), concat)
+def test_elemwise_fusion():
+    """ Test that the GpuElemwise fusion works correctly. """
+    shape = (3,4)
+    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
+    b = tensor.fmatrix()
+    c = tensor.fmatrix()
+    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
+    topo = f.maker.env.toposort()
+    for i, node in enumerate(topo):
+        print >> sys.stdout, i, node
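+    # Expect 4 nodes: GpuFromHost(b), GpuFromHost(c), the fused GpuElemwise
+    # computing a+b+c (its scalar_op is a Composite), and HostFromGpu.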
+    assert len(topo)==4
+    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
+    #let debugmode catch errors
+    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
 if __name__ == '__main__':
...
@@ -2096,8 +2096,11 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot')
 # ###############
 # # Loop fusion #
 # ###############
-def local_elemwise_fusion(node):
+def local_elemwise_fusion_op(OP):
+    """
+    We parametrise it to make it work for both Elemwise and GpuElemwise ops.
+    """
+    def local_fuse(node):
         """
         As part of specialisation, we fuse two consecutive elemwise ops of the same shape.
@@ -2123,7 +2126,7 @@ def local_elemwise_fusion(node):
     # might not be easy or worthwhile if the summation axis doesn't line up with a contiguous
     # dimension)
-    if not isinstance(node.op, T.Elemwise):
+    if not isinstance(node.op, OP):
         return False
     nb_elemwise=0
     inputs=[]#inputs of the new Elemwise op.
@@ -2132,7 +2135,7 @@ def local_elemwise_fusion(node):
     for i in node.inputs:
         do_fusion = False
         catch = False
-        if i.owner and isinstance(i.owner.op,T.Elemwise) and len(i.clients)<=1:
+        if i.owner and isinstance(i.owner.op, OP) and len(i.clients)<=1:
             #if the scalar_op doesn't have a C implementation, we skip its fusion to allow the fusion of the other ops.
             do_fusion=True
             try:
@@ -2162,7 +2165,7 @@ def local_elemwise_fusion(node):
     #if no input is an elemwise, there is nothing to fuse.
     if nb_elemwise==0:
         # print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
         return False
     otype = node.outputs[0].type
@@ -2182,7 +2185,7 @@ def local_elemwise_fusion(node):
     C = scalar.Composite(s_inputs,[s_new_out])
     #create the new node.
-    n=T.Elemwise(C).make_node(*inputs)
+    n=OP(C).make_node(*inputs)
     assert len(n.outputs)==1
     assert node.outputs[0].dtype==n.outputs[0].dtype
@@ -2201,13 +2204,17 @@ def local_elemwise_fusion(node):
         _logger.info('loop fusion failed because Op would exceed kernel argument limit.')
         return False
     # print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
     return n.outputs
+    return local_fuse
+
+local_elemwise_fusion = local_elemwise_fusion_op(T.Elemwise)
+
 class FusionOptimizer(Optimizer):
     """Graph optimizer for Fusion of elemwise operations"""
-    def __init__(self):
+    def __init__(self, local_optimizer):
         Optimizer.__init__(self)
+        self.optimizer = local_optimizer

     def add_requirements(self, env):
         env.extend(toolbox.ReplaceValidate())
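This factory-plus-injection pattern is the heart of the refactor: local_elemwise_fusion_op closes over the Op class and returns a node-level rewrite, which FusionOptimizer now receives instead of hard-coding it. A stripped-down sketch of the same pattern, with toy stand-in classes rather than Theano's real ops:

    class Elemwise(object): pass        # stand-in for theano.tensor.Elemwise
    class GpuElemwise(object): pass     # stand-in for the CUDA variant

    class Node(object):
        def __init__(self, op): self.op = op

    def local_fusion_op(OP):
        def local_fuse(node):
            # Fire only on the Op class this closure was built for.
            if not isinstance(node.op, OP):
                return False
            return True                 # the real code builds a Composite here
        return local_fuse

    cpu_fuse = local_fusion_op(Elemwise)
    gpu_fuse = local_fusion_op(GpuElemwise)
    print cpu_fuse(Node(Elemwise()))    # True
    print gpu_fuse(Node(Elemwise()))    # False: wrong Op class for this pass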
@@ -2219,7 +2226,7 @@ class FusionOptimizer(Optimizer):
         nodelist = list(env.toposort())
         did_something = False
         for node in nodelist:
-            new_outputs = local_elemwise_fusion(node)
+            new_outputs = self.optimizer(node)
             if new_outputs:
                 assert len(new_outputs) == len(node.outputs)
                 try:
@@ -2235,9 +2242,9 @@ class FusionOptimizer(Optimizer):
 if config.tensor.local_elemwise_fusion:
     _logger.debug("enabling optimization fusion elemwise in fast_run")
-    compile.optdb.register('elemwise_fusion', FusionOptimizer(), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
+    compile.optdb.register('elemwise_fusion', FusionOptimizer(local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
 else:
     _logger.debug("not enabling optimization fusion elemwise in fast_run")
-    compile.optdb.register('elemwise_fusion', FusionOptimizer(), 71.00, 'fusion', 'local_elemwise_fusion')
+    compile.optdb.register('elemwise_fusion', FusionOptimizer(local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
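With the local pass injected through the constructor, one driver class serves both back-ends; this CPU registration and the GPU one earlier differ only in the function they pass in. A condensed sketch of the resulting control flow, simplified from the lines visible in this diff (not the file's exact code; error handling is trimmed, and replace_validate comes from the ReplaceValidate feature that add_requirements installs):

    from theano.gof import toolbox
    from theano.gof.opt import Optimizer

    class FusionOptimizer(Optimizer):
        """Drives one node-level fusion pass over the whole graph."""
        def __init__(self, local_optimizer):
            Optimizer.__init__(self)
            self.optimizer = local_optimizer      # CPU or GPU node rewrite

        def add_requirements(self, env):
            env.extend(toolbox.ReplaceValidate()) # enables env.replace_validate

        def apply(self, env):
            for node in list(env.toposort()):
                new_outputs = self.optimizer(node)  # new outputs, or False
                if new_outputs:
                    for old, new in zip(node.outputs, new_outputs):
                        env.replace_validate(old, new)

The two registrations then read FusionOptimizer(local_elemwise_fusion) on the CPU side and FusionOptimizer(gpu_local_elemwise_fusion) on the GPU side.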