Commit 9afcee26 authored by Frederic Bastien

Created a fusion optimizer for the GPU.

Parent f1a3bae9
......@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion',
"Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
BoolParam(True))
AddConfigVar('gpu.local_elemwise_fusion',
"Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
BoolParam(True))
#http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
AddConfigVar('lib.amdlibm',
"Use amd's amdlibm numerical library",
BoolParam(False))
......
import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt')
import sys
import theano
import numpy
......@@ -569,3 +572,21 @@ def local_gpu_join(node):
# After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
#optdb.register('InplaceGpuBlasOpt',
# EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace,
# max_use_ratio=5),
# 70.0, 'fast_run', 'inplace')
# GpuElemwise fusion: specialise the generic elemwise fusion optimizer
# for the GPU back-end by parametrising it with GpuElemwise.
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise)
if config.gpu.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
    # Registered with 'fast_run' so it runs under the default fast_run mode.
    fuse_tags = ('fast_run', 'fusion', 'local_elemwise_fusion')
else:
    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
    # Still registered (so it can be requested by tag), but not in fast_run.
    fuse_tags = ('fusion', 'local_elemwise_fusion')
# Step 71.00 keeps this after the gemm optimisation (step 70) — gemm must
# happen before elemwise fusion (see the note above about optimisation order).
compile.optdb.register('gpu_elemwise_fusion',
                       tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
                       71.00, *fuse_tags)
......@@ -7,8 +7,8 @@ import numpy
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.compile.mode
......@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin():
# from a bug in normal sampling
_a = numpy.asarray([[1,2],[3,4]],dtype='float32')
_b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32')
a = cuda_ndarray.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b)
a = cuda.shared_constructor(_a)
b = cuda.shared_constructor(_b)
c = tensor.join(1,a,b)
......@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
# from a bug in gpu normal sampling
_a = numpy.asarray([1,2,3,4],dtype='float32')
_b = numpy.asarray([5,6,7,8],dtype='float32')
a = cuda_ndarray.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b)
a = cuda.shared_constructor(_a)
b = cuda.shared_constructor(_b)
a_prime = tensor.cos(a)
b_prime = tensor.sin(b)
......@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
assert numpy.allclose(numpy.asarray(f()), concat)
def test_elemwise_fusion():
    """Test that GpuElemwise fusion works correctly.

    Builds a+b+c (two consecutive elemwise adds) and checks that the
    compiled graph contains a single fused GpuElemwise whose scalar_op
    is a Composite, instead of two separate adds.
    """
    shape = (3,4)
    # 'a' lives on the GPU so the whole computation is pulled onto the GPU.
    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    for i, node in enumerate(topo):
        print >> sys.stdout, i, node
    # Expected: 2 transfers to the GPU, 1 fused GpuElemwise, 1 transfer back.
    # TODO confirm exact node breakdown — assumes the standard transfer opts.
    assert len(topo)==4
    # The fused node's scalar op must be a Composite (proof fusion happened).
    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
if __name__ == '__main__':
......
......@@ -2096,118 +2096,125 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot')
# ###############
# # Loop fusion #
# ###############
def local_elemwise_fusion_op(OP):
    """Create a local optimizer that fuses consecutive elemwise ops.

    As part of specialisation, we fuse two consecutive elemwise ops of the
    same shape into one op whose scalar computation is a Composite.  For
    mixed dtypes, we let the Composite op do the cast (it lets the C
    compiler do the actual conversion).  The number of dimensions is
    validated at call time by theano itself.

    :param OP: the elemwise op class to match and to build, e.g.
        ``T.Elemwise`` or ``GpuElemwise``.  Parametrising the class makes
        the same code work for both the CPU and the GPU back-end.
    :return: a local optimizer function suitable for ``FusionOptimizer``.
    """
    # META TODO: PUT THESE THINGS IN TRAC, NOT TODO NOTES!!
    # TODO: use broadcast flag?

    # TODO: don't do this optimization as a localOptimizer.  Analyze the
    # graph in terms of elemwise subgraphs, and then replace each subgraph
    # with a Composite version.

    # TODO: use malloc and copy to transfer arguments that don't fit within
    # the parameter space of 256 bytes.

    # TODO: merge with multiple outputs to fuse when an input has multiple
    # clients.  This can't be done with a local optimiser.
    # TODO: related: support composites with multiple outputs.

    # TODO: use Composite to combine Elemwise and Reduce operations.  We
    # have to loop over the data anyway... might as well sum it up while
    # we're at it (this can be trickier than it sounds here.  The
    # data-traversal should be done contiguously, and the summing-up might
    # not be easy or worthwhile if the summation axis doesn't line up with
    # a contiguous dimension).

    def local_fuse(node):
        """Try to fuse `node` with the fusable elemwise ops among its inputs.

        Returns the outputs of the fused replacement node on success,
        ``False`` when nothing could be fused.
        """
        if not isinstance(node.op, OP):
            return False
        nb_elemwise = 0
        inputs = []    # inputs of the new Elemwise op.
        s_inputs = []  # inputs of the new scalar op.
        s_g = []       # graph of scalars: what will be done in the inner loop.
        for i in node.inputs:
            do_fusion = False
            catch = False
            # Only fuse an input produced by the same kind of elemwise op,
            # and only when this node is its sole client (otherwise the
            # producer's work would be duplicated).
            if i.owner and isinstance(i.owner.op, OP) and len(i.clients) <= 1:
                do_fusion = True
                # If the scalar_op doesn't have a C implementation, we skip
                # its fusion to allow the fusion of the other ops.
                try:
                    s_input = [scalar.Scalar(x.dtype).make_variable()
                               for x in i.owner.inputs]
                    s_op = i.owner.op.scalar_op(*s_input)
                    i.owner.op.scalar_op.c_code(s_op.owner,
                                                "test_presence_of_c_code",
                                                ["x" for x in i.owner.inputs],
                                                "z", {})
                except (MethodNotDefined, NotImplementedError):
                    catch = True
                if catch:
                    _logger.info(("%s does not implement the c_code function."
                                  " As well as being potentially slow, this"
                                  " disables loop fusion of this op.")
                                 % str(i.owner.op.scalar_op))
                    do_fusion = False
            if do_fusion:
                nb_elemwise += 1
                inputs.extend(i.owner.inputs)
                s_inputs.extend(s_input)
                s_g.append(s_op)
            else:
                # Not fusable: the input becomes a plain scalar input of the
                # future Composite.
                inputs.append(i)
                s = scalar.Scalar(i.dtype).make_variable()
                s_inputs.append(s)
                s_g.append(s)
        # If no input is an elemwise, there is nothing to fuse.
        if nb_elemwise == 0:
            return False

        s_new_out = node.op.scalar_op(*s_g)
        # The combined scalar op must itself have a C implementation.
        try:
            s_new_out.owner.op.c_code(s_new_out.owner,
                                      "test_presence_of_c_code",
                                      ["x" for x in s_g],
                                      "z", {})
        except (MethodNotDefined, NotImplementedError):
            _logger.info(("%s does not implement the c_code function."
                          " As well as being potentially slow, this"
                          " disables loop fusion of this op.")
                         % str(s_new_out.owner.op))
            return False

        # Create the composite op.
        C = scalar.Composite(s_inputs, [s_new_out])
        # Create the new node.
        n = OP(C).make_node(*inputs)
        assert len(n.outputs) == 1
        assert node.outputs[0].dtype == n.outputs[0].dtype

        # There is a hard limit of 256 bytes for the formal argument list to
        # a GPU kernel function.  Here, we estimate how many bytes the new
        # Op will need, and abort if it needs too much.
        argument_limit = 240  # 16 bytes are used for block and thread coords etc.
        # TODO: read in from architecture to make this 4 or 8
        int_size = 8
        ptr_size = 8
        argument_size = int_size  # for numels
        argument_size += int_size * inputs[0].type.ndim  # for the shape
        argument_size += sum((ptr_size + int_size * i.type.ndim)
                             for i in n.inputs)
        argument_size += sum((ptr_size + int_size * i.type.ndim)
                             for i in n.outputs)
        if argument_size >= argument_limit:
            _logger.info('loop fusion failed because Op would exceed kernel'
                         ' argument limit.')
            return False

        # print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
        return n.outputs
    return local_fuse

local_elemwise_fusion = local_elemwise_fusion_op(T.Elemwise)
class FusionOptimizer(Optimizer):
"""Graph optimizer for Fusion of elemwise operations"""
def __init__(self):
def __init__(self, local_optimizer):
Optimizer.__init__(self)
self.optimizer = local_optimizer
    def add_requirements(self, env):
        # Fusion replaces subgraphs in place, so the env must support
        # validated replacement.
        env.extend(toolbox.ReplaceValidate())
......@@ -2219,7 +2226,7 @@ class FusionOptimizer(Optimizer):
nodelist = list(env.toposort())
did_something = False
for node in nodelist:
new_outputs = local_elemwise_fusion(node)
new_outputs = self.optimizer(node)
if new_outputs:
assert len(new_outputs) == len(node.outputs)
try:
......@@ -2235,9 +2242,9 @@ class FusionOptimizer(Optimizer):
if config.tensor.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion elemwise in fast_run")
    # Registered with 'fast_run' so it runs under the default fast_run mode.
    fuse_tags = ('fast_run', 'fusion', 'local_elemwise_fusion')
else:
    _logger.debug("not enabling optimization fusion elemwise in fast_run")
    # Still registered (so it can be requested by tag), but not in fast_run.
    fuse_tags = ('fusion', 'local_elemwise_fusion')
# Step 71.00: after the gemm optimisation (step 70), which must run first.
compile.optdb.register('elemwise_fusion',
                       FusionOptimizer(local_elemwise_fusion),
                       71.00, *fuse_tags)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论