提交 9afcee26 authored 作者: Frederic Bastien's avatar Frederic Bastien

Created a fusion optimizer for the GPU.

上级 f1a3bae9
...@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion', ...@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion',
"Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization", "Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
BoolParam(True)) BoolParam(True))
# Config flag: enable/disable the GPU elemwise fusion optimization when
# running in fast_run mode. Mirrors the CPU-side
# 'tensor.local_elemwise_fusion' flag declared just above.
AddConfigVar('gpu.local_elemwise_fusion',
        "Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
        BoolParam(True))
#http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
AddConfigVar('lib.amdlibm', AddConfigVar('lib.amdlibm',
"Use amd's amdlibm numerical library", "Use amd's amdlibm numerical library",
BoolParam(False)) BoolParam(False))
......
import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt')
import sys import sys
import theano import theano
import numpy import numpy
...@@ -569,3 +572,21 @@ def local_gpu_join(node): ...@@ -569,3 +572,21 @@ def local_gpu_join(node):
# After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
#optdb.register('InplaceGpuBlasOpt',
#       EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace,
#                            max_use_ratio=5),
#               70.0, 'fast_run', 'inplace')

# GpuElemwise fusion: reuse the generic elemwise-fusion optimizer builder
# from tensor.opt, specialized for the GpuElemwise op.
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise)

# Register the optimizer exactly once. It always carries the 'fusion' and
# 'local_elemwise_fusion' tags (so it can be requested explicitly); the
# 'fast_run' tag is added only when the config flag enables it, so that
# fast_run mode picks it up automatically. This replaces two near-identical
# register() calls that differed only in the tag list.
if config.gpu.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
    _fusion_tags = ('fast_run', 'fusion', 'local_elemwise_fusion')
else:
    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
    _fusion_tags = ('fusion', 'local_elemwise_fusion')
compile.optdb.register('gpu_elemwise_fusion',
                       tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
                       71.00, *_fusion_tags)
...@@ -7,8 +7,8 @@ import numpy ...@@ -7,8 +7,8 @@ import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda
if cuda_ndarray.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.compile.mode import theano.compile.mode
...@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin(): ...@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin():
# from a bug in normal sampling # from a bug in normal sampling
_a = numpy.asarray([[1,2],[3,4]],dtype='float32') _a = numpy.asarray([[1,2],[3,4]],dtype='float32')
_b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32') _b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32')
a = cuda_ndarray.shared_constructor(_a) a = cuda.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b) b = cuda.shared_constructor(_b)
c = tensor.join(1,a,b) c = tensor.join(1,a,b)
...@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
# from a bug in gpu normal sampling # from a bug in gpu normal sampling
_a = numpy.asarray([1,2,3,4],dtype='float32') _a = numpy.asarray([1,2,3,4],dtype='float32')
_b = numpy.asarray([5,6,7,8],dtype='float32') _b = numpy.asarray([5,6,7,8],dtype='float32')
a = cuda_ndarray.shared_constructor(_a) a = cuda.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b) b = cuda.shared_constructor(_b)
a_prime = tensor.cos(a) a_prime = tensor.cos(a)
b_prime = tensor.sin(b) b_prime = tensor.sin(b)
...@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
assert numpy.allclose(numpy.asarray(f()), concat) assert numpy.allclose(numpy.asarray(f()), concat)
def test_elemwise_fusion():
    """Test that the GpuElemwise fusion optimization works correctly.

    Builds the graph a+b+c (one shared GPU value plus two fmatrix inputs)
    and checks that, after optimization, the two additions have been fused
    into a single elemwise node whose scalar op is a Composite.
    """
    shape = (3,4)
    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    # Dump the optimized graph so a failing assert below is easy to debug.
    for i, node in enumerate(topo):
        print >> sys.stdout, i, node
    # NOTE(review): 4 nodes presumably corresponds to host->gpu transfers,
    # the single fused elemwise, and the transfer back -- confirm against
    # the printed graph.
    assert len(topo)==4
    # The fusion must have merged both additions into one Composite scalar op.
    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
if __name__ == '__main__': if __name__ == '__main__':
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论