Commit 00b39e52 authored by Frederic Bastien

Allow moving to the GPU Composite ops that have non-float32 operations in them but float32 inputs and outputs.

Parent 535e615e
Modifications in the trunk since the last release
In trunk since 0.3.1 release
----------------------------
GPU:
* Move to the GPU fused elemwise ops that have dtypes other than float32 in them (except float64), if the inputs and outputs are float32.
* This allows moving elemwise comparisons to the GPU if we cast the result to float32 afterwards.
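A rough sketch of the pattern this enables (assuming the old theano.sandbox.cuda backend is active; the variable names are illustrative):

import theano
import theano.tensor as tensor

a = tensor.fmatrix('a')
b = tensor.fmatrix('b')
# (a > b) has dtype int8, a non-float32 dtype, but casting the result
# back to float32 keeps the fused elemwise's inputs and outputs
# float32, so the whole Composite can now be moved to the GPU.
f = theano.function([a, b], tensor.cast(a > b, 'float32'))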
Theano 0.3.1 (2011-02-21)
----------------------------
@@ -32,6 +32,10 @@ gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
'fast_run', 'inplace')
optdb.register('gpu',
gpu_seqopt, optdb.__position__.get('add_destroy_handler', 49.5) - 1)
# This second pass is needed as the fusion can put all the non-float32
# code inside the elemwise. When there is no float64 op, this works.
optdb.register('gpu_after_fusion',
gpu_seqopt, optdb.__position__.get('elemwise_fusion', 71) + .1)
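As a quick sanity-check sketch (assuming theano.compile.optdb is importable, as in this file): positions in the optimizer database are floats and passes run in increasing position order, so 'gpu_after_fusion' lands just after 'elemwise_fusion'.

from theano.compile import optdb

# Lower positions run first; the second GPU pass sits right after the
# elemwise fusion pass registered around position 71.
print(optdb.__position__['gpu'])                      # add_destroy_handler - 1
print(optdb.__position__.get('elemwise_fusion', 71))  # around 71
print(optdb.__position__['gpu_after_fusion'])         # elemwise_fusion + .1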
def register_opt(*tags, **kwargs):
def f(local_opt):
@@ -88,13 +92,39 @@ gpu_cut_copies.register('cut_gpu_constant_transfers', tensor.opt.constant_folding,
#bothering with this useless pattern.
compile.optdb['canonicalize'].register('local_cut_gpu_host_gpu', local_cut_gpu_host_gpu, 'fast_run')
def float64_in_elemwise(op):
    """
    Return True if the Elemwise op has float64 in it.
    Return False otherwise.

    :note: This can happen with the Composite Op.
    """
    def get_all_basic_scalar(composite_op):
        # Collect the Apply nodes of all non-Composite scalar ops in
        # the Composite's inner graph, recursing into nested Composites.
        l = []
        for i in composite_op.env.toposort():
            if isinstance(i.op, theano.scalar.Composite):
                l += get_all_basic_scalar(i.op)
            else:
                l.append(i)
        return l

    if isinstance(op, (GpuElemwise, tensor.Elemwise)):
        if isinstance(op.scalar_op, theano.scalar.Composite):
            scals = get_all_basic_scalar(op.scalar_op)
            for s in scals:
                if any([i.type.dtype == 'float64'
                        for i in s.inputs + s.outputs]):
                    return True
    return False
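A hypothetical usage of this helper (not part of the patch; it assumes float64_in_elemwise is importable from theano.sandbox.cuda.opt): build a Composite whose boundary dtypes are float32 but whose body upcasts to float64, and check that it is flagged.

import theano.scalar
import theano.tensor as tensor
from theano.sandbox.cuda.opt import float64_in_elemwise

x = theano.scalar.float32('x')
# The body upcasts to float64 and casts back down, so the Composite's
# inputs and outputs are float32 while float64 appears inside it.
y = theano.scalar.cast(theano.scalar.cast(x, 'float64') * 2, 'float32')
comp = theano.scalar.Composite([x], [y])
assert float64_in_elemwise(tensor.Elemwise(comp))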
@register_opt()
@local_optimizer([])
def local_gpu_elemwise_0(node):
"""elemwise(..., host_from_gpu, ...)
-> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host))
"""
-    if isinstance(node.op, tensor.Elemwise):
+    if isinstance(node.op, tensor.Elemwise) and not float64_in_elemwise(node.op):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
#don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
@@ -134,7 +164,11 @@ def local_gpu_elemwise_1(node):
"""
if node.op == gpu_from_host:
host_i, = node.inputs
-        if host_i.owner and isinstance(host_i.owner.op, tensor.Elemwise) and len(host_i.clients)==1:
+        if (host_i.owner and
+            isinstance(host_i.owner.op, tensor.Elemwise) and
+            len(host_i.clients) == 1 and
+            not float64_in_elemwise(host_i.owner.op)):
elemwise_node = host_i.owner
#don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
new_op = GpuElemwise(elemwise_node.op.scalar_op)
@@ -361,8 +361,8 @@ def test_elemwise_comparaison_cast():
#theano.printing.debugprint(f)
out = f(av,bv)
assert numpy.all(out == ans)
-    #assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.toposort()])
-    assert any([isinstance(node.op, tensor.Elemwise) for node in f.maker.env.toposort()])
+    assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.toposort()])
+    #assert any([isinstance(node.op, tensor.Elemwise) for node in f.maker.env.toposort()])
def test_elemwise_composite_float64():
# test that we don't fuse composite elemwise with float64 somewhere inside
......
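The test body is elided above; a minimal sketch of the scenario it exercises (hypothetical, not the actual test): float64 appears only inside the would-be Composite, so the fused elemwise must stay off the GPU.

import numpy
import theano
import theano.tensor as tensor

a = tensor.fmatrix('a')
# float64 shows up in the middle of the chain even though the boundary
# dtypes are float32, so this graph must not become a GpuElemwise
# after fusion.
out = tensor.cast(tensor.cast(a, 'float64') * 2, 'float32')
f = theano.function([a], out)
assert f(numpy.random.rand(4, 4).astype('float32')).dtype == 'float32'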