testgroup / pytensor · Commits

Commit de4e6e6a
Authored Mar 12, 2010 by James Bergstra
cuda - extended elemwise optimization to upcast inputs sometimes in gpu mode
Parent: a1404da1
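In GPU mode, the local_gpu_elemwise_0 optimization shown below rewrites elemwise(..., host_from_gpu(x), ...) into host_from_gpu(GpuElemwise(...)), wrapping the remaining host inputs in gpu_from_host so the elementwise arithmetic runs on the GPU. This commit extends the rewrite to also fire when some inputs are small integer types that must first be upcast to float32.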
Showing 2 changed files with 38 additions and 2 deletions (+38 -2):

theano/sandbox/cuda/opt.py (+21 -2)
theano/sandbox/cuda/tests/test_opt.py (+17 -0)
theano/sandbox/cuda/opt.py @ de4e6e6a
...
@@ -74,14 +74,33 @@ gpu_cut_copies.register('cut_gpu_constant_transfers', tensor.opt.constant_foldin
 @register_opt()
 @local_optimizer([])
 def local_gpu_elemwise_0(node):
     """elemwise(..., host_from_gpu, ...)
     -> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host)
     """
     if isinstance(node.op, tensor.Elemwise):
-        if numpy.any([hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
-            if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
+        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
+            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
+                new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
+
+                # case 1 - all inputs are already float32
+                if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
+                    #TODO: change this when fusion makes Elemwise with multiple outputs
+                    return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))]
+
+                # THIS IS PROBABLY TRUE....
+                # case 2 - it would still be ok if some inputs were upcast to float32
+                # first establish that float32 can store all inputs
+                upcastable = set(['float32', 'int8', 'int16', 'uint8', 'uint16'])
+                if numpy.all([i.type.dtype in upcastable for i in node.inputs]):
+                    # second - establish that a new node with upcasted inputs has the same outputs
+                    # types as the original node
+                    casted = node.op.make_node(*[tensor.cast(i, 'float32') for i in node.inputs])
+                    if [o.type for o in casted.outputs] == [o.type for o in node.outputs]:
+                        new_inputs = [gpu_from_host(tensor.cast(i, 'float32')) for i in node.inputs]
+                        return [host_from_gpu(new_op(*new_inputs))]

 @register_opt()
 @local_optimizer([])
 def local_gpu_elemwise_1(node):
...
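The heart of the new "case 2" is the claim that float32 can store every value of the dtypes in upcastable exactly: float32 has a 24-bit significand, so it represents all integers up to 2**24 without loss; int8, int16, uint8, and uint16 fit, while int32 and wider types do not. A minimal standalone sketch (not part of the commit; numpy only) showing that numpy's own "safe" casting rule agrees with the hand-written set:

    import numpy

    # Hand-written set from the commit: dtypes whose values float32 holds exactly.
    upcastable = set(['float32', 'int8', 'int16', 'uint8', 'uint16'])

    for dtype in ['int8', 'uint16', 'int32', 'int64', 'float64']:
        # numpy.can_cast with the default 'safe' rule encodes the same
        # constraint: every value of `dtype` must survive the cast unchanged.
        print(dtype, dtype in upcastable,
              numpy.can_cast(numpy.dtype(dtype), numpy.float32))
    # int8    True  True
    # uint16  True  True
    # int32   False False
    # int64   False False
    # float64 False False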
theano/sandbox/cuda/tests/test_opt.py @ de4e6e6a
...
@@ -12,6 +12,7 @@ if cuda_ndarray.cuda_available == False:
     raise SkipTest('Optional package cuda disabled')

 import theano.compile.mode
+from theano.sandbox.cuda.type import CudaNdarrayType

 if theano.config.mode == 'FAST_COMPILE':
     mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...
@@ -32,3 +33,19 @@ def test_no_shared_var_graph():
     assert numpy.any(isinstance(x.op, cuda.GpuElemwise) for x in l)
     assert numpy.any(isinstance(x.op, cuda.GpuFromHost) for x in l)
     assert numpy.any(isinstance(x.op, cuda.HostFromGpu) for x in l)
+
+
+def test_int_pow():
+    a = CudaNdarrayType([False])()
+
+    f = theano.function([a], (a * 4).sum(), mode=mode_with_gpu)
+    op_names = [n.op.__class__.__name__ for n in f.maker.env.toposort()]
+    assert op_names == ['GpuSum', 'GpuElemwise', 'HostFromGpu']
+
+    f = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
+    op_names = [n.op.__class__.__name__ for n in f.maker.env.toposort()]
+    assert op_names == ['GpuElemwise', 'GpuSum', 'HostFromGpu']
+
+    #theano.printing.debugprint(f)
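A hypothetical usage sketch of the same inspection test_int_pow performs, assuming the 2010-era API that appears in this diff (theano.function, f.maker.env.toposort(), theano.printing.debugprint) and the mode_with_gpu mode constructed at the top of the test file:

    import theano
    import theano.tensor as tensor

    x = tensor.fvector('x')   # float32 vector living on the host
    f = theano.function([x], (x * 4).sum(), mode=mode_with_gpu)

    # List the ops that survive optimization; after this commit the multiply
    # should appear as a GpuElemwise rather than a host-side Elemwise.
    print([n.op.__class__.__name__ for n in f.maker.env.toposort()])

    theano.printing.debugprint(f)   # full dump of the optimized graph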