提交 7f36ca79，作者：abergeron

Merge pull request #2930 from carriepl/gpuarray_elemwise_pow

Add opt for gpuarray.GpuElemwise so exponent of pow has same dtype as output
......@@ -189,8 +189,8 @@ class GpuElemwise(HideC, Elemwise):
pass
for npy, ga in [("npy_uint8", "ga_ubyte"),
("npy_uint16", "ga_ushort"),
("npy_uin32", "ga_uint"),
("npy_uin64", "ga_ulong"),
("npy_uint32", "ga_uint"),
("npy_uint64", "ga_ulong"),
("npy_int8", "ga_byte"),
("npy_int16", "ga_short"),
("npy_int32", "ga_int"),
......
......@@ -14,13 +14,15 @@ from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host,
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous,
gpu_alloc, GpuAlloc, GpuReshape,
......@@ -262,10 +264,38 @@ def local_gpu_elemwise(node):
name = op.name
if name:
name = 'Gpu' + name
res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern),
nfunc_spec=op.nfunc_spec)
return res
# If the elemwise operation is a pow, casts might be required on the
# inputs and or outputs because only the (float, float)->float and
# (double, double)->double cases are implemented at the moment.
if isinstance(op.scalar_op, Pow):
# Only transfer the computation on the gpu if the output dtype is
# floating point. Else, give up on the transfer to the gpu.
out_dtype = node.outputs[0].dtype
if out_dtype not in ['float16', 'float32', 'float64']:
return
# Transfer the inputs on the GPU and cast them to the right dtype.
new_inputs = []
for inp in node.inputs:
if inp.dtype != out_dtype:
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp)))
else:
new_inputs.append(as_gpuarray_variable(inp))
# Perform the exponent on the gpu and transfer the output back to the
# cpu.
gpu_output = res(*new_inputs)
cpu_output = host_from_gpu(gpu_output)
return [cpu_output]
else:
return res
def max_inputs_to_GpuElemwise(node):
......
import numpy
import theano
from theano import scalar, gof
from theano.tests.unittest_tools import SkipTest
from theano.tests.unittest_tools import SkipTest, assert_allclose
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce, T_reduce_dtype)
......@@ -46,6 +48,35 @@ class test_gpu_Broadcast(test_Broadcast):
super(test_gpu_Broadcast, self).test_c_inplace()
def test_elemwise_pow():
    """GpuElemwise(pow) must compile and run for every pair of integer or
    float input dtypes, producing the same result as numpy's ``**``."""
    # This exercises CUDA-specific code paths; skip on any other backend.
    if not theano.sandbox.gpuarray.init_dev.device.startswith('cuda'):
        raise SkipTest("Cuda specific tests")
    all_dtypes = ("uint8", "uint16", "uint32", "uint64",
                  "int8", "int16", "int32", "int64",
                  "float16", "float32", "float64")
    for d_base in all_dtypes:
        for d_exp in all_dtypes:
            # Build and compile base ** exp for this dtype combination.
            b = theano.tensor.vector(dtype=d_base)
            e = theano.tensor.vector(dtype=d_exp)
            fn = theano.function([b, e], b ** e)
            # Small non-negative values keep integer pow well-defined
            # and avoid overflow in the narrow dtypes.
            b_val = numpy.random.randint(0, 5, size=10).astype(d_base)
            e_val = numpy.random.randint(0, 3, size=10).astype(d_exp)
            # The compiled output must match numpy's reference result.
            assert_allclose(fn(b_val, e_val), b_val ** e_val)
class test_GpuDimShuffle(test_DimShuffle):
op = GpuDimShuffle
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论