Commit 9067f773 authored by Frederic Bastien

Add more float16 ops. Test to refactor.

Parent 81245414
...@@ -403,6 +403,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): ...@@ -403,6 +403,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
""" """
AdvancedSubtensor1 on the GPU. AdvancedSubtensor1 on the GPU.
""" """
_f16_ok = True
def make_node(self, x, ilist): def make_node(self, x, ilist):
ctx_name = infer_context_name(x, ilist) ctx_name = infer_context_name(x, ilist)
x_ = as_gpuarray_variable(x, ctx_name) x_ = as_gpuarray_variable(x, ctx_name)
...@@ -1088,6 +1089,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1088,6 +1089,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
class GpuExtractDiag(Op): class GpuExtractDiag(Op):
__props__ = ("offset", "axis1", "axis2", "view") __props__ = ("offset", "axis1", "axis2", "view")
_f16_ok = True
def __init__(self, offset=0, axis1=0, axis2=1, view=False): def __init__(self, offset=0, axis1=0, axis2=1, view=False):
self.view = view self.view = view
......
...@@ -49,6 +49,33 @@ class G_subtensor(test_subtensor.T_subtensor): ...@@ -49,6 +49,33 @@ class G_subtensor(test_subtensor.T_subtensor):
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
class G_subtensorF16(test_subtensor.T_subtensor):
    """Run the generic subtensor test suite on the GPU with float16 data.

    Same coverage as ``G_subtensor`` but with ``dtype='float16'`` so the
    half-precision code paths of the GPU subtensor ops are exercised.
    """

    def shortDescription(self):
        # Return None so the test runner shows the test name,
        # not the docstring.
        return None

    def __init__(self, name):
        def shared_on_gpu(value, **kwargs):
            # Force shared variables onto the test GPU context.
            return gpuarray_shared_constructor(value, target=test_ctx_name,
                                               **kwargs)

        test_subtensor.T_subtensor.__init__(
            self, name,
            shared=shared_on_gpu,
            sub=GpuSubtensor,
            inc_sub=GpuIncSubtensor,
            adv_sub1=GpuAdvancedSubtensor1,
            adv_incsub1=GpuAdvancedIncSubtensor1,
            dimshuffle=GpuDimShuffle,
            mode=mode_with_gpu,
            # avoid errors with limited devices
            dtype='float16',  # TODO(review): should this follow floatX?
            ignore_topo=(HostFromGpu, GpuFromHost,
                         DeepCopyOp, GpuContiguous))
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
def test_advinc_subtensor1(): def test_advinc_subtensor1():
# Test the second case in the opt local_gpu_advanced_incsubtensor1 # Test the second case in the opt local_gpu_advanced_incsubtensor1
for shp in [(3, 3), (3, 3, 3)]: for shp in [(3, 3), (3, 3, 3)]:
...@@ -73,7 +100,9 @@ def test_advinc_subtensor1(): ...@@ -73,7 +100,9 @@ def test_advinc_subtensor1():
def test_advinc_subtensor1_dtype(): def test_advinc_subtensor1_dtype():
# Test the mixed dtype case # Test the mixed dtype case
shp = (3, 4) shp = (3, 4)
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64')]: for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'),
('float16', 'int8'), ('float16', 'float64'),
('float16', 'float16')]:
shared = gpuarray_shared_constructor shared = gpuarray_shared_constructor
xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1 xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
yval = np.empty((2,) + shp[1:], dtype=dtype2) yval = np.empty((2,) + shp[1:], dtype=dtype2)
...@@ -95,7 +124,9 @@ def test_advinc_subtensor1_dtype(): ...@@ -95,7 +124,9 @@ def test_advinc_subtensor1_dtype():
def test_advinc_subtensor1_vector_scalar(): def test_advinc_subtensor1_vector_scalar():
# Test the case where x is a vector and y a scalar # Test the case where x is a vector and y a scalar
shp = (3,) shp = (3,)
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64')]: for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'),
('float16', 'int8'), ('float16', 'float64'),
('float16', 'float16')]:
shared = gpuarray_shared_constructor shared = gpuarray_shared_constructor
xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1 xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
yval = np.asarray(10, dtype=dtype2) yval = np.asarray(10, dtype=dtype2)
...@@ -169,7 +200,26 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor): ...@@ -169,7 +200,26 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
sub=GpuAdvancedSubtensor, sub=GpuAdvancedSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', dtype='float32', # floatX?
ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
# GPU opt can't run in fast_compile only.
self.fast_compile = False
assert self.sub == GpuAdvancedSubtensor
class G_advancedsubtensorF16(test_subtensor.TestAdvancedSubtensor):
def shortDescription(self):
return None
def __init__(self, name):
test_subtensor.TestAdvancedSubtensor.__init__(
self, name,
shared=gpuarray_shared_constructor,
sub=GpuAdvancedSubtensor,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float16', # floatX?
ignore_topo=(HostFromGpu, GpuFromHost, ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp)) DeepCopyOp))
# GPU opt can't run in fast_compile only. # GPU opt can't run in fast_compile only.
...@@ -218,6 +268,17 @@ class test_gpuextractdiag(unittest.TestCase): ...@@ -218,6 +268,17 @@ class test_gpuextractdiag(unittest.TestCase):
GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}), GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
np_x.diagonal(offset, axis1, axis2)) np_x.diagonal(offset, axis1, axis2))
def test_tensor_float16(self):
x = tensor.tensor4()
np_x = np.arange(30107).reshape(7, 11, 17, 23).astype('float16')
for offset, axis1, axis2 in [
(1, 0, 1), (-1, 0, 1), (0, 1, 0), (-2, 1, 0),
(-3, 1, 0), (-2, 2, 0), (3, 3, 0), (-1, 3, 2),
(2, 2, 3), (-1, 2, 1), (1, 3, 1), (-1, 1, 3)]:
assert np.allclose(
GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
np_x.diagonal(offset, axis1, axis2))
class test_gpuallocdiag(unittest.TestCase): class test_gpuallocdiag(unittest.TestCase):
def test_matrix(self): def test_matrix(self):
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment