Remove the special code that would move float16 dot to the GPU, since
we do it the normal way now.

Remove the special code that would move float16 dot to the gpu since
78aa6276 · Arnaud Bergeron · d3cb54fa · 78aa6276 · 78aa6276
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1134,27 +1134,6 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs):
    return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)


-@register_opt('fast_compile')
-@op_lifter([tensor.basic.Dot])
-@register_opt2([tensor.basic.Dot], 'fast_compile')
-def local_gpua_hgemm(op, context_name, inputs, outputs):
-    from theano.sandbox.cuda import nvcc_compiler
-    if nvcc_compiler.nvcc_version < '7.5':
-        _logger.warning("Not performing dot of float16 on the GPU since "
-                        "cuda 7.5 is not available. Updating could speed up "
-                        "your code.")
-        return
-    A = inputs[0]
-    B = inputs[1]
-    if (A.ndim == 2 and B.ndim == 2 and
-            A.dtype == 'float16' and B.dtype == 'float16'):
-        fgraph = outputs[0].fgraph
-        C = gpu_alloc_empty(context_name, dtype='float16')(
-            shape_i(A, 0, fgraph),
-            shape_i(B, 1, fgraph))
-        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
-
-
 @register_opt()
 @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
 def local_gpua_gemm_alpha_merge(node, *inputs):

--- a/theano/gpuarray/tests/test_blas.py
+++ b/theano/gpuarray/tests/test_blas.py
@@ -130,52 +130,3 @@ GpuDot22Tester = makeTester(
        # test9=[rand(0, 0), rand(0, 0)],
    )
 )
-
-
-def test_hgemm_swap():
-    from theano.sandbox.cuda import nvcc_compiler
-    if nvcc_compiler.nvcc_version < '7.5':
-        raise SkipTest("SgemmEx is only avaialble on cuda 7.5+")
-
-    v = tensor.vector(dtype='float16')
-    m = tensor.matrix(dtype='float16')
-    m2 = tensor.matrix(dtype='float16')
-    m32 = tensor.matrix(dtype='float32')
-
-    # test that we don't try to replace anything but matrix x matrix in float16
-    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, GpuGemm)]) == 0
-
-    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, GpuGemm)]) == 0
-
-    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, GpuGemm)]) == 1
-
-    v1 = numpy.random.random((3, 4)).astype('float16')
-    v2 = numpy.random.random((4, 2)).astype('float16')
-
-    of = f(v1, v2)
-    on = numpy.dot(v1, v2)
-
-    utt.assert_allclose(of, on)
-
-
-def test_hgemm_alpha_output_merge():
-    from theano.sandbox.cuda import nvcc_compiler
-    if nvcc_compiler.nvcc_version < '7.5':
-        raise SkipTest("SgemmEx is only avaialble on cuda 7.5+")
-
-    m1 = tensor.matrix(dtype='float16')
-    m2 = tensor.matrix(dtype='float16')
-
-    b = tensor.matrix(dtype='float16')
-
-    hgemm = numpy.asarray(0.05, dtype='float16') * (tensor.dot(m1, m2) + b)
-
-    f = theano.function([m1, m2, b], hgemm, mode=mode_with_gpu)
-    # there should be 3 gpu_from_host, 1 hgemm and 1 host_from_gpu
-    assert len(f.maker.fgraph.apply_nodes) == 5