提交 7af47dd8 authored 作者: abergeron's avatar abergeron

Merge pull request #1925 from nouiz/gpuarray

GpuContiguous, tests, better opencl support
......@@ -5,6 +5,7 @@ import numpy
import theano
from theano import Op, Apply
from theano import tensor, scalar, config
from theano.gradient import grad_undefined
from theano.scalar import Scalar
from theano.tensor.basic import Alloc, Join, Split
......@@ -516,7 +517,7 @@ class CudaFromGpu(Op):
return [gpu_from_cuda(gz)]
def R_op(self, inputs, eval_points):
from theano.sandbox.cuda import CudaNdArrayType
from theano.sandbox.cuda import CudaNdarrayType
ev, = eval_points
if (isinstance(ev, CudaNdarrayType)):
return [gpu_from_cuda(ev)]
......@@ -750,6 +751,73 @@ class GpuAlloc(HideC, Alloc):
gpu_alloc = GpuAlloc()
class GpuContiguous(Op):
    """
    Always return a c contiguous output. Copy the input only if it is
    not already c contiguous.
    """
    # When the input is already contiguous, the generated C code returns
    # the input itself, so output 0 may be a view of input 0.
    view_map = {0: [0]}

    def __eq__(self, other):
        # The op has no parameters: all instances compare equal by type.
        return type(self) == type(other)

    def __hash__(self):
        # Must stay consistent with __eq__ (type-only equality).
        return hash(type(self))

    def grad(self, inputs, dout):
        """Gradient is the identity: forward the output gradient,
        moved onto the gpuarray type."""
        x, = inputs
        dout, = dout
        dout = as_gpuarray_variable(dout)
        return [dout]

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, input):
        # Output has exactly the same type (dtype, broadcastable pattern)
        # as the (gpuarray-converted) input.
        input = as_gpuarray_variable(input)
        return Apply(self, [input], [input.type()])

    def c_headers(self):
        return ['<numpy_compat.h>']

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inp, out, sub):
        """Generate C code that:
        - returns the input unchanged (as a view) when it is already
          C contiguous;
        - otherwise, if the pre-existing output is missing, has wrong
          dimensions, or is itself non-contiguous, allocates a fresh
          C-ordered copy with pygpu_copy;
        - otherwise reuses the existing output buffer via pygpu_move.
        """
        input, = inp
        z, = out
        fail = sub['fail']
        # NOTE: `str` shadows the builtin; kept as-is to preserve the
        # original code verbatim.
        str = """
        {
            if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))){
                Py_XDECREF(%(z)s);
                %(z)s = %(input)s;
                Py_INCREF(%(z)s);

            } else if ((NULL == %(z)s)""" % locals()
        # One dimension-mismatch test per input dimension: any mismatch
        # forces reallocation of the output below.
        for i in xrange(len(node.inputs[0].type.broadcastable)):
            str += "\n|| (PyGpuArray_DIMS(%(input)s)[%(i)s] != PyGpuArray_DIMS(%(z)s)[%(i)s])" % locals()
        str += """
                || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = pygpu_copy(%(input)s, GA_C_ORDER);
                if (!%(z)s)
                {
                    %(fail)s;
                }
            }else if(pygpu_move(%(z)s, %(input)s) == -1) {
                %(fail)s;
            }
        }
        """ % locals()
        return str


gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, tensor.Reshape):
"""
Implement Reshape on the gpu.
......@@ -769,7 +837,6 @@ class GpuReshape(HideC, tensor.Reshape):
raise ValueError('shape argument to GpuReshape.perform'
' has incorrect length %i'
', should be %i' % (len(shp), self.ndim), shp)
s = shp.prod()
if shp.prod() != x.size:
# We need to do check here to raise the same error as NumPy.
......@@ -872,7 +939,8 @@ class GpuEye(GpuKernelBase, Op):
return [out_shape]
def grad(self, inp, grads):
return [grad_undefined(self, i, inp[i]) for i in xrange(3)]
return [grad_undefined(self, i, inp[i])
for i in xrange(3)]
def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype
......
......@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import (
GpuSoftmaxWithBias, GpuSoftmax
)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda)
GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
......@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node):
def local_gpua_careduce(node):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device
if dev.startswith('opencl'):
op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code.
return
else:
op = GpuCAReduceCuda
x, = node.inputs
greduce = GpuCAReduceCuda(
greduce = op(
node.op.scalar_op, axis=node.op.axis,
dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None))
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
# be None
if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
return greduce
else:
# Try to make a simpler pattern based on reshaping
......@@ -407,7 +418,7 @@ def local_gpua_careduce(node):
for idx, m in enumerate(new_mask):
if m == 1:
new_axis.append(idx)
greduce = GpuCAReduceCuda(
greduce = op(
node.op.scalar_op,
axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None),
......
......@@ -42,7 +42,8 @@ from theano.sandbox.gpuarray.basic_ops import (
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye)
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt
utt.seed_rng()
......@@ -73,6 +74,7 @@ def may_fail(msg, EClass):
return wrapper
return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None):
if mode is None:
......@@ -93,6 +95,7 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
except TypeError:
continue
def rand_gpuarray(*shape, **kwargs):
r = rng.rand(*shape) * 2 - 1
dtype = kwargs.pop('dtype', theano.config.floatX)
......@@ -208,10 +211,10 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
def test_transfer_cpu_gpu():
a = T.fmatrix('a')
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
f = theano.function([a], gpu_from_host(a))
fv = f(av)
assert GpuArrayType.values_eq(fv, gv)
......@@ -231,8 +234,8 @@ def test_transfer_strided():
av = numpy.asarray(rng.rand(5, 8), dtype='float32')
gv = gpuarray.array(av)
av = av[:,::2]
gv = gv[:,::2]
av = av[:, ::2]
gv = gv[:, ::2]
f = theano.function([a], gpu_from_host(a))
fv = f(av)
......@@ -247,7 +250,7 @@ def test_transfer_strided():
"that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c')
......@@ -255,8 +258,8 @@ def test_transfer_cuda_gpu():
av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:,::-2]
cvs = cv[:,::-2]
gvs = gv[:, ::-2]
cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c))
fv = f(cv)
......@@ -324,6 +327,19 @@ def test_shape():
assert isinstance(topo[0].op, T.Shape)
def test_gpu_contiguous():
    """gpu_contiguous must yield a C-contiguous result even when fed a
    strided (non-contiguous) subtensor, and the slicing must happen on
    the GPU."""
    a = T.fmatrix('a')
    i = T.iscalar('i')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    f = theano.function([a, i], gpu_contiguous(a[::i]),
                        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    # The slice must be computed by the GPU subtensor op, otherwise this
    # test exercises nothing GPU-related.
    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
    # i=1: input already contiguous (view path).
    assert f(a_val, 1).flags.c_contiguous
    # i=2: strided input, a contiguous copy must be made.
    # (The original repeated this assertion twice; the exact duplicate
    # line was removed.)
    assert f(a_val, 2).flags.c_contiguous
class G_reshape(T_reshape):
def shortDescription(self):
return None
......@@ -335,11 +351,11 @@ class G_reshape(T_reshape):
mode=mode_with_gpu,
# avoid errors with limited devices
# dtype='float32',
ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp,
theano.sandbox.gpuarray.elemwise.GpuElemwise,
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp,
theano.sandbox.gpuarray.elemwise.GpuElemwise,
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
assert self.op == GpuReshape
......@@ -429,7 +445,8 @@ def test_hostfromgpu_shape_i():
"""
m = mode_with_gpu.including('local_dot_to_dot22',
'local_dot22_to_dot22scalar','specialize')
'local_dot22_to_dot22scalar',
'specialize')
a = T.fmatrix('a')
ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
......
import theano
from theano import scalar, gof
from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce, T_reduce_dtype)
......@@ -19,17 +21,32 @@ class test_gpu_Broadcast(test_Broadcast):
type = GpuArrayType
cop = GpuElemwise
ctype = GpuArrayType
# The order is important
linkers = [gof.PerformLinker, gof.CLinker]
def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
self.linkers = [gof.PerformLinker]
def rand_val(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray))
# no c_code() yet
#cop = GpuElemwise
#ctype = GpuArrayType
def rand_cval(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray))
def test_c(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c()
def test_c_inplace(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c_inplace()
class test_GpuDimShuffle(test_DimShuffle):
op = GpuDimShuffle
......@@ -149,7 +166,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]),
]
]
op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum]
......@@ -161,6 +178,12 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
def test_perform_nan(self):
return
def setUp(self):
super(test_GpuCAReduceCuda, self).setUp()
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
class T_gpureduce_dtype(T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce')
......@@ -172,6 +195,11 @@ class T_gpureduce_dtype(T_reduce_dtype):
'uint8', 'uint16', 'uint32', 'uint64',
'float32', 'float64']
def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
def speed_reduce10():
import numpy
......
......@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (
GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise
from theano.sandbox.gpuarray.elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu
)
......@@ -50,17 +51,26 @@ def test_flatten():
def test_reduce():
for method in ['sum', 'prod', 'max', 'min']:
dev = theano.sandbox.gpuarray.init_dev.device
for method, param in [('sum', dict(acc_dtype='float32')),
('prod', dict(acc_dtype='float32')),
('max', {}), ('min', {})]:
m = theano.tensor.fmatrix()
f = theano.function([m], getattr(m, method)(axis=0),
f = theano.function([m], getattr(m, method)(axis=0,
**param),
mode=mode_with_gpu)
val = numpy.random.rand(10, 11).astype("float32")
res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0))
assert res.shape == (11,)
topo = f.maker.fgraph.toposort()
assert GpuCAReduceCuda in [type(node.op)
for node in topo], topo
ops = [type(node.op) for node in topo]
if dev.startswith('opencl') and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
def test_local_gpualloc_memset_0():
......
......@@ -33,3 +33,10 @@ def test_values_eq_approx():
b = a.copy()
b[0] = -numpy.asarray(b[0])
assert not GpuArrayType.values_eq_approx(a, b)
def test_specify_shape():
    """specify_shape must accept a GpuArray input whose runtime shape
    matches the declared shape, without raising."""
    vec = rand_gpuarray(20, dtype='float32')
    gpu_var = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
    checked = theano.tensor.specify_shape(gpu_var, [20])
    fn = theano.function([gpu_var], checked)
    fn(vec)
......@@ -19,6 +19,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
from theano.tests import unittest_tools
import math
def FunctionGraph(i, o):
e = gof.FunctionGraph(i, o)
return e
......@@ -46,8 +47,8 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
#test that DimShuffle.infer_shape work correctly
x = TensorType('float64', ib)('x')
e = self.op(ib, shuffle)(x)
f = copy(linker).accept(FunctionGraph([x], [e.
shape])).make_function()
f = copy(linker).accept(FunctionGraph([x],
[e.shape])).make_function()
assert all(f(numpy.ones(xsh))) == all(zsh)
# Test when we drop a axis that is not broadcastable
......@@ -100,44 +101,52 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
y = x.dimshuffle(('x',) * (numpy.MAXDIMS + 1))
self.assertRaises(ValueError, y.eval, {x: 0})
class test_reduce_axes(unittest.TestCase):
def test_sum_axes(self):
axes = [None, 0, 1, [0, 1], numpy.array(1), [numpy.array(0), numpy.array(1)]]
axes = [None, 0, 1, [0, 1], numpy.array(1),
[numpy.array(0), numpy.array(1)]]
for a in axes:
x = tensor.matrix()
m = x.sum(a)
def test_mean_axes(self):
axes = [None, 0, 1, [0, 1], numpy.array(1), [numpy.array(0), numpy.array(1)]]
axes = [None, 0, 1, [0, 1], numpy.array(1),
[numpy.array(0), numpy.array(1)]]
for a in axes:
x = tensor.matrix()
m = x.mean(a)
def test_max_axes(self):
axes = [None, 0, 1, [0, 1], numpy.array(1), [numpy.array(0), numpy.array(1)]]
axes = [None, 0, 1, [0, 1], numpy.array(1),
[numpy.array(0), numpy.array(1)]]
for a in axes:
x = tensor.matrix()
m = x.max(a)
def test_min_axes(self):
axes = [None, 0, 1, [0, 1], numpy.array(1), [numpy.array(0), numpy.array(1)]]
axes = [None, 0, 1, [0, 1], numpy.array(1),
[numpy.array(0), numpy.array(1)]]
for a in axes:
x = tensor.matrix()
m = x.min(a)
def test_argmax_axes(self):
axes = [None, 0, 1, [0, 1], numpy.array(1), [numpy.array(0), numpy.array(1)]]
axes = [None, 0, 1, [0, 1], numpy.array(1),
[numpy.array(0), numpy.array(1)]]
for a in axes:
x = tensor.matrix()
m = x.argmax(a)
def test_var_axes(self):
axes = [None, 0, 1, [0, 1], numpy.array(1), [numpy.array(0), numpy.array(1)]]
axes = [None, 0, 1, [0, 1], numpy.array(1),
[numpy.array(0), numpy.array(1)]]
for a in axes:
x = tensor.matrix()
m = x.var(a)
class test_Broadcast(unittest.TestCase):
# this is to allow other types to reuse this class to test their ops
type = TensorType
......@@ -149,6 +158,9 @@ class test_Broadcast(unittest.TestCase):
openmp_minsize = 2*config.openmp_elemwise_minsize
openmp_minsize_sqrt = math.ceil(math.sqrt(openmp_minsize))
# The order is important if you change them.
linkers = [gof.PerformLinker, gof.CLinker]
def rand_val(self, shp):
return numpy.asarray(numpy.random.rand(*shp))
......@@ -165,7 +177,10 @@ class test_Broadcast(unittest.TestCase):
((1, 5), (5, 1)),
((1, 1), (1, 1)),
((self.openmp_minsize,), (self.openmp_minsize,)),
((self.openmp_minsize_sqrt, self.openmp_minsize_sqrt), (self.openmp_minsize_sqrt, self.openmp_minsize_sqrt)),
((self.openmp_minsize_sqrt,
self.openmp_minsize_sqrt),
(self.openmp_minsize_sqrt,
self.openmp_minsize_sqrt)),
((2, 3, 4, 5), (2, 3, 4, 5)),
((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)),
......@@ -186,8 +201,8 @@ class test_Broadcast(unittest.TestCase):
x = type('float64', [(entry == 1) for entry in xsh])('x')
y = type('float64', [(entry == 1) for entry in ysh])('y')
e = op(scalar.add)(x, y)
f = copy(linker).accept(FunctionGraph([x,
y], [e.shape])).make_function()
f = copy(linker).accept(FunctionGraph(
[x, y], [e.shape])).make_function()
assert tuple(f(xv, yv)) == tuple(zv.shape)
def with_linker_inplace(self, linker, op, type, rand_val):
......@@ -216,8 +231,8 @@ class test_Broadcast(unittest.TestCase):
x = type('float64', [(entry == 1) for entry in xsh])('x')
y = type('float64', [(entry == 1) for entry in ysh])('y')
e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
f = copy(linker).accept(FunctionGraph([x,
y], [e.shape])).make_function()
f = copy(linker).accept(FunctionGraph(
[x, y], [e.shape])).make_function()
xv = rand_val(xsh)
yv = rand_val(ysh)
zv = xv + yv
......@@ -250,12 +265,13 @@ class test_Broadcast(unittest.TestCase):
raise SkipTest("G++ not available, so we need to skip this test.")
x = self.ctype('float64', [0, 0])('x')
y = self.ctype('float64', [1, 1])('y')
e = self.cop(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function()
xv = self.rand_cval((5, 5))
yv = self.rand_cval((1, 1))
f(xv, yv)
assert (xv == yv).all()
for linker, op in zip(self.linkers, [self.op, self.cop]):
e = op(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
f = linker().accept(FunctionGraph([x, y], [e])).make_function()
xv = self.rand_cval((5, 5))
yv = self.rand_cval((1, 1))
f(xv, yv)
assert (xv == yv).all()
def test_fill_var(self):
x = tensor.matrix()
......@@ -274,22 +290,24 @@ class test_Broadcast(unittest.TestCase):
raise SkipTest("G++ not available, so we need to skip this test.")
x = self.ctype('float64', [0, 0, 0, 0, 0])('x')
y = self.ctype('float64', [0, 0, 0, 0, 0])('y')
e = self.cop(scalar.add)(x, y)
f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function()
xv = self.rand_cval((2, 2, 2, 2, 2))
yv = self.rand_cval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
zv = xv + yv
assert (f(xv, yv) == zv).all()
for linker, op in zip(self.linkers, [self.op, self.cop]):
e = op(scalar.add)(x, y)
f = linker().accept(FunctionGraph([x, y], [e])).make_function()
xv = self.rand_cval((2, 2, 2, 2, 2))
yv = self.rand_cval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
zv = xv + yv
assert (f(xv, yv) == zv).all()
def test_same_inputs(self):
if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.")
x = self.ctype('float64', [0, 0])('x')
e = self.cop(scalar.add)(x, x)
f = gof.CLinker().accept(FunctionGraph([x], [e])).make_function()
xv = self.rand_cval((2, 2))
zv = xv + xv
assert (f(xv) == zv).all()
for linker, op in zip(self.linkers, [self.op, self.cop]):
e = op(scalar.add)(x, x)
f = linker().accept(FunctionGraph([x], [e])).make_function()
xv = self.rand_cval((2, 2))
zv = xv + xv
assert (f(xv) == zv).all()
class test_CAReduce(unittest_tools.InferShapeTester):
......@@ -309,7 +327,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
((5, 0), ()),
((), None),
((), ())
]
]
def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
pre_scalar_op=None,
......@@ -429,7 +447,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
try:
f_xv = f(xv)
self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
self.assertTrue(numpy.allclose(f_xv, zv), (f_xv, zv, xsh, tosum))
self.assertTrue(numpy.allclose(f_xv, zv),
(f_xv, zv, xsh, tosum))
except NotImplementedError:
# GpuCAReduce don't implement all cases when size is 0
assert xv.size == 0
......@@ -553,7 +572,7 @@ class test_Prod(unittest.TestCase):
# including zeros, as the case with zeros is important
# (and special cases: 1 zero in the row, more than 1 zero in the row)
x_val = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
dtype='float32')
dtype='float32')
# now with verify_grad
unittest_tools.verify_grad(Prod(axis=1), [x_val], mode=self.mode)
......@@ -568,7 +587,7 @@ class test_Prod(unittest.TestCase):
# including zeros, as the case with zeros is important
# (and special cases: 1 zero in the row, more than 1 zero in the row)
x_val = numpy.asarray([[1., 2., 3.], [0., 5., 6.], [0., 0., 9.]],
dtype='float32')
dtype='float32')
x = theano.tensor.dmatrix()
# sanity check
......@@ -760,7 +779,8 @@ class T_reduce_dtype(unittest.TestCase):
).get(dtype, dtype)
f = theano.function([x], s, mode=self.mode)
topo = f.maker.fgraph.toposort()
assert [n for n in topo if isinstance(n.op, self.op)], (topo, dtype)
assert [n for n in topo if isinstance(n.op, self.op)], (topo,
dtype)
data = numpy.random.rand(3, 4) * 10
data = data.astype(dtype)
f(data)
......@@ -785,7 +805,8 @@ class T_reduce_dtype(unittest.TestCase):
).get(dtype, dtype)
f = theano.function([x], s, mode=self.mode)
topo = f.maker.fgraph.toposort()
assert [n for n in topo if isinstance(n.op, self.op)], (topo, dtype)
assert [n for n in topo if isinstance(n.op, self.op)], (topo,
dtype)
data = numpy.random.rand(3, 4) * 10
data = data.astype(dtype)
f(data)
......@@ -814,7 +835,8 @@ class T_reduce_dtype(unittest.TestCase):
f = theano.function([x], var, mode=self.mode)
topo = f.maker.fgraph.toposort()
assert [n for n in topo if isinstance(n.op, self.op)], (topo, dtype)
assert [n for n in topo if isinstance(n.op, self.op)], (topo,
dtype)
data = numpy.random.rand(3, 4) * 10
data = data.astype(input_dtype)
f(data)
......@@ -850,7 +872,8 @@ class T_reduce_dtype(unittest.TestCase):
(input_dtype in tensor.discrete_dtypes and
acc_dtype in tensor.continuous_dtypes)
):
var = getattr(x, method)(acc_dtype=acc_dtype, axis=axis)
var = getattr(x, method)(acc_dtype=acc_dtype,
axis=axis)
assert var.owner.op.acc_dtype == acc_dtype
if "complex" in input_dtype:
......@@ -873,10 +896,12 @@ class T_reduce_dtype(unittest.TestCase):
s = getattr(x, method)()
f = theano.function([], s, mode=self.mode)
topo = f.maker.fgraph.toposort()
assert [n for n in topo if isinstance(n.op, self.op)], (topo, dtype)
assert [n for n in topo if isinstance(n.op, self.op)], (topo,
dtype)
s_val = f()
# Use extra precision in NumPy to compute the good answer.
ret = getattr(numpy.asarray([1e8, 1, -1e8], dtype='float64'), method)()
ret = getattr(numpy.asarray([1e8, 1, -1e8], dtype='float64'),
method)()
assert numpy.allclose(s_val, ret), (s_val, ret)
......@@ -922,10 +947,10 @@ class T_mean_dtype(unittest.TestCase):
# Executed if no TypeError was raised
if sum_dtype in tensor.discrete_dtypes and axis != []:
assert mean_var.dtype == 'float64', (
(mean_var.dtype, sum_dtype))
(mean_var.dtype, sum_dtype))
else:
assert mean_var.dtype == sum_dtype, (
(mean_var.dtype, sum_dtype))
(mean_var.dtype, sum_dtype))
if (('complex' in input_dtype or
'complex' in sum_dtype) and
input_dtype != sum_dtype):
......@@ -970,13 +995,13 @@ class T_prod_without_zeros_dtype(unittest.TestCase):
axis = axes[idx % len(axes)]
x = ProdWithoutZeros(axis=axis)(tensor.matrix(dtype=dtype))
assert x.dtype == dict(
int8='int64',
int16='int64',
int32='int64',
uint8='uint64',
uint16='uint64',
uint32='uint64',
).get(dtype, dtype)
int8='int64',
int16='int64',
int32='int64',
uint8='uint64',
uint16='uint64',
uint32='uint64',
).get(dtype, dtype)
def test_prod_without_zeros_default_acc_dtype(self):
"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论