Commit 7af47dd8 authored by abergeron

Merge pull request #1925 from nouiz/gpuarray

GpuContiguous, tests, better opencl support
...@@ -5,6 +5,7 @@ import numpy ...@@ -5,6 +5,7 @@ import numpy
import theano import theano
from theano import Op, Apply from theano import Op, Apply
from theano import tensor, scalar, config from theano import tensor, scalar, config
from theano.gradient import grad_undefined
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.basic import Alloc, Join, Split from theano.tensor.basic import Alloc, Join, Split
...@@ -516,7 +517,7 @@ class CudaFromGpu(Op): ...@@ -516,7 +517,7 @@ class CudaFromGpu(Op):
return [gpu_from_cuda(gz)] return [gpu_from_cuda(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
from theano.sandbox.cuda import CudaNdArrayType from theano.sandbox.cuda import CudaNdarrayType
ev, = eval_points ev, = eval_points
if (isinstance(ev, CudaNdarrayType)): if (isinstance(ev, CudaNdarrayType)):
return [gpu_from_cuda(ev)] return [gpu_from_cuda(ev)]
...@@ -750,6 +751,73 @@ class GpuAlloc(HideC, Alloc): ...@@ -750,6 +751,73 @@ class GpuAlloc(HideC, Alloc):
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class GpuContiguous(Op):
    """
    Return a C-contiguous version of its GPU input.

    If the input is already C contiguous it is returned as-is (this op
    declares a view on its input); otherwise a C-ordered copy is made
    on the GPU via ``pygpu_copy``.
    """
    # Output 0 may alias input 0 (the already-contiguous fast path).
    view_map = {0: [0]}

    def __eq__(self, other):
        # The op has no parameters: all instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def grad(self, inputs, dout):
        """The op is at most a copy, so the gradient passes through
        unchanged; it is only moved to the GPU if needed."""
        x, = inputs
        gz, = dout
        gz = as_gpuarray_variable(gz)
        return [gz]

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, input):
        input = as_gpuarray_variable(input)
        # Same type as the input: dtype and broadcast pattern are kept.
        return Apply(self, [input], [input.type()])

    def c_headers(self):
        return ['<numpy_compat.h>']

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inp, out, sub):
        input, = inp
        z, = out
        fail = sub['fail']
        # Local renamed from `str` to avoid shadowing the builtin.
        code = """
        {
            if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))){
                Py_XDECREF(%(z)s);
                %(z)s = %(input)s;
                Py_INCREF(%(z)s);

            } else if ((NULL == %(z)s)""" % locals()
        # Reuse the output buffer only if it has the right shape and is
        # itself C contiguous; otherwise allocate a fresh copy.
        for i in xrange(len(node.inputs[0].type.broadcastable)):
            code += "\n|| (PyGpuArray_DIMS(%(input)s)[%(i)s] != PyGpuArray_DIMS(%(z)s)[%(i)s])" % locals()
        code += """
                || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = pygpu_copy(%(input)s, GA_C_ORDER);
                if (!%(z)s)
                {
                    %(fail)s;
                }
            }else if(pygpu_move(%(z)s, %(input)s) == -1) {
                %(fail)s;
            }
        }
        """ % locals()
        return code


gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, tensor.Reshape): class GpuReshape(HideC, tensor.Reshape):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
...@@ -769,7 +837,6 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -769,7 +837,6 @@ class GpuReshape(HideC, tensor.Reshape):
raise ValueError('shape argument to GpuReshape.perform' raise ValueError('shape argument to GpuReshape.perform'
' has incorrect length %i' ' has incorrect length %i'
', should be %i' % (len(shp), self.ndim), shp) ', should be %i' % (len(shp), self.ndim), shp)
s = shp.prod()
if shp.prod() != x.size: if shp.prod() != x.size:
# We need to do check here to raise the same error as NumPy. # We need to do check here to raise the same error as NumPy.
...@@ -872,7 +939,8 @@ class GpuEye(GpuKernelBase, Op): ...@@ -872,7 +939,8 @@ class GpuEye(GpuKernelBase, Op):
return [out_shape] return [out_shape]
def grad(self, inp, grads): def grad(self, inp, grads):
return [grad_undefined(self, i, inp[i]) for i in xrange(3)] return [grad_undefined(self, i, inp[i])
for i in xrange(3)]
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype return type(self) == type(other) and self.dtype == other.dtype
......
...@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import ( ...@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import (
GpuSoftmaxWithBias, GpuSoftmax GpuSoftmaxWithBias, GpuSoftmax
) )
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda) GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor, from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node):
def local_gpua_careduce(node): def local_gpua_careduce(node):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device
if dev.startswith('opencl'):
op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code.
return
else:
op = GpuCAReduceCuda
x, = node.inputs x, = node.inputs
greduce = GpuCAReduceCuda(
greduce = op(
node.op.scalar_op, axis=node.op.axis, node.op.scalar_op, axis=node.op.axis,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if gvar.owner.op.supports_c_code([gpu_from_host(x)]): if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
return greduce return greduce
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
...@@ -407,7 +418,7 @@ def local_gpua_careduce(node): ...@@ -407,7 +418,7 @@ def local_gpua_careduce(node):
for idx, m in enumerate(new_mask): for idx, m in enumerate(new_mask):
if m == 1: if m == 1:
new_axis.append(idx) new_axis.append(idx)
greduce = GpuCAReduceCuda( greduce = op(
node.op.scalar_op, node.op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
......
...@@ -42,7 +42,8 @@ from theano.sandbox.gpuarray.basic_ops import ( ...@@ -42,7 +42,8 @@ from theano.sandbox.gpuarray.basic_ops import (
gpu_from_cuda, gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape, GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye) gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
...@@ -73,6 +74,7 @@ def may_fail(msg, EClass): ...@@ -73,6 +74,7 @@ def may_fail(msg, EClass):
return wrapper return wrapper
return test_decorator return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False, def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None): on_unused_input='raise', name=None):
if mode is None: if mode is None:
...@@ -93,6 +95,7 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs): ...@@ -93,6 +95,7 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
except TypeError: except TypeError:
continue continue
def rand_gpuarray(*shape, **kwargs): def rand_gpuarray(*shape, **kwargs):
r = rng.rand(*shape) * 2 - 1 r = rng.rand(*shape) * 2 - 1
dtype = kwargs.pop('dtype', theano.config.floatX) dtype = kwargs.pop('dtype', theano.config.floatX)
...@@ -208,10 +211,10 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, ...@@ -208,10 +211,10 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
def test_transfer_cpu_gpu(): def test_transfer_cpu_gpu():
a = T.fmatrix('a') a = T.fmatrix('a')
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g') g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 4), dtype='float32') av = numpy.asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av)
f = theano.function([a], gpu_from_host(a)) f = theano.function([a], gpu_from_host(a))
fv = f(av) fv = f(av)
assert GpuArrayType.values_eq(fv, gv) assert GpuArrayType.values_eq(fv, gv)
...@@ -231,8 +234,8 @@ def test_transfer_strided(): ...@@ -231,8 +234,8 @@ def test_transfer_strided():
av = numpy.asarray(rng.rand(5, 8), dtype='float32') av = numpy.asarray(rng.rand(5, 8), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av)
av = av[:,::2] av = av[:, ::2]
gv = gv[:,::2] gv = gv[:, ::2]
f = theano.function([a], gpu_from_host(a)) f = theano.function([a], gpu_from_host(a))
fv = f(av) fv = f(av)
...@@ -247,7 +250,7 @@ def test_transfer_strided(): ...@@ -247,7 +250,7 @@ def test_transfer_strided():
"that the tests will be run this way", ValueError) "that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu(): def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present") raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g') g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c') c = cuda_ndarray.CudaNdarrayType((False, False))('c')
...@@ -255,8 +258,8 @@ def test_transfer_cuda_gpu(): ...@@ -255,8 +258,8 @@ def test_transfer_cuda_gpu():
av = theano._asarray(rng.rand(5, 4), dtype='float32') av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av) cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:,::-2] gvs = gv[:, ::-2]
cvs = cv[:,::-2] cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c)) f = theano.function([c], gpu_from_cuda(c))
fv = f(cv) fv = f(cv)
...@@ -324,6 +327,19 @@ def test_shape(): ...@@ -324,6 +327,19 @@ def test_shape():
assert isinstance(topo[0].op, T.Shape) assert isinstance(topo[0].op, T.Shape)
def test_gpu_contiguous():
    """gpu_contiguous must yield a C-contiguous result both when the
    input already is contiguous (step 1) and when it is strided
    (step 2)."""
    a = T.fmatrix('a')
    i = T.iscalar('i')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    # a[::i] is strided for i > 1, so both code paths of the op run.
    f = theano.function([a, i], gpu_contiguous(a[::i]),
                        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    # The subtensor must stay on the GPU for the test to be meaningful.
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert f(a_val, 1).flags.c_contiguous
    # The original asserted the i == 2 case twice; once is enough.
    assert f(a_val, 2).flags.c_contiguous
class G_reshape(T_reshape): class G_reshape(T_reshape):
def shortDescription(self): def shortDescription(self):
return None return None
...@@ -335,11 +351,11 @@ class G_reshape(T_reshape): ...@@ -335,11 +351,11 @@ class G_reshape(T_reshape):
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
# dtype='float32', # dtype='float32',
ignore_topo=(HostFromGpu, GpuFromHost, ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp, theano.compile.DeepCopyOp,
theano.sandbox.gpuarray.elemwise.GpuElemwise, theano.sandbox.gpuarray.elemwise.GpuElemwise,
theano.tensor.opt.Shape_i, theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector)) theano.tensor.opt.MakeVector))
assert self.op == GpuReshape assert self.op == GpuReshape
...@@ -429,7 +445,8 @@ def test_hostfromgpu_shape_i(): ...@@ -429,7 +445,8 @@ def test_hostfromgpu_shape_i():
""" """
m = mode_with_gpu.including('local_dot_to_dot22', m = mode_with_gpu.including('local_dot_to_dot22',
'local_dot22_to_dot22scalar','specialize') 'local_dot22_to_dot22scalar',
'specialize')
a = T.fmatrix('a') a = T.fmatrix('a')
ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))() ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32') av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
......
import theano
from theano import scalar, gof from theano import scalar, gof
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle, from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce, T_reduce_dtype) test_CAReduce, T_reduce_dtype)
...@@ -19,17 +21,32 @@ class test_gpu_Broadcast(test_Broadcast): ...@@ -19,17 +21,32 @@ class test_gpu_Broadcast(test_Broadcast):
type = GpuArrayType type = GpuArrayType
cop = GpuElemwise cop = GpuElemwise
ctype = GpuArrayType ctype = GpuArrayType
# The order is important
linkers = [gof.PerformLinker, gof.CLinker]
def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
self.linkers = [gof.PerformLinker]
def rand_val(self, shp): def rand_val(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
# no c_code() yet
#cop = GpuElemwise
#ctype = GpuArrayType
def rand_cval(self, shp): def rand_cval(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
def test_c(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c()
def test_c_inplace(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c_inplace()
class test_GpuDimShuffle(test_DimShuffle): class test_GpuDimShuffle(test_DimShuffle):
op = GpuDimShuffle op = GpuDimShuffle
...@@ -149,7 +166,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -149,7 +166,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001 # ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111 # ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]), # ((5,4,3,10,11),[1,2]),
] ]
op = GpuCAReduceCuda op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul, reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum] scalar.maximum, scalar.minimum]
...@@ -161,6 +178,12 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -161,6 +178,12 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
def test_perform_nan(self): def test_perform_nan(self):
return return
def setUp(self):
super(test_GpuCAReduceCuda, self).setUp()
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
class T_gpureduce_dtype(T_reduce_dtype): class T_gpureduce_dtype(T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce') mode = mode_with_gpu.excluding('local_cut_useless_reduce')
...@@ -172,6 +195,11 @@ class T_gpureduce_dtype(T_reduce_dtype): ...@@ -172,6 +195,11 @@ class T_gpureduce_dtype(T_reduce_dtype):
'uint8', 'uint16', 'uint32', 'uint64', 'uint8', 'uint16', 'uint32', 'uint64',
'float32', 'float64'] 'float32', 'float64']
    def setUp(self):
        # These dtype tests are backed by GpuCAReduceCuda, so they only
        # run on CUDA devices.
        # NOTE(review): unlike test_GpuCAReduceCuda.setUp this does not
        # call the parent's setUp -- confirm T_reduce_dtype needs none.
        dev = theano.sandbox.gpuarray.init_dev.device
        if not dev.startswith('cuda'):
            raise SkipTest("Cuda specific tests")
def speed_reduce10(): def speed_reduce10():
import numpy import numpy
......
...@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray ...@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import ( from theano.sandbox.gpuarray.basic_ops import (
GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu) GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise from theano.sandbox.gpuarray.elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
from theano.sandbox.gpuarray.tests.test_basic_ops import ( from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu rand_gpuarray, mode_with_gpu, mode_without_gpu
) )
...@@ -50,17 +51,26 @@ def test_flatten(): ...@@ -50,17 +51,26 @@ def test_flatten():
def test_reduce(): def test_reduce():
for method in ['sum', 'prod', 'max', 'min']: dev = theano.sandbox.gpuarray.init_dev.device
for method, param in [('sum', dict(acc_dtype='float32')),
('prod', dict(acc_dtype='float32')),
('max', {}), ('min', {})]:
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], getattr(m, method)(axis=0), f = theano.function([m], getattr(m, method)(axis=0,
**param),
mode=mode_with_gpu) mode=mode_with_gpu)
val = numpy.random.rand(10, 11).astype("float32") val = numpy.random.rand(10, 11).astype("float32")
res = f(val) res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0)) utt.assert_allclose(res, getattr(val, method)(axis=0))
assert res.shape == (11,) assert res.shape == (11,)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert GpuCAReduceCuda in [type(node.op) ops = [type(node.op) for node in topo]
for node in topo], topo
if dev.startswith('opencl') and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
def test_local_gpualloc_memset_0(): def test_local_gpualloc_memset_0():
......
...@@ -33,3 +33,10 @@ def test_values_eq_approx(): ...@@ -33,3 +33,10 @@ def test_values_eq_approx():
b = a.copy() b = a.copy()
b[0] = -numpy.asarray(b[0]) b[0] = -numpy.asarray(b[0])
assert not GpuArrayType.values_eq_approx(a, b) assert not GpuArrayType.values_eq_approx(a, b)
def test_specify_shape():
    """specify_shape must accept a GpuArray input of a matching shape
    and return it without error."""
    a = rand_gpuarray(20, dtype='float32')
    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
    f = theano.function([g], theano.tensor.specify_shape(g, [20]))
    # The original discarded the result; check the declared shape holds.
    res = f(a)
    assert tuple(res.shape) == (20,)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论