Commit 75426101, authored by fsavard

Added a few things to support GpuJoin: a ZerosWithPattern method to…

Added a few things to support GpuJoin: a ZerosWithPattern method to cuda_ndarray, and a slight refactoring of Join(Op) to inherit from this class in GpuJoin. And added GpuJoin itself. And a few unit tests for it.
Parent 0e0f4802
...@@ -105,7 +105,8 @@ if cuda_available: ...@@ -105,7 +105,8 @@ if cuda_available:
import basic_ops import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuDimShuffle, GpuSum, GpuReshape,
GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape) GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape,
GpuJoin)
import opt import opt
import cuda_ndarray import cuda_ndarray
......
...@@ -11,6 +11,9 @@ from theano.sandbox.cuda import filter as type_support_filter ...@@ -11,6 +11,9 @@ from theano.sandbox.cuda import filter as type_support_filter
from theano.sandbox.cuda.elemwise import NaiveAlgo from theano.sandbox.cuda.elemwise import NaiveAlgo
import logging, copy import logging, copy
import cuda_ndarray
_logger_name = 'theano.sandbox.cuda.basic_ops' _logger_name = 'theano.sandbox.cuda.basic_ops'
_logger = logging.getLogger(_logger_name) _logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.INFO) _logger.setLevel(logging.INFO)
...@@ -1418,3 +1421,75 @@ class GpuShape(tensor.Shape): ...@@ -1418,3 +1421,75 @@ class GpuShape(tensor.Shape):
return Apply(self, [x], [tensor.lvector()]) return Apply(self, [x], [tensor.lvector()])
gpu_shape = GpuShape() gpu_shape = GpuShape()
class GpuJoin(tensor.Join):
    """GPU version of `tensor.Join`: concatenate CudaNdarrays along an axis.

    Inputs must already be CudaNdarray variables; no host->GPU conversion
    is performed here.
    """

    def make_node(self, *axis_and_tensors):
        """Build the Apply node; every tensor must be of CudaNdarrayType."""
        axis, tensors = axis_and_tensors[0], axis_and_tensors[1:]
        if not tensors:
            raise ValueError('Cannot join an empty list of tensors')
        are_instances = [isinstance(x.type, CudaNdarrayType)
                         for x in tensors]
        assert numpy.all(are_instances)

        # No conversion needed: we just checked everything was a CNDA var.
        as_tensor_variable_args = tensors

        output_maker = \
            lambda bcast: CudaNdarrayType(broadcastable=bcast)()

        return tensor.Join._make_node_internal(self,
                        axis, tensors,
                        as_tensor_variable_args, output_maker)

    def perform(self, node, axis_and_tensors, out_storage):
        # out_storage is the usual single-element output-storage list.
        # Unpacking it here, instead of using a tuple parameter in the
        # signature, keeps the code valid under Python 3 as well.
        out, = out_storage
        axis, cndas = axis_and_tensors[0], axis_and_tensors[1:]

        # Compute the joined width along `axis`, and while we're at it
        # check that all other dimensions of every input agree.
        width_sum = 0
        template_shape = cndas[0].shape
        for cnda in cndas:
            width_sum += cnda.shape[axis]
            tmp_shape = list(cnda.shape)
            # The dimension at `axis` may legitimately differ, so make it
            # equal before comparing.
            tmp_shape[axis] = template_shape[axis]
            if tuple(tmp_shape) != template_shape:
                raise ValueError("Shape of input CudaNdarrays must agree except for the 'axis' dimension")

        if len(template_shape) != node.outputs[0].type.ndim:
            raise ValueError("Number of dimension of input tensors disagree with dimensions passed at graph creation time.")

        # Final shape is the same as all input tensors except for the
        # "axis" dimension, so we can simply copy the shape of the first.
        final_shape = list(cndas[0].shape)
        final_shape[axis] = width_sum

        # Mark broadcastable dimensions with -1 so zeros_with_pattern
        # allocates them as broadcastable (size 1, stride 0).
        for i, val in enumerate(node.outputs[0].type.broadcastable):
            if val:
                final_shape[i] = -1

        rval = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros_with_pattern(final_shape)

        # Copy each input into its slab of the output: a [:] (copy-all)
        # slice on every dimension except `axis`.
        # BUG FIX: the slice tuple must have one entry per *dimension* of
        # the output, not one per input tensor — the original used
        # range(len(cndas)), which only works by coincidence when the
        # number of inputs equals the number of dimensions.
        ndim = len(final_shape)

        def construct_slices(curpos, curlen):
            slices = [slice(None, None, None) for _ in range(ndim)]
            slices[axis] = slice(curpos, curpos + curlen, None)
            return tuple(slices)

        curpos = 0
        for cnda in cndas:
            curlen = cnda.shape[axis]
            rval.__setitem__(construct_slices(curpos, curlen), cnda)
            curpos += curlen

        out[0] = rval

gpu_join = GpuJoin()
...@@ -247,6 +247,135 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self) ...@@ -247,6 +247,135 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
Py_DECREF(contiguous_self); Py_DECREF(contiguous_self);
return rval; return rval;
} }
// Declared as a static method on CudaNdarray.
// Based on _Copy and _dimshuffle.
//
// Allocate a new contiguous CudaNdarray described by `pattern` (a Python
// sequence of ints) and fill it with zeros.  Each element gives the size
// of one dimension; a value of 1 or any negative value marks the
// dimension as broadcastable (size 1, stride 0).  A value of 0 is
// rejected.  Returns a new reference, or NULL with an exception set.
PyObject* CudaNdarray_ZerosWithPattern(PyObject* dummy, PyObject* pattern)
{
    if(!PySequence_Check(pattern))
    {
        PyErr_SetString(PyExc_TypeError, "pattern argument must be a sequence");
        return NULL;
    }

    int patlen = PySequence_Length(pattern);
    if (patlen == 0)
    {
        PyErr_SetString(PyExc_ValueError,
            "CudaNdarray_ZerosWithPattern: empty pattern");
        return NULL;
    }
    //fprintf(stdout, "Pattern length: %d\n", patlen);

    // One allocation holds both the dims and the strides (strides are
    // currently unused; see the note near the end).
    int* newdims = (int *)malloc(sizeof(int) * 2 * patlen);
    if (!newdims)
    {
        PyErr_SetString(PyExc_MemoryError,
            "CudaNdarray_ZerosWithPattern: Failed to allocate temporary space");
        return NULL;
    }
    int* newstrides = newdims + patlen;

    // Strides are counted in number of floats, not bytes.
    int cur_stride = 1;
    // Start from the last dimension to compute C-contiguous strides.
    for (int i = patlen-1; i >= 0; --i)
    {
        PyObject* pat_el_obj = PySequence_GetItem(pattern, i);
        if(pat_el_obj == NULL)
        {
            // Shouldn't happen since we checked the length before...
            PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: Index out of bound in sequence");
            free(newdims);
            return NULL;
        }

        int pat_el = PyInt_AsLong(pat_el_obj);
        // PySequence_GetItem returns a NEW reference; release it as soon
        // as the integer has been extracted (fixes a per-element leak in
        // the original, which never decref'd it on any path).
        Py_DECREF(pat_el_obj);
        // PyInt_AsLong returns -1 on failure, but -1 is also a legal
        // pattern value (broadcastable dim), so disambiguate with
        // PyErr_Occurred() instead of silently accepting a bad element.
        if (pat_el == -1 && PyErr_Occurred())
        {
            free(newdims);
            return NULL;
        }
        if (pat_el == 0)
        {
            PyErr_SetString(PyExc_ValueError, "CudaNdarray_ZerosWithPattern: pattern must not contain 0 for size of a dimension");
            free(newdims);
            return NULL;
        }

        // Apparently, from looking at alloc_contiguous, stride is set to
        // 0 when dim == 1 — that is how broadcastability is encoded.
        if (pat_el < 0 || pat_el == 1)
        {
            // broadcast
            newdims[i] = 1;
            newstrides[i] = 0;
        }
        else
        {
            newdims[i] = pat_el;
            newstrides[i] = cur_stride;
        }
        cur_stride *= newdims[i];
    }

    // cur_stride now contains the size of the array, in reals.
    int total_size = cur_stride * sizeof(real);

    CudaNdarray* rval = (CudaNdarray*)CudaNdarray_new_null();
    if (!rval)
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: call to new_null failed");
        free(newdims);
        return NULL;
    }

    if (CudaNdarray_alloc_contiguous(rval, patlen, newdims))
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: allocation failed.");
        free(newdims);
        Py_DECREF(rval);
        return NULL;
    }

    // Fill the freshly allocated device memory with zeros.
    //fprintf(stdout, "Sizeof: %d\n", total_size);
    if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
    {
        // Report the byte count actually requested (total_size); the
        // original printed the element count (cur_stride) here.
        fprintf(stderr, "Error memsetting %d bytes of device memory.\n", total_size);
        PyErr_Format(PyExc_MemoryError, "Error memsetting %d bytes of device memory.", total_size);
        free(newdims);
        Py_DECREF(rval);
        return NULL;
    }

    // Changing the strides to account for broadcastability is not
    // necessary: alloc_contiguous already sets stride=0 for dim=1.
    //for (int i = 0; i < patlen; ++i)
    //{
    //    CudaNdarray_set_stride(rval, i, newstrides[i]);
    //}

    if (cnda_copy_structure_to_device(rval))
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZerosWithPattern: syncing structure to device failed");
        free(newdims);
        Py_DECREF(rval);
        return NULL;
    }

    free(newdims);
    return (PyObject*)rval;
}
PyObject * CudaNdarray_Copy(CudaNdarray * self) PyObject * CudaNdarray_Copy(CudaNdarray * self)
{ {
PyObject * rval = CudaNdarray_new_null(); PyObject * rval = CudaNdarray_new_null();
...@@ -578,6 +707,9 @@ static PyMethodDef CudaNdarray_methods[] = ...@@ -578,6 +707,9 @@ static PyMethodDef CudaNdarray_methods[] =
{"__deepcopy__", {"__deepcopy__",
(PyCFunction)CudaNdarray_DeepCopy, METH_O, (PyCFunction)CudaNdarray_DeepCopy, METH_O,
"Create a copy of this object"}, "Create a copy of this object"},
{"zeros_with_pattern",
(PyCFunction)CudaNdarray_ZerosWithPattern, METH_STATIC,
"Create a new CudaNdarray with specified shape and broadcastability, filled with zeros."},
{"copy", {"copy",
(PyCFunction)CudaNdarray_Copy, METH_NOARGS, (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
"Create a copy of this object"}, "Create a copy of this object"},
......
...@@ -493,3 +493,93 @@ def test_hostfromgpu_shape_i(): ...@@ -493,3 +493,93 @@ def test_hostfromgpu_shape_i():
assert isinstance(topo[2].op,T.opt.MakeVector) assert isinstance(topo[2].op,T.opt.MakeVector)
assert tuple(f(cv))==(5,4) assert tuple(f(cv))==(5,4)
# -----------------------------------------------------------------------
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.basic_ops import gpu_join, GpuDimShuffle
def test_gpujoin_twomatrices_joincolumns():
    # Join a 2x2 and a 2x3 matrix along axis 1 and compare against numpy.
    mat_a = numpy.asarray([[1,2],[3,4]], dtype='float32')
    mat_b = numpy.asarray([[5,6,7],[8,9,10]], dtype='float32')
    shared_a = theano.shared(mat_a)
    shared_b = theano.shared(mat_b)
    joined = gpu_join(1, shared_a, shared_b)
    fn = theano.function([], joined)
    expected = numpy.concatenate([mat_a, mat_b], axis=1)
    assert numpy.all(fn() == expected)
def test_gpujoin_twomatrices_badshapes():
    # Joining along axis 0 must fail at runtime: the column counts
    # disagree (2 != 3).
    mat_a = numpy.asarray([[1,2],[3,4]], dtype='float32')
    mat_b = numpy.asarray([[5,6,7],[8,9,10]], dtype='float32')
    shared_a = theano.shared(mat_a)
    shared_b = theano.shared(mat_b)
    joined = gpu_join(0, shared_a, shared_b)
    fn = theano.function([], joined)
    raised = False
    try:
        fn()
    except ValueError:
        raised = True
    assert raised
def test_gpujoin_preserves_broadcasting():
    mat_a = numpy.asarray([[1,2],[3,4]], dtype='float32')
    mat_b = numpy.asarray([[5,6,7],[8,9,10]], dtype='float32')
    shared_a = theano.shared(mat_a)
    shared_b = theano.shared(mat_b)
    # [0,0]: both original dims are non-broadcastable.
    # [1,'x',0]: transpose and insert a new broadcastable middle dim.
    shuffle = GpuDimShuffle([0,0], [1,'x',0])
    joined = gpu_join(0, shuffle(shared_a), shuffle(shared_b))
    assert joined.type.broadcastable == (False, True, False)
    fn = theano.function([], joined)
    result = fn()
    reshaped_a = numpy.asarray([[[1,3]],[[2,4]]], dtype='float32')
    reshaped_b = numpy.asarray([[[5,8]],[[6,9]],[[7,10]]], dtype='float32')
    expected = numpy.concatenate([reshaped_a, reshaped_b], axis=0)
    assert numpy.all(result == expected)
def test_gpujoin_assert_cndas():
    # A float64 shared variable stays on the host as an ndarray, so
    # gpu_join's CudaNdarray-only assertion must fire inside make_node.
    host_mat = numpy.asarray([[1,2],[3,4]], dtype='float64')
    shared_host = theano.shared(host_mat)
    try:
        gpu_join(1, shared_host)
    except AssertionError:
        # This is the assertion error we want from gpu_join itself.
        return
    assert False
# Allow running this test module directly, outside of a test runner.
if __name__ == '__main__':
    test_gpujoin_twomatrices_joincolumns()
    test_gpujoin_assert_cndas()
    test_gpujoin_preserves_broadcasting()
    test_gpujoin_twomatrices_badshapes()
...@@ -2723,9 +2723,17 @@ class Join(Op): ...@@ -2723,9 +2723,17 @@ class Join(Op):
if not tensors: if not tensors:
raise ValueError('Cannot join an empty list of tensors') raise ValueError('Cannot join an empty list of tensors')
as_tensor_variable_args= [as_tensor_variable(x) for x in tensors] as_tensor_variable_args= [as_tensor_variable(x) for x in tensors]
dtypes = [x.type.dtype for x in as_tensor_variable_args] dtypes = [x.type.dtype for x in as_tensor_variable_args]
out_dtype = scal.upcast(*dtypes) out_dtype = scal.upcast(*dtypes)
output_maker = lambda bcastable: tensor(dtype=out_dtype, broadcastable=bcastable)
return self._make_node_internal(axis, tensors,
as_tensor_variable_args, output_maker)
def _make_node_internal(self, axis, tensors,
as_tensor_variable_args, output_maker):
if not all(targs.type.ndim for targs in as_tensor_variable_args): if not all(targs.type.ndim for targs in as_tensor_variable_args):
raise TypeError('Join cannot handle arguments of dimension 0. For joining scalar values, see @stack'); raise TypeError('Join cannot handle arguments of dimension 0. For joining scalar values, see @stack');
...@@ -2757,8 +2765,8 @@ class Join(Op): ...@@ -2757,8 +2765,8 @@ class Join(Op):
if inputs[0].type not in int_types: if inputs[0].type not in int_types:
raise TypeError('Axis could not be cast to an integer type', axis, inputs[0].type, int_types) raise TypeError('Axis could not be cast to an integer type', axis, inputs[0].type, int_types)
outputs = [tensor(dtype = out_dtype, outputs = [output_maker(bcastable)]
broadcastable = bcastable)]
node = Apply(self, inputs, outputs) node = Apply(self, inputs, outputs)
if any(not x.type.broadcastable[0] for x in orig): if any(not x.type.broadcastable[0] for x in orig):
node.tag.shape_zero = None node.tag.shape_zero = None
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论