提交 ff24c985 authored 作者: James Bergstra's avatar James Bergstra

test_elemwise4 passed; modified setting of stride in alloc_contiguous to set stride 0 for dimensions of size 1

test_elemwise4 passed; modified setting of stride in alloc_contiguous to set stride 0 for dimensions of size 1
上级 43f97ea3
......@@ -19,7 +19,7 @@ class HostFromGpu(Op):
def __hash__(self):
return hash(type(self))
def __str__(self):
return '<HostFromGpu@%i>' % id(self)
return 'HostFromGpu'
def make_node(self, x):
if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x)
......@@ -36,7 +36,7 @@ class GpuFromHost(Op):
def __hash__(self):
return hash(type(self))
def __str__(self):
return '<GpuFromHost@%i>' % id(self)
return 'GpuFromHost'
def make_node(self, x):
if not isinstance(x.type, tensor.TensorType):
raise TypeError(x)
......@@ -102,9 +102,21 @@ class GpuElemwise(Op):
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.broadcastable != inputs[0].type.broadcastable:
raise NotImplementedError('different bcastable')
otype = CudaNdarrayType(broadcastable=_inputs[0].broadcastable)
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
# output is broadcastable only along dimensions where all inputs are broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout > 0
return Apply(self, _inputs, [otype() for o in xrange(self.nout)])
def c_support_code(self):
......@@ -274,37 +286,38 @@ class GpuElemwise(Op):
nout = len(outputs)
fail = sub['fail']
opname = str(self.scalar_op)
print >> sio, """
//std::cerr << "C_CODE %(opname)s START\\n";
initial_dims = ','.join('1' for i in xrange(nd))
if 1 or self.scalar_op == scalar.pow:
print >> sio, """
std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks
const int * dims = NULL;
""" %locals()
print >> sio, """
int dims[%(nd)s] = {%(initial_dims)s};
""" %locals()
for iname in inputs:
print >> sio, """
std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != cnda_%(iname)s->nd)
{
PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd);
%(fail)s;
}
""" %locals()
for iname0, iname1 in zip(inputs[1:], inputs[:-1]):
print >> sio, """
//standard elemwise dim checks
for (int i = 0; i< %(nd)s; ++i)
{
if (cnda_%(iname0)s->dim[i] != cnda_%(iname1)s->dim[i])
dims[i] = (dims[i] == 1) ? cnda_%(iname)s->dim[i] : dims[i];
if ((cnda_%(iname)s->dim[i] != 1) && (dims[i] != cnda_%(iname)s->dim[i]))
{
PyErr_SetString(PyExc_TypeError, "need same dimensions");
std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",
i,
cnda_%(iname)s->dim[i],
dims[i]
);
%(fail)s;
}
}
""" %locals()
iname0 = inputs[0]
print >> sio, """
dims = cnda_%(iname0)s->dim;
//unsigned int size = CudaNdarray_SIZE(cnda_%(iname0)s);
//std::cerr << "ADD size " << size << "\\n";
""" %locals()
for oname in outputs:
print >> sio, """
......@@ -329,13 +342,14 @@ class GpuElemwise(Op):
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n";
""" % locals()
print >> sio, """
{
//new block so that failure gotos don't skip over variable initialization
int log2_dims[%(nd)s];
std::cerr << "calling callkernel\\n";
callkernel_%(nodename)s(1, 0, dims, log2_dims
""" % locals()
for iname in inputs:
......@@ -349,6 +363,7 @@ class GpuElemwise(Op):
print >> sio, """
);
std::cerr << "calling callkernel returned\\n";
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -462,6 +477,12 @@ class GpuDimShuffle(Op):
#alloc an output
print >> sio, """
if (cnda_%(res)s)
{
//TODO: re-use previously-allocated stuff
Py_DECREF(cnda_%(res)s);
cnda_%(res)s = NULL;
}
if (NULL == cnda_%(res)s) {
cnda_%(res)s = (CudaNdarray*) CudaNdarray_new_null();
if (NULL == cnda_%(res)s)
......@@ -493,16 +514,23 @@ class GpuDimShuffle(Op):
#reassign the dimension and strides in the host pointers
for i, o in enumerate(self.new_order):
if o == 'x':
assert node.outputs[0].type.broadcastable[i]
print >> sio, """
cnda_%(res)s->dim[%(i)s] = 1;
cnda_%(res)s->str[%(i)s] = 0;
""" %locals()
else:
assert not node.outputs[0].type.broadcastable[i]
print >> sio, """
cnda_%(res)s->dim[%(i)s] = cnda_%(input)s->dim[%(o)s];
cnda_%(res)s->str[%(i)s] = cnda_%(input)s->str[%(o)s];
""" %locals()
for i, o in enumerate(self.new_order):
print >> sio, """
std::cerr << "GpuDimShuffle " << cnda_%(res)s << " str[%(i)s] = " << cnda_%(res)s->str[%(i)s] << "\\n";
""" %locals()
# copy the host dims and stride -> device
print >> sio, """
if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
......
......@@ -87,9 +87,34 @@ def test_elemwise3():
shape = (3,4,5,6)
a = tcn.shared_constructor(numpy.random.rand(*shape), 'a')
b = tensor.dvector()
b = tensor.fvector()
print b.type
print tensor.constant(1).type
print (1 + b).type
print (1 + b**a).type
print tensor.exp((1 + b**a)).type
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 +
b**a).dimshuffle([2,0,3,1]))])
has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()):
print >> sys.stderr, i, node
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
assert not has_elemwise
#let debugmode catch errors
f(numpy.random.rand(6))
def test_elemwise4():
    """Test that two vectors can be broadcast against each other to form an
    outer product (a rank-1 update applied to the shared matrix `a`).

    Verifies that after optimization no CPU `tensor.Elemwise` node remains
    in the compiled graph (i.e. the elemwise work was moved to the GPU),
    then runs the function so debugmode can catch runtime errors.
    """
    shape = (3, 4)
    a = tcn.shared_constructor(numpy.random.rand(*shape), 'a')
    b = tensor.fvector()
    c = tensor.fvector()
    # b is broadcast along rows (length 4), c along columns (length 3);
    # their product is the 3x4 rank-1 outer product added to `a`.
    # BUG FIX: the original wrote `x.dimshuffle(0, 'x')` -- `x` is undefined
    # and `c` was never used; the second factor must be `c`.
    f = pfunc([b, c], [],
              updates=[(a, (a + b.dimshuffle('x', 0) * c.dimshuffle(0, 'x')))])
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stderr, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    # let debugmode catch errors; argument lengths match b (4) and c (3)
    f(numpy.random.rand(4), numpy.random.rand(3))
import sys, os
import sys, os, StringIO
import numpy
from theano import Op, Type, Apply, Variable, Constant
......@@ -130,10 +130,45 @@ class CudaNdarrayType(Type):
return "cnda_%(name)s = NULL;" % locals()
def c_extract(self, name, sub):
return """
sio = StringIO.StringIO()
fail = sub['fail']
nd = self.ndim
print >> sio, """
if (CudaNdarray_Check(py_%(name)s))
{
cnda_%(name)s = (CudaNdarray*)py_%(name)s;
std::cerr << "c_extract " << cnda_%(name)s << '\\n';
if (cnda_%(name)s->nd != %(nd)s)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", cnda_%(name)s->nd);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << " nd check passed\\n";
""" %locals()
for i, b in enumerate(self.broadcastable):
if b:
print >> sio, """
if (cnda_%(name)s->dim[%(i)s] != 1)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", cnda_%(name)s->dim[%(i)s], %(i)s);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << "dim check %(i)s passed\\n";
std::cerr << "c_extract " << cnda_%(name)s << "checking bcast %(i)s <" << cnda_%(name)s->str<< ">\\n";
std::cerr << "c_extract " << cnda_%(name)s->str[%(i)s] << "\\n";
if (cnda_%(name)s->str[%(i)s])
{
std::cerr << "c_extract bad stride detected...\\n";
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", cnda_%(name)s->str[%(i)s], %(i)s);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << "bcast check %(i)s passed\\n";
""" %locals()
print >> sio, """
assert(cnda_%(name)s);
Py_INCREF(py_%(name)s);
}
else
......@@ -142,12 +177,19 @@ class CudaNdarrayType(Type):
cnda_%(name)s = NULL;
%(fail)s;
}
""" % dict(sub, name = name, type_num = self.dtype_specs()[2])
std::cerr << "c_extract done " << cnda_%(name)s << '\\n';
""" % locals()
#print sio.getvalue()
return sio.getvalue()
def c_cleanup(self, name, sub):
return """
//std::cerr << "cleanup " << py_%(name)s << "\\n";
Py_XDECREF(py_%(name)s);
std::cerr << "cleanup " << py_%(name)s << " " << cnda_%(name)s << "\\n";
if (cnda_%(name)s)
{
Py_XDECREF(cnda_%(name)s);
}
std::cerr << "cleanup done" << py_%(name)s << "\\n";
""" % locals()
def c_sync(self, name, sub):
......@@ -194,12 +236,6 @@ class CudaNdarrayType(Type):
def c_code_cache_version(self):
return () #do not cache this stuff until it matures
def c_compiler(self): return nvcc_module_compile_str
def c_compiler(self):
return nvcc_module_compile_str
......@@ -49,10 +49,14 @@ class CudaNdarraySharedVariable(SharedVariable, _operators):
if hasattr(other, '_as_CudaNdarrayVariable'):
return other._as_CudaNdarrayVariable()
if isinstance(other.type, tensor.TensorType) and (other.type.dtype == self.dtype) and (other.broadcastable == self.broadcastable):
return GpuFromHost()(other)
else:
raise TypeError((other, other.type))
if not isinstance(other.type, tensor.TensorType):
raise TypeError('Incompatible type', other.type)
if (other.type.dtype != self.dtype):
raise TypeError('Incompatible dtype', (self.dtype, other.type.dtype))
if (other.type.broadcastable != self.broadcastable):
raise TypeError('Incompatible broadcastable', (self.broadcastable, other.type.broadcastable))
return GpuFromHost()(other)
CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def shared_constructor(value, name, strict=False):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论