提交 aa278955 authored 作者: James Bergstra's avatar James Bergstra

test_elemwise1 passes, didn't run in debugmode

上级 56d152c8
......@@ -4,3 +4,6 @@ from .var import (CudaNdarrayVariable,
CudaNdarrayConstant,
CudaNdarraySharedVariable,
shared_constructor)
import basic_ops
import opt
......@@ -18,6 +18,8 @@ class HostFromGpu(Op):
return type(self) == type(other)
def __hash__(self):
    # All HostFromGpu instances are interchangeable (see matching __eq__
    # above, which compares only the types), so hash on the type alone.
    return hash(type(self))
def __str__(self):
    # Debug-friendly label; includes id() so distinct Op instances can be
    # told apart in printed graphs even though they compare equal.
    return '<HostFromGpu@%i>' % id(self)
def make_node(self, x):
if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x)
......@@ -32,6 +34,8 @@ class GpuFromHost(Op):
return type(self) == type(other)
def __hash__(self):
    # Consistent with __eq__ above: equality is by type only, so the hash
    # must also depend on the type only.
    return hash(type(self))
def __str__(self):
    # Debug-friendly label; id() distinguishes instances in printed graphs.
    return '<GpuFromHost@%i>' % id(self)
def make_node(self, x):
if not isinstance(x.type, tensor.TensorType):
raise TypeError(x)
......@@ -41,26 +45,67 @@ class GpuFromHost(Op):
def grad(self, inputs, (gz,)):
    # Gradient of a host->GPU transfer is the reverse transfer: bring the
    # output gradient gz back from the GPU to the host.
    # (Python 2 tuple-unpacking parameter syntax.)
    return [HostFromGpu()(gz)]
class GpuAdd(Op):
class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern):
    """Elementwise Op over CudaNdarrays.

    :param scalar_op: scalar Op applied elementwise; supplies nin/nout
        (see the nin/nout properties above) and the python impl.
    :param inplace_pattern: dict mapping output index -> input index for
        outputs that overwrite an input; used to build destroy_map.
    """
    self.scalar_op = scalar_op
    self.inplace_pattern = inplace_pattern
    # destroy_map tells the graph machinery which input each in-place
    # output destroys.
    self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
    if scalar_op.nin > 0:
        # Python-side fallback implementation built from the scalar impl.
        self.ufunc = numpy.frompyfunc(scalar_op.impl, scalar_op.nin, scalar_op.nout)
    else:
        # Variadic scalar op (nin == -1): no fixed-arity ufunc can be built.
        self.ufunc = None
    # Precompute the hash value (cached in self._hashval).
    self._rehash()
def __getstate__(self):
    """Return a picklable state dict.

    Drops the unpicklable frompyfunc object and the cached hash; both are
    rebuilt by __setstate__.
    """
    d = copy(self.__dict__)
    d.pop('ufunc')
    # May or may not be present (added by epydoc introspection).
    d.pop('__epydoc_asRoutine', None)
    d.pop('_hashval')
    return d
def __setstate__(self, d):
    """Restore pickled state and rebuild what __getstate__ dropped."""
    self.__dict__.update(d)
    if self.scalar_op.nin > 0:
        # Rebuild the python-side ufunc from the scalar impl.
        self.ufunc = numpy.frompyfunc(self.scalar_op.impl, self.scalar_op.nin, self.scalar_op.nout)
    else:
        self.ufunc = None
    # Recompute the cached hash value.
    self._rehash()
def __eq__(self, other):
self.scalar_op = scalar.add
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, a, b):
_a = as_cuda_ndarray_variable(a)
_b = as_cuda_ndarray_variable(b)
if _a.type.broadcastable != _b.type.broadcastable:
raise NotImplementedError('different bcastable')
return Apply(self, [_a,_b], [CudaNdarrayType(broadcastable=_a.broadcastable)()])
def perform(self, node, (a,b), (z,)):
aval = numpy.asarray(a, dtype='float32')
bval = numpy.asarray(b, dtype='float32')
z[0] = type_support_filter(aval + bval, (0,)*len(zval.shape), 0)
return type(self) == type(other) and (self.scalar_op == other.scalar_op)
def grad(self, inputs, (gz,)):
return [gz for i in inputs]
def _rehash(self):
    """Compute and cache the hash in self._hashval.

    The hash mixes the scalar op with a canonicalized (sorted) form of
    inplace_pattern, with list values converted to tuples so they are
    hashable.
    """
    items = self.inplace_pattern.items()
    items.sort()
    tuple_items = tuple([k for k,v in items] + [(tuple(v) if isinstance(v, (tuple, list)) else v) for k,v in items])
    h = hash('Elemwise') ^ hash(self.scalar_op) ^ hash(tuple_items)
    # If a hash was already cached, recomputation must agree with it
    # (guards against state changing after the hash was first taken).
    assert h == getattr(self,'_hashval', h)
    self._hashval = h
def __hash__(self):
    # Return the value precomputed by _rehash() in __init__/__setstate__.
    return self._hashval
def __str__(self):
    # e.g. "GpuElemwise{add}[(0, 0)]" when in-place, "GpuElemwise{add}"
    # otherwise; the inplace pattern is shown sorted for determinism.
    if self.inplace_pattern:
        items = self.inplace_pattern.items()
        items.sort()
        return "GpuElemwise{%s}%s" % (self.scalar_op, str(items))
    else:
        return "GpuElemwise{%s}" % (self.scalar_op)
def make_node(self, *inputs):
    """Build an Apply node applying this elemwise op to *inputs.

    All inputs are coerced to CudaNdarray variables, must match the
    declared arity (when the scalar op has fixed nin), and must all share
    the same broadcastable pattern — broadcasting between differently
    shaped inputs is not implemented here.
    """
    _inputs = [as_cuda_ndarray_variable(i) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
    for i in _inputs[1:]:
        # NOTE(review): compares against inputs[0] (pre-coercion) rather
        # than _inputs[0]; presumably equivalent after coercion — confirm.
        if i.type.broadcastable != inputs[0].type.broadcastable:
            raise NotImplementedError('different bcastable')
    # All outputs take the first input's broadcastable pattern.
    otype = CudaNdarrayType(broadcastable=_inputs[0].broadcastable)
    assert self.nout > 0
    return Apply(self, _inputs, [otype() for o in xrange(self.nout)])
def c_support_code(self):
return """
#define INTDIV_POW2(a, b) (a >> b)
......@@ -108,6 +153,7 @@ class GpuAdd(Op):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references
if d == 0:
print >> sio, " ", self.scalar_op.c_code(None, None,
['ii_i%i_data[0]'%ipos for ipos, i in enumerate(node.inputs)],
['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)],
......@@ -121,7 +167,8 @@ class GpuAdd(Op):
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}"
print sio.getvalue()
if 0:
print sio.getvalue()
return sio.getvalue()
def c_support_code_apply(self, node, nodename):
......@@ -131,18 +178,44 @@ class GpuAdd(Op):
nd = node.outputs[0].type.ndim
d = dict()
assert nd == 2
kernel_call_args = ("numEls, log2_dims[0], log2_dims[1]"
", a_str[0], a_str[1], a_data"
", b_str[0], b_str[1], b_data"
", z_str[0], z_str[1], z_data")
#input_params and output_params go into the function declaration/definition
input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
for ipos in xrange(len(node.inputs)))
output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
#input_args and output_args go into the recursive call.
input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos)
for ipos in xrange(len(node.inputs)))
output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
# kernel_call_args are used to invoke the cuda kernel
kernel_call_args = ["numEls, log2_dims[0], log2_dims[1]"]
for ipos in xrange(len(node.inputs)):
strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd))
kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)):
strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd))
kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
kernel_call_args = ",".join(kernel_call_args)
# the data_pointer_increments are inserted after each recursive call
data_ptr_inc = []
for ipos in xrange(len(node.inputs)):
data_ptr_inc.append("i%i_data += (1<< log2_dim) * i%i_str[d]" %(ipos, ipos))
for ipos in xrange(len(node.outputs)):
data_ptr_inc.append("o%i_data += (1<< log2_dim) * o%i_str[d]" %(ipos, ipos))
data_ptr_inc = ";\n".join(data_ptr_inc)
d.update(locals())
return """
static void callkernel_%(nodename)s(const unsigned int numEls, const int d,
const int * dims, int * log2_dims,
const float * a_data, const int * a_str,
const float * b_data, const int * b_str,
float * z_data, const int * z_str)
const int * dims, int * log2_dims,
%(input_params)s,
%(output_params)s)
{
if (d == %(nd)s)
{
......@@ -150,40 +223,39 @@ class GpuAdd(Op):
//a ceil would be better here
int n_blocks = std::min(numEls/threads_per_block + 1, (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(nodename)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
std::cerr << "ADDCALL a str" << a_str[0] << " "<< a_str[1] << "\\n";
std::cerr << "ADDCALL a data" << a_data << "\\n";
std::cerr << "ADDCALL b str" << b_str[0] << " "<< b_str[1] << "\\n";
std::cerr << "ADDCALL b data" << b_data << "\\n";
std::cerr << "ADDCALL z str" << z_str[0] << " "<< z_str[1] << "\\n";
std::cerr << "ADDCALL z data" << z_data << "\\n";
//std::cerr << "ADDCALL a str" << i0_str[0] << " "<< i0_str[1] << "\\n";
//std::cerr << "ADDCALL a data" << i0_data << "\\n";
//std::cerr << "ADDCALL b str" << i1_str[0] << " "<< i1_str[1] << "\\n";
//std::cerr << "ADDCALL b data" << i1_data << "\\n";
//std::cerr << "ADDCALL z str" << o0_str[0] << " "<< o0_str[1] << "\\n";
//std::cerr << "ADDCALL z data" << o0_data << "\\n";
}
else
{
std::cerr << "_ADDCALL d " << d << "\\n";
//std::cerr << "_ADDCALL d " << d << "\\n";
unsigned int dim_d = dims[d];
std::cerr << "_ADDCALL dim_d " << dim_d << "\\n";
//std::cerr << "_ADDCALL dim_d " << dim_d << "\\n";
int log2_dim = 0;
while(dim_d)
{
std::cerr << "___ADDCALL d " << d << " " << dim_d << "\\n";
if (dim_d&1)
{
log2_dims[d] = log2_dim;
std::cerr << "___ADDCALL a str" << a_str[0] << " "<< a_str[1] << "\\n";
std::cerr << "___ADDCALL a data" << a_data << "\\n";
std::cerr << "___ADDCALL b str" << b_str[0] << " "<< b_str[1] << "\\n";
std::cerr << "___ADDCALL b data" << b_data << "\\n";
std::cerr << "___ADDCALL z str" << z_str[0] << " "<< z_str[1] << "\\n";
std::cerr << "___ADDCALL z data" << z_data << "\\n";
callkernel_%(nodename)s(numEls * (1<<log2_dim), d+1,
dims, log2_dims,
a_data, a_str,
b_data, b_str,
z_data, z_str);
a_data += (1 << log2_dim) * a_str[d];
b_data += (1 << log2_dim) * b_str[d];
z_data += (1 << log2_dim) * z_str[d];
//std::cerr << "___ADDCALL d " << d << " " << dim_d << "\\n";
if (dim_d&1)
{
log2_dims[d] = log2_dim;
//std::cerr << "___ADDCALL a str" << i0_str[0] << " "<< i0_str[1] << "\\n";
//std::cerr << "___ADDCALL a data" << i0_data << "\\n";
//std::cerr << "___ADDCALL b str" << i1_str[0] << " "<< i1_str[1] << "\\n";
//std::cerr << "___ADDCALL b data" << i1_data << "\\n";
//std::cerr << "___ADDCALL z str" << o0_str[0] << " "<< o0_str[1] << "\\n";
//std::cerr << "___ADDCALL z data" << o0_data << "\\n";
callkernel_%(nodename)s(numEls * (1<<log2_dim), d+1, dims, log2_dims,
%(input_args)s,
%(output_args)s);
%(data_ptr_inc)s;
//i0_data += (1 << log2_dim) * i0_str[d];
//i1_data += (1 << log2_dim) * i1_str[d];
//o0_data += (1 << log2_dim) * o0_str[d];
}
log2_dim += 1;
dim_d >>= 1;
......@@ -192,72 +264,122 @@ std::cerr << "___ADDCALL z data" << z_data << "\\n";
}
""" %d
def c_code(self, node, nodename, (a,b), (z,), sub):
def c_code(self, node, nodename, inputs, outputs, sub):
d = dict(sub)
nd = node.outputs[0].type.ndim
d.update(locals())
return """
std::cerr << "ADD start\\n";
sio = StringIO.StringIO()
nin = len(inputs)
nout = len(outputs)
fail = sub['fail']
opname = str(self.scalar_op)
print >> sio, """
std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks
if (cnda_%(a)s->nd != cnda_%(b)s->nd)
const int * dims = NULL;
""" %locals()
for iname in inputs:
print >> sio, """
if (%(nd)s != cnda_%(iname)s->nd)
{
PyErr_SetString(PyExc_TypeError, "need same number of dims");
return NULL;
PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd);
%(fail)s;
}
""" %locals()
for iname0, iname1 in zip(inputs[1:], inputs[:-1]):
print >> sio, """
//standard elemwise dim checks
unsigned int size = 1;
for (int i = 0; i< cnda_%(a)s->nd; ++i)
for (int i = 0; i< %(nd)s; ++i)
{
if (cnda_%(a)s->dim[i] != cnda_%(b)s->dim[i])
if (cnda_%(iname0)s->dim[i] != cnda_%(iname1)s->dim[i])
{
PyErr_SetString(PyExc_TypeError, "need same dimensions");
return NULL;
%(fail)s;
}
size *= (unsigned int) cnda_%(a)s->dim[i];
}
std::cerr << "ADD size " << size << "\\n";
if (cnda_%(z)s){
""" %locals()
iname0 = inputs[0]
print >> sio, """
dims = cnda_%(iname0)s->dim;
//unsigned int size = CudaNdarray_SIZE(cnda_%(iname0)s);
//std::cerr << "ADD size " << size << "\\n";
""" %locals()
for oname in outputs:
print >> sio, """
if (cnda_%(oname)s){
//TODO: check if we can maybe use existing storage
Py_XDECREF(cnda_%(z)s);
cnda_%(z)s = NULL;
std::cerr << "ADD decref z \\n";
Py_XDECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
}
if (NULL == cnda_%(z)s)
if (NULL == cnda_%(oname)s)
{
cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
if (!cnda_%(z)s)
{
cnda_%(oname)s = (CudaNdarray*)CudaNdarray_new_null();
if (!cnda_%(oname)s)
{
//error string already set
%(fail)s;
}
if (CudaNdarray_alloc_contiguous(cnda_%(z)s, cnda_%(a)s->nd, cnda_%(a)s->dim))
if (CudaNdarray_alloc_contiguous(cnda_%(oname)s, %(nd)s, dims))
{
Py_XDECREF(cnda_%(z)s);
cnda_%(z)s = NULL;
//error string already set
Py_XDECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
%(fail)s;
}
}
std::cerr << "ADD z nd" << cnda_%(z)s->nd << "\\n";
std::cerr << "ADD z str" << cnda_%(z)s->str[0] << " "<< cnda_%(z)s->str[1] << "\\n";
std::cerr << "ADD z data" << cnda_%(z)s->devdata << "\\n";
{ //new block so that failure gotos don't skip over variable initialization
std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n";
""" % locals()
print >> sio, """
{
//new block so that failure gotos don't skip over variable initialization
int log2_dims[%(nd)s];
callkernel_%(nodename)s(1, 0, CudaNdarray_DIMS(cnda_%(z)s), log2_dims,
CudaNdarray_DEV_DATA(cnda_%(a)s), CudaNdarray_STRIDES(cnda_%(a)s),
CudaNdarray_DEV_DATA(cnda_%(b)s), CudaNdarray_STRIDES(cnda_%(b)s),
CudaNdarray_DEV_DATA(cnda_%(z)s), CudaNdarray_STRIDES(cnda_%(z)s));
callkernel_%(nodename)s(1, 0, dims, log2_dims
""" % locals()
for iname in inputs:
print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_STRIDES(cnda_%(iname)s)
""" % locals()
for oname in outputs:
print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_STRIDES(cnda_%(oname)s)
""" % locals()
print >> sio, """
);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kExp", cudaGetErrorString(err));
Py_XDECREF(cnda_%(z)s);
cnda_%(z)s = NULL;
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "Elemwise %(nodename)s", cudaGetErrorString(err));
""" % locals()
for oname in outputs:
print >> sio, """
Py_XDECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
""" % locals()
print >> sio, """
%(fail)s;
}
}
""" % d
std::cerr << "C_CODE %(opname)s END\\n";
""" % locals()
return sio.getvalue()
def c_code_cache_version(self):
return ()
# Disabled dead code: an earlier GpuAdd specialization kept for reference
# after the generalization to GpuElemwise.
# NOTE(review): if ever re-enabled, the super().__init__ call below passes
# only scalar.add, but GpuElemwise.__init__ also requires an
# inplace_pattern argument — this would raise a TypeError as written.
if 0:
    class GpuAdd(GpuElemwise):
        def __init__(self):
            super(GpuAdd, self).__init__(scalar.add)
        def perform(self, node, args, (z,)):
            # Python-side fallback: sum all inputs on the host, then wrap
            # the result back into GPU storage via type_support_filter.
            print "GpuAdd perform"
            zval = numpy.asarray(args[0])
            for a in args[1:]:
                zval += numpy.asarray(a)
            z[0] = type_support_filter(zval, (0,)*len(zval.shape), 0)
    gpu_add = GpuAdd()
import sys
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import tensor
......@@ -11,7 +12,7 @@ def test_elemwise0():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
b = tensor.dmatrix()
b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, a+b)])
......@@ -27,11 +28,27 @@ def test_elemwise1():
""" Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """
shape = (3,4)
a = tcn.shared_constructor(numpy.random.rand(*shape), 'a')
b = tensor.dmatrix()
f = pfunc([b], [], updates=[(a, a+b * tensor.exp(b**a))])
a = tcn.shared_constructor(numpy.random.rand(*shape)+0.5, 'a')
b = tensor.fmatrix()
#let debugmode catch any mistakes
f(numpy.ones(shape))
print >> sys.stderr, "STARTING FUNCTION 1"
f = pfunc([b], [], updates=[(a, b**a)])
for i, node in enumerate(f.maker.env.toposort()):
print i, node
f(numpy.random.rand(*shape)+0.3)
print >> sys.stderr, "STARTING FUNCTION 2"
#let debugmode catch any mistakes
f = pfunc([b], [], updates=[(a, tensor.exp(b**a))])
for i, node in enumerate(f.maker.env.toposort()):
print i, node
f(numpy.random.rand(*shape)+0.3)
print >> sys.stderr, "STARTING FUNCTION 3"
#let debugmode catch any mistakes
f = pfunc([b], [], updates=[(a, a+b * tensor.exp(b**a))])
f(numpy.random.rand(*shape)+0.3)
def test_elemwise2():
""" Several kinds of elemwise expressions with dimension permutations """
......@@ -41,6 +58,11 @@ def test_elemwise2():
b = tensor.Tensor(dtype='float32', broadcastable=[0]*len(shape))()
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) *
tensor.exp(b**a).dimshuffle([2,0,3,1]))])
has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()):
print i, node
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
assert not has_elemwise
#let debugmode catch errors
f(numpy.ones(shape))
......@@ -54,3 +76,4 @@ def test_elemwise3():
b**a).dimshuffle([2,0,3,1]))])
#let debugmode catch errors
f(numpy.ones(6))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论