Commit a317f101 authored by James Bergstra

upgraded to use CudaNdarray_HOST_DIMS and family

Parent 71338c0b
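Editor's note: this commit is a mechanical API migration across the CUDA backend. Every direct read of the CudaNdarray struct's host-side fields (cnda->dim[i], cnda->str[i]) becomes a call to the CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES accessors, and direct writes go through CudaNdarray_set_dim / CudaNdarray_set_stride / CudaNdarray_set_nd, so the struct layout can later change without touching every call site. A minimal C++ sketch of the pattern, assuming a simplified stand-in layout (the real CudaNdarray differs):

    #include <cstdio>

    struct CudaNdarray             // simplified stand-in, not the real layout
    {
        int nd;                    // number of dimensions
        int host_dims[8];          // host-side mirror of the device dims
        int host_strides[8];       // host-side mirror of the device strides
    };

    // Accessors in the spirit of CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES.
    static inline const int* CudaNdarray_HOST_DIMS(const CudaNdarray* a)    { return a->host_dims; }
    static inline const int* CudaNdarray_HOST_STRIDES(const CudaNdarray* a) { return a->host_strides; }

    int main()
    {
        CudaNdarray a = {2, {3, 4}, {4, 1}};
        // before this commit:  a.dim[0]   (direct field access)
        // after this commit:   CudaNdarray_HOST_DIMS(&a)[0]
        std::printf("dim[0]=%d stride[1]=%d\n",
                    CudaNdarray_HOST_DIMS(&a)[0],
                    CudaNdarray_HOST_STRIDES(&a)[1]);
        return 0;
    }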
@@ -329,13 +329,13 @@ class GpuElemwise(Op):
         }
         for (int i = 0; i< %(nd)s; ++i)
         {
-            dims[i] = (dims[i] == 1) ? cnda_%(iname)s->dim[i] : dims[i];
-            if ((cnda_%(iname)s->dim[i] != 1) && (dims[i] != cnda_%(iname)s->dim[i]))
+            dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] : dims[i];
+            if ((CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] != 1) && (dims[i] != CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i]))
             {
                 //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                 PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",
                         i,
-                        cnda_%(iname)s->dim[i],
+                        CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i],
                         dims[i]
                         );
                 %(fail)s;
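Aside: independent of the accessor change, the loop in this hunk is the elementwise broadcasting shape merge: a size-1 input dimension adopts whatever size the output has accumulated so far, and any other mismatch is rejected. A standalone sketch of that rule (illustrative names, not Theano's):

    #include <cstdio>

    // Merge one input's dims into the running output dims.
    // Returns 0 on success, -1 on an incompatible (non-broadcastable) shape.
    static int merge_dims(int* dims, const int* input_dims, int nd)
    {
        for (int i = 0; i < nd; ++i)
        {
            if (dims[i] == 1) dims[i] = input_dims[i];   // adopt the input's size
            if (input_dims[i] != 1 && dims[i] != input_dims[i])
                return -1;                               // mismatch: error out
        }
        return 0;
    }

    int main()
    {
        int dims[3] = {1, 4, 1};                         // output dims so far
        int in0[3]  = {5, 4, 1};
        int in1[3]  = {5, 1, 7};
        if (merge_dims(dims, in0, 3) || merge_dims(dims, in1, 3)) return 1;
        std::printf("%d %d %d\n", dims[0], dims[1], dims[2]);  // prints: 5 4 7
        return 0;
    }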
@@ -378,11 +378,11 @@ class GpuElemwise(Op):
         """ % locals()
         for iname in inputs:
             print >> sio, """
-                        , CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_STRIDES(cnda_%(iname)s)
+                        , CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_HOST_STRIDES(cnda_%(iname)s)
         """ % locals()
         for oname in outputs:
             print >> sio, """
-                        , CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_STRIDES(cnda_%(oname)s)
+                        , CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_HOST_STRIDES(cnda_%(oname)s)
         """ % locals()
         print >> sio, """
                         );
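Aside: each tensor reaches the generated kernel as a (device data pointer, host-side strides) pair; explicit per-tensor strides are what let a single elementwise kernel walk transposed layouts and stride-0 broadcast dimensions without any copy. A host-side C++ sketch of that addressing scheme for a hypothetical 2-D case (not the generated kernel itself):

    #include <cstdio>

    // Illustrative 2-D "kernel body": out[i][j] = a[i][j] + b[i][j],
    // with each array addressed through its own strides (in elements).
    static void add2d(int d0, int d1,
                      const float* a, const int* sa,
                      const float* b, const int* sb,
                      float* o, const int* so)
    {
        for (int i = 0; i < d0; ++i)
            for (int j = 0; j < d1; ++j)
                o[i*so[0] + j*so[1]] = a[i*sa[0] + j*sa[1]] + b[i*sb[0] + j*sb[1]];
    }

    int main()
    {
        float a[6] = {0, 1, 2, 3, 4, 5};      // 2x3, C-contiguous: strides {3,1}
        float b[3] = {10, 20, 30};            // 1x3 broadcast row: strides {0,1}
        float o[6];
        int sa[2] = {3, 1}, sb[2] = {0, 1}, so[2] = {3, 1};
        add2d(2, 3, a, sa, b, sb, o, so);
        std::printf("%g %g\n", o[0], o[5]);   // prints: 10 35
        return 0;
    }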
@@ -508,21 +508,20 @@ class GpuDimShuffle(Op):
         {
             if (cnda_%(res)s)
             {
-                Py_DECREF(cnda_%(res)s);
-                cnda_%(res)s = NULL;
-            }
-            cnda_%(res)s = (CudaNdarray*) CudaNdarray_new_null();
-            if (NULL == cnda_%(res)s)
-            {
-                PyErr_SetString(PyExc_MemoryError, "Failed to allocate result");
-                %(fail)s;
-            }
-            if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s))
-            {
-                // err message set
-                Py_DECREF(cnda_%(res)s);
-                cnda_%(res)s = NULL;
-                %(fail)s;
+                if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s))
+                {
+                    Py_DECREF(cnda_%(res)s);
+                    cnda_%(res)s = NULL;
+                    %(fail)s;
+                }
+            }
+            else
+            {
+                cnda_%(res)s = (CudaNdarray*) CudaNdarray_New(%(nd_out)s);
+                if (NULL == cnda_%(res)s)
+                {
+                    %(fail)s;
+                }
             }
         }
         """ %locals()
@@ -542,14 +541,14 @@ class GpuDimShuffle(Op):
             if o == 'x':
                 assert node.outputs[0].type.broadcastable[i]
                 print >> sio, """
-        cnda_%(res)s->dim[%(i)s] = 1;
-        cnda_%(res)s->str[%(i)s] = 0;
+        CudaNdarray_set_dim(cnda_%(res)s, %(i)s, 1);
+        CudaNdarray_set_stride(cnda_%(res)s, %(i)s, 0);
                 """ %locals()
             else:
                 assert not node.outputs[0].type.broadcastable[i]
                 print >> sio, """
-        cnda_%(res)s->dim[%(i)s] = cnda_%(input)s->dim[%(o)s];
-        cnda_%(res)s->str[%(i)s] = cnda_%(input)s->str[%(o)s];
+        CudaNdarray_set_dim(cnda_%(res)s, %(i)s, CudaNdarray_HOST_DIMS(cnda_%(input)s)[%(o)s]);
+        CudaNdarray_set_stride(cnda_%(res)s, %(i)s, CudaNdarray_HOST_STRIDES(cnda_%(input)s)[%(o)s]);
                 """ %locals()
         for i, o in enumerate(self.new_order):
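Aside: those two branches are the entirety of dimshuffle's metadata work: a new broadcastable axis ('x') gets dim 1 and stride 0, while a kept axis copies the input's dim and stride, so the result is a zero-copy view of the same device memory. A sketch, assuming -1 encodes 'x' in the new order:

    #include <cstdio>

    // new_order entries: -1 means 'x' (new broadcast axis),
    // otherwise the index of the input axis to keep.
    static void dimshuffle_view(const int* in_dims, const int* in_str,
                                const int* new_order, int nd_out,
                                int* out_dims, int* out_str)
    {
        for (int i = 0; i < nd_out; ++i)
        {
            if (new_order[i] < 0) { out_dims[i] = 1; out_str[i] = 0; }
            else
            {
                out_dims[i] = in_dims[new_order[i]];
                out_str[i]  = in_str[new_order[i]];
            }
        }
    }

    int main()
    {
        int in_dims[2] = {4, 5}, in_str[2] = {5, 1};  // a 4x5 C-contiguous array
        int order[3] = {1, -1, 0};                    // i.e. new_order = (1, 'x', 0)
        int od[3], os[3];
        dimshuffle_view(in_dims, in_str, order, 3, od, os);
        std::printf("dims=(%d,%d,%d) str=(%d,%d,%d)\n",
                    od[0], od[1], od[2], os[0], os[1], os[2]);
        // prints: dims=(5,1,4) str=(1,0,5)
        return 0;
    }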
@@ -558,17 +557,18 @@ class GpuDimShuffle(Op):
         """ %locals()

         # copy the host dims and stride -> device
-        print >> sio, """
-        if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
-        {
-            //err msg set
-            Py_DECREF(cnda_%(res)s);
-            cnda_%(res)s = NULL;
-            %(fail)s;
-        }
-        """ %locals()
-        if 0:
+        if 0:
+            print >> sio, """
+            if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
+            {
+                //err msg set
+                Py_DECREF(cnda_%(res)s);
+                cnda_%(res)s = NULL;
+                %(fail)s;
+            }
+            """ %locals()
+        if 1:
             print '--------------------------------------'
             print 'C_CODE'
             print ''
...
@@ -39,13 +39,13 @@ class GpuDot22(Op):
             %(fail)s;
         }
         if ((NULL == cnda_%(z)s)
-            || (cnda_%(z)s->dim[0] != cnda_%(x)s->dim[0])
-            || (cnda_%(z)s->dim[1] != cnda_%(y)s->dim[1]))
+            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
+            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(y)s)[1]))
         {
             if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
             npy_intp dims[2];
-            dims[0] = cnda_%(x)s->dim[0];
-            dims[1] = cnda_%(y)s->dim[1];
+            dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
+            dims[1] = CudaNdarray_HOST_DIMS(cnda_%(y)s)[1];
             cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
             if ((NULL == cnda_%(z)s) || CudaNdarray_alloc_contiguous(cnda_%(z)s, 2, dims))
             {
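Aside: this guard is GpuDot22's output check: for z = dot(x, y) with x of shape (m, k) and y of shape (k, n), an existing z is reusable only if it is (m, n); otherwise it is released and a contiguous (m, n) buffer is allocated. A compact sketch of just the shape predicate (hypothetical helper name, not Theano's):

    #include <cstdio>

    // z may be reused iff it exists and matches (rows of x, cols of y).
    static bool z_shape_ok(const int* zd, const int* xd, const int* yd)
    {
        return zd && zd[0] == xd[0] && zd[1] == yd[1];
    }

    int main()
    {
        int x[2] = {16, 64}, y[2] = {64, 10};
        int z[2] = {16, 10}, bad[2] = {16, 64};
        std::printf("%d %d\n", (int)z_shape_ok(z, x, y),
                              (int)z_shape_ok(bad, x, y));  // prints: 1 0
        return 0;
    }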
...
@@ -120,3 +120,66 @@ def test_conv_nnet1():
     rval_gpu = run_conv_nnet1(tcn.shared_constructor)
     assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
+
+
+def run_conv_nnet2(shared_fn):
+    n_batch = 16
+    shape_img = (n_batch, 1, 32, 32)
+    n_kern = 20
+    shape_kern = (n_kern, 1, 5, 5)
+    n_kern1 = 30
+    shape_kern1 = (n_kern1, n_kern, 5, 5)
+
+    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32, 32), (5, 5), 'valid')
+    logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), (5, 5), 'valid')
+    n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
+    n_out = 10
+
+    w0 = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w0')
+    b0 = shared_fn(numpy.asarray(numpy.zeros((n_kern,1,1)), dtype='float32'), 'b0')
+    w1 = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern1)-0.5), dtype='float32'), 'w1')
+    b1 = shared_fn(numpy.asarray(numpy.zeros((n_kern1,1,1)), dtype='float32'), 'b1')
+    v = shared_fn(numpy.asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
+    c = shared_fn(numpy.asarray(numpy.zeros(n_out), dtype='float32'), 'c')
+
+    x = tensor.Tensor(dtype='float32', broadcastable=(0,0,0,0))('x')
+    y = tensor.fmatrix('y')
+    lr = tensor.fscalar('lr')
+
+    conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
+    conv_op1 = theano.sandbox.conv.ConvOp((n_kern, logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
+
+    hid = tensor.tanh(conv_op(x, w0)+b0)
+    hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1)
+    hid_flat = hid1.reshape((n_batch, n_hid))
+    out = tensor.tanh(tensor.dot(hid_flat, v)+c)
+    loss = tensor.sum(0.5 * (out-y)**2 * lr)
+    print 'loss type', loss.type
+
+    params = [w0, b0, w1, b1, v, c]
+    gparams = tensor.grad(loss, params)
+
+    mode = theano.compile.ProfileMode()
+
+    print 'building pfunc ...'
+    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
+
+    for i, n in enumerate(train.maker.env.toposort()):
+        print i, n
+
+    xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32')
+    yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
+    lr = numpy.asarray(0.01, dtype='float32')
+
+    for i in xrange(10):
+        rval = train(xval, yval, lr)
+    mode.print_summary()
+    return rval
+
+
+def test_conv_nnet2():
+    numpy.random.seed(23456)
+    rval_cpu = run_conv_nnet2(shared)
+    numpy.random.seed(23456)
+    rval_gpu = run_conv_nnet2(tcn.shared_constructor)
+    assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
@@ -149,19 +149,19 @@ class CudaNdarrayType(Type):
         for i, b in enumerate(self.broadcastable):
             if b:
                 print >> sio, """
-        if (cnda_%(name)s->dim[%(i)s] != 1)
+        if (CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s] != 1)
         {
-            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", cnda_%(name)s->dim[%(i)s], %(i)s);
+            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s], %(i)s);
             cnda_%(name)s = NULL;
             %(fail)s;
         }
         //std::cerr << "c_extract " << cnda_%(name)s << "dim check %(i)s passed\\n";
         //std::cerr << "c_extract " << cnda_%(name)s << "checking bcast %(i)s <" << cnda_%(name)s->str<< ">\\n";
         //std::cerr << "c_extract " << cnda_%(name)s->str[%(i)s] << "\\n";
-        if (cnda_%(name)s->str[%(i)s])
+        if (CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s])
         {
             //std::cerr << "c_extract bad stride detected...\\n";
-            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", cnda_%(name)s->str[%(i)s], %(i)s);
+            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s], %(i)s);
             cnda_%(name)s = NULL;
             %(fail)s;
         }
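Aside: c_extract is enforcing the type-level invariant that a broadcastable axis has size 1 and stride 0, so every index along that axis resolves to the same memory location. A tiny sketch of the invariant:

    #include <cassert>

    // A broadcastable axis must have unit size and zero stride.
    static bool broadcast_ok(int dim, int stride) { return dim == 1 && stride == 0; }

    int main()
    {
        assert(broadcast_ok(1, 0));    // valid broadcastable axis
        assert(!broadcast_ok(3, 0));   // size > 1: rejected with the first error above
        assert(!broadcast_ok(1, 4));   // nonzero stride: rejected with the second
        return 0;
    }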
...
@@ -44,7 +44,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
     }
     for (int i = 0; i < cnda->nd; ++i)
     {
-        if ((cnda->dim[i] > 1) and PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
+        if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) and PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
         {
             std::cerr << "Non-unit size in bcastable dim:\n";
             PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
...