Corrected test_nnet to compare CPU and GPU results; misc fixes in basic_ops.py.

9c146bd8 · James Bergstra · b3528941 · 9c146bd8 · 9c146bd8
--- a/basic_ops.py
+++ b/basic_ops.py
@@ -27,7 +27,8 @@ class HostFromGpu(Op):
    def perform(self, node, (x,), (z,)):
        z[0] = numpy.asarray(x)
    def grad(self, inputs, (gz,)):
-        return [GpuFromHost()(gz)]
+        return gz,
+        #return [GpuFromHost()(gz)]
 host_from_gpu = HostFromGpu()
 class GpuFromHost(Op):
@@ -44,7 +45,8 @@ class GpuFromHost(Op):
    def perform(self, node, (x,), (z,)):
        z[0] = type_support_filter(numpy.asarray(x, dtype='float32'), tuple([0]*x.ndim), 0)
    def grad(self, inputs, (gz,)):
-        return [HostFromGpu()(gz)]
+        return gz,
+        #return [HostFromGpu()(gz)]
 gpu_from_host = GpuFromHost()
@@ -256,7 +258,7 @@ class GpuElemwise(Op):
            {
                int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
                //a ceil would be better here
-                int n_blocks = std::min(numEls/threads_per_block + 1, (unsigned int)NUM_VECTOR_OP_BLOCKS);
+                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
                kernel_%(nodename)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
                //std::cerr << "ADDCALL a str" << i0_str[0] << " "<< i0_str[1] << "\\n";
                //std::cerr << "ADDCALL a data" << i0_data << "\\n";
@@ -498,31 +500,34 @@ class GpuDimShuffle(Op):
        #alloc an output
        print >> sio, """
-        if (cnda_%(res)s)
+        if (cnda_%(res)s && (cnda_%(res)s->nd == %(nd_out)s))
        {
-            //TODO: re-use previously-allocated stuff
+            //re-use previously-allocated cnda
-            Py_DECREF(cnda_%(res)s);
-            cnda_%(res)s = NULL;
        }
-        if (NULL == cnda_%(res)s) {
+        else
+        {
+            if (cnda_%(res)s)
+            {
+                Py_DECREF(cnda_%(res)s);
+                cnda_%(res)s = NULL;
+            }
            cnda_%(res)s = (CudaNdarray*) CudaNdarray_new_null();
            if (NULL == cnda_%(res)s)
            {
                PyErr_SetString(PyExc_MemoryError, "Failed to allocate result");
                %(fail)s;
            }
+            if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s))
+            {
+                // err message set
+                Py_DECREF(cnda_%(res)s);
+                cnda_%(res)s = NULL;
+                %(fail)s;
+            }
        }
        """ %locals()
-        #get the copy / view of the input depending on whether we're doing things inplace or not.
        print >> sio, """
-        if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s))
-        {
-            // err message set
-            Py_DECREF(cnda_%(res)s);
-            cnda_%(res)s = NULL;
-            %(fail)s;
-        }
        if (CudaNdarray_set_device_data(cnda_%(res)s, CudaNdarray_DEV_DATA(cnda_%(input)s), cnda_%(input)s))
        {
            // err message set

--- a/tests/test_nnet.py
+++ b/tests/test_nnet.py
@@ -24,10 +24,10 @@ def run_nnet(use_gpu):
        v = tcn.shared_constructor(numpy.zeros((n_hid, n_out)), 'c')
        c = tcn.shared_constructor(numpy.zeros(n_out), 'c')
    else:
-        w = shared(0.01*(numpy.random.rand(n_in,n_hid)-0.5), 'w')
+        w = shared(numpy.asarray(0.01*(numpy.random.rand(n_in,n_hid)-0.5), dtype='float32'), 'w')
-        b = shared(numpy.zeros(n_hid), 'b')
+        b = shared(numpy.asarray(numpy.zeros(n_hid), dtype='float32'), 'b')
-        v = shared(numpy.zeros((n_hid, n_out)), 'c')
+        v = shared(numpy.asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
-        c = shared(numpy.zeros(n_out), 'c')
+        c = shared(numpy.asarray(numpy.zeros(n_out), dtype='float32'), 'c')
    x = tensor.fmatrix('x')
    y = tensor.fmatrix('y')
@@ -54,10 +54,14 @@ def run_nnet(use_gpu):
    lr = numpy.asarray(0.01, dtype='float32')
    for i in xrange(100):
-        train(xval, yval, lr)
+        rval = train(xval, yval, lr)
    mode.print_summary()
+    return rval
-def test_nnet_cpu():
+def test_nnet_cpu_gpu():
-    run_nnet(False)
+    numpy.random.seed(23456)
-def test_nnet_gpu():
+    rval_cpu = run_nnet(False)
-    run_nnet(True)
+    numpy.random.seed(23456)
+    rval_gpu = run_nnet(True)
+    assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)