improving the code for fixing ticket 1356

5d767fbb · Amjad Almahairi · f6689bbe · 5d767fbb
--- a/doc/tutorial/using_gpu_solution_1.py
+++ b/doc/tutorial/using_gpu_solution_1.py
@@ -28,8 +28,8 @@ rng.randint(size=N, low=0, high=2).astype(theano.config.floatX))
 training_steps = 10000
 # Declare Theano symbolic variables
-x = tt.matrix("x")
+x = theano.shared(D[0], name="x")
-y = tt.vector("y")
+y = theano.shared(D[1], name="y")
 w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
 b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
 x.tag.test_value = D[0]
@@ -58,11 +58,11 @@ predict = theano.function(inputs=[x], outputs=Out(theano.sandbox.cuda.basic_ops.
 # Compile expressions to functions
 train = theano.function(
-            inputs=[x, y],
+            inputs=[],
            outputs=[prediction, xent],
            updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
            name="train")
-predict = theano.function(inputs=[x], outputs=prediction,
+predict = theano.function(inputs=[], outputs=prediction,
            name="predict")
 if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
@@ -76,7 +76,7 @@ else:
    print train.maker.fgraph.toposort()
 for i in range(training_steps):
-    pred, err = train(D[0], D[1])
+    pred, err = train()
 #print "Final model:"
 #print w.get_value(), b.get_value()
@@ -84,7 +84,7 @@ print "target values for D"
 print D[1]
 print "prediction on D"
-print predict(D[0])
+print predict()
 """
@@ -308,7 +308,9 @@ Test them first, as they are not guaranteed to always provide a speedup.
 Facts:
 Examine and compare 'Single Op-wise' summaries for CPU and GPU. GPU ops 'GpuFromHost' (and 'HostFromGpu') by themselves
-consume a large amount of extra time. Furthermore, notice that each of the GPU ops consumes more time than its CPU counterpart.
+consume a large amount of extra time, but by making as few data transfers as possible between GPU and CPU, you can minimize their overhead.
+In addition, you will probably need to increase the input data size (e.g. set N = 4000) to see any speedup from the GPU.
+Furthermore, notice that each of the GPU ops consumes more time than its CPU counterpart.
 An additional experiment also confirms that adding an 'out' instance in the GPU version only brings about a minor
 improvement in this situation.