Commit 7b2c5948 authored by Eric Larsen, committed by Frederic

Correct Theano's tutorial: round of corrections, add exercises and their links

Parent 2d5a2297
......@@ -136,26 +136,41 @@ arange must have its length specified at creation time.
Simple accumulation into a scalar, ditching lambda
--------------------------------------------------
Although this example seems almost self-explanatory, it stresses a
pitfall to be careful of: the initial output state that is supplied, that is
``outputs_info``, must be of a **shape similar to that of the output variable**
generated at each iteration and, moreover, it **must not involve an implicit
downcast** of the latter.
.. code-block:: python

    import numpy as np
    import theano
    import theano.tensor as T

    up_to = T.iscalar("up_to")

    # define a named function, rather than using lambda
    def accumulate_by_adding(arange_val, sum_to_date):
        return sum_to_date + arange_val
    seq = T.arange(up_to)

    # An implicit (and disallowed) downcast from the dtype of 'seq' to that
    # of 'T.as_tensor_variable(0)', which is of dtype 'int8' by default,
    # would occur if this instruction were used instead of the next one:
    # outputs_info = T.as_tensor_variable(0)
    outputs_info = T.as_tensor_variable(np.asarray(0, seq.dtype))

    scan_result, scan_updates = theano.scan(fn=accumulate_by_adding,
                                            outputs_info=outputs_info,
                                            sequences=seq)
    triangular_sequence = theano.function(inputs=[up_to], outputs=scan_result)

    # test
    some_num = 15
    print triangular_sequence(some_num)
    print [n * (n + 1) // 2 for n in xrange(some_num)]
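The dtype side of this pitfall can be illustrated with plain NumPy (an illustrative sketch of the same promotion rules, outside Theano):

```python
import numpy as np

# the sequence the accumulation iterates over, as in T.arange(up_to)
seq = np.arange(5, dtype=np.int32)

# matching the initial state's dtype to the sequence avoids any downcast
init_good = np.asarray(0, seq.dtype)

# a small-dtype initial state (analogous to the 'int8' default of
# T.as_tensor_variable(0)) gets upcast when combined with int32 data, so
# storing the int32 result back into the int8 state slot would require
# exactly the implicit downcast that scan refuses to perform
init_small = np.asarray(0, np.int8)

assert (init_good + seq).dtype == np.int32   # state dtype is preserved
assert (init_small + seq).dtype == np.int32  # result no longer fits the state's dtype
```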
Another simple example
......
......@@ -183,4 +183,9 @@ with NumPy arrays may be found here: :ref:`tensor creation<libdoc_tensor_creatio
Modify and execute this code to compute this expression: a**2 + b**2 + 2*a*b.
.. TODO: repair this link
:download:`Solution<../adding_solution_1.py>`
-------------------------------------------
# Theano tutorial
# Solution to Exercise in section 'Baby Steps - Algebra'
import theano
a = theano.tensor.vector() # declare variable
b = theano.tensor.vector() # declare variable
out = a ** 2 + b ** 2 + 2 * a * b # build symbolic expression
f = theano.function([a, b], out) # compile function
print f([1, 2], [4, 5]) # prints [ 25. 49.]
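As a plain NumPy cross-check (outside Theano), the expression is simply the binomial expansion of ``(a + b) ** 2``:

```python
import numpy as np

a = np.array([1.0, 2.0])
b = np.array([4.0, 5.0])

out = a ** 2 + b ** 2 + 2 * a * b  # 25.0 and 49.0, as above

# same values as the squared sum
assert np.allclose(out, (a + b) ** 2)
```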
......@@ -144,7 +144,7 @@ The ``compute_test_value`` mechanism works as follows:
which do not implement a ``perform`` method.
"How do I Print an Intermediate Value in a Function/Method?"
------------------------------------------------------------
Theano provides a 'Print' op to do this.
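The effect of the Print op can be sketched in plain Python (a hypothetical stand-in, not Theano's API): an identity function that prints its input as a side effect, which is essentially what wrapping a symbolic variable with Theano's Print op does:

```python
import numpy as np

def print_wrapper(message):
    # identity function with a printing side effect, mimicking how a
    # Print op passes its input through unchanged while displaying it
    def wrapper(x):
        print(message, x)
        return x
    return wrapper

x = np.array([1.0, 2.0])
y = print_wrapper('x value:')(x) * 2  # prints the intermediate value of x
assert np.allclose(y, [2.0, 4.0])
```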
......@@ -259,8 +259,8 @@ Use your imagination :)
This can be a really powerful debugging tool. Note the call to *fn* inside the call to
*print_eval*; without it, the graph wouldn't get computed at all!
How to Use pdb
--------------
In the majority of cases, you won't be executing from the interactive shell
but from a set of Python scripts. In such cases, the use of the Python
......
......@@ -10,8 +10,8 @@ Theano Graphs
- Theano works with symbolic graphs.
- Those graphs are bipartite graphs (graphs with 2 types of nodes).
- The two types of nodes are ``Apply`` and ``Variable`` nodes.
- Each ``Apply`` node has a link to the op that it executes.
Inputs and Outputs are lists of Theano variables.
......@@ -21,24 +21,25 @@ Inputs and Outputs are lists of Theano variables.
.. note::
This tutorial does not cover how to make an op that returns a view or
modifies the values in its inputs. Thus, all ops created with the
instructions described here MUST return newly allocated
memory or reuse the memory provided in the parameter
``output_storage`` of the :func:`perform` function. See :ref:`views_and_inplace`
for an explanation of how to do this.
If your op returns a view or changes the value of its inputs
without doing as prescribed in that page, Theano will run, but will
return correct results for some graphs and wrong results for others.
It is recommended that you run your tests in DebugMode (Theano flag
``mode=DebugMode``), since it verifies that your op behaves correctly in this
regard.
.. note::
See the :ref:`dev_start_guide` for information about the versioning
tools *git* and *GitHub*, the development workflow, and
how to make a quality contribution.
Op Contract
......@@ -90,7 +91,7 @@ Op Contract
.. ../extending/op.txt
There are two mandatory methods that one needs to implement.
The first one is :func:`make_node`. The second one
describes the computations that are required
at run time. Currently there are two different possibilities:
......@@ -105,7 +106,7 @@ a ``thunk``: a standalone function that when called will do the wanted computati
This is useful if you want to generate code and compile it yourself. For
example, this allows you to use PyCUDA to compile GPU code.
Also there are two methods whose implementations are highly recommended. They are
needed in order to merge duplicate computations involving your op. So if you
do not want Theano to execute your op multiple times with the same inputs,
do implement them. Those methods are :func:`__eq__` and
......@@ -182,9 +183,9 @@ You can try it as follows:
How To Test it
--------------
Theano has some functionalities to simplify testing. These help test the
``infer_shape``, ``grad`` and ``R_op`` methods. Put the following code
in a file and execute it with the ``theano-nose`` program.
**Basic Tests**
......@@ -304,10 +305,24 @@ To perform your tests, you may select either one of the three following methods:
**theano-nose**
The method of choice to conduct tests is to run the ``theano-nose`` script. In a regular
Theano installation, it will be on the operating system's path and directly accessible.
Otherwise, it can be found in the ``Theano/bin`` folder. The following command
lines may be used for the corresponding purposes:
* ``theano-nose``: Run every test found in Theano's path.
* ``theano-nose folder_name``: Run every test found in the folder *folder_name*.
* ``theano-nose test_file.py``: Run every test found in the file *test_file.py*.
The following are particularly useful for development purposes since they call for
particular classes or even for particular tests:
* ``theano-nose test_file.py:test_DoubleRop``: Run every test found inside the class *test_DoubleRop*.
* ``theano-nose test_file.py:test_DoubleRop.test_double_op``: Run only the test *test_double_op*
in the class *test_DoubleRop*.
Help with the use and functionalities of ``theano-nose`` may be obtained by running
it with the command-line parameter ``--help`` (``-h``).
......@@ -315,18 +330,10 @@ it with the command line parameter ``--help (-h)``.
**nosetests**
The command ``nosetests`` can also be used. Although it lacks the useful
functionalities that ``theano-nose`` provides, ``nosetests`` can be called similarly
to ``theano-nose`` from any folder in Python's path like so:

``nosetests [suffix similar to the above]``.
More documentation on ``nosetests`` is available here:
`nosetests <http://readthedocs.org/docs/nose/en/latest/>`_.
......@@ -351,12 +358,18 @@ file containing a specific test of interest and run the file. In this example, t
**Exercise**
Run the code of the *DoubleOp* example above.

Modify and execute it to compute: x * y.

Modify and execute the example to return two outputs: x + y and x - y.
You can omit the Rop functions. Try to implement the testing apparatus described above.

(Notice that Theano's current *elemwise fusion* optimization is
only applicable to computations involving a single output. Hence, to gain
efficiency over the basic solution asked for here, the two operations would
have to be jointly optimized explicitly in the code.)
SciPy
-----
......@@ -401,6 +414,11 @@ don't forget to call the parent ``setUp`` function.
For more details see :ref:`random_value_in_tests`.
.. TODO: repair this link
:download:`Solution<../extending_theano_solution_1.py>`
-------------------------------------------
**A Final Note:**
......
# Theano tutorial
# Solution to Exercise in section 'Extending Theano'

import theano


# 1. Op returns x * y

class ProdOp(theano.Op):
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x, y):
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        outdim = x.ndim
        output = (theano.tensor.TensorType
                  (dtype=theano.scalar.upcast(x.dtype, y.dtype),
                   broadcastable=[False] * outdim)())
        return theano.Apply(self, inputs=[x, y], outputs=[output])

    def perform(self, node, inputs, output_storage):
        x, y = inputs
        z = output_storage[0]
        z[0] = x * y

    def infer_shape(self, node, i0_shapes):
        return [i0_shapes[0]]

    def grad(self, inputs, output_grads):
        return [output_grads[0] * inputs[1], output_grads[0] * inputs[0]]


# 2. Op returns x + y and x - y

class SumDiffOp(theano.Op):
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x, y):
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        outdim = x.ndim
        output1 = (theano.tensor.TensorType
                   (dtype=theano.scalar.upcast(x.dtype, y.dtype),
                    broadcastable=[False] * outdim)())
        output2 = (theano.tensor.TensorType
                   (dtype=theano.scalar.upcast(x.dtype, y.dtype),
                    broadcastable=[False] * outdim)())
        return theano.Apply(self, inputs=[x, y], outputs=[output1, output2])

    def perform(self, node, inputs, output_storage):
        x, y = inputs
        z1, z2 = output_storage
        z1[0] = x + y
        z2[0] = x - y

    def infer_shape(self, node, i0_shapes):
        return [i0_shapes[0], i0_shapes[0]]

    def grad(self, inputs, output_grads):
        return [output_grads[0] + output_grads[1],
                output_grads[0] - output_grads[1]]


# 3. Testing apparatus

import numpy
from theano.gof import Op, Apply
from theano import tensor, function, printing
from theano.tests import unittest_tools as utt


class TestOp(utt.InferShapeTester):
    rng = numpy.random.RandomState(43)

    def setUp(self):
        super(TestOp, self).setUp()
        # adapt the choice of the next instruction to the op under test
        self.op_class = ProdOp     # case 1
        # self.op_class = SumDiffOp  # case 2

    def test_perform(self):
        x = theano.tensor.matrix()
        y = theano.tensor.matrix()
        f = theano.function([x, y], self.op_class()(x, y))
        x_val = numpy.random.rand(5, 4)
        y_val = numpy.random.rand(5, 4)
        out = f(x_val, y_val)
        # adapt the choice of the next instruction to the op under test
        assert numpy.allclose(x_val * y_val, out)                     # case 1
        # assert numpy.allclose([x_val + y_val, x_val - y_val], out)  # case 2

    def test_gradient(self):
        utt.verify_grad(self.op_class(), [numpy.random.rand(5, 4),
                                          numpy.random.rand(5, 4)],
                        n_tests=1, rng=TestOp.rng)

    def test_infer_shape(self):
        x = tensor.dmatrix()
        y = tensor.dmatrix()
        # adapt the choice of the next instruction to the op under test
        self._compile_and_check([x, y], [self.op_class()(x, y)],  # case 1
                                [numpy.random.rand(5, 6),
                                 numpy.random.rand(5, 6)],
                                self.op_class)
        """
        self._compile_and_check([x, y], self.op_class()(x, y),    # case 2
                                [numpy.random.rand(5, 6),
                                 numpy.random.rand(5, 6)],
                                self.op_class)
        """


if __name__ == "__main__":
    t = TestOp('setUp')
    t.setUp()
    t.test_perform()
    # comment out the next instruction in case 2, since automatic testing
    # of the gradient of multiple-output functions is not implemented yet
    t.test_gradient()   # enable in case 1, disable in case 2
    t.test_infer_shape()
......@@ -15,7 +15,7 @@ be implemented on Theano variables:
TypeError: object of type 'TensorVariable' has no len()
Python requires that *__len__* returns an integer, yet it cannot be done, as Theano's variables are symbolic. However, `var.shape[0]` can be used as a workaround.
This error message cannot be made more explicit because the relevant aspects of Python's
internals cannot be modified.
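The constraint can be illustrated with a tiny stand-in class (illustrative only, not Theano's implementation):

```python
class SymbolicVar(object):
    # stand-in for a symbolic variable: its length is only known
    # symbolically, so __len__ has nothing integral to return
    def __len__(self):
        return "symbolic_length"  # not an int: len() will reject this

    @property
    def shape(self):
        return ("shape_of_var",)  # a symbolic shape; shape[0] stays usable

v = SymbolicVar()
try:
    length = len(v)
except TypeError:
    length = v.shape[0]  # the workaround mentioned above

assert length == "shape_of_var"
```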
......@@ -36,3 +36,18 @@ Related Projects
We try to list other Theano-related projects in this `wiki page <https://github.com/Theano/Theano/wiki/Related-projects>`_.
"What are Theano's Limitations?"
--------------------------------
Theano offers a good amount of flexibility, but has some limitations too.
You must answer for yourself the following question: How can my algorithm be cleverly written
so as to make the most of what Theano can do?
Here is a list of some of the known limitations:
- *While*- or *for*-Loops within an expression graph are supported, but only via
the :func:`theano.scan` op (which puts restrictions on how the loop body can
interact with the rest of the graph).
- Neither *goto* nor *recursion* is supported or planned within expression graphs.
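The loop restriction is less limiting than it may sound: the accumulation pattern that ``scan`` covers can be sketched in a few lines of plain Python (illustrative only, not Theano's implementation):

```python
def scan_like(fn, sequences, outputs_info):
    # apply fn once per sequence element, threading the running state
    # through, and collect the state after every step (as scan does)
    state = outputs_info
    results = []
    for elem in sequences:
        state = fn(elem, state)
        results.append(state)
    return results

# running sum of 0..4: the same values as the scan accumulation examples
assert scan_like(lambda x, acc: acc + x, range(5), 0) == [0, 1, 3, 6, 10]
```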
......@@ -98,7 +98,7 @@ Computing the Jacobian
======================
In Theano's parlance, the term *Jacobian* designates the tensor comprising the
first partial derivatives of the output of a function with respect to its inputs.
(This is a generalization of the so-called Jacobian matrix in mathematics.)
Theano implements the :func:`theano.gradient.jacobian` macro that does all
that is needed to compute the Jacobian. The following text explains how
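Before turning to the symbolic machinery, the object being computed can be pinned down numerically; a finite-difference sketch of the Jacobian just defined (plain NumPy, not Theano's :func:`theano.gradient.jacobian`):

```python
import numpy as np

def numeric_jacobian(f, x, eps=1e-6):
    # finite-difference approximation of the matrix of first partial
    # derivatives of f at x; rows index outputs, columns index inputs
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x))
    J = np.zeros((y0.size, x.size))
    for j in range(x.size):
        dx = np.zeros_like(x)
        dx[j] = eps
        J[:, j] = (np.asarray(f(x + dx)) - y0) / eps
    return J

# y = x**2 has the diagonal Jacobian 2*x
J = numeric_jacobian(lambda x: x ** 2, np.array([1.0, 2.0, 3.0]))
assert np.allclose(J, np.diag([2.0, 4.0, 6.0]), atol=1e-3)
```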
......@@ -115,7 +115,7 @@ do is to loop over the entries in *y* and compute the gradient of
manner all kinds of recurrent equations. While creating
symbolic loops (and optimizing them for performance) is a hard task,
effort is being made to improve the performance of ``scan``. We
shall return to :ref:`scan<tutloop>` later in this tutorial.
>>> x = T.dvector('x')
>>> y = x**2
......
......@@ -18,7 +18,8 @@ of Theano. Let us import that subpackage under a handy name like
If that succeeded you are ready for the tutorial, otherwise check your
installation (see :ref:`install`).
Throughout the tutorial, bear in mind that there is a :ref:`glossary` as well
as *index* and *modules* links in the upper-right corner of each page to help
you out.
.. toctree::
......
......@@ -32,9 +32,12 @@ The full documentation can be found in the library: :ref:`Scan <lib_scan>`.
import theano.tensor as T
theano.config.warn.subtensor_merge_bug = False
k = T.iscalar("k")
A = T.vector("A")

def inner_fct(prior_result, A):
    return prior_result * A
# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
outputs_info=T.ones_like(A),
......@@ -61,7 +64,8 @@ The full documentation can be found in the library: :ref:`Scan <lib_scan>`.
theano.config.warn.subtensor_merge_bug = False
coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x")
max_coefficients_supported = 10000
# Generate the components of the polynomial
full_range = theano.tensor.arange(max_coefficients_supported)
......@@ -70,6 +74,7 @@ The full documentation can be found in the library: :ref:`Scan <lib_scan>`.
                                  outputs_info=None,
                                  sequences=[coefficients, full_range],
                                  non_sequences=x)

polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
                                       outputs=polynomial)
......@@ -89,4 +94,9 @@ Run both examples.
Modify and execute the polynomial example to have the reduction done by ``scan``.
.. TODO: repair this link as well as the code in the target file
:download:`Solution<../loop_solution_1.py>`
-------------------------------------------
# Theano tutorial
# Solution to Exercise in section 'Loop'

"""
# 1. First example (runs satisfactorily)

import theano
import theano.tensor as T

theano.config.warn.subtensor_merge_bug = False

k = T.iscalar("k")
A = T.vector("A")


def inner_fct(prior_result, A):
    return prior_result * A

# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
                              outputs_info=T.ones_like(A),
                              non_sequences=A, n_steps=k)

# Scan has provided us with A**1 through A**k.  Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]

power = theano.function(inputs=[A, k], outputs=final_result,
                        updates=updates)

print power(range(10), 2)
# [  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]


# 2. Second example (runs satisfactorily)

import numpy
import theano
import theano.tensor as T

coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x")
max_coefficients_supported = 10000

# Generate the components of the polynomial
full_range = theano.tensor.arange(max_coefficients_supported)

components, updates = theano.scan(fn=lambda coeff, power, free_var:
                                            coeff * (free_var ** power),
                                  outputs_info=None,
                                  sequences=[coefficients, full_range],
                                  non_sequences=x)
polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
                                       outputs=polynomial)

test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
"""
# 3. Reduction performed inside scan

import numpy
import theano
import theano.tensor as T

theano.config.warn.subtensor_merge_bug = False

coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x")
max_coefficients_supported = 10000

# Generate the components of the polynomial
full_range = theano.tensor.arange(max_coefficients_supported)
outputs_info = T.as_tensor_variable(numpy.asarray(0, 'float64'))

# Note the argument order that scan imposes on fn: the sequence elements
# come first, then the prior accumulated value, then the non-sequence.
# (With the prior value placed first, the arguments get mixed up and the
# result is 56.0 instead of 19.0.)
components, updates = theano.scan(fn=lambda coeff, power, prior_value, free_var:
                                            prior_value + (coeff * (free_var ** power)),
                                  outputs_info=outputs_info,
                                  sequences=[coefficients, full_range],
                                  non_sequences=x)

polynomial = components[-1]
calculate_polynomial = theano.function(inputs=[coefficients, x],
                                       outputs=polynomial, updates=updates)

test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
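As a quick NumPy cross-check of the value the scan reduction should produce:

```python
import numpy as np

test_coeff = np.asarray([1, 0, 2], dtype=np.float32)
x = 3.0

# evaluate sum_i coefficients[i] * x**i directly
value = sum(c * x ** p for p, c in enumerate(test_coeff))
assert value == 19.0
```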
......@@ -82,16 +82,15 @@ Consider the logistic regression:
predict = theano.function(inputs=[x], outputs=prediction,
                          name="predict")
if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
        train.maker.fgraph.toposort()]):
    print 'Used the cpu'
elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in
          train.maker.fgraph.toposort()]):
    print 'Used the gpu'
else:
    print 'ERROR, not able to tell if theano used the cpu or the gpu'
    print train.maker.fgraph.toposort()
for i in range(training_steps):
pred, err = train(D[0], D[1])
......@@ -127,6 +126,10 @@ time the execution using the command line ``time python file.py``.
* Insert manual cast around the mean operator (this involves division by length, which is an *int64*).
* Notice that a new casting mechanism is being developed.
.. TODO: repair this link
:download:`Solution<../modes_solution_1.py>`
-------------------------------------------
Mode
......@@ -142,7 +145,7 @@ Theano defines the following modes by name:
- ``'FAST_COMPILE'``: Apply just a few graph optimizations and only use Python implementations.
- ``'FAST_RUN'``: Apply all optimizations, and use C implementations where possible.
- ``'DEBUG_MODE'``: Verify the correctness of all optimizations, and compare C and Python
implementations. This mode can take much longer than the other modes, but can identify
several kinds of problems.
- ``'PROFILE_MODE'``: Same optimizations as ``'FAST_RUN'``, but print some profiling information.
......
# Theano tutorial
# Solution to Exercise in section 'Configuration Settings and Compiling Modes'

import numpy
import theano
import theano.tensor as T

theano.config.floatX = 'float32'

rng = numpy.random

N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX),
     rng.randint(size=N, low=0, high=2).astype(theano.config.floatX))
training_steps = 10000

# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()

# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))  # Probability of having a one
prediction = p_1 > 0.5  # The prediction that is done: 0 or 1
xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1)  # Cross-entropy
cost = T.cast(xent.mean(), 'float32') + \
       0.01 * (w ** 2).sum()  # The cost to optimize
gw, gb = T.grad(cost, [w, b])

# Compile expressions to functions
train = theano.function(
            inputs=[x, y],
            outputs=[prediction, xent],
            updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
            name="train")
predict = theano.function(inputs=[x], outputs=prediction,
                          name="predict")

if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
        train.maker.fgraph.toposort()]):
    print 'Used the cpu'
elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in
          train.maker.fgraph.toposort()]):
    print 'Used the gpu'
else:
    print 'ERROR, not able to tell if theano used the cpu or the gpu'
    print train.maker.fgraph.toposort()

for i in range(training_steps):
    pred, err = train(D[0], D[1])
#print "Final model:"
#print w.get_value(), b.get_value()

print "target values for D"
print D[1]
print "prediction on D"
print predict(D[0])
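The gradients Theano derives symbolically for this model can be cross-checked by hand; a NumPy sketch of the same cost with hand-derived gradients and a finite-difference test (small sizes for speed; illustrative only):

```python
import numpy as np

rng = np.random.RandomState(0)
N, feats = 20, 5
x = rng.randn(N, feats)
y = rng.randint(0, 2, N).astype(float)
w = rng.randn(feats)
b = 0.0

def cost_fn(w, b):
    p_1 = 1 / (1 + np.exp(-np.dot(x, w) - b))  # probability of a one
    xent = -y * np.log(p_1) - (1 - y) * np.log(1 - p_1)
    return xent.mean() + 0.01 * (w ** 2).sum()

# hand-derived gradients of the same cost
p_1 = 1 / (1 + np.exp(-np.dot(x, w) - b))
gw = np.dot(x.T, p_1 - y) / N + 0.02 * w
gb = (p_1 - y).mean()

# finite-difference check on the bias gradient
eps = 1e-6
assert abs((cost_fn(w, b + eps) - cost_fn(w, b)) / eps - gb) < 1e-4
```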
......@@ -5,21 +5,8 @@
Some General Remarks
=====================
.. TODO: This discussion is awkward. Even with this beneficial reordering (28 July 2012)
.. its purpose and message are for the moment unclear.
Limitations
-----------
Theano offers a good amount of flexibility, but has some limitations too.
You must answer for yourself the following question: How can my algorithm be cleverly written
so as to make the most of what Theano can do?
- *While*- or *for*-Loops within an expression graph are supported, but only via
the :func:`theano.scan` op (which puts restrictions on how the loop body can
interact with the rest of the graph).
- Neither *goto* nor *recursion* is supported or planned within expression graphs.
.. This section is reserved for remarks and discussions of a general scope regarding the
.. nature and development of Theano.
.. The FAQ section is dedicated to specifics, i.e. information regarding the what, how,
.. why and if of Theano.
......@@ -4,9 +4,6 @@
Sparse
======
Sparse Matrices
===============
In general, *sparse* matrices provide the same functionality as regular
matrices. The difference lies in the way the elements of *sparse* matrices are
represented and stored in memory. Only the non-zero elements of the latter are stored.
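As a quick illustration of that storage scheme with SciPy (whose sparse matrices Theano's sparse module builds on), assuming SciPy is installed:

```python
import numpy as np
from scipy import sparse

dense = np.array([[0, 0, 3],
                  [4, 0, 0]])
m = sparse.csr_matrix(dense)

# only the two non-zero elements are actually stored
assert m.nnz == 2
# the dense view can always be recovered
assert (m.toarray() == dense).all()
```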
......
......@@ -18,18 +18,18 @@ The first step in writing Theano code is to write down all mathematical
relations using symbolic placeholders (**variables**). When writing down
these expressions you use operations like ``+``, ``-``, ``**``,
``sum()``, ``tanh()``. All these are represented internally as **ops**.
An *op* represents a certain computation on some type of inputs
producing some type of output. You can see it as a *function definition*
in most programming languages.
Theano builds internally a graph structure composed of interconnected
**variable** nodes, **op** nodes and **apply** nodes. An
*apply* node represents the application of an *op* to some
*variables*. It is important to draw the difference between the
definition of a computation represented by an *op* and its application
to some actual data which is represented by the *apply* node. For more
detail about these building blocks refer to :ref:`variable`, :ref:`op`,
:ref:`apply`. Here is an example of a graph:
**Code**
......@@ -54,9 +54,9 @@ detail about these building blocks see :ref:`variable`, :ref:`op`,
WARNING: hyper-links and ref's seem to break the PDF build when placed
into this figure caption.
Arrows in this figure represent references to the
Python objects pointed at. The blue
box is an :ref:`Apply` node. Red boxes are :ref:`Variable` nodes. Green
circles are :ref:`Ops <op>`. Purple boxes are :ref:`Types <type>`.
......@@ -111,18 +111,18 @@ Automatic Differentiation
Having the graph structure, computing automatic differentiation is
simple. The only thing :func:`tensor.grad` has to do is to traverse the
graph from the outputs back towards the inputs through all *apply*
nodes (*apply* nodes are those that define which computations the
graph does). For each such *apply* node, its *op* defines
how to compute the *gradient* of the node's outputs with respect to its
inputs. Note that if an *op* does not provide this information,
it is assumed that the *gradient* is not defined.
Using the
`chain rule <http://en.wikipedia.org/wiki/Chain_rule>`_
these gradients can be composed in order to obtain the expression of the
*gradient* of the graph's output with respect to the graph's inputs.
A following section of this tutorial will examine the topic of :ref:`differentiation<tutcomputinggrads>`
in greater detail.
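The traversal just described can be condensed into a toy reverse-mode sketch (illustrative plain Python, not Theano's implementation): each node records, for every input, the local gradient of its output with respect to that input, and the graph is walked from the output back towards the inputs while the chain rule accumulates the products:

```python
class Var(object):
    # toy variable node holding a value, an accumulated gradient, and
    # (input, local_gradient) pairs recorded by the op that produced it
    def __init__(self, value, parents=()):
        self.value = value
        self.grad = 0.0
        self.parents = parents

def mul(a, b):
    return Var(a.value * b.value, parents=[(a, b.value), (b, a.value)])

def add(a, b):
    return Var(a.value + b.value, parents=[(a, 1.0), (b, 1.0)])

def backprop(out):
    # walk from the output back towards the inputs, applying the chain
    # rule; a topological ordering would be needed for general graphs,
    # but this simple walk suffices for tree-shaped examples like below
    out.grad = 1.0
    stack = [out]
    while stack:
        node = stack.pop()
        for parent, local_grad in node.parents:
            parent.grad += node.grad * local_grad
            stack.append(parent)

x = Var(3.0)
y = Var(4.0)
z = add(mul(x, x), y)  # z = x**2 + y
backprop(z)
assert x.grad == 6.0  # dz/dx = 2 * x
assert y.grad == 1.0  # dz/dy = 1
```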
......@@ -142,7 +142,7 @@ identical subgraphs and ensure that the same values are not computed
twice or reformulate parts of the graph to a GPU specific version.
For example, one (simple) optimization that Theano uses is to replace
the pattern :math:`\frac{xy}{y}` by :math:`x`.
Further information regarding the optimization
:ref:`process<optimization>` and the specific :ref:`optimizations<optimizations>` that are applicable
......
......@@ -44,6 +44,7 @@ file and run it.
rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], T.exp(x))
print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
    r = f()
......@@ -59,23 +60,32 @@ The program just computes the ``exp()`` of a bunch of random numbers.
Note that we use the ``shared`` function to
make sure that the input *x* is stored on the graphics device.
If I run this program (in check1.py) with ``device=cpu``, my computer takes a little over 3 seconds,
whereas on the GPU it takes just over 0.64 seconds. The GPU will not always produce exactly the
same floating-point numbers as the CPU. As a benchmark, a loop that calls ``numpy.exp(x.get_value())`` takes about 46 seconds.

.. the following figures have been measured twice on BART3 on 2 Aug 2012 with no other job running simultaneously

.. code-block:: text

    $ THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32 python check1.py
    [Elemwise{exp,no_inplace}(<TensorType(float32, vector)>)]
    Looping 1000 times took 3.06635117531 seconds
    Result is [ 1.23178029  1.61879337  1.52278066 ...,  2.20771813  2.29967761
      1.62323284]
    Used the cpu

.. code-block:: text

    $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python check1.py
    Using gpu device 0: GeForce GTX 580
    [GpuElemwise{exp,no_inplace}(<CudaNdarrayType(float32, vector)>), HostFromGpu(GpuElemwise{exp,no_inplace}.0)]
    Looping 1000 times took 0.638810873032 seconds
    Result is [ 1.23178029  1.61879349  1.52278066 ...,  2.20771813  2.29967761
      1.62323296]
    Used the gpu
Note that, for now, GPU operations in Theano require ``floatX`` to be *float32* (see also below).
Returning a Handle to Device-Allocated Data
-------------------------------------------
after the ``T.exp(x)`` is replaced by a GPU version of ``exp()``.
rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], sandbox.cuda.basic_ops.gpu_from_host(T.exp(x)))
print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
r = f()
The output from this program is
.. code-block:: text
$ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python check2.py
Using gpu device 0: GeForce GTX 580
[GpuElemwise{exp,no_inplace}(<CudaNdarrayType(float32, vector)>)]
Looping 1000 times took 0.34898686409 seconds
Result is <CudaNdarray object at 0x6a7a5f0>
Numpy result is [ 1.23178029 1.61879349 1.52278066 ..., 2.20771813 2.29967761
1.62323296]
Used the gpu
Here we've shaved off about 50% of the run-time by simply not copying the
resulting array back to the host.
NumPy casting mechanism.
Running the GPU at Full Speed
------------------------------
.. TODO: the discussion of this section is unintelligible to a beginner
.. TODO: is it better now?
To really get maximum performance in this simple example, we need to use an
:class:`out<function.Out>` instance with the flag ``borrow=True`` to tell Theano not to copy
the output it returns to us. This is because Theano pre-allocates memory for internal use
(like working buffers), and by default will never return a result that is aliased to one of
its internal buffers: instead, it will copy the buffers associated to outputs into newly
allocated memory at each function call. This is to ensure that subsequent function calls will
not overwrite previously computed outputs. Although this is normally what you want, our last
example was so simple that it had the unwanted side-effect of really slowing things down.
..
import numpy
import time
vlen = 10 * 30 * 768 # 10 x # cores x # threads per core
iters = 1000
rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([],
Out(sandbox.cuda.basic_ops.gpu_from_host(T.exp(x)),
borrow=True))
print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
r = f()
else:
print 'Used the gpu'
Running this version of the code takes just over 0.05 seconds, about 60x faster than
the CPU implementation!
.. code-block:: text
With *flag* ``borrow=True``:

$ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python using_gpu_solution_1.py
Using gpu device 0: GeForce GTX 580
[GpuElemwise{exp,no_inplace}(<CudaNdarrayType(float32, vector)>)]
Looping 1000 times took 0.0502779483795 seconds
Result is <CudaNdarray object at 0x83e5cb0>
Numpy result is [ 1.23178029 1.61879349 1.52278066 ..., 2.20771813 2.29967761
1.62323296]
Used the gpu

With *flag* ``borrow=False``:

$ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python using_gpu_solution_1.py
Using gpu device 0: GeForce GTX 580
[GpuElemwise{exp,no_inplace}(<CudaNdarrayType(float32, vector)>)]
Looping 1000 times took 0.31614613533 seconds
Result is <CudaNdarray object at 0x77e9270>
Numpy result is [ 1.23178029 1.61879349 1.52278066 ..., 2.20771813 2.29967761
1.62323296]
Used the gpu
This version of the code including the flag ``borrow=True`` is slightly less safe because if we had saved
the *r* returned from one function call, we would have to take care and remember that its value might
be over-written by a subsequent function call. Although ``borrow=True`` makes a dramatic difference
in this example, be careful! The advantage of ``borrow=True`` is much weaker in larger graphs, and
there is a lot of potential for making a mistake by failing to account for the resulting memory aliasing.
What Can Be Accelerated on the GPU
----------------------------------
The performance characteristics will change as we continue to optimize our
implementations, and vary from device to device, but to give a rough idea of
Tips for Improving Performance on GPU
-------------------------------------
eliminate transfer time for GPU ops using those variables.
* If you aren't happy with the performance you see, try building your functions with
``mode='PROFILE_MODE'``. This should print some timing information at program
termination. Is time being used sensibly? If an op or Apply is
taking more time than its share, then if you know something about GPU
programming, have a look at how it's implemented in theano.sandbox.cuda.
Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*.
see :ref:`aliasing`.
**Exercise**
Consider the logistic regression:
.. code-block:: python
predict = theano.function(inputs=[x], outputs=prediction,
name = "predict")
if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
        train.maker.fgraph.toposort()]):
    print 'Used the cpu'
elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in
          train.maker.fgraph.toposort()]):
    print 'Used the gpu'
else:
    print 'ERROR, not able to tell if theano used the cpu or the gpu'
    print train.maker.fgraph.toposort()
for i in range(training_steps):
pred, err = train(D[0], D[1])
Is there an increase in speed from CPU to GPU?
Where does it come from? (Use ``ProfileMode``)
What can be done to further increase the speed of the GPU version? Put your ideas to the test.
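While answering these questions, it can help to keep a plain-NumPy transcription of the training loop at hand as a CPU reference. The sketch below uses shrunken, hypothetical hyperparameters (40 examples, 8 features, 100 steps); it mirrors the gradients Theano derives for ``xent.mean() + 0.01 * (w ** 2).sum()``, but is not the script above:

```python
import numpy

rng = numpy.random.RandomState(0)
N, feats = 40, 8                     # much smaller than the 400 x 784 above
D = (rng.randn(N, feats).astype('float32'),
     rng.randint(size=N, low=0, high=2).astype('float32'))

w = rng.randn(feats).astype('float32')
b = 0.0

for i in range(100):
    p_1 = 1.0 / (1.0 + numpy.exp(-(D[0].dot(w) + b)))  # sigmoid, as in the graph
    gw = D[0].T.dot(p_1 - D[1]) / N + 2 * 0.01 * w     # grad of xent.mean() + L2 term
    gb = (p_1 - D[1]).mean()                           # grad of xent.mean() wrt b
    w = w - 0.01 * gw
    b = b - 0.01 * gb

prediction = p_1 > 0.5
print('training accuracy: %f' % (prediction == (D[1] > 0.5)).mean())
```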
.. Note::
* Only 32-bit floats are currently supported (development is in progress).
* ``Shared`` variables with *float32* dtype are by default moved to the GPU memory space.
* There is a limit of one GPU per process.
* Insert a manual cast around the mean operator (this involves a division by length, which is an *int64*).
* Notice that a new casting mechanism is being developed.
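The note about the mean operator can be seen directly in NumPy, whose promotion rules are what is at stake here: dividing a *float32* sum by an *int64* length silently upcasts to *float64*, and a manual cast undoes that. This is a sketch of the casting mechanism only, not of the Theano graph:

```python
import numpy

x = numpy.ones(5, dtype='float32')
n = numpy.int64(x.shape[0])            # lengths are int64, as in the note

m = x.sum() / n                        # float32 / int64 -> float64 upcast
print(m.dtype)                         # float64

m32 = (x.sum() / n).astype('float32')  # the manual cast the exercise suggests
print(m32.dtype)                       # float32
```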
.. TODO: repair this link
:download:`Solution<../using_gpu_solution_1.py>`
-------------------------------------------
Software for Directly Programming a GPU
---------------------------------------
Leaving aside Theano, which is a meta-programmer, there are:
* **CUDA**: GPU programming API by NVIDIA, based on an extension to C (CUDA C)
* Vendor-specific
* Numeric libraries (BLAS, RNG, FFT) are maturing.
* **OpenCL**: multi-vendor version of CUDA
* More general, standardized.
* Fewer libraries, lesser spread.
* **PyCUDA**: Python bindings to the CUDA driver interface, allowing access to Nvidia's CUDA parallel computation API from Python
* Convenience:
Makes it easy to do GPU meta-programming from within Python.
Abstractions to compile low-level CUDA code from Python (``pycuda.driver.SourceModule``).
GPU memory buffer (``pycuda.gpuarray.GPUArray``).
Helpful documentation.
* Completeness: Binding to all of CUDA's driver API.
* Automatic error checking: All CUDA errors are automatically translated into Python exceptions.
* Speed: PyCUDA's base layer is written in C++.
* Good memory management of GPU objects:
Object cleanup tied to lifetime of objects (RAII, 'Resource Acquisition Is Initialization').
Makes it much easier to write correct, leak- and crash-free code.
PyCUDA knows about dependencies (e.g. it won't detach from a context before all memory allocated in it is also freed).
(This is adapted from PyCUDA's `documentation <http://documen.tician.de/pycuda/index.html>`_
and Andreas Kloeckner's `website <http://mathema.tician.de/software/pycuda>`_ on PyCUDA.)
* **PyOpenCL**: PyCUDA for OpenCL
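The RAII idea that PyCUDA borrows from C++ can be sketched in pure Python with a context manager; this is a generic illustration of scope-tied cleanup, not PyCUDA's actual API:

```python
class Resource(object):
    """Ties cleanup to the lifetime of a scope, RAII-style."""

    def __init__(self, name):
        self.name = name
        self.freed = False

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.freed = True   # cleanup runs no matter how the block exits
        return False

with Resource('gpu_buffer') as r:
    pass  # use the resource here

print(r.freed)  # True: released as soon as the scope ended
```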
Learning to Program with PyCUDA
-------------------------------
If you are already proficient in C, you can easily leverage that knowledge by
first learning to program a GPU with the CUDA extension to C (CUDA C) and then
using PyCUDA to access the CUDA API through a Python wrapper.
The following resources will assist you in this learning process:
* **CUDA API and CUDA C: Introductory**
* `NVIDIA's slides <http://www.sdsc.edu/us/training/assets/docs/NVIDIA-02-BasicsOfCUDA.pdf>`_
* `Stein's (NYU) slides <http://www.cs.nyu.edu/manycores/cuda_many_cores.pdf>`_
* **CUDA API and CUDA C: Advanced**
* `MIT IAP2009 CUDA <https://sites.google.com/site/cudaiap2009/home>`_
(full coverage: lectures, leading Kirk-Hwu textbook, examples, additional resources)
* `University of Illinois course <http://courses.engr.illinois.edu/ece498/al/index.html>`_
(full lectures, Kirk-Hwu textbook)
* `NVIDIA's knowledge base <http://www.nvidia.com/content/cuda/cuda-developer-resources.html>`_
(extensive coverage, levels from introductory to advanced)
* `practical issues <http://stackoverflow.com/questions/2392250/understanding-cuda-grid-dimensions-block-dimensions-and-threads-organization-s>`_
(on the relationship between grids, blocks and threads; see also linked and related issues on same page)
* `CUDA optimisation <http://www.gris.informatik.tu-darmstadt.de/cuda-workshop/slides.html>`_
* **PyCUDA: Introductory**
* `Kloeckner's slides <http://www.gputechconf.com/gtcnew/on-demand-gtc.php?sessionTopic=&searchByKeyword=kloeckner&submit=&select=+&sessionEvent=2&sessionYear=2010&sessionFormat=3>`_
* `Kloeckner's website <http://mathema.tician.de/software/pycuda>`_
* **PyCUDA: Advanced**
* `PyCUDA documentation website <http://documen.tician.de/pycuda/>`_
The following examples give a foretaste of programming a GPU with PyCUDA. Once
you feel competent enough, you may try your hand at the corresponding exercises.
**Example: PyCUDA**
.. code-block:: python
# (from PyCUDA's documentation)
import pycuda.autoinit
import pycuda.driver as drv
import numpy
Run the preceding example.
Modify and execute to multiply two matrices: *x* * *y*.
Modify and execute to return two outputs: *x + y* and *x - y*.
(Notice that Theano's current *elemwise fusion* optimization is
only applicable to computations involving a single output. Hence, to gain
efficiency over the basic solution that is asked here, the two operations would
have to be jointly optimized explicitly in the code.)
Modify and execute to support *stride* (i.e. so as not to constrain the input to be *C-contiguous*).
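For the stride exercise, NumPy makes it easy to produce non-C-contiguous test inputs: transposes and strided slices are views whose strides differ from the dense C layout. A small sketch of the attributes involved:

```python
import numpy

x = numpy.arange(12, dtype='float32').reshape(3, 4)
print(x.flags['C_CONTIGUOUS'])   # True
print(x.strides)                 # (16, 4): a row step is 4 floats = 16 bytes

y = x.T                          # a view with swapped strides, not a copy
print(y.flags['C_CONTIGUOUS'])   # False
print(y.strides)                 # (4, 16)

z = numpy.ascontiguousarray(y)   # copies the data back into C order
print(z.flags['C_CONTIGUOUS'])   # True
```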
# Theano tutorial
# Solution to Exercise in section 'Using the GPU'
# 1. Raw results
#
# same code as in mode_solution_1 but run with the following command lines:
# THEANO_FLAGS=mode=FAST_RUN,device=gpu time python program_name.py
# THEANO_FLAGS=mode=FAST_RUN,device=cpu time python program_name.py
# for GPU and CPU respectively
# typical time: 20 sec (CPU), 10 sec (GPU)
import numpy
import theano
import theano.tensor as T
from theano import sandbox, Out
theano.config.floatX = 'float32'
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX),
rng.randint(size=N, low=0, high=2).astype(theano.config.floatX))
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1) # Cross-entropy
cost = T.cast(xent.mean(), 'float32') + \
0.01 * (w ** 2).sum() # The cost to optimize
gw, gb = T.grad(cost, [w, b])
"""
# Compile expressions to functions
train = theano.function(
inputs=[x, y],
outputs=[Out(theano.sandbox.cuda.basic_ops.gpu_from_host(T.cast(prediction, 'float32')),borrow=True), Out(theano.sandbox.cuda.basic_ops.gpu_from_host(T.cast(xent, 'float32')), borrow=True)],
updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
name="train")
predict = theano.function(inputs=[x], outputs=Out(theano.sandbox.cuda.basic_ops.gpu_from_host(T.cast(prediction, 'float32')), borrow=True),
name="predict")
"""
# Compile expressions to functions
train = theano.function(
inputs=[x, y],
outputs=[prediction, xent],
updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
name="train")
predict = theano.function(inputs=[x], outputs=prediction,
name="predict")
if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
train.maker.fgraph.toposort()]):
print 'Used the cpu'
elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in
train.maker.fgraph.toposort()]):
print 'Used the gpu'
else:
print 'ERROR, not able to tell if theano used the cpu or the gpu'
print train.maker.fgraph.toposort()
for i in range(training_steps):
pred, err = train(D[0], D[1])
#print "Final model:"
#print w.get_value(), b.get_value()
print "target values for D"
print D[1]
print "prediction on D"
print predict(D[0])
"""
# 2. Profiling
#
# same code as above but run with the following command lines:
# THEANO_FLAGS=mode=PROFILE_MODE,device=gpu python program_name.py
# THEANO_FLAGS=mode=PROFILE_MODE,device=cpu python program_name.py
# for GPU and CPU
# 2.1 Profiling output for CPU computations
$ THEANO_FLAGS=mode=PROFILE_MODE,device=cpu python program_name.py
Used the cpu
target values for D
prediction on D
ProfileMode.print_summary()
---------------------------
Time since import 12.586s
Theano compile time: 0.000s (0.0% since import)
Optimization time: 0.000s
Linker time: 0.000s
Theano fct call 5.147s (40.9% since import)
Theano Op time 3.595s 28.6%(since import) 69.8%(of fct call)
Theano function overhead in ProfileMode 1.552s 12.3%(since import) 30.2%(of fct call)
20002 Theano fct call, 0.000s per call
Rest of the time since import 7.440s 59.1%
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
49.9% 2.567s 2.57e-04s 10000 train
0.0% 0.000s 1.24e-04s 1 predict
0.0% 0.000s 1.26e-04s 1 predict
50.1% 2.579s 2.58e-04s 10000 train
Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %> <self seconds> <cumulative seconds> <time per call> [*] <nb_call> <nb_op> <nb_apply> <Op name>
59.3% 59.3% 2.133s 2.133s 5.33e-05s * 40002 1 6 <class 'theano.tensor.blas_c.CGemv'>
34.4% 93.8% 1.238s 3.371s 6.19e-06s * 200002 11 22 <class 'theano.tensor.elemwise.Elemwise'>
2.8% 96.6% 0.100s 3.471s 2.51e-06s * 40002 1 6 <class 'theano.tensor.basic.Alloc'>
2.1% 98.7% 0.075s 3.546s 1.26e-06s * 60002 2 8 <class 'theano.tensor.elemwise.DimShuffle'>
0.7% 99.3% 0.024s 3.571s 6.11e-07s * 40002 1 6 <class 'theano.tensor.opt.Shape_i'>
0.7% 100.0% 0.024s 3.595s 1.18e-06s * 20000 1 2 <class 'theano.tensor.elemwise.Sum'>
... (remaining 0 single Op account for 0.00%(0.00s) of the runtime)
(*) Op is running a c implementation
Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %> <self seconds> <cumulative seconds> <time per call> [*] <nb_call> <nb apply> <Op name>
59.3% 59.3% 2.133s 2.133s 5.33e-05s * 40002 6 CGemv{inplace}
18.1% 77.4% 0.650s 2.783s 3.25e-05s * 20000 2 Elemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_softplus(i1), mul(i2, i3))]}(i0, i1, i2, scalar_softplus(i3))]}}
6.4% 83.9% 0.231s 3.014s 1.16e-05s * 20000 2 Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)]
4.0% 87.8% 0.142s 3.157s 7.11e-06s * 20000 2 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)]
2.8% 90.6% 0.100s 3.257s 2.51e-06s * 40002 6 Alloc
1.4% 92.1% 0.052s 3.309s 1.30e-06s * 40002 6 InplaceDimShuffle{x}
1.1% 93.1% 0.038s 3.347s 1.92e-06s * 20000 2 Elemwise{Cast{float32}}
1.1% 94.2% 0.038s 3.386s 1.91e-06s * 20000 2 Elemwise{sub,no_inplace}
1.0% 95.2% 0.036s 3.421s 1.79e-06s * 20000 2 Elemwise{gt,no_inplace}
0.8% 96.0% 0.029s 3.450s 1.44e-06s * 20000 2 Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)]
0.8% 96.8% 0.028s 3.479s 1.42e-06s * 20000 2 Elemwise{neg,no_inplace}
0.7% 97.5% 0.024s 3.503s 6.11e-07s * 40002 6 Shape_i{0}
0.7% 98.1% 0.024s 3.527s 1.18e-06s * 20000 2 Sum
0.6% 98.8% 0.023s 3.550s 1.16e-06s * 20000 2 InplaceDimShuffle{1,0}
0.6% 99.4% 0.023s 3.573s 1.15e-06s * 20000 2 Elemwise{Composite{[sub(i0, mul(i1, i2))]}}[(0, 0)]
0.6% 100.0% 0.022s 3.595s 1.08e-06s * 20000 2 Elemwise{inv,no_inplace}
0.0% 100.0% 0.000s 3.595s 1.19e-05s * 2 2 Elemwise{Composite{[Composite{[Composite{[Composite{[GT(scalar_sigmoid(i0), i1)]}(neg(i0), i1)]}(sub(i0, i1), i2)]}(neg(i0), i1, i2)]}}
... (remaining 0 Op account for 0.00%(0.00s) of the runtime)
(*) Op is running a c implementation
Apply-wise summary:
<% of local_time spent at this position> <cumulative %%> <apply time> <cumulative seconds> <time per call> [*] <nb_call> <Apply position> <Apply Op name>
14.9% 14.9% 0.536s 0.536s 5.36e-05s * 10000 7 CGemv{inplace}(Alloc.0, TensorConstant{1.0}, x, w, TensorConstant{1.0})
14.9% 29.8% 0.534s 1.070s 5.34e-05s * 10000 18 CGemv{inplace}(w, TensorConstant{-0.00999999977648}, x.T, Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)].0, TensorConstant{0.999800026417})
14.8% 44.6% 0.532s 1.602s 5.32e-05s * 10000 7 CGemv{inplace}(Alloc.0, TensorConstant{1.0}, x, w, TensorConstant{1.0})
14.7% 59.3% 0.530s 2.132s 5.30e-05s * 10000 18 CGemv{inplace}(w, TensorConstant{-0.00999999977648}, x.T, Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)].0, TensorConstant{0.999800026417})
9.1% 68.4% 0.327s 2.460s 3.27e-05s * 10000 13 Elemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_softplus(i1), mul(i2, i3))]}(i0, i1, i2, scalar_softplus(i3))]}}(y, Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
9.0% 77.4% 0.323s 2.783s 3.23e-05s * 10000 13 Elemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_softplus(i1), mul(i2, i3))]}(i0, i1, i2, scalar_softplus(i3))]}}(y, Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
3.2% 80.6% 0.116s 2.899s 1.16e-05s * 10000 16 Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{Cast{float32}}.0)
3.2% 83.9% 0.116s 3.014s 1.16e-05s * 10000 16 Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{Cast{float32}}.0)
2.0% 85.8% 0.071s 3.086s 7.12e-06s * 10000 14 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwise{neg,no_inplace}.0)
2.0% 87.8% 0.071s 3.156s 7.09e-06s * 10000 14 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwise{neg,no_inplace}.0)
0.9% 88.8% 0.034s 3.190s 3.38e-06s * 10000 12 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.9% 89.7% 0.034s 3.224s 3.37e-06s * 10000 12 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.5% 90.2% 0.019s 3.243s 1.93e-06s * 10000 8 Elemwise{Cast{float32}}(InplaceDimShuffle{x}.0)
0.5% 90.8% 0.019s 3.262s 1.92e-06s * 10000 4 Elemwise{sub,no_inplace}(TensorConstant{(1,) of 1.0}, y)
0.5% 91.3% 0.019s 3.282s 1.90e-06s * 10000 4 Elemwise{sub,no_inplace}(TensorConstant{(1,) of 1.0}, y)
... (remaining 35 Apply instances account for 8.71%(0.31s) of the runtime)
(*) Op is running a c implementation
Profile of Theano functions memory:
(This check only the output of each apply node. It don't check the temporary memory used by the op in the apply node.)
We skipped 4 theano function(s). Each of them used less then 1024B(theano flags ProfileMode.min_memory_size) of total intermediate memory size
Here are tips to potentially make your code run faster
(if you think of new ones, suggest them on the mailing list).
Test them first, as they are not guaranteed to always provide a speedup.
Sorry, no tip for today.
# 2.2 Profiling output for GPU computations
$ THEANO_FLAGS=mode=PROFILE_MODE,device=gpu python program_name.py
Using gpu device 0: GeForce GTX 580
Used the gpu
target values for D
prediction on D
ProfileMode.print_summary()
---------------------------
Time since import 25.682s
Theano compile time: 0.000s (0.0% since import)
Optimization time: 0.000s
Linker time: 0.000s
Theano fct call 17.052s (66.4% since import)
Theano Op time 14.548s 56.6%(since import) 85.3%(of fct call)
Theano function overhead in ProfileMode 2.505s 9.8%(since import) 14.7%(of fct call)
20002 Theano fct call, 0.001s per call
Rest of the time since import 8.630s 33.6%
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
50.0% 8.526s 8.53e-04s 10000 train
0.0% 0.001s 1.09e-03s 1 predict
50.0% 8.524s 8.52e-04s 10000 train
0.0% 0.001s 1.10e-03s 1 predict
Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %> <self seconds> <cumulative seconds> <time per call> [*] <nb_call> <nb_op> <nb_apply> <Op name>
54.8% 54.8% 7.968s 7.968s 1.33e-04s 60002 1 8 <class 'theano.sandbox.cuda.basic_ops.GpuFromHost'>
16.2% 71.0% 2.358s 10.325s 1.47e-05s * 160002 9 18 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
12.3% 83.3% 1.795s 12.120s 4.49e-05s * 40002 1 6 <class 'theano.sandbox.cuda.blas.GpuGemv'>
7.0% 90.4% 1.024s 13.144s 2.56e-05s 40002 1 6 <class 'theano.sandbox.cuda.basic_ops.HostFromGpu'>
5.0% 95.4% 0.728s 13.872s 1.82e-05s * 40002 1 6 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
2.1% 97.4% 0.300s 14.171s 1.50e-05s * 20000 1 2 <class 'theano.sandbox.cuda.basic_ops.GpuSum'>
1.3% 98.7% 0.189s 14.360s 3.15e-06s * 60002 3 8 <class 'theano.sandbox.cuda.basic_ops.GpuDimShuffle'>
0.6% 99.4% 0.094s 14.454s 2.35e-06s * 40002 2 6 <class 'theano.tensor.elemwise.Elemwise'>
0.3% 99.7% 0.048s 14.503s 1.21e-06s * 40002 1 6 <class 'theano.tensor.opt.Shape_i'>
0.3% 100.0% 0.045s 14.548s 2.25e-06s * 20000 1 2 <class 'theano.tensor.elemwise.DimShuffle'>
... (remaining 0 single Op account for 0.00%(0.00s) of the runtime)
(*) Op is running a c implementation
Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %> <self seconds> <cumulative seconds> <time per call> [*] <nb_call> <nb apply> <Op name>
54.8% 54.8% 7.968s 7.968s 1.33e-04s 60002 8 GpuFromHost
12.3% 67.1% 1.795s 9.763s 4.49e-05s * 40002 6 GpuGemv{inplace}
7.0% 74.1% 1.024s 10.786s 2.56e-05s 40002 6 HostFromGpu
5.0% 79.1% 0.728s 11.514s 1.82e-05s * 40002 6 GpuAlloc
2.3% 81.4% 0.334s 11.848s 1.67e-05s * 20000 2 GpuElemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)]
2.2% 83.6% 0.319s 12.167s 1.59e-05s * 20000 2 GpuElemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_softplus(i1), mul(i2, i3))]}(i0, i1, i2, scalar_softplus(i3))]},no_inplace}
2.1% 85.7% 0.301s 12.468s 1.50e-05s * 20000 2 GpuElemwise{neg,no_inplace}
2.1% 87.8% 0.300s 12.768s 1.50e-05s * 20000 2 GpuSum{1}
2.0% 89.8% 0.292s 13.060s 1.46e-05s * 20000 2 GpuElemwise{inv,no_inplace}
1.9% 91.7% 0.283s 13.343s 1.42e-05s * 20000 2 GpuElemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)]
1.9% 93.7% 0.281s 13.625s 1.41e-05s * 20000 2 GpuElemwise{sub,no_inplace}
1.9% 95.5% 0.273s 13.898s 1.37e-05s * 20000 2 GpuElemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)]
1.9% 97.4% 0.273s 14.171s 1.37e-05s * 20000 2 GpuElemwise{Composite{[sub(i0, mul(i1, i2))]}}[(0, 0)]
1.0% 98.4% 0.141s 14.313s 7.06e-06s * 20002 4 GpuDimShuffle{x}
0.4% 98.8% 0.057s 14.370s 2.87e-06s * 20002 4 Elemwise{gt,no_inplace}
0.3% 99.1% 0.048s 14.418s 1.21e-06s * 40002 6 Shape_i{0}
0.3% 99.4% 0.045s 14.463s 2.25e-06s * 20000 2 InplaceDimShuffle{x}
0.3% 99.7% 0.037s 14.500s 1.83e-06s * 20000 2 Elemwise{Cast{float32}}
0.2% 99.8% 0.025s 14.525s 1.24e-06s * 20000 2 GpuDimShuffle{0}
0.2% 100.0% 0.023s 14.548s 1.14e-06s * 20000 2 GpuDimShuffle{1,0}
... (remaining 1 Op account for 0.00%(0.00s) of the runtime)
(*) Op is running a c implementation
Apply-wise summary:
<% of local_time spent at this position> <cumulative %%> <apply time> <cumulative seconds> <time per call> [*] <nb_call> <Apply position> <Apply Op name>
24.0% 24.0% 3.493s 3.493s 3.49e-04s 10000 1 GpuFromHost(x)
23.9% 47.9% 3.479s 6.972s 3.48e-04s 10000 1 GpuFromHost(x)
4.3% 52.3% 0.629s 7.602s 6.29e-05s * 10000 24 GpuGemv{inplace}(w, TensorConstant{-0.00999999977648}, GpuDimShuffle{1,0}.0, GpuElemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)].0, TensorConstant{0.999800026417})
4.3% 56.6% 0.629s 8.231s 6.29e-05s * 10000 24 GpuGemv{inplace}(w, TensorConstant{-0.00999999977648}, GpuDimShuffle{1,0}.0, GpuElemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, neg(i1), true_div(i2, i3))]}(i0, mul(i1, i2, i3), i4, i5)]}(i0, i1, i2, exp(i3), i4, i5)]}}[(0, 0)].0, TensorConstant{0.999800026417})
1.8% 58.4% 0.269s 8.499s 2.69e-05s * 10000 9 GpuGemv{inplace}(GpuAlloc.0, TensorConstant{1.0}, GpuFromHost.0, w, TensorConstant{1.0})
1.8% 60.3% 0.268s 8.767s 2.68e-05s * 10000 9 GpuGemv{inplace}(GpuAlloc.0, TensorConstant{1.0}, GpuFromHost.0, w, TensorConstant{1.0})
1.8% 62.1% 0.266s 9.033s 2.66e-05s 10000 18 HostFromGpu(GpuElemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_softplus(i1), mul(i2, i3))]}(i0, i1, i2, scalar_softplus(i3))]},no_inplace}.0)
1.8% 63.9% 0.262s 9.296s 2.62e-05s 10000 18 HostFromGpu(GpuElemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_softplus(i1), mul(i2, i3))]}(i0, i1, i2, scalar_softplus(i3))]},no_inplace}.0)
1.8% 65.7% 0.260s 9.555s 2.60e-05s 10000 3 GpuFromHost(y)
1.8% 67.5% 0.258s 9.813s 2.58e-05s 10000 3 GpuFromHost(y)
1.7% 69.2% 0.248s 10.061s 2.48e-05s 10000 20 HostFromGpu(GpuElemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)].0)
1.7% 70.9% 0.247s 10.309s 2.47e-05s 10000 20 HostFromGpu(GpuElemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)].0)
1.6% 72.5% 0.238s 10.547s 2.38e-05s 10000 12 GpuFromHost(Elemwise{Cast{float32}}.0)
1.6% 74.1% 0.237s 10.785s 2.37e-05s 10000 12 GpuFromHost(Elemwise{Cast{float32}}.0)
1.3% 75.4% 0.185s 10.969s 1.85e-05s * 10000 6 GpuAlloc(CudaNdarrayConstant{[ 1.58212732e-09]}, Shape_i{0}.0)
... (remaining 53 Apply instances account for 24.60%(3.58s) of the runtime)
(*) Op is running a c implementation
Some info useful for gpu:
Spent 1.211s(8.324%) in cpu Op, 13.337s(91.676%) in gpu Op and 0.000s(0.000%) transfert Op
Theano function input that are float64
<fct name> <input name> <input type> <str input>
List of apply that don't have float64 as input but have float64 in outputs
(Useful to know if we forgot some cast when using floatX=float32 or gpu code)
<Apply> <Apply position> <fct name> <inputs type> <outputs type>
Profile of Theano functions memory:
(This check only the output of each apply node. It don't check the temporary memory used by the op in the apply node.)
We skipped 4 theano function(s). Each of them used less then 1024B(theano flags ProfileMode.min_memory_size) of total intermediate memory size
Here are tips to potentially make your code run faster
(if you think of new ones, suggest them on the mailing list).
Test them first, as they are not guaranteed to always provide a speedup.
Sorry, no tip for today.
# 3. Conclusions
Facts:
Examine and compare the 'Single Op-wise' summaries for the CPU and the GPU. The GPU ops 'GpuFromHost' (and 'HostFromGpu') by themselves
consume a large amount of extra time. Furthermore, notice that each GPU op consumes more time than its CPU counterpart.
An additional experiment would also confirm that adding an 'Out' instance in the GPU version would only bring about a minor
improvement in this situation.
Tentative conclusion:
The large number of training steps (10000) generates disproportionate GPU overhead costs.
Tentative solution:
Include the training steps inside the definition of the Theano function.
Implement this solution and put it to the test.
"""
def scan(fn,
* ``initial`` -- Theano variable that represents the initial
state of a given output. In case the output is not computed
recursively (think of a map) and does not require a initial
state this field can be skiped. Given that only the previous
time step of the output is used by ``fn`` the initial state
should have the same shape as the output. If multiple time
taps are used, the initial state should have one extra
recursively (think of a map) and does not require an initial
state this field can be skipped. Given that (only) the previous
time step of the output is used by ``fn``, the initial state
**should have the same shape** as the output and **should not
involve a downcast** of the data type of the output. If multiple
time taps are used, the initial state should have one extra
dimension that should cover all the possible taps. For example
if we use ``-5``, ``-2`` and ``-1`` as past taps, at step 0,
``fn`` will require (by an abuse of notation) ``output[-5]``,
......
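The taps rule in the docstring hunk above can be illustrated with a pure-Python stand-in for scan (`scan_sketch` is a hypothetical helper, not Theano's API): with past taps -5, -2 and -1, the initial state must stack the 5 most recent past values, i.e. carry one extra leading dimension of length 5.

```python
import numpy as np

def scan_sketch(fn, taps, outputs_info, n_steps):
    # The initial state must cover the deepest past tap, hence
    # its extra leading dimension of length -min(taps).
    depth = -min(taps)
    assert len(outputs_info) == depth
    history = list(outputs_info)
    for _ in range(n_steps):
        # For taps (-5, -2, -1), fn receives
        # output[t-5], output[t-2] and output[t-1].
        history.append(fn(*(history[tap] for tap in taps)))
    return np.asarray(history[depth:])

# Recurrence y[t] = y[t-5] + y[t-2] + y[t-1] with a length-5 initial state.
out = scan_sketch(lambda a, b, c: a + b + c,
                  taps=(-5, -2, -1),
                  outputs_info=[0, 0, 0, 0, 1],
                  n_steps=4)
print(out)  # [1 2 3 5]
```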
@@ -797,6 +797,7 @@ class T_using_gpu(unittest.TestCase):
rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], T.exp(x))
# print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
r = f()
@@ -813,7 +814,6 @@ class T_using_gpu(unittest.TestCase):
assert numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()])
def test_using_gpu_2(self):
if theano.config.device.find('gpu') > -1:
@@ -829,6 +829,7 @@ class T_using_gpu(unittest.TestCase):
rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], sandbox.cuda.basic_ops.gpu_from_host(T.exp(x)))
# print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
r = f()
@@ -844,9 +845,6 @@ class T_using_gpu(unittest.TestCase):
assert not numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()])
def test_using_gpu_3(self):
if theano.config.device.find('gpu') >-1:
@@ -864,6 +862,7 @@ class T_using_gpu(unittest.TestCase):
f = function([],
Out(sandbox.cuda.basic_ops.gpu_from_host(T.exp(x)),
borrow=True))
# print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
r = f()
......