Moved the "extending Theano" material before the PyCUDA section and put the PyCUDA op example in the PyCUDA section.

264ab591 · Frederic Bastien · 392c2a74 · 264ab591 · 264ab591 · 264ab591
--- a/doc/hpcs2011_tutorial/extending_theano.txt
+++ b/doc/hpcs2011_tutorial/extending_theano.txt
@@ -75,7 +75,7 @@ Test it!
 >>> print out
-Exercises 7
+Exercises 8
 -----------
 - Run the code in the file double_op.py.
@@ -86,64 +86,3 @@ Exercises 7
-Theano + PyCUDA
---------------
-.. code-block:: python
-    import numpy, theano
-    import theano.misc.pycuda_init
-    from pycuda.compiler import SourceModule
-    import theano.sandbox.cuda as cuda
-    class PyCUDADoubleOp(theano.Op):
-        def __eq__(self, other):
-            return type(self) == type(other)
-        def __hash__(self):
-            return hash(type(self))
-        def __str__(self):
-            return self.__class__.__name__
-        def make_node(self, inp):
-            inp = cuda.basic_ops.gpu_contiguous(
-               cuda.basic_ops.as_cuda_ndarray_variable(inp))
-            assert inp.dtype == "float32"
-            return theano.Apply(self, [inp], [inp.type()])
-        def make_thunk(self, node, storage_map, _, _2):
-            mod = SourceModule("""
-        __global__ void my_fct(float * i0, float * o0, int size) {
-        int i = blockIdx.x*blockDim.x + threadIdx.x;
-        if(i<size){
-            o0[i] = i0[i]*2;
-        }
-      }""")
-            pycuda_fct = mod.get_function("my_fct")
-            inputs = [ storage_map[v] for v in node.inputs]
-            outputs = [ storage_map[v] for v in node.outputs]
-            def thunk():
-                z = outputs[0]
-                if z[0] is None or z[0].shape!=inputs[0][0].shape:
-                    z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
-                grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
-                pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
-                           block=(512,1,1), grid=grid)
-            return thunk
-Test it!
->>> x = theano.tensor.fmatrix()
->>> f = theano.function([x], PyCUDADoubleOp()(x))
->>> xv=numpy.ones((4,5), dtype="float32")
->>> assert numpy.allclose(f(xv), xv*2)
->>> print numpy.asarray(f(xv))
-Exercises 8
-----------
- Run the above example
- Modify and execute the example to multiple two matrix: x * y
- Modify and execute the example to return 2 outputs: x + y and x - y
-  - Our current elemwise fusion generate computation with only 1 outputs
- Modify and execute the example to support stride? (Don't force the input to be c contiguous)
--- a/doc/hpcs2011_tutorial/index.txt
+++ b/doc/hpcs2011_tutorial/index.txt
@@ -10,7 +10,7 @@ GPU programming made Easy
    introduction
    theano
    advanced_theano
-    pyCUDA
    extending_theano
+    pyCUDA
    gpundarray
--- a/doc/hpcs2011_tutorial/pyCUDA.txt
+++ b/doc/hpcs2011_tutorial/pyCUDA.txt
@@ -74,3 +74,65 @@ Exercice 6
 - Run the above example
 - Modify and execute it to work for a matrix of 20 x 10
+Theano + PyCUDA
+---------------
+.. code-block:: python
+    import numpy, theano
+    import theano.misc.pycuda_init
+    from pycuda.compiler import SourceModule
+    import theano.sandbox.cuda as cuda
+    class PyCUDADoubleOp(theano.Op):
+        def __eq__(self, other):
+            return type(self) == type(other)
+        def __hash__(self):
+            return hash(type(self))
+        def __str__(self):
+            return self.__class__.__name__
+        def make_node(self, inp):
+            inp = cuda.basic_ops.gpu_contiguous(
+               cuda.basic_ops.as_cuda_ndarray_variable(inp))
+            assert inp.dtype == "float32"
+            return theano.Apply(self, [inp], [inp.type()])
+        def make_thunk(self, node, storage_map, _, _2):
+            mod = SourceModule("""
+        __global__ void my_fct(float * i0, float * o0, int size) {
+        int i = blockIdx.x*blockDim.x + threadIdx.x;
+        if(i<size){
+            o0[i] = i0[i]*2;
+        }
+      }""")
+            pycuda_fct = mod.get_function("my_fct")
+            inputs = [ storage_map[v] for v in node.inputs]
+            outputs = [ storage_map[v] for v in node.outputs]
+            def thunk():
+                z = outputs[0]
+                if z[0] is None or z[0].shape!=inputs[0][0].shape:
+                    z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
+                grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
+                pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
+                           block=(512,1,1), grid=grid)
+            return thunk
+Test it!
+>>> x = theano.tensor.fmatrix()
+>>> f = theano.function([x], PyCUDADoubleOp()(x))
+>>> xv=numpy.ones((4,5), dtype="float32")
+>>> assert numpy.allclose(f(xv), xv*2)
+>>> print numpy.asarray(f(xv))
+Exercises 7
+-----------
+- Run the above example
+- Modify and execute the example to multiply two matrices: x * y
+- Modify and execute the example to return 2 outputs: x + y and x - y
+  - Our current elemwise fusion generates computations with only 1 output
+- Modify and execute the example to support strides (don't force the input to be C contiguous)