Merge pull request #1790 from nouiz/mixed

Mixed

Merge pull request #1790 from nouiz/mixed
12ec7339 · abergeron · 66d7f5c7 · 0860c441 · 12ec7339 · 12ec7339
--- a/doc/install_ubuntu.txt
+++ b/doc/install_ubuntu.txt
@@ -3,8 +3,8 @@
 Easy Installation of an optimized Theano on Ubuntu
 ==================================================
-These instruction was done for Ubuntu 11.04, 11.10 and 12.04. You can
+These instructions were written for Ubuntu 11.04, 11.10, 12.04, 12.10, 13.04
-probably do something similar on older computer.
+and 13.10. You can probably do something similar on older computers.
 .. note::
@@ -49,7 +49,7 @@ probably do something similar on older computer.
 Installation steps
 ~~~~~~~~~~~~~~~~~~
-Ubuntu 11.10/12.04/12.10/13.04:
+Ubuntu 11.10/12.04/12.10/13.04/13.10:
 1) ``sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git``
 2) ``sudo pip install Theano``
@@ -236,15 +236,4 @@ Test GPU configuration
   Ubuntu 12.10: default gcc version 4.7.2. gcc 4.4.7, 4.5.4 and 4.6.3 availables.
+   Ubuntu 13.10: default gcc version 4.8.1. gcc 4.4.7, 4.6.4 and 4.7.3 available.
--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -507,13 +507,22 @@ class ProfileStats(object):
        print >> file, header_str
-        atimes = [(
+        topos = {}  # Only do the topo once per fct.
+        atimes = []
+        for a, t in self.apply_time.items():
+            if a.fgraph not in topos:
+                topo = a.fgraph.toposort()
+                topos[a.fgraph] = topo
+            else:
+                topo = topos[a.fgraph]
+            atimes.append((
                t * 100 / local_time,
                t,
                a,
-                a.fgraph.toposort().index(a),
+                topo.index(a),
-                self.apply_callcount[a])
+                self.apply_callcount[a]))
-            for a, t in self.apply_time.items()]
+        del topos
        atimes.sort()
        atimes.reverse()
        tot = 0

--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -203,6 +203,7 @@ if __name__ == "__main__":
        cuda version      5.5    5.0    4.2    4.1    4.0    3.2    3.0   # note
        gpu
+        K6000/NOECC       0.06s
        K20m/ECC                 0.07s
        K20/NOECC                0.07s
        M2090             0.19s

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -3,12 +3,12 @@ import os
 import numpy
 import theano
-from theano import Op, Type, Apply, Variable, Constant
+from theano import Op, Apply
 from theano import tensor, scalar, config
 from theano.scalar import Scalar
 from theano.tensor.basic import Alloc
-from theano.gof.python25 import all, any
+from theano.gof.python25 import any
 from theano.gof.utils import MethodNotDefined
 from theano.compat import PY3
@@ -257,7 +257,7 @@ class GpuFromHost(Op):
    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isintance(ev, GpuArrayType):
+        if isinstance(ev, GpuArrayType):
            return [host_from_gpu(ev)]
        else:
            return ev
@@ -317,7 +317,7 @@ class GpuFromCuda(Op):
    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isintance(ev, GpuArrayType):
+        if isinstance(ev, GpuArrayType):
            return [cuda_from_gpu(ev)]
        else:
            return ev
@@ -651,6 +651,36 @@ class GpuAlloc(HideC, Alloc):
    def c_code_cache_version(self):
        return (2,)
+    def do_constant_folding(self, node):
+        for client in node.outputs[0].clients:
+            if client[0] == 'output':
+                # If the output is a constant, it will have to be deepcopied
+                # each time the function is called.  So we do not fold.
+                return False
+            elif (#The following ops work inplace on their input at index 0.
+                  client[1] == 0 and
+                  isinstance(client[0].op, (
+                    #Ops that will work inplace on the Alloc. So if they
+                    #get constant_folded, they would copy the
+                    #constant and this is less efficient.
+                    #Not doing the constant folding could also lower
+                    #the peak memory usage, as the "constant" won't
+                    #always exist.
+                      #theano.tensor.subtensor.AdvancedIncSubtensor,
+                      theano.sandbox.gpuarray.subtensor.GpuIncSubtensor,
+                      #theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1,
+                      theano.sandbox.gpuarray.blas.GpuGemm,
+                      theano.sandbox.gpuarray.blas.GpuGemv,
+                      #theano.sandbox.gpuarray.blas.GpuGer, Not Yet implemented
+                  ))):
+                return False
+            #If the client is a transfer, we don't want to fold. We
+            #let the moving opt finish before deciding what to do.
+            elif isinstance(client[0].op, HostFromGpu):
+                return False
+        return True
 gpu_alloc = GpuAlloc()

--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -32,11 +32,13 @@ if not theano.sandbox.gpuarray.pygpu_activated:
 from theano.sandbox.gpuarray.type import (GpuArrayType,
                                          gpuarray_shared_constructor)
-from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
+from theano.sandbox.gpuarray.basic_ops import (
-                                               gpu_alloc, gpu_from_cuda,
+    host_from_gpu, gpu_from_host,
-                                               cuda_from_gpu, HostFromGpu,
+    gpu_alloc, GpuAlloc,
-                                               GpuFromHost, GpuReshape,
+    gpu_from_cuda,
-                                               GpuEye)
+    cuda_from_gpu, HostFromGpu,
+    GpuFromHost, GpuReshape,
+    GpuEye)
 from theano.tests import unittest_tools as utt
 utt.seed_rng()
@@ -290,6 +292,13 @@ GpuAllocTester = makeTester(
 )
+class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
+    dtype = "float32"
+    mode = mode_with_gpu
+    shared = staticmethod(gpuarray_shared_constructor)
+    allocs = [GpuAlloc, GpuAlloc, T.Alloc]
 def test_shape():
    x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
    v = gpuarray.zeros((3, 4, 5), dtype='float32')

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -1369,6 +1369,9 @@ class MaxAndArgmax(Op):
            """ % locals()
        ret = """
        int axis;
+        Py_CLEAR(%(max)s);
+        Py_CLEAR(%(argmax)s);//todo pass them as out parameter.
        %(axis_code)s
        %(max)s = (PyArrayObject*)PyArray_Max(%(x)s, axis, NULL);
        if(%(max)s == NULL){
@@ -1407,7 +1410,7 @@ class MaxAndArgmax(Op):
        return ret % locals()
    def c_code_cache_version(self):
-        return (2,)
+        return (3,)
    def infer_shape(self, node, shapes):
        ishape, axis_shape = shapes

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -4049,8 +4049,8 @@ def constant_folding(node):
    return rval
 register_canonicalize(constant_folding, 'fast_compile')
-register_stabilize(constant_folding)
+register_stabilize(constant_folding, 'fast_compile')
-register_specialize(constant_folding)
+register_specialize(constant_folding, 'fast_compile')
 def _is_1(expr):