提交 12ec7339 authored 作者: abergeron's avatar abergeron

Merge pull request #1790 from nouiz/mixed

Mixed
......@@ -3,8 +3,8 @@
Easy Installation of an optimized Theano on Ubuntu
==================================================
These instructions were written for Ubuntu 11.04, 11.10 and 12.04. You can
probably do something similar on older computers.
These instructions were written for Ubuntu 11.04, 11.10, 12.04, 12.10, 13.04
and 13.10. You can probably do something similar on older computers.
.. note::
......@@ -49,7 +49,7 @@ probably do something similar on older computer.
Installation steps
~~~~~~~~~~~~~~~~~~
Ubuntu 11.10/12.04/12.10/13.04:
Ubuntu 11.10/12.04/12.10/13.04/13.10:
1) ``sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git``
2) ``sudo pip install Theano``
......@@ -236,15 +236,4 @@ Test GPU configuration
Ubuntu 12.10: default gcc version 4.7.2. gcc 4.4.7, 4.5.4 and 4.6.3 available.
Ubuntu 13.10: default gcc version 4.8.1. gcc 4.4.7, 4.6.4 and 4.7.3 available.
......@@ -507,13 +507,22 @@ class ProfileStats(object):
print >> file, header_str
atimes = [(
topos = {} # Only do the topo once per fct.
atimes = []
for a, t in self.apply_time.items():
if a.fgraph not in topos:
topo = a.fgraph.toposort()
topos[a.fgraph] = topo
else:
topo = topos[a.fgraph]
atimes.append((
t * 100 / local_time,
t,
a,
a.fgraph.toposort().index(a),
self.apply_callcount[a])
for a, t in self.apply_time.items()]
topo.index(a),
self.apply_callcount[a]))
del topos
atimes.sort()
atimes.reverse()
tot = 0
......
......@@ -203,6 +203,7 @@ if __name__ == "__main__":
cuda version 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s
K20m/ECC 0.07s
K20/NOECC 0.07s
M2090 0.19s
......
......@@ -3,12 +3,12 @@ import os
import numpy
import theano
from theano import Op, Type, Apply, Variable, Constant
from theano import Op, Apply
from theano import tensor, scalar, config
from theano.scalar import Scalar
from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any
from theano.gof.python25 import any
from theano.gof.utils import MethodNotDefined
from theano.compat import PY3
......@@ -257,7 +257,7 @@ class GpuFromHost(Op):
def R_op(self, inputs, eval_points):
    """R-operator for the host->GPU transfer.

    The scraped diff left both the old line (``isintance`` — a
    NameError at runtime) and its fixed replacement in place; only the
    corrected ``isinstance`` check is kept here.

    If the single evaluation point is GPU-typed, transfer it back to
    the host; otherwise pass it through unchanged.
    """
    ev, = eval_points
    # NOTE(review): this tests the *variable* against the GpuArrayType
    # class (not ev.type) — presumably intentional upstream; confirm.
    if isinstance(ev, GpuArrayType):
        return [host_from_gpu(ev)]
    else:
        # NOTE(review): returned bare, not wrapped in a list like the
        # other branch — mirrors the original behavior.
        return ev
......@@ -317,7 +317,7 @@ class GpuFromCuda(Op):
def R_op(self, inputs, eval_points):
    """R-operator for the GPU->CUDA transfer.

    The scraped diff left both the old line (``isintance`` — a
    NameError at runtime) and its fixed replacement in place; only the
    corrected ``isinstance`` check is kept here.

    If the single evaluation point is GPU-typed, convert it with
    ``cuda_from_gpu``; otherwise pass it through unchanged.
    """
    ev, = eval_points
    # NOTE(review): this tests the *variable* against the GpuArrayType
    # class (not ev.type) — presumably intentional upstream; confirm.
    if isinstance(ev, GpuArrayType):
        return [cuda_from_gpu(ev)]
    else:
        # NOTE(review): returned bare, not wrapped in a list like the
        # other branch — mirrors the original behavior.
        return ev
......@@ -651,6 +651,36 @@ class GpuAlloc(HideC, Alloc):
def c_code_cache_version(self):
    """Version tag used to invalidate previously compiled C code."""
    version = (2,)
    return version
def do_constant_folding(self, node):
    """Decide whether this GpuAlloc node may be constant-folded.

    Folding is refused when the allocated value feeds (a) the function
    output (it would be deep-copied on every call), (b) an op that
    works inplace on its input 0 (folding would force a copy of the
    constant each call — less efficient, and skipping the fold can also
    lower peak memory since the "constant" need not exist at all
    times), or (c) a host transfer (let the moving optimizations finish
    before deciding).  Otherwise folding is allowed.
    """
    # Ops that overwrite their input index 0 inplace.
    inplace_on_input0 = (
        # theano.tensor.subtensor.AdvancedIncSubtensor,
        theano.sandbox.gpuarray.subtensor.GpuIncSubtensor,
        # theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1,
        theano.sandbox.gpuarray.blas.GpuGemm,
        theano.sandbox.gpuarray.blas.GpuGemv,
        # theano.sandbox.gpuarray.blas.GpuGer,  # not yet implemented
    )
    for consumer, input_index in node.outputs[0].clients:
        if consumer == 'output':
            # A folded constant output is deep-copied every call.
            return False
        if input_index == 0 and isinstance(consumer.op, inplace_on_input0):
            return False
        if isinstance(consumer.op, HostFromGpu):
            # Transfer client: defer to the moving optimizations.
            return False
    return True
gpu_alloc = GpuAlloc()
......
......@@ -32,8 +32,10 @@ if not theano.sandbox.gpuarray.pygpu_activated:
from theano.sandbox.gpuarray.type import (GpuArrayType,
gpuarray_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda,
from theano.sandbox.gpuarray.basic_ops import (
host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape,
GpuEye)
......@@ -290,6 +292,13 @@ GpuAllocTester = makeTester(
)
class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
dtype = "float32"
mode = mode_with_gpu
shared = staticmethod(gpuarray_shared_constructor)
allocs = [GpuAlloc, GpuAlloc, T.Alloc]
def test_shape():
x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
v = gpuarray.zeros((3, 4, 5), dtype='float32')
......
......@@ -1369,6 +1369,9 @@ class MaxAndArgmax(Op):
""" % locals()
ret = """
int axis;
Py_CLEAR(%(max)s);
Py_CLEAR(%(argmax)s);//todo pass them as out parameter.
%(axis_code)s
%(max)s = (PyArrayObject*)PyArray_Max(%(x)s, axis, NULL);
if(%(max)s == NULL){
......@@ -1407,7 +1410,7 @@ class MaxAndArgmax(Op):
return ret % locals()
def c_code_cache_version(self):
    """C-code cache version.

    The scraped diff left both the stale ``return (2,)`` and the
    updated ``return (3,)`` in the body, so the stale value would have
    shadowed the bump.  Only the updated version tag is kept: it was
    bumped to 3 because the generated C code changed (Py_CLEAR of the
    max/argmax outputs was added), invalidating cached binaries.
    """
    return (3,)
def infer_shape(self, node, shapes):
ishape, axis_shape = shapes
......
......@@ -4049,8 +4049,8 @@ def constant_folding(node):
return rval
register_canonicalize(constant_folding, 'fast_compile')
register_stabilize(constant_folding)
register_specialize(constant_folding)
register_stabilize(constant_folding, 'fast_compile')
register_specialize(constant_folding, 'fast_compile')
def _is_1(expr):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论