提交 12ec7339 authored 作者: abergeron's avatar abergeron

Merge pull request #1790 from nouiz/mixed

Mixed
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
Easy Installation of an optimized Theano on Ubuntu Easy Installation of an optimized Theano on Ubuntu
================================================== ==================================================
These instructions were written for Ubuntu 11.04, 11.10, 12.04, 12.10, 13.04 and 13.10. You can probably do something similar on older releases.
.. note:: .. note::
...@@ -49,7 +49,7 @@ probably do something similar on older computer. ...@@ -49,7 +49,7 @@ probably do something similar on older computer.
Installation steps Installation steps
~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~
Ubuntu 11.10/12.04/12.10/13.04/13.10:
1) ``sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git`` 1) ``sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git``
2) ``sudo pip install Theano`` 2) ``sudo pip install Theano``
...@@ -236,15 +236,4 @@ Test GPU configuration ...@@ -236,15 +236,4 @@ Test GPU configuration
Ubuntu 12.10: default gcc version 4.7.2; gcc 4.4.7, 4.5.4 and 4.6.3 available.
Ubuntu 13.10: default gcc version 4.8.1; gcc 4.4.7, 4.6.4 and 4.7.3 available.
...@@ -507,13 +507,22 @@ class ProfileStats(object): ...@@ -507,13 +507,22 @@ class ProfileStats(object):
print >> file, header_str print >> file, header_str
atimes = [( topos = {} # Only do the topo once per fct.
atimes = []
for a, t in self.apply_time.items():
if a.fgraph not in topos:
topo = a.fgraph.toposort()
topos[a.fgraph] = topo
else:
topo = topos[a.fgraph]
atimes.append((
t * 100 / local_time, t * 100 / local_time,
t, t,
a, a,
a.fgraph.toposort().index(a), topo.index(a),
self.apply_callcount[a]) self.apply_callcount[a]))
for a, t in self.apply_time.items()] del topos
atimes.sort() atimes.sort()
atimes.reverse() atimes.reverse()
tot = 0 tot = 0
......
...@@ -203,6 +203,7 @@ if __name__ == "__main__": ...@@ -203,6 +203,7 @@ if __name__ == "__main__":
cuda version 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note cuda version 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu gpu
K6000/NOECC 0.06s
K20m/ECC 0.07s K20m/ECC 0.07s
K20/NOECC 0.07s K20/NOECC 0.07s
M2090 0.19s M2090 0.19s
......
...@@ -3,12 +3,12 @@ import os ...@@ -3,12 +3,12 @@ import os
import numpy import numpy
import theano import theano
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Apply
from theano import tensor, scalar, config from theano import tensor, scalar, config
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.basic import Alloc from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any from theano.gof.python25 import any
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.compat import PY3 from theano.compat import PY3
...@@ -257,7 +257,7 @@ class GpuFromHost(Op): ...@@ -257,7 +257,7 @@ class GpuFromHost(Op):
def R_op(self, inputs, eval_points):
    """R-operator for GpuFromHost.

    If the evaluation point already lives on the GPU it is moved back
    to the host first (presumably so it matches the input side of this
    transfer op — TODO confirm against Theano's Rop contract);
    otherwise the eval point is returned unchanged.
    """
    ev, = eval_points
    # Bug fix: the original spelled this `isintance`, which would raise
    # NameError the first time this branch was reached.
    if isinstance(ev, GpuArrayType):
        return [host_from_gpu(ev)]
    else:
        return ev
...@@ -317,7 +317,7 @@ class GpuFromCuda(Op): ...@@ -317,7 +317,7 @@ class GpuFromCuda(Op):
def R_op(self, inputs, eval_points):
    """R-operator for GpuFromCuda.

    If the evaluation point is a gpuarray value it is converted to the
    CUDA backend's representation (presumably to match this op's input
    side — TODO confirm against Theano's Rop contract); otherwise the
    eval point is returned unchanged.
    """
    ev, = eval_points
    # Bug fix: the original spelled this `isintance`, which would raise
    # NameError the first time this branch was reached.
    if isinstance(ev, GpuArrayType):
        return [cuda_from_gpu(ev)]
    else:
        return ev
...@@ -651,6 +651,36 @@ class GpuAlloc(HideC, Alloc): ...@@ -651,6 +651,36 @@ class GpuAlloc(HideC, Alloc):
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (2,)
def do_constant_folding(self, node):
    """Decide whether this GpuAlloc node may be constant-folded.

    Returns False (skip folding) whenever keeping the Alloc op in the
    graph is cheaper than materializing its result as a constant;
    returns True otherwise.
    """
    for client in node.outputs[0].clients:
        if client[0] == 'output':
            # If the output is a constant, it will have to be deep-copied
            # each time the function is called.  So we do not fold.
            return False
        elif (  # The following ops work inplace on their input id 0.
              client[1] == 0 and
              isinstance(client[0].op, (
                  # Ops that will work inplace on the Alloc.  If it got
                  # constant-folded they would copy the constant, which
                  # is less efficient.  Not doing the folding can also
                  # lower peak memory usage, as the "constant" won't
                  # always exist.
                  # theano.tensor.subtensor.AdvancedIncSubtensor,
                  theano.sandbox.gpuarray.subtensor.GpuIncSubtensor,
                  # theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1,
                  theano.sandbox.gpuarray.blas.GpuGemm,
                  theano.sandbox.gpuarray.blas.GpuGemv,
                  # theano.sandbox.gpuarray.blas.GpuGer, not yet implemented
              ))):
            return False
        # If the client is a transfer back to the host, we don't want to
        # fold yet: let the moving optimizations finish before deciding
        # what to do.
        elif isinstance(client[0].op, HostFromGpu):
            return False
    return True
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
......
...@@ -32,8 +32,10 @@ if not theano.sandbox.gpuarray.pygpu_activated: ...@@ -32,8 +32,10 @@ if not theano.sandbox.gpuarray.pygpu_activated:
from theano.sandbox.gpuarray.type import (GpuArrayType, from theano.sandbox.gpuarray.type import (GpuArrayType,
gpuarray_shared_constructor) gpuarray_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host, from theano.sandbox.gpuarray.basic_ops import (
gpu_alloc, gpu_from_cuda, host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape, GpuFromHost, GpuReshape,
GpuEye) GpuEye)
...@@ -290,6 +292,13 @@ GpuAllocTester = makeTester( ...@@ -290,6 +292,13 @@ GpuAllocTester = makeTester(
) )
class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
    """Run the generic Alloc test suite against the gpuarray backend."""
    # dtype used for the test inputs.
    dtype = "float32"
    # Compile with the GPU-enabled mode so graphs go through the
    # gpuarray optimizations.
    mode = mode_with_gpu
    # Build shared variables on the GPU.
    shared = staticmethod(gpuarray_shared_constructor)
    # Alloc op classes the inherited tests expect to find in the
    # compiled graphs.
    allocs = [GpuAlloc, GpuAlloc, T.Alloc]
def test_shape(): def test_shape():
x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])() x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
v = gpuarray.zeros((3, 4, 5), dtype='float32') v = gpuarray.zeros((3, 4, 5), dtype='float32')
......
...@@ -1369,6 +1369,9 @@ class MaxAndArgmax(Op): ...@@ -1369,6 +1369,9 @@ class MaxAndArgmax(Op):
""" % locals() """ % locals()
ret = """ ret = """
int axis; int axis;
Py_CLEAR(%(max)s);
Py_CLEAR(%(argmax)s);//todo pass them as out parameter.
%(axis_code)s %(axis_code)s
%(max)s = (PyArrayObject*)PyArray_Max(%(x)s, axis, NULL); %(max)s = (PyArrayObject*)PyArray_Max(%(x)s, axis, NULL);
if(%(max)s == NULL){ if(%(max)s == NULL){
...@@ -1407,7 +1410,7 @@ class MaxAndArgmax(Op): ...@@ -1407,7 +1410,7 @@ class MaxAndArgmax(Op):
return ret % locals() return ret % locals()
def c_code_cache_version(self):
    """Version tag for the generated C code of MaxAndArgmax.

    Bumped from (2,) to (3,) because the c_code now Py_CLEARs the
    output storage before computing, so previously cached compiled
    modules must be invalidated and rebuilt.
    """
    return (3,)
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
ishape, axis_shape = shapes ishape, axis_shape = shapes
......
...@@ -4049,8 +4049,8 @@ def constant_folding(node): ...@@ -4049,8 +4049,8 @@ def constant_folding(node):
return rval return rval
register_canonicalize(constant_folding, 'fast_compile') register_canonicalize(constant_folding, 'fast_compile')
register_stabilize(constant_folding) register_stabilize(constant_folding, 'fast_compile')
register_specialize(constant_folding) register_specialize(constant_folding, 'fast_compile')
def _is_1(expr): def _is_1(expr):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论