提交 38e2f502 authored 作者: abergeron's avatar abergeron

Merge pull request #1946 from nouiz/scan

Small scan speed up on the GPU.
......@@ -21,6 +21,8 @@ Montreal).
News
====
* Colin Raffel `tutorial on Theano <http://nbviewer.ipython.org/github/craffel/theano-tutorial/blob/master/Theano%20Tutorial.ipynb>`_.
* Ian Goodfellow did a `12h class with exercises on Theano <https://github.com/goodfeli/theano_exercises>`_.
* Theano 0.6 was released. Everybody is encouraged to update.
......
......@@ -120,8 +120,18 @@ enum = EnumStr("g++", "")
try:
rc = call_subprocess_Popen(['g++', '-v'])
except OSError:
enum = EnumStr("")
rc = 1
if rc == 0:
AddConfigVar('cxx',
"The C++ compiler to use. Currently only g++ is"
" supported, but supporting additional compilers should not be "
"too difficult. "
"If it is empty, no C++ code is compiled.",
enum,
in_c_key=False)
del enum
if rc == 0 and config.cxx != "":
# Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar('linker',
("Default linker used if the theano flags mode is Mode "
......@@ -140,16 +150,6 @@ else:
'optimized C-implementations (for both CPU and GPU) and will '
'default to Python implementations. Performance will be severely '
'degraded.')
enum = EnumStr("")
AddConfigVar('cxx',
"The C++ compiler to use. Currently only g++ is"
" supported, but supporting additional compilers should not be "
"too difficult. "
"If it is empty, no C++ code is compiled.",
enum,
in_c_key=False)
del enum
#Keep the default value the same as the one for the mode FAST_RUN
......
......@@ -12,7 +12,8 @@ import warnings
from theano.gof.python25 import all
from theano.configparser import config, AddConfigVar, BoolParam, ConfigParam
from theano.configparser import (config, AddConfigVar,
BoolParam, ConfigParam, _config_var_list)
import theano.gof.cmodule
......@@ -560,7 +561,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:
# already changed the default linker to something else then CVM.
# Currently this is the py linker.
# Here we assert that the default linker is not cvm.
assert not [x for x in theano.configparser._config_var_list
assert not [x for x in _config_var_list
if x.fullname == 'linker'][0].default.startswith('cvm'), e
pass
......
......@@ -1411,9 +1411,10 @@ def norm(x,ord):
elif ndim > 2:
raise NotImplementedError("We don't support norm witn ndim > 2")
class lstsq(theano.Op):
def __eq__(self, other):
pass
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
......
......@@ -422,11 +422,19 @@ class Scan(PureOp):
raise ValueError('For output %s you need to provide a '
'scalar int !', str(outer_nitsot))
assert len(new_inputs) == len(inputs)
self.vector_seqs = [seq.ndim == 1 for seq in
new_inputs[1:1 + self.n_seqs]]
self.vector_outs = [arg.ndim == 1 for arg in
new_inputs[1 + self.n_seqs: (1 + self.n_seqs +
self.n_outs)]]
# The vector_seqs and vector_outs are just a workaround
# strange NumPy behavior: vector_ndarray[int] return a NumPy
# scalar and not a NumPy ndarray of 0 dimensions.
self.vector_seqs = [isinstance(seq, (tensor.TensorVariable,
tensor.TensorConstant)) and
seq.ndim == 1 for seq in
new_inputs[1:1 + self.n_seqs]]
self.vector_outs = [isinstance(arg, (tensor.TensorVariable,
tensor.TensorConstant)) and
arg.ndim == 1 for arg in
new_inputs[1 + self.n_seqs: (1 + self.n_seqs +
self.n_outs)]]
self.vector_outs += [False] * self.n_nit_sot
apply_node = Apply(self,
......@@ -598,12 +606,6 @@ class Scan(PureOp):
for _d1 in range(cython_mit_mot_out_nslices[_d0]):
cython_mit_mot_out_slices[_d0, _d1] = \
self.mit_mot_out_slices[_d0][_d1]
vector_seqs = [seq.ndim == 1 for seq in
node.inputs[1:1 + self.n_seqs]]
vector_outs = [arg.ndim == 1 for arg in
node.inputs[1 + self.n_seqs:
(1 + self.n_seqs + self.n_outs)]]
vector_outs += [False] * self.n_nit_sot
cython_vector_seqs = numpy.asarray(self.vector_seqs,
dtype='int32')
......
import os, logging, sys
import logging
import os
import sys
import numpy
import theano
from theano import config
......@@ -60,6 +64,28 @@ except ImportError:
os.mkdir(loc)
preargs = ['-fwrapv', '-O2', '-fno-strict-aliasing']
preargs += cmodule.GCC_compiler.compile_args()
# Cython 19.1 always use the old NumPy interface. So we
# need to manually modify the .c file to get it compiled
# by Theano. As by default, we tell NumPy to don't import
# the old interface.
if False:
#During scan cython development, it is helpful to keep the old interface, to don't manually edit the c file each time.
preargs.remove('-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION')
else:
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
# Add add some macro to lower the number of edit
# needed to the c file.
if bool(numpy_ver >= [1, 7]):
# Needed when we disable the old API, as cython
# use the old interface
preargs.append("-D NPY_ENSUREARRAY=NPY_ARRAY_ENSUREARRAY")
preargs.append("-D NPY_ENSURECOPY=NPY_ARRAY_ENSURECOPY")
preargs.append("-D NPY_ALIGNED=NPY_ARRAY_ALIGNED")
preargs.append("-D NPY_WRITEABLE=NPY_ARRAY_WRITEABLE")
preargs.append("-D NPY_UPDATE_ALL=NPY_ARRAY_UPDATE_ALL")
preargs.append("-D NPY_C_CONTIGUOUS=NPY_ARRAY_C_CONTIGUOUS")
preargs.append("-D NPY_F_CONTIGUOUS=NPY_ARRAY_F_CONTIGUOUS")
cmodule.GCC_compiler.compile_str(dirname, code, location=loc,
preargs=preargs)
# Save version into the __init__.py file.
......
......@@ -16,7 +16,7 @@ import theano.sandbox.rng_mrg
from theano import tensor
from theano.compile.pfunc import rebuild_collect_shared
from theano.gof.python25 import any
from theano.tests import unittest_tools as utt
from theano.tests import unittest_tools as utt
import theano.scalar.sharedvar
from theano.gof.python25 import OrderedDict
from theano.compat import PY3
......@@ -46,7 +46,7 @@ mode_with_gpu = mode_with_opt.including('gpu', 'scan')
class multiple_outputs_numeric_grad:
"""WRITEME"""
type_eps = {'float64': 1e-7,
'float32': 3e-3}
'float32': 3e-3}
def __init__(self, f, pt, ndarray_mask=None, eps=None):
"""Return the gradient of f at pt.
......@@ -81,12 +81,12 @@ class multiple_outputs_numeric_grad:
if ndarray_mask[i]:
pt[i] = numpy.array(p)
_eps = multiple_outputs_numeric_grad.type_eps[str(
pt[i].dtype)]
pt[i].dtype)]
if _eps > dtype_eps:
dtype_eps = _eps
self.ndarray_mask = ndarray_mask
#'''
# '''
# Compute clean output:
f_x = f(*pt)
gx = []
......@@ -148,7 +148,7 @@ class multiple_outputs_numeric_grad:
return numpy.inf, 0
#TODO: Test this function, and if it works,
# TODO: Test this function, and if it works,
# use it with the normal verify_grad rather than the
# copy-and-pasted one above.
# Also - add a reference to this technique in the
......@@ -201,7 +201,6 @@ def grab_scan_node(output):
class T_Scan(unittest.TestCase):
#class T_Scan(object):
def setUp(self):
utt.seed_rng()
......@@ -230,7 +229,7 @@ class T_Scan(unittest.TestCase):
updates=updates,
allow_input_downcast=True)
### TESTING PICKLE-ing this function
# TESTING PICKLE-ing this function
origdir = os.getcwd()
tmpdir = None
try:
......@@ -367,7 +366,7 @@ class T_Scan(unittest.TestCase):
# This first version test the first case in the optimizer to the gpu.
def test_one_sequence_one_output_weights_gpu1(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
def f_rnn(u_t, x_tm1, W_in, W):
......@@ -447,7 +446,7 @@ class T_Scan(unittest.TestCase):
# This second version test the second case in the optimizer to the gpu.
def test_one_sequence_one_output_weights_gpu2(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
def f_rnn(u_t, x_tm1, W_in, W):
......@@ -511,7 +510,7 @@ class T_Scan(unittest.TestCase):
# outputs when is running on GPU
def test_gpu3_mixture_dtype_outputs(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
def f_rnn(u_t, x_tm1, W_in, W):
......@@ -595,11 +594,11 @@ class T_Scan(unittest.TestCase):
v_out = numpy.zeros((4,))
v_out[0] = v_u[0] * W_in.get_value() + v_x0 * W.get_value()
for step in xrange(1, 4):
v_out[step] = v_u[step] * W_in.get_value() + \
v_out[step - 1] * W.get_value()
v_out[step] = (v_u[step] * W_in.get_value() +
v_out[step - 1] * W.get_value())
theano_values = f3(v_u, v_x0)
assert numpy.allclose(theano_values, v_out)
assert numpy.allclose(theano_values, v_out)
# some rnn with multiple outputs and multiple inputs; other
# dimension instead of scalars/vectors
......@@ -624,7 +623,7 @@ class T_Scan(unittest.TestCase):
y0 = theano.tensor.scalar('y0')
def f_rnn_cmpl(u1_t, u2_t, x_tm1, y_tm1, W_in1):
return [theano.dot(u1_t, W_in1) + u2_t * W_in2 + \
return [theano.dot(u1_t, W_in1) + u2_t * W_in2 +
theano.dot(x_tm1, W), theano.dot(x_tm1, W_out)]
outputs, updates = theano.scan(f_rnn_cmpl,
......@@ -643,12 +642,12 @@ class T_Scan(unittest.TestCase):
# compute the values in numpy
v_x = numpy.zeros((3, 2), dtype=theano.config.floatX)
v_y = numpy.zeros((3,), dtype=theano.config.floatX)
v_x[0] = numpy.dot(v_u1[0], vW_in1) + v_u2[0] * vW_in2 + \
numpy.dot(v_x0, vW)
v_x[0] = (numpy.dot(v_u1[0], vW_in1) + v_u2[0] * vW_in2 +
numpy.dot(v_x0, vW))
v_y[0] = numpy.dot(v_x0, vWout)
for i in xrange(1, 3):
v_x[i] = numpy.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 + \
numpy.dot(v_x[i - 1], vW)
v_x[i] = (numpy.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 +
numpy.dot(v_x[i - 1], vW))
v_y[i] = numpy.dot(v_x[i - 1], vWout)
(theano_x, theano_y) = f4(v_u1, v_u2, v_x0, v_y0, vW_in1)
......@@ -684,9 +683,9 @@ class T_Scan(unittest.TestCase):
y_tm1,
y_tm3,
W_in1):
return [theano.dot(u1_t, W_in1) + \
(u2_t + u2_tm1 * u2_tp1) * W_in2 + \
theano.dot(x_tm1, W),
return [theano.dot(u1_t, W_in1) +
(u2_t + u2_tm1 * u2_tp1) * W_in2 +
theano.dot(x_tm1, W),
(y_tm1 + y_tm3) * theano.dot(x_tm1, W_out),
theano.dot(u1_t, W_in1)]
......@@ -891,10 +890,10 @@ class T_Scan(unittest.TestCase):
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
numpy_x1[0] = vu0[0] * vW_in + vx1 * vW + vu1[0] + vu2[0]
for i in xrange(1, 3):
numpy_x0[i] = vu0[i] * vW_in + numpy_x0[i - 1] * vW + \
vu1[i] * vu2[i]
numpy_x1[i] = vu0[i] * vW_in + numpy_x1[i - 1] * vW + \
vu1[i] + vu2[i]
numpy_x0[i] = (vu0[i] * vW_in + numpy_x0[i - 1] * vW +
vu1[i] * vu2[i])
numpy_x1[i] = (vu0[i] * vW_in + numpy_x1[i - 1] * vW +
vu1[i] + vu2[i])
# note theano computes inplace, so call function after numpy
# equivalent is done
......@@ -908,8 +907,8 @@ class T_Scan(unittest.TestCase):
# Old way of doing inplace operations is deprecated .. tests don't
# make sense anymore.
##utt.assert_allclose( theano_x0 , vu2)
## utt.assert_allclose( theano_x1 , vu1)
## utt.assert_allclose(theano_x0 , vu2)
## utt.assert_allclose(theano_x1 , vu1)
# simple rnn ; compute inplace version 2
def test_inplace2(self):
......@@ -965,16 +964,16 @@ class T_Scan(unittest.TestCase):
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
# compute output in numpy
numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,))
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu1[1]
numpy_x1[0] = vu0[0] * vW_in + vx1 * vW + vu2[0] + vu2[1] + vu2[2]
for i in xrange(1, 3):
numpy_x0[i] = vu0[i] * vW_in + numpy_x0[i - 1] * vW + \
vu1[i] * vu1[i + 1]
numpy_x1[i] = vu0[i] * vW_in + numpy_x1[i - 1] * vW + \
vu2[i] + vu2[i + 1] + vu2[i + 2]
numpy_x0[i] = (vu0[i] * vW_in + numpy_x0[i - 1] * vW +
vu1[i] * vu1[i + 1])
numpy_x1[i] = (vu0[i] * vW_in + numpy_x1[i - 1] * vW +
vu2[i] + vu2[i + 1] + vu2[i + 2])
# note theano computes inplace, so call function after numpy
# equivalent is done
......@@ -1069,8 +1068,8 @@ class T_Scan(unittest.TestCase):
y1 = theano.shared(vy1, 'y1')
def f(u1_t, u2_t, y0_tm3, y0_tm2, y0_tm1, y1_tm1):
y0_t = theano.dot(theano.dot(u1_t, W1), W2) + 0.1 * y0_tm1 + \
0.33 * y0_tm2 + 0.17 * y0_tm3
y0_t = (theano.dot(theano.dot(u1_t, W1), W2) + 0.1 * y0_tm1 +
0.33 * y0_tm2 + 0.17 * y0_tm3)
y1_t = theano.dot(u2_t, W2) + y1_tm1
y2_t = theano.dot(u1_t, W1)
nwW1 = W1 + .1
......@@ -1106,14 +1105,13 @@ class T_Scan(unittest.TestCase):
numpy_W1 = vW1.copy()
numpy_W2 = vW2.copy()
for idx in xrange(3):
numpy_y0[idx + 3] = numpy.dot(\
numpy.dot(vu1[idx, :], numpy_W1), \
numpy_y0[idx + 3] = numpy.dot(numpy.dot(vu1[idx, :], numpy_W1),
numpy_W2) + \
0.1 * numpy_y0[idx + 2] + \
0.33 * numpy_y0[idx + 1] + \
0.17 * numpy_y0[idx]
numpy_y1[idx + 1] = numpy.dot(vu2[idx, :], numpy_W2) +\
numpy_y1[idx]
numpy_y1[idx + 1] = (numpy.dot(vu2[idx, :], numpy_W2) +
numpy_y1[idx])
numpy_y2[idx] = numpy.dot(vu1[idx, :], numpy_W1)
numpy_W1 = numpy_W1 + .1
numpy_W2 = numpy_W2 + .05
......@@ -1168,7 +1166,7 @@ class T_Scan(unittest.TestCase):
def test_simple_shared_random(self):
theano_rng = theano.tensor.shared_randomstreams.RandomStreams(
utt.fetch_seed())
utt.fetch_seed())
values, updates = theano.scan(lambda: theano_rng.uniform((2,), -1, 1),
[],
......@@ -1196,7 +1194,7 @@ class T_Scan(unittest.TestCase):
def test_cuda_gibbs_chain(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
rng = numpy.random.RandomState(utt.fetch_seed())
......@@ -1204,7 +1202,7 @@ class T_Scan(unittest.TestCase):
dtype='float32')
vsample = theano.shared(v_vsample)
trng = theano.sandbox.rng_mrg.MRG_RandomStreams(
utt.fetch_seed())
utt.fetch_seed())
def f(vsample_tm1):
return trng.binomial(vsample_tm1.shape, n=1, p=0.3,
......@@ -1240,7 +1238,7 @@ class T_Scan(unittest.TestCase):
bvis = theano.shared(v_bvis, 'vbvis')
vsample = theano.tensor.matrix(dtype='float32')
trng = theano.tensor.shared_randomstreams.RandomStreams(
utt.fetch_seed())
utt.fetch_seed())
def f(vsample_tm1):
hmean_t = theano.tensor.nnet.sigmoid(
......
......@@ -5033,10 +5033,10 @@ def power(x, y):
return x**y
def swapaxes(y, axis1, axis2):
    """Swap two axes of a tensor variable.

    Returns a view of `y` (via ``dimshuffle``) with `axis1` and `axis2`
    interchanged; all other axes keep their original order.

    :param y: tensor-like; converted with ``as_tensor_variable``.
    :param axis1: index of the first axis to swap.
    :param axis2: index of the second axis to swap.
    :return: a tensor variable with the two axes swapped.
    """
    y = as_tensor_variable(y)
    ndim = y.ndim
    # list(...) is required so item assignment below also works on
    # Python 3, where range() returns a non-mutable range object.
    li = list(range(ndim))
    li[axis1], li[axis2] = li[axis2], li[axis1]
    return y.dimshuffle(li)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论