Commit c972873e authored by Pascal Lamblin

Merge pull request #2688 from carriepl/scan_mem_usage

Add test for gpu memory usage
@@ -207,6 +207,7 @@ class T_Scan(unittest.TestCase):
    def setUp(self):
        utt.seed_rng()
        super(T_Scan, self).setUp()

    # generator network, only one output, type scalar; no sequence or
    # non-sequence arguments
@@ -2696,6 +2697,85 @@ class T_Scan(unittest.TestCase):
        f2_vals = f2(x_val)
        utt.assert_allclose(f_vals, f2_vals)
    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined theano
        # function is reasonable when executed on the GPU. It checks for
        # a bug in which one of scan's optimizations was not applied, which
        # made the scan node compute large and unnecessary outputs that
        # brought memory usage on the GPU to ~12G.

        # The test must be performed on the GPU
        from theano.sandbox import cuda
        if not cuda.cuda_available:
            raise SkipTest('Optional package cuda disabled')
        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000
        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200

        # Define input variables
        xin = tensor.ftensor3(name='xin')
        yout = tensor.ftensor3(name='yout')

        # Initialize the network parameters
        floatX = theano.config.floatX
        U = theano.shared(numpy.zeros((n_in, n_hid), dtype="float32"),
                          name='W_xin_to_l1')
        V = theano.shared(numpy.zeros((n_hid, n_hid), dtype="float32"),
                          name='W_l1_to_l1')
        W = theano.shared(numpy.zeros((n_hid, n_out), dtype="float32"),
                          name='W_l1_to_l2')
        nparams = [U, V, W]
        # Build the forward pass
        l1_base = tensor.dot(xin, U)

        def scan_l(baseline, last_step):
            return baseline + tensor.dot(last_step, V)

        zero_output = tensor.alloc(numpy.asarray(0., dtype="float32"),
                                   mb_size, n_hid)

        l1_out, _ = theano.scan(scan_l, sequences=[l1_base],
                                outputs_info=[zero_output],
                                mode=mode_with_gpu)

        l2_out = tensor.dot(l1_out, W)

        # Compute the cost and take the gradient wrt params
        cost = tensor.sum((l2_out - yout) ** 2)
        grads = tensor.grad(cost, nparams)
        updates = zip(nparams, [n - g for n, g in zip(nparams, grads)])

        # Compile the theano function
        feval_backprop = theano.function([xin, yout], cost, updates=updates,
                                         mode=mode_with_gpu)
        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(
            n.op, theano.scan_module.scan_op.Scan)]

        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2

        # Call the theano function to ensure the absence of a memory error
        feval_backprop(numpy.zeros((mb_length, mb_size, n_in),
                                   dtype="float32"),
                       numpy.zeros((mb_length, mb_size, n_out),
                                   dtype="float32"))
    def test_reduce_memory_consumption(self):
        x = theano.shared(numpy.asarray(
......
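For reference, the check the new test performs can be reproduced outside the test suite. Below is a minimal, CPU-runnable sketch (not part of this patch): the toy recurrence, and the names step, V, h0, are illustrative assumptions; only the toposort / isinstance inspection pattern mirrors the test itself, and no particular output count is asserted here.

# Sketch of the Scan-node inspection used by test_gpu_memory_usage.
# The small recurrent graph below is only an illustration.
import numpy
import theano
import theano.tensor as tensor

xin = tensor.fmatrix('xin')
V = theano.shared(numpy.zeros((8, 8), dtype='float32'), name='V')

def step(x_t, h_tm1):
    # Recurrent step: current input plus a linear map of the previous state.
    return x_t + tensor.dot(h_tm1, V)

h0 = tensor.alloc(numpy.asarray(0., dtype='float32'), 8)
h, _ = theano.scan(step, sequences=[xin], outputs_info=[h0])

cost = tensor.sum(h ** 2)
grad_V = tensor.grad(cost, V)
f = theano.function([xin], [cost, grad_V])

# Walk the optimized graph and locate the Scan nodes; the gradient of a
# scan is itself a scan, so a forward and a grad Scan node are expected.
nodes = f.maker.fgraph.toposort()
scan_nodes = [n for n in nodes
              if isinstance(n.op, theano.scan_module.scan_op.Scan)]
print([len(n.outputs) for n in scan_nodes])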