提交 ccf4116b，作者：Pascal Lamblin

Merge pull request #3899 from carriepl/log_softmax

Add optimizations to transfer new LogSoftmax op to the GPU as CuDNN softmax ops
...@@ -10,7 +10,7 @@ from theano.gof import Optimizer, local_optimizer, COp ...@@ -10,7 +10,7 @@ from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
...@@ -2400,11 +2400,12 @@ if True: ...@@ -2400,11 +2400,12 @@ if True:
return [out] return [out]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise, LogSoftmax])
def local_log_softmax_dnn(node): def local_log_softmax_dnn(node):
# The log-softmax implementation is only available starting at CuDNN V3 # The log-softmax implementation is only available starting at CuDNN V3
if not dnn_available() or version() < (3000, 3000): if not dnn_available() or version() < (3000, 3000):
return return
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
isinstance(node.op.scalar_op, Log) and isinstance(node.op.scalar_op, Log) and
node.inputs[0].owner and node.inputs[0].owner and
...@@ -2419,6 +2420,21 @@ if True: ...@@ -2419,6 +2420,21 @@ if True:
new_log_softmax = new_softmax_node(softmax_node.inputs[0]) new_log_softmax = new_softmax_node(softmax_node.inputs[0])
return [new_log_softmax] return [new_log_softmax]
elif (isinstance(node.op, LogSoftmax) and node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, HostFromGpu)):
if not dnn_available():
return
# Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0].owner.inputs[0]
if inp.ndim != 2:
return
inp = inp.dimshuffle(0, 1, 'x', 'x')
# Apply GpuDnnSoftmax and return the result
out = GpuDnnSoftmax('bc01', 'log', 'channel')(gpu_contiguous(inp))
return [out.dimshuffle(0, 1)]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used""" """ Raise a RuntimeError if cudnn can't be used"""
......
...@@ -612,6 +612,41 @@ class test_DnnSoftMax(test_nnet.test_SoftMax): ...@@ -612,6 +612,41 @@ class test_DnnSoftMax(test_nnet.test_SoftMax):
utt.assert_allclose(out, expected_out) utt.assert_allclose(out, expected_out)
def test_log_softmax2(self):
    """Check that a graph applying the LogSoftmax op (and, separately, a
    log(Softmax(x)) graph) is replaced by a single GpuDnnSoftmax node
    using the 'log' algorithm, and that both give the same results as a
    CPU reference function."""
    # Compile a reference function, on the CPU, to be used to validate the
    # results of the other function.
    x = T.fmatrix()
    f_ref = theano.function([x], T.nnet.LogSoftmax()(x))

    # Build the first graph (direct application of the LogSoftmax op) and
    # ensure that the optimization is applied: exactly one GpuDnnSoftmax
    # node, running in 'log' mode.
    log_softmax_out = T.nnet.LogSoftmax()(x)
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, cuda.dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))

    # Build the second graph (log applied to the output of the Softmax op)
    # and ensure that the optimization is applied.
    log_softmax_out = T.log(T.nnet.Softmax()(x))
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, cuda.dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))
def test_dnn_tag(): def test_dnn_tag():
""" """
......
...@@ -12,7 +12,7 @@ from theano.gof.cmodule import GCC_compiler ...@@ -12,7 +12,7 @@ from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (AbstractConv2d, from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs, AbstractConv2d_gradInputs,
...@@ -1170,8 +1170,9 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1170,8 +1170,9 @@ class GpuDnnSoftmaxBase(DnnBase):
Parameters Parameters
---------- ----------
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast', 'accurate' or 'log' indicating whether, respectively,
optimized for speed or accuracy respectively. computations should be optimized for speed, for accuracy, or if CuDNN
should rather compute the log-softmax instead.
mode mode
'instance' or 'channel' indicating whether the softmax should be 'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spatial location '01' per computed per image across 'c01' or per spatial location '01' per
...@@ -1219,8 +1220,9 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1219,8 +1220,9 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
Op for the cuDNN Softmax. Op for the cuDNN Softmax.
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast', 'accurate' or 'log' indicating whether, respectively,
optimized for speed or accuracy respectively. computations should be optimized for speed, for accuracy, or if CuDNN
should rather compute the log-softmax instead.
mode mode
'instance' or 'channel' indicating whether the softmax should be 'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spatial location '01' per computed per image across 'c01' or per spatial location '01' per
...@@ -1253,8 +1255,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1253,8 +1255,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
Parameters Parameters
---------- ----------
algo algo
'fast' or 'accurate' indicating whether computations should be 'fast', 'accurate' or 'log' indicating whether, respectively,
optimized for speed or accuracy respectively. computations should be optimized for speed, for accuracy, or if CuDNN
should rather compute the gradient of the log-softmax instead.
mode mode
'instance' or 'channel' indicating whether the softmax should 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per be computed per image across 'c01' or per spatial location '01' per
...@@ -1472,6 +1475,25 @@ def local_log_softmax_dnn(node): ...@@ -1472,6 +1475,25 @@ def local_log_softmax_dnn(node):
return [new_softmax(softmax_node.inputs[0])] return [new_softmax(softmax_node.inputs[0])]
@register_opt('cudnn')
@op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name):
    """Replace a LogSoftmax node by a GpuDnnSoftmax in 'log' mode.

    Only fires when cuDNN is usable in the target context and its version
    is at least v3, and only for 2d inputs (the only shape the rewrite
    currently handles).
    """
    if not dnn_available(ctx_name) or version() < 3000:
        # cuDNN only implements log-softmax starting at v3.
        return

    x = node.inputs[0]
    if x.ndim != 2:
        return

    # GpuDnnSoftmax expects a 4d 'bc01' tensor: pad the matrix with two
    # broadcastable trailing dimensions and pin it to the target context.
    x = x.dimshuffle(0, 1, 'x', 'x')
    x.tag.context_name = ctx_name

    result = GpuDnnSoftmax('log', 'channel')(gpu_contiguous(x))
    # Drop the two padding dimensions to recover the original 2d shape.
    return [result.dimshuffle(0, 1)]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
""" """
......
...@@ -847,3 +847,43 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -847,3 +847,43 @@ class test_SoftMax(test_nnet.test_SoftMax):
numpy.exp(input_val).sum(1)[:, None, :, :]) numpy.exp(input_val).sum(1)[:, None, :, :])
utt.assert_allclose(out, expected_out) utt.assert_allclose(out, expected_out)
def test_log_softmax2(self):
    """Check that a graph applying the LogSoftmax op (and, separately, a
    log(Softmax(x)) graph) is replaced by a single GpuDnnSoftmax node
    using the 'log' algorithm, and that both give the same results as a
    CPU reference function."""
    # This is a test for an optimization that depends on CuDNN v3 or
    # more recent. Don't test if the CuDNN version is too old.
    if dnn.version() < 3000:
        raise SkipTest("Log-softmax is only in cudnn v3+")

    # Compile a reference function, on the CPU, to be used to validate the
    # results of the other function.
    x = T.fmatrix()
    f_ref = theano.function([x], T.nnet.LogSoftmax()(x))

    # Build the first graph (direct application of the LogSoftmax op) and
    # ensure that the optimization is applied: exactly one GpuDnnSoftmax
    # node, running in 'log' mode.
    log_softmax_out = T.nnet.LogSoftmax()(x)
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))

    # Build the second graph (log applied to the output of the Softmax op)
    # and ensure that the optimization is applied.
    log_softmax_out = T.log(T.nnet.Softmax()(x))
    f = theano.function([x], log_softmax_out, mode=mode_with_gpu)

    dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
                         isinstance(n.op, dnn.GpuDnnSoftmax)]
    assert len(dnn_softmax_nodes) == 1
    assert dnn_softmax_nodes[0].op.algo == "log"

    # Compare the output of the function with the reference function
    inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
    utt.assert_allclose(f(inp), f_ref(inp))
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论