Commit ea9e3e54 authored by Frédéric Bastien

Merge pull request #2250 from daemonmaker/cudnn

Implemented grad for cudnn softmax.
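With this change, T.grad can differentiate through the cuDNN softmax: GpuDnnSoftmax.grad() returns a new GpuDnnSoftmaxGrad op, and a new local optimizer rewrites CPU SoftmaxGrad nodes to GpuDnnSoftmaxGrad when cuDNN is enabled. A minimal sketch of the resulting behaviour, mirroring the new test below (illustrative only; mode_with_gpu is assumed to be the GPU test mode used in the test module):

import theano
import theano.tensor as T

y = T.fvector('y')
# Gradient of a softmax-based cost. With cuDNN enabled, the SoftmaxGrad node
# produced here gets replaced by GpuDnnSoftmaxGrad during graph optimization.
f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                    mode=mode_with_gpu)  # mode_with_gpu: assumed test helper
ops = [node.op for node in f.maker.fgraph.toposort()]
# Expect one GpuDnnSoftmaxGrad and no theano.tensor.nnet.SoftmaxGrad among ops.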
@@ -5,11 +5,12 @@ from theano import Apply, gof, tensor
from theano.gof import Optimizer
from theano.gof.type import CDataType
from theano.compat import PY3
from theano.tensor.nnet import SoftmaxGrad
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import (GpuOp, cuda_available, active_device_number,
device_properties)
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
gpu_contiguous, HostFromGpu)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
@@ -849,7 +850,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
return GpuDnnPool()(img, desc)
class GpuDnnSoftmax(DnnBase):
class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
@@ -873,46 +874,54 @@ class GpuDnnSoftmax(DnnBase):
assert(mode in ('instance', 'channel'))
self.mode = mode
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
self.tensor_4d_descs = [softmax_input
for softmax_input in self.softmax_inputs]
self.tensor_4d_descs.append('softmax_output')
def c_support_code_struct(self, node, struct_id):
def _define_tensor4d_desc(self, name, id):
return """
cudnnTensor4dDescriptor_t softmax_input_%(id)d;
cudnnTensor4dDescriptor_t softmax_output_%(id)d;
""" % dict(id=struct_id)
cudnnTensor4dDescriptor_t %(name)s_%(id)d;
""" % dict(name=name, id=id)
def c_init_code_struct(self, node, struct_id, sub):
def _init_tensor4d_desc(self, name, id, fail):
return """
softmax_input_%(id)d = NULL;
softmax_output_%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
%(name)s_%(id)d = NULL;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&%(name)s_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(id)d));
": %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])
""" % dict(name=name, id=id, fail=fail)
def c_cleanup_code_struct(self, node, struct_id):
def _clean_tensor4d_desc(self, name, id):
return """
if(softmax_input_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
if(%(name)s_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(%(name)s_%(id)d);
""" % dict(name=name, id=id)
def c_support_code_struct(self, node, struct_id):
result = ''
for name in self.tensor_4d_descs:
result += self._define_tensor4d_desc(name, struct_id)
return result
if(softmax_output_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
def c_init_code_struct(self, node, struct_id, sub):
result = """
cudnnStatus_t err%(id)d;
""" % dict(id=struct_id)
for name in self.tensor_4d_descs:
result += self._init_tensor4d_desc(name, struct_id, sub['fail'])
return result
def c_cleanup_code_struct(self, node, struct_id):
result = ''
for name in self.tensor_4d_descs:
result += self._clean_tensor4d_desc(name, struct_id)
return result
def c_code(self, node, name, inputs, outputs, sub):
ins, = inputs
ins = inputs
outs, = outputs
if self.tensor_format == 'b01c':
@@ -930,7 +939,8 @@ if(softmax_output_%(id)d != NULL)
else:
algo = 0
return """
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
@@ -943,14 +953,19 @@ if (%(algo)d == 1)
cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(id=sub['struct_id'], name=name,
tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs):
result += """
if (!CudaNdarray_is_c_contiguous(%(ins)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptor(
softmax_input_%(id)d,
%(input_name)s_%(id)d,
format%(id)d,
CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(ins)s)[0],
@@ -959,11 +974,15 @@ err%(name)s = cudnnSetTensor4dDescriptor(
CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%%%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(id=sub['struct_id'], name=name, input_name=input_name,
ins=ins[input_idx], fail=sub['fail'])
# Build and prepare the output variable.
result += """
if (CudaNdarray_prep_output(&%(outs)s, 4, CudaNdarray_HOST_DIMS(%(ins)s)) != 0)
{
%(fail)s
@@ -979,11 +998,41 @@ err%(name)s = cudnnSetTensor4dDescriptor(
CudaNdarray_HOST_DIMS(%(outs)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%%%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
"""
# Add on a call to the method that does the actual work.
result += self.method()
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
id=sub['struct_id'], name=name)
for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input
subs['ins%d' % idx] = inputs[idx]
return result % subs
def c_code_cache_version(self):
return (0, 6)
def method(self):
raise NotImplementedError('GpuDnnSoftmaxBase::method')
class GpuDnnSoftmax(GpuDnnSoftmaxBase):
softmax_inputs = ['softmax_input']
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def method(self):
return """
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(id)d,
@@ -993,11 +1042,43 @@ err%(name)s = cudnnSoftmaxForward(
softmax_output_%(id)d,
CudaNdarray_DEV_DATA(%(outs)s)
);
""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)
"""
def c_code_cache_version(self):
return (0, 3)
def grad(self, inp, grads):
x, = inp
g_sm, = grads
sm = self.make_node(x).outputs[0]
return [GpuDnnSoftmaxGrad(
self.tensor_format,
self.algo,
self.mode
)(g_sm, sm)]
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
softmax_inputs = ['softmax_gout', 'softmax_input']
def make_node(self, dy, sm):
dy = as_cuda_ndarray_variable(dy)
sm = as_cuda_ndarray_variable(sm)
assert dy.ndim == 4
assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type.make_variable()])
def method(self):
return """
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(id)d,
mode%(id)d,
%(name1)s_%(id)d,
CudaNdarray_DEV_DATA(%(ins1)s),
%(name0)s_%(id)d,
CudaNdarray_DEV_DATA(%(ins0)s),
softmax_output_%(id)d,
CudaNdarray_DEV_DATA(%(outs)s)
);
"""
# We need this since other stuff from opt is not importable.
@@ -1072,3 +1153,27 @@ if cuda_available:
" to use it. We got this error: \n" +
dnn_available.msg)
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn')
@local_optimizer([SoftmaxGrad])
def local_softmax_dnn_grad(node):
if (
isinstance(node.op, SoftmaxGrad)
and (isinstance(node.inputs[0].owner.op, HostFromGpu)
or isinstance(node.inputs[1].owner.op, HostFromGpu))
):
ins = []
for n in node.inputs:
if isinstance(n.owner.op, HostFromGpu):
n = n.owner.inputs[0]
ins.append(n.dimshuffle(0, 1, 'x', 'x'))
out = GpuDnnSoftmaxGrad(
'bc01',
'accurate',
'channel'
)(
gpu_contiguous(ins[0]),
gpu_contiguous(ins[1])
)
return [out.dimshuffle(0, 1)]
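The rewrite above bridges a shape mismatch: SoftmaxGrad works on 2d (batch, features) tensors, while the cuDNN op expects 4d bc01 input. The inputs are therefore lifted with dimshuffle(0, 1, 'x', 'x'), the gradient is computed in 'channel' mode (softmax over axis 1), and the result is brought back to 2d with dimshuffle(0, 1). A NumPy sketch of the same round trip (illustrative only):

import numpy as np

dy = np.random.rand(8, 10).astype('float32')  # gradient w.r.t. the softmax output
sm = np.random.rand(8, 10).astype('float32')  # softmax output itself

# dimshuffle(0, 1, 'x', 'x'): add two broadcastable dims -> shape (8, 10, 1, 1)
dy4, sm4 = dy[:, :, None, None], sm[:, :, None, None]
# 'channel'-mode backward pass over axis 1 (same formula as the sketch above)
dx4 = sm4 * (dy4 - (dy4 * sm4).sum(axis=1, keepdims=True))
# dimshuffle(0, 1): drop the singleton dims to recover the (batch, features) shape
dx = dx4[:, :, 0, 0]
assert dx.shape == (8, 10)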
@@ -276,7 +276,7 @@ class test_SoftMax(unittest.TestCase):
x = T.fmatrix('x')
z = T.nnet.softmax
def check_types(graph, graph_gpu):
def check_types_without_cudnn(graph, graph_gpu):
self._check_types(
graph,
graph_gpu,
@@ -285,14 +285,15 @@ class test_SoftMax(unittest.TestCase):
cuda.nnet.GpuSoftmax
)
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp,
mode_with_gpu,
check_types
mode_wo_cudnn,
check_types_without_cudnn
)
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
@@ -300,6 +301,25 @@ class test_SoftMax(unittest.TestCase):
self._cmp(2 << 15, 5, f, f_gpu)
self._cmp(0, 10, f, f_gpu)
def check_types_with_cudnn(graph, graph_gpu):
self._check_types(
graph,
graph_gpu,
-3,
type(z),
theano.sandbox.cuda.dnn.GpuDnnSoftmax
)
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp,
mode_with_gpu,
check_types_with_cudnn
)
def test_cudnn_softmax(self):
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
@@ -320,6 +340,14 @@ class test_SoftMax(unittest.TestCase):
'channel'
)
# Verify the grad operation
dims = (2, 3, 4, 5)
gdata = numpy.arange(
numpy.product(dims),
dtype='float32'
).reshape(dims)
T.verify_grad(f_gpu, [gdata], rng=numpy.random)
def check_types(graph, graph_gpu):
self._check_types(
graph,
@@ -337,6 +365,8 @@ class test_SoftMax(unittest.TestCase):
theano.sandbox.cuda.dnn.GpuDnnSoftmax
)]) == 1
# Verify that the CPU and GPU implementations return the same results
# up to a tolerance.
self._test_softmax(
x,
x_gpu,
@@ -347,5 +377,53 @@ class test_SoftMax(unittest.TestCase):
check_types
)
mode = mode_with_gpu.including("cudnn")
self._test_softmax(x, x, f_z, f_z, self._cmp, mode, check_types_opt)
mode_w_cudnn = mode_with_gpu.including("cudnn")
self._test_softmax(
x, x, f_z, f_z, self._cmp,
mode_w_cudnn, check_types_opt
)
# Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is
# applied when cudnn is required
y = T.fvector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_with_gpu
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad
)]) == 1)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
# Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is not
# applied when cudnn is excluded or not available
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
y = T.vector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_wo_cudnn
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad
)]) == 0)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 1)