提交 ddb712a5 authored 作者: Dustin Webb's avatar Dustin Webb

Implemented grad for cudnn softmax.

上级 c416c5eb
......@@ -849,7 +849,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
return GpuDnnPool()(img, desc)
class GpuDnnSoftmax(DnnBase):
class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
......@@ -873,46 +873,57 @@ class GpuDnnSoftmax(DnnBase):
assert(mode in ('instance', 'channel'))
self.mode = mode
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
self.tensor_4d_descs = [softmax_input
for softmax_input in self.softmax_inputs]
self.tensor_4d_descs.append('softmax_output')
def c_support_code_struct(self, node, struct_id):
def _define_tensor4d_desc(self, name, id):
return """
cudnnTensor4dDescriptor_t softmax_input_%(id)d;
cudnnTensor4dDescriptor_t softmax_output_%(id)d;
""" % dict(id=struct_id)
cudnnTensor4dDescriptor_t %(name)s_%(id)d;
""" % dict(name=name, id=id)
def c_init_code_struct(self, node, struct_id, sub):
def _init_tensor4d_desc(self, name, id, fail):
return """
softmax_input_%(id)d = NULL;
softmax_output_%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
%(name)s_%(id)d = NULL;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&%(name)s_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])
""" % dict(name=name, id=id, fail=fail)
def c_cleanup_code_struct(self, node, struct_id):
def _clean_tensor4d_desc(self, name, id):
return """
if(softmax_input_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
if(%(name)s_%(id)d!= NULL)
cudnnDestroyTensor4dDescriptor(%(name)s_%(id)d);
""" % dict(name=name, id=id)
def c_support_code_struct(self, node, struct_id):
result = ''
for name in self.tensor_4d_descs:
result += self._define_tensor4d_desc(name, struct_id)
return result
if(softmax_output_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
def c_init_code_struct(self, node, struct_id, sub):
result = """
cudnnStatus_t err%(id)d;
""" % dict(id=struct_id)
for name in self.tensor_4d_descs:
result += self._init_tensor4d_desc(name, struct_id, sub['fail'])
return result
def c_cleanup_code_struct(self, node, struct_id):
result = ''
for name in self.tensor_4d_descs:
result += self._clean_tensor4d_desc(name, struct_id)
return result
def c_code(self, node, name, inputs, outputs, sub):
ins, = inputs
ins = inputs
if type(inputs) is not list:
ins = [ins]
outs, = outputs
if self.tensor_format == 'b01c':
......@@ -930,7 +941,8 @@ if(softmax_output_%(id)d != NULL)
else:
algo = 0
return """
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
......@@ -943,14 +955,19 @@ if (%(algo)d == 1)
cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(id=sub['struct_id'], name=name,
tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs):
result += """
if (!CudaNdarray_is_c_contiguous(%(ins)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptor(
softmax_input_%(id)d,
%(input_name)s_%(id)d,
format%(id)d,
CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(ins)s)[0],
......@@ -959,11 +976,15 @@ err%(name)s = cudnnSetTensor4dDescriptor(
CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%%%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(id=sub['struct_id'], name=name, input_name=input_name,
ins=ins[input_idx], fail=sub['fail'])
# Build and prepare the output variable.
result += """
if (CudaNdarray_prep_output(&%(outs)s, 4, CudaNdarray_HOST_DIMS(%(ins)s)) != 0)
{
%(fail)s
......@@ -979,11 +1000,41 @@ err%(name)s = cudnnSetTensor4dDescriptor(
CudaNdarray_HOST_DIMS(%(outs)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%%%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
"""
# Add on a call to the method that does the actual work.
result += self.method()
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
id=sub['struct_id'], name=name)
for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input
subs['ins%d' % idx] = inputs[idx]
return result % subs
def c_code_cache_version(self):
return (0, 6)
def method(self):
raise NotImplementedError('GpuDnnSoftmaxBase::method')
class GpuDnnSoftmax(GpuDnnSoftmaxBase):
softmax_inputs = ['softmax_input']
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def method(self):
return """
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(id)d,
......@@ -993,11 +1044,43 @@ err%(name)s = cudnnSoftmaxForward(
softmax_output_%(id)d,
CudaNdarray_DEV_DATA(%(outs)s)
);
""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)
"""
def c_code_cache_version(self):
return (0, 3)
def grad(self, inp, grads):
x, = inp
g_sm, = grads
sm = self.make_node(x).outputs[0]
return [GpuDnnSoftmaxGrad(
self.tensor_format,
self.algo,
self.mode
)(g_sm, sm)]
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
softmax_inputs = ['softmax_var', 'softmax_input']
def make_node(self, dy, sm):
dy = as_cuda_ndarray_variable(dy)
sm = as_cuda_ndarray_variable(sm)
assert dy.ndim == 4
assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type.make_variable()])
def method(self):
return """
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(id)d,
mode%(id)d,
%(name1)s_%(id)d,
CudaNdarray_DEV_DATA(%(ins1)s),
%(name0)s_%(id)d,
CudaNdarray_DEV_DATA(%(ins0)s),
softmax_output_%(id)d,
CudaNdarray_DEV_DATA(%(outs)s)
);
"""
# We need this since other stuff from opt is not importable.
......
......@@ -320,6 +320,13 @@ class test_SoftMax(unittest.TestCase):
'channel'
)
# Verify the grad operation
n = 3
m = 5
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
gdata = numpy.asarray(data)[:, :, None, None]
T.verify_grad(f_gpu, [gdata], rng=numpy.random)
def check_types(graph, graph_gpu):
self._check_types(
graph,
......@@ -337,6 +344,8 @@ class test_SoftMax(unittest.TestCase):
theano.sandbox.cuda.dnn.GpuDnnSoftmax
)]) == 1
# Verify that the CPU and GPU implementations return the same results
# up to a tolerance.
self._test_softmax(
x,
x_gpu,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论