提交 62c81c9c 作者: Arnaud Bergeron

Add v3 features for softmax.

上级 2198fc07
...@@ -1333,15 +1333,17 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1333,15 +1333,17 @@ class GpuDnnSoftmaxBase(DnnBase):
DnnBase.__init__(self) DnnBase.__init__(self)
self.tensor_format = tensor_format self.tensor_format = tensor_format
assert(algo in ('fast', 'accurate')) assert(algo in ('fast', 'accurate', 'log'))
if algo == 'log' and version() < 3000:
raise RuntimeError("Need CuDNN v3 for log-softmax")
self.algo = algo self.algo = algo
assert(mode in ('instance', 'channel')) assert(mode in ('instance', 'channel'))
self.mode = mode self.mode = mode
self.tensor_4d_descs = [softmax_input self.tensor_descs = [softmax_input
for softmax_input in self.softmax_inputs] for softmax_input in self.softmax_inputs]
self.tensor_4d_descs.append('softmax_output') self.tensor_descs.append('softmax_output')
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
if self.direction == 'forward': if self.direction == 'forward':
...@@ -1349,22 +1351,22 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1349,22 +1351,22 @@ class GpuDnnSoftmaxBase(DnnBase):
else: else:
return [shape[1]] return [shape[1]]
def _define_tensor4d_desc(self, name, id): def _define_tensor_desc(self, name, id):
return """ return """
cudnnTensorDescriptor_t %(id)s_%(name)s; cudnnTensorDescriptor_t %(id)s_%(name)s;
""" % dict(name=name, id=id) """ % dict(name=name, id=id)
def _init_tensor4d_desc(self, name, id, fail): def _init_tensor_desc(self, name, id, fail):
return """ return """
%(id)s_%(name)s = NULL; %(id)s_%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) { if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor : %%s",
": %%s", cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
""" % dict(name=name, id=id, fail=fail) """ % dict(name=name, id=id, fail=fail)
def _clean_tensor4d_desc(self, name, id): def _clean_tensor_desc(self, name, id):
return """ return """
if(%(id)s_%(name)s!= NULL) if(%(id)s_%(name)s!= NULL)
cudnnDestroyTensorDescriptor(%(id)s_%(name)s); cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
...@@ -1372,8 +1374,8 @@ if(%(id)s_%(name)s!= NULL) ...@@ -1372,8 +1374,8 @@ if(%(id)s_%(name)s!= NULL)
def c_support_code_struct(self, node, name): def c_support_code_struct(self, node, name):
result = '' result = ''
for id in self.tensor_4d_descs: for id in self.tensor_descs:
result += self._define_tensor4d_desc(name, id) result += self._define_tensor_desc(name, id)
return result return result
def c_init_code_struct(self, node, name, sub): def c_init_code_struct(self, node, name, sub):
...@@ -1381,14 +1383,14 @@ if(%(id)s_%(name)s!= NULL) ...@@ -1381,14 +1383,14 @@ if(%(id)s_%(name)s!= NULL)
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
""" % dict(name=name) """ % dict(name=name)
for id in self.tensor_4d_descs: for id in self.tensor_descs:
result += self._init_tensor4d_desc(name, id, sub['fail']) result += self._init_tensor_desc(name, id, sub['fail'])
return result return result
def c_cleanup_code_struct(self, node, name): def c_cleanup_code_struct(self, node, name):
result = '' result = ''
for id in self.tensor_4d_descs: for id in self.tensor_descs:
result += self._clean_tensor4d_desc(name, id) result += self._clean_tensor_desc(name, id)
return result return result
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
...@@ -1396,43 +1398,31 @@ cudnnStatus_t err%(name)s; ...@@ -1396,43 +1398,31 @@ cudnnStatus_t err%(name)s;
outs, = outputs outs, = outputs
if self.tensor_format == 'b01c': if self.tensor_format == 'b01c':
tensor_format = 1 tensor_format = "CUDNN_TENSOR_NHWC"
else: else:
tensor_format = 0 tensor_format = "CUDNN_TENSOR_NCHW"
if self.mode == 'instance': if self.mode == 'instance':
mode = 1 mode = "CUDNN_SOFTMAX_MODE_INSTANCE"
else: else:
mode = 0 mode = "CUDNN_SOFTMAX_MODE_CHANNEL"
if self.algo == 'fast': if self.algo == 'fast':
algo = 1 algo = "CUDNN_SOFTMAX_FAST"
elif self.algo == 'log':
algo = "CUDNN_SOFTMAX_LOG"
else: else:
algo = 0 algo = "CUDNN_SOFTMAX_ACCURATE"
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(name)s = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
format%(name)s = CUDNN_TENSOR_NHWC;
cudnnSoftmaxAlgorithm_t algo%(name)s = CUDNN_SOFTMAX_ACCURATE;
if (%(algo)d == 1)
algo%(name)s = CUDNN_SOFTMAX_FAST;
cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(name=name, tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables. # Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs): for input_idx, input_name in enumerate(self.softmax_inputs):
result += c_set_tensor4d(ins[input_idx], input_name + "_" + name, result += """
"err" + name, sub['fail']) if (c_set_tensorNd(%(t)s, %(desc)s) != 0)
%(fail)s
""" % dict(t=ins[input_idx], desc=input_name + "_" + name, fail=sub['fail'])
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'], subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
name=name) name=name, algo=algo, mode=mode)
for idx, softmax_input in enumerate(self.softmax_inputs): for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input subs['name%d' % idx] = softmax_input
...@@ -1446,10 +1436,9 @@ if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s), ...@@ -1446,10 +1436,9 @@ if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
{ {
%(fail)s %(fail)s
} }
if (c_set_tensorNd(%(outs)s, softmax_output_%(name)s) != 0)
%(fail)s
""" % subs """ % subs
result += c_set_tensor4d(outs,
"softmax_output_" + name,
"err" + name, sub['fail'])
# Add on a call to the method that does the actual work. # Add on a call to the method that does the actual work.
result += self.method() % subs result += self.method() % subs
...@@ -1457,7 +1446,7 @@ if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s), ...@@ -1457,7 +1446,7 @@ if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
return result return result
def c_code_cache_version(self): def c_code_cache_version(self):
return (0, 7, version()) return (0.1, version())
def method(self): def method(self):
raise NotImplementedError('GpuDnnSoftmaxBase::method') raise NotImplementedError('GpuDnnSoftmaxBase::method')
...@@ -1489,24 +1478,13 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1489,24 +1478,13 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
def method(self): def method(self):
return """ return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(name)s,
mode%(name)s,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{ {
const float alpha = 1.; const float alpha = 1.;
const float beta = 0.; const float beta = 0.;
err%(name)s = cudnnSoftmaxForward( err%(name)s = cudnnSoftmaxForward(
_handle, _handle,
algo%(name)s, %(algo)s,
mode%(name)s, %(mode)s,
(void*) &alpha, (void*) &alpha,
softmax_input_%(name)s, softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s), PyGpuArray_DEV_DATA(%(ins)s),
...@@ -1515,7 +1493,6 @@ err%(name)s = cudnnSoftmaxForward( ...@@ -1515,7 +1493,6 @@ err%(name)s = cudnnSoftmaxForward(
PyGpuArray_DEV_DATA(%(outs)s) PyGpuArray_DEV_DATA(%(outs)s)
); );
} }
#endif
""" """
def grad(self, inp, grads): def grad(self, inp, grads):
...@@ -1558,26 +1535,13 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1558,26 +1535,13 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
def method(self): def method(self):
return """ return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(name)s,
mode%(name)s,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{ {
const float alpha = 1.; const float alpha = 1.;
const float beta = 0.; const float beta = 0.;
err%(name)s = cudnnSoftmaxBackward( err%(name)s = cudnnSoftmaxBackward(
_handle, _handle,
algo%(name)s, %(algo)s,
mode%(name)s, %(mode)s,
(void*) &alpha, (void*) &alpha,
%(name1)s_%(name)s, %(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s), PyGpuArray_DEV_DATA(%(ins1)s),
...@@ -1588,8 +1552,7 @@ err%(name)s = cudnnSoftmaxBackward( ...@@ -1588,8 +1552,7 @@ err%(name)s = cudnnSoftmaxBackward(
PyGpuArray_DEV_DATA(%(outs)s) PyGpuArray_DEV_DATA(%(outs)s)
); );
} }
#endif """
"""
# @register_opt('cudnn') # this optimizer is registered in opt.py instead. # @register_opt('cudnn') # this optimizer is registered in opt.py instead.
......
...@@ -175,8 +175,6 @@ def test_pooling(): ...@@ -175,8 +175,6 @@ def test_pooling():
func = T.max func = T.max
else: else:
func = T.mean func = T.mean
if pad != (0, 0) and dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean: if pad != (0, 0) and func is T.mean:
continue continue
...@@ -611,15 +609,9 @@ def test_dnn_conv_alpha_output_merge(): ...@@ -611,15 +609,9 @@ def test_dnn_conv_alpha_output_merge():
lr = numpy.asarray(0.05, dtype='float32') lr = numpy.asarray(0.05, dtype='float32')
if dnn.version() == -1: fr = lr * (conv + out)
# Can't merge alpha with cudnn v1 wr = kern + lr * gw
fr = conv + out ir = img + lr * gi
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu) f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op, assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
...@@ -656,9 +648,6 @@ def test_dnn_conv_alpha_output_merge(): ...@@ -656,9 +648,6 @@ def test_dnn_conv_alpha_output_merge():
def test_dnn_conv_grad(): def test_dnn_conv_grad():
if not dnn.dnn_available() or dnn.version() == -1:
raise SkipTest('alpha != 1.0 not supported in cudnn v1')
b = 1 b = 1
c = 4 c = 4
f = 3 f = 3
...@@ -696,7 +685,7 @@ def test_dnn_conv_grad(): ...@@ -696,7 +685,7 @@ def test_dnn_conv_grad():
def test_version(): def test_version():
if not dnn.dnn_available(): if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
assert isinstance(dnn.version(), (int, tuple)) assert isinstance(dnn.version(), int)
class test_SoftMax(test_nnet.test_SoftMax): class test_SoftMax(test_nnet.test_SoftMax):
...@@ -705,7 +694,7 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -705,7 +694,7 @@ class test_SoftMax(test_nnet.test_SoftMax):
mode = mode_with_gpu mode = mode_with_gpu
def test_softmax_shape_0(self): def test_softmax_shape_0(self):
raise SkipTest("Cudnn do not suport 0 shapes") raise SkipTest("Cudnn doesn't suport 0 shapes")
def test_softmax_grad(self): def test_softmax_grad(self):
def cmp(n, m, f, f_gpu): def cmp(n, m, f, f_gpu):
...@@ -758,18 +747,20 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -758,18 +747,20 @@ class test_SoftMax(test_nnet.test_SoftMax):
mode=mode_with_gpu mode=mode_with_gpu
) )
sorted_f = f.maker.fgraph.toposort() sorted_f = f.maker.fgraph.toposort()
assert(len([i # Optimization is disabled for cudnn v3 rc1
for i in sorted_f if dnn.version() == 2000:
if isinstance( assert(len([i
i.op, for i in sorted_f
self.gpu_grad_op if isinstance(
)]) == 1) i.op,
assert(len([i self.gpu_grad_op
for i in sorted_f )]) == 1)
if isinstance( assert(len([i
i.op, for i in sorted_f
theano.tensor.nnet.SoftmaxGrad if isinstance(
)]) == 0) i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
# Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
# optimization is not applied when cudnn is excluded or not # optimization is not applied when cudnn is excluded or not
...@@ -801,15 +792,17 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -801,15 +792,17 @@ class test_SoftMax(test_nnet.test_SoftMax):
o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2) o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2)
f = theano.function([y], o, mode=mode_with_gpu) f = theano.function([y], o, mode=mode_with_gpu)
sorted_f = f.maker.fgraph.toposort() sorted_f = f.maker.fgraph.toposort()
assert(len([i if dnn.version() == 2000:
for i in sorted_f # opt disabled for cudnn v3 rc1
if isinstance( assert(len([i
i.op, for i in sorted_f
self.gpu_grad_op if isinstance(
)]) == 1) i.op,
assert(len([i self.gpu_grad_op
for i in sorted_f )]) == 1)
if isinstance( assert(len([i
i.op, for i in sorted_f
theano.tensor.nnet.SoftmaxGrad if isinstance(
)]) == 0) i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
...@@ -346,7 +346,6 @@ class test_SoftMax(unittest.TestCase): ...@@ -346,7 +346,6 @@ class test_SoftMax(unittest.TestCase):
return f, f_gpu return f, f_gpu
def _cmp(self, n, m, f, f_gpu): def _cmp(self, n, m, f, f_gpu):
# print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
...@@ -369,8 +368,6 @@ class test_SoftMax(unittest.TestCase): ...@@ -369,8 +368,6 @@ class test_SoftMax(unittest.TestCase):
self._cmp self._cmp
) )
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu) self._cmp(2 << 15, 5, f, f_gpu)
def test_softmax_shape_0(self): def test_softmax_shape_0(self):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论