提交 52ea2d0c authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2116 from daemonmaker/cudnn

Implemented cuDNN softmax but it is not currently passing tests.
...@@ -4,16 +4,18 @@ import os ...@@ -4,16 +4,18 @@ import os
import theano import theano
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof.type import CDataType from theano.gof.type import CDataType
from theano.compat import PY3
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3 from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
class DnnBase(GpuOp): class DnnBase(GpuOp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
...@@ -360,6 +362,7 @@ class GpuDnnConvGradI(GpuDnnConvBase): ...@@ -360,6 +362,7 @@ class GpuDnnConvGradI(GpuDnnConvBase):
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous, from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer) gpu_optimizer)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv'): conv_mode='conv'):
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -368,6 +371,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -368,6 +371,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(img.shape, kerns.shape) conv_mode=conv_mode)(img.shape, kerns.shape)
return GpuDnnConv()(img, kerns, desc) return GpuDnnConv()(img, kerns, desc)
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn(node): def local_conv_dnn(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
...@@ -380,3 +384,166 @@ def local_conv_dnn(node): ...@@ -380,3 +384,166 @@ def local_conv_dnn(node):
border_mode=border_mode, subsample=subsample)] border_mode=border_mode, subsample=subsample)]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn') gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
class GpuDnnSoftmax(DnnBase):
"""
Op for the cuDNN Softmax.
Parameters''
-tensor_format: Whether the data format is 'bc01' or 'b01c'
-algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
-mode: 'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spationali location '01' per image
across 'c'.
"""
__props__ = ('tensor_format', 'mode', 'algo')
def __init__(self, tensor_format, algo, mode):
assert(tensor_format in ('bc01', 'b01c'))
self.tensor_format = tensor_format
assert(algo in ('fast', 'accurate'))
self.algo = algo
assert(mode in ('instance', 'channel'))
self.mode = mode
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def c_support_code_struct(self, node, struct_id):
return """
cudnnTensor4dDescriptor_t softmax_input_%(id)d;
cudnnTensor4dDescriptor_t softmax_output_%(id)d;
""" % dict(id=struct_id)
def c_init_code_struct(self, node, struct_id, sub):
return """
softmax_input_%(id)d = NULL;
softmax_output_%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])
def c_cleanup_code_struct(self, node, struct_id):
return """
if(softmax_input_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
if(softmax_output_%(id)d != NULL)
cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
""" % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub):
ins, = inputs
outs, = outputs
if self.tensor_format == 'b01c':
tensor_format = 1
else:
tensor_format = 0
if self.mode == 'instance':
mode = 1
else:
mode = 0
if self.algo == 'fast':
algo = 1
else:
algo = 0
return """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
format%(id)d = CUDNN_TENSOR_NHWC;
cudnnSoftmaxAlgorithm_t algo%(id)d = CUDNN_SOFTMAX_ACCURATE;
if (%(algo)d == 1)
algo%(id)d = CUDNN_SOFTMAX_FAST;
cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
if (!CudaNdarray_is_c_contiguous(%(ins)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptor(
softmax_input_%(id)d,
format%(id)d,
CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(ins)s)[0],
CudaNdarray_HOST_DIMS(%(ins)s)[1],
CudaNdarray_HOST_DIMS(%(ins)s)[2],
CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (CudaNdarray_prep_output(&%(outs)s, 4, CudaNdarray_HOST_DIMS(%(ins)s)) != 0)
{
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptor(
softmax_output_%(id)d,
format%(id)d,
CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(outs)s)[0],
CudaNdarray_HOST_DIMS(%(outs)s)[1],
CudaNdarray_HOST_DIMS(%(outs)s)[2],
CudaNdarray_HOST_DIMS(%(outs)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(id)d,
mode%(id)d,
softmax_input_%(id)d,
CudaNdarray_DEV_DATA(%(ins)s),
softmax_output_%(id)d,
CudaNdarray_DEV_DATA(%(outs)s)
);
""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)
def c_code_cache_version(self):
return (0, 3)
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
if isinstance(node.op, GpuSoftmax):
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(gpu_contiguous(ins))
out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
return [out]
gpu_optimizer.register("softmax_cudnn", local_softmax_dnn, 'cudnn')
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy import numpy
import unittest
import theano import theano
from theano.gof.python25 import any from theano.gof.python25 import any
...@@ -208,42 +209,140 @@ def test_softmax_with_bias(): ...@@ -208,42 +209,140 @@ def test_softmax_with_bias():
cmp(128, 64 * 1024) cmp(128, 64 * 1024)
def test_softmax(): class test_SoftMax(unittest.TestCase):
def _test_softmax(
self,
x,
x_gpu,
f_z,
f_gpu_z,
cmp,
gpu_mode,
check_types
):
""" """
This is basic test for GpuSoftmax This is basic test for GpuSoftmax and GpuDnnSoftmax
We check that we loop when their is too much block We check that we loop when their is too much block
We use slower code when there isn't enough shared memory We use slower code when there isn't enough shared memory
""" """
x = T.fmatrix('x') f_z_out = f_z(x)
f_gpu_z_out = f_gpu_z(x_gpu)
z = T.nnet.softmax(x) f = theano.function([x], f_z_out, mode=mode_without_gpu)
f = theano.function([x], z, mode=mode_without_gpu) f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=gpu_mode)
f_gpu = theano.function([x], z, mode=mode_with_gpu) check_types(f, f_gpu)
assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
cuda.nnet.GpuSoftmax)
def cmp(n, m): #we need to test n>32*1024 to check that we make the block loop.
cmp(1, 5, f, f_gpu)
cmp(2, 5, f, f_gpu)
cmp(10, 5, f, f_gpu)
cmp(100, 5, f, f_gpu)
cmp(1000, 5, f, f_gpu)
cmp(10000, 5, f, f_gpu)
cmp(4074, 400, f, f_gpu)
cmp(784, 784, f, f_gpu)
cmp(4, 1000, f, f_gpu)
cmp(4, 1024, f, f_gpu)
cmp(4, 2000, f, f_gpu)
cmp(4, 2024, f, f_gpu)
# The GTX285 don't have enough shared memory.
cmp(4, 4074, f, f_gpu)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000, f, f_gpu)
cmp(128, 16 * 1024, f, f_gpu)
cmp(128, 64 * 1024, f, f_gpu)
# cudnn permits no more than 2^15 - 1 rows
cmp((2 << 15) - 1, 5, f, f_gpu)
cmp(5, 2 << 15, f, f_gpu)
return f, f_gpu
def _cmp(self, n, m, f, f_gpu):
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
#we need to test n>32*1024 to check that we make the block loop. def _check_types(self, graph, graph_gpu, topo_idx, f_type, f_gpu_type):
cmp(2, 5) assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
cmp(2 << 15, 5) assert isinstance(
cmp(4074, 400) graph_gpu.maker.fgraph.toposort()[topo_idx].op,
cmp(0, 10) f_gpu_type
cmp(784, 784) )
cmp(4, 1000)
cmp(4, 1024) def test_softmax(self):
cmp(4, 2000) x = T.fmatrix('x')
cmp(4, 2024) z = T.nnet.softmax
# The GTX285 don't have enough shared memory.
cmp(4, 4074) def check_types(graph, graph_gpu):
# The GTX580, 680 and kepler don't have enough shared memory. self._check_types(
cmp(2, 10000) graph,
cmp(128, 16 * 1024) graph_gpu,
cmp(128, 64 * 1024) -2,
type(z),
cuda.nnet.GpuSoftmax
)
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp,
mode_with_gpu,
check_types
)
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu)
self._cmp(0, 10, f, f_gpu)
def test_cudnn_softmax(self):
def cmp(n, m, f, f_gpu):
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
gdata = numpy.asarray(data)[:, :, None, None]
out = f(data)
gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
x = T.matrix('x', 'float32')
x_gpu = T.tensor4('x_gpu', 'float32')
f_z = T.nnet.softmax
f_gpu = theano.sandbox.cuda.dnn.GpuDnnSoftmax(
'bc01',
'accurate',
'channel'
)
def check_types(graph, graph_gpu):
self._check_types(
graph,
graph_gpu,
-1,
type(f_z),
theano.sandbox.cuda.dnn.GpuDnnSoftmax
)
def check_types_opt(graph, graph_gpu):
assert isinstance(graph.maker.fgraph.toposort()[-1].op, type(f_z))
assert len([n for n in graph_gpu.maker.fgraph.toposort()
if isinstance(
n.op,
theano.sandbox.cuda.dnn.GpuDnnSoftmax
)]) == 1
self._test_softmax(
x,
x_gpu,
f_z,
f_gpu,
cmp,
mode_with_gpu,
check_types
)
mode = mode_with_gpu.including("cudnn")
self._test_softmax(x, x, f_z, f_z, self._cmp, mode, check_types_opt)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论