提交 f3b5fe3b authored 作者: Dustin Webb's avatar Dustin Webb

Implemented cuDNN softmax but it is not currently passing tests.

上级 239b6d80
...@@ -4,16 +4,18 @@ import os ...@@ -4,16 +4,18 @@ import os
import theano import theano
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof.type import CDataType from theano.gof.type import CDataType
from theano.compat import PY3
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3 from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
class DnnBase(GpuOp): class DnnBase(GpuOp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
...@@ -360,6 +362,7 @@ class GpuDnnConvGradI(GpuDnnConvBase): ...@@ -360,6 +362,7 @@ class GpuDnnConvGradI(GpuDnnConvBase):
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous, from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer) gpu_optimizer)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv'): conv_mode='conv'):
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -368,6 +371,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -368,6 +371,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(img.shape, kerns.shape) conv_mode=conv_mode)(img.shape, kerns.shape)
return GpuDnnConv()(img, kerns, desc) return GpuDnnConv()(img, kerns, desc)
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn(node): def local_conv_dnn(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
...@@ -380,3 +384,182 @@ def local_conv_dnn(node): ...@@ -380,3 +384,182 @@ def local_conv_dnn(node):
border_mode=border_mode, subsample=subsample)] border_mode=border_mode, subsample=subsample)]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn') gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
class GpuDnnSoftmax(GpuOp):
    """
    Op for the cuDNN Softmax.

    Parameters
    ----------
    tensor_format
        Layout of the 4d input: 'bc01' (NCHW) or 'b01c' (NHWC).
    algo
        'fast' or 'accurate', indicating whether computations should be
        optimized for speed or accuracy respectively.
    mode
        'instance' to compute the softmax per image across C, H and W, or
        'channel' to compute it per spatial location across the channels
        (maps to CUDNN_SOFTMAX_MODE_INSTANCE / CUDNN_SOFTMAX_MODE_CHANNEL).
    """
    def __init__(self, tensor_format, algo, mode):
        assert tensor_format in ('bc01', 'b01c')
        self.tensor_format = tensor_format
        assert algo in ('fast', 'accurate')
        self.algo = algo
        assert mode in ('instance', 'channel')
        self.mode = mode

    def __eq__(self, other):
        # The parameters must participate in equality: ops configured
        # differently generate different C code and must not be merged
        # by the graph optimizer.
        return (type(self) == type(other) and
                self.tensor_format == other.tensor_format and
                self.algo == other.algo and
                self.mode == other.mode)

    def __hash__(self):
        # Keep consistent with __eq__.
        return hash((type(self), self.tensor_format, self.algo, self.mode))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        # The op only supports 4d CUDA ndarray inputs; the output has the
        # same type (and therefore the same broadcastable pattern).
        x = as_cuda_ndarray_variable(x)
        assert x.ndim == 4
        return Apply(self, [x], [x.type()])

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def c_libraries(self):
        return ['cudnn']

    def c_support_code_struct(self, node, struct_id):
        # One cuDNN handle and a pair of tensor descriptors per apply node.
        return """
cudnnHandle_t softmax_handle_%(id)d;
cudnnTensor4dDescriptor_t softmax_input_%(id)d;
cudnnTensor4dDescriptor_t softmax_output_%(id)d;
""" % dict(id=struct_id)

    def c_init_code_struct(self, node, struct_id, sub):
        return """
softmax_handle_%(id)d = NULL;
softmax_input_%(id)d = NULL;
softmax_output_%(id)d = NULL;

cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreate(&softmax_handle_%(id)d)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
               cudnnGetErrorString(err%(id)d));
  %(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_input_%(id)d)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
               "(inp): %%s", cudnnGetErrorString(err%(id)d));
  %(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&softmax_output_%(id)d)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
               "(out): %%s", cudnnGetErrorString(err%(id)d));
  %(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])

    def c_cleanup_code_struct(self, node, struct_id):
        return """
if(softmax_input_%(id)d != NULL)
  cudnnDestroyTensor4dDescriptor(softmax_input_%(id)d);
if(softmax_output_%(id)d != NULL)
  cudnnDestroyTensor4dDescriptor(softmax_output_%(id)d);
if(softmax_handle_%(id)d != NULL)
  cudnnDestroy(softmax_handle_%(id)d);
""" % dict(id=struct_id)

    def c_code(self, node, name, inputs, outputs, sub):
        ins, = inputs
        outs, = outputs

        # Encode the op parameters as integers substituted into the C code;
        # the C side maps them back onto the cuDNN enum values.
        if self.tensor_format == 'b01c':
            tensor_format = 1
        else:
            tensor_format = 0

        if self.mode == 'channel':
            mode = 1
        else:
            mode = 0

        if self.algo == 'fast':
            algo = 1
        else:
            algo = 0

        return """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(id)d = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
  format%(id)d = CUDNN_TENSOR_NHWC;

cudnnSoftmaxAlgorithm_t algo%(id)d = CUDNN_SOFTMAX_ACCURATE;
if (%(algo)d == 1)
  algo%(id)d = CUDNN_SOFTMAX_FAST;

cudnnSoftmaxMode_t mode%(id)d = CUDNN_SOFTMAX_MODE_INSTANCE;
if (%(mode)d == 1)
  mode%(id)d = CUDNN_SOFTMAX_MODE_CHANNEL;

err%(name)s = cudnnSetTensor4dDescriptor(
  softmax_input_%(id)d,
  format%(id)d,
  CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(ins)s)[0],
  CudaNdarray_HOST_DIMS(%(ins)s)[1],
  CudaNdarray_HOST_DIMS(%(ins)s)[2],
  CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
}

// The output has the same shape and layout as the input.
err%(name)s = cudnnSetTensor4dDescriptor(
  softmax_output_%(id)d,
  format%(id)d,
  CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(ins)s)[0],
  CudaNdarray_HOST_DIMS(%(ins)s)[1],
  CudaNdarray_HOST_DIMS(%(ins)s)[2],
  CudaNdarray_HOST_DIMS(%(ins)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
}

// CudaNdarray_prep_output sets a Python exception on failure.
if (CudaNdarray_prep_output(&(%(outs)s), 4,
                            CudaNdarray_HOST_DIMS(%(ins)s)) != 0) {
  %(fail)s
}

err%(name)s = cudnnSoftmaxForward(
  softmax_handle_%(id)d,
  algo%(id)d,
  mode%(id)d,
  softmax_input_%(id)d,
  CudaNdarray_DEV_DATA(%(ins)s),
  softmax_output_%(id)d,
  CudaNdarray_DEV_DATA(%(outs)s)
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "error during softmax operation: %%s",
               cudnnGetErrorString(err%(name)s));
  %(fail)s
}
""" % dict(ins=ins, outs=outs, tensor_format=tensor_format, mode=mode,
           algo=algo, fail=sub['fail'], id=sub['struct_id'], name=name)

    def c_code_cache_version(self):
        # Bumped because the generated C code changed (error checking added).
        return (1,)
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
    """Replace a GpuSoftmax node by the cuDNN softmax op.

    The 2d softmax input is viewed as a (1, 1, rows, cols) tensor so it
    can be fed to the 4d cuDNN op, and the result is reshaped back to 2d.
    """
    if not isinstance(node.op, GpuSoftmax):
        return
    four_d = node.inputs[0].dimshuffle('x', 'x', 0, 1)
    softmaxed = GpuDnnSoftmax('bc01', 'accurate', 'instance')(four_d)
    two_d = as_cuda_ndarray_variable(softmaxed.dimshuffle(2, 3))
    return [two_d]

gpu_optimizer.register("softmax_cudnn", local_softmax_dnn, 'cudnn')
...@@ -208,42 +208,65 @@ def test_softmax_with_bias(): ...@@ -208,42 +208,65 @@ def test_softmax_with_bias():
cmp(128, 64 * 1024) cmp(128, 64 * 1024)
def test_softmax(): def _test_softmax(x, x_gpu, f_z, f_gpu_z, cpu_type, gpu_type, cmp, topo_idx):
""" """
This is basic test for GpuSoftmax This is basic test for GpuSoftmax and GpuDnnSoftmax
We check that we loop when their is too much block We check that we loop when their is too much block
We use slower code when there isn't enough shared memory We use slower code when there isn't enough shared memory
""" """
x = T.fmatrix('x') f_z_out = f_z(x)
f_gpu_z_out = f_gpu_z(x_gpu)
f = theano.function([x], f_z_out, mode=mode_without_gpu)
f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=mode_with_gpu)
assert isinstance(f.maker.fgraph.toposort()[-1].op, cpu_type)
assert isinstance(f_gpu.maker.fgraph.toposort()[topo_idx].op, gpu_type)
z = T.nnet.softmax(x) #we need to test n>32*1024 to check that we make the block loop.
f = theano.function([x], z, mode=mode_without_gpu) cmp(2, 5, f, f_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu) cmp(2 << 15, 5, f, f_gpu)
assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax cmp(4074, 400, f, f_gpu)
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, cmp(0, 10, f, f_gpu)
cuda.nnet.GpuSoftmax) cmp(784, 784, f, f_gpu)
cmp(4, 1000, f, f_gpu)
cmp(4, 1024, f, f_gpu)
cmp(4, 2000, f, f_gpu)
cmp(4, 2024, f, f_gpu)
# The GTX285 don't have enough shared memory.
cmp(4, 4074, f, f_gpu)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000, f, f_gpu)
cmp(128, 16 * 1024, f, f_gpu)
cmp(128, 64 * 1024, f, f_gpu)
def cmp(n, m):
def test_softmax():
def cmp(n, m, f, f_gpu):
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
#we need to test n>32*1024 to check that we make the block loop. x = T.fmatrix('x')
cmp(2, 5) z = T.nnet.softmax
cmp(2 << 15, 5) _test_softmax(x, x, z, z, type(z), cuda.nnet.GpuSoftmax, cmp, -2)
cmp(4074, 400)
cmp(0, 10)
cmp(784, 784) def test_cudnn_softmax():
cmp(4, 1000) def cmp(n, m, f, f_gpu):
cmp(4, 1024) #print "test_softmax",n,m
cmp(4, 2000) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
cmp(4, 2024) out = f(data)
# The GTX285 don't have enough shared memory. gout = f_gpu(data.reshape(1, 1, n, m)).reshape((n, m))
cmp(4, 4074) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000) x = T.matrix('x')
cmp(128, 16 * 1024) x_gpu = T.tensor4('x_gpu')
cmp(128, 64 * 1024) f_z = T.nnet.softmax
f_gpu = theano.sandbox.cuda.dnn.GpuDnnSoftmax(
'bc01',
'accurate',
'instance'
)
_test_softmax(x, x_gpu, f_z, f_gpu, type(f_z), type(f_gpu), cmp, -1)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论