提交 50968f7a authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2112 from nouiz/abergeron-dnn

Abergeron dnn
......@@ -99,7 +99,7 @@ There are less methods to define for an Op than for a Type:
module is initialized, before anything else is executed and is
specialized for a particular apply of an :ref:`op`.
.. method:: c_init_code_struct(node, struct_id)
.. method:: c_init_code_struct(node, struct_id, sub)
Allows you to specify code that will be inserted in the struct
constructor of the Op. This is for code which should be
......@@ -108,6 +108,16 @@ There are less methods to define for an Op than for a Type:
`struct_id` is an integer guaranteed to be unique inside the
struct.
`sub` is a dictionary of extra parameters to the
c_init_code_struct method. It contains the following
values:
``sub['fail']``
A string of code that you should execute (after ensuring
that a python exception is set) if your C code needs to
raise an exception.
.. method:: c_support_code()
Allows you to specify helper functions/structs that the
......
......@@ -94,6 +94,13 @@ TODO: Give examples on how to use these things! They are pretty complicated.
f = theano.function(..., mode=mode)
- :func:`GpuDnnConv <theano.sandbox.cuda.dnn.GpuDnnConv>` GPU-only
convolution using NVIDIA's cuDNN library. To enable it (and
other cudnn-accelerated ops), set
``THEANO_FLAGS=optimizer_including=cudnn`` in your environment.
This requires that you have cuDNN installed and available. It
also requires a GPU with compute capability 3.0 or more.
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`
3D Convolution applying multi-channel 3D filters to batches of
multi-channel 3D images.
......
......@@ -143,6 +143,18 @@ def failure_code(sub):
goto __label_%(id)i;}''' % sub
def failure_code_init(sub):
    """Code for failure in the struct init.

    Builds the C snippet emitted when a step of the struct constructor
    fails: it guarantees a Python exception is set (installing a generic
    RuntimeError when the Op's C code forgot to set one) and then
    returns the failure id from init() so the caller can detect the
    error.

    :param sub: dict with an 'id' entry, the integer failure code
                substituted into the generated C.
    """
    template = '''{
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
        }
        return %(id)d;
}'''
    return template % sub
def code_gen(blocks):
"""WRITEME From a list of L{CodeBlock} instances, returns a string
that executes them all in sequence. eg for C{(decl1, task1,
......@@ -205,8 +217,7 @@ def struct_gen(args, struct_builders, blocks, sub):
# be executed if any step in the constructor fails and the
# latter only at destruction time.
struct_decl += block.declare
struct_init_head = struct_init_head + ("\n{\n%s" % block.behavior)
struct_init_tail = ("%s\n}\n" % block.cleanup) + struct_init_tail
struct_init_head = struct_init_head + ("\n%s" % block.behavior)
struct_cleanup += block.cleanup
behavior = code_gen(blocks)
......@@ -258,6 +269,7 @@ def struct_gen(args, struct_builders, blocks, sub):
# TODO: add some error checking to make sure storage_<x> are
# 1-element lists and __ERROR is a 3-elements list.
struct_code = """
namespace {
struct %(name)s {
......@@ -274,13 +286,9 @@ def struct_gen(args, struct_builders, blocks, sub):
int init(PyObject* __ERROR, %(args_decl)s) {
%(storage_incref)s
%(storage_set)s
int %(failure_var)s = 0;
%(struct_init_head)s
this->__ERROR = __ERROR;
return 0;
%(struct_init_tail)s
%(storage_decref)s
%(do_return)s
}
void cleanup(void) {
%(struct_cleanup)s
......@@ -333,7 +341,7 @@ def get_c_init(r, name, sub):
def get_c_extract(r, name, sub):
"""Wrapper around c_extract that initializes py_name from storage."""
if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in
if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in
r.clients]):
c_extract = r.type.c_extract(name, sub, True)
......@@ -419,7 +427,7 @@ def struct_variable_codeblocks(variable, policies, id, symbol_table, sub):
sub = dict(sub)
# sub['name'] = name
sub['id'] = id
sub['fail'] = failure_code(sub)
sub['fail'] = failure_code_init(sub)
sub['py_ptr'] = "py_%s" % name
sub['stor_ptr'] = "storage_%s" % name
# struct_declare, struct_behavior, struct_cleanup, sub)
......@@ -530,9 +538,8 @@ class CLinker(link.Linker):
failure_var = "__failure"
id = 1
sub = dict(failure_var=failure_var)
for variable in self.variables:
sub = dict(failure_var=failure_var)
# it might be possible to inline constant variables as C literals
# policy = [[what to declare in the struct,
......@@ -634,6 +641,10 @@ class CLinker(link.Linker):
sub['struct_id'] = id + 1
sub['fail'] = failure_code(sub)
sub_struct = dict()
sub_struct['id'] = id + 1
sub_struct['fail'] = failure_code_init(sub)
struct_support = ""
struct_init = ""
struct_cleanup = ""
......@@ -661,7 +672,7 @@ class CLinker(link.Linker):
" didn't return a string for c_init_code_apply")
try:
struct_init = op.c_init_code_struct(node, id + 1)
struct_init = op.c_init_code_struct(node, id + 1, sub_struct)
assert isinstance(struct_init, basestring), (
str(node.op) +
" didn't return a string for c_init_code_struct")
......@@ -1418,7 +1429,10 @@ class CLinker(link.Linker):
print >> code, ' return NULL;'
print >> code, ' }'
print >> code, ' %(struct_name)s* struct_ptr = new %(struct_name)s();' % locals()
print >> code, ' struct_ptr->init(', ','.join('PyTuple_GET_ITEM(argtuple, %i)' % n for n in xrange(n_args)), ');'
print >> code, ' if (struct_ptr->init(', ','.join('PyTuple_GET_ITEM(argtuple, %i)' % n for n in xrange(n_args)), ') != 0) {'
print >> code, ' delete struct_ptr;'
print >> code, ' return NULL;'
print >> code, ' }'
if PY3:
print >> code, """\
PyObject* thunk = PyCapsule_New((void*)(&{struct_name}_executor), NULL, {struct_name}_destructor);
......
......@@ -319,7 +319,7 @@ class CLinkerOp(CLinkerObject):
raise utils.MethodNotDefined("c_init_code_apply", type(self),
self.__class__.__name__)
def c_init_code_struct(self, node, struct_id):
def c_init_code_struct(self, node, struct_id, sub):
"""
Optional: return a code string specific to the apply
to be inserted in the struct initialization code.
......@@ -331,6 +331,11 @@ class CLinkerOp(CLinkerObject):
sub parameter named struct_id that will
contain this name.
:param sub: a dictionary of values to substitute in the code.
Most notably it contains a 'fail' entry that you
should place in your code after setting a python
exception to indicate an error.
:Exceptions:
- `MethodNotDefined`: the subclass does not override this method
"""
......
......@@ -89,7 +89,7 @@ class StructOp(Op):
def c_support_code_struct(self, node, struct_id):
return "npy_uint64 counter%d;" % (struct_id,)
def c_init_code_struct(self, node, struct_id):
def c_init_code_struct(self, node, struct_id, sub):
return "counter%d = 0;" % (struct_id,)
def c_code(self, node, name, input_names, outputs_names, sub):
......
......@@ -155,7 +155,8 @@ if compile_cuda_ndarray and cuda_available:
'cuda_ndarray',
code,
location=cuda_ndarray_loc,
include_dirs=[cuda_path], libs=[config.cublas.lib],
include_dirs=[cuda_path],
libs=[config.cublas.lib],
preargs=['-O3'] + compiler.compile_args())
from cuda_ndarray.cuda_ndarray import *
except Exception, e:
......@@ -262,8 +263,8 @@ if cuda_available:
shared_constructor = float32_shared_constructor
import basic_ops
from basic_ops import (
from . import basic_ops
from .basic_ops import (
GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
......@@ -273,11 +274,11 @@ if cuda_available:
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
tensor3, tensor4)
from basic_ops import (host_from_gpu, gpu_from_host,
from .basic_ops import (host_from_gpu, gpu_from_host,
as_cuda_array, as_cuda_ndarray_variable)
import opt
import cuda_ndarray
from rng_curand import CURAND_RandomStreams
from . import opt, dnn
from .rng_curand import CURAND_RandomStreams
def use(device,
......
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H

#include <cudnn.h>

/* Translate a cuDNN status code into a human-readable message.
 * Early cuDNN releases did not ship cudnnGetErrorString, so this header
 * provides a drop-in replacement for use in error reporting.
 * NOTE(review): newer cuDNN versions define their own
 * cudnnGetErrorString; a version guard may be needed — confirm against
 * the cuDNN releases this code is expected to support. */
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
  switch (err) {
  case CUDNN_STATUS_SUCCESS:
    return "The operation completed successfully.";
  case CUDNN_STATUS_NOT_INITIALIZED:
    /* Fixed typos in the message: "initialized(Is ... enought?)" */
    return "The handle was not initialized (is your driver recent enough?).";
  case CUDNN_STATUS_ALLOC_FAILED:
    /* Fixed typo: "Ressource" -> "Resource". */
    return "Resource allocation failed inside the library.";
  case CUDNN_STATUS_BAD_PARAM:
    return "An incorrect value was passed in.";
  case CUDNN_STATUS_ARCH_MISMATCH:
    return "The current GPU does not support the required features (only cc 3.0+ are supported).";
  case CUDNN_STATUS_MAPPING_ERROR:
    return "An access to GPU memory space failed (probably due to a failure to bind texture).";
  case CUDNN_STATUS_EXECUTION_FAILED:
    return "A kernel failed to execute.";
  case CUDNN_STATUS_INTERNAL_ERROR:
    return "An internal cuDNN operation failed.";
  case CUDNN_STATUS_NOT_SUPPORTED:
    return "The combination of parameters is not currently supported.";
  default:
    return "Unknown error code.";
  }
}

#endif
import copy
import os
import theano
from theano import Apply
from theano import tensor
from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv
class GpuDnnConv(GpuOp):
    """GPU-only 2D convolution implemented on top of NVIDIA's cuDNN.

    Takes two 4D CudaNdarray inputs (a batch of images and a bank of
    filters) and produces a 4D output.  Only the 'valid' and 'full'
    border modes and unit subsampling are supported; inputs must be
    C-contiguous (enforced at runtime in the generated C code).

    :param border_mode: one of 'valid' or 'full'.
    """
    __props__ = ('border_mode',)

    def __init__(self, border_mode):
        # 'valid' or 'full'; validated lazily in c_code().
        self.border_mode = border_mode

    def make_node(self, img, kern):
        """Build the Apply node; both inputs must be 4D tensors.

        :raises TypeError: if either input is not 4D.
        """
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output is (batch, n_filters, H, W); only the broadcast pattern
        # of the first two dimensions can be inferred at graph time.
        broadcastable = (img.type.broadcastable[0],
                         kern.type.broadcastable[0],
                         False, False)
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']

    def c_header_dirs(self):
        # cudnn_helper.h lives next to this module.
        return [os.path.dirname(__file__)]

    def c_libraries(self):
        return ['cudnn']

    def c_support_code_struct(self, node, struct_id):
        # Per-apply cuDNN handle and descriptors, stored as fields of
        # the thunk struct and reused across calls.
        return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d;
cudnnConvolutionDescriptor_t op%(id)d;
""" % dict(id=struct_id)

    def c_init_code_struct(self, node, struct_id, sub):
        # Allocate the cuDNN handle and descriptors once, at struct
        # construction time; sub['fail'] aborts init() on error.
        return """
handle%(id)d = NULL;
input%(id)d = NULL;
output%(id)d = NULL;
kerns%(id)d = NULL;
op%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreate(&handle%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&output%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])

    def c_cleanup_code_struct(self, node, struct_id):
        # Mirror of c_init_code_struct: release descriptors then handle.
        return """
cudnnDestroyTensor4dDescriptor(input%(id)d);
cudnnDestroyTensor4dDescriptor(output%(id)d);
cudnnDestroyFilterDescriptor(kerns%(id)d);
cudnnDestroyConvolutionDescriptor(op%(id)d);
cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id)

    def c_code(self, node, name, inputs, outputs, sub):
        """Generate the per-call C code: set descriptors, allocate the
        output, and run cudnnConvolutionForward.
        """
        img, kern = inputs
        out, = outputs
        # 'valid' means no padding; 'full' pads by kernel_size - 1.
        if self.border_mode == "valid":
            bmode = 1
        else:
            assert self.border_mode == "full"
            bmode = 0
        return """
cudnnStatus_t err%(name)s;
int pad_w%(name)s;
int pad_h%(name)s;
if (!CudaNdarray_is_c_contiguous(%(img)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (!CudaNdarray_is_c_contiguous(%(kerns)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous filters are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
input%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(img)s)[0],
CudaNdarray_HOST_DIMS(%(img)s)[1],
CudaNdarray_HOST_DIMS(%(img)s)[2],
CudaNdarray_HOST_DIMS(%(img)s)[3],
CudaNdarray_HOST_STRIDES(%(img)s)[0],
CudaNdarray_HOST_STRIDES(%(img)s)[1],
CudaNdarray_HOST_STRIDES(%(img)s)[2],
CudaNdarray_HOST_STRIDES(%(img)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnSetFilterDescriptor(
kerns%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(kerns)s)[0],
CudaNdarray_HOST_DIMS(%(kerns)s)[1],
CudaNdarray_HOST_DIMS(%(kerns)s)[2],
CudaNdarray_HOST_DIMS(%(kerns)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1;
pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
err%(name)s = cudnnSetConvolutionDescriptor(
op%(id)d, input%(id)d, kerns%(id)d,
pad_h%(name)s,
pad_w%(name)s,
1, 1, 1, 1,
CUDNN_CONVOLUTION
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
{
int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim(
op%(id)d, CUDNN_CONVOLUTION_FWD,
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not get output dims: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
}
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
output%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(out)s)[0],
CudaNdarray_HOST_DIMS(%(out)s)[1],
CudaNdarray_HOST_DIMS(%(out)s)[2],
CudaNdarray_HOST_DIMS(%(out)s)[3],
CudaNdarray_HOST_STRIDES(%(out)s)[0],
CudaNdarray_HOST_STRIDES(%(out)s)[1],
CudaNdarray_HOST_STRIDES(%(out)s)[2],
CudaNdarray_HOST_STRIDES(%(out)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d,
output%(id)d, CudaNdarray_DEV_DATA(%(out)s),
CUDNN_RESULT_NO_ACCUMULATE
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
           fail=sub['fail'], id=sub['struct_id'], name=name)

    def c_code_cache_version(self):
        # Bumped (4 -> 5): the generated C changed (corrected the
        # copy-pasted "could not set op descriptor" error message after
        # cudnnGetOutputTensor4dDim), so cached modules must recompile.
        return (5,)
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer)
@local_optimizer([GpuConv])
def local_conv_dnn(node):
    """Rewrite a GpuConv node into the cuDNN-backed GpuDnnConv.

    Only fires for unit subsampling and 'full'/'valid' border modes;
    returns None (no replacement) otherwise.
    """
    if not isinstance(node.op, GpuConv):
        return
    if node.op.subsample != (1, 1):
        return
    if node.op.border_mode not in ['full', 'valid']:
        return
    img, kern = node.inputs
    replacement = GpuDnnConv(node.op.border_mode)(gpu_contiguous(img),
                                                  gpu_contiguous(kern))
    return [replacement]

gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
......@@ -26,6 +26,8 @@ from theano.sandbox import cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv
# Needed because the GPU conv ops don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -615,14 +617,13 @@ def test_valid_9_10():
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid(conv_gemm=False):
def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng()
shapes = get_valid_shapes()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
version = [-2, -1, 6]
verbose = 0
random = True
......@@ -631,28 +632,31 @@ def test_valid(conv_gemm=False):
if ones:
random = False
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
def test_valid():
_test_valid(None, version=[-2, -1, 6])
def test_gemm_valid():
test_valid(conv_gemm=True)
extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
_test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes)
def test_dnn_valid():
_test_valid(GpuDnnConv, mode=theano_mode.including("cudnn"))
def test_full(conv_gemm=False):
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng()
shapes = get_basic_shapes()
shapes += get_shapes2()
......@@ -707,25 +711,26 @@ def test_full(conv_gemm=False):
]
# shapes=shapes[:277]
version = [-2, -1, 0, 1, 2, 3, 4, 5]
verbose = 0
random = True
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
def test_full():
_test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5])
def test_gemm_full():
test_full(conv_gemm=True)
_test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"))
def test_dnn_full():
_test_full(GpuDnnConv, mode=theano_mode.including("cudnn"))
def test_subsample(conv_gemm=False):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论