提交 66e2fcef authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Convolution layer using cudnn.

上级 ee3e49e4
...@@ -33,6 +33,10 @@ AddConfigVar('cublas.lib', ...@@ -33,6 +33,10 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""", """Name of the cuda blas library for the linker.""",
StrParam('cublas')) StrParam('cublas'))
AddConfigVar('cudnn.lib',
"""Name of the cuda dnn library for the linker.""",
StrParam('cudnn'))
#is_nvcc_available called here to initialize global vars in #is_nvcc_available called here to initialize global vars in
#nvcc_compiler module #nvcc_compiler module
nvcc_compiler.is_nvcc_available() nvcc_compiler.is_nvcc_available()
...@@ -155,7 +159,8 @@ if compile_cuda_ndarray and cuda_available: ...@@ -155,7 +159,8 @@ if compile_cuda_ndarray and cuda_available:
'cuda_ndarray', 'cuda_ndarray',
code, code,
location=cuda_ndarray_loc, location=cuda_ndarray_loc,
include_dirs=[cuda_path], libs=[config.cublas.lib], include_dirs=[cuda_path],
libs=[config.cublas.lib, config.cudnn.lib],
preargs=['-O3'] + compiler.compile_args()) preargs=['-O3'] + compiler.compile_args())
from cuda_ndarray.cuda_ndarray import * from cuda_ndarray.cuda_ndarray import *
except Exception, e: except Exception, e:
......
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
#endif #endif
cublasHandle_t handle = NULL; cublasHandle_t handle = NULL;
cudnnHandle_t dnn_handle = NULL;
///////////////////////// /////////////////////////
// Alloc and Free // Alloc and Free
...@@ -3051,6 +3052,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -3051,6 +3052,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init(); static int cublas_init();
static void cublas_shutdown(); static void cublas_shutdown();
static int cudnn_init();
static void cudnn_shutdown();
// Initialize the gpu. // Initialize the gpu.
// Takes one optional parameter, the device number. // Takes one optional parameter, the device number.
// If provided, it sets that device to be the active device. // If provided, it sets that device to be the active device.
...@@ -3117,6 +3120,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3117,6 +3120,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
} }
if (cublas_init() == -1) if (cublas_init() == -1)
return NULL; return NULL;
if (cudnn_init() == -1)
return NULL;
} }
Py_INCREF(Py_None); Py_INCREF(Py_None);
...@@ -3147,6 +3152,7 @@ CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) { ...@@ -3147,6 +3152,7 @@ CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
PyObject * PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here // Don't handle errors here
cudnn_shutdown();
cublas_shutdown(); cublas_shutdown();
cudaThreadExit(); cudaThreadExit();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
...@@ -3607,6 +3613,28 @@ cublas_shutdown() ...@@ -3607,6 +3613,28 @@ cublas_shutdown()
handle = NULL; handle = NULL;
} }
/* Create the global cuDNN handle (mirrors cublas_init above).
 * Returns 0 on success, or -1 with a Python RuntimeError set. */
static int
cudnn_init()
{
    cudnnStatus_t err;

    err = cudnnCreate(&dnn_handle);
    if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "Error initializing cudnn %d", err);
        return -1;
    }
    /* NULL binds the handle to the default CUDA stream.
     * Bug fix: the original ignored this call's return status. */
    err = cudnnSetStream(dnn_handle, NULL);
    if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "Error setting cudnn stream %d", err);
        cudnnDestroy(dnn_handle);
        dnn_handle = NULL;
        return -1;
    }
    return 0;
}
/* Destroy the global cuDNN handle created by cudnn_init().
 * Safe to call when the handle was never created (dnn_handle == NULL).
 * Bug fix: the original reset `handle` (the cuBLAS handle declared above)
 * instead of `dnn_handle`, leaving dnn_handle dangling after destruction
 * and clobbering the cuBLAS pointer. */
static void
cudnn_shutdown()
{
    if (dnn_handle != NULL)
        cudnnDestroy(dnn_handle);
    dnn_handle = NULL;
}
int int
CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj) CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
{ {
......
...@@ -43,6 +43,7 @@ ...@@ -43,6 +43,7 @@
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cudnn.h>
#ifdef _WIN32 #ifdef _WIN32
#ifdef _CUDA_NDARRAY_C #ifdef _CUDA_NDARRAY_C
...@@ -85,6 +86,8 @@ typedef float real; ...@@ -85,6 +86,8 @@ typedef float real;
/* Use this handle to make cublas calls */ /* Use this handle to make cublas calls */
extern DllExport cublasHandle_t handle; extern DllExport cublasHandle_t handle;
/* and this for cudnn calls */
extern DllExport cudnnHandle_t dnn_handle;
/** /**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage. * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
......
import copy
import os
import theano
from theano import Apply
from theano import tensor
from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv
class GpuDnnConv(GpuOp):
    """2D convolution on the GPU implemented through cuDNN.

    Wraps ``cudnnConvolutionForward`` for a 4D image and a 4D kernel
    (filter) input, both float32 CudaNdarrays.

    :param border_mode: 'valid' (no padding) or 'full' (zero-pad each
        spatial dimension by kernel_size - 1).
    """
    __props__ = ('border_mode',)

    def __init__(self, border_mode):
        self.border_mode = border_mode

    def make_node(self, img, kern):
        """Return an Apply node; both inputs must be 4D tensors.

        :raises TypeError: if either input is not 4D.
        """
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output pattern: batch axis broadcastability comes from the image,
        # channel axis from the kernels; spatial axes are never broadcastable.
        broadcastable = (img.type.broadcastable[0],
                         kern.type.broadcastable[0],
                         False, False)
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

    def c_support_code_struct(self, node, struct_id):
        # Per-apply-node cuDNN descriptors, created once per compiled struct
        # and reused across calls.
        # Bug fix: the type name was misspelled "cudnnTensor4dDescriptior_t"
        # (the create/destroy calls below already use the correct spelling).
        return """
cudnnTensor4dDescriptor_t input%(id)d = NULL;
cudnnTensor4dDescriptor_t output%(id)d = NULL;
cudnnFilterDescriptor_t kerns%(id)d = NULL;
cudnnConvolutionDescriptor_t op%(id)d = NULL;
""" % dict(id=struct_id)

    def c_init_code_struct(self, node, struct_id, sub):
        # Allocate the descriptors declared in c_support_code_struct; any
        # failure raises MemoryError through %(fail)s.
        return """
if (cudnnCreateTensor4dDescriptor(&input%(id)d) != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (inp)");
  %(fail)s
}
if (cudnnCreateTensor4dDescriptor(&output%(id)d) != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (out)");
  %(fail)s
}
if (cudnnCreateFilterDescriptor(&kerns%(id)d) != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_MemoryError, "could not allocate filter descriptor");
  %(fail)s
}
if (cudnnCreateConvolutionDescriptor(&op%(id)d) != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_MemoryError, "could not allocate convolution descriptor");
  %(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])

    def c_cleanup_code_struct(self, node, struct_id):
        # Release the descriptors; errors are deliberately ignored during
        # cleanup.
        return """
cudnnDestroyTensor4dDescriptor(input%(id)d);
input%(id)d = NULL;
cudnnDestroyTensor4dDescriptor(output%(id)d);
output%(id)d = NULL;
cudnnDestroyFilterDescriptor(kerns%(id)d);
kerns%(id)d = NULL;
cudnnDestroyConvolutionDescriptor(op%(id)d);
op%(id)d = NULL;
""" % dict(id=struct_id)

    def c_code(self, node, name, inputs, outputs, sub):
        """Emit C code that fills the descriptors and runs the convolution.

        'valid' mode uses no padding; 'full' pads each spatial dim by
        kernel_size - 1 so every overlap position is produced.
        """
        img, kern = inputs
        out, = outputs
        if self.border_mode == "valid":
            bmode = 1
        else:
            assert self.border_mode == "full"
            bmode = 0
        # Bug fixes relative to the first draft:
        #  - "cudnnSetTensor4DescriptorEx" was missing the 'd'
        #    (cudnnSetTensor4dDescriptorEx, as used for the input above);
        #  - the forward call referenced "out%(id)d" but the descriptor
        #    declared in the struct is "output%(id)d";
        #  - the substitution dict passed the undefined name `kerns`
        #    (NameError) instead of the local `kern`;
        #  - the error message after cudnnGetOutputTensor4dDim wrongly said
        #    "could not set op descriptor".
        return """
cudnnStatus_t err%(name)s;
int pad_w%(name)s;
int pad_h%(name)s;
err%(name)s = cudnnSetTensor4dDescriptorEx(
  input%(id)d, CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(img)s)[0],
  CudaNdarray_HOST_DIMS(%(img)s)[1],
  CudaNdarray_HOST_DIMS(%(img)s)[2],
  CudaNdarray_HOST_DIMS(%(img)s)[3],
  CudaNdarray_HOST_STRIDES(%(img)s)[0],
  CudaNdarray_HOST_STRIDES(%(img)s)[1],
  CudaNdarray_HOST_STRIDES(%(img)s)[2],
  CudaNdarray_HOST_STRIDES(%(img)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_RuntimeError, "could not set tensor4d descriptor");
  %(fail)s
}
// TODO: make sure the kernels are contiguous or ... BOOM!
err%(name)s = cudnnSetFilterDescriptor(
  kerns%(id)d, CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(kerns)s)[0],
  CudaNdarray_HOST_DIMS(%(kerns)s)[1],
  CudaNdarray_HOST_DIMS(%(kerns)s)[2],
  CudaNdarray_HOST_DIMS(%(kerns)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_RuntimeError, "could not set filter descriptor");
  %(fail)s
}
if (%(bmode)d == 1) {
  pad_h%(name)s = 0;
  pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
  pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1;
  pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1;
} else {
  PyErr_SetString(PyExc_ValueError, "bad border mode");
  %(fail)s
}
err%(name)s = cudnnSetConvolutionDescriptor(
  op%(id)d, input%(id)d, kerns%(id)d,
  pad_h%(name)s,
  pad_w%(name)s,
  1, 1, 1, 1,
  CUDNN_CONVOLUTION
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_RuntimeError, "could not set op descriptor");
  %(fail)s
}
{
  int out_dims[4];
  err%(name)s = cudnnGetOutputTensor4dDim(
    op%(id)d, CUDNN_CONVOLUTION_FWD,
    &out_dims[0], &out_dims[1],
    &out_dims[2], &out_dims[3]
  );
  if (err%(name)s != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "could not get output dimensions");
    %(fail)s
  }
  if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
    %(fail)s
  }
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
  output%(id)d, CUDNN_DATA_FLOAT,
  CudaNdarray_HOST_DIMS(%(out)s)[0],
  CudaNdarray_HOST_DIMS(%(out)s)[1],
  CudaNdarray_HOST_DIMS(%(out)s)[2],
  CudaNdarray_HOST_DIMS(%(out)s)[3],
  CudaNdarray_HOST_STRIDES(%(out)s)[0],
  CudaNdarray_HOST_STRIDES(%(out)s)[1],
  CudaNdarray_HOST_STRIDES(%(out)s)[2],
  CudaNdarray_HOST_STRIDES(%(out)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_RuntimeError, "could not set out descriptor");
  %(fail)s
}
err%(name)s = cudnnConvolutionForward(
  dnn_handle,
  input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
  kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
  op%(id)d,
  output%(id)d, CudaNdarray_DEV_DATA(%(out)s),
  CUDNN_RESULT_NO_ACCUMULATE
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_SetString(PyExc_RuntimeError, "error doing operation");
  %(fail)s
}
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
           fail=sub['fail'], id=sub['struct_id'], name=name)
from theano.sandbox.cuda.opt import local_optimizer, gpu_contiguous, register_opt


@register_opt()
@local_optimizer([GpuConv])
def local_conv_dnn(node):
    """Rewrite a GpuConv node as a GpuDnnConv when cuDNN can handle it.

    Only 'full'/'valid' border modes with unit subsampling are supported;
    other nodes are left untouched (returns None).
    """
    op = node.op
    if not isinstance(op, GpuConv):
        return
    if op.border_mode not in ('full', 'valid'):
        return
    if op.subsample != (1, 1):
        return
    img, kern = node.inputs
    dnn_op = GpuDnnConv(op.border_mode)
    return [dnn_op(gpu_contiguous(img), gpu_contiguous(kern))]
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论