提交 4a8da2e8 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Don't make the gpu backend depend on cudnn.

Add some tests for the new op.
上级 6ad2afec
...@@ -33,10 +33,6 @@ AddConfigVar('cublas.lib', ...@@ -33,10 +33,6 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""", """Name of the cuda blas library for the linker.""",
StrParam('cublas')) StrParam('cublas'))
AddConfigVar('cudnn.lib',
"""Name of the cuda dnn library for the linker.""",
StrParam('cudnn'))
#is_nvcc_available called here to initialize global vars in #is_nvcc_available called here to initialize global vars in
#nvcc_compiler module #nvcc_compiler module
nvcc_compiler.is_nvcc_available() nvcc_compiler.is_nvcc_available()
...@@ -160,7 +156,7 @@ if compile_cuda_ndarray and cuda_available: ...@@ -160,7 +156,7 @@ if compile_cuda_ndarray and cuda_available:
code, code,
location=cuda_ndarray_loc, location=cuda_ndarray_loc,
include_dirs=[cuda_path], include_dirs=[cuda_path],
libs=[config.cublas.lib, config.cudnn.lib], libs=[config.cublas.lib],
preargs=['-O3'] + compiler.compile_args()) preargs=['-O3'] + compiler.compile_args())
from cuda_ndarray.cuda_ndarray import * from cuda_ndarray.cuda_ndarray import *
except Exception, e: except Exception, e:
......
...@@ -42,7 +42,6 @@ ...@@ -42,7 +42,6 @@
#endif #endif
cublasHandle_t handle = NULL; cublasHandle_t handle = NULL;
cudnnHandle_t dnn_handle = NULL;
///////////////////////// /////////////////////////
// Alloc and Free // Alloc and Free
...@@ -3052,8 +3051,6 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -3052,8 +3051,6 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init(); static int cublas_init();
static void cublas_shutdown(); static void cublas_shutdown();
static int cudnn_init();
static void cudnn_shutdown();
// Initialize the gpu. // Initialize the gpu.
// Takes one optional parameter, the device number. // Takes one optional parameter, the device number.
// If provided, it sets that device to be the active device. // If provided, it sets that device to be the active device.
...@@ -3120,8 +3117,6 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3120,8 +3117,6 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
} }
if (cublas_init() == -1) if (cublas_init() == -1)
return NULL; return NULL;
if (cudnn_init() == -1)
return NULL;
} }
Py_INCREF(Py_None); Py_INCREF(Py_None);
...@@ -3152,7 +3147,6 @@ CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) { ...@@ -3152,7 +3147,6 @@ CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
PyObject * PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here // Don't handle errors here
cudnn_shutdown();
cublas_shutdown(); cublas_shutdown();
cudaThreadExit(); cudaThreadExit();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
...@@ -3613,28 +3607,6 @@ cublas_shutdown() ...@@ -3613,28 +3607,6 @@ cublas_shutdown()
handle = NULL; handle = NULL;
} }
static int
cudnn_init()
{
cudnnStatus_t err;
err = cudnnCreate(&dnn_handle);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Error initializing cudnn %d", err);
return -1;
}
cudnnSetStream(dnn_handle, NULL);
return 0;
}
static void
cudnn_shutdown()
{
if (dnn_handle != NULL)
cudnnDestroy(dnn_handle);
handle = NULL;
}
int int
CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj) CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
{ {
......
...@@ -43,7 +43,6 @@ ...@@ -43,7 +43,6 @@
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cudnn.h>
#ifdef _WIN32 #ifdef _WIN32
#ifdef _CUDA_NDARRAY_C #ifdef _CUDA_NDARRAY_C
...@@ -86,8 +85,6 @@ typedef float real; ...@@ -86,8 +85,6 @@ typedef float real;
/* Use this handle to make cublas calls */ /* Use this handle to make cublas calls */
extern DllExport cublasHandle_t handle; extern DllExport cublasHandle_t handle;
/* and this for cudnn calls */
extern DllExport cudnnHandle_t dnn_handle;
/** /**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage. * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
......
...@@ -29,8 +29,15 @@ class GpuDnnConv(GpuOp): ...@@ -29,8 +29,15 @@ class GpuDnnConv(GpuOp):
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def c_headers(self):
return ['cudnn.h']
def c_libraries(self):
return ['cudnn']
def c_support_code_struct(self, node, struct_id): def c_support_code_struct(self, node, struct_id):
return """ return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d; cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d; cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d; cudnnFilterDescriptor_t kerns%(id)d;
...@@ -39,6 +46,10 @@ cudnnConvolutionDescriptor_t op%(id)d; ...@@ -39,6 +46,10 @@ cudnnConvolutionDescriptor_t op%(id)d;
def c_init_code_struct(self, node, struct_id, sub): def c_init_code_struct(self, node, struct_id, sub):
return """ return """
if (cudnnCreate(&handle%(id)d) != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "could not create cudnn handle");
%(fail)s
}
if (cudnnCreateTensor4dDescriptor(&input%(id)d) != CUDNN_STATUS_SUCCESS) { if (cudnnCreateTensor4dDescriptor(&input%(id)d) != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (inp)"); PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (inp)");
%(fail)s %(fail)s
...@@ -60,13 +71,10 @@ if (cudnnCreateConvolutionDescriptor(&op%(id)d) != CUDNN_STATUS_SUCCESS) { ...@@ -60,13 +71,10 @@ if (cudnnCreateConvolutionDescriptor(&op%(id)d) != CUDNN_STATUS_SUCCESS) {
def c_cleanup_code_struct(self, node, struct_id): def c_cleanup_code_struct(self, node, struct_id):
return """ return """
cudnnDestroyTensor4dDescriptor(input%(id)d); cudnnDestroyTensor4dDescriptor(input%(id)d);
input%(id)d = NULL;
cudnnDestroyTensor4dDescriptor(output%(id)d); cudnnDestroyTensor4dDescriptor(output%(id)d);
output%(id)d = NULL;
cudnnDestroyFilterDescriptor(kerns%(id)d); cudnnDestroyFilterDescriptor(kerns%(id)d);
kerns%(id)d = NULL;
cudnnDestroyConvolutionDescriptor(op%(id)d); cudnnDestroyConvolutionDescriptor(op%(id)d);
op%(id)d = NULL; cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id) """ % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
...@@ -162,7 +170,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -162,7 +170,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(fail)s %(fail)s
} }
err%(name)s = cudnnConvolutionForward( err%(name)s = cudnnConvolutionForward(
dnn_handle, handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s), input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s), kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d, op%(id)d,
...@@ -176,9 +184,13 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -176,9 +184,13 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
""" % dict(img=img, kerns=kern, out=out, bmode=bmode, """ % dict(img=img, kerns=kern, out=out, bmode=bmode,
fail=sub['fail'], id=sub['struct_id'], name=name) fail=sub['fail'], id=sub['struct_id'], name=name)
from theano.sandbox.cuda.opt import local_optimizer, gpu_contiguous, register_opt def c_code_cache_version(self):
return (0,)
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer)
@register_opt()
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn(node): def local_conv_dnn(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
...@@ -189,3 +201,5 @@ def local_conv_dnn(node): ...@@ -189,3 +201,5 @@ def local_conv_dnn(node):
border_mode = node.op.border_mode border_mode = node.op.border_mode
return [GpuDnnConv(border_mode)(gpu_contiguous(img), return [GpuDnnConv(border_mode)(gpu_contiguous(img),
gpu_contiguous(kern))] gpu_contiguous(kern))]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
...@@ -26,6 +26,8 @@ from theano.sandbox import cuda ...@@ -26,6 +26,8 @@ from theano.sandbox import cuda
if cuda.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv
#needed as the gpu conv don't have a perform implementation. #needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu') theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -615,14 +617,13 @@ def test_valid_9_10(): ...@@ -615,14 +617,13 @@ def test_valid_9_10():
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5)
def test_valid(conv_gemm=False): def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng() seed_rng()
shapes = get_valid_shapes() shapes = get_valid_shapes()
#shapes=shapes[400:426] #shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to. # I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version. # I put -2 to test the reference version.
version = [-2, -1, 6]
verbose = 0 verbose = 0
random = True random = True
...@@ -631,28 +632,31 @@ def test_valid(conv_gemm=False): ...@@ -631,28 +632,31 @@ def test_valid(conv_gemm=False):
if ones: if ones:
random = False random = False
if conv_gemm: shapes += extra_shapes
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
else:
mode = theano_mode
cls = None
exec_conv(version, shapes, verbose, random, 'valid', exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5, print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls) theano_mode=mode, cls=cls)
def test_valid():
_test_valid(None, version=[-2, -1, 6])
def test_gemm_valid(): def test_gemm_valid():
test_valid(conv_gemm=True) extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
_test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes)
def test_dnn_valid():
_test_valid(GpuDnnConv, mode=theano_mode.including("cudnn"))
def test_full(conv_gemm=False):
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng() seed_rng()
shapes = get_basic_shapes() shapes = get_basic_shapes()
shapes += get_shapes2() shapes += get_shapes2()
...@@ -707,25 +711,26 @@ def test_full(conv_gemm=False): ...@@ -707,25 +711,26 @@ def test_full(conv_gemm=False):
] ]
# shapes=shapes[:277] # shapes=shapes[:277]
version = [-2, -1, 0, 1, 2, 3, 4, 5]
verbose = 0 verbose = 0
random = True random = True
if conv_gemm: shapes += extra_shapes
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
mode = theano_mode
cls = None
exec_conv(version, shapes, verbose, random, 'full', exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls) theano_mode=mode, cls=cls)
def test_full():
_test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5])
def test_gemm_full(): def test_gemm_full():
test_full(conv_gemm=True) _test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"))
def test_dnn_full():
_test_full(GpuDnnConv, mode=theano_mode.including("cudnn"))
def test_subsample(conv_gemm=False): def test_subsample(conv_gemm=False):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论