Commit 4a8da2e8, authored by Arnaud Bergeron

Don't make the gpu backend depend on cudnn.

Add some tests for the new op.
Parent 6ad2afec
......@@ -33,10 +33,6 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""",
StrParam('cublas'))
AddConfigVar('cudnn.lib',
"""Name of the cuda dnn library for the linker.""",
StrParam('cudnn'))
#is_nvcc_available called here to initialize global vars in
#nvcc_compiler module
nvcc_compiler.is_nvcc_available()
......@@ -160,7 +156,7 @@ if compile_cuda_ndarray and cuda_available:
code,
location=cuda_ndarray_loc,
include_dirs=[cuda_path],
libs=[config.cublas.lib, config.cudnn.lib],
libs=[config.cublas.lib],
preargs=['-O3'] + compiler.compile_args())
from cuda_ndarray.cuda_ndarray import *
except Exception, e:
......
......@@ -42,7 +42,6 @@
#endif
cublasHandle_t handle = NULL;
cudnnHandle_t dnn_handle = NULL;
/////////////////////////
// Alloc and Free
......@@ -3052,8 +3051,6 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init();
static void cublas_shutdown();
static int cudnn_init();
static void cudnn_shutdown();
// Initialize the gpu.
// Takes one optional parameter, the device number.
// If provided, it sets that device to be the active device.
......@@ -3120,8 +3117,6 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
}
if (cublas_init() == -1)
return NULL;
if (cudnn_init() == -1)
return NULL;
}
Py_INCREF(Py_None);
......@@ -3152,7 +3147,6 @@ CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here
cudnn_shutdown();
cublas_shutdown();
cudaThreadExit();
g_gpu_context_active = 0; // context has now been closed down
......@@ -3613,28 +3607,6 @@ cublas_shutdown()
handle = NULL;
}
static int
cudnn_init()
{
    /* Create the global cudnn handle used for all cudnn calls. */
    cudnnStatus_t err = cudnnCreate(&dnn_handle);
    if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "Error initializing cudnn %d", err);
        return -1;
    }
    /* Bind the handle to the default (NULL) stream. */
    cudnnSetStream(dnn_handle, NULL);
    return 0;
}
static void
cudnn_shutdown()
{
    /* Destroy the global cudnn handle created by cudnn_init(). */
    if (dnn_handle != NULL)
        cudnnDestroy(dnn_handle);
    /* Bug fix: this previously nulled the cublas `handle` (copy-paste from
     * cublas_shutdown), leaving `dnn_handle` dangling — a later call would
     * double-destroy it — while clobbering the still-live cublas handle. */
    dnn_handle = NULL;
}
int
CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
{
......
......@@ -43,7 +43,6 @@
#include <cublas_v2.h>
#include <cudnn.h>
#ifdef _WIN32
#ifdef _CUDA_NDARRAY_C
......@@ -86,8 +85,6 @@ typedef float real;
/* Use this handle to make cublas calls */
extern DllExport cublasHandle_t handle;
/* and this for cudnn calls */
extern DllExport cudnnHandle_t dnn_handle;
/**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
......
......@@ -29,8 +29,15 @@ class GpuDnnConv(GpuOp):
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def c_headers(self):
    """Header files the generated C code must include."""
    return ["cudnn.h"]
def c_libraries(self):
    """Libraries the generated C code must be linked against."""
    return ["cudnn"]
def c_support_code_struct(self, node, struct_id):
return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d;
......@@ -39,6 +46,10 @@ cudnnConvolutionDescriptor_t op%(id)d;
def c_init_code_struct(self, node, struct_id, sub):
return """
if (cudnnCreate(&handle%(id)d) != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "could not create cudnn handle");
%(fail)s
}
if (cudnnCreateTensor4dDescriptor(&input%(id)d) != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (inp)");
%(fail)s
......@@ -60,13 +71,10 @@ if (cudnnCreateConvolutionDescriptor(&op%(id)d) != CUDNN_STATUS_SUCCESS) {
def c_cleanup_code_struct(self, node, struct_id):
    """Return C code that releases the per-struct cudnn resources.

    Destroys the tensor/filter/convolution descriptors and the cudnn
    handle owned by this struct. ``struct_id`` is interpolated into the
    names so they are unique per apply node.
    """
    # NOTE(review): the handle is destroyed but not reset to NULL like the
    # descriptors above — presumably harmless at struct teardown; confirm.
    return """
cudnnDestroyTensor4dDescriptor(input%(id)d);
input%(id)d = NULL;
cudnnDestroyTensor4dDescriptor(output%(id)d);
output%(id)d = NULL;
cudnnDestroyFilterDescriptor(kerns%(id)d);
kerns%(id)d = NULL;
cudnnDestroyConvolutionDescriptor(op%(id)d);
op%(id)d = NULL;
cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub):
......@@ -162,7 +170,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
dnn_handle,
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d,
......@@ -176,9 +184,13 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
fail=sub['fail'], id=sub['struct_id'], name=name)
from theano.sandbox.cuda.opt import local_optimizer, gpu_contiguous, register_opt
def c_code_cache_version(self):
    """Version tuple for the C-code cache; bump it to force recompilation."""
    version = (0,)
    return version
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer)
@register_opt()
@local_optimizer([GpuConv])
def local_conv_dnn(node):
if isinstance(node.op, GpuConv):
......@@ -189,3 +201,5 @@ def local_conv_dnn(node):
border_mode = node.op.border_mode
return [GpuDnnConv(border_mode)(gpu_contiguous(img),
gpu_contiguous(kern))]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
......@@ -26,6 +26,8 @@ from theano.sandbox import cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv
#needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -615,14 +617,13 @@ def test_valid_9_10():
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid(conv_gemm=False):
def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng()
shapes = get_valid_shapes()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
version = [-2, -1, 6]
verbose = 0
random = True
......@@ -631,28 +632,31 @@ def test_valid(conv_gemm=False):
if ones:
random = False
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
def test_valid():
    """Run valid-mode conv tests: reference (-2), default (-1) and version 6."""
    versions = [-2, -1, 6]
    _test_valid(None, version=versions)
def test_gemm_valid():
    """Run the valid-mode convolution suite against the GpuCorrMM (GEMM) op.

    Adds strided-input shapes since GpuCorrMM supports them.
    """
    # Removed the stale `test_valid(conv_gemm=True)` call: test_valid() no
    # longer accepts arguments (it would raise TypeError), and the explicit
    # _test_valid call below already exercises the GEMM path.
    extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
    extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
    _test_valid(cuda.blas.BaseGpuCorrMM,
                mode=theano_mode.including("conv_gemm"),
                extra_shapes=extra_shapes)
def test_dnn_valid():
    """Run the valid-mode convolution suite with the cudnn-based op enabled."""
    dnn_mode = theano_mode.including("cudnn")
    _test_valid(GpuDnnConv, mode=dnn_mode)
def test_full(conv_gemm=False):
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng()
shapes = get_basic_shapes()
shapes += get_shapes2()
......@@ -707,25 +711,26 @@ def test_full(conv_gemm=False):
]
# shapes=shapes[:277]
version = [-2, -1, 0, 1, 2, 3, 4, 5]
verbose = 0
random = True
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
def test_full():
    """Run full-mode conv tests across the reference, default and kernel versions 0-5."""
    versions = [-2, -1, 0, 1, 2, 3, 4, 5]
    _test_full(None, version=versions)
def test_gemm_full():
    """Run the full-mode convolution suite against the GpuCorrMM (GEMM) op."""
    # Removed the stale `test_full(conv_gemm=True)` call: test_full() no
    # longer accepts arguments (it would raise TypeError), and the explicit
    # _test_full call below already exercises the GEMM path.
    _test_full(cuda.blas.BaseGpuCorrMM,
               mode=theano_mode.including("conv_gemm"))
def test_dnn_full():
    """Run the full-mode convolution suite with the cudnn-based op enabled."""
    dnn_mode = theano_mode.including("cudnn")
    _test_full(GpuDnnConv, mode=dnn_mode)
def test_subsample(conv_gemm=False):
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment