提交 740295e2 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Make the core code for the convolution reusable.

上级 c022347b
...@@ -10,12 +10,22 @@ from theano.sandbox.cuda import GpuOp ...@@ -10,12 +10,22 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3
class GpuDnnConv(GpuOp):
__props__ = ('border_mode',)
def __init__(self, border_mode): class GpuDnnConvBase(GpuOp):
__props__ = ('border_mode', 'conv_mode')
def __init__(self, border_mode, conv_mode='conv'):
assert border_mode in ('valid', 'full')
self.border_mode = border_mode self.border_mode = border_mode
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
def __setstate__(self, props):
self.__dict__.update(props)
if not hasattr(self, 'conv_mode'):
self.conv_mode = 'conv'
def make_node(self, img, kern): def make_node(self, img, kern):
if img.type.ndim != 4: if img.type.ndim != 4:
...@@ -38,61 +48,106 @@ class GpuDnnConv(GpuOp): ...@@ -38,61 +48,106 @@ class GpuDnnConv(GpuOp):
def c_libraries(self): def c_libraries(self):
return ['cudnn'] return ['cudnn']
def c_support_code_struct(self, node, struct_id): def c_support_code(self):
return """ return """
cudnnHandle_t handle%(id)d; cudnnHandle_t _handle = NULL;
cudnnTensor4dDescriptor_t input%(id)d; """
cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d;
cudnnConvolutionDescriptor_t op%(id)d;
""" % dict(id=struct_id)
def c_init_code_struct(self, node, struct_id, sub): def c_init_code(self):
return """ if PY3:
handle%(id)d = NULL; error_out = "NULL"
input%(id)d = NULL; else:
output%(id)d = NULL; error_out = ""
kerns%(id)d = NULL; return ["""{
op%(id)d = NULL; cudnnStatus_t err;
cudnnStatus_t err%(id)d; if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
if ((err%(id)d = cudnnCreate(&handle%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s", PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err%(id)d)); cudnnGetErrorString(err));
%(fail)s return %s;
} }
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) { }""" % (error_out,)]
def c_support_code_struct(self, node, struct_id):
types = ['cudnn' + d.capitalize() + 'Descriptor_t'
for d in self.descriptors]
elems = [t + ' param%d_%d;' % (i, struct_id)
for i, t in enumerate(types)]
return ("cudnnConvolutionDescriptor_t op%d;\n" % (struct_id,) +
'\n'.join(elems))
def c_init_code_struct(self, node, struct_id, sub):
vnames = ['param%d_%d' % (i, struct_id)
for i, t in enumerate(self.descriptors)]
inits = [vname + '= NULL;' for vname in vnames]
creates = []
for d, var in zip(self.descriptors, vnames):
creates.append("""
if ((err%(id)d = cudnnCreate%(d)sDescriptor(&%(var)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d)); "(inp): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s %(fail)s
} }
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&output%(id)d)) != CUDNN_STATUS_SUCCESS) { """ % dict(id=struct_id, d=d.capitalize(), var=var, fail=sub['fail']))
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(id)d)); return """
%(fail)s %(init)s
} cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCCESS) { %(create)s
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SUCCESS) { if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution " PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err%(id)d)); "descriptor: %%s", cudnnGetErrorString(err%(id)d));
%(fail)s %(fail)s
} }
""" % dict(id=struct_id, fail=sub['fail']) """ % dict(id=struct_id, fail=sub['fail'], init='\n'.join(inits),
create='\n'.join(creates))
def c_cleanup_code_struct(self, node, struct_id): def c_cleanup_code_struct(self, node, struct_id):
cleanups = ['cudnnDestroy%sDescriptor(param%d_%d);' % (d.capitalize(),
i, struct_id)
for i, d in enumerate(self.descriptors)]
return """ return """
cudnnDestroyTensor4dDescriptor(input%(id)d); %(cleanup)s
cudnnDestroyTensor4dDescriptor(output%(id)d);
cudnnDestroyFilterDescriptor(kerns%(id)d);
cudnnDestroyConvolutionDescriptor(op%(id)d); cudnnDestroyConvolutionDescriptor(op%(id)d);
cudnnDestroy(handle%(id)d); """ % dict(id=struct_id, cleanup='\n'.join(cleanups))
""" % dict(id=struct_id)
def c_set_tensor4d(self, var, desc, err, fail):
return """
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[0],
CudaNdarray_HOST_STRIDES(%(var)s)[1],
CudaNdarray_HOST_STRIDES(%(var)s)[2],
CudaNdarray_HOST_STRIDES(%(var)s)[3]
);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(%(err)s));
%(fail)s
}
""" % dict(var=var, err=err, desc=desc, fail=fail)
def c_set_filter(self, var, desc, err, fail):
return """
%(err)s = cudnnSetFilterDescriptor(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3]
);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s",
cudnnGetErrorString(%(err)s));
%(fail)s
}
""" % dict(var=var, desc=desc, err=err, fail=fail)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
img, kern = inputs param0, param1 = inputs
out, = outputs out, = outputs
if self.border_mode == "valid": if self.border_mode == "valid":
...@@ -101,65 +156,57 @@ cudnnDestroy(handle%(id)d); ...@@ -101,65 +156,57 @@ cudnnDestroy(handle%(id)d);
assert self.border_mode == "full" assert self.border_mode == "full"
bmode = 0 bmode = 0
if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION'
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
vnames = ['param%d_%d' % (i, sub['struct_id'])
for i, t in enumerate(self.descriptors)]
checks = []
for v in (param0, param1):
checks.append("""
if (!CudaNdarray_is_c_contiguous(%s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%s
}
""" % (v, sub['fail']))
sets = []
for p, v, d in zip((param0, param1), vnames[:-1],
self.descriptors[:-1]):
sets.append(getattr(self, 'c_set_'+d)(p, v, 'err'+name,
sub['fail']))
set_out = getattr(self, 'c_set_'+self.descriptors[-1])(
out, vnames[-1], 'err'+name, sub['fail'])
return """ return """
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
int pad_w%(name)s; int pad_w%(name)s;
int pad_h%(name)s; int pad_h%(name)s;
if (!CudaNdarray_is_c_contiguous(%(img)s)) { %(checks)s
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (!CudaNdarray_is_c_contiguous(%(kerns)s)) { %(sets)s
PyErr_SetString(PyExc_ValueError, "Only contiguous filters are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
input%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(img)s)[0],
CudaNdarray_HOST_DIMS(%(img)s)[1],
CudaNdarray_HOST_DIMS(%(img)s)[2],
CudaNdarray_HOST_DIMS(%(img)s)[3],
CudaNdarray_HOST_STRIDES(%(img)s)[0],
CudaNdarray_HOST_STRIDES(%(img)s)[1],
CudaNdarray_HOST_STRIDES(%(img)s)[2],
CudaNdarray_HOST_STRIDES(%(img)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnSetFilterDescriptor(
kerns%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(kerns)s)[0],
CudaNdarray_HOST_DIMS(%(kerns)s)[1],
CudaNdarray_HOST_DIMS(%(kerns)s)[2],
CudaNdarray_HOST_DIMS(%(kerns)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (%(bmode)d == 1) { if (%(bmode)d == 1) {
pad_h%(name)s = 0; pad_h%(name)s = 0;
pad_w%(name)s = 0; pad_w%(name)s = 0;
} else if (%(bmode)d == 0) { } else if (%(bmode)d == 0) {
pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1; pad_h%(name)s = CudaNdarray_HOST_DIMS(%(param1)s)[2] - 1;
pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1; pad_w%(name)s = CudaNdarray_HOST_DIMS(%(param1)s)[3] - 1;
} else { } else {
PyErr_SetString(PyExc_ValueError, "bad border mode"); PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s %(fail)s
} }
err%(name)s = cudnnSetConvolutionDescriptor( err%(name)s = cudnnSetConvolutionDescriptor(
op%(id)d, input%(id)d, kerns%(id)d, op%(id)d, param0_%(id)d, param1_%(id)d,
pad_h%(name)s, pad_h%(name)s,
pad_w%(name)s, pad_w%(name)s,
1, 1, 1, 1, 1, 1, 1, 1,
CUDNN_CONVOLUTION %(conv_flag)s
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s", PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
...@@ -169,12 +216,12 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -169,12 +216,12 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
{ {
int out_dims[4]; int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim( err%(name)s = cudnnGetOutputTensor4dDim(
op%(id)d, CUDNN_CONVOLUTION_FWD, op%(id)d, %(path)s,
&out_dims[0], &out_dims[1], &out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3] &out_dims[2], &out_dims[3]
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s", PyErr_Format(PyExc_RuntimeError, "could not get output sizes: %%s",
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
...@@ -182,28 +229,15 @@ if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) { ...@@ -182,28 +229,15 @@ if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s %(fail)s
} }
} }
err%(name)s = cudnnSetTensor4dDescriptorEx(
output%(id)d, CUDNN_DATA_FLOAT, %(set_out)s
CudaNdarray_HOST_DIMS(%(out)s)[0],
CudaNdarray_HOST_DIMS(%(out)s)[1], err%(name)s = %(method)s(
CudaNdarray_HOST_DIMS(%(out)s)[2], _handle,
CudaNdarray_HOST_DIMS(%(out)s)[3], param0_%(id)d, CudaNdarray_DEV_DATA(%(param0)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], param1_%(id)d, CudaNdarray_DEV_DATA(%(param1)s),
CudaNdarray_HOST_STRIDES(%(out)s)[1],
CudaNdarray_HOST_STRIDES(%(out)s)[2],
CudaNdarray_HOST_STRIDES(%(out)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d, op%(id)d,
output%(id)d, CudaNdarray_DEV_DATA(%(out)s), param2_%(id)d, CudaNdarray_DEV_DATA(%(out)s),
CUDNN_RESULT_NO_ACCUMULATE CUDNN_RESULT_NO_ACCUMULATE
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
...@@ -211,11 +245,20 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -211,11 +245,20 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
""" % dict(img=img, kerns=kern, out=out, bmode=bmode, """ % dict(param0=param0, param1=param1, out=out, bmode=bmode,
fail=sub['fail'], id=sub['struct_id'], name=name) conv_flag=conv_flag, fail=sub['fail'], id=sub['struct_id'],
name=name, checks='\n'.join(checks), sets='\n'.join(sets),
set_out=set_out, method=self.conv_op, path=self.path_flag)
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
class GpuDnnConv(GpuDnnConvBase):
descriptors = ('tensor4d', 'filter', 'tensor4d')
path_flag = 'CUDNN_CONVOLUTION_FWD'
conv_op ='cudnnConvolutionForward'
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous, from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
......
...@@ -4,7 +4,7 @@ Tests for GPU convolution ...@@ -4,7 +4,7 @@ Tests for GPU convolution
import sys import sys
import time import time
import unittest import unittest
import traceback
import numpy import numpy
...@@ -286,7 +286,7 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -286,7 +286,7 @@ def exec_conv(version, shapes, verbose, random, mode,
cls=cls) cls=cls)
except Exception, e: except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride) print ver, id, (ishape, kshape, subshape, istride, kstride)
print "Exception", type(e), e print traceback.format_exc()
pass pass
if not ret: if not ret:
failed_version.add(ver) failed_version.add(ver)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论