Commit cea45e8b, authored by Frédéric Bastien, committed by GitHub

Merge pull request #5174 from abergeron/cormm_f16

Make corrMM work in float16/64
...@@ -173,11 +173,15 @@ class Kernel(object): ...@@ -173,11 +173,15 @@ class Kernel(object):
fname: str fname: str
the name of the function wrapper. the name of the function wrapper.
(defaults to name + `_call`) (defaults to name + `_call`)
sname: str
the name of the scheduled call function
(defaults to name + `_scall`)
""" """
def __init__(self, code, params, name, flags, def __init__(self, code, params, name, flags,
codevar=None, binvar=None, objvar=None, fname=None): codevar=None, binvar=None, objvar=None, fname=None,
sname=None):
self.code = code self.code = code
self.params = params self.params = params
self.name = name self.name = name
...@@ -194,6 +198,9 @@ class Kernel(object): ...@@ -194,6 +198,9 @@ class Kernel(object):
if fname is None: if fname is None:
fname = name + '_call' fname = name + '_call'
self.fname = fname self.fname = fname
if sname is None:
sname = name + '_scall'
self.sname = sname
@staticmethod @staticmethod
def get_flags(*types): def get_flags(*types):
...@@ -338,22 +345,30 @@ class GpuKernelBase(object): ...@@ -338,22 +345,30 @@ class GpuKernelBase(object):
setargs = '\n '.join(setargs) setargs = '\n '.join(setargs)
return """ return """
int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared, int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
{args}) {{ {args}) {{
{setargs} {setargs}
return GpuKernel_call(&{kname}, nd, ldim, gdim, shared, NULL); return GpuKernel_call(&{kname}, _nd, _ldim, _gdim, _shared, NULL);
}} }}
""".format(args=args, fname=k.fname, setargs=setargs, kname=k.objvar)
def c_support_code(self): int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
return """ size_t _ls = 0;
template <typename T> size_t _gs = 0;
static T ceil_intdiv(T a, T b) int _err;
{
return (a/b) + ((a % b) ? 1: 0); if (_nd != 1) return GA_UNSUPPORTED_ERROR;
}
""" _err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs);
if (_err != GA_NO_ERROR)
return _err;
{setargs}
return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL);
}}
""".format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
kname=k.objvar)
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
...@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared, ...@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
The node that we need the cache version for. The node that we need the cache version for.
""" """
return (6, self.get_params(node).bin_id) return (7, self.get_params(node).bin_id)
def forward_string_meth(name): def forward_string_meth(name):
...@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase): ...@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase):
kernel_re = re.compile(r'^#kernel ([a-zA-Z_].*?)$', re.MULTILINE) kernel_re = re.compile(r'^#kernel ([a-zA-Z_].*?)$', re.MULTILINE)
c_support_code = forward_string_meth('c_support_code')
c_support_code_apply = forward_string_meth('c_support_code_apply') c_support_code_apply = forward_string_meth('c_support_code_apply')
c_support_code_struct = forward_string_meth('c_support_code_struct') c_support_code_struct = forward_string_meth('c_support_code_struct')
c_init_code_struct = forward_string_meth('c_init_code_struct') c_init_code_struct = forward_string_meth('c_init_code_struct')
c_cleanup_code_struct = forward_string_meth('c_cleanup_code_struct') c_cleanup_code_struct = forward_string_meth('c_cleanup_code_struct')
def c_code_cache_version_apply(self, node):
return GpuKernelBase.c_code_cache_version_apply(self, node)
def _type_macros(self, node): def _type_macros(self, node):
define_template = "#define %s %s\n" define_template = "#define %s %s\n"
undef_template = "#undef %s\n" undef_template = "#undef %s\n"
......
...@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False) ...@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True) gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
class BaseGpuCorrMM(CGpuKernelBase, BlasOp): class BaseGpuCorrMM(CGpuKernelBase):
""" """
Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly. `GpuCorrMM_gradInputs`. Cannot be used directly.
...@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp): ...@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
filter_dilation filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1)). Perform subsampling of the input, also known as dilation (default: (1, 1)).
""" """
check_broadcast = False check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation') __props__ = ('border_mode', 'subsample', 'filter_dilation')
_f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1), def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)): filter_dilation=(1, 1)):
...@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp): ...@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return node.inputs[0].type.context
def c_headers(self):
return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # Raise this whenever modifying the code below.
return (0, 2) return (2,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
""" """
...@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width return [[1], [1], [0], [0]] # no connection to height, width
class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp): class BaseGpuCorr3dMM(CGpuKernelBase):
""" """
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly. `GpuCorr3dMM_gradInputs`. Cannot be used directly.
...@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp): ...@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
Perform subsampling of the output (default: (1, 1, 1)). Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)). Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
"""
"""
check_broadcast = False check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation') __props__ = ('border_mode', 'subsample', 'filter_dilation')
_f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1, 1), def __init__(self, border_mode="valid", subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)): filter_dilation=(1, 1, 1)):
...@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp): ...@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return node.inputs[0].type.context
def c_headers(self):
return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying the code below.
return (0, 2) return (2,)
def c_code_helper(self, bottom, weights, top, direction, sub, def c_code_helper(self, bottom, weights, top, direction, sub,
height=None, width=None, depth=None): height=None, width=None, depth=None):
......
...@@ -236,11 +236,9 @@ KERNEL void col2im3d_kernel(const ga_size n, ...@@ -236,11 +236,9 @@ KERNEL void col2im3d_kernel(const ga_size n,
} }
} }
#section support_code_struct #section support_code_struct
int im3d2col(const size_t max_threads_dim, int im3d2col(
gpudata * data_im, const size_t data_im_offset, const size_t channels, gpudata * data_im, const size_t data_im_offset, const size_t channels,
const size_t height, const size_t width, const size_t depth, const size_t height, const size_t width, const size_t depth,
const size_t kernel_h, const size_t kernel_w, const size_t kernel_d, const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
...@@ -257,13 +255,10 @@ int im3d2col(const size_t max_threads_dim, ...@@ -257,13 +255,10 @@ int im3d2col(const size_t max_threads_dim,
size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1; size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1; size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
size_t num_kernels = channels * height_col * width_col * depth_col; size_t num_kernels = channels * height_col * width_col * depth_col;
size_t threads_per_block = max_threads_dim;
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
int err; int err;
GpuKernel *kernel; if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){ err = dilated_im3d2col_kernel_scall(
err = dilated_im3d2col_kernel_call( 1, &num_kernels, 0,
1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, depth, num_kernels, data_im, data_im_offset, height, width, depth,
kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d, kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
...@@ -273,10 +268,9 @@ int im3d2col(const size_t max_threads_dim, ...@@ -273,10 +268,9 @@ int im3d2col(const size_t max_threads_dim,
"gpuarray error: dilated_im3d2col_kernel: %s.", "gpuarray error: dilated_im3d2col_kernel: %s.",
GpuKernel_error(&k_dilated_im3d2col_kernel, err)); GpuKernel_error(&k_dilated_im3d2col_kernel, err));
} }
} } else {
else{ err = im3d2col_kernel_scall(
err = im3d2col_kernel_call( 1, &num_kernels, 0,
1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, depth, num_kernels, data_im, data_im_offset, height, width, depth,
kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d, kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d, height_col, width_col, depth_col, stride_h, stride_w, stride_d, height_col, width_col, depth_col,
...@@ -290,7 +284,7 @@ int im3d2col(const size_t max_threads_dim, ...@@ -290,7 +284,7 @@ int im3d2col(const size_t max_threads_dim,
return err; return err;
} }
int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t channels, int col2im3d(gpudata * data_col, const size_t channels,
const size_t height, const size_t width, const size_t depth, const size_t height, const size_t width, const size_t depth,
const size_t patch_h, const size_t patch_w, const size_t patch_d, const size_t patch_h, const size_t patch_w, const size_t patch_d,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_d, const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
...@@ -304,14 +298,12 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan ...@@ -304,14 +298,12 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1; size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
size_t depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1; size_t depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
size_t num_kernels = channels * height * width * depth; size_t num_kernels = channels * height * width * depth;
size_t threads_per_block = max_threads_dim;
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
// To avoid involving atomic operations, we will launch one kernel per // To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions. // bottom dimension, and then in the kernel add up the top dimensions.
int err; int err;
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){ if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
err = dilated_col2im3d_kernel_call( err = dilated_col2im3d_kernel_scall(
1, &n_blocks, &threads_per_block, 0, 1, &num_kernels, 0,
num_kernels, data_col, height, width, depth, channels, patch_h, patch_w, num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d, patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d, height_col, width_col, depth_col, stride_h, stride_w, stride_d, height_col, width_col, depth_col,
...@@ -323,8 +315,8 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan ...@@ -323,8 +315,8 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
} }
} }
else{ else{
err = col2im3d_kernel_call( err = col2im3d_kernel_scall(
1, &n_blocks, &threads_per_block, 0, 1, &num_kernels, 0,
num_kernels, data_col, height, width, depth, channels, patch_h, patch_w, num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
height_col, width_col, depth_col, data_im, data_im_offset); height_col, width_col, depth_col, data_im, data_im_offset);
...@@ -460,15 +452,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -460,15 +452,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
// Get the max threads per blocks
size_t max_threads_dim;
err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_Format(PyExc_RuntimeError,
"Could not fetch max_threads_dim.");
return NULL;
}
// Create temporary columns // Create temporary columns
size_t col_dim[2]; size_t col_dim[2];
col_dim[0] = nChannels * kW * kH * kD; col_dim[0] = nChannels * kW * kH * kD;
...@@ -492,8 +475,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -492,8 +475,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
const size_t K_ = col_dim[0]; const size_t K_ = col_dim[0];
const size_t N_ = col_dim[1]; const size_t N_ = col_dim[1];
const size_t M_ = nFilters; const size_t M_ = nFilters;
const DTYPE_INPUT_0 one = 1.0f;
const DTYPE_INPUT_0 zero = 0.0f;
PyGpuArrayObject *output; PyGpuArrayObject *output;
if (direction == 0) { // forward pass if (direction == 0) { // forward pass
...@@ -502,24 +483,46 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -502,24 +483,46 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im3d2col // First, im3d2col
err = im3d2col(max_threads_dim, err = im3d2col(
bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
padH, padW, padD, dH, dW, dD, col->ga.data); padH, padW, padD, dH, dW, dD, col->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
// Second, gemm // Second, gemm
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans, switch (col->ga.typecode) {
N_, M_, K_, one, case GA_FLOAT:
col->ga.data, 0, N_, err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
weight->ga.data, 0, K_, N_, M_, K_, 1,
zero, col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_); weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM encountered an error running sgemm.\n"); "GpuCorr3dMM forward encountered an error running gemm.");
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
...@@ -531,10 +534,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -531,10 +534,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im3d2col // First, im3d2col
err = im3d2col(max_threads_dim, err = im3d2col(
bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
padH, padW, padD, dH, dW, dD, col->ga.data); padH, padW, padD, dH, dW, dD, col->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
...@@ -543,15 +546,37 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -543,15 +546,37 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0 // Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This // for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.) // is faster than setting weight to all zeros before the loop.)
err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans, switch (col->ga.typecode) {
K_, M_, N_, one, case GA_FLOAT:
col->ga.data, 0, N_, err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
top->ga.data, n * top_stride, N_, K_, M_, N_, 1,
(n == 0) ? zero : one, col->ga.data, 0, N_,
weight->ga.data, 0, K_); top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM encountered an error running sgemm.\n"); "GpuCorr3dMM grad weights encountered an error running gemm.");
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
...@@ -562,29 +587,50 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -562,29 +587,50 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// full convolution: gemm, then col2im3d // full convolution: gemm, then col2im3d
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
switch (top->ga.typecode) {
case GA_FLOAT:
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans, err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, one, N_, K_, M_, 1,
top->ga.data, n * top_stride, N_, top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_, weight->ga.data, 0, K_,
zero, 0,
col->ga.data, 0, N_); col->ga.data, 0, N_);
if (err != GA_NO_ERROR) { break;
PyErr_Format(PyExc_RuntimeError, case GA_DOUBLE:
"GpuCorr3dMM encountered an error running sgemm.\n"); err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
Py_DECREF(col); N_, K_, M_, 1,
return NULL; top->ga.data, n * top_stride, N_,
} weight->ga.data, 0, K_,
// col2im3d back to the data 0,
err = col2im3d(max_threads_dim, col->ga.data, 0, N_);
col->ga.data, nChannels, break;
bottomHeight, bottomWidth, bottomDepth, case GA_HALF:
kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
dH, dW, dD, bottom->ga.data, n * bottom_stride); N_, K_, M_, 1,
if (err != GA_NO_ERROR) { top->ga.data, n * top_stride, N_,
Py_DECREF(col); weight->ga.data, 0, K_,
return NULL; 0,
} col->ga.data, 0, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad inputs encountered an error running gemm.");
Py_DECREF(col);
return NULL;
}
// col2im3d back to the data
err = col2im3d(col->ga.data, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
dH, dW, dD, bottom->ga.data, n * bottom_stride);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
}
} }
} }
// Free temporary columns // Free temporary columns
......
...@@ -195,8 +195,7 @@ KERNEL void col2im_kernel(const ga_size n, ...@@ -195,8 +195,7 @@ KERNEL void col2im_kernel(const ga_size n,
#section support_code_struct #section support_code_struct
int im2col(const size_t max_threads_dim, int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
gpudata * data_im, const size_t data_im_offset, const size_t channels,
const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t pad_h, const size_t pad_w,
...@@ -209,13 +208,10 @@ int im2col(const size_t max_threads_dim, ...@@ -209,13 +208,10 @@ int im2col(const size_t max_threads_dim,
size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1; size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1; size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
size_t num_kernels = channels * height_col * width_col; size_t num_kernels = channels * height_col * width_col;
size_t threads_per_block = max_threads_dim;
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
int err; int err;
GpuKernel *kernel; if (dilation_h != 1 || dilation_w != 1) {
if(dilation_h != 1 || dilation_w != 1){ err = dilated_im2col_kernel_scall(
err = dilated_im2col_kernel_call( 1, &num_kernels, 0,
1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w, num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col); width_col, data_col);
...@@ -224,10 +220,9 @@ int im2col(const size_t max_threads_dim, ...@@ -224,10 +220,9 @@ int im2col(const size_t max_threads_dim,
"gpuarray error: dilated_im2col_kernel: %s.", "gpuarray error: dilated_im2col_kernel: %s.",
GpuKernel_error(&k_dilated_im2col_kernel, err)); GpuKernel_error(&k_dilated_im2col_kernel, err));
} }
} } else {
else{ err = im2col_kernel_scall(
err = im2col_kernel_call( 1, &num_kernels, 0,
1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w, num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col); width_col, data_col);
...@@ -240,7 +235,7 @@ int im2col(const size_t max_threads_dim, ...@@ -240,7 +235,7 @@ int im2col(const size_t max_threads_dim,
return err; return err;
} }
int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channels, int col2im(gpudata * data_col, const size_t channels,
const size_t height, const size_t width, const size_t patch_h, const size_t patch_w, const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t pad_h, const size_t pad_w, const size_t stride_h,
...@@ -250,14 +245,12 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe ...@@ -250,14 +245,12 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1; size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1; size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
size_t num_kernels = channels * height * width; size_t num_kernels = channels * height * width;
size_t threads_per_block = max_threads_dim;
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
// To avoid involving atomic operations, we will launch one kernel per // To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions. // bottom dimension, and then in the kernel add up the top dimensions.
int err; int err;
if(dilation_h != 1 || dilation_w != 1){ if (dilation_h != 1 || dilation_w != 1) {
err = dilated_col2im_kernel_call( err = dilated_col2im_kernel_scall(
1, &n_blocks, &threads_per_block, 0, 1, &num_kernels, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w, num_kernels, data_col, height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset); height_col, width_col, data_im, data_im_offset);
...@@ -266,10 +259,9 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe ...@@ -266,10 +259,9 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
"gpuarray error: dilated_col2im_kernel: %s.", "gpuarray error: dilated_col2im_kernel: %s.",
GpuKernel_error(&k_dilated_col2im_kernel, err)); GpuKernel_error(&k_dilated_col2im_kernel, err));
} }
} } else {
else{ err = col2im_kernel_scall(
err = col2im_kernel_call( 1, &num_kernels, 0,
1, &n_blocks, &threads_per_block, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w, num_kernels, data_col, height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset); height_col, width_col, data_im, data_im_offset);
...@@ -393,15 +385,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -393,15 +385,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
// Get the max threads per blocks
size_t max_threads_dim;
err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_Format(PyExc_RuntimeError,
"Could not fetch max_threads_dim.");
return NULL;
}
// Create temporary columns // Create temporary columns
size_t col_dim[2]; size_t col_dim[2];
col_dim[0] = nChannels * kW * kH; col_dim[0] = nChannels * kW * kH;
...@@ -411,8 +394,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -411,8 +394,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
GA_C_ORDER, GA_C_ORDER,
bottom->context, bottom->context,
Py_None); Py_None);
if (NULL == col) if (NULL == col) {
{
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM failed to allocate working memory of %ld x %ld\n", "GpuCorrMM failed to allocate working memory of %ld x %ld\n",
col_dim[0], col_dim[1]); col_dim[0], col_dim[1]);
...@@ -425,8 +407,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -425,8 +407,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t K_ = col_dim[0]; const size_t K_ = col_dim[0];
const size_t N_ = col_dim[1]; const size_t N_ = col_dim[1];
const size_t M_ = nFilters; const size_t M_ = nFilters;
const DTYPE_INPUT_0 one = 1.0f;
const DTYPE_INPUT_0 zero = 0.0f;
PyGpuArrayObject *output; PyGpuArrayObject *output;
if (direction == 0) { // forward pass if (direction == 0) { // forward pass
...@@ -435,8 +415,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -435,8 +415,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(max_threads_dim, err = im2col(bottom->ga.data, n * bottom_stride,
bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->ga.data); padH, padW, dH, dW, col->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -444,15 +424,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -444,15 +424,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
// Second, gemm // Second, gemm
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans, switch (col->ga.typecode) {
N_, M_, K_, one, case GA_FLOAT:
col->ga.data, 0, N_, err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
weight->ga.data, 0, K_, N_, M_, K_, 1,
zero, col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_); weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered an error running sgemm.\n"); "GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
...@@ -464,8 +466,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -464,8 +466,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(max_threads_dim, err = im2col(bottom->ga.data, n * bottom_stride,
bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->ga.data); padH, padW, dH, dW, col->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -476,15 +478,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -476,15 +478,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0 // Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This // for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.) // is faster than setting weight to all zeros before the loop.)
err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans, switch (col->ga.typecode) {
K_, M_, N_, one, case GA_FLOAT:
col->ga.data, 0, N_, err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
top->ga.data, n * top_stride, N_, K_, M_, N_, 1,
(n == 0) ? zero : one, col->ga.data, 0, N_,
weight->ga.data, 0, K_); top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered an error running sgemm.\n"); "GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
...@@ -496,21 +520,42 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -496,21 +520,42 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
switch (top->ga.typecode) {
case GA_FLOAT:
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans, err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, one, N_, K_, M_, 1,
top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_,
0,
col->ga.data, 0, N_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_,
0,
col->ga.data, 0, N_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
top->ga.data, n * top_stride, N_, top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_, weight->ga.data, 0, K_,
zero, 0,
col->ga.data, 0, N_); col->ga.data, 0, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered an error running sgemm.\n"); "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
// col2im back to the data // col2im back to the data
err = col2im(max_threads_dim, err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth,
col->ga.data, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW, kH, kW, dilH, dilW, padH, padW,
dH, dW, bottom->ga.data, n * bottom_stride); dH, dW, bottom->ga.data, n * bottom_stride);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
......
...@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def c_headers(self):
    """Return the list of C header names this op's generated code needs.

    The list names the NumPy compatibility header and the gpuarray
    type definitions header.
    """
    return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_support_code(self):
    """Return C++ helper source used by this op's generated code.

    The returned string defines the template function ``ceil_intdiv(a, b)``,
    which computes ``a / b`` and adds one whenever ``a % b`` is non-zero.
    """
    # The text below is emitted verbatim, so it is kept exactly as-is.
    return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
x, = inp x, = inp
z, = out z, = out
......
...@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
return kernels return kernels
def c_support_code(self):
    """Return C++ helper source used by this op's generated code.

    The returned string defines the template function ``ceil_intdiv(a, b)``,
    which computes ``a / b`` and adds one whenever ``a % b`` is non-zero.
    """
    # The text below is emitted verbatim, so it is kept exactly as-is.
    return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype dtype_neib_shape = node.inputs[1].dtype
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.tensor
import theano.gpuarray import theano.gpuarray
if theano.gpuarray.pygpu is None: if theano.gpuarray.pygpu is None:
...@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE': ...@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE':
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
# If using float16, cast reference input to float32
def ref_cast(x):
    """Return ``x`` cast to float32 when its dtype is float16, else ``x``.

    The decision is based on ``x.type.dtype``, i.e. on the variable itself.
    Any variable whose dtype is not ``'float16'`` is returned unchanged
    (the very same object).
    """
    if x.type.dtype == 'float16':
        x = theano.tensor.cast(x, 'float32')
    return x
...@@ -17,7 +17,7 @@ from .. import dnn ...@@ -17,7 +17,7 @@ from .. import dnn
from ..basic_ops import GpuAllocEmpty from ..basic_ops import GpuAllocEmpty
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
from . import test_nnet from . import test_nnet
from .rnn_support import Model, GRU, LSTM, WrapperLayer from .rnn_support import Model, GRU, LSTM, WrapperLayer
...@@ -33,13 +33,6 @@ def set_precision(floatX): ...@@ -33,13 +33,6 @@ def set_precision(floatX):
return precision return precision
# If using float16, cast reference input to float32
def ref_cast(x):
    """Return ``x`` cast to float32 when its dtype is float16, else ``x``.

    The check uses the variable's own ``x.type.dtype`` rather than the
    global ``theano.config.floatX``: a float16 variable is upcast whatever
    floatX is set to, and a variable of any other dtype is returned
    unchanged (the very same object).
    """
    if x.type.dtype == 'float16':
        x = T.cast(x, 'float32')
    return x
def test_dnn_conv_desc_merge(): def test_dnn_conv_desc_merge():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
......
...@@ -3,13 +3,14 @@ import unittest ...@@ -3,13 +3,14 @@ import unittest
import numpy import numpy
import theano import theano
from theano import config
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu from .config import mode_with_gpu, mode_without_gpu, ref_cast
class TestCorrMM(unittest.TestCase): class TestCorrMM(unittest.TestCase):
...@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase): ...@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)] inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)] filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype('float32') filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val) inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val) filters = gpuarray_shared_constructor(filters_val)
conv_ref = CorrMM(border_mode=border_mode, conv_ref = CorrMM(border_mode=border_mode,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters) subsample=subsample)(ref_cast(inputs),
ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu) f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorrMM(border_mode=border_mode, conv = GpuCorrMM(border_mode=border_mode,
...@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase): ...@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase):
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)] filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)] dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
dCdH_val = numpy.random.random(dCdH_shape).astype('float32') dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val) inputs = gpuarray_shared_constructor(inputs_val)
dCdH = gpuarray_shared_constructor(dCdH_val) dCdH = gpuarray_shared_constructor(dCdH_val)
shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:])) shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
if (subsample == (1, 1)): if (subsample == (1, 1)):
conv_ref = CorrMM_gradWeights(subsample=subsample)( conv_ref = CorrMM_gradWeights(subsample=subsample)(
inputs, dCdH) ref_cast(inputs), ref_cast(dCdH))
conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)( conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
inputs, dCdH) inputs, dCdH)
else: else:
conv_ref = CorrMM_gradWeights(subsample=subsample)( conv_ref = CorrMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape) ref_cast(inputs), ref_cast(dCdH), shape=shape)
conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)( conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape) inputs, dCdH, shape=shape)
...@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase): ...@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)] inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)] filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype('float32') filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val) inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val) filters = gpuarray_shared_constructor(filters_val)
...@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase): ...@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase):
if (subsample == (1, 1)): if (subsample == (1, 1)):
conv_ref = CorrMM_gradInputs(subsample=subsample)( conv_ref = CorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs) kern=ref_cast(filters), topgrad=ref_cast(inputs))
conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)( conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs) kern=filters, topgrad=inputs)
else: else:
conv_ref = CorrMM_gradInputs(subsample=subsample)( conv_ref = CorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape) kern=ref_cast(filters), topgrad=ref_cast(inputs),
shape=bottom_shape)
conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)( conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape) kern=filters, topgrad=inputs, shape=bottom_shape)
......
...@@ -3,13 +3,14 @@ import unittest ...@@ -3,13 +3,14 @@ import unittest
import numpy import numpy
import theano import theano
from theano import config
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu from .config import mode_with_gpu, mode_without_gpu, ref_cast
class TestCorr3dMM(unittest.TestCase): class TestCorr3dMM(unittest.TestCase):
...@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase): ...@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)] inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)] filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype('float32') filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val) inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val) filters = gpuarray_shared_constructor(filters_val)
conv_ref = Corr3dMM(border_mode=border_mode, conv_ref = Corr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation, filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters) subsample=subsample)(ref_cast(inputs), ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu) f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorr3dMM(border_mode=border_mode, conv = GpuCorr3dMM(border_mode=border_mode,
...@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase): ...@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase):
filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)] filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)] dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
dCdH_val = numpy.random.random(dCdH_shape).astype('float32') dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val) inputs = gpuarray_shared_constructor(inputs_val)
dCdH = gpuarray_shared_constructor(dCdH_val) dCdH = gpuarray_shared_constructor(dCdH_val)
shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:])) shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
if (subsample == (1, 1, 1)): if (subsample == (1, 1, 1)):
conv_ref = Corr3dMM_gradWeights(subsample=subsample)( conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH) ref_cast(inputs), ref_cast(dCdH))
conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)( conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH) inputs, dCdH)
else: else:
conv_ref = Corr3dMM_gradWeights(subsample=subsample)( conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape) ref_cast(inputs), ref_cast(dCdH), shape=shape)
conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)( conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape) inputs, dCdH, shape=shape)
...@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase): ...@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)] inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)] filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype('float32') filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val) inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val) filters = gpuarray_shared_constructor(filters_val)
...@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase): ...@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase):
if (subsample == (1, 1, 1)): if (subsample == (1, 1, 1)):
conv_ref = Corr3dMM_gradInputs(subsample=subsample)( conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs) kern=ref_cast(filters), topgrad=ref_cast(inputs))
conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)( conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs) kern=filters, topgrad=inputs)
else: else:
conv_ref = Corr3dMM_gradInputs(subsample=subsample)( conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape) kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape)
conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)( conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape) kern=filters, topgrad=inputs, shape=bottom_shape)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论