提交 709c14cc authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3091 from nouiz/abergeron-gpuarray_dnn

gpuarray in new back-end
...@@ -56,7 +56,7 @@ if pygpu: ...@@ -56,7 +56,7 @@ if pygpu:
init_dev(config.device) init_dev(config.device)
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '': elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device) init_dev(config.gpuarray.init_device)
......
...@@ -783,6 +783,10 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER, ...@@ -783,6 +783,10 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
raise NotImplementedError("grad disabled") raise NotImplementedError("grad disabled")
def empty_like(var):
return GpuAllocEmpty(var.type.dtype)(*var.shape)
class GpuContiguous(Op): class GpuContiguous(Op):
""" """
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
......
...@@ -29,12 +29,12 @@ class NVCC_compiler(NVCC_base): ...@@ -29,12 +29,12 @@ class NVCC_compiler(NVCC_base):
# exist in the past # exist in the past
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]] numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
if bool(numpy_ver < [1, 7]): if bool(numpy_ver < [1, 7]):
flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY") flags.append("-DNPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED") flags.append("-DNPY_ARRAY_ALIGNED=NPY_ALIGNED")
flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE") flags.append("-DNPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL") flags.append("-DNPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS") flags.append("-DNPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS") flags.append("-DNPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
# If the user didn't specify architecture flags add them # If the user didn't specify architecture flags add them
if not any(['-arch=sm_' in f for f in flags]): if not any(['-arch=sm_' in f for f in flags]):
......
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
#ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This is only for what we use
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
switch (err) {
case CUDNN_STATUS_SUCCESS:
return "The operation completed successfully.";
case CUDNN_STATUS_NOT_INITIALIZED:
return "The handle was not initialized(Is your driver recent enought?).";
case CUDNN_STATUS_ALLOC_FAILED:
return "Ressource allocation failed inside the library.";
case CUDNN_STATUS_BAD_PARAM:
return "An incorrect value was passed in.";
case CUDNN_STATUS_ARCH_MISMATCH:
return "The current GPU does not support the required features (only cc 3.0+ are supported).";
case CUDNN_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed (probably due to a failure to bind texture).";
case CUDNN_STATUS_EXECUTION_FAILED:
return "A kernel failed to execute.";
case CUDNN_STATUS_INTERNAL_ERROR:
return "An internal cuDNN operation failed.";
case CUDNN_STATUS_NOT_SUPPORTED:
return "The combination of parameters is not currently supported.";
default:
return "Unknown error code.";
}
}
// some macros to help support cudnn R1 while using R2 code.
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t inputTensorDesc,
const cudnnFilterDescriptor_t filterDesc,
int *n,
int *c,
int *h,
int *w) {
return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
static inline cudnnStatus_t
cudnnGetConvolutionForwardAlgorithm(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t destDesc,
cudnnConvolutionFwdPreference_t preference,
size_t memoryLimitInbytes,
cudnnConvolutionFwdAlgo_t *algo) {
*algo = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensor4dDescriptor_t destDesc,
cudnnConvolutionFwdAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t destDesc,
void *destData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData,
convDesc, destDesc, destData,
r);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardFilter_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
r);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* This function needs the casting because its params are not
declared as const */
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
(cudnnTensorDescriptor_t)diffDesc,
diffData,
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
r);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't checked if this the right combination, but as we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif
#endif
import os
import numpy
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import (
DownsampleFactorMax, DownsampleFactorMaxGrad)
from . import pygpu, init_dev
from .basic_ops import (as_gpuarray_variable,
gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like)
from .conv import GpuConv
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge
# This is to avoid conflict with the one in cuda/dnn.py
if not hasattr(config, 'dnn'):
AddConfigVar('dnn.conv.workmem',
"Default value for the workmem attribute of cudnn "
"convolutions.",
EnumStr('small', 'none', 'large'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
def dnn_available():
if dnn_available.avail is not None:
return dnn_available.avail
if pygpu is None:
dnn_available.msg = "PyGPU not available"
dnn_available.avail = False
return False
if not init_dev.device.startswith('cuda'):
dnn_available.msg = "Not on a CUDA device. Got %s." % init_dev.device
dnn_available.avail = False
return False
# This is a hack because bin_id is in the from of
# "sm_<major><minor>" for cuda devices.
if pygpu.get_default_context().bin_id < 'sm_30':
dnn_available.msg = "Device not supported by cuDNN"
dnn_available.avail = False
preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
fprintf(stderr, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
return 1;
}
"""
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
comp, out, err = GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
if not dnn_available.avail:
dnn_available.msg = (
"Theano cannot compile with cuDNN. We got this error:\n" +
str(err))
else:
# If we can compile, check that we can import and run.
v = version()
if isinstance(v, tuple) and v[0] != v[1]:
dnn_available.avail = False
dnn_available.msg = ("Mixed dnn version. The header is"
" from one version, but we link with"
" a different version %s" % str(v))
raise RuntimeError(dnn_available.msg)
if version() == (20, 20):
dnn_available.avail = False
dnn_available.msg = (
"You have installed a release candidate of CuDNN v2."
" This isn't supported anymore."
" Update to CuDNN v2 final version.")
raise RuntimeError(dnn_available.msg)
return dnn_available.avail
dnn_available.avail = None
dnn_available.msg = None
def c_set_tensor4d(var, desc, err, fail):
return """
{
cudnnDataType_t dt;
size_t ds;
switch (%(var)s->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(%(var)s->ga.typecode);
int str0, str1, str2, str3;
// cudnn do not like 0s in strides
str3 = PyGpuArray_STRIDES(%(var)s)[3]?PyGpuArray_STRIDES(%(var)s)[3]/ds:1;
str2 = PyGpuArray_STRIDES(%(var)s)[2]?PyGpuArray_STRIDES(%(var)s)[2]/ds:PyGpuArray_DIMS(%(var)s)[3];
str1 = PyGpuArray_STRIDES(%(var)s)[1]?PyGpuArray_STRIDES(%(var)s)[1]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3];
str0 = PyGpuArray_STRIDES(%(var)s)[0]?PyGpuArray_STRIDES(%(var)s)[0]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3]*PyGpuArray_DIMS(%(var)s)[1];
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, dt,
PyGpuArray_DIMS(%(var)s)[0],
PyGpuArray_DIMS(%(var)s)[1],
PyGpuArray_DIMS(%(var)s)[2],
PyGpuArray_DIMS(%(var)s)[3],
str0, str1, str2, str3);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"could not set tensor4d descriptor: %%s",
cudnnGetErrorString(%(err)s));
%(fail)s
}
}
""" % dict(var=var, err=err, desc=desc, fail=fail)
class DnnBase(COp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
"""
# dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern.
check_broadcast = False
def __init__(self):
COp.__init__(self, "dnn_base.c")
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
'gpuarray_api.h', 'numpy_compat.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include(),
config.dnn.include_path]
def c_libraries(self):
return ['cudnn', 'gpuarray']
def c_lib_dirs(self):
return [config.dnn.library_path]
class DnnVersion(Op):
def c_headers(self):
return ['cudnn.h']
def c_header_dirs(self):
return [config.dnn.include_path]
def c_libraries(self):
return ['cudnn']
def c_lib_dirs(self):
return [config.dnn.library_path]
def c_support_code(self):
return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""
def make_node(self):
return Apply(self, [], [Generic()()])
def c_code(self, node, name, inputs, outputs, sub):
o = outputs[0]
return """
#if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
%(o)s = PyInt_FromLong(-1);
#endif
""" % locals()
def do_constant_folding(self, node):
# Needed as we do not want to cache this information.
return False
def c_code_cache_version(self):
# Not needed, but make it clear that we do not want to cache this.
return None
def version():
"""return the current cuDNN version we compile with.
This return a tuple with the header version and the library
version we link with. For older cudnn version without version
information, we return -1.
"""
if not dnn_available():
raise Exception(
"We can't determine the cudnn version as it is not available",
dnn_available.msg)
if version.v is None:
f = theano.function([], DnnVersion()(),
theano.Mode(optimizer=None),
profile=False)
version.v = f()
return version.v
version.v = None
class GpuDnnConvDesc(Op):
"""This Op builds a convolution descriptor for use in the other
convolution operations.
see the doc of :func:`dnn_conv` for a description of the parameters
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), config.dnn.include_path]
def c_libraries(self):
return ['cudnn']
def c_lib_dirs(self):
return [config.dnn.library_path]
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
if isinstance(border_mode, int):
border_mode = (border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w = map(int, border_mode)
border_mode = (pad_h, pad_w)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", an integer or a pair of'
' integers'.format(border_mode))
self.border_mode = border_mode
assert len(subsample) == 2
self.subsample = subsample
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
def make_node(self, img_shape, kern_shape):
if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
raise TypeError('img must be 1D shape tensor')
if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
raise TypeError('kern must be 1D shape tensor')
return Apply(self, [img_shape, kern_shape],
[CDataType("cudnnConvolutionDescriptor_t")()])
def c_code(self, node, name, inputs, outputs, sub):
img_shape, kern_shape = inputs
desc, = outputs
if isinstance(self.border_mode, tuple):
pad_h_spec, pad_w_spec = map(int, self.border_mode)
assert pad_h_spec >= 0 and pad_w_spec >= 0
bmode = 2
else:
pad_h_spec = pad_w_spec = 0
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION'
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
return """
{
cudnnStatus_t err;
int pad_h%(name)s;
int pad_w%(name)s;
if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
if (%(bmode)d == 2) {
pad_h%(name)s = %(pad_h_spec)d;
pad_w%(name)s = %(pad_w_spec)d;
} else if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 20
err = cudnnSetConvolution2dDescriptor(
%(desc)s,
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
#else
err = cudnnSetConvolutionDescriptorEx(
%(desc)s,
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
}
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
subsx=self.subsample[0], subsy=self.subsample[1],
pad_h_spec=pad_h_spec, pad_w_spec=pad_w_spec)
def c_code_cache_version(self):
return (1, version())
# scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float64'))
_one = constant(numpy.asarray(1.0, dtype='float64'))
def ensure_dt(val, default, name, dtype):
if val is None:
val = default.clone()
if not isinstance(val, Variable):
val = constant(val)
if hasattr(val, 'ndim') and val.ndim == 0:
val = as_scalar(val)
if not isinstance(val.type, theano.scalar.Scalar):
raise TypeError("%s: expected a scalar value" % (name,))
if not val.type.dtype == dtype:
val = val.astype(dtype)
return val
class GpuDnnConv(DnnBase, COp):
"""
The forward convolution.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('workmem', 'inplace')
def __init__(self, workmem=None, inplace=False):
"""
:param workmem: either 'none', 'small' or 'large'. Default is
the value of :attr:`config.dnn.conv.workmem`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
if workmem is None:
workmem = config.dnn.conv.workmem
self.workmem = workmem
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large']
def get_op_params(self):
if self.inplace:
inpl_def = [('CONV_INPLACE', '1')]
else:
inpl_def = []
if version() == -1:
alg_def = ('CONV_ALGO', "0")
else:
if self.workmem == 'none':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif self.workmem == 'small':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.workmem == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img)
kern = as_gpuarray_variable(kern)
output = as_gpuarray_variable(output)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be a 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
beta = ensure_dt(beta, _zero, 'beta', img.dtype)
return Apply(self, [img, kern, output, desc, alpha, beta],
[output.type()])
def grad(self, inp, grads):
img, kerns, output, desc, alpha, beta = inp
top, = grads
top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
@staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample):
"""
This function computes the output shape for a convolution with
the specified parameters. `ishape` and `kshape` can be symbolic
or scalar.
"""
b = ishape[0] # Number of inputs
h = ishape[2] # Height of input feature maps
w = ishape[3] # Width of input feature maps
nb = kshape[0] # Number of output feature maps
kh = kshape[2] # Height of each filter
kw = kshape[3] # Width of each filter
sh, sw = subsample
if border_mode == 'full':
padh = kh - 1
padw = kw - 1
elif isinstance(border_mode, tuple):
padh, padw = border_mode
else:
assert border_mode == 'valid'
padh = 0
padw = 0
return (
b, nb,
(h + 2 * padh - kh) // sh + 1,
(w + 2 * padw - kw) // sw + 1
)
def infer_shape(self, node, shape):
return [shape[2]]
class GpuDnnConvGradW(DnnBase, COp):
"""
The convolution gradient with respect to the weights.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('inplace',)
def __init__(self, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'inplace'):
self.inplace = False
def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp
kerns, = grads
kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
else:
return []
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img)
topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
beta = ensure_dt(beta, _zero, 'beta', img.dtype)
return Apply(self, [img, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
return [shape[2]]
class GpuDnnConvGradI(DnnBase):
"""
The convolution gradient with respect to the inputs.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('inplace',)
def __init__(self, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp
img, = grads
img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
else:
return []
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_gpuarray_variable(kern)
topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output)
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype)
beta = ensure_dt(beta, _zero, 'beta', kern.dtype)
return Apply(self, [kern, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
return [shape[2]]
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None):
"""
GPU convolution using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
:param img: images to do the convolution over
:param kerns: convolution filters
:param border_mode: one of 'valid', 'full'; additionally, the padding size
could be directly specified by an integer or a pair of integers
:param subsample: perform subsampling of the output (default: (1, 1))
:param conv_mode: perform convolution (kernels flipped) or cross-correlation.
One of 'conv', 'cross'. (default: 'conv')
:param direction_hint: Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1,1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW.
If border_mode is 'full', subsample is (1,1) and direction_hint is
*not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned.
:param workmem: Specify the amount of working memory allowed.
More memory is usually faster. One of 'none', 'small' or
'large'. (default is None which takes its value from
:attr:`config.dnn.conv.workmem`)
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
work with this Op.
"""
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1, 1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
if conv_mode == 'conv':
# We need to flip manually. These 'kerns' are not the kernels
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3))
elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!'):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
img = gpu_contiguous(img) # cudnn v1 and v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy
# if the img contains negative strides
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out = GpuAllocEmpty(img.dtype)(*out_shp)
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
pooling operations.
:param ws: windows size
:param stride: (dx, dy)
:param mode: 'max', 'average_inc_pad' or 'average_exc_pad'
The old deprecated name 'average' correspond to 'average_inc_pad'
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
"""
__props__ = ('ws', 'stride', 'mode', 'pad')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), config.dnn.include_path]
def c_libraries(self):
return ['cudnn']
def c_lib_dirs(self):
return [config.dnn.library_path]
def do_constant_folding(self, node):
return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
assert len(ws) == 2
self.ws = ws
assert len(stride) == 2
self.stride = stride
assert len(stride) == 2
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'pad'):
self.pad = (0, 0)
def make_node(self):
if self.pad != (0, 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
return Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t")()])
def c_code(self, node, name, inputs, outputs, sub):
desc, = outputs
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
return """
{
cudnnStatus_t err;
if ((err = cudnnCreatePoolingDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling "
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
#ifndef CUDNN_VERSION
err = cudnnSetPoolingDescriptor(
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
%(stridex)d, %(stridey)d
);
#else
err = cudnnSetPooling2dDescriptor(
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
%(padX)d, %(padY)d,
%(stridex)d, %(stridey)d
);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
}
}
""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
wsX=self.ws[0], wsY=self.ws[1],
stridex=self.stride[0], stridey=self.stride[1],
padX=self.pad[0], padY=self.pad[1])
def c_code_cache_version(self):
return (2, version())
class GpuDnnPool(DnnBase):
"""
Pooling.
:param img: the image 4d tensor.
:param desc: the pooling descriptor.
"""
__props__ = ()
def make_node(self, img, desc):
img = as_gpuarray_variable(img)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [img, desc],
[img.type()])
def infer_shape(self, node, shape):
desc = node.inputs[1].owner.op
kh, kw = desc.ws
sh, sw = desc.stride
padh, padw = desc.pad
return [(
shape[0][0],
shape[0][1],
(shape[0][2] + 2 * padh - kh) // sh + 1,
(shape[0][3] + 2 * padw - kw) // sw + 1
)]
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t output%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
output%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[1]
out, = outputs
set_in = c_set_tensor4d(inputs[0], "input" + str(name),
'err' + name, sub['fail'])
set_out = c_set_tensor4d(out, "output" + str(name),
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
size_t %(out)s_dims[4];
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
%(set_in)s
cudnnPoolingMode_t mode;
int wsX, wsY, vpad, hpad, strideX, strideY;
#ifndef CUDNN_VERSION
err%(name)s = cudnnGetPoolingDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&strideX, &strideY);
#else
err%(name)s = cudnnGetPooling2dDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&vpad, &hpad,
&strideX, &strideY);
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnGetPoolingDescriptor operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
%(out)s_dims[0] = PyGpuArray_DIMS(%(input)s)[0];
%(out)s_dims[1] = PyGpuArray_DIMS(%(input)s)[1];
%(out)s_dims[2] = (PyGpuArray_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
%(out)s_dims[3] = (PyGpuArray_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;
if (theano_prep_output(&%(out)s, 4, %(out)s_dims, %(input)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
%(fail)s
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = cudnnPoolingForward(
_handle,
%(desc)s,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
);
#else
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingForward(
_handle,
%(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
&beta,
%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(out=out, desc=desc, fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inputs[0],
input_desc="input" + name,
output_desc="output" + name)
def grad(self, inp, grads):
img, desc = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, desc)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
return g_out, theano.gradient.DisconnectedType()()
def connection_pattern(self, node):
# not connected to desc
return [[1], [0]]
def c_code_cache_version(self):
return (7, version())
class GpuDnnPoolGrad(DnnBase):
"""
The pooling gradient.
:param inp: the input of the pooling.
:param out: the output of the pooling in the forward.
:param inp_grad: same size as out, but is the corresponding gradient information.
:param desc: The pooling descriptor.
"""
__props__ = ()
def make_node(self, inp, out, inp_grad, desc):
inp = as_gpuarray_variable(inp)
if inp.type.ndim != 4:
raise TypeError('inp must be 4D tensor')
inp_grad = as_gpuarray_variable(inp_grad)
if inp_grad.type.ndim != 4:
raise TypeError('inp_grad must be 4D tensor')
out = as_gpuarray_variable(out)
if out.type.ndim != 4:
raise TypeError('out must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()])
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t input_grad%(name)s;
cudnnTensorDescriptor_t output%(name)s;
cudnnTensorDescriptor_t output_grad%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
input_grad%(name)s = NULL;
output%(name)s = NULL;
output_grad%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&input_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub):
# Here the name out and inp are based on the cudnn definition.
# Not the definition of this class.
# This make it complicated.
out, inp, inp_grad, desc = inputs
out_grad, = outputs
set_in = "\n".join([
c_set_tensor4d(inp, "input" + name,
'err' + name, sub['fail']),
c_set_tensor4d(inp_grad, "input_grad" + name,
'err' + name, sub['fail']),
c_set_tensor4d(out, "output" + name,
'err' + name, sub['fail'])
])
set_out = c_set_tensor4d(out, "output_grad" + name,
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous inputs are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(input_grad)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous input gradients are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(output)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous outputs are supported.");
%(fail)s
}
%(set_in)s
if (theano_prep_output(&%(output_grad)s, PyGpuArray_NDIM(%(output)s),
PyGpuArray_DIMS(%(output)s), %(output)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0)
{
%(fail)s
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = cudnnPoolingBackward(
_handle,
%(desc)s,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
);
#else
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingBackward(
_handle,
%(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
&beta,
%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s.",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc,
fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inp, input_grad=inp_grad, output=out,
input_desc="input" + name,
input_grad_desc="input_grad" + name,
output_desc="output" + name,
output_grad_desc="output_grad" + name)
def c_code_cache_version(self):
return (5, version())
def infer_shape(self, node, shape):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
GPU pooling using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
:param img: images to do the pooling over
:param ws: subsampling window size
:param stride: subsampling stride (default: (1, 1))
:param mode: one of 'max', 'average_inc_pad' or 'average_exc_pad
(default: 'max')
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
work with this Op.
:note: This Op implements the ignore_border=True of max_pool_2d.
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
return GpuDnnPool()(img, desc)
class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
__props__ = ('tensor_format', 'mode', 'algo')
def __init__(self, tensor_format, algo, mode):
assert(tensor_format in ('bc01', 'b01c'))
DnnBase.__init__(self)
self.tensor_format = tensor_format
assert(algo in ('fast', 'accurate'))
self.algo = algo
assert(mode in ('instance', 'channel'))
self.mode = mode
self.tensor_4d_descs = [softmax_input
for softmax_input in self.softmax_inputs]
self.tensor_4d_descs.append('softmax_output')
def infer_shape(self, node, shape):
if self.direction == 'forward':
return [shape[0]]
else:
return [shape[1]]
def _define_tensor4d_desc(self, name, id):
return """
cudnnTensorDescriptor_t %(id)s_%(name)s;
""" % dict(name=name, id=id)
def _init_tensor4d_desc(self, name, id, fail):
return """
%(id)s_%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
": %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, id=id, fail=fail)
def _clean_tensor4d_desc(self, name, id):
return """
if(%(id)s_%(name)s!= NULL)
cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
""" % dict(name=name, id=id)
def c_support_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
result += self._define_tensor4d_desc(name, id)
return result
def c_init_code_struct(self, node, name, sub):
result = """
cudnnStatus_t err%(name)s;
""" % dict(name=name)
for id in self.tensor_4d_descs:
result += self._init_tensor4d_desc(name, id, sub['fail'])
return result
def c_cleanup_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
result += self._clean_tensor4d_desc(name, id)
return result
def c_code(self, node, name, inputs, outputs, sub):
ins = inputs
outs, = outputs
if self.tensor_format == 'b01c':
tensor_format = 1
else:
tensor_format = 0
if self.mode == 'instance':
mode = 1
else:
mode = 0
if self.algo == 'fast':
algo = 1
else:
algo = 0
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(name)s = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
format%(name)s = CUDNN_TENSOR_NHWC;
cudnnSoftmaxAlgorithm_t algo%(name)s = CUDNN_SOFTMAX_ACCURATE;
if (%(algo)d == 1)
algo%(name)s = CUDNN_SOFTMAX_FAST;
cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(name=name, tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs):
result += c_set_tensor4d(ins[input_idx], input_name + "_" + name,
"err" + name, sub['fail'])
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
name=name)
for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input
subs['ins%d' % idx] = inputs[idx]
# Build and prepare the output variable.
result += """
if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
PyGpuArray_DIMS(%(ins)s), %(ins)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0)
{
%(fail)s
}
""" % subs
result += c_set_tensor4d(outs,
"softmax_output_" + name,
"err" + name, sub['fail'])
# Add on a call to the method that does the actual work.
result += self.method() % subs
return result
def c_code_cache_version(self):
return (0, 7, version())
def method(self):
raise NotImplementedError('GpuDnnSoftmaxBase::method')
class GpuDnnSoftmax(GpuDnnSoftmaxBase):
"""
Op for the cuDNN Softmax.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
direction = 'forward'
softmax_inputs = ['softmax_input']
def make_node(self, x):
x = as_gpuarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def method(self):
return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(name)s,
mode%(name)s,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(name)s,
mode%(name)s,
(void*) &alpha,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
#endif
"""
def grad(self, inp, grads):
x, = inp
g_sm, = grads
sm = self.make_node(x).outputs[0]
return [GpuDnnSoftmaxGrad(
self.tensor_format,
self.algo,
self.mode
)(g_sm, sm)]
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
"""
Op for the cuDNN SoftmaxGrad.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
direction = 'backward'
softmax_inputs = ['softmax_gout', 'softmax_input']
def make_node(self, dy, sm):
dy = as_gpuarray_variable(dy)
sm = as_gpuarray_variable(sm)
assert dy.ndim == 4
assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type.make_variable()])
def method(self):
return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(name)s,
mode%(name)s,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(name)s,
mode%(name)s,
(void*) &alpha,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
#endif
"""
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv])
def local_conv_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv):
if node.op.border_mode not in ['full', 'valid']:
return
img, kern = node.inputs
border_mode = node.op.border_mode
subsample = node.op.subsample
direction_hint = node.op.direction_hint
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)
return [rval]
# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv):
border_mode = node.op.border_mode
subsample = node.op.subsample
if border_mode not in ['full', 'valid'] or subsample != (1, 1):
return
img, kern = node.inputs
direction_hint = node.op.direction_hint
if border_mode == 'full':
# for a full convolution, try using the forward pass instead
# of the backward pass wrt. inputs
direction_hint = 'forward!'
elif border_mode == 'valid':
# for a valid convolution, try using the backward pass wrt.
# weights instead of the forward pass and vice versa
if direction_hint == 'bprop weights':
direction_hint = 'forward'
else:
direction_hint = 'bprop weights'
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)
if node.outputs[0].broadcastable != rval.broadcastable:
rval = tensor.patternbroadcast(
rval, node.outputs[0].type.broadcastable)
return [rval]
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True)
def local_dnn_conv_inplace(node):
if type(node.op) != GpuDnnConv or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
if type(node.op) != GpuDnnConvGradW or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
if type(node.op) != GpuDnnConvGradI or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn')
@op_lifter([DownsampleFactorMax])
def local_pool_dnn_alternative(node):
if not dnn_available():
return
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
stride = node.op.st
pad = node.op.padding
mode = node.op.mode
return dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ds, stride=stride, pad=pad, mode=mode)
@register_opt('cudnn')
@op_lifter([DownsampleFactorMaxGrad])
def local_pool_dnn_grad_stride(node):
if not dnn_available():
return
if not node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
st = node.op.st
pad = node.op.padding
mode = node.op.mode
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
return GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuSoftmax):
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins)
out = as_gpuarray_variable(out.dimshuffle(0, 1))
return [out]
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used"""
if not dnn_available():
# Make an assert error as we want Theano to fail, not
# just skip this optimization.
raise AssertionError(
"cuDNN optimization was enabled, but Theano was not able"
" to use it. We got this error: \n" +
dnn_available.msg)
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn')
@op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node):
if not dnn_available():
return
ins = []
for n in node.inputs:
if isinstance(n.owner.op, HostFromGpu):
n = n.owner.inputs[0]
if n.ndim != 2:
return
ins.append(n.dimshuffle(0, 1, 'x', 'x'))
out = GpuDnnSoftmaxGrad('bc01', 'accurate', 'channel')(
gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
return [out.dimshuffle(0, 1)]
#section support_code
static cudnnHandle_t _handle = NULL;
static int
c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
cudnnDataType_t dt;
size_t ds;
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(var->ga.typecode);
int str0, str1, str2, str3;
// cudnn do not like 0s in strides
str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3];
str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3];
str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1];
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
desc, dt,
PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1),
PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3),
str0, str1, str2, str3);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
cudnnDataType_t dt;
if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
PyErr_SetString(PyExc_ValueError,
"Only contiguous filters (kernels) are supported.");
return -1;
}
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
return -1;
}
cudnnStatus_t err = cudnnSetFilter4dDescriptor(
desc, dt,
PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1],
PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s.",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
#section init_code
{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
#if PY_MAJOR_VERSION >= 3
return NULL;
#else
return;
#endif
}
}
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code_struct
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
{
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CONV_ALGO,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
/*
* This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module
* to place a nice get_work_mem() function in.
*/
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionForward(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, CONV_ALGO,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
if (worksize != 0)
c->ops->buffer_release(workspace);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (im->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
err = cudnnConvolutionBackwardData(
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
err = cudnnConvolutionBackwardFilter(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
...@@ -42,4 +42,9 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out, ...@@ -42,4 +42,9 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
return out; return out;
} }
/* This is guaranteed to work and return the raw CUDA/OpenCL object on
* all recent (as of June 2015) version of libgpuarray. This is also
* promised to keep working in future versions. */
#define PyGpuArray_DEV_DATA(ary) (*(void **)((ary)->ga.data))
#endif #endif
...@@ -12,11 +12,13 @@ from theano import tensor, scalar, gof ...@@ -12,11 +12,13 @@ from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host, from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
...@@ -39,6 +41,10 @@ gpu_cut_copies = EquilibriumDB() ...@@ -39,6 +41,10 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'inplace', 'gpuarray') 'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
...@@ -689,6 +695,9 @@ def local_gpu_conv(node): ...@@ -689,6 +695,9 @@ def local_gpu_conv(node):
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
# Register this here so that it goes after 'local_gpu_conv'
register_opt()(conv_groupopt)
@register_opt("low_memory") @register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda]) @local_optimizer([GpuCAReduceCuda])
......
...@@ -7,10 +7,10 @@ from theano.gof import local_optimizer ...@@ -7,10 +7,10 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu from .basic_ops import GpuFromHost, HostFromGpu
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32')) _one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd): def grab_cpu_scalar(v, nd):
...@@ -18,10 +18,10 @@ def grab_cpu_scalar(v, nd): ...@@ -18,10 +18,10 @@ def grab_cpu_scalar(v, nd):
n = v.owner n = v.owner
if (isinstance(n.op, GpuDimShuffle) and if (isinstance(n.op, GpuDimShuffle) and
n.op.new_order == ('x',) * nd): n.op.new_order == ('x',) * nd):
return host_from_gpu(n.inputs[0]) return grab_cpu_scalar(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd): n.op.new_order == ('x',) * nd):
return n.inputs[0] return grab_cpu_scalar(n.inputs[0])
elif isinstance(n.op, GpuFromHost): elif isinstance(n.op, GpuFromHost):
return grab_cpu_scalar(n.inputs[0], nd=nd) return grab_cpu_scalar(n.inputs[0], nd=nd)
else: else:
...@@ -37,7 +37,7 @@ def find_node(v, cls, ignore_clients=False): ...@@ -37,7 +37,7 @@ def find_node(v, cls, ignore_clients=False):
# that has the op class specified. If ignore_clients is False (the # that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single # default) it will only dig through nodes that have a single
# client. # client.
if v.owner is not None and (ignore_clients or v.clients == 1): if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls): if isinstance(v.owner.op, cls):
return v.owner return v.owner
elif (isinstance(v.owner.op, GpuFromHost) and elif (isinstance(v.owner.op, GpuFromHost) and
......
import logging
from nose.plugins.skip import SkipTest
import numpy
from itertools import product
import theano
from six import StringIO
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
from .. import dnn
from ..basic_ops import GpuAllocEmpty
from .test_basic_ops import mode_with_gpu, mode_without_gpu
from . import test_nnet
def test_dnn_conv_desc_merge():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img_shp = T.as_tensor_variable(
numpy.asarray([2, 1, 8, 8]).astype('int64'))
kern_shp = T.as_tensor_variable(
numpy.asarray([3, 1, 2, 2]).astype('int64'))
desc1 = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(2, 2),
conv_mode='conv')(img_shp, kern_shp)
desc2 = dnn.GpuDnnConvDesc(border_mode='full', subsample=(1, 1),
conv_mode='cross')(img_shp, kern_shp)
# CDataType is not DeepCopyable so this will crash if we don't use
# borrow=True
f = theano.function([], [theano.Out(desc1, borrow=True),
theano.Out(desc2, borrow=True)])
d1, d2 = f()
# This will be the case if they are merged, which would be bad.
assert d1 != d2
def test_dnn_conv_merge():
# This test that we merge correctly multiple dnn_conv.
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6]
img = T.ftensor4('img')
kern = T.ftensor4('kern')
out = T.ftensor4('out')
desc = dnn.GpuDnnConvDesc(
border_mode='valid')(img.shape, kern.shape)
# Test forward op
o1 = dnn.dnn_conv(img, kern)
o2 = dnn.dnn_conv(img, kern)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
numpy.random.rand(*kern_shp).astype('float32'))
topo = f.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]) == 1
# Test grad w op
o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradW)]) == 1
# Test grad i op
o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradI)]) == 1
def test_dnn_conv_inplace():
"""This test that we have inplace work correctly even when
GpuAllocEmpty get merged together.
"""
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6]
img = T.ftensor4('img')
kern = T.ftensor4('kern')
out = T.ftensor4('out')
desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')(
img.shape, kern.shape)
desc2 = dnn.GpuDnnConvDesc(
border_mode='valid', conv_mode='cross')(img.shape, kern.shape)
# Test forward op
o1 = dnn.dnn_conv(img, kern, conv_mode='conv')
o2 = dnn.dnn_conv(img, kern, conv_mode='cross')
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
numpy.random.rand(*kern_shp).astype('float32'))
topo = f.maker.fgraph.toposort()
convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]
assert len(convs) == 2
assert all([node.op.inplace for node in convs])
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad w op
out = GpuAllocEmpty(kern.dtype)(*kern.shape)
o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradW)]
assert len(convs) == 2
assert all([node.op.inplace for node in convs])
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad i op
out = GpuAllocEmpty(img.dtype)(*img.shape)
o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradI)]
assert len(convs) == 2
assert all([node.op.inplace for node in convs])
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
def pool_2d_i2n(input, ds=(2, 2), strides=None,
pad=(0, 0),
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
if strides[0] > ds[0] or strides[1] > ds[1]:
raise RuntimeError(
"strides should be smaller than or equal to ds,"
" strides=(%d, %d) and ds=(%d, %d)" %
(strides + ds))
shape = input.shape
if pad != (0, 0):
assert pool_function is T.max
pad_x = pad[0]
pad_y = pad[1]
a = T.alloc(-numpy.inf, shape[0], shape[1], shape[2] + pad_x * 2,
shape[3] + pad_y * 2)
input = T.set_subtensor(a[:, :,
pad_x:pad_x + shape[2],
pad_y:pad_y + shape[3]],
input)
shape = input.shape
neibs = images2neibs(input, ds, strides, mode=mode)
pooled_neibs = pool_function(neibs, axis=1)
output_width = (shape[2] - ds[0]) // strides[0] + 1
output_height = (shape[3] - ds[1]) // strides[1] + 1
pooled_output = pooled_neibs.reshape((shape[0], shape[1],
output_width, output_height))
return pooled_output
def test_pooling():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
x = T.ftensor4()
for mode, pad in product(('max', 'average_inc_pad', 'average_exc_pad'),
((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
if mode == 'max':
func = T.max
else:
func = T.mean
if pad != (0, 0) and dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean:
continue
for ws in (4, 2, 5):
for stride in (2, 3):
if stride > ws:
continue
if pad[0] > stride or pad[1] > stride:
# Not implemented
continue
# We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws),
st=(stride, stride),
ignore_border=True,
padding=pad, mode=mode)
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pad=pad,
pool_function=func)
mode_without_gpu2 = mode_without_gpu.including()
mode_without_gpu2.check_isfinite = False
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_without_gpu2)
assert not any([isinstance(node.op, dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad
for shp in [(1, 1, 2, 2),
(1, 1, 3, 3)]:
data = numpy.random.normal(0, 1, shp).astype("float32") * 10
ws = 2
stride = 2
if pad[0] > stride or pad[1] > stride:
# Not implemented
continue
# This test the CPU grad + opt + GPU implemtentation
def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True,
padding=pad, mode=mode)
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that the opt would have inserted it.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
pad=pad,
mode=mode)
return dnn_op
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
g_out = fg(data)
# Compare again the CPU result
out = max_pool_2d(x, (ws, ws),
padding=pad,
ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
assert any([isinstance(node.op, DownsampleFactorMaxGrad)
for node in fc.maker.fgraph.toposort()])
c_out = fc(data)
assert numpy.allclose(c_out, g_out)
def test_pooling_opt():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
x = T.ftensor4()
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
f = theano.function(
[x],
T.grad(max_pool_2d(x, ds=(2, 2), ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, dnn.GpuDnnPoolGrad)
for n in f.maker.fgraph.toposort()])
def test_dnn_tag():
"""
Test that if cudnn isn't avail we crash and that if it is avail, we use it.
"""
x = T.ftensor4()
old = theano.config.on_opt_error
theano.config.on_opt_error = "raise"
sio = StringIO()
handler = logging.StreamHandler(sio)
logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler)
# Silence original handler when intentionnally generating warning messages
logging.getLogger('theano').removeHandler(theano.logging_default_handler)
raised = False
try:
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu.including("cudnn"))
except (AssertionError, RuntimeError):
assert not dnn.dnn_available()
raised = True
finally:
theano.config.on_opt_error = old
logging.getLogger(
'theano.compile.tests.test_dnn').removeHandler(handler)
logging.getLogger('theano').addHandler(theano.logging_default_handler)
if not raised:
assert dnn.dnn_available()
assert any([isinstance(n.op, dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
class TestDnnInferShapes(utt.InferShapeTester):
def setUp(self):
super(TestDnnInferShapes, self).setUp()
self.mode = mode_with_gpu
def test_softmax(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
t = T.ftensor4('t')
rand_tensor = numpy.asarray(
numpy.random.rand(5, 4, 3, 2),
dtype='float32'
)
self._compile_and_check(
[t],
[dnn.GpuDnnSoftmax('bc01', 'accurate', 'channel')(t)],
[rand_tensor],
dnn.GpuDnnSoftmax
)
self._compile_and_check(
[t],
[
T.grad(
dnn.GpuDnnSoftmax(
'bc01',
'accurate',
'channel'
)(t).mean(),
t
)
],
[rand_tensor],
dnn.GpuDnnSoftmaxGrad
)
def test_conv(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(7, 2, 6, 4),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(8, 2, 4, 3),
dtype='float32'
)
for params in product(
['valid', 'full'],
[(1, 1), (2, 2)],
['conv', 'cross']
):
out_vals = numpy.zeros(
dnn.GpuDnnConv.get_out_shape(img_val.shape, kern_vals.shape,
border_mode=params[0],
subsample=params[1]),
dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(img.shape, kerns.shape)
conv = dnn.GpuDnnConv()(img, kerns, out, desc)
self._compile_and_check(
[img, kerns, out],
[conv],
[img_val, kern_vals, out_vals],
dnn.GpuDnnConv
)
def test_conv_gradw(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(2, 5, 6, 8),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(2, 1, 5, 6),
dtype='float32'
)
for params in product(
['valid', 'full'],
[(1, 1)], # strides besides (1, 1)
['conv', 'cross']
):
temp_img = img.dimshuffle(1, 0, 2, 3)
temp_kerns = kerns
if params[2] == 'conv':
temp_kerns = temp_kerns[:, :, ::-1, ::-1]
temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
shape = (
kern_vals.shape[1], img_val.shape[1],
img_val.shape[2] - kern_vals.shape[2] + 1,
img_val.shape[3] - kern_vals.shape[3] + 1
)
out_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(temp_img.shape, out.shape)
conv_grad_w = dnn.GpuDnnConvGradW()(
temp_img,
temp_kerns,
out,
desc,
)
self._compile_and_check(
[temp_img, temp_kerns, out],
[conv_grad_w],
[img_val, kern_vals, out_vals],
dnn.GpuDnnConvGradW
)
def test_conv_gradi(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(3, 4, 5, 6),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(13, 14, 15, 16),
dtype='float32'
)
for params in product(
['valid'], # Should this work for 'full'?
[(1, 1)],
['conv', 'cross']
):
temp_kerns = kerns.dimshuffle(1, 0, 2, 3)
shape = (
img_val.shape[0], kern_vals.shape[1],
img_val.shape[2] + kern_vals.shape[2] - 1,
img_val.shape[3] + kern_vals.shape[3] - 1
)
out_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(out.shape, temp_kerns.shape)
conv_grad_i = dnn.GpuDnnConvGradI()(
temp_kerns,
img,
out,
desc,
)
self._compile_and_check(
[temp_kerns, img, out],
[conv_grad_i],
[kern_vals, img_val, out_vals],
dnn.GpuDnnConvGradI
)
def test_pool(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
img_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad', 'average_exc_pad']
):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
self._compile_and_check(
[img],
[dnn.GpuDnnPool()(img, desc)],
[img_val],
dnn.GpuDnnPool
)
def test_pool_grad(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
img_grad = T.ftensor4('img_grad')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
img_grad_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
out_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad']
):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
pool_grad = dnn.GpuDnnPoolGrad()(
img,
out,
img_grad,
desc
)
self._compile_and_check(
[img, img_grad, out],
[pool_grad],
[img_val, img_grad_val, out_val],
dnn.GpuDnnPoolGrad
)
# this has been a problem in the past
def test_dnn_conv_border_mode():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
dnn.dnn_conv(img, kern, border_mode=1)
dnn.dnn_conv(img, kern, border_mode=(2, 3))
dnn.dnn_conv(img, kern, border_mode='full')
dnn.dnn_conv(img, kern, border_mode='valid')
def test_dnn_conv_alpha_output_merge():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
out = T.ftensor4()
b = 1
c = 4
f = 3
ih = 5
iw = 8
kh = 2
kw = 6
img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
out_val = numpy.random.random((b, f, ih - kh + 1,
iw - kw + 1)).astype('float32')
conv = dnn.dnn_conv(img, kern)
gw = theano.grad(conv.sum(), kern)
gi = theano.grad(conv.sum(), img)
lr = numpy.asarray(0.05, dtype='float32')
if dnn.version() == -1:
# Can't merge alpha with cudnn v1
fr = conv + out
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
dnn.GpuDnnConv)
assert isinstance(f1.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradW)
assert isinstance(f1.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradI)
mode = mode_with_gpu
mode = mode.excluding('local_dnn_conv_alpha_merge')
mode = mode.excluding('local_dnn_convw_alpha_merge')
mode = mode.excluding('local_dnn_convi_alpha_merge')
mode = mode.excluding('local_dnn_conv_output_merge')
mode = mode.excluding('local_dnn_convw_output_merge')
mode = mode.excluding('local_dnn_convi_output_merge')
f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)
assert not isinstance(f2.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
dnn.GpuDnnConv)
assert not isinstance(f2.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradW)
assert not isinstance(f2.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradI)
out_f1 = f1(img_val, kern_val, out_val)
out_f2 = f2(img_val, kern_val, out_val)
assert len(out_f1) == len(out_f2)
for v1, v2 in zip(out_f1, out_f2):
utt.assert_allclose(v1, v2)
def test_dnn_conv_grad():
if not dnn.dnn_available() or dnn.version() == -1:
raise SkipTest('alpha != 1.0 not supported in cudnn v1')
b = 1
c = 4
f = 3
ih = 2
iw = 8
kh = 2
kw = 2
img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
out_val = numpy.random.random((b, f, ih - kw + 1,
iw - kw + 1)).astype('float32')
def dconv(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)
def dconvi(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
beta=0.0)
def dconvw(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
beta=-1.0)
utt.verify_grad(dconv, [img_val, kern_val, out_val])
utt.verify_grad(dconvi, [img_val, kern_val, out_val])
utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_version():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
assert isinstance(dnn.version(), (int, tuple))
class test_SoftMax(test_nnet.test_SoftMax):
gpu_op = dnn.GpuDnnSoftmax
gpu_grad_op = dnn.GpuDnnSoftmaxGrad
mode = mode_with_gpu
def test_softmax_shape_0(self):
raise SkipTest("Cudnn do not suport 0 shapes")
def test_softmax_grad(self):
def cmp(n, m, f, f_gpu):
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
gdata = numpy.asarray(data)[:, :, None, None]
out = f(data)
gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
x = T.matrix('x', 'float32')
x_gpu = T.tensor4('x_gpu', 'float32')
f_z = T.nnet.softmax_op
f_gpu = dnn.GpuDnnSoftmax(
'bc01',
'accurate',
'channel'
)
# Verify the grad operation
dims = (2, 3, 4, 5)
gdata = numpy.arange(
numpy.product(dims),
dtype='float32'
).reshape(dims)
T.verify_grad(f_gpu, [gdata], rng=numpy.random,
mode=mode_with_gpu)
# Verify that the CPU and GPU implementations return the same results
# up to a tolerance.
self._test_softmax(
x,
x_gpu,
f_z,
f_gpu,
cmp
)
self._test_softmax(
x, x, f_z, f_z, self._cmp
)
# Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
# optimization is applied when cudnn is required
y = T.fvector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_with_gpu
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
self.gpu_grad_op
)]) == 1)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
# Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
# optimization is not applied when cudnn is excluded or not
# available
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
y = T.fvector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_wo_cudnn
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
self.gpu_grad_op
)]) == 0)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 1)
# Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not
# crash with manual graph
y = T.fvector('y')
o = theano.tensor.nnet.SoftmaxGrad()(y, y*2)
f = theano.function([y], o, mode=mode_with_gpu)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
self.gpu_grad_op
)]) == 1)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
from __future__ import print_function from __future__ import print_function
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy import numpy
import unittest
import theano import theano
import theano.tensor as T import theano.tensor as T
...@@ -11,12 +12,13 @@ from theano.sandbox import gpuarray ...@@ -11,12 +12,13 @@ from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed. # We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu, from .test_basic_ops import (mode_with_gpu,
mode_without_gpu) mode_without_gpu)
from ..nnet import ( from ..nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax)
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
""" """
...@@ -290,3 +292,96 @@ def softmax_unittest_template(dtypeInput): ...@@ -290,3 +292,96 @@ def softmax_unittest_template(dtypeInput):
cmp(2, 10000) cmp(2, 10000)
cmp(128, 16 * 1024) cmp(128, 16 * 1024)
cmp(128, 64 * 1024) cmp(128, 64 * 1024)
class test_SoftMax(unittest.TestCase):
gpu_op = GpuSoftmax
mode = mode_wo_cudnn
def _test_softmax(
self,
x,
x_gpu,
f_z,
f_gpu_z,
cmp
):
"""
This is basic test for GpuSoftmax and GpuDnnSoftmax
We check that we loop when there is too much block
We use slower code when there isn't enough shared memory
"""
f_z_out = f_z(x)
f_gpu_z_out = f_gpu_z(x_gpu)
f = theano.function([x], f_z_out, mode=mode_without_gpu)
f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=self.mode)
self._check_types(f, f_gpu, T.nnet.Softmax, self.gpu_op)
# we need to test n>32*1024 to check that we make the block loop.
cmp(1, 5, f, f_gpu)
cmp(2, 5, f, f_gpu)
cmp(10, 5, f, f_gpu)
cmp(100, 5, f, f_gpu)
cmp(1000, 5, f, f_gpu)
cmp(10000, 5, f, f_gpu)
cmp(4074, 400, f, f_gpu)
cmp(784, 784, f, f_gpu)
cmp(4, 1000, f, f_gpu)
cmp(4, 1024, f, f_gpu)
cmp(4, 2000, f, f_gpu)
cmp(4, 2024, f, f_gpu)
# The GTX285 don't have enough shared memory.
cmp(4, 4074, f, f_gpu)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000, f, f_gpu)
cmp(128, 16 * 1024, f, f_gpu)
cmp(128, 64 * 1024, f, f_gpu)
# cudnn permits no more than 2^15 - 1 rows
cmp((2 << 15) - 1, 5, f, f_gpu)
cmp(5, 2 << 15, f, f_gpu)
return f, f_gpu
def _cmp(self, n, m, f, f_gpu):
# print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
def _check_types(self, graph, graph_gpu, f_type, f_gpu_type):
assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
assert len([node for node in graph_gpu.maker.fgraph.toposort()
if isinstance(node.op, f_gpu_type)]) == 1
def test_softmax(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu)
def test_softmax_shape_0(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
# Theano can handle that case, but cudnn can't
self._cmp(0, 10, f, f_gpu)
...@@ -593,7 +593,7 @@ def get_scalar_constant_value(orig_v, elemwise=True, ...@@ -593,7 +593,7 @@ def get_scalar_constant_value(orig_v, elemwise=True,
# mess with the stabilization optimization and be too slow. # mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice() # We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly. # to allow it to determine the broadcast pattern correctly.
elif isinstance(v.owner.op, ScalarFromTensor): elif isinstance(v.owner.op, (ScalarFromTensor, TensorFromScalar)):
return get_scalar_constant_value(v.owner.inputs[0]) return get_scalar_constant_value(v.owner.inputs[0])
elif isinstance(v.owner.op, scal.ScalarOp): elif isinstance(v.owner.op, scal.ScalarOp):
if isinstance(v.owner.op, scal.Second): if isinstance(v.owner.op, scal.Second):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论