提交 72623e6c authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6069 from abergeron/dnn_redux

Add reduction based on cudnn.
...@@ -243,10 +243,8 @@ def test_badoptimization(): ...@@ -243,10 +243,8 @@ def test_badoptimization():
def test_badoptimization_opt_err(): def test_badoptimization_opt_err():
"""This variant of test_badoptimization() replace the working code # This variant of test_badoptimization() replace the working code
with a new apply node that will raise an error. # with a new apply node that will raise an error.
"""
@gof.local_optimizer([theano.tensor.add]) @gof.local_optimizer([theano.tensor.add])
def insert_bigger_b_add(node): def insert_bigger_b_add(node):
if node.op == theano.tensor.add: if node.op == theano.tensor.add:
......
...@@ -634,7 +634,7 @@ class _make_cdata(Op): ...@@ -634,7 +634,7 @@ class _make_cdata(Op):
""" % dict(ctype=self.rtype.ctype, out=outputs[0], inp=inputs[0]) """ % dict(ctype=self.rtype.ctype, out=outputs[0], inp=inputs[0])
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (0, self.rtype.version)
class CDataType(Type): class CDataType(Type):
......
...@@ -34,9 +34,6 @@ class CuDNNV51(object): ...@@ -34,9 +34,6 @@ class CuDNNV51(object):
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'), cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'), ('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'), ('CUDNN_DATA_HALF', 'float16'),
# CUDNN_DATA_INT8 # new in v6
# CUDNN_DATA_INT32 # new in v6
# CUDNN_DATA_INT8x4 # new in v6
ctype='cudnnDataType_t') ctype='cudnnDataType_t')
cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'), cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
...@@ -91,11 +88,24 @@ class CuDNNV51(object): ...@@ -91,11 +88,24 @@ class CuDNNV51(object):
cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'), cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
('CUDNN_BATCHNORM_SPATIAL', 'spatial'), ('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
ctype='cudnnBatchNormMode_t') ctype='cudnnBatchNormMode_t')
# It was introduced in cudnnv6, but we need to define it with an
# empty list of enum to don't crash with cudnn 5
cudnnReduceTensorOp_t = CEnumType()
class CuDNNV6(CuDNNV51): class CuDNNV6(CuDNNV51):
version = 6 version = 6
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'),
# new in v6
('CUDNN_DATA_INT8', 'int8'),
('CUDNN_DATA_INT32', 'int32'),
# Also in v6, but restrictions make this fail
# CUDNN_DATA_INT8x4
ctype='cudnnDataType_t')
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'), cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'), ('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'), ('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
...@@ -115,6 +125,16 @@ class CuDNNV6(CuDNNV51): ...@@ -115,6 +125,16 @@ class CuDNNV6(CuDNNV51):
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'), ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
ctype='cudnnConvolutionBwdFilterAlgo_t') ctype='cudnnConvolutionBwdFilterAlgo_t')
cudnnReduceTensorOp_t = CEnumType(('CUDNN_REDUCE_TENSOR_ADD', 'add'),
('CUDNN_REDUCE_TENSOR_MUL', 'mul'),
('CUDNN_REDUCE_TENSOR_MIN', 'minimum'),
('CUDNN_REDUCE_TENSOR_MAX', 'maximum'),
('CUDNN_REDUCE_TENSOR_AMAX', 'absmax'),
('CUDNN_REDUCE_TENSOR_AVG', 'avg'),
('CUDNN_REDUCE_TENSOR_NORM1', 'norm1'),
('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
ctype='cudnnReduceTensorOp_t')
def get_definitions(cudnn_version=None): def get_definitions(cudnn_version=None):
""" """
......
...@@ -6,10 +6,12 @@ import warnings ...@@ -6,10 +6,12 @@ import warnings
import numpy as np import numpy as np
from six import integer_types from six import integer_types
from six.moves import reduce
import theano import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log, get_scalar_type, int32 as int_t, bool as bool_t from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
int32 as int_t, bool as bool_t, uint32 as uint32_t)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
...@@ -34,7 +36,7 @@ from .type import (get_context, gpu_context_type, list_contexts, ...@@ -34,7 +36,7 @@ from .type import (get_context, gpu_context_type, list_contexts,
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty, gpu_contiguous, GpuAllocEmpty,
empty_like, GpuArrayType, HostFromGpu) empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise, GpuCAReduceCuda
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
...@@ -433,7 +435,8 @@ class GpuDnnConvDesc(COp): ...@@ -433,7 +435,8 @@ class GpuDnnConvDesc(COp):
node = Apply(self, [kern_shape], node = Apply(self, [kern_shape],
[CDataType("cudnnConvolutionDescriptor_t", [CDataType("cudnnConvolutionDescriptor_t",
freefunc="cudnnDestroyConvolutionDescriptor")()]) freefunc="cudnnDestroyConvolutionDescriptor",
version=version(raises=False))()])
# DebugMode cannot compare the values of CDataType variables, so by # DebugMode cannot compare the values of CDataType variables, so by
# default it returns False all the time. To prevent DebugMode from # default it returns False all the time. To prevent DebugMode from
# complaining because of the MergeOptimizer, we make this variable # complaining because of the MergeOptimizer, we make this variable
...@@ -1216,7 +1219,8 @@ class GpuDnnPoolDesc(Op): ...@@ -1216,7 +1219,8 @@ class GpuDnnPoolDesc(Op):
def make_node(self): def make_node(self):
node = Apply(self, [], node = Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t", [CDataType("cudnnPoolingDescriptor_t",
freefunc="cudnnDestroyPoolingDescriptor")()]) freefunc="cudnnDestroyPoolingDescriptor",
version=version(raises=False))()])
# DebugMode cannot compare the values of CDataType variables, so by # DebugMode cannot compare the values of CDataType variables, so by
# default it returns False all the time. To prevent DebugMode from # default it returns False all the time. To prevent DebugMode from
# complaining because of the MergeOptimizer, we make this variable # complaining because of the MergeOptimizer, we make this variable
...@@ -1557,6 +1561,75 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1557,6 +1561,75 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
class GpuDnnReduction(DnnBase):
    """cuDNN-based tensor reduction (cudnnReduceTensor, cuDNN v6+).

    Reduces the input over the axes given at construction time and,
    optionally, also returns the flattened indices of the selected
    elements for 'maximum'/'minimum' reductions.
    """
    check_input = False
    _f16_ok = True
    # The C implementation always declares two outputs (values, indices),
    # even when indices are not requested.
    _cop_num_outputs = 2

    __props__ = ('red_op', 'axis', 'acc_dtype', 'dtype', 'return_indices')

    params_type = ParamsType(red_op=cudnn.cudnnReduceTensorOp_t,
                             acc_dtype=cudnn.cudnnDataType_t,
                             c_axis=uint32_t,
                             handle=handle_type)

    def __init__(self, red_op, axis, acc_dtype, dtype, return_indices):
        """
        Parameters
        ----------
        red_op : str
            One of the cudnnReduceTensorOp_t aliases ('add', 'mul',
            'minimum', 'maximum', 'absmax', 'avg', 'norm1', 'norm2').
        axis : iterable of int or None
            Axes to reduce over; None means reduce over all axes.
        acc_dtype : str
            Accumulator dtype ('float16', 'float32' or 'float64').
        dtype : str
            Output dtype ('float16', 'float32' or 'float64').
        return_indices : bool
            If True, also return the flattened indices of the reduced
            elements; only valid for 'maximum' and 'minimum'.
        """
        DnnBase.__init__(self, ['dnn_redux.c'], 'APPLY_SPECIFIC(dnn_redux)')
        assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op)
        self.red_op = red_op
        assert acc_dtype in ['float16', 'float32', 'float64']
        self.acc_dtype = acc_dtype
        assert dtype in ['float16', 'float32', 'float64']
        self.dtype = dtype
        # 8 is the current dimension limit for cudnn
        if axis is not None:
            if len(axis) > 8:
                raise ValueError('Too many axes to reduce on')
            # Valid axis indices are 0..7; the old message ("larger than
            # 8") contradicted this `>= 8` test.
            if any(a >= 8 for a in axis):
                raise ValueError('Axes >= 8 not supported')
            axis = tuple(axis)
        # c_axis is a bitfield (bit i set == reduce axis i)
        self.c_axis = self._convert_axis(axis)
        # axis is the tuple of axes to reduce on (or None for all)
        self.axis = axis
        # BUGFIX: the enum aliases are 'maximum'/'minimum' (see
        # cudnnReduceTensorOp_t); 'max'/'min' can never pass the
        # has_alias assert above, so comparing against them made
        # return_indices=True raise unconditionally.
        if return_indices and red_op not in ('maximum', 'minimum'):
            raise ValueError("Can't request indices for something other than min or max")
        self.return_indices = return_indices

    def _convert_axis(self, axis):
        """Encode *axis* as a uint32 bitfield (bit i set -> reduce axis i)."""
        if axis is None:
            # All bits set: reduce over every axis.
            return np.uint32(-1)
        else:
            return reduce(lambda a, b: a | b, map(lambda a: 1 << a, axis), 0)

    def make_node(self, inp):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        if inp.ndim > 8:
            raise ValueError("cuDNN reduction doesn't support nd > 8")
        assert inp.dtype in ['float16', 'float32', 'float64']

        # These restrictions were guessed from vague clues since
        # there is no actual documentation on this
        if inp.dtype == 'float64':
            assert self.acc_dtype == 'float64'
        if inp.dtype == 'float32':
            assert self.acc_dtype == 'float32'
        if inp.dtype == 'float16':
            assert self.acc_dtype != 'float64'

        # The output keeps only the non-reduced dimensions, in order.
        bcast = []
        for i in range(inp.ndim):
            if not (self.c_axis & (1 << i)):
                bcast.append(inp.broadcastable[i])
        outs = [inp.type.clone(dtype=self.dtype, broadcastable=bcast)()]
        if self.return_indices:
            # Indices are uint32 (the C code uses CUDNN_32BIT_INDICES).
            outs.append(GpuArrayType(dtype='uint32', broadcastable=bcast,
                                     context_name=ctx_name)())
        return Apply(self, [inp], outs)
class GpuDnnBatchNorm(DnnBase): class GpuDnnBatchNorm(DnnBase):
""" """
Base Op for cuDNN Batch Normalization. Base Op for cuDNN Batch Normalization.
...@@ -1812,7 +1885,8 @@ class GpuDnnBatchNormGrad(DnnBase): ...@@ -1812,7 +1885,8 @@ class GpuDnnBatchNormGrad(DnnBase):
gpudata_type = CDataType('gpudata *', 'gpudata_release') gpudata_type = CDataType('gpudata *', 'gpudata_release')
dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t', dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t',
'cudnnDestroyDropoutDescriptor') 'cudnnDestroyDropoutDescriptor',
version=version(raises=False))
class GpuDnnDropoutOp(DnnBase): class GpuDnnDropoutOp(DnnBase):
...@@ -1881,7 +1955,8 @@ def dropout(x, dropout=0.0, seed=4242): ...@@ -1881,7 +1955,8 @@ def dropout(x, dropout=0.0, seed=4242):
return y, desc, odesc, states return y, desc, odesc, states
rnndesc_type = CDataType('cudnnRNNDescriptor_t', rnndesc_type = CDataType('cudnnRNNDescriptor_t',
'cudnnDestroyRNNDescriptor') 'cudnnDestroyRNNDescriptor',
version=version(raises=False))
def as_i32(v): def as_i32(v):
...@@ -2985,6 +3060,76 @@ def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs): ...@@ -2985,6 +3060,76 @@ def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
return [out.dimshuffle(0, 1)] return [out.dimshuffle(0, 1)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    # Lift SoftmaxGrad onto cuDNN.  GpuDnnSoftmaxGrad works on 4d
    # tensors, so the 2d inputs are padded with two broadcastable
    # dimensions; anything that is not 2d is left alone.
    if not dnn_available(ctx_name):
        return
    padded = []
    for var in inputs:
        gvar = as_gpuarray_variable(var, ctx_name)
        if gvar.ndim != 2:
            return
        padded.append(gvar.dimshuffle(0, 'x', 1, 'x'))
    res = GpuDnnSoftmaxGrad('accurate', 'instance')(
        gpu_contiguous(padded[0]), gpu_contiguous(padded[1]))
    return [res.dimshuffle(0, 2)]
@register_opt('cudnn')
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(node):
    """Replace a GpuCAReduceCuda with a GpuDnnReduction when cuDNN v6+
    supports the requested reduction; otherwise do nothing."""
    if not isinstance(node.op, GpuCAReduceCuda):
        return

    inp = node.inputs[0]

    if not dnn_available(inp.type.context_name):
        return

    # cudnnReduceTensor only exists from cuDNN v6.
    if version(raises=False) < 6000:
        return

    if inp.ndim > 8:
        return

    out_dtype = node.outputs[0].dtype
    acc_dtype = node.op.acc_dtype

    if inp.dtype != out_dtype:
        # We can mix float16 and float32, but not float64.
        if 'float64' in (inp.dtype, out_dtype):
            return
        if acc_dtype != 'float32':
            return

    if inp.dtype not in ('float16', 'float32', 'float64'):
        return

    # Accumulator dtype constraints per input dtype (mirrors the
    # asserts in GpuDnnReduction.make_node).
    if inp.dtype == 'float64' and acc_dtype != 'float64':
        return
    if inp.dtype == 'float32' and acc_dtype != 'float32':
        return
    if inp.dtype == 'float16' and acc_dtype == 'float64':
        return

    if node.op.pre_scalar_op is not None:
        # Might want to handle absmax, avg, norm1, norm2 here
        return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
        return

    return (GpuDnnReduction(node.op.scalar_op.name, node.op.axis,
                            acc_dtype, node.op.dtype, False)(inp),)
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
...@@ -3004,24 +3149,6 @@ class NoCuDNNRaise(Optimizer): ...@@ -3004,24 +3149,6 @@ class NoCuDNNRaise(Optimizer):
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    # Lift SoftmaxGrad (2d inputs only) to GpuDnnSoftmaxGrad, which
    # expects 4d tensors: insert two broadcastable dims around axis 1,
    # run the cuDNN op, then drop them again.
    if not dnn_available(ctx_name):
        return
    ins = []
    for n in inputs:
        n = as_gpuarray_variable(n, ctx_name)
        if n.ndim != 2:
            # Only the 2d case is handled; bail out otherwise.
            return
        ins.append(n.dimshuffle(0, 'x', 1, 'x'))
    out = GpuDnnSoftmaxGrad('accurate', 'instance')(
        gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
    return [out.dimshuffle(0, 2)]
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs): def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5] x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None running_mean = inputs[5] if len(inputs) > 5 else None
......
...@@ -94,6 +94,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -94,6 +94,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
return 1;
} }
{ {
......
#section support_code_struct

/* cuDNN descriptors owned by this Op instance.  They are created in
   init_code_struct and destroyed in cleanup_code_struct. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red);

#section init_code_struct

cudnnStatus_t APPLY_SPECIFIC(err);
/* NULL the descriptors first so that cleanup_code_struct is safe to run
   even if one of the creations below fails part-way. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(red) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
/* NOTE(review): the two message pieces below concatenate without a space,
   producing "descriptor(red)" in the error text. */
if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate reduction descriptor"
               "(red): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

#section cleanup_code_struct

if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(red) != NULL) { cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red)); }
#section support_code_struct

/* Run the cuDNN reduction described by `params` on `input`, writing the
 * reduced values into *output and, when `indices` is non-NULL, the
 * flattened indices of the selected elements into *indices.
 * Returns 0 on success, 1 on error (with a Python exception set).
 * `params->c_axis` is a bitfield: bit i set means axis i is reduced. */
int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
                              PyGpuArrayObject **output,
                              PyGpuArrayObject **indices,
                              PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  gpudata *workspace = NULL;
  size_t worksize = 0;
  size_t indsize = 0;
  size_t *tdims;
  ssize_t *tstrs;
  size_t dims[8];   /* cuDNN reductions are limited to 8 dimensions */
  ssize_t strs[8];
  size_t rsz;       /* total number of elements in the reduced area */
  void *alpha;
  void *beta;
  cudnnStatus_t err;
  unsigned int p;   /* number of non-reduced (kept) dimensions */
  int e;
  static float falpha = 1.0f;
  static double dalpha = 1.0;
  static float fbeta = 0.0f;
  static double dbeta = 0.0;

  if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
    return 1;

  /* Collect the kept dimensions (the output shape) and the size of the
     reduced area. */
  p = 0;
  rsz = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (!(params->c_axis & (1U << i))) {
      dims[p] = PyGpuArray_DIM(input, i);
      p++;
    } else {
      rsz *= PyGpuArray_DIM(input, i);
    }
  }

  /* `indices` is NULL when the caller did not request indices. */
  if (indices != NULL) {
    if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
      return 1;
    indsize = PyGpuArray_SIZE(*indices);
  }

  /* Degenerate cases: nothing is actually reduced (no axis selected, or
     the reduced area has size 1).  A copy + reshape is enough. */
  if (p == input->ga.nd || rsz == 1) {
    int err;
    Py_XDECREF(*output);
    *output = pygpu_copy(input, GA_C_ORDER);
    if (*output == NULL)
      return 1;
    err = GpuArray_reshape_inplace(&(*output)->ga, p, dims, GA_C_ORDER);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err));
      return 1;
    }
    if (indices != NULL) {
      // All indices will be 0 since the size of the reduced area is 1.
      err = GpuArray_memset(&(*indices)->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "GpuArray_memset: %s", GpuArray_error(&(*indices)->ga, err));
        return 1;
      }
    }
    // This is a shortcut path.
    return 0;
  }

  if (theano_prep_output(output, p, dims, input->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  // cuDNN expects that the output has the same number of dimensions as
  // the input, but the dimensions to reduce are of size 1 in the output.
  // We have to do some trickery to be able to pass it what it needs.
  /* Reduced axes get dim 1 / stride 0; kept axes get the real dim and the
     corresponding stride of the freshly prepared output.
     NOTE(review): strides here are the raw values stored by libgpuarray
     (presumably bytes); c_set_tensorNd is assumed to convert to whatever
     cuDNN expects -- confirm against its definition. */
  p = 0;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (params->c_axis & (1U << i)) {
      dims[i] = 1;
      strs[i] = 0;
    } else {
      dims[i] = PyGpuArray_DIM(input, i);
      strs[i] = PyGpuArray_STRIDE(*output, p);
      p++;
    }
  }

  // Perform horrible surgery to be able to reuse c_set_tensorNd():
  // temporarily make the output array look like it has the input's rank,
  // with our synthetic dims/strides.
  tdims = (*output)->ga.dimensions;
  tstrs = (*output)->ga.strides;
  (*output)->ga.dimensions = dims;
  (*output)->ga.strides = strs;
  (*output)->ga.nd = input->ga.nd;

  // Delay error checking to avoid exposing a broken object
  e = c_set_tensorNd(*output, APPLY_SPECIFIC(output));

  // Undo our horrible surgery (p is back to the number of kept dims here).
  (*output)->ga.nd = p;
  (*output)->ga.dimensions = tdims;
  (*output)->ga.strides = tstrs;

  if (e != 0)
    return 1;

  // Back to normal, no more horrible things
  // Note that only CUDNN_32BIT_INDICES is implemented
  err = cudnnSetReduceTensorDescriptor(
    APPLY_SPECIFIC(red), params->red_op,
    params->acc_dtype, CUDNN_PROPAGATE_NAN,
    indices == NULL ? CUDNN_REDUCE_TENSOR_NO_INDICES : CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
    CUDNN_32BIT_INDICES);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set reduce descriptor: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  /* cuDNN takes alpha/beta in float for float/half tensors and in double
     for double tensors. */
  switch (input->ga.typecode) {
  case GA_FLOAT:
  case GA_HALF:
    alpha = &falpha;
    beta = &fbeta;
    break;
  case GA_DOUBLE:
    alpha = &dalpha;
    beta = &dbeta;
    break;
  default:
    PyErr_SetString(PyExc_RuntimeError, "Unsupported dtype in dnn reduce");
    return 1;
  }

  err = cudnnGetReductionWorkspaceSize(params->handle,
                                       APPLY_SPECIFIC(red),
                                       APPLY_SPECIFIC(input),
                                       APPLY_SPECIFIC(output),
                                       &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not get reduce workspace size: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, &e);
    if (workspace == NULL) {
      PyErr_Format(PyExc_RuntimeError, "gpudata_alloc: %s",
                   gpucontext_error(c->ctx, e));
      return 1;
    }
  }

  /* NOTE(review): `*((void **)workspace)` reads the device pointer out of
     the opaque gpudata struct -- this relies on the libgpuarray CUDA
     backend storing it as the first field; confirm against libgpuarray. */
  err = cudnnReduceTensor(params->handle, APPLY_SPECIFIC(red),
                          indices ? PyGpuArray_DEV_DATA(*indices) : NULL, indsize,
                          worksize ? *((void **)workspace) : NULL, worksize,
                          alpha,
                          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
                          beta,
                          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

  /* Release the workspace before checking the status so it is not leaked
     on the error path. */
  if (workspace != NULL)
    gpudata_release(workspace);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not run reduction: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
...@@ -1119,10 +1119,20 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1119,10 +1119,20 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
else: else:
return False return False
x, = inputs x, = inputs
idtype = x.dtype
adtype = getattr(op, 'acc_dtype', None)
odtype = getattr(op, 'dtype', outputs[0].dtype)
# Force accumulator to float32 for float32 inputs since tree
# reduction will not loose as much precision as linear
# accumulation and float64 is much slower on GPU.
if idtype == 'float32' and odtype == 'float32':
adtype = 'float32'
greduce = op2( greduce = op2(
op.scalar_op, axis=op.axis, op.scalar_op, axis=op.axis,
dtype=getattr(op, 'dtype', outputs[0].dtype), dtype=odtype,
acc_dtype=getattr(op, 'acc_dtype', None)) acc_dtype=adtype)
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
......
...@@ -1485,6 +1485,27 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -1485,6 +1485,27 @@ class test_SoftMax(test_nnet.test_SoftMax):
utt.assert_allclose(f(inp), f_ref(inp)) utt.assert_allclose(f(inp), f_ref(inp))
def dnn_reduction(nd, idtype, acc_dtype, odtype):
    # Compile a sum over an nd-dimensional tensor with the given dtypes
    # and check that the optimizer introduced a GpuDnnReduction node.
    x = T.TensorType(idtype, (False,) * nd)()
    total = x.sum(acc_dtype=acc_dtype, dtype=odtype)
    fn = theano.function([x], total, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.apply_nodes
    assert any(isinstance(a.op, dnn.GpuDnnReduction) for a in nodes)
def test_dnn_reduction_opt():
    # Generator test: each yielded tuple is run as one sub-test.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    # Same-precision float32 sums for every supported rank (1..8).
    for nd in range(1, 9):
        yield dnn_reduction, nd, 'float32', 'float32', 'float32'
    # A few mixed-precision combinations on 2d inputs.
    mixed_cases = (('float64', 'float64', 'float64'),
                   ('float16', 'float32', 'float16'),
                   ('float16', 'float32', 'float32'))
    for idtype, adtype, odtype in mixed_cases:
        yield dnn_reduction, 2, idtype, adtype, odtype
def test_dnn_batchnorm_train(): def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
......
...@@ -16,8 +16,10 @@ from .config import mode_with_gpu, mode_without_gpu, test_ctx_name ...@@ -16,8 +16,10 @@ from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from .test_basic_ops import rand_gpuarray from .test_basic_ops import rand_gpuarray
from ..elemwise import (GpuElemwise, GpuDimShuffle, from ..elemwise import (GpuElemwise, GpuDimShuffle,
GpuCAReduceCuda, GpuCAReduceCPY, GpuErfinv, GpuErfcinv) GpuCAReduceCuda, GpuCAReduceCPY, GpuErfinv, GpuErfcinv)
from ..dnn import GpuDnnReduction
from ..type import GpuArrayType, get_context, gpuarray_shared_constructor from ..type import GpuArrayType, get_context, gpuarray_shared_constructor
from pygpu import ndgpuarray as gpuarray from pygpu import ndgpuarray as gpuarray
...@@ -346,7 +348,9 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -346,7 +348,9 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
class T_gpureduce_dtype(test_elemwise.T_reduce_dtype): class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce') mode = mode_with_gpu.excluding('local_cut_useless_reduce')
op = GpuCAReduceCuda
# GpuDnnReduction doesn't cover all cases, but should cover some
op = (GpuCAReduceCuda, GpuDnnReduction)
# Currently we don't support reduction on 0 axis # Currently we don't support reduction on 0 axis
axes = [None, 0, 1, 1, [0], [1], [0, 1]] axes = [None, 0, 1, 1, [0], [1], [0, 1]]
# We don't support complex dtype # We don't support complex dtype
......
...@@ -17,6 +17,7 @@ from ..basic_ops import ( ...@@ -17,6 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import ( from ..elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise) GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise)
from ..dnn import GpuDnnReduction
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
...@@ -130,9 +131,13 @@ def test_reduce(): ...@@ -130,9 +131,13 @@ def test_reduce():
ops = [type(node.op) for node in topo] ops = [type(node.op) for node in topo]
if kind == b'opencl' and method in ["max", "min"]: if kind == b'opencl' and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops) assert not(GpuCAReduceCuda in ops or
GpuCAReduceCPY in ops or
GpuDnnReduction in ops)
else: else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops assert (GpuCAReduceCuda in ops or
GpuCAReduceCPY in ops or
GpuDnnReduction in ops)
def test_local_gpualloc_memset_0(): def test_local_gpualloc_memset_0():
......
File mode changed from 100755 to 100644
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论