Commit ae5e5a03 authored by Gijs van Tulder

Batch norm optimizations for gpuarray.

Parent 9ad04124
theano/gpuarray/dnn.py
@@ -28,12 +28,13 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
                                               assert_conv_shape)
 from theano.tensor.signal.pool import (
     Pool, MaxPoolGrad, AveragePoolGrad)
+from theano.tensor.nnet import bn
 from . import pygpu
 from .type import (get_context, gpu_context_type, list_contexts,
                    GpuArraySharedVariable)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                         gpu_contiguous, gpu_alloc_empty,
-                        empty_like, GpuArrayType)
+                        empty_like, GpuArrayType, HostFromGpu)
 from .elemwise import GpuElemwise
 # These don't exist in gpuarray
@@ -2928,3 +2929,182 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    out = GpuDnnSoftmaxGrad('accurate', 'instance')(
        gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
    return [out.dimshuffle(0, 2)]

@local_optimizer([bn.AbstractBatchNormTrain])
def local_abstract_batch_norm_train_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrain):
return None
x, scale, bias, epsilon = node.inputs
if x.ndim > 5:
# TODO do something better than this (reshape?)
return None
# input on gpu? TODO what about the output?
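    # (if x sits behind a HostFromGpu transfer, its data already lives
    # on the GPU, so doing batch norm there avoids a device-to-host copy)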
x_on_gpu = (isinstance(x.type, GpuArrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
if not x_on_gpu:
return None
# convert axes to cuDNN mode
axes = tuple(node.op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
try:
eps = theano.tensor.get_scalar_constant_value(epsilon)
except theano.tensor.NotScalarConstantError:
return None
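    # cuDNN requires epsilon to be at least CUDNN_BN_MIN_EPSILON (1e-5)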
if eps < 1e-5:
return None
ctx = infer_context_name(*node.inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
out, mean, invstd = dnn_batch_normalization_train(x, scale, bias, mode, eps)
# If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, tensor.TensorType):
out = tensor.as_tensor_variable(out)
if isinstance(node.outputs[1].type, tensor.TensorType):
mean = tensor.as_tensor_variable(mean)
if isinstance(node.outputs[2].type, tensor.TensorType):
invstd = tensor.as_tensor_variable(invstd)
# TODO copy_stack_trace?
return [out, mean, invstd]
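
# Editorial sketch, not part of this commit: a minimal graph that would
# exercise the rewrite above.  It assumes a cuDNN-capable gpuarray context
# (e.g. THEANO_FLAGS=device=cuda) and the bn.batch_normalization_train
# interface; epsilon must stay >= 1e-5 or the rewrite declines to apply.
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

x = T.matrix('x')                       # (batch, features)
scale, bias = T.row('scale'), T.row('bias')
# axes=(0,) corresponds to the 'per-activation' cuDNN mode mapped above
out, mean, invstd = bn.batch_normalization_train(x, scale, bias,
                                                 axes=(0,), epsilon=1e-5)
f = theano.function([x, scale, bias], [out, mean, invstd])
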
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
return None
x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
if x.ndim > 5:
# TODO do something better than this (reshape?)
return None
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
dy_on_gpu = (isinstance(dy.type, GpuArrayType) or
(dy.owner and isinstance(dy.owner.op, HostFromGpu)))
if not (x_on_gpu or dy_on_gpu):
return None
# convert axes to cuDNN mode
axes = tuple(node.op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
ndim = x.ndim
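    # cuDNN batch normalization expects 4d (or 5d) tensors; pad smaller
    # inputs up to 4d here and flatten the gradients back below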
if ndim < 4:
x = theano.tensor.shape_padright(x, 4 - ndim)
dy = theano.tensor.shape_padright(dy, 4 - ndim)
scale = theano.tensor.shape_padright(scale, 4 - ndim)
x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
try:
eps = theano.tensor.get_scalar_constant_value(epsilon)
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
ctx = infer_context_name(*node.inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
x = as_gpuarray_variable(x, context_name=ctx)
dy = as_gpuarray_variable(dy, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
x_mean = as_gpuarray_variable(x_mean, context_name=ctx)
x_invstd = as_gpuarray_variable(x_invstd, context_name=ctx)
g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
GpuDnnBatchNormGrad(mode)(x, dy, scale, x_mean, x_invstd, eps)
if ndim < 4:
g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
# If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, tensor.TensorType):
g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
if isinstance(node.outputs[1].type, tensor.TensorType):
g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
if isinstance(node.outputs[2].type, tensor.TensorType):
g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
# TODO copy_stack_trace?
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
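
# Editorial sketch, not part of this commit: differentiating the training
# output inserts AbstractBatchNormTrainGrad, which the rewrite above maps
# onto GpuDnnBatchNormGrad (same assumptions as the previous sketch).
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

x = T.matrix('x')
scale, bias = T.row('scale'), T.row('bias')
out, mean, invstd = bn.batch_normalization_train(x, scale, bias,
                                                 axes=(0,), epsilon=1e-5)
grads = theano.grad(out.sum(), [x, scale, bias])
f_grad = theano.function([x, scale, bias], grads)
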
@local_optimizer([bn.AbstractBatchNormInference])
def local_abstract_batch_norm_inference_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormInference):
return None
x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
if x.ndim > 5:
# TODO do something better than this (reshape?)
return None
axes = tuple(node.op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
if not x_on_gpu:
return None
try:
eps = theano.tensor.get_scalar_constant_value(epsilon)
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
ctx = infer_context_name(*node.inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
estimated_mean = as_gpuarray_variable(estimated_mean, context_name=ctx)
estimated_variance = as_gpuarray_variable(estimated_variance, context_name=ctx)
out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance,
mode, eps)
# If the original output was on CPU, we have to transfer it
# TODO copy_stack_trace?
if isinstance(node.outputs[0].type, tensor.TensorType):
return [tensor.as_tensor_variable(out)]
else:
return [out]
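
# Editorial sketch, not part of this commit: bn.batch_normalization_test
# builds the AbstractBatchNormInference node that the rewrite above
# replaces with dnn_batch_normalization_test (same assumptions as the
# previous sketches).
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

x = T.matrix('x')
scale, bias = T.row('scale'), T.row('bias')
est_mean, est_var = T.row('mean'), T.row('var')
out = bn.batch_normalization_test(x, scale, bias, est_mean, est_var,
                                  axes=(0,), epsilon=1e-5)
f = theano.function([x, scale, bias, est_mean, est_var], out)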

theano/gpuarray/opt.py
@@ -2005,3 +2005,28 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
                               local_abstractconv3d_gradinputs_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')

# Register cuDNN batch normalization implementation
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
# The cuDNN optimizers are registered unconditionally; each one checks
# dnn_available() at graph-optimization time and bails out when cuDNN
# cannot be used.  (We import these opts here instead of at the top of
# this file to avoid a circular dependency problem with dnn.)
from .dnn import (local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn,
                  local_abstract_batch_norm_inference_cudnn)  # noqa: E402
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
local_abstract_batch_norm_train_cudnn, 20,
'batchnorm_dnn',
'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
local_abstract_batch_norm_train_grad_cudnn, 20,
'batchnorm_dnn',
'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
local_abstract_batch_norm_inference_cudnn, 20,
'batchnorm_dnn',
'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
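
# Editorial sketch, not part of this commit: the 'batchnorm_dnn' tag used
# in the registrations above lets users disable just these cuDNN rewrites
# while keeping the rest of the gpuarray optimizations.
import theano

mode = theano.compile.get_default_mode().excluding('batchnorm_dnn')
# pass mode=mode to theano.function, or set the Theano flag
# optimizer_excluding=batchnorm_dnn for the same effect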