提交 86842018 authored 作者: Frederic Bastien's avatar Frederic Bastien

Use GraphToGPU for the new BN opt. Also remove the LocalGroupDB, as it adds extra…

Use GraphToGPU for the new BN opt. Also remove the LocalGroupDB, as it adds extra overhead and wasn't useful as there is only 1 GPU version.
上级 8c3b228f
...@@ -3081,23 +3081,16 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs): ...@@ -3081,23 +3081,16 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
return [out.dimshuffle(0, 2)] return [out.dimshuffle(0, 2)]
@local_optimizer([bn.AbstractBatchNormTrain]) @register_opt('cudnn', 'fast_compile')
def local_abstract_batch_norm_train_cudnn(node): @op_lifter([bn.AbstractBatchNormTrain])
if not isinstance(node.op, bn.AbstractBatchNormTrain): @register_opt2([bn.AbstractBatchNormTrain], 'cudnn', 'fast_compile')
return None def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5]
x, scale, bias, epsilon, running_average_factor = node.inputs[:5] running_mean = inputs[5] if len(inputs) > 5 else None
running_mean = node.inputs[5] if len(node.inputs) > 5 else None running_var = inputs[6] if len(inputs) > 6 else None
running_var = node.inputs[6] if len(node.inputs) > 6 else None
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
if not x_on_gpu:
return None
# convert axes to cuDNN mode # convert axes to cuDNN mode
axes = tuple(node.op.axes) axes = tuple(op.axes)
if axes == (0,): if axes == (0,):
mode = 'per-activation' mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)): elif axes == (0,) + tuple(range(2, x.ndim)):
...@@ -3116,7 +3109,7 @@ def local_abstract_batch_norm_train_cudnn(node): ...@@ -3116,7 +3109,7 @@ def local_abstract_batch_norm_train_cudnn(node):
except theano.tensor.NotScalarConstantError: except theano.tensor.NotScalarConstantError:
return None return None
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*inputs)
if not dnn_available(ctx): if not dnn_available(ctx):
# TODO should this raise_no_cudnn? # TODO should this raise_no_cudnn?
return None return None
...@@ -3131,11 +3124,6 @@ def local_abstract_batch_norm_train_cudnn(node): ...@@ -3131,11 +3124,6 @@ def local_abstract_batch_norm_train_cudnn(node):
results = list(dnn_batch_normalization_train(*inputs)) results = list(dnn_batch_normalization_train(*inputs))
# If the original output was on CPU, we have to transfer it
for i in range(len(node.outputs)):
if isinstance(node.outputs[i].type, tensor.TensorType):
results[i] = tensor.as_tensor_variable(results[i])
# TODO copy_stack_trace?
return results return results
...@@ -3179,12 +3167,11 @@ def local_batch_norm_inference_inplace(node): ...@@ -3179,12 +3167,11 @@ def local_batch_norm_inference_inplace(node):
return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)] return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
@local_optimizer([bn.AbstractBatchNormTrainGrad]) @register_opt('cudnn', 'fast_compile')
def local_abstract_batch_norm_train_grad_cudnn(node): @op_lifter([bn.AbstractBatchNormTrainGrad])
if not isinstance(node.op, bn.AbstractBatchNormTrainGrad): @register_opt2([bn.AbstractBatchNormTrainGrad], 'cudnn', 'fast_compile')
return None def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
x, dy, scale, x_mean, x_invstd, epsilon = inputs
x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
# input on gpu? TODO what about the output? # input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or x_on_gpu = (isinstance(x.type, GpuArrayType) or
...@@ -3195,7 +3182,7 @@ def local_abstract_batch_norm_train_grad_cudnn(node): ...@@ -3195,7 +3182,7 @@ def local_abstract_batch_norm_train_grad_cudnn(node):
return None return None
# convert axes to cuDNN mode # convert axes to cuDNN mode
axes = tuple(node.op.axes) axes = tuple(op.axes)
if axes == (0,): if axes == (0,):
mode = 'per-activation' mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)): elif axes == (0,) + tuple(range(2, x.ndim)):
...@@ -3226,7 +3213,7 @@ def local_abstract_batch_norm_train_grad_cudnn(node): ...@@ -3226,7 +3213,7 @@ def local_abstract_batch_norm_train_grad_cudnn(node):
if eps < 1e-5: if eps < 1e-5:
return None return None
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*inputs)
if not dnn_available(ctx): if not dnn_available(ctx):
# TODO should this raise_no_cudnn? # TODO should this raise_no_cudnn?
return None return None
...@@ -3248,25 +3235,16 @@ def local_abstract_batch_norm_train_grad_cudnn(node): ...@@ -3248,25 +3235,16 @@ def local_abstract_batch_norm_train_grad_cudnn(node):
g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape) g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape) g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
# If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, tensor.TensorType):
g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
if isinstance(node.outputs[1].type, tensor.TensorType):
g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
if isinstance(node.outputs[2].type, tensor.TensorType):
g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
# TODO copy_stack_trace?
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias] return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@local_optimizer([bn.AbstractBatchNormInference]) @register_opt('cudnn', 'fast_compile')
def local_abstract_batch_norm_inference_cudnn(node): @op_lifter([bn.AbstractBatchNormInference])
if not isinstance(node.op, bn.AbstractBatchNormInference): @register_opt2([bn.AbstractBatchNormInference], 'cudnn', 'fast_compile')
return None def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
axes = tuple(node.op.axes) axes = tuple(op.axes)
if axes == (0,): if axes == (0,):
mode = 'per-activation' mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)): elif axes == (0,) + tuple(range(2, x.ndim)):
...@@ -3274,12 +3252,6 @@ def local_abstract_batch_norm_inference_cudnn(node): ...@@ -3274,12 +3252,6 @@ def local_abstract_batch_norm_inference_cudnn(node):
else: else:
return None return None
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
if not x_on_gpu:
return None
try: try:
eps = theano.tensor.get_scalar_constant_value(epsilon) eps = theano.tensor.get_scalar_constant_value(epsilon)
except theano.tensor.NotScalarConstantError: except theano.tensor.NotScalarConstantError:
...@@ -3287,7 +3259,7 @@ def local_abstract_batch_norm_inference_cudnn(node): ...@@ -3287,7 +3259,7 @@ def local_abstract_batch_norm_inference_cudnn(node):
if eps < 1e-5: if eps < 1e-5:
return None return None
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*inputs)
if not dnn_available(ctx): if not dnn_available(ctx):
# TODO should this raise_no_cudnn? # TODO should this raise_no_cudnn?
return None return None
...@@ -3300,9 +3272,4 @@ def local_abstract_batch_norm_inference_cudnn(node): ...@@ -3300,9 +3272,4 @@ def local_abstract_batch_norm_inference_cudnn(node):
out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance, out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance,
mode, eps) mode, eps)
# If the original output was on CPU, we have to transfer it
# TODO copy_stack_trace?
if isinstance(node.outputs[0].type, tensor.TensorType):
return [tensor.as_tensor_variable(out)]
else:
return [out] return [out]
...@@ -2005,28 +2005,3 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs', ...@@ -2005,28 +2005,3 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
local_abstractconv3d_gradinputs_gemm, 30, local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm', 'conv_gemm',
'gpuarray', 'fast_compile', 'fast_run') 'gpuarray', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn,
local_abstract_batch_norm_inference_cudnn) # noqa: 402
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
local_abstract_batch_norm_train_cudnn, 20,
'batchnorm_dnn',
'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
local_abstract_batch_norm_train_grad_cudnn, 20,
'batchnorm_dnn',
'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
local_abstract_batch_norm_inference_cudnn, 20,
'batchnorm_dnn',
'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论