提交 ccd25be8 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Convert the optimizations to work with gpuarray.

上级 edabc511
...@@ -16,7 +16,6 @@ from theano.tensor.signal.downsample import ( ...@@ -16,7 +16,6 @@ from theano.tensor.signal.downsample import (
from . import pygpu, init_dev from . import pygpu, init_dev
from .basic_ops import (as_gpuarray_variable, from .basic_ops import (as_gpuarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like) GpuAllocEmpty, empty_like)
from .conv import GpuConv from .conv import GpuConv
...@@ -24,7 +23,7 @@ from .conv import GpuConv ...@@ -24,7 +23,7 @@ from .conv import GpuConv
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
from .comp import NVCC_compiler from .comp import NVCC_compiler
...@@ -1502,11 +1501,9 @@ def local_conv_dnn(node): ...@@ -1502,11 +1501,9 @@ def local_conv_dnn(node):
rval = dnn_conv(img, kern, rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample, border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint) direction_hint=direction_hint)
if node.outputs[0].broadcastable != rval.broadcastable:
rval = tensor.patternbroadcast(
rval, node.outputs[0].type.broadcastable)
return [rval] return [rval]
# This optimizer is registered in opt.py as part of the meta-optimizer. # This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses, # It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster. # because for some input/kernel shape configurations, this is faster.
...@@ -1540,6 +1537,11 @@ def local_conv_dnn_alternative(node): ...@@ -1540,6 +1537,11 @@ def local_conv_dnn_alternative(node):
rval, node.outputs[0].type.broadcastable) rval, node.outputs[0].type.broadcastable)
return [rval] return [rval]
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True) @local_optimizer([GpuDnnConv], inplace=True)
def local_dnn_conv_inplace(node): def local_dnn_conv_inplace(node):
if type(node.op) != GpuDnnConv or node.op.inplace: if type(node.op) != GpuDnnConv or node.op.inplace:
...@@ -1552,6 +1554,7 @@ def local_dnn_conv_inplace(node): ...@@ -1552,6 +1554,7 @@ def local_dnn_conv_inplace(node):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)] return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True) @local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node): def local_dnn_convgw_inplace(node):
if type(node.op) != GpuDnnConvGradW or node.op.inplace: if type(node.op) != GpuDnnConvGradW or node.op.inplace:
...@@ -1564,6 +1567,7 @@ def local_dnn_convgw_inplace(node): ...@@ -1564,6 +1567,7 @@ def local_dnn_convgw_inplace(node):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(inplace=True)(*inputs)] return [GpuDnnConvGradW(inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True) @local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node): def local_dnn_convgi_inplace(node):
if type(node.op) != GpuDnnConvGradI or node.op.inplace: if type(node.op) != GpuDnnConvGradI or node.op.inplace:
...@@ -1576,13 +1580,14 @@ def local_dnn_convgi_inplace(node): ...@@ -1576,13 +1580,14 @@ def local_dnn_convgi_inplace(node):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(inplace=True)(*inputs)] return [GpuDnnConvGradI(inplace=True)(*inputs)]
optdb.register('local_dnn_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace, local_dnn_convgw_inplace,
local_dnn_convgi_inplace, local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"), name="local_dnn_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpu', 'cudnn') 70.0, 'fast_run', 'inplace', 'gpu', 'cudnn')
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
...@@ -1590,6 +1595,7 @@ def local_dnn_conv_alpha_merge(node, *inputs): ...@@ -1590,6 +1595,7 @@ def local_dnn_conv_alpha_merge(node, *inputs):
return None return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
...@@ -1597,6 +1603,7 @@ def local_dnn_convw_alpha_merge(node, *inputs): ...@@ -1597,6 +1603,7 @@ def local_dnn_convw_alpha_merge(node, *inputs):
return None return None
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
...@@ -1604,42 +1611,33 @@ def local_dnn_convi_alpha_merge(node, *inputs): ...@@ -1604,42 +1611,33 @@ def local_dnn_convi_alpha_merge(node, *inputs):
return None return None
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax])
def local_pool_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMax):
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
return [dnn_pool(gpu_contiguous(img), ds, ds)]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([DownsampleFactorMax]) @op_lifter([DownsampleFactorMax])
def local_pool_dnn_alternative(node): def local_pool_dnn_alternative(node):
if not dnn_available(): if not dnn_available():
return return
if isinstance(node.op, DownsampleFactorMax):
if not node.op.ignore_border: if not node.op.ignore_border:
return return
img, = node.inputs img, = node.inputs
...@@ -1647,34 +1645,15 @@ def local_pool_dnn_alternative(node): ...@@ -1647,34 +1645,15 @@ def local_pool_dnn_alternative(node):
stride = node.op.st stride = node.op.st
pad = node.op.padding pad = node.op.padding
mode = node.op.mode mode = node.op.mode
if (img.owner and isinstance(img.owner.op, HostFromGpu)): return dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ds, stride=stride, pad=pad, mode=mode) ds, stride=stride, pad=pad, mode=mode)
return [host_from_gpu(ret)]
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMaxGrad])
def local_pool_dnn_grad(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMaxGrad):
if not node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")()
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([DownsampleFactorMaxGrad]) @op_lifter([DownsampleFactorMaxGrad])
def local_pool_dnn_grad_stride(node): def local_pool_dnn_grad_stride(node):
if not dnn_available(): if not dnn_available():
return return
if isinstance(node.op, DownsampleFactorMaxGrad):
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, inp_grad = node.inputs inp, out, inp_grad = node.inputs
...@@ -1683,16 +1662,12 @@ def local_pool_dnn_grad_stride(node): ...@@ -1683,16 +1662,12 @@ def local_pool_dnn_grad_stride(node):
pad = node.op.padding pad = node.op.padding
mode = node.op.mode mode = node.op.mode
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(out.owner and isinstance(out.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)() desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
ret = GpuDnnPoolGrad()(gpu_contiguous(inp), return GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
gpu_contiguous(inp_grad), gpu_contiguous(inp_grad),
desc) desc)
return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuSoftmax]) @local_optimizer([GpuSoftmax])
...@@ -1706,6 +1681,7 @@ def local_softmax_dnn(node): ...@@ -1706,6 +1681,7 @@ def local_softmax_dnn(node):
out = as_gpuarray_variable(out.dimshuffle(0, 1)) out = as_gpuarray_variable(out.dimshuffle(0, 1))
return [out] return [out]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used""" """ Raise a RuntimeError if cudnn can't be used"""
...@@ -1716,16 +1692,13 @@ class NoCuDNNRaise(Optimizer): ...@@ -1716,16 +1692,13 @@ class NoCuDNNRaise(Optimizer):
"cuDNN optimization was enabled, but Theano was not able" "cuDNN optimization was enabled, but Theano was not able"
" to use it. We got this error: \n" + " to use it. We got this error: \n" +
dnn_available.msg) dnn_available.msg)
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node): def local_softmax_dnn_grad(node):
if (isinstance(node.op, SoftmaxGrad) and
((node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, HostFromGpu)) or
(node.inputs[1].owner and
isinstance(node.inputs[1].owner.op, HostFromGpu)))):
if not dnn_available(): if not dnn_available():
return return
ins = [] ins = []
...@@ -1736,8 +1709,6 @@ def local_softmax_dnn_grad(node): ...@@ -1736,8 +1709,6 @@ def local_softmax_dnn_grad(node):
return return
ins.append(n.dimshuffle(0, 1, 'x', 'x')) ins.append(n.dimshuffle(0, 1, 'x', 'x'))
out = GpuDnnSoftmaxGrad('bc01', out = GpuDnnSoftmaxGrad('bc01', 'accurate', 'channel')(
'accurate', gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
'channel')(gpu_contiguous(ins[0]),
gpu_contiguous(ins[1]))
return [out.dimshuffle(0, 1)] return [out.dimshuffle(0, 1)]
...@@ -12,11 +12,13 @@ from theano import tensor, scalar, gof ...@@ -12,11 +12,13 @@ from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host, from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
...@@ -39,6 +41,10 @@ gpu_cut_copies = EquilibriumDB() ...@@ -39,6 +41,10 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'inplace', 'gpuarray') 'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
...@@ -689,6 +695,9 @@ def local_gpu_conv(node): ...@@ -689,6 +695,9 @@ def local_gpu_conv(node):
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
# Register this here so that it goes after 'local_gpu_conv'
register_opt()(conv_groupopt)
@register_opt("low_memory") @register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda]) @local_optimizer([GpuCAReduceCuda])
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论