提交 dc6c058c authored 作者: Nicolas Ballas's avatar Nicolas Ballas 提交者: Pascal Lamblin

update optim

上级 5ae763de
......@@ -14,31 +14,25 @@ from theano.tensor import (as_tensor_variable, blas, get_scalar_constant_value,
from theano.gof import Apply, Op
from theano.gof import local_optimizer
from theano.sandbox.cuda import register_opt as register_gpu
from theano.tensor.opt import register_specialize_device
### Gpu related optimization (to be moved in sandbox/cuda)
from theano.sandbox.cuda.basic_ops import (
as_cuda_ndarray_variable,
gpu_contiguous, gpu_from_host, host_from_gpu,
GpuFromHost, HostFromGpu
)
from theano.sandbox.cuda import gpu_optimizer, register_opt
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.dnn import dnn_available, dnn_conv
from theano.sandbox.cuda.blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from theano.sandbox.cuda.opt import values_eq_approx_high_tol
## Cpu implementation
from theano.tensor.nnet import conv2d as cpu_conv2d
imported_scipy_signal = False
try:
# TODO: move these back out to global scope when they no longer
# cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imported_scipy_signal = True
except ImportError:
pass
_logger = logging.getLogger("theano.tensor.nnet.conv")
_logger = logging.getLogger("theano.tensor.nnet.conv2d")
def conv2d(img,
......@@ -115,7 +109,7 @@ def conv2d(img,
class BaseConv2d(Op):
class BaseAbstractConv2d(Op):
"""Base class for ConvInferace
FIXME
......@@ -178,7 +172,7 @@ class BaseConv2d(Op):
class Conv2d(BaseConv2d):
class AbstractConv2d(BaseAbstractConv2d):
"""
FIXME
"""
......@@ -188,7 +182,7 @@ class Conv2d(BaseConv2d):
bsize=None,
border_mode="valid",
subsample=(1, 1)):
super(Conv2d, self).__init__(imshp, kshp, bsize,
super(AbstractConv2d, self).__init__(imshp, kshp, bsize,
border_mode, subsample)
def make_node(self, img, kern):
......@@ -200,29 +194,31 @@ class Conv2d(BaseConv2d):
broadcastable=[img.broadcastable[0],
kern.broadcastable[0],
False, False]
img = as_tensor_variable(img)
kern = as_tensor_variable(kern)
output = theano.tensor.tensor(dtype=img.type.dtype,
broadcastable=broadcastable)
output = img.type.__class__(dtype=img.type.dtype,
broadcastable=broadcastable)
return Apply(self, [img, kern], [output])
def perform(self, node, inp, out_):
raise NotImplementedError('Conv2d theano optimization failed')
raise NotImplementedError('AbstractConv2d theano optimization failed')
def grad(self, inp, grads):
bottom, weights = inp
top, = grads
d_bottom = Conv2d_gradInputs(self.imshp, self.kshp, self.bsize,
self.border_mode, self.subsample)(
d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
self.bsize,
self.border_mode,
self.subsample)(
weights, top, bottom.shape[-2:])
d_weights = Conv2d_gradWeights(self.imshp, self.kshp, self.bsize,
self.border_mode, self.subsample)(
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.bsize,
self.border_mode,
self.subsample)(
bottom, top, weights.shape[-2:])
return d_bottom, d_weights
class Conv2d_gradWeights(BaseConv2d):
"""Gradient wrt. filters for `Conv2d`.
class AbstractConv2d_gradWeights(BaseAbstractConv2d):
"""Gradient wrt. filters for `AbstractConv2d`.
:note: You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
......@@ -236,7 +232,7 @@ class Conv2d_gradWeights(BaseConv2d):
bsize=None,
border_mode="valid",
subsample=(1, 1)):
super(Conv2d_gradWeights, self).__init__(imshp, kshp, bsize,
super(AbstractConv2d_gradWeights, self).__init__(imshp, kshp, bsize,
border_mode, subsample)
def make_node(self, img, topgrad, shape=None):
......@@ -255,23 +251,27 @@ class Conv2d_gradWeights(BaseConv2d):
broadcastable=[topgrad.broadcastable[0],
img.broadcastable[0],
False, False]
img = as_tensor_variable(img)
topgrad = as_tensor_variable(topgrad)
output = theano.tensor.tensor(dtype=img.type.dtype,
broadcastable=broadcastable)
output = img.type.__class__(dtype=img.type.dtype,
broadcastable=broadcastable)
return Apply(self, [img, topgrad] + height_width, [output])
def perform(self, node, inp, out_):
raise NotImplementedError('Conv2d_gradWeight theano optimization failed')
raise NotImplementedError('AbstractConv2d_gradWeight theano optimization failed')
def grad(self, inp, grads):
bottom, top = inp[:2]
weights, = grads
d_bottom = Conv2d_gradInputs(self.imshp, self.kshp, self.bsize,
self.border_mode, self.subsample)(
d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
self.bsize,
self.border_mode,
self.subsample)(
weights, top, bottom.shape[-2:])
d_top = Conv2d(self.imshp, self.kshp, self.bsize,
self.border_mode, self.subsample)(bottom, weights)
d_top = AbstractConv2d(self.imshp,
self.kshp,
self.bsize,
self.border_mode,
self.subsample)(
bottom, weights)
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
return (d_bottom, d_top) + d_height_width
......@@ -282,8 +282,8 @@ class Conv2d_gradWeights(BaseConv2d):
return [[1], [1], [0], [0]] # no connection to height, width
class Conv2d_gradInputs(Conv2d):
"""Gradient wrt. inputs for `Conv2d`.
class AbstractConv2d_gradInputs(Conv2d):
"""Gradient wrt. inputs for `AbstractConv2d`.
:note: You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
......@@ -297,7 +297,7 @@ class Conv2d_gradInputs(Conv2d):
bsize=None,
border_mode="valid",
subsample=(1, 1)):
super(Conv2d_gradInputs, self).__init__(imshp, kshp, bsize,
super(AbstractConv2d_gradInputs, self).__init__(imshp, kshp, bsize,
border_mode, subsample)
def make_node(self, kern, topgrad, shape=None):
......@@ -312,24 +312,25 @@ class Conv2d_gradInputs(Conv2d):
broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1],
False, False]
kern = as_tensor_variable(kern)
topgrad = as_tensor_variable(topgrad)
output = theano.tensor.tensor(dtype=kern.type.dtype,
broadcastable=broadcastable)
output = kern.type.__class__(dtype=kern.type.dtype,
broadcastable=broadcastable)
return Apply(self, [kern, topgrad] + height_width, [output])
def perform(self, node, nodename, inp, out_, sub):
raise NotImplementedError('Conv2d_gradWeight theano optimization failed')
raise NotImplementedError('AbstractConv2d_gradWeight theano optimization failed')
def grad(self, inp, grads):
weights, top = inp[:2]
bottom, = grads
d_weights = Conv2d_gradWeights(self.imshp, self.kshp, self.bsize,
self.border_mode, self.subsample)(
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.bsize,
self.border_mode,
self.subsample)(
bottom, top, weights.shape[-2:])
d_top = Conv2d(self.imshp, self.filter_shape, self.bsize,
self.border_mode, self.subsample)(bottom, weights)
d_top = AbstractConv2d(self.imshp, self.filter_shape, self.bsize,
self.border_mode, self.subsample)(
bottom, weights)
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
return (d_weights, d_top) + d_height_width
......@@ -340,38 +341,126 @@ class Conv2d_gradInputs(Conv2d):
return [[1], [1], [0], [0]] # no connection to height, width
### Optimizations should be moved into their appropriate files
### Move to GPU optimization
### Do not replace the abstract op, only its inputs
### Abstract ops are replaced later by the device-specialized optimizations
@local_optimizer([gpu_from_host, AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_conv2d_gpu_conv(node):
    """
    Move the inputs of an abstract convolution onto the GPU.

    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)

    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)

    The abstract op itself is kept: the same op instance is re-applied to
    the transferred inputs, so every parameter it carries is preserved
    without having to rebuild it by hand.

    :param node: the Apply node currently visited by the optimizer.
    :return: a one-element list holding the replacement for
        ``node.outputs[0]``, or None when the node is left untouched.
    """
    abstract_convs = (AbstractConv2d,
                      AbstractConv2d_gradWeights,
                      AbstractConv2d_gradInputs)

    def apply_on_gpu(conv_node):
        # Only the first two inputs are tensors to transfer; whatever
        # follows (the optional shape information) is forwarded unchanged,
        # so the forward op never receives a spurious third ``None``.
        inputs = list(conv_node.inputs)
        # NOTE(review): as_cuda_ndarray_variable is expected to accept both
        # host and GPU variables (unlike gpu_from_host) -- confirm.
        inputs[0] = as_cuda_ndarray_variable(inputs[0])
        inputs[1] = as_cuda_ndarray_variable(inputs[1])
        return conv_node.op(*inputs)

    if isinstance(node.op, GpuFromHost):
        # gpu_from_host(conv) -> conv(gpu_from_host)
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           abstract_convs):
            out = apply_on_gpu(host_input.owner)
            out = theano.tensor.patternbroadcast(
                as_cuda_ndarray_variable(out),
                node.outputs[0].broadcastable)
            out.values_eq_approx = values_eq_approx_high_tol
            return [out]

    if isinstance(node.op, abstract_convs):
        # conv(host_from_gpu) -> host_from_gpu(conv)
        inp1, inp2 = node.inputs[:2]
        inp1_on_gpu = (inp1.owner and isinstance(inp1.owner.op, HostFromGpu))
        inp2_on_gpu = (inp2.owner and isinstance(inp2.owner.op, HostFromGpu))
        if inp1_on_gpu or inp2_on_gpu:
            out = apply_on_gpu(node)
            out = theano.tensor.patternbroadcast(
                out,
                node.outputs[0].broadcastable)
            out.values_eq_approx = values_eq_approx_high_tol
            # The node being replaced produced a host tensor.
            return [as_tensor_variable(out)]
# We register the optimizer that moves convolutions to the GPU.
register_gpu()(local_conv2d_gpu_conv)
@local_optimizer([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_conv2d_cudnn(node):
def replace_conv_with_cudnn(convop, inputs):
if len(node.inputs) == 3:
inp1, inp2, shape = node.inputs
else:
inp1, inp2 = node.inputs
shape = None
if not isinstance(inp1, CudaNdarrayType) or \
isinstance(inp2, CudaNdarrayType):
return None
if not dnn_available():
return None
inp1, inp2, shape = inputs
if (isinstance(convop, Conv2d)):
if (isinstance(node.op, AbstractConv2d)):
rval = dnn_conv(inp1, inp2,
border_mode=convop.border_mode,
subsample=convop.subsample,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='forward')
return rval
if (isinstance(convop, Conv2d_gradWeights)):
if (isinstance(node.op, AbstractConv2d_gradWeights)):
rval = dnn_conv(inp1.dimshuffle(1, 0, 2, 3), inp2,
border_mode=convop.border_mode,
subsample=convop.subsample,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='bprop weights')
return rval
if (isinstance(convop, Conv2d_gradInputs)):
if (isinstance(node.op, AbstractConv2d_gradInputs)):
rval = dnn_conv(inp1, inp2,
border_mode=convop.border_mode,
subsample=convop.subsample,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='bprop inputs')
return rval
register_specialize_device()(local_conv2d_cudnn)
def replace_convforward_with_corrmm(convop, inputs):
img, kern, shape = inputs
if convop.border_mode in ['full', 'valid']:
border_mode = convop.border_mode
subsample = convop.subsample
@local_optimizer(AbstractConv2d)
def local_conv2d_corrmm(convop, inputs):
img, kern = node.inputs
if not isinstance(img, CudaNdarrayType) or \
isinstance(kern, CudaNdarrayType):
return None
if node.op.border_mode in ['full', 'valid']:
border_mode = node.op.border_mode
subsample = node.op.subsample
if (border_mode == 'valid') or (subsample != (1,1)):
# need to flip the kernel for valid convolution
kern = kern[:, :, ::-1, ::-1]
......@@ -385,20 +474,20 @@ def replace_convforward_with_corrmm(convop, inputs):
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if ((subsample == (1,1)) and
(convop.imshp is not None) and
(None not in convop.imshp[-2:]) and
(convop.kshp is not None) and
(None not in convop.kshp)):
(node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and
(None not in node.op.kshp)):
# we know the kernel and output size
prod1 = convop.kshp[0] * convop.kshp[1]
prod2 = ((convop.imshp[-2] - convop.kshp[0] + 1) *
(convop.imshp[-1] - convop.kshp[1] + 1))
if ((convop.bsize is not None) and
(len(convop.imshp) == 3) and
(convop.imshp[0] is not None)):
prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))
if ((node.op.bsize is not None) and
(len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)):
# we also know batchsize and input channels
prod1 *= convop.bsize
prod2 *= convop.imshp[0]
prod1 *= node.op.bsize
prod2 *= node.op.imshp[0]
# compare to decide
if prod1 > prod2:
# (we need to wrap the result in as_cuda_ndarray_variable,
......@@ -416,108 +505,41 @@ def replace_convforward_with_corrmm(convop, inputs):
rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img))
return rval
register_specialize_device()(local_conv2d_corrmm)
def replace_convgradweight_with_corrmm(convop, inputs):
img, topgrad, shape = inputs
rval = GpuCorrMM_gradWeights(border_mode=convop.border_mode,
subsample=convop.subsample)(
@local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_corrmm(node):
    """
    Replace an AbstractConv2d_gradWeights node by GpuCorrMM_gradWeights.

    The replacement is only made when both the image and the output
    gradient are GPU variables.

    :param node: the Apply node currently visited by the optimizer.
    :return: a one-element list holding the replacement output, or None
        when the node is left untouched.
    """
    if not isinstance(node.op, AbstractConv2d_gradWeights):
        return None
    # NOTE(review): this unpacking assumes the node carries exactly three
    # inputs -- confirm against AbstractConv2d_gradWeights.make_node.
    img, topgrad, shape = node.inputs
    # The device is carried by the variable's type, not by the variable.
    if not isinstance(img.type, CudaNdarrayType) or \
            not isinstance(topgrad.type, CudaNdarrayType):
        return None
    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    # A local optimizer returns a list of replacement variables.
    return [rval]
register_specialize_device()(local_conv2d_gradweight_corrmm)
def replace_convgradinputs_withcorrmm(convop, inputs):
kern, topgrad, shape = inputs
rval = GpuCorrMM_gradInputs(border_mode=convop.border_mode,
subsample=convop.subsample)(
@local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_corrmm(node):
    """
    Replace an AbstractConv2d_gradInputs node by GpuCorrMM_gradInputs.

    The replacement is only made when both the kernel and the output
    gradient are GPU variables.

    :param node: the Apply node currently visited by the optimizer.
    :return: a one-element list holding the replacement output, or None
        when the node is left untouched.
    """
    if not isinstance(node.op, AbstractConv2d_gradInputs):
        return None
    # NOTE(review): this unpacking assumes the node carries exactly three
    # inputs -- confirm against AbstractConv2d_gradInputs.make_node.
    kern, topgrad, shape = node.inputs
    # The device is carried by the variable's type, not by the variable.
    if not isinstance(kern.type, CudaNdarrayType) or \
            not isinstance(topgrad.type, CudaNdarrayType):
        return None
    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    # A local optimizer returns a list of replacement variables.
    return [rval]
def replace_convop(convop, inputs):
    """
    Dispatch based on the convop.optim values

    cuDNN is tried first when "cudnn" is listed in ``convop.optim``; if it
    yields nothing and "corrmm" is listed, the CorrMM replacement matching
    the kind of convolution op is used.

    :param convop: the convolution op being replaced.
    :param inputs: the inputs handed over to the replacement builders.
    :return: the replacement built by the selected backend, or None when
        no backend produced one.
    """
    gpu_conv = None
    if "cudnn" in convop.optim:
        gpu_conv = replace_conv_with_cudnn(convop, inputs)
    if gpu_conv is None and "corrmm" in convop.optim:
        # Conv2d_gradInputs derives from Conv2d, so it has to be tested
        # before the base class or its branch can never be reached.
        if isinstance(convop, Conv2d_gradInputs):
            gpu_conv = replace_convgradinputs_withcorrmm(convop, inputs)
        elif isinstance(convop, Conv2d_gradWeights):
            gpu_conv = replace_convgradweight_with_corrmm(convop, inputs)
        elif isinstance(convop, Conv2d):
            gpu_conv = replace_convforward_with_corrmm(convop, inputs)
    ### FIXME add fft code
    return gpu_conv
### move to Gpu optimization
@local_optimizer([gpu_from_host, Conv2d, Conv2d_gradWeights, Conv2d_gradInputs])
def local_conv2d_gpu_conv(node):
    """
    Lift a convolution op onto the GPU.

    gpu_from_host(Conv) -> (gpu)_Conv(gpu_from_host)

    Conv(host_from_gpu) -> host_from_gpu((gpu)_Conv)

    Returns a one-element list with the replacement, or None when the node
    does not match or no GPU implementation could be built.
    """
    conv_classes = (Conv2d, Conv2d_gradWeights, Conv2d_gradInputs)

    def split_inputs(conv_inputs):
        # A convolution node carries either (a, b, shape) or just (a, b).
        if len(conv_inputs) == 3:
            first, second, shp = conv_inputs
        else:
            first, second = conv_inputs
            shp = None
        return first, second, shp

    if isinstance(node.op, GpuFromHost):
        # gpu_from_host(conv) -> gpu_conv(gpu_from_host)
        producer = node.inputs[0].owner
        if producer and isinstance(producer.op, conv_classes):
            first, second, shp = split_inputs(producer.inputs)
            gpu_args = [gpu_from_host(first), gpu_from_host(second), shp]
            replacement = replace_convop(producer.op, gpu_args)
            if replacement is None:
                return
            replacement = theano.tensor.patternbroadcast(
                gpu_from_host(replacement),
                node.outputs[0].broadcastable)
            replacement.values_eq_approx = values_eq_approx_high_tol
            return [replacement]

    if isinstance(node.op, conv_classes):
        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        first, second, shp = split_inputs(node.inputs)
        first_from_gpu = (first.owner and
                          isinstance(first.owner.op, HostFromGpu))
        second_from_gpu = (second.owner and
                           isinstance(second.owner.op, HostFromGpu))
        if first_from_gpu or second_from_gpu:
            gpu_args = [gpu_from_host(first), gpu_from_host(second), shp]
            replacement = replace_convop(node.op, gpu_args)
            if replacement is None:
                return
            replacement = theano.tensor.patternbroadcast(
                replacement,
                node.outputs[0].broadcastable)
            replacement.values_eq_approx = values_eq_approx_high_tol
            return [as_tensor_variable(replacement)]
# We register the optimizer that moves convolutions to the GPU.
register_opt()(local_conv2d_gpu_conv)
register_specialize_device()(local_conv2d_gradinputs_corrmm)
### CPU optimization
### Deactivated: focus on GPU optimization first
# @local_optimizer([Conv2d])
# @local_optimizer([AbstractConv2d])
# def local_conv2d(node):
# if isinstance(node.op, Conv2d) and not node.on_gpu:
# if isinstance(node.op, AbstractConv2d) and not node.on_gpu:
# img, kern = node.inputs
# rval = cpu_conv2d(img, kern,
# node.op.imshp, node.op.filter_shape,
......@@ -526,10 +548,10 @@ register_opt()(local_conv2d_gpu_conv)
# return [rval]
# @local_optimizer([Conv2d_gradWeights])
# @local_optimizer([AbstractConv2d_gradWeights])
# def local_conv2d_gradweight_cpu(node):
# if not isinstance(node.op, Conv2d_gradWeights) or not node.on_gpu:
# if not isinstance(node.op, AbstractConv2d_gradWeights) or not node.on_gpu:
# return
# img, topgrad = node.inputs
......@@ -555,7 +577,7 @@ register_opt()(local_conv2d_gpu_conv)
# "stride y are different from 1 and 2, as there is a bug in it.")
# if op.imshp is None or op.kshp is None:
# raise Exception("Conv2d grad when stride x!=1 or stride y!=1 we must have"
# raise Exception("AbstractConv2d grad when stride x!=1 or stride y!=1 we must have"
# " all the optional shape information")
# ####### Determine gradient on kernels ########
......@@ -604,9 +626,9 @@ register_opt()(local_conv2d_gpu_conv)
# return [dw(img, filters)]
# @local_optimizer([Conv2d_gradInputs])
# @local_optimizer([AbstractConv2d_gradInputs])
# def local_conv2d_gradinputs_cpu(node):
# if not isinstance(node.op, Conv2d_gradInputs) or not node.on_gpu:
# if not isinstance(node.op, AbstractConv2d_gradInputs) or not node.on_gpu:
# return
# # ####### Determine gradient on inputs ########
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论