Commit 1ebe9fd6 authored by Frederic Bastien

pep8 code readability.

Parent 97e12dae
@@ -16,25 +16,26 @@ import numpy
import theano
from theano.tensor import (as_tensor_variable, blas, get_constant_value,
patternbroadcast)
patternbroadcast)
from theano import Op, config
from theano.gof import Apply
from theano.gof.python25 import any
imported_scipy_signal = False
try:
# TODO: move these back out to global scope when they no longer cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
# TODO: move these back out to global scope when they no longer
# cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imported_scipy_signal = True
except ImportError:
pass
_logger=logging.getLogger("theano.tensor.nnet.conv")
_logger = logging.getLogger("theano.tensor.nnet.conv")
def conv2d(input, filters, image_shape=None, filter_shape=None,
border_mode='valid', subsample=(1,1), **kargs):
border_mode='valid', subsample=(1, 1), **kargs):
"""This function will build the symbolic graph for convolving a stack of input
images with a set of filters. The implementation is modelled after
Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp but
@@ -62,8 +63,10 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
:param filter_shape: (nb filters, stack size, nb row, nb col)
Optional, used for optimization.
:param kwargs: kwargs are passed onto ConvOp. Can be used to set the following:
unroll_batch, unroll_kern, unroll_patch, openmp (see ConvOp doc)
:param kwargs: kwargs are passed onto ConvOp.
Can be used to set the following:
unroll_batch, unroll_kern, unroll_patch,
openmp (see ConvOp doc)
openmp: By default have the same value as
config.openmp. For small image, filter,
@@ -77,8 +80,8 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
with openmp on a core 2 duo.
:rtype: symbolic 4D tensor
:return: set of feature maps generated by convolutional layer. Tensor is of shape
(batch size, nb filters, output row, output col)
:return: set of feature maps generated by convolutional layer. Tensor is
of shape (batch size, nb filters, output row, output col)
"""
@@ -87,20 +90,22 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
image_shape = list(image_shape)
for i in xrange(len(image_shape)):
if image_shape[i] is not None:
image_shape[i] = get_constant_value(as_tensor_variable(image_shape[i]))
image_shape[i] = get_constant_value(
as_tensor_variable(image_shape[i]))
assert str(image_shape[i].dtype).startswith('int')
image_shape[i] = int(image_shape[i])
if filter_shape is not None:
filter_shape = list(filter_shape)
for i in xrange(len(filter_shape)):
if filter_shape[i] is not None:
filter_shape[i] = get_constant_value(as_tensor_variable(filter_shape[i]))
filter_shape[i] = get_constant_value(
as_tensor_variable(filter_shape[i]))
assert str(filter_shape[i].dtype).startswith('int')
filter_shape[i] = int(filter_shape[i])
if image_shape and filter_shape:
try:
assert image_shape[1]==filter_shape[1]
assert image_shape[1] == filter_shape[1]
except Exception:
print 'image ', image_shape, ' filters ', filter_shape
raise
@@ -118,7 +123,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
bsize, imshp = None, None
op = ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1],
imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize,**kargs)
imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize, **kargs)
return op(input, filters)
@@ -141,13 +146,13 @@ class ConvOp(Op):
The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the convolution
operator.
where b is the mini-batch index, k the filter index and * is the
convolution operator.
"""
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
'unroll_batch', 'unroll_kern', 'unroll_patch',
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
'unroll_batch', 'unroll_kern', 'unroll_patch',
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for
given inputs. Do not set openmp here.
"""
@@ -161,63 +166,63 @@ class ConvOp(Op):
# using the real shape and the same dtype could also help.
#unroll_batch, unroll_kern, valid time, full time
speed_unroll_batch_kern=[(1, 1, 2.4661250114440918, 6.5472931861877441) ,
(1, 2, 1.5869178771972656, 5.1499760150909424) ,
(1, 3, 1.4270510673522949, 3.6593470573425293) ,
(1, 4, 1.3373479843139648, 3.3451821804046631) ,
(1, 5, 1.2818830013275146, 3.1444568634033203) ,
(1, 6, 1.2521560192108154, 3.0256359577178955) ,
(1, 10, 1.2134110927581787, 2.9174180030822754) ,
(2, 1, 1.657214879989624, 4.5261678695678711) ,
(2, 2, 1.2123160362243652, 2.9747390747070312) ,
(2, 3, 1.0758891105651855, 2.5690360069274902) ,
(2, 4, 1.0683329105377197, 2.4233770370483398) ,
(2, 5, 1.0955719947814941, 2.3999948501586914) ,
(2, 6, 1.5935721397399902, 2.6878271102905273) ,
(2, 10, 1.8511250019073486, 3.2417428493499756) ,
(3, 1, 1.5948119163513184, 3.631148099899292) ,
(3, 2, 1.0761330127716064, 2.6011371612548828) ,
(3, 3, 1.0551531314849854, 2.4200370311737061) ,
(3, 4, 1.3930759429931641, 2.5211219787597656) ,
(3, 5, 1.4330689907073975, 2.5704989433288574) ,
(3, 6, 1.362138032913208, 2.5964410305023193) ,
(3, 10, 1.6582000255584717, 2.9907989501953125) ,
(4, 1, 1.4793620109558105, 3.3473429679870605) ,
(4, 2, 1.0671560764312744, 2.4171769618988037) ,
(4, 3, 1.2569692134857178, 2.2807950973510742) ,
(4, 4, 1.3456289768218994, 2.6219108104705811) ,
(4, 5, 1.4055080413818359, 2.4606490135192871) ,
(4, 6, 1.372107982635498, 2.551663875579834) ,
(4, 10, 1.599470853805542, 2.9172940254211426) ,
(5, 1, 1.4115700721740723, 3.2077109813690186) ,
(5, 2, 1.0635769367218018, 2.2648060321807861) ,
(5, 3, 1.3842809200286865, 2.6135518550872803) ,
(5, 4, 1.3470511436462402, 2.3852400779724121) ,
(5, 5, 1.3539440631866455, 2.5245928764343262) ,
(5, 6, 1.4037849903106689, 2.5985310077667236) ,
(5, 10, 1.6120610237121582, 2.8127608299255371) ,
(6, 1, 1.3623628616333008, 3.021122932434082) ,
(6, 2, 1.1697649955749512, 2.6285450458526611) ,
(6, 3, 1.2980999946594238, 2.4746189117431641) ,
(6, 4, 1.3739941120147705, 2.5579929351806641) ,
(6, 5, 1.3967819213867188, 2.5522029399871826) ,
(6, 6, 1.4279270172119141, 2.6127138137817383) ,
(6, 10, 1.605496883392334, 2.864037036895752) ,
(10, 1, 1.6401121616363525, 2.970099925994873) ,
(10, 2, 1.46710205078125, 2.7231831550598145) ,
(10, 3, 1.4193780422210693, 2.6087639331817627) ,
(10, 4, 1.4657118320465088, 2.6246678829193115) ,
(10, 5, 1.5052611827850342, 2.6542458534240723) ,
(10, 6, 1.5214400291442871, 2.7243161201477051) ,
(10, 10, 1.6116268634796143, 2.956165075302124)]
speed_unroll_batch_kern = [(1, 1, 2.4661250114440918, 6.5472931861877441),
(1, 2, 1.5869178771972656, 5.1499760150909424),
(1, 3, 1.4270510673522949, 3.6593470573425293),
(1, 4, 1.3373479843139648, 3.3451821804046631),
(1, 5, 1.2818830013275146, 3.1444568634033203),
(1, 6, 1.2521560192108154, 3.0256359577178955),
(1, 10, 1.2134110927581787, 2.9174180030822754),
(2, 1, 1.657214879989624, 4.5261678695678711),
(2, 2, 1.2123160362243652, 2.9747390747070312),
(2, 3, 1.0758891105651855, 2.5690360069274902),
(2, 4, 1.0683329105377197, 2.4233770370483398),
(2, 5, 1.0955719947814941, 2.3999948501586914),
(2, 6, 1.5935721397399902, 2.6878271102905273),
(2, 10, 1.8511250019073486, 3.2417428493499756),
(3, 1, 1.5948119163513184, 3.631148099899292),
(3, 2, 1.0761330127716064, 2.6011371612548828),
(3, 3, 1.0551531314849854, 2.4200370311737061),
(3, 4, 1.3930759429931641, 2.5211219787597656),
(3, 5, 1.4330689907073975, 2.5704989433288574),
(3, 6, 1.362138032913208, 2.5964410305023193),
(3, 10, 1.6582000255584717, 2.9907989501953125),
(4, 1, 1.4793620109558105, 3.3473429679870605),
(4, 2, 1.0671560764312744, 2.4171769618988037),
(4, 3, 1.2569692134857178, 2.2807950973510742),
(4, 4, 1.3456289768218994, 2.6219108104705811),
(4, 5, 1.4055080413818359, 2.4606490135192871),
(4, 6, 1.372107982635498, 2.551663875579834),
(4, 10, 1.599470853805542, 2.9172940254211426),
(5, 1, 1.4115700721740723, 3.2077109813690186),
(5, 2, 1.0635769367218018, 2.2648060321807861),
(5, 3, 1.3842809200286865, 2.6135518550872803),
(5, 4, 1.3470511436462402, 2.3852400779724121),
(5, 5, 1.3539440631866455, 2.5245928764343262),
(5, 6, 1.4037849903106689, 2.5985310077667236),
(5, 10, 1.6120610237121582, 2.8127608299255371),
(6, 1, 1.3623628616333008, 3.021122932434082),
(6, 2, 1.1697649955749512, 2.6285450458526611),
(6, 3, 1.2980999946594238, 2.4746189117431641),
(6, 4, 1.3739941120147705, 2.5579929351806641),
(6, 5, 1.3967819213867188, 2.5522029399871826),
(6, 6, 1.4279270172119141, 2.6127138137817383),
(6, 10, 1.605496883392334, 2.864037036895752),
(10, 1, 1.6401121616363525, 2.970099925994873),
(10, 2, 1.46710205078125, 2.7231831550598145),
(10, 3, 1.4193780422210693, 2.6087639331817627),
(10, 4, 1.4657118320465088, 2.6246678829193115),
(10, 5, 1.5052611827850342, 2.6542458534240723),
(10, 6, 1.5214400291442871, 2.7243161201477051),
(10, 10, 1.6116268634796143, 2.956165075302124)]
#valid time, full time
speed_unroll_patch_noshape=[2.0109100341796875, 5.8175678253173828]
speed_unroll_patch_noshape = [2.0109100341796875, 5.8175678253173828]
#valid time, full time
speed_unroll_patch_shape=[1.2967290878295898, 5.5283889770507812]
speed_unroll_patch_shape = [1.2967290878295898, 5.5283889770507812]
@staticmethod
def getOutputShape(inshp, kshp, stride=(1,1), mode='valid'):
def getOutputShape(inshp, kshp, stride=(1, 1), mode='valid'):
"""
Computes the output dimensions of convolving an image of shape "inshp"
with kernels of shape "kshp".
@@ -228,26 +233,27 @@ class ConvOp(Op):
:return: (rows,cols) of output image
"""
dx, dy = stride
if mode=='valid': s = -1
else: s = 1
if mode == 'valid':
s = -1
else:
s = 1
inshp, kshp = numpy.array(inshp), numpy.array(kshp)
return numpy.int64(numpy.ceil((inshp + s*kshp - s*1)/\
numpy.array([dx,dy], dtype='float')))
return numpy.int64(numpy.ceil((inshp + s * kshp - s * 1) /
numpy.array([dx, dy], dtype='float')))
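
A worked instance of the formula above, as a standalone sketch (not part of
the diff): with s = -1 in 'valid' mode and s = +1 in 'full' mode, each output
dimension is ceil((inshp + s*kshp - s) / stride).

    import numpy

    def out_shape(inshp, kshp, stride=(1, 1), mode='valid'):
        # mirrors getOutputShape, using astype for the final cast
        s = -1 if mode == 'valid' else 1
        inshp, kshp = numpy.array(inshp), numpy.array(kshp)
        return numpy.ceil((inshp + s * kshp - s * 1) /
                          numpy.array(stride, dtype='float')).astype('int64')

    out_shape((28, 28), (5, 5))                   # [24 24]: 28 - 5 + 1
    out_shape((28, 28), (5, 5), mode='full')      # [32 32]: 28 + 5 - 1
    out_shape((28, 28), (5, 5), stride=(2, 2))    # [12 12]: ceil(24 / 2)
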
def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
dx=1, dy=1,
output_mode='valid',
unroll_batch=None,
unroll_kern=None,
unroll_patch=None,
imshp_logical=None,
kshp_logical=None,
kshp_logical_top_aligned=True,
verbose=0,
version=-1,
openmp=None):
dx=1, dy=1,
output_mode='valid',
unroll_batch=None,
unroll_kern=None,
unroll_patch=None,
imshp_logical=None,
kshp_logical=None,
kshp_logical_top_aligned=True,
verbose=0,
version=-1,
openmp=None):
"""
Initializes a ConvOp with given output_mode (full/valid). All other
parameters are optional and are only used to generate more optimized c
@@ -259,12 +265,13 @@ class ConvOp(Op):
By default we try to select the fastest version. You can specify it
with the unroll_batch, unroll_kern, and unroll_patch parameter.
The second type of optimization is hardcoding some dimensions into the code
when all shape are know.
The second type of optimization is hardcoding some dimensions into the
code when all shape are know.
This make a significant difference for the 'full' output_mode.
Some times, the fastest implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
unroll_patch=False} with all other shape parameters being provided.
Some times, the fastest implementation on x86-64 uses
{unroll_batch=4, unroll_kern=4, unroll_patch=False}
with all other shape parameters being provided.
For optimizing other architectures, see:
Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
@@ -278,7 +285,8 @@ class ConvOp(Op):
Optional parameters: (will generate more optimal c code)
:type imshp: tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images.
:type imshp: tuple of len 2 or 3: 2 for 2d image,
3 for a stack of 2d images.
:param imshp: (stacksize, nb image row, nb image col)
:type kshp: tuple of len 2
:param kshp: (nb kernel row, nb kernel col)
@@ -294,16 +302,18 @@ class ConvOp(Op):
Params which select the version of code used:
:type unroll_patch: bool
:param unroll_patch: use a version of c_code that unroll the patch loop that don't
request all shape information to work, but if all shape information are present, will
:param unroll_patch: use a version of c_code that unroll the patch loop
that don't request all shape information to work, but if all shape
information are present, will
use it to hardcode the value in the code for faster code.
:type unroll_batch:int
:param unroll_batch: use a version of c_code that unroll the batch(by unroll_batch) and
the nkern(by unroll_kern) loop. The size must by a multiple of bsize or nkern
respectively.
:param unroll_batch: use a version of c_code that unroll the batch
(by unroll_batch) and the nkern(by unroll_kern) loop. The size
must by a multiple of bsize or nkern respectively.
:type unroll_kern:int
:param unroll_kern: use a version of c_code that unroll the batch(by unroll_batch) and
the nkern(by unroll_kern) loop. The size must by a multiple of bsize or nkern
:param unroll_kern: use a version of c_code that unroll the batch
(by unroll_batch) and the nkern(by unroll_kern) loop. The size
must by a multiple of bsize or nkern
respectively.
:type verbose: int
@@ -316,8 +326,10 @@ class ConvOp(Op):
:param kshp_logical_top_aligned: idem
"""
# We must continue to consider None as 1 for backward compatibility.
if dx is None: dx = 1
if dy is None: dy = 1
if dx is None:
dx = 1
if dy is None:
dy = 1
if int(dx) != dx:
raise TypeError('ConvOp.__init__ param dx must be an int', dx)
@@ -330,8 +342,9 @@ class ConvOp(Op):
all_shape = imshp is not None and kshp is not None and \
nkern is not None and bsize is not None
if (unroll_batch>0 or unroll_kern>0) and not all_shape:
raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed")
if (unroll_batch > 0 or unroll_kern > 0) and not all_shape:
raise Exception("In ConvOp, when using unroll_batch and"
" unroll_nkern, all shape are needed")
if openmp is None:
openmp = theano.config.openmp
@@ -343,9 +356,9 @@ class ConvOp(Op):
if imshp is not None:
imshp = tuple(imshp)
if len(imshp)==2:
imshp = (1,)+imshp
elif len(imshp)==3:
if len(imshp) == 2:
imshp = (1,) + imshp
elif len(imshp) == 3:
imshp = imshp
else:
raise Exception("bad len for imshp")
@@ -356,73 +369,83 @@ class ConvOp(Op):
self.kshp = kshp
self.nkern = nkern
self.bsize=bsize
self.dx=dx
self.dy=dy
self.verbose=verbose
self.version=version
self.bsize = bsize
self.dx = dx
self.dy = dy
self.verbose = verbose
self.version = version
if openmp is None:
openmp = config.openmp
self.openmp = openmp
# a triple
self.imshp_logical = self.imshp
if imshp_logical is not None: self.imshp_logical = tuple(imshp_logical)
if imshp_logical is not None:
self.imshp_logical = tuple(imshp_logical)
assert (self.imshp is None and self.imshp_logical is None) or \
(len(self.imshp) == len(self.imshp_logical))
# a pair
self.kshp_logical = self.kshp
if kshp_logical is not None: self.kshp_logical = tuple(kshp_logical)
if kshp_logical is not None:
self.kshp_logical = tuple(kshp_logical)
self.kshp_logical_top_aligned = kshp_logical_top_aligned
self.unroll_batch=unroll_batch
self.unroll_kern=unroll_kern
self.unroll_patch=unroll_patch
self.unroll_batch = unroll_batch
self.unroll_kern = unroll_kern
self.unroll_patch = unroll_patch
if self.unroll_batch and not self.unroll_kern: self.unroll_kern = 1
if self.unroll_kern and not self.unroll_batch: self.unroll_batch = 1
if self.unroll_batch and not self.unroll_kern:
self.unroll_kern = 1
if self.unroll_kern and not self.unroll_batch:
self.unroll_batch = 1
#downcast unroll_batch if not a divisor of batch size
if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
if self.unroll_batch > 0 and self.bsize % self.unroll_batch != 0:
if self.bsize<=self.unroll_batch:
if self.bsize <= self.unroll_batch:
self.unroll_batch = self.bsize
else:
#find the maximum value under unroll_batch that would work
new=self.unroll_batch
assert(new>=1)
while self.bsize % new!=0:
new-=1
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\
"must be 0 or a divisor of bsize(%i). We revert it to %i. This"\
" won't change the result, but may make it slower."
new = self.unroll_batch
assert(new >= 1)
while self.bsize % new != 0:
new -= 1
warnstr = ("OPTIMISATION WARNING: in ConvOp.__init__() "
"unroll_batch(%i) must be 0 or a divisor of"
" bsize(%i). We revert it to %i. This"
" won't change the result, but may make it slower.")
_logger.warn(warnstr, self.unroll_batch, self.bsize, new)
self.unroll_batch=new
self.unroll_batch = new
#downcast unroll_kern if not a divisor of nb of kernel
if self.unroll_kern>0 and self.nkern % self.unroll_kern!=0:
if self.unroll_kern > 0 and self.nkern % self.unroll_kern != 0:
if self.nkern<=self.unroll_kern:
if self.nkern <= self.unroll_kern:
self.unroll_kern = self.nkern
else:
#find the maximum value under unroll_kern that would work
new=self.unroll_kern
assert(new>=1)
while self.nkern % new!=0:
new-=1
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%i)"\
"should be 0 or a divisor of nkern(%i). We revert it to %i."\
"This won't change the result, but may make it slower."
new = self.unroll_kern
assert(new >= 1)
while self.nkern % new != 0:
new -= 1
warnstr = ("OPTIMISATION WARNING: in ConvOp.__init__()"
" unroll_kern(%i) should be 0 or a divisor of"
" nkern(%i). We revert it to %i. This"
" won't change the result, but may make it slower.")
_logger.warn(warnstr, self.unroll_kern, self.nkern, new)
self.unroll_kern=new
self.unroll_kern = new
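
The downcast rule applied twice above, restated as a standalone sketch (the
helper name is hypothetical): keep the requested unroll factor when it divides
the loop size, otherwise fall back to the largest divisor below it.

    def downcast_unroll(size, unroll):
        # size is bsize or nkern; unroll is unroll_batch or unroll_kern
        if size <= unroll:
            return size
        new = unroll
        while size % new != 0:
            new -= 1
        return new

    downcast_unroll(10, 4)   # 2: neither 4 nor 3 divides 10
    downcast_unroll(10, 5)   # 5: already a divisor, returned unchanged
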
if all_shape:
self.outshp = ConvOp.getOutputShape(self.imshp_logical[1:], self.kshp_logical, (dx,dy), output_mode)
self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical[1:], self.kshp_logical, (1,1), output_mode)
self.outshp = ConvOp.getOutputShape(self.imshp_logical[1:],
self.kshp_logical, (dx, dy),
output_mode)
self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical[1:],
self.kshp_logical, (1, 1),
output_mode)
else:
self.outshp = None
self.fulloutshp = None
@@ -430,51 +453,59 @@ class ConvOp(Op):
self.out_mode = output_mode
if not self.out_mode in ["valid", "full"]:
raise Exception("Mode %s not implemented"%self.out_mode)
raise Exception("Mode %s not implemented" % self.out_mode)
if all_shape and not (self.outshp > 0).all():
raise Exception(("Bad size for the output shape. Verify that [post-"\
"supersampling] input shape (%s) and kern shape(%s) are ok. "\
"(Hint: kerns must fit inside image in valid mode)")%
(self.imshp_logical,self.kshp_logical))
raise Exception("Bad size for the output shape. Verify that [post-"
"supersampling] input shape (%s) and kern"
" shape(%s) are ok. (Hint: kerns must fit inside"
" image in valid mode)" %
(self.imshp_logical, self.kshp_logical))
if (self.unroll_kern is None and
self.unroll_batch is None and
self.unroll_patch is None):
if self.unroll_kern is None and self.unroll_batch is None and self.unroll_patch is None:
#no version specified. Find the faster we have
if self.bsize is None and self.nkern is None:
self.unroll_patch = True
elif self.bsize is not None and self.nkern is not None:
bsize=self.bsize
nkern=self.nkern
bsize = self.bsize
nkern = self.nkern
if bsize is None:
bsize=1
bsize = 1
if nkern is None:
nkern=1
mode_idx=0
if self.out_mode!="valid":
mode_idx=1
nkern = 1
mode_idx = 0
if self.out_mode != "valid":
mode_idx = 1
if all_shape:
time_unroll_patch = self.speed_unroll_patch_shape[mode_idx]
else:
time_unroll_patch = self.speed_unroll_patch_noshape[mode_idx]
time_unroll_patch = self.speed_unroll_patch_noshape[
mode_idx]
time_unroll_batch_kern = 9999999
for i in xrange(len(self.speed_unroll_batch_kern)):
if bsize%self.speed_unroll_batch_kern[i][0]==0 and nkern%self.speed_unroll_batch_kern[i][1]==0:
if self.speed_unroll_batch_kern[i][2+mode_idx]<time_unroll_batch_kern:
time_unroll_batch_kern=self.speed_unroll_batch_kern[i][2+mode_idx]
time_unroll_batch_kern_idx=i
if (bsize % self.speed_unroll_batch_kern[i][0] == 0 and
nkern % self.speed_unroll_batch_kern[i][1] == 0):
if self.speed_unroll_batch_kern[i][2 + mode_idx] < time_unroll_batch_kern:
time_unroll_batch_kern = self.speed_unroll_batch_kern[i][2 + mode_idx]
time_unroll_batch_kern_idx = i
if time_unroll_patch < time_unroll_batch_kern:
self.unroll_patch = True
else:
self.unroll_batch=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][0]
self.unroll_kern=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][1]
self.unroll_batch = self.speed_unroll_batch_kern[
time_unroll_batch_kern_idx][0]
self.unroll_kern = self.speed_unroll_batch_kern[
time_unroll_batch_kern_idx][1]
self.unroll_patch = False
_logger.debug("AUTO FIND VERSION OF C_CODE OF CONV OP "
"%s %s %s %s %s %s %s",
self.unroll_batch, self.unroll_kern, self.unroll_patch,
self.bsize, self.nkern, time_unroll_patch,
time_unroll_batch_kern)
"%s %s %s %s %s %s %s",
self.unroll_batch, self.unroll_kern,
self.unroll_patch,
self.bsize, self.nkern, time_unroll_patch,
time_unroll_batch_kern)
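
The auto-selection above, restated as a sketch: scan the timing table for the
fastest (unroll_batch, unroll_kern) pair whose factors divide bsize and nkern,
then prefer unroll_patch if its measured time is lower. Table rows are
(unroll_batch, unroll_kern, valid time, full time).

    def pick_version(table, bsize, nkern, mode_idx, time_unroll_patch):
        best_time, best = 9999999, None
        for ub, uk, t_valid, t_full in table:
            t = (t_valid, t_full)[mode_idx]
            if bsize % ub == 0 and nkern % uk == 0 and t < best_time:
                best_time, best = t, (ub, uk)
        if time_unroll_patch < best_time:
            return None    # take the unroll_patch code path instead
        return best        # the (unroll_batch, unroll_kern) pair to use
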
self._rehash()
if config.op.set_flops:
@@ -504,41 +535,46 @@ class ConvOp(Op):
return self.__hashval
def __str__(self):
return "ConvOp{" +",".join(str((a, getattr(self, a))) for a in self.__attrnames) + "}"
return "ConvOp{" + ",".join(str((a, getattr(self, a)))
for a in self.__attrnames) + "}"
def set_flops(self):
""" Useful with the hack in profilemode to print the MFlops"""
if self.out_mode=="valid":
self.flops=self.kshp[0]*self.kshp[1]*2#nb mul and add by output pixed
self.flops*=self.outshp[0]*self.outshp[1]#nb flops by output image
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
else: #full mode not implemented
self.flops=0
for out_row in xrange(self.outshp[0]):#loop over output row
for out_col in xrange(self.outshp[0]):#loop over output col
for row in xrange(self.kshp[0]):#loop over kern row
if (row+out_row-self.kshp[0]+1<0 or
row+out_row-self.kshp[0]+1>=self.imshp[1]):
if self.out_mode == "valid":
# nb mul and add by output pixed
self.flops = self.kshp[0] * self.kshp[1] * 2
#nb flops by output image
self.flops *= self.outshp[0] * self.outshp[1]
# for all outputs images#n_stack==self.imshp[0]
self.flops *= self.imshp[0] * self.nkern * self.bsize
else: # full mode not implemented
self.flops = 0
for out_row in xrange(self.outshp[0]): # loop over output row
for out_col in xrange(self.outshp[0]): # loop over output col
for row in xrange(self.kshp[0]): # loop over kern row
if (row + out_row - self.kshp[0] + 1 < 0 or
row + out_row - self.kshp[0] + 1 >= self.imshp[1]):
continue
col=0
max_col=self.kshp[1]
img_col=out_col-self.kshp[1]+1
max_col=min(max_col,self.imshp[2]-img_col)
col = 0
max_col = self.kshp[1]
img_col = out_col - self.kshp[1] + 1
max_col = min(max_col, self.imshp[2] - img_col)
if img_col<0:
col=-img_col
img_col+=col
while col < max_col: #loop over kern col
self.flops+=2
col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
if img_col < 0:
col = -img_col
img_col += col
while col < max_col: # loop over kern col
self.flops += 2
col += 1
# for all outputs images#n_stack==self.imshp[0]
self.flops *= self.imshp[0] * self.nkern * self.bsize
assert self.flops == self.bsize * self.nkern * self.imshp[0] * \
self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2
self.kshp[0] * self.kshp[1] * \
self.imshp[1] * self.imshp[2] * 2
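
A worked instance of the valid-mode count above: each output pixel costs
kshp[0]*kshp[1] multiply-add pairs (2 flops each), repeated for every stack
slice, kernel, and batch element. For example, with imshp=(3, 28, 28),
kshp=(5, 5), outshp=(24, 24), nkern=16 and bsize=8:

    flops = 5 * 5 * 2      # nb mul and add by output pixel
    flops *= 24 * 24       # nb flops by output image
    flops *= 3 * 16 * 8    # stack size * nkern * bsize
    # flops == 11059200, roughly 11 MFlops per call
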
def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09)
@@ -551,19 +587,23 @@ class ConvOp(Op):
_kerns = as_tensor_variable(kerns)
# TODO: lift this restriction by upcasting either inputs or kerns
if _inputs.ndim != 4:
raise TypeError('ConvOp (make_node) requires input be a 4D tensor; received "%s" (%i dims)' % (inputs, _inputs.ndim))
raise TypeError('ConvOp (make_node) requires input be a 4D tensor;'
' received "%s" (%i dims)' %
(inputs, _inputs.ndim))
if _kerns.ndim != 4:
raise TypeError('make_node requires 4D tensor of kernels')
if _inputs.type.dtype != _kerns.type.dtype:
raise NotImplementedError("The image and the kernel must have the same type."
"inputs(%s), kerns(%s)"%(_inputs.dtype, _kerns.dtype))
raise NotImplementedError(
"The image and the kernel must have the same type."
"inputs(%s), kerns(%s)" % (_inputs.dtype, _kerns.dtype))
if self.outshp is not None:
bcastable23 = [self.outshp[0]==1, self.outshp[1]==1]
bcastable23 = [self.outshp[0] == 1, self.outshp[1] == 1]
else:
bcastable23 = [False, False]
output = theano.tensor.tensor(dtype=_inputs.type.dtype,
broadcastable=[_inputs.broadcastable[0],
_kerns.broadcastable[0]]+bcastable23);
_kerns.broadcastable[0]] +
bcastable23)
return Apply(self, [_inputs, _kerns], [output])
@@ -582,10 +622,12 @@ class ConvOp(Op):
if self.kshp_logical:
kshp = self.kshp_logical
try:
fmshp = ConvOp.getOutputShape(imshp[1:], kshp, (self.dx,self.dy), self.out_mode)
fmshp = ConvOp.getOutputShape(imshp[1:],
kshp, (self.dx, self.dy),
self.out_mode)
except TypeError:
raise theano.tensor.ShapeError()
outshp = (batch_size,fmo) + tuple(fmshp)
outshp = (batch_size, fmo) + tuple(fmshp)
return [outshp]
else:
# Haven't implemented this case. imshp and kshp may be symbollic
@@ -593,8 +635,7 @@ class ConvOp(Op):
# we simply let the default function do its work.
raise theano.tensor.ShapeError()
def perform(self,node, inp, out):
def perform(self, node, inp, out):
"""
By default if len(img2d.shape)==3, we
"""
@@ -603,9 +644,12 @@ class ConvOp(Op):
if not imported_scipy_signal:
raise theano.gof.utils.MethodNotDefined(
"c_headers", type(self), self.__class__.__name__,
"Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.")
"Need the python package for scipy.signal to be installed "
"for the python implementation. You can use the C"
" implementation instead.")
# TODO: move these back out to global scope when they no longer cause an atexit error
# TODO: move these back out to global scope when they no longer
# cause an atexit error
imshp = self.imshp
if imshp is None or any([x is None for x in imshp]):
imshp = tuple(img2d.shape[1:])
@@ -634,39 +678,43 @@ class ConvOp(Op):
if self.fulloutshp is not None:
fulloutshp = tuple(self.fulloutshp)
else:
fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical[1:], kshp_logical, (1,1), self.out_mode))
fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical[
1:], kshp_logical, (1, 1), self.out_mode))
if z[0] is None or z[0].shape!=(bsize,)+(nkern,)+fulloutshp:
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
dtype=img2d.dtype)
zz=z[0]
if z[0] is None or z[0].shape != (bsize,) + (nkern,) + fulloutshp:
z[0] = numpy.zeros((bsize,) + (nkern,) + fulloutshp,
dtype=img2d.dtype)
zz = z[0]
stacklen = imshp[0]
img2d = img2d.reshape((bsize,)+ imshp)
filtersflipped = filtersflipped.reshape((nkern,stacklen)+kshp)
img2d = img2d.reshape((bsize,) + imshp)
filtersflipped = filtersflipped.reshape((nkern, stacklen) + kshp)
if self.imshp != self.imshp_logical:
# assuming that to get from imshp to imshp logical we insert zeros in missing spots
rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2])))
buf = numpy.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype)
buf[:,:,::rstride, ::cstride] = img2d
buf = numpy.zeros((bsize,) + imshp_logical, dtype=img2d.dtype)
buf[:, :, ::rstride, ::cstride] = img2d
img2d = buf
del buf, rstride, cstride
if kshp != kshp_logical:
rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1])))
buf = numpy.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype)
buf = numpy.zeros((nkern, stacklen) +
self.kshp_logical, dtype=filtersflipped.dtype)
if self.kshp_logical_top_aligned:
roffset=coffset=0
roffset = coffset = 0
else:
roffset=(kshp_logical[0] - (kshp[0]*rstride) - 1+rstride) % rstride
coffset=(kshp_logical[1] - (kshp[1]*cstride) - 1+cstride) % cstride
roffset = (kshp_logical[0] - (kshp[0] *
rstride) - 1 + rstride) % rstride
coffset = (kshp_logical[1] - (kshp[1] *
cstride) - 1 + cstride) % cstride
assert roffset >= 0
assert coffset >= 0
buf[:,:,roffset::rstride, coffset::cstride] = filtersflipped
buf[:, :, roffset::rstride, coffset::cstride] = filtersflipped
filtersflipped = buf
del buf, rstride, cstride
@@ -675,39 +723,39 @@ class ConvOp(Op):
for b in xrange(bsize):
for n in xrange(nkern):
zz[b,n,...].fill(0)
zz[b, n, ...].fill(0)
for im0 in xrange(stacklen):
zz[b,n,...] += _convolve2d(\
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
zz[b, n, ...] += _convolve2d(img2d[b, im0, ...],
filtersflipped[n, im0, ...],
1, val, bval, 0)
if False:
if False and self.out_mode=="full":
img2d2 = numpy.zeros((bsize,stacklen,
imshp[1]+2*kshp[0]-2,
imshp[2]+2*kshp[1]-2))
img2d2[:,:,kshp[0]-1:kshp[0]-1+imshp[1],
kshp[1]-1:kshp[1]-1+imshp[2]] = img2d
if False and self.out_mode == "full":
img2d2 = numpy.zeros((bsize, stacklen,
imshp[1] + 2 * kshp[0] - 2,
imshp[2] + 2 * kshp[1] - 2))
img2d2[:, :, kshp[0] - 1:kshp[0] - 1 + imshp[1],
kshp[1] - 1:kshp[1] - 1 + imshp[2]] = img2d
img2d = img2d2
#N_image_shape = image_data.shape
for b in xrange(bsize):
for n in xrange(nkern):
zz[b,n,...].fill(0)
zz[b, n, ...].fill(0)
for im0 in xrange(stacklen):
for row in xrange(0,zz.shape[2],self.dx):
for col in xrange(0,zz.shape[3],self.dy):
zz[b,n,row,col] += (img2d[b,im0,row:row+kshp[0],col:col+kshp[1]]*\
filtersflipped[n,im0,::-1,::-1]).sum()
for row in xrange(0, zz.shape[2], self.dx):
for col in xrange(0, zz.shape[3], self.dy):
zz[b, n, row, col] += (img2d[b, im0, row:row + kshp[0], col:col + kshp[1]] *
filtersflipped[n, im0, ::-1, ::-1]).sum()
#We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we
#execute the c version which is much faster.
if self.dx>1 or self.dy>1:
zz = zz[:,:,0::self.dx,0::self.dy].copy()
z[0]=zz
if self.dx > 1 or self.dy > 1:
zz = zz[:, :, 0::self.dx, 0::self.dy].copy()
z[0] = zz
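
What the scipy path of perform() computes for one (batch, kernel) pair in
'valid' mode, as a minimal numpy/scipy sketch (shapes are illustrative; the op
stores already-flipped filters, so a plain 2D convolution per stack slice
reproduces it):

    import numpy
    from scipy.signal import convolve2d

    img = numpy.random.rand(3, 28, 28)   # one image: (stack, rows, cols)
    kern = numpy.random.rand(3, 5, 5)    # one flipped kernel per stack slice
    out = numpy.zeros((24, 24))
    for i in range(3):                   # accumulate over the input stack
        out += convolve2d(img[i], kern[i], mode='valid')
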
def grad(self, inp, grads):
inputs, kerns = inp
@@ -724,34 +772,38 @@ class ConvOp(Op):
# build a "node", that should be equivalent to the one given by
# self.make_node, but using conv3D instead of self.
tmp_node = theano.tensor.nnet.conv3D(
V=inputs.dimshuffle(0, 2, 3, 'x', 1),
W=kerns[:, :, ::-1, ::-1].dimshuffle(0, 2, 3, 'x', 1),
b=theano.tensor.alloc(numpy.asarray(0, dtype=kerns.dtype), kerns.shape[0]),
d=(self.dx, self.dy, 1))
node = theano.tensor.addbroadcast(tmp_node, 3).dimshuffle(0, 4, 1, 2)
V=inputs.dimshuffle(0, 2, 3, 'x', 1),
W=kerns[:, :, ::-1, ::-1].dimshuffle(0, 2, 3, 'x', 1),
b=theano.tensor.alloc(numpy.asarray(0, dtype=kerns.dtype),
kerns.shape[0]),
d=(self.dx, self.dy, 1))
node = theano.tensor.addbroadcast(
tmp_node, 3).dimshuffle(0, 4, 1, 2)
# mimic what happens inside theano.grad: get the input gradient
# of the final cost wrt all variables involved.
tmp_gmap = theano.gradient.grad_sources_inputs([(node, gz)], [inputs, kerns])
tmp_gmap = theano.gradient.grad_sources_inputs(
[(node, gz)], [inputs, kerns])
return [tmp_gmap[inputs], tmp_gmap[kerns]]
if self.dx not in (1, 2) or self.dy not in (1, 2):
raise NotImplementedError("ERROR: We disable ConvOp.grad now when dx or "\
"dy are different from 1 and 2, as there is a bug in it.")
raise NotImplementedError(
"ERROR: We disable ConvOp.grad now when dx or "
"dy are different from 1 and 2, as there is a bug in it.")
all_shape = self.imshp is not None and self.kshp is not None and \
self.nkern is not None and self.bsize is not None
all_shape = (self.imshp is not None and self.kshp is not None and
self.nkern is not None and self.bsize is not None)
if not all_shape and (self.dx!=1 or self.dy!=1):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\
if not all_shape and (self.dx != 1 or self.dy != 1):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "
"the optional shape information")
####### Determine gradient on kernels ########
assert inputs.ndim==4 and kerns.ndim==4
assert inputs.ndim == 4 and kerns.ndim == 4
newin = inputs.dimshuffle((1,0,2,3))
newgz = gz.dimshuffle((1,0,2,3))
newin = inputs.dimshuffle((1, 0, 2, 3))
newgz = gz.dimshuffle((1, 0, 2, 3))
(bsize, nkern) = None, None
imshp = None
@@ -762,48 +814,55 @@ class ConvOp(Op):
if self.out_mode == 'valid':
(img, filters) = (newin, newgz)
kshp_logical = self.fulloutshp
kshp_logical_top_aligned=False
kshp_logical_top_aligned = False
if all_shape:
(bsize, nkern) = (self.imshp[0], self.nkern)
imshp = (self.bsize, self.imshp[1], self.imshp[2])
kshp = self.outshp
kshp = self.outshp
un_b = self.unroll_batch
un_k = self.unroll_kern
elif self.out_mode == 'full':
(img, filters) = (newgz, newin)
kshp_logical = None
kshp_logical_top_aligned=True
kshp_logical_top_aligned = True
if all_shape:
imshp_logical = (self.bsize, self.fulloutshp[0], self.fulloutshp[1])
imshp_logical = (self.bsize,
self.fulloutshp[0],
self.fulloutshp[1])
(bsize, nkern) = (self.nkern, self.imshp[0])
imshp = (self.bsize, self.outshp[0], self.outshp[1])
kshp = self.imshp[1:]
kshp = self.imshp[1:]
un_b = self.unroll_kern
un_k = self.unroll_batch
else:
raise NotImplementedError('Only [full,valid] modes are currently supported.')
raise NotImplementedError(
'Only [full,valid] modes are currently supported.')
filters = filters[:,:,::-1,::-1] #flip them
filters = filters[:, :, ::-1, ::-1] # flip them
if 0: #find good value for the unroll
if 0: # find good value for the unroll
if all_shape and un_b!=0 and bsize%un_b!=0:
if bsize<un_b:
if all_shape and un_b != 0 and bsize % un_b != 0:
if bsize < un_b:
un_b = bsize
else:
un_b = 1
_logger.warn("Optimization Warning: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!")
_logger.warn(
"Optimization Warning: in ConvOp.grad() we can't "
" determine a good unroll value for the batch."
" Maybe you can optimize this!")
if all_shape and un_k!=0 and nkern%un_k!=0:
if nkern<un_k:
if all_shape and un_k != 0 and nkern % un_k != 0:
if nkern < un_k:
un_k = nkern
else:
un_k = 1
_logger.warn("Optimization Warning: in ConvOp.grad() we can't determine "\
"a good unroll value for the kernel. Maybe you can optimize this!")
_logger.warn(
"Optimization Warning: in ConvOp.grad() we can't"
" determine a good unroll value for the kernel. Maybe"
" you can optimize this!")
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid',
dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
imshp_logical=imshp_logical,
kshp_logical=kshp_logical,
@@ -811,8 +870,8 @@ class ConvOp(Op):
version=self.version,
verbose=self.verbose)
else: # let __init__ choose c params be chosen automatically from shapes
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid',
else: # let __init__ choose c params be chosen automatically from shapes
dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
unroll_batch=None, unroll_kern=None, unroll_patch=None,
imshp_logical=imshp_logical,
kshp_logical=kshp_logical,
@@ -820,26 +879,25 @@ class ConvOp(Op):
version=self.version,
verbose=self.verbose)
if hasattr(self,'flops'):
if hasattr(self, 'flops'):
dw.set_flops()
dw = dw(img,filters)
dw = dw(img, filters)
if all_shape:
assert (dw.owner.op.outshp==self.kshp).all()
assert (dw.owner.op.outshp == self.kshp).all()
if self.out_mode == 'valid':
# before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1]
dw = dw.dimshuffle((1,0,2,3))
dw = dw[:,:,::-1,::-1]
dw = dw.dimshuffle((1, 0, 2, 3))
dw = dw[:, :, ::-1, ::-1]
####### Determine gradient on inputs ########
mode = 'valid'
if not self.out_mode == 'full':
mode = 'full'
filters = kerns.dimshuffle((1,0,2,3))
filters = filters[:,:,::-1,::-1]
filters = kerns.dimshuffle((1, 0, 2, 3))
filters = filters[:, :, ::-1, ::-1]
nkern = None
imshp = None
imshp_logical = None
@@ -848,33 +906,36 @@ class ConvOp(Op):
if all_shape:
nkern = self.imshp[0]
imshp = (self.nkern, self.outshp[0], self.outshp[1])
imshp_logical=(self.nkern, self.fulloutshp[0], self.fulloutshp[1])
imshp_logical = (self.nkern, self.fulloutshp[0],
self.fulloutshp[1])
if 0: # hard-code c generation parameters
if 0: # hard-code c generation parameters
din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode,
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
1, 1, output_mode=mode,
unroll_batch=un_b, unroll_kern=un_k,
unroll_patch=un_p,
imshp_logical=imshp_logical,
kshp_logical=None,
version=-1,#we we change the mode, we don't forward the version.
version=-1, # we we change the mode, we don't forward the version.
verbose=self.verbose)
else: # let __init__ figure out the unrolling / patch sizes
else: # let __init__ figure out the unrolling / patch sizes
din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode,
unroll_batch=None, unroll_kern=None, unroll_patch=None,
1, 1, output_mode=mode,
unroll_batch=None, unroll_kern=None,
unroll_patch=None,
imshp_logical=imshp_logical,
kshp_logical=None,
version=-1,#we we change the mode, we don't forward the version.
version=-1, # we we change the mode, we don't forward the version.
verbose=self.verbose)
if hasattr(self,'flops'):
if hasattr(self, 'flops'):
din.set_flops()
din = din(gz,filters)
din = din(gz, filters)
assert (din.owner.op.outshp is None and self.imshp is None) or \
(din.owner.op.outshp is None) or \
(din.owner.op.outshp==self.imshp[1:]).all()
(din.owner.op.outshp == self.imshp[1:]).all()
# din and dw should have the same broadcasting pattern as the
# parameters they are the gradient of (resp. inputs and kerns).
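
The two identities grad() builds on, checked numerically in a standalone
sketch (cross-correlation view, single channel, unit strides): the kernel
gradient is a valid correlation of the input with the output gradient, and the
input gradient is a full convolution of the output gradient with the kernel.

    import numpy
    from scipy.signal import correlate2d, convolve2d

    x = numpy.random.rand(7, 7)            # input image
    w = numpy.random.rand(3, 3)            # kernel
    y = correlate2d(x, w, mode='valid')    # forward pass, shape (5, 5)
    gz = numpy.ones_like(y)                # upstream gradient

    gw = correlate2d(x, gz, mode='valid')  # dL/dw, shape (3, 3)
    gx = convolve2d(gz, w, mode='full')    # dL/dx, shape (7, 7)
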
@@ -902,10 +963,14 @@ using namespace std;
""" Return True if we will generate code that use gemm.
"""
#the gemm version only support that case
if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
if self.out_mode == 'valid' and self.dx == 0 and self.dy == 0:
#We use a faster version in those case.
if (self.imshp != self.imshp_logical or self.kshp != self.kshp_logical
or self.unroll_patch or self.unroll_batch>0 or self.unroll_kern>0):
if (self.imshp != self.imshp_logical or
self.kshp != self.kshp_logical or
self.unroll_patch or
self.unroll_batch > 0 or
self.unroll_kern > 0):
return False
return True
return False
@@ -918,7 +983,9 @@ using namespace std;
def c_no_compile_args(self):
#when the ksph==(1,1) gcc 4.3.0 segfault during the
#compilation with -O3. This don't happen at -O2
if theano.gof.cmodule.gcc_version() in ['4.3.0'] and self.kshp==(1, 1):
if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
self.kshp == (1, 1)):
return ['-O3']
else:
return []
@@ -928,7 +995,8 @@ using namespace std;
if self.use_blas():
ret = blas.ldflags(libs=False, flags=True)
if theano.gof.cmodule.gcc_version() in ['4.3.0'] and self.kshp==(1, 1):
if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
self.kshp == (1, 1)):
ret += ['-O2']
if self.openmp:
ret += ['-fopenmp']
@@ -951,123 +1019,140 @@ using namespace std;
if node.inputs[0].type.dtype != node.inputs[1].type.dtype:
raise NotImplementedError()
assert node.inputs[0].type.dtype == node.inputs[1].type.dtype
d=locals()
d = locals()
d.update(sub)
all_shape = self.imshp is not None and self.kshp is not None and \
self.nkern is not None and self.bsize is not None
all_shape = (self.imshp is not None and self.kshp is not None and
self.nkern is not None and self.bsize is not None)
d["self_out_mode"]=self.out_mode
d["self_dx"]=self.dx
d["self_dy"]=self.dy
d["mode"]=self.out_mode.upper()
d["affectation"]="="
d["self_out_mode"] = self.out_mode
d["self_dx"] = self.dx
d["self_dy"] = self.dy
d["mode"] = self.out_mode.upper()
d["affectation"] = "="
if all_shape:
d["self_bsize"]=self.bsize
d["self_nkern"]=self.nkern
d["self_outshp0"]=self.outshp[0]
d["self_outshp1"]=self.outshp[1]
d["self_imshp0"]=self.imshp[0]
d["self_imshp1"]=self.imshp[1]
d["self_imshp2"]=self.imshp[2]
d["self_kshp0"]=self.kshp[0]
d["self_kshp1"]=self.kshp[1]
d["self_bsize"] = self.bsize
d["self_nkern"] = self.nkern
d["self_outshp0"] = self.outshp[0]
d["self_outshp1"] = self.outshp[1]
d["self_imshp0"] = self.imshp[0]
d["self_imshp1"] = self.imshp[1]
d["self_imshp2"] = self.imshp[2]
d["self_kshp0"] = self.kshp[0]
d["self_kshp1"] = self.kshp[1]
d["self_kshp_logical_r"] = self.kshp_logical[0]
d["self_kshp_logical_c"] = self.kshp_logical[1]
d["self_kshp_logical_stride_r"] = int(numpy.ceil(self.kshp_logical[0] / float(self.kshp[0])))
d["self_kshp_logical_stride_c"] = int(numpy.ceil(self.kshp_logical[1] / float(self.kshp[1])))
d["self_imshp_logical_r"] = self.imshp_logical[1] #numpy.B. 1 not 0
d["self_imshp_logical_c"] = self.imshp_logical[2]#numpy.B. 2 not 1
d["self_imshp_logical_stride_r"] = int(numpy.ceil(self.imshp_logical[1] / float(self.imshp[1])))
d["self_imshp_logical_stride_c"] = int(numpy.ceil(self.imshp_logical[2] / float(self.imshp[2])))
if not self.imshp[0]==1: d["affectation"]="+="
d["all_shape"]="1"
d["dim_zz_const"]="const"
d["dim_zz_affect"]=""
d["assert_size"]="""
d["self_kshp_logical_stride_r"] = int(numpy.ceil(
self.kshp_logical[0] / float(self.kshp[0])))
d["self_kshp_logical_stride_c"] = int(numpy.ceil(
self.kshp_logical[1] / float(self.kshp[1])))
d["self_imshp_logical_r"] = self.imshp_logical[1]
#numpy.B. 1 not 0
d["self_imshp_logical_c"] = self.imshp_logical[2]
# numpy.B. 2 not 1
d["self_imshp_logical_stride_r"] = int(numpy.ceil(
self.imshp_logical[1] / float(self.imshp[1])))
d["self_imshp_logical_stride_c"] = int(numpy.ceil(
self.imshp_logical[2] / float(self.imshp[2])))
if not self.imshp[0] == 1:
d["affectation"] = "+="
d["all_shape"] = "1"
d["dim_zz_const"] = "const"
d["dim_zz_affect"] = ""
d["assert_size"] = """
// Check the batch size and the number of kernels (sometimes constant in the graph)
if(img2d_dim[0] != %(self_bsize)s!=0){
PyErr_Format(PyExc_ValueError,
"the batch size in the image (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[0], (long)%(self_bsize)s);
"the batch size in the image (%%ld) at run time is different"
" than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[0], (long)%(self_bsize)s);
%(fail)s;
}
if(kerns_dim[0] != %(self_nkern)s!=0){
PyErr_Format(PyExc_ValueError,
"the number of kernels in the filter (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[0], (long)%(self_nkern)s);
"the number of kernels in the filter (%%ld) at run time is"
" different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[0], (long)%(self_nkern)s);
%(fail)s;
}
// Check the size of the image (sometimes constant in the graph)
if(img2d_dim[1] != %(self_imshp0)s){
PyErr_Format(PyExc_ValueError,
"the image stack size (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[1], (long)%(self_imshp0)s);
"the image stack size (%%ld) at run time is different than"
" at build time (%%ld) for the ConvOp.",
(long)img2d_dim[1], (long)%(self_imshp0)s);
%(fail)s;
}
if(img2d_dim[2] != %(self_imshp1)s){
PyErr_Format(PyExc_ValueError,
"the number of rows in the image (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[2], (long)%(self_imshp1)s);
"the number of rows in the image (%%ld) at run time is different"
" than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[2], (long)%(self_imshp1)s);
%(fail)s;
}
if(img2d_dim[3] != %(self_imshp2)s){
PyErr_Format(PyExc_ValueError,
"the number of columns in the image (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[3], (long)%(self_imshp2)s);
"the number of columns in the image (%%ld) at run time is"
" different than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[3], (long)%(self_imshp2)s);
%(fail)s;
}
// Check the size of the output (sometimes constant in the graph)
if(dim_zz[0] != %(self_outshp0)s!=0){
PyErr_Format(PyExc_ValueError,
"the precomputed number of rows in the output (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)dim_zz[0], (long)%(self_outshp0)s);
"the precomputed number of rows in the output (%%ld) at run time"
" is different than at build time (%%ld) for the ConvOp.",
(long)dim_zz[0], (long)%(self_outshp0)s);
%(fail)s;
}
if(dim_zz[1] != %(self_outshp1)s!=0){
PyErr_Format(PyExc_ValueError,
"the precomputed number of columns in the output (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)dim_zz[1], (long)%(self_outshp1)s);
"the precomputed number of columns in the output (%%ld) at run"
" time is different than at build time (%%ld) for the ConvOp.",
(long)dim_zz[1], (long)%(self_outshp1)s);
%(fail)s;
}
// Check the size of the filter (sometimes constant in the graph)
if(kerns_dim[1] %% %(self_imshp0)s!=0){
PyErr_Format(PyExc_ValueError,
"the filter stack size (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[1], (long)%(self_imshp0)s);
"the filter stack size (%%ld) at run time is different than at"
" build time (%%ld) for the ConvOp.",
(long)kerns_dim[1], (long)%(self_imshp0)s);
%(fail)s;
}
if(kerns_dim[2] %% %(self_kshp0)s!=0){
PyErr_Format(PyExc_ValueError,
"the number of rows in the filter (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[2], (long)%(self_kshp0)s);
"the number of rows in the filter (%%ld) at run time is different"
" than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[2], (long)%(self_kshp0)s);
%(fail)s;
}
if(kerns_dim[3] %% %(self_kshp1)s!=0){
PyErr_Format(PyExc_ValueError,
"the number of columns in the filter (%%ld) at run time is different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[3], (long)%(self_kshp1)s);
"the number of columns in the filter (%%ld) at run time is"
" different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[3], (long)%(self_kshp1)s);
%(fail)s;
}
"""%(locals())
""" % (locals())
else:
d["self_bsize"]="%(img2d)s->dimensions[0]"%d
d["self_nkern"]="%(filtersflipped)s->dimensions[0]"%d
d["self_outshp0"]="-1"
d["self_outshp1"]="-1"
d["self_imshp0"]="%(img2d)s->dimensions[1]"%d
d["self_imshp1"]="%(img2d)s->dimensions[2]"%d
d["self_imshp2"]="%(img2d)s->dimensions[3]"%d
d["self_kshp0"]="%(filtersflipped)s->dimensions[2]"%d
d["self_kshp1"]="%(filtersflipped)s->dimensions[3]"%d
d["affectation"]="+="
d["all_shape"]="0"
d["dim_zz_const"]=""
d["dim_zz_affect"]="""
d["self_bsize"] = "%(img2d)s->dimensions[0]" % d
d["self_nkern"] = "%(filtersflipped)s->dimensions[0]" % d
d["self_outshp0"] = "-1"
d["self_outshp1"] = "-1"
d["self_imshp0"] = "%(img2d)s->dimensions[1]" % d
d["self_imshp1"] = "%(img2d)s->dimensions[2]" % d
d["self_imshp2"] = "%(img2d)s->dimensions[3]" % d
d["self_kshp0"] = "%(filtersflipped)s->dimensions[2]" % d
d["self_kshp1"] = "%(filtersflipped)s->dimensions[3]" % d
d["affectation"] = "+="
d["all_shape"] = "0"
d["dim_zz_const"] = ""
d["dim_zz_affect"] = """
if (mode == FULL) {
dim_zz[0] = (int)ceil((dim_im[0]+dim_ker0-1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]+dim_ker1-1)/float(%(self_dy)s));
@@ -1075,8 +1160,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
dim_zz[0] = (int)ceil((dim_im[0]-dim_ker0+1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]-dim_ker1+1)/float(%(self_dy)s));
}
"""% d
d["assert_size"]=""
""" % d
d["assert_size"] = ""
if self.kshp_logical_top_aligned:
d["self_kshp_logical_offset_r"] = 0
@@ -1084,36 +1169,47 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
elif all_shape:
rstride = d["self_kshp_logical_stride_r"]
cstride = d["self_kshp_logical_stride_c"]
d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride
d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] - (self.kshp[1]*cstride) - 1+cstride) % cstride
d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] -
(self.kshp[0] * rstride) -
1 + rstride) % rstride
d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] -
(self.kshp[1] * cstride) -
1 + cstride) % cstride
del rstride, cstride
if node.inputs[0].type.dtype=="float32": d["type"]="float"
elif node.inputs[0].type.dtype=="float64": d["type"]="double"
else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype)
d["gemm"]='dgemm_'
if not d["type"]=="double":d["gemm"]='sgemm_'
if node.inputs[0].type.dtype == "float32":
d["type"] = "float"
elif node.inputs[0].type.dtype == "float64":
d["type"] = "double"
else:
raise Exception("Type %s not implemented" %
node.inputs[0].type.dtype)
d["gemm"] = 'dgemm_'
if not d["type"] == "double":
d["gemm"] = 'sgemm_'
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
if self.verbose:
_logger.debug("return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version")
_logger.debug("return imshp!=imshp_logical or"
" self.kshp != self.kshp_logical shape version")
return _conv_op_code_a % d
if self.unroll_patch:
if self.verbose:
_logger.debug("return unroll patch version. all_shape=%s", all_shape)
return _conv_op_code_unroll_patch%d
if self.unroll_batch>0 or self.unroll_kern>0:
assert self.unroll_batch>0
assert self.unroll_kern>0
_logger.debug("return unroll patch version. all_shape=%s",
all_shape)
return _conv_op_code_unroll_patch % d
if self.unroll_batch > 0 or self.unroll_kern > 0:
assert self.unroll_batch > 0
assert self.unroll_kern > 0
if self.verbose:
_logger.debug("return unrolled batch (%s) and kern code (%s)",
str(self.unroll_batch), str(self.unroll_kern))
str(self.unroll_batch), str(self.unroll_kern))
return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
self.unroll_kern)
#TODO: should we choose the unroll size automatically with the bigger divisor under 5?
if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
if self.out_mode == 'valid' and self.dx == 0 and self.dy == 0:
if self.verbose:
_logger.debug("return gemm version")
return _conv_op_code_valid_gemm % d
@@ -1126,7 +1222,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
_conv_op_code_a = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
PyArrayObject *ain1=NULL, *ain2=NULL;
PyArrayObject *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
@@ -1216,12 +1313,17 @@ if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
PyErr_SetString(PyExc_ValueError,
"invalid mode, only full and valid are supported");
%(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (typenum != typenum_f) {
PyErr_SetString(PyExc_ValueError, "Input types must match");
%(fail)s;
}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
@@ -1249,10 +1351,19 @@ Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[0] != %(z)s->dimensions[1] *
%(z)s->dimensions[2] *
%(z)s->dimensions[3] *
(npy_intp)sizeof(%(type)s))
%(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] *
%(z)s->dimensions[3] *
(npy_intp)sizeof(%(type)s))
%(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s))
%(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s))
%(fail)s;
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
@@ -1267,34 +1378,41 @@ for(int b=0;b< %(self_bsize)s;b++){
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
/// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s; //row position in logical output image
int new_m; //row anchor in logical input image (we will loop upward from here)
// Reposition index into input image based on requested output size
//row position in logical output image
int pos_m = iter_m*%(self_dx)s;
//row anchor in logical input image (we will loop upward from here)
int new_m;
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker_log[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s; // current col position in logical output image
// current col position in logical output image
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) { // loop over logical rows in kernel
// loop over logical rows in kernel
for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) {
// ind0_log: row position in logical input image
int ind0_log = (new_m-j_log);
int ind0_log = (new_m-j_log); // ind0_log: row position in logical input image
if ((j_log < %(self_kshp_logical_offset_r)s) || (j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s)
if ((j_log < %(self_kshp_logical_offset_r)s) ||
(j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s)
continue;
if (ind0_log MOD %(self_imshp_logical_stride_r)s)
continue;
int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) / %(self_kshp_logical_stride_r)s);
int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) /
%(self_kshp_logical_stride_r)s);
int ind0_phys = (ind0_log / %(self_imshp_logical_stride_r)s);
//std::cerr <<"j_log" << j_log << " j_phys " << j_phys << " " << ind0_phys << "\\n";
if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]]; //This is a pointer to the current row of the kernel
//This is a pointer to the current row of the kernel
const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
if(ind0_log < 0 || ind0_log >= dim_im_log[0]){
// the current row of the kernel is off the image
}else{
@@ -1304,30 +1422,40 @@ for(int b=0;b< %(self_bsize)s;b++){
for (int ind1_log=pos_n-k; k<max_k; k++,ind1_log--) {
if (1)
{
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s)
if ((k < %(self_kshp_logical_offset_c)s) ||
(k - %(self_kshp_logical_offset_c)s) MOD
%(self_kshp_logical_stride_c)s)
continue;
if (ind1_log MOD %(self_imshp_logical_stride_c)s)
if (ind1_log MOD
%(self_imshp_logical_stride_c)s)
continue;
}
sum+= idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s] * idx_in[ind1_log / %(self_imshp_logical_stride_c)s];
sum += idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
%(self_kshp_logical_stride_c)s] *
idx_in[ind1_log / %(self_imshp_logical_stride_c)s];
}
}
}else{
const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
}else{ // mode==VALID
//JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]];
const %(type)s* idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
int new_n = (pos_n+dim_ker_log[1]-1);
if (%(self_imshp_logical_stride_c)s != 1) // a general loop
{
for (int k=0,last=new_n; k < dim_ker_log[1]; k++,last--) {
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s)
if ((k < %(self_kshp_logical_offset_c)s) ||
(k - %(self_kshp_logical_offset_c)s) MOD
%(self_kshp_logical_stride_c)s)
continue;
else if (last MOD %(self_imshp_logical_stride_c)s)
continue;
else
{
sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s]*idx_in[last/%(self_imshp_logical_stride_c)s];
sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
%(self_kshp_logical_stride_c)s] *
idx_in[last/%(self_imshp_logical_stride_c)s];
}
}
}
@@ -1335,7 +1463,8 @@ for(int b=0;b< %(self_bsize)s;b++){
{
int offset = %(self_kshp_logical_offset_c)s;
int k_phys=0;
for (int k_log=offset,last=new_n-offset; k_log < dim_ker_log[1]; ) {
for (int k_log=offset,last=new_n-offset;
k_log < dim_ker_log[1]; ) {
sum += idx_hvals[k_phys]*idx_in[last];
++k_phys;
last -= %(self_kshp_logical_stride_c)s;
@@ -1343,10 +1472,10 @@ for(int b=0;b< %(self_bsize)s;b++){
}
}
}
}//for j
}//for j_log
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
}//for iter_n
}//for iter_m
}//for stack_size
if (0 && (mode==FULL)){
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
@@ -1585,33 +1714,36 @@ free(kbuf);
Py_XDECREF(img2d);
"""
def gen_conv_code_unroll_batch_kern(d,unroll_bsize=1, unroll_ksize=1):
def gen_conv_code_unroll_batch_kern(d, unroll_bsize=1, unroll_ksize=1):
""" c_code for ConvOp that unroll the batch size loop
"""
assert unroll_bsize>0 and unroll_ksize>0
if d.has_key("unroll_bsize") or d.has_key("unroll_ksize") or d.has_key("unroll_iter") or d.has_key("unroll_biter") or d.has_key("unroll_kiter"):
assert unroll_bsize > 0 and unroll_ksize > 0
if "unroll_bsize" in d or "unroll_ksize" in d or "unroll_iter" in d or "unroll_biter" in d or "unroll_kiter" in d:
raise Exception("We can't use this dictionnary as we will overwrite some of its containt")
d=d.copy()
d = d.copy()
d["unroll_bsize"]=unroll_bsize
d["unroll_ksize"]=unroll_ksize
def my_dup(st,size):
s=""
d["unroll_bsize"] = unroll_bsize
d["unroll_ksize"] = unroll_ksize
def my_dup(st, size):
s = ""
for i in xrange(size):
d["unroll_iter"]=i
s+=st%d
return s+"\n"
d["unroll_iter"] = i
s += st % d
return s + "\n"
def my_dup2(st):
s=""
iter=0
s = ""
iter = 0
for i in xrange(unroll_bsize):
d["unroll_biter"]=i
d["unroll_biter"] = i
for j in xrange(unroll_ksize):
d["unroll_kiter"]=j
d["unroll_iter"]=iter
iter+=1
s+=st%d
return s+"\n"
d["unroll_kiter"] = j
d["unroll_iter"] = iter
iter += 1
s += st % d
return s + "\n"
ret = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
@@ -1765,7 +1897,8 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
""" % d
ret += my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize * unroll_ksize)
ret += my_dup(
"%(type)s sum%(unroll_iter)s=0;", unroll_bsize * unroll_ksize)
ret += """
// Sum over kernel, if index into image is out of bounds
......