提交 1ebe9fd6 authored 作者: Frederic Bastien's avatar Frederic Bastien

PEP 8 code readability.

上级 97e12dae
...@@ -23,18 +23,19 @@ from theano.gof.python25 import any ...@@ -23,18 +23,19 @@ from theano.gof.python25 import any
imported_scipy_signal = False imported_scipy_signal = False
try: try:
# TODO: move these back out to global scope when they no longer cause an atexit error # TODO: move these back out to global scope when they no longer
# cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d from scipy.signal.sigtools import _convolve2d
imported_scipy_signal = True imported_scipy_signal = True
except ImportError: except ImportError:
pass pass
_logger=logging.getLogger("theano.tensor.nnet.conv") _logger = logging.getLogger("theano.tensor.nnet.conv")
def conv2d(input, filters, image_shape=None, filter_shape=None, def conv2d(input, filters, image_shape=None, filter_shape=None,
border_mode='valid', subsample=(1,1), **kargs): border_mode='valid', subsample=(1, 1), **kargs):
"""This function will build the symbolic graph for convolving a stack of input """This function will build the symbolic graph for convolving a stack of input
images with a set of filters. The implementation is modelled after images with a set of filters. The implementation is modelled after
Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp but Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp but
...@@ -62,8 +63,10 @@ def conv2d(input, filters, image_shape=None, filter_shape=None, ...@@ -62,8 +63,10 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
:param filter_shape: (nb filters, stack size, nb row, nb col) :param filter_shape: (nb filters, stack size, nb row, nb col)
Optional, used for optimization. Optional, used for optimization.
:param kwargs: kwargs are passed onto ConvOp. Can be used to set the following: :param kwargs: kwargs are passed onto ConvOp.
unroll_batch, unroll_kern, unroll_patch, openmp (see ConvOp doc) Can be used to set the following:
unroll_batch, unroll_kern, unroll_patch,
openmp (see ConvOp doc)
openmp: By default have the same value as openmp: By default have the same value as
config.openmp. For small image, filter, config.openmp. For small image, filter,
...@@ -77,8 +80,8 @@ def conv2d(input, filters, image_shape=None, filter_shape=None, ...@@ -77,8 +80,8 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
with openmp on a core 2 duo. with openmp on a core 2 duo.
:rtype: symbolic 4D tensor :rtype: symbolic 4D tensor
:return: set of feature maps generated by convolutional layer. Tensor is of shape :return: set of feature maps generated by convolutional layer. Tensor is
(batch size, nb filters, output row, output col) of shape (batch size, nb filters, output row, output col)
""" """
...@@ -87,20 +90,22 @@ def conv2d(input, filters, image_shape=None, filter_shape=None, ...@@ -87,20 +90,22 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
image_shape = list(image_shape) image_shape = list(image_shape)
for i in xrange(len(image_shape)): for i in xrange(len(image_shape)):
if image_shape[i] is not None: if image_shape[i] is not None:
image_shape[i] = get_constant_value(as_tensor_variable(image_shape[i])) image_shape[i] = get_constant_value(
as_tensor_variable(image_shape[i]))
assert str(image_shape[i].dtype).startswith('int') assert str(image_shape[i].dtype).startswith('int')
image_shape[i] = int(image_shape[i]) image_shape[i] = int(image_shape[i])
if filter_shape is not None: if filter_shape is not None:
filter_shape = list(filter_shape) filter_shape = list(filter_shape)
for i in xrange(len(filter_shape)): for i in xrange(len(filter_shape)):
if filter_shape[i] is not None: if filter_shape[i] is not None:
filter_shape[i] = get_constant_value(as_tensor_variable(filter_shape[i])) filter_shape[i] = get_constant_value(
as_tensor_variable(filter_shape[i]))
assert str(filter_shape[i].dtype).startswith('int') assert str(filter_shape[i].dtype).startswith('int')
filter_shape[i] = int(filter_shape[i]) filter_shape[i] = int(filter_shape[i])
if image_shape and filter_shape: if image_shape and filter_shape:
try: try:
assert image_shape[1]==filter_shape[1] assert image_shape[1] == filter_shape[1]
except Exception: except Exception:
print 'image ', image_shape, ' filters ', filter_shape print 'image ', image_shape, ' filters ', filter_shape
raise raise
...@@ -118,7 +123,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None, ...@@ -118,7 +123,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
bsize, imshp = None, None bsize, imshp = None, None
op = ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1], op = ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1],
imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize,**kargs) imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize, **kargs)
return op(input, filters) return op(input, filters)
...@@ -141,8 +146,8 @@ class ConvOp(Op): ...@@ -141,8 +146,8 @@ class ConvOp(Op):
The output of ConvOp is a 4D tensor, generated as follows: The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the convolution where b is the mini-batch index, k the filter index and * is the
operator. convolution operator.
""" """
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode', __attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
...@@ -161,63 +166,63 @@ class ConvOp(Op): ...@@ -161,63 +166,63 @@ class ConvOp(Op):
# using the real shape and the same dtype could also help. # using the real shape and the same dtype could also help.
#unroll_batch, unroll_kern, valid time, full time #unroll_batch, unroll_kern, valid time, full time
speed_unroll_batch_kern=[(1, 1, 2.4661250114440918, 6.5472931861877441) , speed_unroll_batch_kern = [(1, 1, 2.4661250114440918, 6.5472931861877441),
(1, 2, 1.5869178771972656, 5.1499760150909424) , (1, 2, 1.5869178771972656, 5.1499760150909424),
(1, 3, 1.4270510673522949, 3.6593470573425293) , (1, 3, 1.4270510673522949, 3.6593470573425293),
(1, 4, 1.3373479843139648, 3.3451821804046631) , (1, 4, 1.3373479843139648, 3.3451821804046631),
(1, 5, 1.2818830013275146, 3.1444568634033203) , (1, 5, 1.2818830013275146, 3.1444568634033203),
(1, 6, 1.2521560192108154, 3.0256359577178955) , (1, 6, 1.2521560192108154, 3.0256359577178955),
(1, 10, 1.2134110927581787, 2.9174180030822754) , (1, 10, 1.2134110927581787, 2.9174180030822754),
(2, 1, 1.657214879989624, 4.5261678695678711) , (2, 1, 1.657214879989624, 4.5261678695678711),
(2, 2, 1.2123160362243652, 2.9747390747070312) , (2, 2, 1.2123160362243652, 2.9747390747070312),
(2, 3, 1.0758891105651855, 2.5690360069274902) , (2, 3, 1.0758891105651855, 2.5690360069274902),
(2, 4, 1.0683329105377197, 2.4233770370483398) , (2, 4, 1.0683329105377197, 2.4233770370483398),
(2, 5, 1.0955719947814941, 2.3999948501586914) , (2, 5, 1.0955719947814941, 2.3999948501586914),
(2, 6, 1.5935721397399902, 2.6878271102905273) , (2, 6, 1.5935721397399902, 2.6878271102905273),
(2, 10, 1.8511250019073486, 3.2417428493499756) , (2, 10, 1.8511250019073486, 3.2417428493499756),
(3, 1, 1.5948119163513184, 3.631148099899292) , (3, 1, 1.5948119163513184, 3.631148099899292),
(3, 2, 1.0761330127716064, 2.6011371612548828) , (3, 2, 1.0761330127716064, 2.6011371612548828),
(3, 3, 1.0551531314849854, 2.4200370311737061) , (3, 3, 1.0551531314849854, 2.4200370311737061),
(3, 4, 1.3930759429931641, 2.5211219787597656) , (3, 4, 1.3930759429931641, 2.5211219787597656),
(3, 5, 1.4330689907073975, 2.5704989433288574) , (3, 5, 1.4330689907073975, 2.5704989433288574),
(3, 6, 1.362138032913208, 2.5964410305023193) , (3, 6, 1.362138032913208, 2.5964410305023193),
(3, 10, 1.6582000255584717, 2.9907989501953125) , (3, 10, 1.6582000255584717, 2.9907989501953125),
(4, 1, 1.4793620109558105, 3.3473429679870605) , (4, 1, 1.4793620109558105, 3.3473429679870605),
(4, 2, 1.0671560764312744, 2.4171769618988037) , (4, 2, 1.0671560764312744, 2.4171769618988037),
(4, 3, 1.2569692134857178, 2.2807950973510742) , (4, 3, 1.2569692134857178, 2.2807950973510742),
(4, 4, 1.3456289768218994, 2.6219108104705811) , (4, 4, 1.3456289768218994, 2.6219108104705811),
(4, 5, 1.4055080413818359, 2.4606490135192871) , (4, 5, 1.4055080413818359, 2.4606490135192871),
(4, 6, 1.372107982635498, 2.551663875579834) , (4, 6, 1.372107982635498, 2.551663875579834),
(4, 10, 1.599470853805542, 2.9172940254211426) , (4, 10, 1.599470853805542, 2.9172940254211426),
(5, 1, 1.4115700721740723, 3.2077109813690186) , (5, 1, 1.4115700721740723, 3.2077109813690186),
(5, 2, 1.0635769367218018, 2.2648060321807861) , (5, 2, 1.0635769367218018, 2.2648060321807861),
(5, 3, 1.3842809200286865, 2.6135518550872803) , (5, 3, 1.3842809200286865, 2.6135518550872803),
(5, 4, 1.3470511436462402, 2.3852400779724121) , (5, 4, 1.3470511436462402, 2.3852400779724121),
(5, 5, 1.3539440631866455, 2.5245928764343262) , (5, 5, 1.3539440631866455, 2.5245928764343262),
(5, 6, 1.4037849903106689, 2.5985310077667236) , (5, 6, 1.4037849903106689, 2.5985310077667236),
(5, 10, 1.6120610237121582, 2.8127608299255371) , (5, 10, 1.6120610237121582, 2.8127608299255371),
(6, 1, 1.3623628616333008, 3.021122932434082) , (6, 1, 1.3623628616333008, 3.021122932434082),
(6, 2, 1.1697649955749512, 2.6285450458526611) , (6, 2, 1.1697649955749512, 2.6285450458526611),
(6, 3, 1.2980999946594238, 2.4746189117431641) , (6, 3, 1.2980999946594238, 2.4746189117431641),
(6, 4, 1.3739941120147705, 2.5579929351806641) , (6, 4, 1.3739941120147705, 2.5579929351806641),
(6, 5, 1.3967819213867188, 2.5522029399871826) , (6, 5, 1.3967819213867188, 2.5522029399871826),
(6, 6, 1.4279270172119141, 2.6127138137817383) , (6, 6, 1.4279270172119141, 2.6127138137817383),
(6, 10, 1.605496883392334, 2.864037036895752) , (6, 10, 1.605496883392334, 2.864037036895752),
(10, 1, 1.6401121616363525, 2.970099925994873) , (10, 1, 1.6401121616363525, 2.970099925994873),
(10, 2, 1.46710205078125, 2.7231831550598145) , (10, 2, 1.46710205078125, 2.7231831550598145),
(10, 3, 1.4193780422210693, 2.6087639331817627) , (10, 3, 1.4193780422210693, 2.6087639331817627),
(10, 4, 1.4657118320465088, 2.6246678829193115) , (10, 4, 1.4657118320465088, 2.6246678829193115),
(10, 5, 1.5052611827850342, 2.6542458534240723) , (10, 5, 1.5052611827850342, 2.6542458534240723),
(10, 6, 1.5214400291442871, 2.7243161201477051) , (10, 6, 1.5214400291442871, 2.7243161201477051),
(10, 10, 1.6116268634796143, 2.956165075302124)] (10, 10, 1.6116268634796143, 2.956165075302124)]
#valid time, full time #valid time, full time
speed_unroll_patch_noshape=[2.0109100341796875, 5.8175678253173828] speed_unroll_patch_noshape = [2.0109100341796875, 5.8175678253173828]
#valid time, full time #valid time, full time
speed_unroll_patch_shape=[1.2967290878295898, 5.5283889770507812] speed_unroll_patch_shape = [1.2967290878295898, 5.5283889770507812]
@staticmethod @staticmethod
def getOutputShape(inshp, kshp, stride=(1,1), mode='valid'): def getOutputShape(inshp, kshp, stride=(1, 1), mode='valid'):
""" """
Computes the output dimensions of convolving an image of shape "inshp" Computes the output dimensions of convolving an image of shape "inshp"
with kernels of shape "kshp". with kernels of shape "kshp".
...@@ -228,12 +233,13 @@ class ConvOp(Op): ...@@ -228,12 +233,13 @@ class ConvOp(Op):
:return: (rows,cols) of output image :return: (rows,cols) of output image
""" """
dx, dy = stride dx, dy = stride
if mode=='valid': s = -1 if mode == 'valid':
else: s = 1 s = -1
else:
s = 1
inshp, kshp = numpy.array(inshp), numpy.array(kshp) inshp, kshp = numpy.array(inshp), numpy.array(kshp)
return numpy.int64(numpy.ceil((inshp + s*kshp - s*1)/\ return numpy.int64(numpy.ceil((inshp + s * kshp - s * 1) /
numpy.array([dx,dy], dtype='float'))) numpy.array([dx, dy], dtype='float')))
def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None, def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
dx=1, dy=1, dx=1, dy=1,
...@@ -259,12 +265,13 @@ class ConvOp(Op): ...@@ -259,12 +265,13 @@ class ConvOp(Op):
By default we try to select the fastest version. You can specify it By default we try to select the fastest version. You can specify it
with the unroll_batch, unroll_kern, and unroll_patch parameter. with the unroll_batch, unroll_kern, and unroll_patch parameter.
The second type of optimization is hardcoding some dimensions into the code The second type of optimization is hardcoding some dimensions into the
when all shapes are known. code when all shapes are known.
This makes a significant difference for the 'full' output_mode. This makes a significant difference for the 'full' output_mode.
Some times, the fastest implementation on x86-64 uses {unroll_batch=4, unroll_kern=4, Some times, the fastest implementation on x86-64 uses
unroll_patch=False} with all other shape parameters being provided. {unroll_batch=4, unroll_kern=4, unroll_patch=False}
with all other shape parameters being provided.
For optimizing other architectures, see: For optimizing other architectures, see:
Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
...@@ -278,7 +285,8 @@ class ConvOp(Op): ...@@ -278,7 +285,8 @@ class ConvOp(Op):
Optional parameters: (will generate more optimal c code) Optional parameters: (will generate more optimal c code)
:type imshp: tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images. :type imshp: tuple of len 2 or 3: 2 for 2d image,
3 for a stack of 2d images.
:param imshp: (stacksize, nb image row, nb image col) :param imshp: (stacksize, nb image row, nb image col)
:type kshp: tuple of len 2 :type kshp: tuple of len 2
:param kshp: (nb kernel row, nb kernel col) :param kshp: (nb kernel row, nb kernel col)
...@@ -294,16 +302,18 @@ class ConvOp(Op): ...@@ -294,16 +302,18 @@ class ConvOp(Op):
Params which select the version of code used: Params which select the version of code used:
:type unroll_patch: bool :type unroll_patch: bool
:param unroll_patch: use a version of c_code that unrolls the patch loop and doesn't :param unroll_patch: use a version of c_code that unrolls the patch loop
require all shape information to work, but if all shape information is present, will and doesn't require all shape information to work, but if all shape
information is present, will
use it to hardcode the values in the code for faster code. use it to hardcode the values in the code for faster code.
:type unroll_batch:int :type unroll_batch:int
:param unroll_batch: use a version of c_code that unrolls the batch(by unroll_batch) and :param unroll_batch: use a version of c_code that unrolls the batch
the nkern(by unroll_kern) loop. The size must be a divisor of bsize or nkern (by unroll_batch) and the nkern(by unroll_kern) loop. The size
respectively. must be a divisor of bsize or nkern respectively.
:type unroll_kern:int :type unroll_kern:int
:param unroll_kern: use a version of c_code that unrolls the batch(by unroll_batch) and :param unroll_kern: use a version of c_code that unrolls the batch
the nkern(by unroll_kern) loop. The size must be a divisor of bsize or nkern (by unroll_batch) and the nkern(by unroll_kern) loop. The size
must be a divisor of bsize or nkern
respectively. respectively.
:type verbose: int :type verbose: int
...@@ -316,8 +326,10 @@ class ConvOp(Op): ...@@ -316,8 +326,10 @@ class ConvOp(Op):
:param kshp_logical_top_aligned: idem :param kshp_logical_top_aligned: idem
""" """
# We must continue to consider None as 1 for backward compatibility. # We must continue to consider None as 1 for backward compatibility.
if dx is None: dx = 1 if dx is None:
if dy is None: dy = 1 dx = 1
if dy is None:
dy = 1
if int(dx) != dx: if int(dx) != dx:
raise TypeError('ConvOp.__init__ param dx must be an int', dx) raise TypeError('ConvOp.__init__ param dx must be an int', dx)
...@@ -330,8 +342,9 @@ class ConvOp(Op): ...@@ -330,8 +342,9 @@ class ConvOp(Op):
all_shape = imshp is not None and kshp is not None and \ all_shape = imshp is not None and kshp is not None and \
nkern is not None and bsize is not None nkern is not None and bsize is not None
if (unroll_batch>0 or unroll_kern>0) and not all_shape: if (unroll_batch > 0 or unroll_kern > 0) and not all_shape:
raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed") raise Exception("In ConvOp, when using unroll_batch and"
" unroll_nkern, all shape are needed")
if openmp is None: if openmp is None:
openmp = theano.config.openmp openmp = theano.config.openmp
...@@ -343,9 +356,9 @@ class ConvOp(Op): ...@@ -343,9 +356,9 @@ class ConvOp(Op):
if imshp is not None: if imshp is not None:
imshp = tuple(imshp) imshp = tuple(imshp)
if len(imshp)==2: if len(imshp) == 2:
imshp = (1,)+imshp imshp = (1,) + imshp
elif len(imshp)==3: elif len(imshp) == 3:
imshp = imshp imshp = imshp
else: else:
raise Exception("bad len for imshp") raise Exception("bad len for imshp")
...@@ -356,73 +369,83 @@ class ConvOp(Op): ...@@ -356,73 +369,83 @@ class ConvOp(Op):
self.kshp = kshp self.kshp = kshp
self.nkern = nkern self.nkern = nkern
self.bsize=bsize self.bsize = bsize
self.dx=dx self.dx = dx
self.dy=dy self.dy = dy
self.verbose=verbose self.verbose = verbose
self.version=version self.version = version
if openmp is None: if openmp is None:
openmp = config.openmp openmp = config.openmp
self.openmp = openmp self.openmp = openmp
# a triple # a triple
self.imshp_logical = self.imshp self.imshp_logical = self.imshp
if imshp_logical is not None: self.imshp_logical = tuple(imshp_logical) if imshp_logical is not None:
self.imshp_logical = tuple(imshp_logical)
assert (self.imshp is None and self.imshp_logical is None) or \ assert (self.imshp is None and self.imshp_logical is None) or \
(len(self.imshp) == len(self.imshp_logical)) (len(self.imshp) == len(self.imshp_logical))
# a pair # a pair
self.kshp_logical = self.kshp self.kshp_logical = self.kshp
if kshp_logical is not None: self.kshp_logical = tuple(kshp_logical) if kshp_logical is not None:
self.kshp_logical = tuple(kshp_logical)
self.kshp_logical_top_aligned = kshp_logical_top_aligned self.kshp_logical_top_aligned = kshp_logical_top_aligned
self.unroll_batch=unroll_batch self.unroll_batch = unroll_batch
self.unroll_kern=unroll_kern self.unroll_kern = unroll_kern
self.unroll_patch=unroll_patch self.unroll_patch = unroll_patch
if self.unroll_batch and not self.unroll_kern: self.unroll_kern = 1 if self.unroll_batch and not self.unroll_kern:
if self.unroll_kern and not self.unroll_batch: self.unroll_batch = 1 self.unroll_kern = 1
if self.unroll_kern and not self.unroll_batch:
self.unroll_batch = 1
#downcast unroll_batch if not a divisor of batch size #downcast unroll_batch if not a divisor of batch size
if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0: if self.unroll_batch > 0 and self.bsize % self.unroll_batch != 0:
if self.bsize<=self.unroll_batch: if self.bsize <= self.unroll_batch:
self.unroll_batch = self.bsize self.unroll_batch = self.bsize
else: else:
#find the maximum value under unroll_batch that would work #find the maximum value under unroll_batch that would work
new=self.unroll_batch new = self.unroll_batch
assert(new>=1) assert(new >= 1)
while self.bsize % new!=0: while self.bsize % new != 0:
new-=1 new -= 1
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\ warnstr = ("OPTIMISATION WARNING: in ConvOp.__init__() "
"must be 0 or a divisor of bsize(%i). We revert it to %i. This"\ "unroll_batch(%i) must be 0 or a divisor of"
" won't change the result, but may make it slower." " bsize(%i). We revert it to %i. This"
" won't change the result, but may make it slower.")
_logger.warn(warnstr, self.unroll_batch, self.bsize, new) _logger.warn(warnstr, self.unroll_batch, self.bsize, new)
self.unroll_batch=new self.unroll_batch = new
#downcast unroll_kern if not a divisor of nb of kernel #downcast unroll_kern if not a divisor of nb of kernel
if self.unroll_kern>0 and self.nkern % self.unroll_kern!=0: if self.unroll_kern > 0 and self.nkern % self.unroll_kern != 0:
if self.nkern<=self.unroll_kern: if self.nkern <= self.unroll_kern:
self.unroll_kern = self.nkern self.unroll_kern = self.nkern
else: else:
#find the maximum value under unroll_kern that would work #find the maximum value under unroll_kern that would work
new=self.unroll_kern new = self.unroll_kern
assert(new>=1) assert(new >= 1)
while self.nkern % new!=0: while self.nkern % new != 0:
new-=1 new -= 1
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%i)"\ warnstr = ("OPTIMISATION WARNING: in ConvOp.__init__()"
"should be 0 or a divisor of nkern(%i). We revert it to %i."\ " unroll_kern(%i) should be 0 or a divisor of"
"This won't change the result, but may make it slower." " nkern(%i). We revert it to %i. This"
" won't change the result, but may make it slower.")
_logger.warn(warnstr, self.unroll_kern, self.nkern, new) _logger.warn(warnstr, self.unroll_kern, self.nkern, new)
self.unroll_kern=new self.unroll_kern = new
if all_shape: if all_shape:
self.outshp = ConvOp.getOutputShape(self.imshp_logical[1:], self.kshp_logical, (dx,dy), output_mode) self.outshp = ConvOp.getOutputShape(self.imshp_logical[1:],
self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical[1:], self.kshp_logical, (1,1), output_mode) self.kshp_logical, (dx, dy),
output_mode)
self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical[1:],
self.kshp_logical, (1, 1),
output_mode)
else: else:
self.outshp = None self.outshp = None
self.fulloutshp = None self.fulloutshp = None
...@@ -430,52 +453,60 @@ class ConvOp(Op): ...@@ -430,52 +453,60 @@ class ConvOp(Op):
self.out_mode = output_mode self.out_mode = output_mode
if not self.out_mode in ["valid", "full"]: if not self.out_mode in ["valid", "full"]:
raise Exception("Mode %s not implemented"%self.out_mode) raise Exception("Mode %s not implemented" % self.out_mode)
if all_shape and not (self.outshp > 0).all(): if all_shape and not (self.outshp > 0).all():
raise Exception(("Bad size for the output shape. Verify that [post-"\ raise Exception("Bad size for the output shape. Verify that [post-"
"supersampling] input shape (%s) and kern shape(%s) are ok. "\ "supersampling] input shape (%s) and kern"
"(Hint: kerns must fit inside image in valid mode)")% " shape(%s) are ok. (Hint: kerns must fit inside"
(self.imshp_logical,self.kshp_logical)) " image in valid mode)" %
(self.imshp_logical, self.kshp_logical))
if (self.unroll_kern is None and
self.unroll_batch is None and
self.unroll_patch is None):
if self.unroll_kern is None and self.unroll_batch is None and self.unroll_patch is None:
#no version specified. Find the faster we have #no version specified. Find the faster we have
if self.bsize is None and self.nkern is None: if self.bsize is None and self.nkern is None:
self.unroll_patch = True self.unroll_patch = True
elif self.bsize is not None and self.nkern is not None: elif self.bsize is not None and self.nkern is not None:
bsize=self.bsize bsize = self.bsize
nkern=self.nkern nkern = self.nkern
if bsize is None: if bsize is None:
bsize=1 bsize = 1
if nkern is None: if nkern is None:
nkern=1 nkern = 1
mode_idx=0 mode_idx = 0
if self.out_mode!="valid": if self.out_mode != "valid":
mode_idx=1 mode_idx = 1
if all_shape: if all_shape:
time_unroll_patch = self.speed_unroll_patch_shape[mode_idx] time_unroll_patch = self.speed_unroll_patch_shape[mode_idx]
else: else:
time_unroll_patch = self.speed_unroll_patch_noshape[mode_idx] time_unroll_patch = self.speed_unroll_patch_noshape[
mode_idx]
time_unroll_batch_kern = 9999999 time_unroll_batch_kern = 9999999
for i in xrange(len(self.speed_unroll_batch_kern)): for i in xrange(len(self.speed_unroll_batch_kern)):
if bsize%self.speed_unroll_batch_kern[i][0]==0 and nkern%self.speed_unroll_batch_kern[i][1]==0: if (bsize % self.speed_unroll_batch_kern[i][0] == 0 and
if self.speed_unroll_batch_kern[i][2+mode_idx]<time_unroll_batch_kern: nkern % self.speed_unroll_batch_kern[i][1] == 0):
time_unroll_batch_kern=self.speed_unroll_batch_kern[i][2+mode_idx] if self.speed_unroll_batch_kern[i][2 + mode_idx] < time_unroll_batch_kern:
time_unroll_batch_kern_idx=i time_unroll_batch_kern = self.speed_unroll_batch_kern[i][2 + mode_idx]
time_unroll_batch_kern_idx = i
if time_unroll_patch < time_unroll_batch_kern: if time_unroll_patch < time_unroll_batch_kern:
self.unroll_patch = True self.unroll_patch = True
else: else:
self.unroll_batch=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][0] self.unroll_batch = self.speed_unroll_batch_kern[
self.unroll_kern=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][1] time_unroll_batch_kern_idx][0]
self.unroll_kern = self.speed_unroll_batch_kern[
time_unroll_batch_kern_idx][1]
self.unroll_patch = False self.unroll_patch = False
_logger.debug("AUTO FIND VERSION OF C_CODE OF CONV OP " _logger.debug("AUTO FIND VERSION OF C_CODE OF CONV OP "
"%s %s %s %s %s %s %s", "%s %s %s %s %s %s %s",
self.unroll_batch, self.unroll_kern, self.unroll_patch, self.unroll_batch, self.unroll_kern,
self.unroll_patch,
self.bsize, self.nkern, time_unroll_patch, self.bsize, self.nkern, time_unroll_patch,
time_unroll_batch_kern) time_unroll_batch_kern)
self._rehash() self._rehash()
if config.op.set_flops: if config.op.set_flops:
self.set_flops() self.set_flops()
...@@ -504,41 +535,46 @@ class ConvOp(Op): ...@@ -504,41 +535,46 @@ class ConvOp(Op):
return self.__hashval return self.__hashval
def __str__(self): def __str__(self):
return "ConvOp{" +",".join(str((a, getattr(self, a))) for a in self.__attrnames) + "}" return "ConvOp{" + ",".join(str((a, getattr(self, a)))
for a in self.__attrnames) + "}"
def set_flops(self): def set_flops(self):
""" Useful with the hack in profilemode to print the MFlops""" """ Useful with the hack in profilemode to print the MFlops"""
if self.out_mode=="valid": if self.out_mode == "valid":
self.flops=self.kshp[0]*self.kshp[1]*2#nb mul and add by output pixed # nb mul and add by output pixed
self.flops*=self.outshp[0]*self.outshp[1]#nb flops by output image self.flops = self.kshp[0] * self.kshp[1] * 2
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0] #nb flops by output image
else: #full mode not implemented self.flops *= self.outshp[0] * self.outshp[1]
# for all outputs images#n_stack==self.imshp[0]
self.flops=0 self.flops *= self.imshp[0] * self.nkern * self.bsize
for out_row in xrange(self.outshp[0]):#loop over output row else: # full mode not implemented
for out_col in xrange(self.outshp[0]):#loop over output col
for row in xrange(self.kshp[0]):#loop over kern row self.flops = 0
for out_row in xrange(self.outshp[0]): # loop over output row
if (row+out_row-self.kshp[0]+1<0 or for out_col in xrange(self.outshp[0]): # loop over output col
row+out_row-self.kshp[0]+1>=self.imshp[1]): for row in xrange(self.kshp[0]): # loop over kern row
if (row + out_row - self.kshp[0] + 1 < 0 or
row + out_row - self.kshp[0] + 1 >= self.imshp[1]):
continue continue
col=0 col = 0
max_col=self.kshp[1] max_col = self.kshp[1]
img_col=out_col-self.kshp[1]+1 img_col = out_col - self.kshp[1] + 1
max_col=min(max_col,self.imshp[2]-img_col) max_col = min(max_col, self.imshp[2] - img_col)
if img_col<0:
col=-img_col
img_col+=col
while col < max_col: #loop over kern col
self.flops+=2
col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0] if img_col < 0:
col = -img_col
img_col += col
while col < max_col: # loop over kern col
self.flops += 2
col += 1
# for all outputs images#n_stack==self.imshp[0]
self.flops *= self.imshp[0] * self.nkern * self.bsize
assert self.flops == self.bsize * self.nkern * self.imshp[0] * \ assert self.flops == self.bsize * self.nkern * self.imshp[0] * \
self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2 self.kshp[0] * self.kshp[1] * \
self.imshp[1] * self.imshp[2] * 2
def make_node(self, inputs, kerns): def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09) # TODO: find a way to make ConvOp work for N-D (after NIPS09)
...@@ -551,19 +587,23 @@ class ConvOp(Op): ...@@ -551,19 +587,23 @@ class ConvOp(Op):
_kerns = as_tensor_variable(kerns) _kerns = as_tensor_variable(kerns)
# TODO: lift this restriction by upcasting either inputs or kerns # TODO: lift this restriction by upcasting either inputs or kerns
if _inputs.ndim != 4: if _inputs.ndim != 4:
raise TypeError('ConvOp (make_node) requires input be a 4D tensor; received "%s" (%i dims)' % (inputs, _inputs.ndim)) raise TypeError('ConvOp (make_node) requires input be a 4D tensor;'
' received "%s" (%i dims)' %
(inputs, _inputs.ndim))
if _kerns.ndim != 4: if _kerns.ndim != 4:
raise TypeError('make_node requires 4D tensor of kernels') raise TypeError('make_node requires 4D tensor of kernels')
if _inputs.type.dtype != _kerns.type.dtype: if _inputs.type.dtype != _kerns.type.dtype:
raise NotImplementedError("The image and the kernel must have the same type." raise NotImplementedError(
"inputs(%s), kerns(%s)"%(_inputs.dtype, _kerns.dtype)) "The image and the kernel must have the same type."
"inputs(%s), kerns(%s)" % (_inputs.dtype, _kerns.dtype))
if self.outshp is not None: if self.outshp is not None:
bcastable23 = [self.outshp[0]==1, self.outshp[1]==1] bcastable23 = [self.outshp[0] == 1, self.outshp[1] == 1]
else: else:
bcastable23 = [False, False] bcastable23 = [False, False]
output = theano.tensor.tensor(dtype=_inputs.type.dtype, output = theano.tensor.tensor(dtype=_inputs.type.dtype,
broadcastable=[_inputs.broadcastable[0], broadcastable=[_inputs.broadcastable[0],
_kerns.broadcastable[0]]+bcastable23); _kerns.broadcastable[0]] +
bcastable23)
return Apply(self, [_inputs, _kerns], [output]) return Apply(self, [_inputs, _kerns], [output])
...@@ -582,10 +622,12 @@ class ConvOp(Op): ...@@ -582,10 +622,12 @@ class ConvOp(Op):
if self.kshp_logical: if self.kshp_logical:
kshp = self.kshp_logical kshp = self.kshp_logical
try: try:
fmshp = ConvOp.getOutputShape(imshp[1:], kshp, (self.dx,self.dy), self.out_mode) fmshp = ConvOp.getOutputShape(imshp[1:],
kshp, (self.dx, self.dy),
self.out_mode)
except TypeError: except TypeError:
raise theano.tensor.ShapeError() raise theano.tensor.ShapeError()
outshp = (batch_size,fmo) + tuple(fmshp) outshp = (batch_size, fmo) + tuple(fmshp)
return [outshp] return [outshp]
else: else:
# Haven't implemented this case. imshp and kshp may be symbollic # Haven't implemented this case. imshp and kshp may be symbollic
...@@ -593,8 +635,7 @@ class ConvOp(Op): ...@@ -593,8 +635,7 @@ class ConvOp(Op):
# we simply let the default function do its work. # we simply let the default function do its work.
raise theano.tensor.ShapeError() raise theano.tensor.ShapeError()
def perform(self, node, inp, out):
def perform(self,node, inp, out):
""" """
By default if len(img2d.shape)==3, we By default if len(img2d.shape)==3, we
""" """
...@@ -603,9 +644,12 @@ class ConvOp(Op): ...@@ -603,9 +644,12 @@ class ConvOp(Op):
if not imported_scipy_signal: if not imported_scipy_signal:
raise theano.gof.utils.MethodNotDefined( raise theano.gof.utils.MethodNotDefined(
"c_headers", type(self), self.__class__.__name__, "c_headers", type(self), self.__class__.__name__,
"Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.") "Need the python package for scipy.signal to be installed "
"for the python implementation. You can use the C"
" implementation instead.")
# TODO: move these back out to global scope when they no longer cause an atexit error # TODO: move these back out to global scope when they no longer
# cause an atexit error
imshp = self.imshp imshp = self.imshp
if imshp is None or any([x is None for x in imshp]): if imshp is None or any([x is None for x in imshp]):
imshp = tuple(img2d.shape[1:]) imshp = tuple(img2d.shape[1:])
...@@ -634,39 +678,43 @@ class ConvOp(Op): ...@@ -634,39 +678,43 @@ class ConvOp(Op):
if self.fulloutshp is not None: if self.fulloutshp is not None:
fulloutshp = tuple(self.fulloutshp) fulloutshp = tuple(self.fulloutshp)
else: else:
fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical[1:], kshp_logical, (1,1), self.out_mode)) fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical[
1:], kshp_logical, (1, 1), self.out_mode))
if z[0] is None or z[0].shape!=(bsize,)+(nkern,)+fulloutshp: if z[0] is None or z[0].shape != (bsize,) + (nkern,) + fulloutshp:
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp, z[0] = numpy.zeros((bsize,) + (nkern,) + fulloutshp,
dtype=img2d.dtype) dtype=img2d.dtype)
zz=z[0] zz = z[0]
stacklen = imshp[0] stacklen = imshp[0]
img2d = img2d.reshape((bsize,)+ imshp) img2d = img2d.reshape((bsize,) + imshp)
filtersflipped = filtersflipped.reshape((nkern,stacklen)+kshp) filtersflipped = filtersflipped.reshape((nkern, stacklen) + kshp)
if self.imshp != self.imshp_logical: if self.imshp != self.imshp_logical:
# assuming that to get from imshp to imshp logical we insert zeros in missing spots # assuming that to get from imshp to imshp logical we insert zeros in missing spots
rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1]))) rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2]))) cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2])))
buf = numpy.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype) buf = numpy.zeros((bsize,) + imshp_logical, dtype=img2d.dtype)
buf[:,:,::rstride, ::cstride] = img2d buf[:, :, ::rstride, ::cstride] = img2d
img2d = buf img2d = buf
del buf, rstride, cstride del buf, rstride, cstride
if kshp != kshp_logical: if kshp != kshp_logical:
rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0]))) rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1]))) cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1])))
buf = numpy.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype) buf = numpy.zeros((nkern, stacklen) +
self.kshp_logical, dtype=filtersflipped.dtype)
if self.kshp_logical_top_aligned: if self.kshp_logical_top_aligned:
roffset=coffset=0 roffset = coffset = 0
else: else:
roffset=(kshp_logical[0] - (kshp[0]*rstride) - 1+rstride) % rstride roffset = (kshp_logical[0] - (kshp[0] *
coffset=(kshp_logical[1] - (kshp[1]*cstride) - 1+cstride) % cstride rstride) - 1 + rstride) % rstride
coffset = (kshp_logical[1] - (kshp[1] *
cstride) - 1 + cstride) % cstride
assert roffset >= 0 assert roffset >= 0
assert coffset >= 0 assert coffset >= 0
buf[:,:,roffset::rstride, coffset::cstride] = filtersflipped buf[:, :, roffset::rstride, coffset::cstride] = filtersflipped
filtersflipped = buf filtersflipped = buf
del buf, rstride, cstride del buf, rstride, cstride
...@@ -675,39 +723,39 @@ class ConvOp(Op): ...@@ -675,39 +723,39 @@ class ConvOp(Op):
for b in xrange(bsize): for b in xrange(bsize):
for n in xrange(nkern): for n in xrange(nkern):
zz[b,n,...].fill(0) zz[b, n, ...].fill(0)
for im0 in xrange(stacklen): for im0 in xrange(stacklen):
zz[b,n,...] += _convolve2d(\ zz[b, n, ...] += _convolve2d(img2d[b, im0, ...],
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0) filtersflipped[n, im0, ...],
1, val, bval, 0)
if False: if False:
if False and self.out_mode=="full": if False and self.out_mode == "full":
img2d2 = numpy.zeros((bsize,stacklen, img2d2 = numpy.zeros((bsize, stacklen,
imshp[1]+2*kshp[0]-2, imshp[1] + 2 * kshp[0] - 2,
imshp[2]+2*kshp[1]-2)) imshp[2] + 2 * kshp[1] - 2))
img2d2[:,:,kshp[0]-1:kshp[0]-1+imshp[1], img2d2[:, :, kshp[0] - 1:kshp[0] - 1 + imshp[1],
kshp[1]-1:kshp[1]-1+imshp[2]] = img2d kshp[1] - 1:kshp[1] - 1 + imshp[2]] = img2d
img2d = img2d2 img2d = img2d2
#N_image_shape = image_data.shape #N_image_shape = image_data.shape
for b in xrange(bsize): for b in xrange(bsize):
for n in xrange(nkern): for n in xrange(nkern):
zz[b,n,...].fill(0) zz[b, n, ...].fill(0)
for im0 in xrange(stacklen): for im0 in xrange(stacklen):
for row in xrange(0,zz.shape[2],self.dx): for row in xrange(0, zz.shape[2], self.dx):
for col in xrange(0,zz.shape[3],self.dy): for col in xrange(0, zz.shape[3], self.dy):
zz[b,n,row,col] += (img2d[b,im0,row:row+kshp[0],col:col+kshp[1]]*\ zz[b, n, row, col] += (img2d[b, im0, row:row + kshp[0], col:col + kshp[1]] *
filtersflipped[n,im0,::-1,::-1]).sum() filtersflipped[n, im0, ::-1, ::-1]).sum()
#We copy it to remove the Stride mismatch warning from DEBUG_MODE. #We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version. #The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we #The copy don't affect the performence during our experience as in that case we
#execute the c version which is much faster. #execute the c version which is much faster.
if self.dx>1 or self.dy>1: if self.dx > 1 or self.dy > 1:
zz = zz[:,:,0::self.dx,0::self.dy].copy() zz = zz[:, :, 0::self.dx, 0::self.dy].copy()
z[0]=zz
z[0] = zz
def grad(self, inp, grads): def grad(self, inp, grads):
inputs, kerns = inp inputs, kerns = inp
...@@ -726,32 +774,36 @@ class ConvOp(Op): ...@@ -726,32 +774,36 @@ class ConvOp(Op):
tmp_node = theano.tensor.nnet.conv3D( tmp_node = theano.tensor.nnet.conv3D(
V=inputs.dimshuffle(0, 2, 3, 'x', 1), V=inputs.dimshuffle(0, 2, 3, 'x', 1),
W=kerns[:, :, ::-1, ::-1].dimshuffle(0, 2, 3, 'x', 1), W=kerns[:, :, ::-1, ::-1].dimshuffle(0, 2, 3, 'x', 1),
b=theano.tensor.alloc(numpy.asarray(0, dtype=kerns.dtype), kerns.shape[0]), b=theano.tensor.alloc(numpy.asarray(0, dtype=kerns.dtype),
kerns.shape[0]),
d=(self.dx, self.dy, 1)) d=(self.dx, self.dy, 1))
node = theano.tensor.addbroadcast(tmp_node, 3).dimshuffle(0, 4, 1, 2) node = theano.tensor.addbroadcast(
tmp_node, 3).dimshuffle(0, 4, 1, 2)
# mimic what happens inside theano.grad: get the input gradient # mimic what happens inside theano.grad: get the input gradient
# of the final cost wrt all variables involved. # of the final cost wrt all variables involved.
tmp_gmap = theano.gradient.grad_sources_inputs([(node, gz)], [inputs, kerns]) tmp_gmap = theano.gradient.grad_sources_inputs(
[(node, gz)], [inputs, kerns])
return [tmp_gmap[inputs], tmp_gmap[kerns]] return [tmp_gmap[inputs], tmp_gmap[kerns]]
if self.dx not in (1, 2) or self.dy not in (1, 2): if self.dx not in (1, 2) or self.dy not in (1, 2):
raise NotImplementedError("ERROR: We disable ConvOp.grad now when dx or "\ raise NotImplementedError(
"ERROR: We disable ConvOp.grad now when dx or "
"dy are different from 1 and 2, as there is a bug in it.") "dy are different from 1 and 2, as there is a bug in it.")
all_shape = self.imshp is not None and self.kshp is not None and \ all_shape = (self.imshp is not None and self.kshp is not None and
self.nkern is not None and self.bsize is not None self.nkern is not None and self.bsize is not None)
if not all_shape and (self.dx!=1 or self.dy!=1): if not all_shape and (self.dx != 1 or self.dy != 1):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\ raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "
"the optional shape information") "the optional shape information")
####### Determine gradient on kernels ######## ####### Determine gradient on kernels ########
assert inputs.ndim==4 and kerns.ndim==4 assert inputs.ndim == 4 and kerns.ndim == 4
newin = inputs.dimshuffle((1,0,2,3)) newin = inputs.dimshuffle((1, 0, 2, 3))
newgz = gz.dimshuffle((1,0,2,3)) newgz = gz.dimshuffle((1, 0, 2, 3))
(bsize, nkern) = None, None (bsize, nkern) = None, None
imshp = None imshp = None
...@@ -762,7 +814,7 @@ class ConvOp(Op): ...@@ -762,7 +814,7 @@ class ConvOp(Op):
if self.out_mode == 'valid': if self.out_mode == 'valid':
(img, filters) = (newin, newgz) (img, filters) = (newin, newgz)
kshp_logical = self.fulloutshp kshp_logical = self.fulloutshp
kshp_logical_top_aligned=False kshp_logical_top_aligned = False
if all_shape: if all_shape:
(bsize, nkern) = (self.imshp[0], self.nkern) (bsize, nkern) = (self.imshp[0], self.nkern)
imshp = (self.bsize, self.imshp[1], self.imshp[2]) imshp = (self.bsize, self.imshp[1], self.imshp[2])
...@@ -772,38 +824,45 @@ class ConvOp(Op): ...@@ -772,38 +824,45 @@ class ConvOp(Op):
elif self.out_mode == 'full': elif self.out_mode == 'full':
(img, filters) = (newgz, newin) (img, filters) = (newgz, newin)
kshp_logical = None kshp_logical = None
kshp_logical_top_aligned=True kshp_logical_top_aligned = True
if all_shape: if all_shape:
imshp_logical = (self.bsize, self.fulloutshp[0], self.fulloutshp[1]) imshp_logical = (self.bsize,
self.fulloutshp[0],
self.fulloutshp[1])
(bsize, nkern) = (self.nkern, self.imshp[0]) (bsize, nkern) = (self.nkern, self.imshp[0])
imshp = (self.bsize, self.outshp[0], self.outshp[1]) imshp = (self.bsize, self.outshp[0], self.outshp[1])
kshp = self.imshp[1:] kshp = self.imshp[1:]
un_b = self.unroll_kern un_b = self.unroll_kern
un_k = self.unroll_batch un_k = self.unroll_batch
else: else:
raise NotImplementedError('Only [full,valid] modes are currently supported.') raise NotImplementedError(
'Only [full,valid] modes are currently supported.')
filters = filters[:,:,::-1,::-1] #flip them filters = filters[:, :, ::-1, ::-1] # flip them
if 0: #find good value for the unroll if 0: # find good value for the unroll
if all_shape and un_b!=0 and bsize%un_b!=0: if all_shape and un_b != 0 and bsize % un_b != 0:
if bsize<un_b: if bsize < un_b:
un_b = bsize un_b = bsize
else: else:
un_b = 1 un_b = 1
_logger.warn("Optimization Warning: in ConvOp.grad() we can't determine "\ _logger.warn(
"a good unroll value for the batch. Maybe you can optimize this!") "Optimization Warning: in ConvOp.grad() we can't "
" determine a good unroll value for the batch."
" Maybe you can optimize this!")
if all_shape and un_k!=0 and nkern%un_k!=0: if all_shape and un_k != 0 and nkern % un_k != 0:
if nkern<un_k: if nkern < un_k:
un_k = nkern un_k = nkern
else: else:
un_k = 1 un_k = 1
_logger.warn("Optimization Warning: in ConvOp.grad() we can't determine "\ _logger.warn(
"a good unroll value for the kernel. Maybe you can optimize this!") "Optimization Warning: in ConvOp.grad() we can't"
" determine a good unroll value for the kernel. Maybe"
" you can optimize this!")
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid', dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
kshp_logical=kshp_logical, kshp_logical=kshp_logical,
...@@ -812,7 +871,7 @@ class ConvOp(Op): ...@@ -812,7 +871,7 @@ class ConvOp(Op):
verbose=self.verbose) verbose=self.verbose)
else: # let __init__ choose c params be chosen automatically from shapes else: # let __init__ choose c params be chosen automatically from shapes
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid', dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
unroll_batch=None, unroll_kern=None, unroll_patch=None, unroll_batch=None, unroll_kern=None, unroll_patch=None,
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
kshp_logical=kshp_logical, kshp_logical=kshp_logical,
...@@ -820,26 +879,25 @@ class ConvOp(Op): ...@@ -820,26 +879,25 @@ class ConvOp(Op):
version=self.version, version=self.version,
verbose=self.verbose) verbose=self.verbose)
if hasattr(self, 'flops'):
if hasattr(self,'flops'):
dw.set_flops() dw.set_flops()
dw = dw(img,filters) dw = dw(img, filters)
if all_shape: if all_shape:
assert (dw.owner.op.outshp==self.kshp).all() assert (dw.owner.op.outshp == self.kshp).all()
if self.out_mode == 'valid': if self.out_mode == 'valid':
# before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1] # before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1]
dw = dw.dimshuffle((1,0,2,3)) dw = dw.dimshuffle((1, 0, 2, 3))
dw = dw[:,:,::-1,::-1] dw = dw[:, :, ::-1, ::-1]
####### Determine gradient on inputs ######## ####### Determine gradient on inputs ########
mode = 'valid' mode = 'valid'
if not self.out_mode == 'full': if not self.out_mode == 'full':
mode = 'full' mode = 'full'
filters = kerns.dimshuffle((1,0,2,3)) filters = kerns.dimshuffle((1, 0, 2, 3))
filters = filters[:,:,::-1,::-1] filters = filters[:, :, ::-1, ::-1]
nkern = None nkern = None
imshp = None imshp = None
imshp_logical = None imshp_logical = None
...@@ -848,33 +906,36 @@ class ConvOp(Op): ...@@ -848,33 +906,36 @@ class ConvOp(Op):
if all_shape: if all_shape:
nkern = self.imshp[0] nkern = self.imshp[0]
imshp = (self.nkern, self.outshp[0], self.outshp[1]) imshp = (self.nkern, self.outshp[0], self.outshp[1])
imshp_logical=(self.nkern, self.fulloutshp[0], self.fulloutshp[1]) imshp_logical = (self.nkern, self.fulloutshp[0],
self.fulloutshp[1])
if 0: # hard-code c generation parameters if 0: # hard-code c generation parameters
din = ConvOp(imshp, self.kshp, nkern, self.bsize, din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode, 1, 1, output_mode=mode,
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p, unroll_batch=un_b, unroll_kern=un_k,
unroll_patch=un_p,
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
kshp_logical=None, kshp_logical=None,
version=-1,#we we change the mode, we don't forward the version. version=-1, # we we change the mode, we don't forward the version.
verbose=self.verbose) verbose=self.verbose)
else: # let __init__ figure out the unrolling / patch sizes else: # let __init__ figure out the unrolling / patch sizes
din = ConvOp(imshp, self.kshp, nkern, self.bsize, din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode, 1, 1, output_mode=mode,
unroll_batch=None, unroll_kern=None, unroll_patch=None, unroll_batch=None, unroll_kern=None,
unroll_patch=None,
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
kshp_logical=None, kshp_logical=None,
version=-1,#we we change the mode, we don't forward the version. version=-1, # we we change the mode, we don't forward the version.
verbose=self.verbose) verbose=self.verbose)
if hasattr(self,'flops'): if hasattr(self, 'flops'):
din.set_flops() din.set_flops()
din = din(gz,filters) din = din(gz, filters)
assert (din.owner.op.outshp is None and self.imshp is None) or \ assert (din.owner.op.outshp is None and self.imshp is None) or \
(din.owner.op.outshp is None) or \ (din.owner.op.outshp is None) or \
(din.owner.op.outshp==self.imshp[1:]).all() (din.owner.op.outshp == self.imshp[1:]).all()
# din and dw should have the same broadcasting pattern as the # din and dw should have the same broadcasting pattern as the
# parameters they are the gradient of (resp. inputs and kerns). # parameters they are the gradient of (resp. inputs and kerns).
...@@ -902,10 +963,14 @@ using namespace std; ...@@ -902,10 +963,14 @@ using namespace std;
""" Return True if we will generate code that use gemm. """ Return True if we will generate code that use gemm.
""" """
#the gemm version only support that case #the gemm version only support that case
if self.out_mode == 'valid' and self.dx==0 and self.dy==0: if self.out_mode == 'valid' and self.dx == 0 and self.dy == 0:
#We use a faster version in those case. #We use a faster version in those case.
if (self.imshp != self.imshp_logical or self.kshp != self.kshp_logical if (self.imshp != self.imshp_logical or
or self.unroll_patch or self.unroll_batch>0 or self.unroll_kern>0): self.kshp != self.kshp_logical or
self.unroll_patch or
self.unroll_batch > 0 or
self.unroll_kern > 0):
return False return False
return True return True
return False return False
...@@ -918,7 +983,9 @@ using namespace std; ...@@ -918,7 +983,9 @@ using namespace std;
def c_no_compile_args(self): def c_no_compile_args(self):
#when the ksph==(1,1) gcc 4.3.0 segfault during the #when the ksph==(1,1) gcc 4.3.0 segfault during the
#compilation with -O3. This don't happen at -O2 #compilation with -O3. This don't happen at -O2
if theano.gof.cmodule.gcc_version() in ['4.3.0'] and self.kshp==(1, 1): if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
self.kshp == (1, 1)):
return ['-O3'] return ['-O3']
else: else:
return [] return []
...@@ -928,7 +995,8 @@ using namespace std; ...@@ -928,7 +995,8 @@ using namespace std;
if self.use_blas(): if self.use_blas():
ret = blas.ldflags(libs=False, flags=True) ret = blas.ldflags(libs=False, flags=True)
if theano.gof.cmodule.gcc_version() in ['4.3.0'] and self.kshp==(1, 1): if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
self.kshp == (1, 1)):
ret += ['-O2'] ret += ['-O2']
if self.openmp: if self.openmp:
ret += ['-fopenmp'] ret += ['-fopenmp']
...@@ -951,50 +1019,59 @@ using namespace std; ...@@ -951,50 +1019,59 @@ using namespace std;
if node.inputs[0].type.dtype != node.inputs[1].type.dtype: if node.inputs[0].type.dtype != node.inputs[1].type.dtype:
raise NotImplementedError() raise NotImplementedError()
assert node.inputs[0].type.dtype == node.inputs[1].type.dtype assert node.inputs[0].type.dtype == node.inputs[1].type.dtype
d=locals() d = locals()
d.update(sub) d.update(sub)
all_shape = self.imshp is not None and self.kshp is not None and \ all_shape = (self.imshp is not None and self.kshp is not None and
self.nkern is not None and self.bsize is not None self.nkern is not None and self.bsize is not None)
d["self_out_mode"]=self.out_mode d["self_out_mode"] = self.out_mode
d["self_dx"]=self.dx d["self_dx"] = self.dx
d["self_dy"]=self.dy d["self_dy"] = self.dy
d["mode"]=self.out_mode.upper() d["mode"] = self.out_mode.upper()
d["affectation"]="=" d["affectation"] = "="
if all_shape: if all_shape:
d["self_bsize"]=self.bsize d["self_bsize"] = self.bsize
d["self_nkern"]=self.nkern d["self_nkern"] = self.nkern
d["self_outshp0"]=self.outshp[0] d["self_outshp0"] = self.outshp[0]
d["self_outshp1"]=self.outshp[1] d["self_outshp1"] = self.outshp[1]
d["self_imshp0"]=self.imshp[0] d["self_imshp0"] = self.imshp[0]
d["self_imshp1"]=self.imshp[1] d["self_imshp1"] = self.imshp[1]
d["self_imshp2"]=self.imshp[2] d["self_imshp2"] = self.imshp[2]
d["self_kshp0"]=self.kshp[0] d["self_kshp0"] = self.kshp[0]
d["self_kshp1"]=self.kshp[1] d["self_kshp1"] = self.kshp[1]
d["self_kshp_logical_r"] = self.kshp_logical[0] d["self_kshp_logical_r"] = self.kshp_logical[0]
d["self_kshp_logical_c"] = self.kshp_logical[1] d["self_kshp_logical_c"] = self.kshp_logical[1]
d["self_kshp_logical_stride_r"] = int(numpy.ceil(self.kshp_logical[0] / float(self.kshp[0]))) d["self_kshp_logical_stride_r"] = int(numpy.ceil(
d["self_kshp_logical_stride_c"] = int(numpy.ceil(self.kshp_logical[1] / float(self.kshp[1]))) self.kshp_logical[0] / float(self.kshp[0])))
d["self_imshp_logical_r"] = self.imshp_logical[1] #numpy.B. 1 not 0 d["self_kshp_logical_stride_c"] = int(numpy.ceil(
d["self_imshp_logical_c"] = self.imshp_logical[2]#numpy.B. 2 not 1 self.kshp_logical[1] / float(self.kshp[1])))
d["self_imshp_logical_stride_r"] = int(numpy.ceil(self.imshp_logical[1] / float(self.imshp[1]))) d["self_imshp_logical_r"] = self.imshp_logical[1]
d["self_imshp_logical_stride_c"] = int(numpy.ceil(self.imshp_logical[2] / float(self.imshp[2]))) #numpy.B. 1 not 0
if not self.imshp[0]==1: d["affectation"]="+=" d["self_imshp_logical_c"] = self.imshp_logical[2]
d["all_shape"]="1" # numpy.B. 2 not 1
d["dim_zz_const"]="const" d["self_imshp_logical_stride_r"] = int(numpy.ceil(
d["dim_zz_affect"]="" self.imshp_logical[1] / float(self.imshp[1])))
d["assert_size"]=""" d["self_imshp_logical_stride_c"] = int(numpy.ceil(
self.imshp_logical[2] / float(self.imshp[2])))
if not self.imshp[0] == 1:
d["affectation"] = "+="
d["all_shape"] = "1"
d["dim_zz_const"] = "const"
d["dim_zz_affect"] = ""
d["assert_size"] = """
// Check the batch size and the number of kernels (sometimes constant in the graph) // Check the batch size and the number of kernels (sometimes constant in the graph)
if(img2d_dim[0] != %(self_bsize)s!=0){ if(img2d_dim[0] != %(self_bsize)s!=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the batch size in the image (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the batch size in the image (%%ld) at run time is different"
" than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[0], (long)%(self_bsize)s); (long)img2d_dim[0], (long)%(self_bsize)s);
%(fail)s; %(fail)s;
} }
if(kerns_dim[0] != %(self_nkern)s!=0){ if(kerns_dim[0] != %(self_nkern)s!=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the number of kernels in the filter (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the number of kernels in the filter (%%ld) at run time is"
" different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[0], (long)%(self_nkern)s); (long)kerns_dim[0], (long)%(self_nkern)s);
%(fail)s; %(fail)s;
} }
...@@ -1002,19 +1079,22 @@ if(kerns_dim[0] != %(self_nkern)s!=0){ ...@@ -1002,19 +1079,22 @@ if(kerns_dim[0] != %(self_nkern)s!=0){
// Check the size of the image (sometimes constant in the graph) // Check the size of the image (sometimes constant in the graph)
if(img2d_dim[1] != %(self_imshp0)s){ if(img2d_dim[1] != %(self_imshp0)s){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the image stack size (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the image stack size (%%ld) at run time is different than"
" at build time (%%ld) for the ConvOp.",
(long)img2d_dim[1], (long)%(self_imshp0)s); (long)img2d_dim[1], (long)%(self_imshp0)s);
%(fail)s; %(fail)s;
} }
if(img2d_dim[2] != %(self_imshp1)s){ if(img2d_dim[2] != %(self_imshp1)s){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the number of rows in the image (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the number of rows in the image (%%ld) at run time is different"
" than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[2], (long)%(self_imshp1)s); (long)img2d_dim[2], (long)%(self_imshp1)s);
%(fail)s; %(fail)s;
} }
if(img2d_dim[3] != %(self_imshp2)s){ if(img2d_dim[3] != %(self_imshp2)s){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the number of columns in the image (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the number of columns in the image (%%ld) at run time is"
" different than at build time (%%ld) for the ConvOp.",
(long)img2d_dim[3], (long)%(self_imshp2)s); (long)img2d_dim[3], (long)%(self_imshp2)s);
%(fail)s; %(fail)s;
} }
...@@ -1022,13 +1102,15 @@ if(img2d_dim[3] != %(self_imshp2)s){ ...@@ -1022,13 +1102,15 @@ if(img2d_dim[3] != %(self_imshp2)s){
// Check the size of the output (sometimes constant in the graph) // Check the size of the output (sometimes constant in the graph)
if(dim_zz[0] != %(self_outshp0)s!=0){ if(dim_zz[0] != %(self_outshp0)s!=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the precomputed number of rows in the output (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the precomputed number of rows in the output (%%ld) at run time"
" is different than at build time (%%ld) for the ConvOp.",
(long)dim_zz[0], (long)%(self_outshp0)s); (long)dim_zz[0], (long)%(self_outshp0)s);
%(fail)s; %(fail)s;
} }
if(dim_zz[1] != %(self_outshp1)s!=0){ if(dim_zz[1] != %(self_outshp1)s!=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the precomputed number of columns in the output (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the precomputed number of columns in the output (%%ld) at run"
" time is different than at build time (%%ld) for the ConvOp.",
(long)dim_zz[1], (long)%(self_outshp1)s); (long)dim_zz[1], (long)%(self_outshp1)s);
%(fail)s; %(fail)s;
} }
...@@ -1036,38 +1118,41 @@ if(dim_zz[1] != %(self_outshp1)s!=0){ ...@@ -1036,38 +1118,41 @@ if(dim_zz[1] != %(self_outshp1)s!=0){
// Check the size of the filter (sometimes constant in the graph) // Check the size of the filter (sometimes constant in the graph)
if(kerns_dim[1] %% %(self_imshp0)s!=0){ if(kerns_dim[1] %% %(self_imshp0)s!=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the filter stack size (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the filter stack size (%%ld) at run time is different than at"
" build time (%%ld) for the ConvOp.",
(long)kerns_dim[1], (long)%(self_imshp0)s); (long)kerns_dim[1], (long)%(self_imshp0)s);
%(fail)s; %(fail)s;
} }
if(kerns_dim[2] %% %(self_kshp0)s!=0){ if(kerns_dim[2] %% %(self_kshp0)s!=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the number of rows in the filter (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the number of rows in the filter (%%ld) at run time is different"
" than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[2], (long)%(self_kshp0)s); (long)kerns_dim[2], (long)%(self_kshp0)s);
%(fail)s; %(fail)s;
} }
if(kerns_dim[3] %% %(self_kshp1)s!=0){ if(kerns_dim[3] %% %(self_kshp1)s!=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"the number of columns in the filter (%%ld) at run time is different than at build time (%%ld) for the ConvOp.", "the number of columns in the filter (%%ld) at run time is"
" different than at build time (%%ld) for the ConvOp.",
(long)kerns_dim[3], (long)%(self_kshp1)s); (long)kerns_dim[3], (long)%(self_kshp1)s);
%(fail)s; %(fail)s;
} }
"""%(locals()) """ % (locals())
else: else:
d["self_bsize"]="%(img2d)s->dimensions[0]"%d d["self_bsize"] = "%(img2d)s->dimensions[0]" % d
d["self_nkern"]="%(filtersflipped)s->dimensions[0]"%d d["self_nkern"] = "%(filtersflipped)s->dimensions[0]" % d
d["self_outshp0"]="-1" d["self_outshp0"] = "-1"
d["self_outshp1"]="-1" d["self_outshp1"] = "-1"
d["self_imshp0"]="%(img2d)s->dimensions[1]"%d d["self_imshp0"] = "%(img2d)s->dimensions[1]" % d
d["self_imshp1"]="%(img2d)s->dimensions[2]"%d d["self_imshp1"] = "%(img2d)s->dimensions[2]" % d
d["self_imshp2"]="%(img2d)s->dimensions[3]"%d d["self_imshp2"] = "%(img2d)s->dimensions[3]" % d
d["self_kshp0"]="%(filtersflipped)s->dimensions[2]"%d d["self_kshp0"] = "%(filtersflipped)s->dimensions[2]" % d
d["self_kshp1"]="%(filtersflipped)s->dimensions[3]"%d d["self_kshp1"] = "%(filtersflipped)s->dimensions[3]" % d
d["affectation"]="+=" d["affectation"] = "+="
d["all_shape"]="0" d["all_shape"] = "0"
d["dim_zz_const"]="" d["dim_zz_const"] = ""
d["dim_zz_affect"]=""" d["dim_zz_affect"] = """
if (mode == FULL) { if (mode == FULL) {
dim_zz[0] = (int)ceil((dim_im[0]+dim_ker0-1)/float(%(self_dx)s)); dim_zz[0] = (int)ceil((dim_im[0]+dim_ker0-1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]+dim_ker1-1)/float(%(self_dy)s)); dim_zz[1] = (int)ceil((dim_im[1]+dim_ker1-1)/float(%(self_dy)s));
...@@ -1075,8 +1160,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){ ...@@ -1075,8 +1160,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
dim_zz[0] = (int)ceil((dim_im[0]-dim_ker0+1)/float(%(self_dx)s)); dim_zz[0] = (int)ceil((dim_im[0]-dim_ker0+1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]-dim_ker1+1)/float(%(self_dy)s)); dim_zz[1] = (int)ceil((dim_im[1]-dim_ker1+1)/float(%(self_dy)s));
} }
"""% d """ % d
d["assert_size"]="" d["assert_size"] = ""
if self.kshp_logical_top_aligned: if self.kshp_logical_top_aligned:
d["self_kshp_logical_offset_r"] = 0 d["self_kshp_logical_offset_r"] = 0
...@@ -1084,28 +1169,39 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){ ...@@ -1084,28 +1169,39 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
elif all_shape: elif all_shape:
rstride = d["self_kshp_logical_stride_r"] rstride = d["self_kshp_logical_stride_r"]
cstride = d["self_kshp_logical_stride_c"] cstride = d["self_kshp_logical_stride_c"]
d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] -
d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] - (self.kshp[1]*cstride) - 1+cstride) % cstride (self.kshp[0] * rstride) -
1 + rstride) % rstride
d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] -
(self.kshp[1] * cstride) -
1 + cstride) % cstride
del rstride, cstride del rstride, cstride
if node.inputs[0].type.dtype=="float32": d["type"]="float" if node.inputs[0].type.dtype == "float32":
elif node.inputs[0].type.dtype=="float64": d["type"]="double" d["type"] = "float"
else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype) elif node.inputs[0].type.dtype == "float64":
d["gemm"]='dgemm_' d["type"] = "double"
if not d["type"]=="double":d["gemm"]='sgemm_' else:
raise Exception("Type %s not implemented" %
node.inputs[0].type.dtype)
d["gemm"] = 'dgemm_'
if not d["type"] == "double":
d["gemm"] = 'sgemm_'
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical: if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
if self.verbose: if self.verbose:
_logger.debug("return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version") _logger.debug("return imshp!=imshp_logical or"
" self.kshp != self.kshp_logical shape version")
return _conv_op_code_a % d return _conv_op_code_a % d
if self.unroll_patch: if self.unroll_patch:
if self.verbose: if self.verbose:
_logger.debug("return unroll patch version. all_shape=%s", all_shape) _logger.debug("return unroll patch version. all_shape=%s",
return _conv_op_code_unroll_patch%d all_shape)
if self.unroll_batch>0 or self.unroll_kern>0: return _conv_op_code_unroll_patch % d
assert self.unroll_batch>0 if self.unroll_batch > 0 or self.unroll_kern > 0:
assert self.unroll_kern>0 assert self.unroll_batch > 0
assert self.unroll_kern > 0
if self.verbose: if self.verbose:
_logger.debug("return unrolled batch (%s) and kern code (%s)", _logger.debug("return unrolled batch (%s) and kern code (%s)",
str(self.unroll_batch), str(self.unroll_kern)) str(self.unroll_batch), str(self.unroll_kern))
...@@ -1113,7 +1209,7 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){ ...@@ -1113,7 +1209,7 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
self.unroll_kern) self.unroll_kern)
#TODO: should we choose the unroll size automatically with the bigger divisor under 5? #TODO: should we choose the unroll size automatically with the bigger divisor under 5?
if self.out_mode == 'valid' and self.dx==0 and self.dy==0: if self.out_mode == 'valid' and self.dx == 0 and self.dy == 0:
if self.verbose: if self.verbose:
_logger.debug("return gemm version") _logger.debug("return gemm version")
return _conv_op_code_valid_gemm % d return _conv_op_code_valid_gemm % d
...@@ -1126,7 +1222,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){ ...@@ -1126,7 +1222,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
_conv_op_code_a = """ _conv_op_code_a = """
const int mode=%(mode)s; const int mode=%(mode)s;
int typenum=0, typenum_f=0; int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL; PyArrayObject *ain1=NULL, *ain2=NULL;
PyArrayObject *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0; const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s); int type_im=PyArray_TYPE(%(img2d)s);
...@@ -1216,12 +1313,17 @@ if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s)) ...@@ -1216,12 +1313,17 @@ if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
filtersflipped_arr = (PyArrayObject*)filtersflipped; filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){ if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s; PyErr_SetString(PyExc_ValueError,
"invalid mode, only full and valid are supported");
%(fail)s;
} }
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0); typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0); typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;} if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;} if (typenum != typenum_f) {
PyErr_SetString(PyExc_ValueError, "Input types must match");
%(fail)s;
}
if (!img2d) %(fail)s; if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s; if (!filtersflipped) %(fail)s;
...@@ -1249,10 +1351,19 @@ Os[0]=%(self_outshp0)s; ...@@ -1249,10 +1351,19 @@ Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s; Os[1]=%(self_outshp1)s;
//assertions //assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[0] != %(z)s->dimensions[1] *
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s; %(z)s->dimensions[2] *
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s; %(z)s->dimensions[3] *
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s; (npy_intp)sizeof(%(type)s))
%(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] *
%(z)s->dimensions[3] *
(npy_intp)sizeof(%(type)s))
%(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s))
%(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s))
%(fail)s;
for(int b=0;b< %(self_bsize)s;b++){ for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
...@@ -1267,34 +1378,41 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1267,34 +1378,41 @@ for(int b=0;b< %(self_bsize)s;b++){
for (int iter_m=0; iter_m < Os[0]; iter_m++) { for (int iter_m=0; iter_m < Os[0]; iter_m++) {
/// Reposition index into input image based on requested output size // Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s; //row position in logical output image //row position in logical output image
int new_m; //row anchor in logical input image (we will loop upward from here) int pos_m = iter_m*%(self_dx)s;
//row anchor in logical input image (we will loop upward from here)
int new_m;
if (mode == FULL) new_m = pos_m ; if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker_log[0]-1); else new_m = (pos_m+dim_ker_log[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s; // current col position in logical output image // current col position in logical output image
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0; %(type)s sum=0;
// Sum over kernel, if index into image is out of bounds // Sum over kernel, if index into image is out of bounds
// fill with the value // fill with the value
for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) { // loop over logical rows in kernel // loop over logical rows in kernel
for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) {
int ind0_log = (new_m-j_log); // ind0_log: row position in logical input image // ind0_log: row position in logical input image
int ind0_log = (new_m-j_log);
if ((j_log < %(self_kshp_logical_offset_r)s) || (j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s) if ((j_log < %(self_kshp_logical_offset_r)s) ||
(j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s)
continue; continue;
if (ind0_log MOD %(self_imshp_logical_stride_r)s) if (ind0_log MOD %(self_imshp_logical_stride_r)s)
continue; continue;
int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) / %(self_kshp_logical_stride_r)s); int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) /
%(self_kshp_logical_stride_r)s);
int ind0_phys = (ind0_log / %(self_imshp_logical_stride_r)s); int ind0_phys = (ind0_log / %(self_imshp_logical_stride_r)s);
//std::cerr <<"j_log" << j_log << " j_phys " << j_phys << " " << ind0_phys << "\\n"; //std::cerr <<"j_log" << j_log << " j_phys " << j_phys << " " << ind0_phys << "\\n";
if(mode==FULL){ if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]]; //This is a pointer to the current row of the kernel //This is a pointer to the current row of the kernel
const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
if(ind0_log < 0 || ind0_log >= dim_im_log[0]){ if(ind0_log < 0 || ind0_log >= dim_im_log[0]){
// the current row of the kernel is off the image // the current row of the kernel is off the image
}else{ }else{
...@@ -1304,30 +1422,40 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1304,30 +1422,40 @@ for(int b=0;b< %(self_bsize)s;b++){
for (int ind1_log=pos_n-k; k<max_k; k++,ind1_log--) { for (int ind1_log=pos_n-k; k<max_k; k++,ind1_log--) {
if (1) if (1)
{ {
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s) if ((k < %(self_kshp_logical_offset_c)s) ||
(k - %(self_kshp_logical_offset_c)s) MOD
%(self_kshp_logical_stride_c)s)
continue; continue;
if (ind1_log MOD %(self_imshp_logical_stride_c)s) if (ind1_log MOD
%(self_imshp_logical_stride_c)s)
continue; continue;
} }
sum+= idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s] * idx_in[ind1_log / %(self_imshp_logical_stride_c)s]; sum += idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
%(self_kshp_logical_stride_c)s] *
idx_in[ind1_log / %(self_imshp_logical_stride_c)s];
} }
} }
}else{ }else{ // mode==VALID
const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]]; //JB: should be dim_im[1] right? (was dim_im[0]) //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]];
const %(type)s* idx_hvals=&hvals[j_phys*dim_ker_phys[1]]; const %(type)s* idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
int new_n = (pos_n+dim_ker_log[1]-1); int new_n = (pos_n+dim_ker_log[1]-1);
if (%(self_imshp_logical_stride_c)s != 1) // a general loop if (%(self_imshp_logical_stride_c)s != 1) // a general loop
{ {
for (int k=0,last=new_n; k < dim_ker_log[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker_log[1]; k++,last--) {
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s) if ((k < %(self_kshp_logical_offset_c)s) ||
(k - %(self_kshp_logical_offset_c)s) MOD
%(self_kshp_logical_stride_c)s)
continue; continue;
else if (last MOD %(self_imshp_logical_stride_c)s) else if (last MOD %(self_imshp_logical_stride_c)s)
continue; continue;
else else
{ {
sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s]*idx_in[last/%(self_imshp_logical_stride_c)s]; sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
%(self_kshp_logical_stride_c)s] *
idx_in[last/%(self_imshp_logical_stride_c)s];
} }
} }
} }
...@@ -1335,7 +1463,8 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1335,7 +1463,8 @@ for(int b=0;b< %(self_bsize)s;b++){
{ {
int offset = %(self_kshp_logical_offset_c)s; int offset = %(self_kshp_logical_offset_c)s;
int k_phys=0; int k_phys=0;
for (int k_log=offset,last=new_n-offset; k_log < dim_ker_log[1]; ) { for (int k_log=offset,last=new_n-offset;
k_log < dim_ker_log[1]; ) {
sum += idx_hvals[k_phys]*idx_in[last]; sum += idx_hvals[k_phys]*idx_in[last];
++k_phys; ++k_phys;
last -= %(self_kshp_logical_stride_c)s; last -= %(self_kshp_logical_stride_c)s;
...@@ -1343,10 +1472,10 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1343,10 +1472,10 @@ for(int b=0;b< %(self_bsize)s;b++){
} }
} }
} }
}//for j }//for j_log
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum; out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n }//for iter_n
}//for m }//for iter_m
}//for stack_size }//for stack_size
if (0 && (mode==FULL)){ if (0 && (mode==FULL)){
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
...@@ -1585,33 +1714,36 @@ free(kbuf); ...@@ -1585,33 +1714,36 @@ free(kbuf);
Py_XDECREF(img2d); Py_XDECREF(img2d);
""" """
def gen_conv_code_unroll_batch_kern(d,unroll_bsize=1, unroll_ksize=1):
def gen_conv_code_unroll_batch_kern(d, unroll_bsize=1, unroll_ksize=1):
""" c_code for ConvOp that unroll the batch size loop """ c_code for ConvOp that unroll the batch size loop
""" """
assert unroll_bsize>0 and unroll_ksize>0 assert unroll_bsize > 0 and unroll_ksize > 0
if d.has_key("unroll_bsize") or d.has_key("unroll_ksize") or d.has_key("unroll_iter") or d.has_key("unroll_biter") or d.has_key("unroll_kiter"): if "unroll_bsize" in d or "unroll_ksize" in d or "unroll_iter" in d or "unroll_biter" in d or "unroll_kiter" in d:
raise Exception("We can't use this dictionnary as we will overwrite some of its containt") raise Exception("We can't use this dictionnary as we will overwrite some of its containt")
d=d.copy() d = d.copy()
d["unroll_bsize"]=unroll_bsize d["unroll_bsize"] = unroll_bsize
d["unroll_ksize"]=unroll_ksize d["unroll_ksize"] = unroll_ksize
def my_dup(st,size):
s="" def my_dup(st, size):
s = ""
for i in xrange(size): for i in xrange(size):
d["unroll_iter"]=i d["unroll_iter"] = i
s+=st%d s += st % d
return s+"\n" return s + "\n"
def my_dup2(st): def my_dup2(st):
s="" s = ""
iter=0 iter = 0
for i in xrange(unroll_bsize): for i in xrange(unroll_bsize):
d["unroll_biter"]=i d["unroll_biter"] = i
for j in xrange(unroll_ksize): for j in xrange(unroll_ksize):
d["unroll_kiter"]=j d["unroll_kiter"] = j
d["unroll_iter"]=iter d["unroll_iter"] = iter
iter+=1 iter += 1
s+=st%d s += st % d
return s+"\n" return s + "\n"
ret = """ ret = """
const int mode=%(mode)s; const int mode=%(mode)s;
int typenum=0, typenum_f=0; int typenum=0, typenum_f=0;
...@@ -1765,7 +1897,8 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -1765,7 +1897,8 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s; int pos_n=iter_n*%(self_dy)s;
""" % d """ % d
ret += my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize * unroll_ksize) ret += my_dup(
"%(type)s sum%(unroll_iter)s=0;", unroll_bsize * unroll_ksize)
ret += """ ret += """
// Sum over kernel, if index into image is out of bounds // Sum over kernel, if index into image is out of bounds
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论