Commit 0923873d authored by gdesjardins

* moved tensor/nnet.py to tensor/nnet/nnet.py

* moved sandbox/conv.py to tensor/nnet/conv.py * created tensor/signal/conv.py which implements generic 2D convolution (i.e. traditional signal processing definition) + tests * added tensor/nnet/tests/speed_test_conv.py which contains Fred's speed tests for ConvOp * added appropriate documentation to sphinx library docs
Parent 3dac2fd1
......@@ -18,8 +18,7 @@ They are grouped into the following sections:
:maxdepth: 1
basic
nnet/index
raw_random
shared_randomstreams
nnet
signal
signal/index
.. _libdoc_tensor_signal:
.. _libdoc_tensor_nnet_conv:
======================================================
:mod:`signal` -- Signal processing
======================================================
==========================================================
:mod:`conv` -- Ops for convolutional neural nets
==========================================================
.. module:: signal
.. module:: conv
:platform: Unix, Windows
:synopsis: ops for signal processing
.. moduleauthor:: LISA
......@@ -12,9 +12,7 @@
TODO: Give examples for how to use these things! They are pretty complicated.
.. function:: conv2D(*todo)
.. function:: downsample2D(*todo)
.. autofunction:: theano.tensor.nnet.conv.conv2d
.. function:: fft(*todo)
......
.. _libdoc_tensor_nnet:
==================================================
:mod:`nnet` -- Ops related to neural networks
==================================================
.. module:: nnet
:platform: Unix, Windows
:synopsis: various ops relating to neural networks
.. moduleauthor:: LISA
Theano was originally developped for machine learning applications, particularly
for the topic of deep learning. As such, our lab has developed many functions
and ops which are particular to neural networks and deep learning.
.. toctree::
:maxdepth: 1
conv
nnet
.. _libdoc_tensor_nnet:
.. _libdoc_tensor_nnet_nnet:
======================================================
:mod:`nnet` -- Ops for neural networks
......
.. _libdoc_tensor_signal_conv:
======================================================
:mod:`conv` -- Convolution
======================================================
.. module:: conv
:platform: Unix, Windows
:synopsis: ops for performing convolutions
.. moduleauthor:: LISA
.. autofunction:: theano.tensor.signal.conv.conv2d
.. function:: fft(*todo)
[James has some code for this, but hasn't gotten it into the source tree yet.]
.. _libdoc_tensor_signal_downsample:
======================================================
:mod:`downsample` -- Down-Sampling
======================================================
.. module:: downsample
:platform: Unix, Windows
:synopsis: ops for performing various forms of downsampling
.. moduleauthor:: LISA
.. autofunction:: theano.tensor.signal.downsample.max_pool2D
.. function:: fft(*todo)
[James has some code for this, but hasn't gotten it into the source tree yet.]
.. _libdoc_tensor:
=====================================================
:mod:`signal` -- Signal Processing
=====================================================
Signal Processing
-----------------
.. module:: signal
:platform: Unix, Windows
:synopsis: various ops for performing basic signal processing
(convolutions, subsampling, fft, etc.)
.. moduleauthor:: LISA
The signal subpackage contains ops which are useful for performing various
forms of signal processing.
.. toctree::
:maxdepth: 1
conv
downsample
import sys, time, unittest
import numpy
import numpy as N
from theano.tests import unittest_tools as utt
from theano import function, Mode
import theano.tensor as T
from conv import ConvOp, getFilterOutShp
def flip(kern, kshp):
    """Flip the kernel(s) the way scipy.signal.convolve2d flips them.

    Each trailing 2D kernel of shape ``kshp`` is reversed element-wise
    (row-major order), which is equivalent to a 180-degree rotation.

    :param kern: 2D, 3D or 4D ndarray whose last two (logical) dims form
        kernels of shape ``kshp``
    :param kshp: (rows, cols) of each individual 2D kernel
    :return: ndarray of the same shape as ``kern`` with every 2D kernel flipped
    :raises NotImplementedError: if ``kern`` is not 2D, 3D or 4D
    """
    # Do not shadow the function name with the result buffer.
    flipped = N.zeros(kern.shape)
    if len(kern.shape) == 2:
        kern = kern.reshape(-1)
        it = reversed(kern)
        for i in range(kshp[0]):
            for j in range(kshp[1]):
                # next(it) works on Python 2.6+ and 3.x; it.next() was Py2-only.
                flipped[i, j] = next(it)
    elif len(kern.shape) == 3:
        kern = kern.reshape(kern.shape[0], -1)
        for k in range(kern.shape[0]):
            it = reversed(kern[k, :])
            for i in range(kshp[0]):
                for j in range(kshp[1]):
                    flipped[k, i, j] = next(it)
    elif len(kern.shape) == 4:
        kern = kern.reshape(kern.shape[0], kern.shape[1], -1)
        for k in range(kern.shape[0]):
            for m in range(kern.shape[1]):
                it = reversed(kern[k, m, :])
                for i in range(kshp[0]):
                    for j in range(kshp[1]):
                        flipped[k, m, i, j] = next(it)
    else:
        raise NotImplementedError()
    return flipped
global_rng = N.random.RandomState(3423489)
dmatrix4=T.TensorType('float64', (False, False, False, False))
def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, conv_op_py=False, do_print=True, repeat=1, unroll_patch=False, unroll_patch_size=False, verbose=0):
    """Run a stack of ConvOp layers and time them against a scipy reference.

    For each layer described by (kshps, nkerns) this convolves random images
    with random kernels three ways: a manual scipy ``_convolve2d`` loop (only
    when ``validate``), the compiled ConvOp, and optionally the python-linker
    ConvOp (``conv_op_py``), asserting the outputs agree within 1e-5.

    :param conv_mode: 'valid' or 'full' border mode, forwarded to ConvOp
    :param ss: (row, col) subsampling steps
    :param bsize: batch size
    :param imshp: (stacksize, rows, cols) of the first layer's input
    :param kshps: per-layer kernel shapes; zipped with ``nkerns``
    :param nkerns: per-layer number of kernels
    :param repeat: number of timed ConvOp executions per layer
    :return: (tctot, tpytot, ntot) cumulative seconds for the c ConvOp,
        the python ConvOp and the manual scipy loop

    NOTE(review): the ``img=T.dmatrix()`` default is a mutable default
    argument shared across calls; it appears unused in the body — confirm
    before removing.
    """
    # build actual input images
    imgval = global_rng.rand(bsize, imshp[0], imshp[1], imshp[2])

    a=T.dmatrix()
    kerns = [a for i in nkerns]  # one (shared) symbolic kernel per layer
    inputs4=dmatrix4()
    kerns4=dmatrix4()

    # for each layer
    ntot=0
    tctot=0
    tpytot=0
    for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))):
        if do_print:
            print '************* layer %i ***************' % n_layer
            print conv_mode, ss, n_layer, kshp, nkern

        # actual values: one kernel per (output kernel, input feature map)
        w = global_rng.random_sample(N.r_[nkern,imshp[0],kshp])
        w_flip = flip(w,kshp).reshape(w.shape)

        ## manual implementation
        # check first stage.  'full' mode zero-pads the image so the kernel
        # can be applied to partial patches as well.
        padimg = imgval
        if conv_mode == 'full':
            padimg_shp = N.array(imshp[1:]) + 2*(N.array(kshp) - N.array([1,1]))
            padimg = N.zeros(N.r_[bsize,imshp[0],padimg_shp])
            padimg[:, :, kshp[0]-1:-kshp[0]+1,
                   kshp[1]-1:-kshp[1]+1] = imgval

        outshp = N.hstack((nkern, getFilterOutShp(imshp, kshp, ss, conv_mode)))

        time1 = time.time()
        outval = N.zeros(N.r_[bsize,outshp])
        if validate:
            # causes an atexit problem
            from scipy.signal.sigtools import _convolve2d
            from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
            val = _valfrommode(conv_mode)
            bval = _bvalfromboundary('fill')
            for b in range(bsize): # loop over batches
                for n in range(nkern): # loop over filters
                    for i in range(imshp[0]): # loop over input feature maps
                        # accumulate over the input stack, subsampling by ss
                        outval[b,n,...] += _convolve2d(\
                            imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]]
        ntot += time.time() - time1

        # ConvOp
        if unroll_patch and not unroll_patch_size:
            # no shape info given: unroll_patch works without it
            conv_op = ConvOp(dx=ss[0],dy=ss[1], output_mode=conv_mode,
                             unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
        else:
            conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode,
                             unroll_batch=unroll_batch, unroll_kern=unroll_kern, unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
        l1shp=N.hstack((nkern,
                        getFilterOutShp(imshp, kshp, ss, conv_mode)))
        # default (c) linker vs explicit python linker for comparison
        propup2 = function([inputs4, kerns4], conv_op)
        propup3 = function([inputs4, kerns4], conv_op, mode=Mode(linker="py"))

        time1 = time.time()
        for i in range(repeat):
            hidval2_ = propup2(imgval,w_flip)
        hidval2 = hidval2_#[:,:,0::ss[0],0::ss[1]]
        tctot += time.time() - time1

        if conv_op_py:
            time1 = time.time()
            for i in range(repeat):
                hidval3_ = propup3(imgval,w_flip)
            hidval3 = hidval3_#[:,:,0::ss[0],0::ss[1]]
            tpytot += time.time() - time1
            assert (N.abs(hidval2-hidval3)<1e-5).all()
        else:
            tpytot += 0

        if validate:
            temp = N.abs(outval - hidval2)
            assert (temp < 1e-5).all()
        if validate and conv_op_py:
            temp = N.abs(outval - hidval3)
            assert (temp < 1e-5).all()

        # the output of this layer becomes the input of the next one
        imshp = tuple(outshp)
        imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2])

    return tctot, tpytot, ntot
class TestConvOp(unittest.TestCase):
    """NOTE: we test only when we pass 4d tensor.

    Correctness tests compare ConvOp (c and py linkers) against
    scipy.signal.convolve2d; speed_multilayer_conv is a manual benchmark.
    """

    def setUp(self):
        # make random-dependent tests reproducible
        utt.seed_rng()

    def test_convolution(self):
        """Compare ConvOp output against scipy convolve2d on one layer."""
        print '\n\n*************************************************'
        print ' TEST CONVOLUTION'
        print '*************************************************'
        from scipy.signal import convolve2d

        # exactly one of the following parameter sets is active
        if 0:
            # fixed parameters
            bsize = 10 # batch size
            imshp = (1,28,28)# image shape
            kshps = [(5,5),(6,7),(12,8)] # kernel shaped
            nkern = 5 # nb kernel
            ssizes = ((1,1),(2,2),(3,3),(4,4))#step size
            convmodes = ('full','valid')
        elif 0:
            # fixed parameters
            bsize = 10 # batch size
            imshp = (1,50,50)# image shape
            print >> sys.stderr, "WARNING: only square shape tested"
            kshps = [(12,12), (12,12)]
            nkern = 20 # nb kernel
            ssizes = [(1,1)] #step size
            convmodes = ('full','valid')
        elif 0:
            # fixed parameters
            bsize = 7 # batch size
            imshp = (1,5,4)# image shape
            kshps = [(2,3)]
            nkern = 6 # nb kernel
            ssizes = [(1,1)] #step size
            convmodes = ('valid',)
        else:
            # fixed parameters
            bsize = 7 # batch size
            imshp = (1,5,4)# image shape
            kshps = [(2,3)]
            nkern = 6 # nb kernel
            ssizes = [(1,1)] #step size
            convmodes = ('valid',)

        # TODO: ask Fred about this
        # this combination trigered a bug.
        # bsize=1
        # imshp=(1,9,9)#fail with 9,9
        # kshp=(2,2)
        # nkern=5
        # ssizes=((1,1),)

        # this combination trigered a bug.
        # bsize = 1 # batch size
        # imshp = (1,3,3)# image shape
        # kshp = (2,3)#(5,5) # kernel shaped
        # nkern = 1 # nb kernel
        # ssizes = ((1,1),)#(2,2),(3,3),(4,4))#step size
        # convmodes = ('full','valid')

        # symbolic stuff
        bias = T.dvector()
        kerns = T.dmatrix()
        input = T.dmatrix()
        rng = N.random.RandomState(3423489)
        biasvals = rng.randn(nkern)
        #profmode = wraplinker.ProfileMode(OpWiseCLinker(), 'fast_run')

        # timing accumulators: ConvOp, scipy, (unused) convolve2
        tconvop, tscipy, tconv2 = [], [], []
        for conv_mode in convmodes:
            for kshp in kshps:
                filters = rng.randn(nkern,N.prod(kshp))
                for ss in ssizes:
                    # now test with real values
                    img2d = 1 + N.arange(bsize*N.prod(imshp)).reshape((bsize,)+imshp)
                    # print 'img2d', img2d
                    img1d = img2d.reshape(bsize,-1)

                    # create filters (flipped so ConvOp matches convolve2d)
                    filtersflipped = flip(filters.reshape((nkern,)+kshp), kshp)

                    # compute with ConvOp, once per linker
                    dmatrix3=T.TensorType('float64', (False, False, False))
                    inputs4=dmatrix4()
                    kerns4=dmatrix4()
                    bia=T.dscalar()
                    conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode)(inputs4, kerns4)
                    f2 = function([inputs4, kerns4], conv_op, mode=Mode(linker="c"))
                    f3 = function([inputs4, kerns4], conv_op, mode=Mode(linker="py"))
                    ttime1 = time.time()
                    out2_ = f2(img2d, filtersflipped.reshape(nkern,1,*kshp))
                    out2__ = out2_
                    tconvop += [time.time() - ttime1]
                    out2___ = out2__.copy()
                    out2 = out2___ + biasvals.reshape(1,nkern,1,1)
                    out3_ = f3(img2d, filtersflipped.reshape(nkern,1,*kshp))
                    out3__ = out3_
                    out3___ = out3__.copy()
                    out3 = out3___ + biasvals.reshape(1,nkern,1,1)
                    # c and py linkers must agree
                    assert (N.abs(out2_-out3_)<1e-5).all()

                    # REFERENCE IMPLEMENTATION: compute output with convolve2d
                    if conv_mode=='valid':
                        fulloutshp = N.array(imshp[1:]) - N.array(kshp) + 1
                    else:
                        fulloutshp = N.array(imshp[1:]) + N.array(kshp) - 1
                    ntime1 = time.time()
                    refout = N.zeros((bsize,)+tuple(fulloutshp)+(nkern,))
                    for b in range(bsize):
                        for n in range(nkern):
                            refout[b,...,n] = convolve2d(\
                                img2d[b,0,:,:], filtersflipped[n,...],conv_mode)
                    tscipy += [time.time() - ntime1]

                    # need to flatten images
                    bench1 = refout[:,0::ss[0],0::ss[1],:].reshape(bsize,-1,nkern)
                    bench1 += biasvals.reshape(1,1,nkern)

                    # swap the last two dimensions (output needs to be nkern x outshp)
                    bench1 = N.swapaxes(bench1,1,2)

                    # compare benchmark with ConvOp
                    temp = bench1.flatten() - out2.flatten()
                    assert (temp < 1e-5).all()

        print '**** Convolution Profiling Results ****'
        print 'Scipy convolve2d processing time: %.3fs'%sum(tscipy),tscipy
        print 'ConvOp processing time: %.3fs'%sum(tconvop),tconvop
        print 'convolve2 processing time: %.3fs'%sum(tconv2),tconv2
        d=N.asarray(tscipy)/tconvop
        print 'speed up ConvOp vs convolve2d: %.3f'%d.mean(),d

    def speed_multilayer_conv(self):
        # calculate the speed up of different combination of unroll
        # put the paramter to the same you will try.
        validate=False# we don't validate the result to have it much faster!
        verbose=1
        # the second assignments below deliberately override the first ones
        unroll_batch = [1,2,4,5,10,20]
        unroll_kern = [1,2,4,5,10,20]
        unroll_batch = [1,4,5]
        unroll_kern = [1,4,5]
        unroll_patch = [True, False]

        bsize = 20 # batch size
        imshp_start = (1,48,48)#un square shape to test more corner case.
        kshps = ([11,12],[12,11])#un square shape to test more corner case.
        nkerns = [20,20] # per output pixel
        ssizes = [(1,1),]#(1,1)]#(2,2) bugged
        convmodes = ['valid','full']
        do_convolve2=False
        a=T.dmatrix()
        kerns = [a for i in nkerns]

        assert len(kshps)==len(nkerns)==len(kerns)
        timing = N.zeros((len(unroll_batch),len(unroll_kern),3,len(convmodes)*len(ssizes)))
        t_b_k=[]
        #calculate the timing with unrolling
        print 'time unroll batch kern'
        # hard-coded values below are previously-measured results; emptying
        # them right after forces a fresh measurement
        t_=[[ 7.60572791, 3.95069814, 3.74271464], [ 4.05631089, 2.90384555, 2.93613672], [ 3.90551591, 2.92595196, 3.00102282]]
        best=[0.52690219879150391, 2.4266397953033447]
        worst=[0.92042708396911621, 6.8822150230407715]
        best=[]
        worst=[]
        t_=[]
        for unroll_b, n_b in zip(unroll_batch,range(len(unroll_batch))):
            for unroll_k, n_k in zip(unroll_kern,range(len(unroll_kern))):
                t_b_k.append(str(unroll_b)+"/"+str(unroll_k))
                if not t_:
                    tctot, tpytot, ntot=[],[],[]
                    for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                        for ss, n_ss in zip(ssizes,range(len(ssizes))):
                            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False)
                            tctot+=[tctot_]
                            tpytot+=[tpytot_]
                            ntot+=[ntot_]
                    if unroll_b==4 and unroll_k==4:
                        #print "unroll 4/4",tctot
                        best=tctot
                    if unroll_b==1 and unroll_k==1:
                        #print "unroll 1/1",tctot
                        worst=tctot
                    timing[n_b,n_k]=[tctot, tpytot, ntot]#[sum(tctot), sum(tpytot), sum(ntot)]
        if not t_:
            t=timing[:,:,0,:]#We select only the c timing.
        else:
            t=t_
        t=N.asarray(t)

        #calculate the old timing
        print 'time old version'
        tctot_=[0.52555489540100098, 6.6634182929992676]
        tctot,tpytot,ntot=[],[],[]
        tctot_=[]
        if not tctot_:
            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                for ss, n_ss in zip(ssizes,range(len(ssizes))):
                    tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False)
                    tctot+=[tctot_]
                    tpytot+=[tpytot_]
                    ntot+=[ntot_]
        else: tctot=N.asarray(tctot_)
        print "old code timing %.3fs"%sum(tctot),tctot
        best=N.asarray(best)
        worst=N.asarray(worst)
        print "timing for unrolled version"
        print t_b_k
        print t
        t_detail=t
        t = t.sum(axis=2)
        print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()]
        print "min %.3fs"%t.min(), "min param(batch unloop size/kernel unloop size)", t_b_k[t.argmin()]
        print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t.min(),sum(tctot)/t.min())
        print worst/best,tctot/best

        #calculate the timing of unroll_patch
        print 'time unroll_patch'
        tctot_patch = []
        tctot_patch_size = []
        for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
            for ss, n_ss in zip(ssizes,range(len(ssizes))):
                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False)
                tctot_patch += [tctot_]
                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True)
                tctot_patch_size += [tctot_]
        t_patch=sum(tctot_patch)
        print "unroll_patch without shape time", tctot_patch
        print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t_patch,sum(tctot)/t_patch)
        print best/tctot_patch, worst/tctot_patch
        t_patch_size=sum(tctot_patch_size)
        print "unroll_patch with shape time", tctot_patch_size
        print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t_patch_size,sum(tctot)/t_patch_size)
        print best/tctot_patch_size, worst/tctot_patch_size
        return

    def test_multilayer_conv(self):
        """Run multi-layer configurations through exec_multilayer_conv_nnet."""
        print '\n\n*************************************************'
        print ' TEST MULTILAYER CONVOLUTION'
        print '*************************************************'
        # fixed parameters
        # test multiple configuration at the same time
        bsizes = [6,6] # batch size
        imshp_starts = [(1,13,14),(1,4,5)]
        kshpss = ([[5,6],[7,4]],[[2,2],[2,2]])
        nkernss = [[20,40],[2,2]] # per output pixel
        ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]]
        convmodes = ['valid','full']
        do_convolve2=True
        unroll = [(0,0,True),(0,0,False),(1,1,False),(2,2,False),(3,2,False)]#(batch,kern,patch)

        # TODO: this version show a bug that was fixed
        # the test is included in the upper test.
        # imshp_start = (1,4,4)
        # kshps = ([2,2],[2,2])#,[7,4])
        # nkerns = [2,2] # per output pixel
        # ssizes = [(1,1),(2,2)]#2,2)]
        # bsizes = [1,1] # batch size
        # imshp_starts = [(1,10,10),(1,5,6)]
        # kshpss = ([[2,3],[3,2]],[[2,2],[2,2]])
        # nkernss = [[1,1],[1,1]] # per output pixel

        N.set_printoptions(threshold=N.nan)

        # symbolic stuff
        kerns = [T.matrix(),T.dmatrix()]
        img = T.dmatrix()
        rng = N.random.RandomState(3423489)

        tctot, tpytot, ntot = [], [], []
        for i in range(len(kshpss)):
            assert len(kshpss[i])==len(nkernss[i])==len(kerns)
        for i in range(len(kshpss)):
            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                for ss, n_ss in zip(ssizess[i],range(len(ssizess[i]))):
                    for un_b, un_k, un_p in unroll:
                        tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
                            conv_mode, ss, bsizes[i], imshp_starts[i],
                            kshpss[i], nkernss[i],
                            img=img, unroll_batch=un_b, unroll_kern=un_k,
                            unroll_patch=un_p,
                            validate=True)
                        tctot+=[tctot_]
                        tpytot+=[tpytot_]
                        ntot+=[ntot_]

        print '**** Multilayer Convolution Profiling Results ****'
        print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot
        print 'c Theano(ConvOp) processing time: %.3fs'%sum(tctot),tctot
        print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot
        d=N.asarray(ntot)/tctot
        print 'speed up c theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d
        d=N.asarray(ntot)/tpytot
        print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d

    def init_data(self,shape):
        """Return initial kernel data of the given shape.

        NOTE(review): the second return is unreachable; ones are used,
        presumably to make gradient checks deterministic — confirm intent.
        """
        return N.ones(shape)
        return N.random.random(shape)

    def test_ConvOpGrad(self):
        """
        test the gradient in float and double
        """
        print '\n\n*************************************************'
        print ' TEST ConvOp.grad'
        print '*************************************************'
        nkern = 3
        bsize = 2
        types = ["float32", "float64"]
        kshps = [(2,3)]
        imshps = [(2,3,4)]
        modes = ['valid', 'full']
        unroll = [(0,0,True),(1,1,False),(2,3,False),(1,1,False),(0,0,False)]#(batch,kern,patch)
        ssizes = [(1,1)]#,(2,2)]#grad for ss!=(1,1) is currently disabled!

        for typ in types:
            imgs = T.TensorType(typ, (False, False, False, False),'imgs')
            kerns = T.TensorType(typ, (False, False, False, False),'kerns')
            for mode in modes:
                for imshp in imshps:
                    if len(imshp)!=3:
                        visdim = 1
                    else:
                        visdim = imshp[0]
                    imgvals = N.array(N.random.random(N.hstack((bsize,imshp))),dtype=imgs.dtype)
                    for kshp in kshps:
                        # t<0 means the kernel is bigger than the image
                        t=numpy.array([imshp[1]-kshp[0],imshp[2]-kshp[1]])
                        kernvals = N.array(self.init_data((nkern,visdim,kshp[0],
                            kshp[1])),dtype=kerns.dtype)
                        # 'full' mode should support kernels bigger than the input
                        if mode == 'valid' and (t<0).any():
                            continue
                        for un_b,un_k, un_p in unroll:
                            for ss in ssizes:
                                print 'test_ConvOpGrad'
                                # print 'mode:',mode,'type:', typ
                                # print 'imshp:', imshp,
                                # print 'kshp:', kshp
                                # print 'un_b:', un_b,
                                # print 'un_k:', un_k,
                                # print 'un_p:', un_p
                                # print 'ss:', ss,
                                # print 'bsize:', bsize,
                                # print 'nkern:', nkern
                                # closures over the surrounding loop variables:
                                # one checks the gradient wrt images, one wrt kernels
                                def test_i(imgs):
                                    if un_p and ss[0]==1 and ss[1]==1:
                                        convop = ConvOp(dx=ss[0], dy=ss[1],
                                            output_mode=mode, unroll_patch=un_p)
                                    else:
                                        convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
                                            output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
                                    return convop(imgs, kernvals)
                                def test_k(kerns):
                                    if un_p and ss[0]==1 and ss[1]==1:
                                        convop = ConvOp(dx=ss[0], dy=ss[1],
                                            output_mode=mode, unroll_patch=un_p)
                                    else:
                                        convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
                                            output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
                                    return convop(imgvals, kerns)
                                print mode, imshp, kshp, un_b, un_k, ss
                                #TODO the tolerance needed to pass is very high for float32(0.17). Is this acceptable? Expected?
                                tol = None
                                if typ=="float32" and (ss[0]!=1 or ss[1]!=1):
                                    tol = 0.1
                                utt.verify_grad(test_i, [imgvals],
                                    cast_to_output_type=True,
                                    tol=tol)
                                utt.verify_grad(test_k, [kernvals],
                                    cast_to_output_type=True,
                                    tol=tol)
if __name__ == '__main__':
    # run a single hand-picked test instead of the full unittest discovery
    t = TestConvOp("test_convolution")
    # t.test_convolution()
    t.test_multilayer_conv()
    # from theano.tests import main
    # main("test_sp")

# NOTE(review): dead experiment kept for reference; indentation of this
# block relative to the __main__ guard was lost in extraction — confirm.
if False:
    #used to lanch 8 jobs at the same time.
    bsize = 20 # batch size
    imshp_start = (1,100,100)#un square shape to test more corner case.
    kshps = ([11,12],[12,11])#un square shape to test more corner case.
    nkerns = [20,20] # per output pixel
    ssizes = [(1,1),]#(1,1)]#(2,2) bugged
    convmodes = ['valid','full']
    unroll_batch = 5
    unroll_kern = 2
    ctot=0
    tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5)
    print "total exec time %.3fs"%tctot
import numpy as N
"""
Contains an op for convolving input images with a set of filters. This was
developed especially for Convolutional Neural Networks.
"""
__docformat__ = "restructuredtext en"
import numpy
import theano
import theano.tensor as T
import theano.tensor as tensor
from theano import gof, Op, tensor, config
from theano.printing import Print
def getFilterOutShp(inshp, kshp, ds=(1,1), mode='valid'):
    """
    Computes the shape (nb_rows, nb_col) of each output image.

    :type inshp: tuple, list or 1D ndarray of length 2 or 3
    :param inshp: shape of each input image; when of length 3 the leading
        stack dimension is ignored (only inshp[1:] is used)
    :type kshp: tuple, list or 1D ndarray of length 2
    :param kshp: shape of each (2D) kernel filter
    :type ds: tuple of len 2
    :param ds: (dx, dy) subsampling steps in rows and cols
    :type mode: string
    :param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
    :rtype: numpy 1D ndarray of len 2
    :return: shape of each output "image" (or feature map)
    """
    # Tuple parameter unpacking "(dx,dy)=(1,1)" in the signature is
    # Python-2-only syntax (removed by PEP 3113); unpack explicitly instead.
    (dx, dy) = ds
    # 'valid' shrinks the output by kshp-1; 'full' grows it by kshp-1.
    if mode == 'valid':
        s = -1
    else:
        s = 1
    inshp, kshp = N.array(inshp), N.array(kshp)
    # astype('int64') is the portable spelling of the old N.int64(array) cast
    return N.ceil((inshp[1:] + s*kshp - s*1) /
                  N.array([dx, dy], dtype='float')).astype('int64')
import logging
_logger=logging.getLogger("theano.signal.conv")
def _debug(*msg):
    """Emit the given fragments, space-separated, at DEBUG level."""
    text = ' '.join(msg)
    _logger.debug(text)
def _warn(*msg):
    """Emit the given fragments, space-separated, at WARNING level."""
    # Logger.warn() is a deprecated alias; warning() is the documented API.
    _logger.warning(' '.join(msg))
def conv2d(input, filters, border_mode='valid', subsample=(1,1),
image_shape=None, filter_shape=None, **kargs):
def conv2d(input, filters, image_shape=None, filter_shape=None,
border_mode='valid', subsample=(1,1), **kargs):
"""
This function returns an instanciated ConvOp through a simple interface.
We do this instead of changing the ConvOp interface so as not to change
previous code based on the ConvOp.
This function will build the symbolic graph for convolving a stack of input
images with a set of filters. The implementation is modelled after
Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp but
provides a much cleaner interface.
:type input: symbolic 4D tensor
:param input: tensor containing mini-batch of input feature maps that are
2D. Indexing is thus: (batch, feature map, image row, image col).
:param input: mini-batch of feature map stacks, of shape image_shape.
:type filters: symbolic 4D tensor
:param filters: tensor containing filters for convolutional neural net.
Indexing is: (filter, filter input feature map, filter row, filter col).
:type border_mode: string
:param border_mode:'valid'(only apply kernel over complete patch of the image) or
'full'(padd the image with 0 and apply the kernel over all full patch and partial patch of
the image
:param filters: set of filters used in CNN layer of shape filter_shape
:param border_mode:
'valid'-- only apply filter to complete patches of the image. Generates
output of shape: image_shape - filter_shape + 1
'full' -- zero-pads image to multiple of filter shape to generate output of
shape: image_shape + filter_shape - 1
:type subsample: tuple of len 2
:param subsample: how many pixel we move in the (row,col) direction of the image when we
change of patch
:param subsample: factor by which to subsample the output
:type image_shape: tuple of len 4
:param image_shape: (batch size, stack size, nb row, nb col)
:type filter_shape: tuple of len 4
:param filter_shape: (nb kernel, stack size, nb row, nb col)
:param filter_shape: (nb filters, stack size, nb row, nb col)
:param kwargs: kwargs are passed onto ConvOp. Can be used to set the following:
unroll_batch, unroll_kern, unroll_patch (see ConvOp doc)
:rtype: symbolic 4D tensor
:return: set of feature maps generated by convolutional layer. Tensor is of shape
(batch size, nb filters, output row, output col)
"""
if image_shape and filter_shape:
assert image_shape[1]==filter_shape[1]
......@@ -72,8 +75,24 @@ def conv2d(input, filters, border_mode='valid', subsample=(1,1),
class ConvOp(Op):
"""
A convolution op that should behave like scipy.signal.convolve2d,
but much faster!
This Op serves a dual purpose: it can implement a vanilla 2D convolution
(as taught in any signal processing class) or implement the
convolutional layers found in Convolutional Neural Networks.
In this setting, a set of 3D images is convolved with a set of 3D kernels,
with the particularity that their leading dimensions are of equal length.
Vanilla 2D convolution is treated as a special case of this.
The input parameter represents a mini-batch of multiple images. Its shape is:
batch size x num. input feature maps x image height x image width
The kernel parameter represents a set of 3D kernels. Its shape is:
number of filters x num. input images x filter height x filter width
The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the convolution
operator.
"""
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
......@@ -81,6 +100,24 @@ class ConvOp(Op):
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for given inputs"""
@staticmethod
def getOutputShape(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
"""
Computes the output dimensions of convolving an image of shape "inshp"
with kernels of shape "kshp".
:param inshp: (rows,cols) of input image
:param kshp: (rows,cols) of filters
:param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
:return: (rows,cols) of output image
"""
if mode=='valid': s = -1
else: s = 1
inshp, kshp = numpy.array(inshp), numpy.array(kshp)
return numpy.int64(numpy.ceil((inshp[1:] + s*kshp - s*1)/\
numpy.array([dx,dy], dtype='float')))
def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
dx=None, dy=None,
output_mode='valid', unroll_batch=0,
......@@ -92,37 +129,29 @@ class ConvOp(Op):
verbose=0,
version=-1):
"""
This Op implement the convolution of a kernel(tensor 4d,(nkern, stacksize, nb row, nb
col)) on an image(tensor 4d, (batchsize, stacksize, nb row, nb col). The batch size is
multiple image that we want to apply the same kernel over. The nkern is numtiple kernel
that we want to apply to each image. The stack size is mostly used when their is
multiple layer in the network. It is the sum of the convolution of multiple 2d image
and kernel.
The reason that this op does the summation over convolutions within the 'stack' is that
it allows us to be memory-efficient about how gradients are calculated. If, for
example, we had a convolution op that took a list of images, a list of kernels, and
gave you back each image as filtered by each kernel (JB thought he wanted this at one
point) then we would have to sum over a potentially very large tensor to get the
gradient on the filters.
If the imshp, kshp, nkern and bsize are provided, we can generate more optimal code.
This make a significant difference for the full mode with unroll_patch version. The
most frequent faster code currently available on 64_x86 computer is unroll_batch=4,
unroll_kern=4, unroll_patch=False and this request that all the optional shape
information are gived. Those number are empirically tested and backed up by the
article: Anatomy of High-Performance Matrix Multiplication by Kazushige Goto and Robert
A. Van De Geijn, ACM Transactions on Mathematical Software, vol 34, No. 3, article 12,
May 2008. It is in figure 12, it give the value mr x nr, those value are the optimum to
use for unroll_batch and unroll_kern. For x86_64 bits computer it is 4x4. Other
architecture can have different value.(2x4 for x86, 8x8 for itanium,...)
:type out_mode: string
:param out_mode: 'valid'(give an output smaller then the image, 'full'(give an output
bigger then the image)
optional parameters: (will generate more optimal c code)
Initializes a ConvOp with given output_mode (full/valid). All other
parameters are optional and are only used to generate more optimized c
code.
NOTES ON OPTIMIZATION:
If ALL (imshp, kshp, nkern and bsize) parameters are provided, we can
generate faster c-code. This make a significant difference for the
'full' output_mode with unroll_patch=True. The current fastest
implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
unroll_patch=False} with all other shape parameters being provided.
For optimizing other architectures, see:
Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
Software, May 2008.
Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.
:type output_mode: string
:param output_mode: 'valid' -- gives an output smaller then the image
'full' -- gives an output bigger then the image
Optional parameters: (will generate more optimal c code)
:type imshp: tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images.
:param imshp: (stacksize, nb image row, nb image col)
:type kshp: tuple of len 2
......@@ -136,7 +165,8 @@ class ConvOp(Op):
:type dy: int
:param dx: patch stride cols
param to select the version of code used:
Params which select the version of code used:
:type unroll_patch: bool
:param unroll_patch: use a version of c_code that unroll the patch loop that don't
request all shape information to work, but if all shape information are present, will
......@@ -149,6 +179,7 @@ class ConvOp(Op):
:param unroll_kern: use a version of c_code that unroll the batch(by unroll_batch) and
the nkern(by unroll_kern) loop. The size must by a multiple of bsize or nkern
respectively.
:type verbose: int
:param verbose: passed to GpuConv
:type version: int
......@@ -157,18 +188,14 @@ class ConvOp(Op):
:param imshp_logical: used internally when we generate the gradient when dx!=1 or dy!=1
:param kshp_logical: idem
:param kshp_logical_top_aligned: idem
"""
all_shape = imshp is not None and kshp is not None and \
nkern is not None and bsize is not None
if (unroll_batch>0 or unroll_kern>0) and not all_shape:
raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed")
if not all_shape and (imshp is not None or kshp is not None \
or nkern is not None or bsize is not None):
print "OPTIMISATION WARNING: passing only a few shape to ConvOp "\
"for faster code is useless. We use all of them or none."
if not all_shape:
unroll_patch = True
......@@ -221,10 +248,10 @@ class ConvOp(Op):
while self.bsize % new!=0:
new-=1
print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%s)"\
"must be 0 or a divisor of bsize(%s). We revert it to %d. This"\
"won't change the result, but may make it slower."%\
(str(self.unroll_batch),str(self.bsize),new)
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\
"must be 0 or a divisor of bsize(%i). We revert it to %i. This"\
"won't change the result, but may make it slower."
_warn(warnstr % (self.unroll_batch, self.bsize, new))
self.unroll_batch=new
......@@ -238,15 +265,16 @@ class ConvOp(Op):
assert(new>=1)
while self.nkern % new!=0:
new-=1
print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%s)"\
"should be 0 or a divisor of nkern(%s)We revert it to %d."\
"This won't change the result, but may make it slower."\
%(str(self.unroll_kern),str(self.nkern),new)
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%i)"\
"should be 0 or a divisor of nkern(%i). We revert it to %i."\
"This won't change the result, but may make it slower."
_warn(warnstr % (self.unroll_kern, self.nkern, new))
self.unroll_kern=new
if all_shape:
self.outshp = getFilterOutShp(self.imshp_logical, self.kshp_logical, (dx,dy), output_mode)
self.fulloutshp = getFilterOutShp(self.imshp_logical, self.kshp_logical, (1,1), output_mode)
self.outshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (dx,dy), output_mode)
self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (1,1), output_mode)
else:
self.outshp = None
self.fulloutshp = None
......@@ -377,10 +405,10 @@ class ConvOp(Op):
if self.fulloutshp is not None:
fulloutshp = tuple(self.fulloutshp)
else:
fulloutshp = tuple(getFilterOutShp(imshp_logical, kshp_logical, (1,1), self.out_mode))
fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical, kshp_logical, (1,1), self.out_mode))
if z[0] is None:
z[0] = N.zeros((bsize,)+(nkern,)+fulloutshp,
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
dtype=img2d.dtype)
zz=z[0]
val = _valfrommode(self.out_mode)
......@@ -393,17 +421,17 @@ class ConvOp(Op):
if self.imshp != self.imshp_logical:
# assuming that to get from imshp to imshp logical we insert zeros in missing spots
rstride = int(N.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(N.ceil(imshp_logical[2] / float(imshp[2])))
buf = N.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype)
rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2])))
buf = numpy.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype)
buf[:,:,::rstride, ::cstride] = img2d
img2d = buf
del buf, rstride, cstride
if kshp != kshp_logical:
rstride = int(N.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(N.ceil(kshp_logical[1] / float(kshp[1])))
buf = N.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype)
rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1])))
buf = numpy.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype)
if self.kshp_logical_top_aligned:
roffset=coffset=0
else:
......@@ -421,6 +449,7 @@ class ConvOp(Op):
for im0 in range(stacklen):
zz[b,n,...] += _convolve2d(\
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
#We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we
......@@ -432,20 +461,14 @@ class ConvOp(Op):
def grad(self, (inputs, kerns), (gz,)):
"""
In development. Works for test cases in test_sp.py
WARNING: a few known issues:
* doesn't work for rectangular images or filters
* inputs needs to be a 4D tensor. Couldn't get 3D to work
* will crash if filter the same size as input image
"""
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
raise NotImplementedError('todo')
if self.dx!=1 or self.dy!=1:
raise Exception("ERROR: We disable ConvOp.grad now when dx!=1 or "\
"dy!=1 as we think their is a high probability of bug in it."\
"We need to raise the error on the gradient to .1!")
#if self.dx!=1 or self.dy!=1:
#raise Exception("ERROR: We disable ConvOp.grad now when dx!=1 or "\
#"dy!=1 as we think their is a high probability of bug in it."\
#"We need to raise the error on the gradient to .1!")
all_shape = self.imshp is not None and self.kshp is not None and \
self.nkern is not None and self.bsize is not None
......@@ -454,15 +477,6 @@ class ConvOp(Op):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\
"the optional shape information")
grad_hack_necessary = False
if grad_hack_necessary:
if self.dx!=1 or self.dy!=1:
upgz = T.as_tensor(N.zeros((self.bsize,self.nkern)+tuple(self.fulloutshp),
dtype=gz.type.dtype))
gz = T.SetSubtensor([slice(self.bsize), slice(self.nkern),
slice(0,self.fulloutshp[0],self.dy),
slice(0,self.fulloutshp[1],self.dx)])(upgz,gz)
####### Determine gradient on kernels ########
assert inputs.ndim==4 and kerns.ndim==4
......@@ -485,7 +499,6 @@ class ConvOp(Op):
kshp = self.outshp
un_b = self.unroll_batch
un_k = self.unroll_kern
#print 'dw_valid', imshp, kshp, nkern, bsize
elif self.out_mode == 'full':
(img, filters) = (newgz, newin)
kshp_logical = None
......@@ -497,7 +510,6 @@ class ConvOp(Op):
kshp = self.imshp[1:]
un_b = self.unroll_kern
un_k = self.unroll_batch
#print 'dw_full', imshp, kshp, nkern, bsize
else:
raise NotImplementedError('Only [full,valid] modes are currently supported.')
......@@ -509,17 +521,16 @@ class ConvOp(Op):
un_b = bsize
else:
un_b = 1
print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!",\
bsize, un_b, self.unroll_batch, self.unroll_kern
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!")
if un_k!=0 and nkern%un_k!=0:
if nkern<un_k:
un_k = nkern
else:
un_k = 1
print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the kernel. Maybe you can optimize this!"
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the kernel. Maybe you can optimize this!")
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid',
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
......@@ -572,6 +583,7 @@ class ConvOp(Op):
din = din(gz,filters)
assert (din.owner.op.outshp is None and self.imshp is None) or \
(din.owner.op.outshp is None) or \
(din.owner.op.outshp==self.imshp[1:]).all()
return [din, dw]
......@@ -622,12 +634,12 @@ using namespace std;
d["self_kshp1"]=self.kshp[1]
d["self_kshp_logical_r"] = self.kshp_logical[0]
d["self_kshp_logical_c"] = self.kshp_logical[1]
d["self_kshp_logical_stride_r"] = int(N.ceil(self.kshp_logical[0] / float(self.kshp[0])))
d["self_kshp_logical_stride_c"] = int(N.ceil(self.kshp_logical[1] / float(self.kshp[1])))
d["self_imshp_logical_r"] = self.imshp_logical[1] #N.B. 1 not 0
d["self_imshp_logical_c"] = self.imshp_logical[2]#N.B. 2 not 1
d["self_imshp_logical_stride_r"] = int(N.ceil(self.imshp_logical[1] / float(self.imshp[1])))
d["self_imshp_logical_stride_c"] = int(N.ceil(self.imshp_logical[2] / float(self.imshp[2])))
d["self_kshp_logical_stride_r"] = int(numpy.ceil(self.kshp_logical[0] / float(self.kshp[0])))
d["self_kshp_logical_stride_c"] = int(numpy.ceil(self.kshp_logical[1] / float(self.kshp[1])))
d["self_imshp_logical_r"] = self.imshp_logical[1] #numpy.B. 1 not 0
d["self_imshp_logical_c"] = self.imshp_logical[2]#numpy.B. 2 not 1
d["self_imshp_logical_stride_r"] = int(numpy.ceil(self.imshp_logical[1] / float(self.imshp[1])))
d["self_imshp_logical_stride_c"] = int(numpy.ceil(self.imshp_logical[2] / float(self.imshp[2])))
if not self.imshp[0]==1: d["affectation"]="+="
d["all_shape"]=1
d["dim_zz_const"]="const"
......@@ -663,29 +675,30 @@ using namespace std;
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
if self.verbose:
print "return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version"
_debug("return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version")
return _conv_op_code_a % d
if self.unroll_patch:
if self.verbose:
print "return unroll patch version. all_shape=", all_shape
_debug("return unroll patch version. all_shape=", all_shape)
return _conv_op_code_unroll_patch%d
if self.unroll_batch>0 or self.unroll_kern>0:
if self.unroll_batch<=0: self.unroll_batch=1
if self.unroll_kern<=0: self.unroll_kern=1
if self.verbose:
print "return unrolled batch and kern code by",self.unroll_batch, self.unroll_kern
_debug("return unrolled batch (%s) and kern code (%s)",
str(self.unroll_batch), str(self.unroll_kern))
return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
self.unroll_kern)
#TODO: should we choose the unroll size automatically with the bigger divisor under 5?
if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
if self.verbose:
print "return gemm version"
_debug("return gemm version")
return _conv_op_code_valid_gemm % d
else:
if self.verbose:
print "return no gemm version"
_debug("return no gemm version")
return _conv_op_code_a % d
......
......@@ -7,11 +7,11 @@ from theano import gof
from theano import scalar
from theano import printing
from theano.printing import pprint
import basic as tensor
import elemwise
import numpy
import opt
from theano.tensor import basic as tensor
from theano.tensor import elemwise
from theano.tensor import opt
from theano.compile import optdb
import numpy
############
#
......
......@@ -7,7 +7,7 @@ from theano.tests import unittest_tools as utt
from theano import function, Mode
import theano.tensor as T
from theano.tensor.signal.conv import ConvOp
from theano.tensor.nnet.conv import ConvOp
def flip(kern, kshp):
"flip the kernel as scipy.convolv2d do it flipped."
......
import sys, time, unittest
import numpy
from scipy import signal
import theano
import theano.tensor as T
from theano import function, Mode
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import conv
from theano.tensor.basic import _allclose
class TestConv2D(unittest.TestCase):
    """
    Compare conv.conv2d against a nested-loop scipy.signal.convolve2d
    reference implementation, for various shapes, border modes,
    subsampling factors and loop-unrolling settings.
    """
    def setUp(self):
        utt.seed_rng()
        self.input = T.dtensor4('input')
        self.filters = T.dtensor4('filters')

    def validate(self, image_shape, filter_shape,
                 border_mode='valid', subsample=(1,1),
                 N_image_shape=None, N_filter_shape=None,
                 input=None, filters=None,
                 unroll_batch=0, unroll_kern=0, unroll_patch=True,
                 verify_grad=True):
        """
        Build a conv2d graph with the given (possibly partial) shape hints,
        run it on random data of shape N_image_shape / N_filter_shape, and
        check the result against a scipy.signal.convolve2d reference.
        Optionally check the gradient with verify_grad.
        """
        if N_image_shape is None:
            N_image_shape = image_shape
        if N_filter_shape is None:
            N_filter_shape = filter_shape

        # BUGFIX: was `if not input:` -- truthiness of a theano variable is
        # not a reliable "argument was omitted" test; compare with the None
        # default explicitly.
        if input is None:
            input = self.input
        if filters is None:
            filters = self.filters

        ############# THEANO IMPLEMENTATION ############

        # we create a symbolic function so that verify_grad can work
        def sym_conv2d(input, filters):
            # define theano graph and function
            return conv.conv2d(input, filters, image_shape, filter_shape,
                          border_mode, subsample, unroll_batch=unroll_batch,
                          unroll_kern=unroll_kern, unroll_patch=unroll_patch)
        output = sym_conv2d(input, filters)
        theano_conv = theano.function([input, filters], output)

        # initialize input and compute result
        image_data = numpy.random.random(N_image_shape)
        filter_data = numpy.random.random(N_filter_shape)
        theano_output = theano_conv(image_data, filter_data)

        ############# REFERENCE IMPLEMENTATION ############
        # BUGFIX: was `border_mode is 'full'` -- identity comparison of a
        # string literal relies on interning; use equality.
        s = 1. if border_mode == 'full' else -1.
        out_shape2d = numpy.array(N_image_shape[-2:]) +\
                      s*numpy.array(N_filter_shape[-2:]) - s
        out_shape2d = numpy.ceil(out_shape2d / numpy.array(subsample))
        # cast the (float) ceil results so the shape tuple is valid for zeros
        out_shape = (N_image_shape[0], N_filter_shape[0]) +\
                    tuple(int(x) for x in out_shape2d)
        ref_output = numpy.zeros(out_shape)

        # loop over output feature maps
        for k in range(N_filter_shape[0]):
            # loop over input feature maps
            for l in range(N_filter_shape[1]):
                filter2d = filter_data[k,l,:,:]
                # loop over mini-batches
                for b in range(N_image_shape[0]):
                    image2d = image_data[b,l,:,:]
                    output2d = signal.convolve2d(image2d, filter2d, border_mode)
                    ref_output[b,k,:,:] +=\
                        output2d[::subsample[0],::subsample[1]]

        self.failUnless(_allclose(theano_output, ref_output))

        ############# TEST GRADIENT ############
        if verify_grad:
            utt.verify_grad(sym_conv2d, [image_data, filter_data])

    def test_basic(self):
        """
        Tests that basic convolutions work for odd and even dimensions of image and filter
        shapes, as well as rectangular images and filters.
        """
        self.validate((3,2,8,8), (4,2,5,5), 'valid')
        self.validate((3,2,7,5), (5,2,2,3), 'valid')
        self.validate((3,2,7,5), (5,2,3,2), 'valid')
        self.validate((3,2,8,8), (4,2,5,5), 'full')
        self.validate((3,2,7,5), (5,2,2,3), 'full')
        # test filter same size as input
        self.validate((3,2,3,3), (4,2,3,3), 'valid')

    def test_unroll_patch_false(self):
        """
        unroll_patch is True by default. Test basic convs with False.
        """
        self.validate((3,2,7,5), (5,2,2,3), 'valid', unroll_patch=False)
        self.validate((3,2,7,5), (5,2,2,3), 'full', unroll_patch=False)
        self.validate((3,2,3,3), (4,2,3,3), 'valid', unroll_patch=False)

    def test_unroll_special(self):
        """
        (unroll_kern, unroll_batch) in (0,1),(1,0) is special case.
        """
        self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=1)

    def test_unroll_batch(self):
        """
        Test mini-batch unrolling for various legal values.
        """
        # mini-batch of size 6 is multiple of 2 and 3. Should work.
        self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=2, verify_grad=False)
        self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=3, verify_grad=False)

    def test_unroll_kern(self):
        """
        Test kernel unrolling for various legal values.
        """
        # 6 filters is a multiple of 2 and 3. Should work.
        self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=2, verify_grad=False)
        self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=3, verify_grad=False)

    def test_subsample(self):
        """
        Tests convolution where subsampling != (1,1)
        """
        self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,2))
        self.validate((3,2,7,5), (5,2,2,3), 'full', subsample=(2,2))
        self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,1))

    def test_invalid_filter_shape(self):
        """
        Tests scenario where filter_shape[1] != input_shape[1]
        """
        def f():
            self.validate((3,2,8,8), (4,3,5,5), 'valid')
        self.failUnlessRaises(AssertionError, f)

    def test_missing_info(self):
        """
        Test convolutions for various pieces of missing info.
        """
        self.validate(None, None,
                      N_image_shape=(3,2,8,8),
                      N_filter_shape=(4,2,5,5))
        self.validate((3,2,None,None), None,
                      N_image_shape=(3,2,8,8),
                      N_filter_shape=(4,2,5,5))
        self.validate((None,2,None,None), (None,2,5,5),
                      N_image_shape=(3,2,8,8),
                      N_filter_shape=(4,2,5,5))

    def test_full_mode(self):
        """
        Tests basic convolution in full mode and case where filter
        is larger than the input image.
        """
        self.validate((3,2,5,5), (4,2,8,8), 'full')
        def f():
            self.validate((3,2,5,5), (4,2,8,8), 'valid')
        self.failUnlessRaises(Exception, f)

    def test_wrong_input(self):
        """
        Make sure errors are raised when image and kernel are not 4D tensors
        """
        # BUGFIX: the original wrapped all three calls plus self.fail() in a
        # bare try/except: pass -- the AssertionError raised by self.fail()
        # was swallowed too, so this test could never fail.  Assert each bad
        # input raises individually instead.
        self.failUnlessRaises(Exception, self.validate,
                              (3,2,8,8), (4,2,5,5), 'valid', input=T.dmatrix())
        self.failUnlessRaises(Exception, self.validate,
                              (3,2,8,8), (4,2,5,5), 'valid', filters=T.dvector())
        self.failUnlessRaises(Exception, self.validate,
                              (3,2,8,8), (4,2,5,5), 'valid', input=T.dtensor3())
......@@ -3,9 +3,9 @@ import unittest
import theano
from theano import tensor as T
from theano import gof
import test_basic as TT
import numpy
from theano.tests import unittest_tools as utt
from theano.tensor.tests import test_basic as TT
from theano.tensor.nnet import *
......
"""
Contains an op for convolving input images with a set of filters. This was
developed especially for Convolutional Neural Networks.
Contains a wrapper function for tensor.nnet.ConvOp, which can be used to perform
generic 2D convolution.
"""
__docformat__ = "restructuredtext en"
......@@ -8,7 +8,9 @@ __docformat__ = "restructuredtext en"
import numpy
import theano
import theano.tensor as tensor
import theano.tensor.nnet as nnet
from theano import gof, Op, tensor, config
from theano.tensor.nnet import conv
import logging
_logger=logging.getLogger("theano.signal.conv")
......@@ -19,1664 +21,76 @@ def _warn(*msg):
def conv2d(input, filters, image_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1,1), **kargs):
    """
    This function will build the symbolic graph for convolving a stack of
    input images with a set of filters. The implementation is modelled after
    Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp
    but provides a much cleaner interface.

    :type input: symbolic 4D tensor
    :param input: mini-batch of feature map stacks, of shape image_shape.
    :type filters: symbolic 4D tensor
    :param filters: set of filters used in CNN layer of shape filter_shape

    :param border_mode:
        'valid' -- only apply filter to complete patches of the image.
        Generates output of shape: image_shape - filter_shape + 1.
        'full' -- zero-pads image to multiple of filter shape to generate
        output of shape: image_shape + filter_shape - 1.

    :type subsample: tuple of len 2
    :param subsample: factor by which to subsample the output

    :type image_shape: tuple of len 4
    :param image_shape: (batch size, stack size, nb row, nb col)
    :type filter_shape: tuple of len 4
    :param filter_shape: (nb filters, stack size, nb row, nb col)

    :param kargs: kargs are passed onto ConvOp. Can be used to set the
        following: unroll_batch, unroll_kern, unroll_patch (see ConvOp doc)
    """
    if image_shape and filter_shape:
        # the stack size (number of input feature maps) must agree
        assert image_shape[1]==filter_shape[1]

    # split the 4D shape hints into the pieces ConvOp expects
    if filter_shape is not None:
        nkern = filter_shape[0]
        kshp = filter_shape[2:]
    else:
        nkern, kshp = None, None

    if image_shape is not None:
        bsize = image_shape[0]
        imshp = image_shape[1:]
    else:
        bsize, imshp = None, None

    op = ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1],
                imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize, **kargs)
    return op(input, filters)
class ConvOp(Op):
"""
This Op serves a dual purpose: it can implement a vanilla 2D convolution
(as taught in any signal processing class) or implement the
convolutional layers found in Convolutional Neural Networks.
Shape parameters are optional and will result in faster execution.
In this setting, a set of 3D images is convolved with a set of 3D kernels,
with the particularity that their leading dimensions are of equal length.
Vanilla 2D convolution is treated as a special case of this.
The input parameter represents a mini-batch of multiple images. Its shape is:
batch size x num. input feature maps x image height x image width
The kernel parameter represents a set of 3D kernels. Its shape is:
number of filters x num. input images x filter height x filter width
The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the convolution
operator.
:type input: dmatrix of dtensor3
:param input: symbolic variable for images to be filtered
:type filters: dmatrix of dtensor3
:param filters: symbolic variable containing filter values
:param border_mode: 'valid' or 'full'. see scipy.signal.convolve2d
:param subsample: factor by which to subsample output
:type image_shape: tuple of length 2 or 3
:param image_shape: ([number images,] image height, image width)
:type filter_shape: tuple of length 2 or 3
:param filter_shape: ([number filters,] filter height, filter width)
:param kwargs: see theano.tensor.nnet.conv.conv2d
:rtype: symbolic 2D,3D or 4D tensor
:return: tensor of filtered images, with shape
([number images,] [number filters,] image height, image width)
"""
assert input.ndim in (2,3)
assert filters.ndim in (2,3)
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
'unroll_batch', 'unroll_kern', 'unroll_patch',
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for given inputs"""
    @staticmethod
    def getOutputShape(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
        """
        Computes the output dimensions of convolving an image of shape
        "inshp" with kernels of shape "kshp", with strides (dx,dy).

        :param inshp: input image shape; only inshp[1:] is used below, so
            callers pass the full (stacksize, rows, cols) triple -- TODO
            confirm against all call sites
        :param kshp: (rows,cols) of filters
        :param dx: stride along the rows
        :param dy: stride along the columns
        :param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
        :return: (rows,cols) of output image, as numpy int64
        """
        # 'valid' shrinks each dimension by kshp-1; 'full' grows it by kshp-1
        if mode=='valid': s = -1
        else: s = 1
        inshp, kshp = numpy.array(inshp), numpy.array(kshp)
        # ceil accounts for a partial final stride at the border
        return  numpy.int64(numpy.ceil((inshp[1:] + s*kshp - s*1)/\
                numpy.array([dx,dy], dtype='float')))
    def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
                 dx=None, dy=None,
                 output_mode='valid', unroll_batch=0,
                 unroll_kern=0,
                 unroll_patch=True,
                 imshp_logical=None,
                 kshp_logical=None,
                 kshp_logical_top_aligned=True,
                 verbose=0,
                 version=-1):
        """
        Initializes a ConvOp with given output_mode (full/valid). All other
        parameters are optional and are only used to generate more optimized
        c code.

        NOTES ON OPTIMIZATION:
        If ALL (imshp, kshp, nkern and bsize) parameters are provided, we can
        generate faster c-code. This makes a significant difference for the
        'full' output_mode with unroll_patch=True. The current fastest
        implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
        unroll_patch=False} with all other shape parameters being provided.

        For optimizing other architectures, see:
        Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
        Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
        Software, May 2008.
        Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.

        :type output_mode: string
        :param output_mode: 'valid' -- gives an output smaller than the image.
                            'full' -- gives an output bigger than the image.

        Optional parameters: (will generate more optimal c code)

        :type imshp: tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images.
        :param imshp: (stacksize, nb image row, nb image col)
        :type kshp: tuple of len 2
        :param kshp: (nb kernel row, nb kernel col)
        :type nkern: int
        :param nkern: the number of kernels
        :type bsize: int
        :param bsize: the size of the minibatch
        :type dx: int
        :param dx: patch stride rows
        :type dy: int
        :param dy: patch stride cols

        Params which select the version of code used:

        :type unroll_patch: bool
        :param unroll_patch: use a version of c_code that unrolls the patch
            loop; it does not require full shape information to work, but if
            all shapes are present it hardcodes them for faster code.
        :type unroll_batch: int
        :param unroll_batch: use a version of c_code that unrolls the batch
            (by unroll_batch) and the nkern (by unroll_kern) loop. The size
            must be a multiple of bsize or nkern respectively.
        :type unroll_kern: int
        :param unroll_kern: see unroll_batch.

        :type verbose: int
        :param verbose: passed to GpuConv
        :type version: int
        :param version: passed to GpuConv

        :param imshp_logical: used internally when we generate the gradient
            when dx!=1 or dy!=1
        :param kshp_logical: idem
        :param kshp_logical_top_aligned: idem
        """
        # True only when every shape hint was supplied; several of the
        # faster c-code paths below require this.
        all_shape = imshp is not None and kshp is not None and \
                    nkern is not None and bsize is not None

        if (unroll_batch>0 or unroll_kern>0) and not all_shape:
            raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed")
        # without full shape info only the patch-unrolled c code works
        if not all_shape:
            unroll_patch = True

        if imshp is not None:
            imshp = tuple(imshp)
            if len(imshp)==2:
                # promote a plain 2d image shape to a stack of size 1
                imshp = (1,)+imshp
            elif len(imshp)==3:
                imshp = imshp
            else:
                raise Exception("bad len for imshp")
        self.imshp = imshp

        if kshp is not None:
            kshp = tuple(kshp)
        self.kshp = kshp
        self.nkern = nkern
        self.bsize=bsize
        self.dx=dx
        self.dy=dy
        self.verbose=verbose
        self.version=version

        # a triple: logical image shape; differs from imshp only in the
        # gradient code, where subsampling inserts zeros into the image
        self.imshp_logical = self.imshp
        if imshp_logical is not None: self.imshp_logical = tuple(imshp_logical)
        # NOTE(review): len(None) would raise here if exactly one of
        # imshp / imshp_logical is None -- presumably callers never do that.
        assert (self.imshp is None and self.imshp_logical is None) or \
               (len(self.imshp) == len(self.imshp_logical))

        # a pair: logical kernel shape (same role as imshp_logical)
        self.kshp_logical = self.kshp
        if kshp_logical is not None: self.kshp_logical = tuple(kshp_logical)
        self.kshp_logical_top_aligned = kshp_logical_top_aligned

        self.unroll_batch=unroll_batch
        self.unroll_kern=unroll_kern
        self.unroll_patch=unroll_patch

        # unroll_batch must divide bsize; otherwise fall back to the largest
        # divisor below the requested value (or to bsize itself if smaller)
        if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
            if self.bsize<=self.unroll_batch:
                self.unroll_batch = self.bsize
            else:
                #find the maximum value under unroll_batch that would work
                new=self.unroll_batch
                assert(new>=1)
                while self.bsize % new!=0:
                    new-=1
                warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\
                    "must be 0 or a divisor of bsize(%i). We revert it to %i. This"\
                    "won't change the result, but may make it slower."
                _warn(warnstr % (self.unroll_batch, self.bsize, new))
                self.unroll_batch=new

        # same fallback for unroll_kern vs nkern.
        # NOTE(review): the modulo below uses the `unroll_kern` argument, not
        # self.unroll_kern -- they are still equal at this point, but the
        # asymmetry with the batch branch above looks unintentional.
        if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
            if self.nkern<=self.unroll_kern:
                self.unroll_kern = self.nkern
            else:
                #find the maximum value under unroll_kern that would work
                new=self.unroll_kern
                assert(new>=1)
                while self.nkern % new!=0:
                    new-=1
                warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%i)"\
                    "should be 0 or a divisor of nkern(%i). We revert it to %i."\
                    "This won't change the result, but may make it slower."
                _warn(warnstr % (self.unroll_kern, self.nkern, new))
                self.unroll_kern=new

        if all_shape:
            # outshp accounts for the (dx,dy) subsampling; fulloutshp is the
            # un-subsampled output shape
            self.outshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (dx,dy), output_mode)
            self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (1,1), output_mode)
        else:
            self.outshp = None
            self.fulloutshp = None

        self.out_mode = output_mode
        if not self.out_mode in ["valid", "full"]:
            raise Exception("Mode %s not implemented"%self.out_mode)

        if all_shape and not (self.outshp > 0).all():
            raise Exception(("Bad size for the output shape. Verify that [post-"\
                    "supersampling] input shape (%s) and kern shape(%s) are ok. "\
                    "(Hint: kerns must fit inside image in valid mode)")%
                            (self.imshp_logical,self.kshp_logical))

        # the hash over __attrnames is precomputed once; the op is treated
        # as immutable after construction
        self._rehash()

        if config.op.set_flops:
            self.set_flops()
def __eq__(self, other):
if type(self) != type(other):
return False
for a in self.__attrnames:
if getattr(self, a) != getattr(other, a):
return False
return True
    def __setstate__(self, d):
        """Restore pickled state, then recompute the cached hash value
        instead of trusting whatever (if anything) was pickled for it."""
        self.__dict__.update(d)
        self._rehash()
def _rehash(self):
hashval = hash(type(self))
for a in self.__attrnames:
hashval = hashval ^ hash(getattr(self, a))
self.__hashval = hashval
    def __hash__(self):
        # return the value cached by _rehash() (set at construction and
        # after unpickling)
        return self.__hashval
def __str__(self):
return "ConvOp{" +",".join(str((a, getattr(self, a))) for a in self.__attrnames) + "}"
def set_flops(self):
""" Usefull with the hack in profilemode to print the MFlops"""
if self.out_mode=="valid":
self.flops=self.kshp[0]*self.kshp[1]*2#nb mul and add by output pixed
self.flops*=self.outshp[0]*self.outshp[1]#nb flops by output image
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
else: #full mode not implemented
self.flops=0
for out_row in range(self.outshp[0]):#loop over output row
for out_col in range(self.outshp[0]):#loop over output col
for row in range(self.kshp[0]):#loop over kern row
if (row+out_row-self.kshp[0]+1<0 or
row+out_row-self.kshp[0]+1>=self.imshp[1]):
continue
col=0
max_col=self.kshp[1]
img_col=out_col-self.kshp[1]+1
max_col=min(max_col,self.imshp[2]-img_col)
if img_col<0:
col=-img_col
img_col+=col
while col < max_col: #loop over kern col
self.flops+=2
col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
assert self.flops == self.bsize * self.nkern * self.imshp[0] * \
self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2
def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09)
"""
inputs - 4 dim: batches x stacksize x rows x cols
kerns - 4 dim: nkern x stackidx x rows x cols
"""
outdim = kerns.ndim
_inputs = tensor.as_tensor_variable(inputs)
_kerns = tensor.as_tensor_variable(kerns)
# TODO: lift this restriction by upcasting either inputs or kerns
if _inputs.ndim != 4:
raise TypeError('make_node requires 4D tensor of inputs')
if _kerns.ndim != 4:
raise TypeError('make_node requires 4D tensor of kernels')
if _inputs.type.dtype != _kerns.type.dtype:
raise NotImplementedError("The image and the kernel must have the same type."
"inputs(%s), kerns(%s)"%(_inputs.dtype, _kerns.dtype))
output = tensor.tensor(dtype=_inputs.type.dtype,
broadcastable=[_inputs.broadcastable[0],
_kerns.broadcastable[0], False, False]);
return gof.Apply(self, [_inputs, _kerns], [output])
def perform(self,node, (img2d, filtersflipped), (z,)):
"""
By default if len(img2d.shape)==3, we
"""
# TODO: move these back out to global scope when they no longer cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imshp = self.imshp
if imshp is None:
imshp = tuple(img2d.shape[1:])
kshp = self.kshp
if kshp is None:
kshp = tuple(filtersflipped.shape[2:])
bsize = self.bsize
if bsize is None:
bsize = img2d.shape[0]
nkern = self.nkern
if nkern is None:
nkern = filtersflipped.shape[0]
imshp_logical = self.imshp_logical
if imshp_logical is None:
imshp_logical = imshp
kshp_logical = self.kshp_logical
if kshp_logical is None:
kshp_logical = kshp
if self.fulloutshp is not None:
fulloutshp = tuple(self.fulloutshp)
### use shape information if it is given to us ###
if filter_shape and image_shape:
if input.ndim==3:
bsize = image_shape[0]
else:
fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical, kshp_logical, (1,1), self.out_mode))
if z[0] is None:
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
dtype=img2d.dtype)
zz=z[0]
val = _valfrommode(self.out_mode)
bval = _bvalfromboundary('fill')
stacklen = imshp[0]
img2d = img2d.reshape((bsize,)+ imshp)
filtersflipped = filtersflipped.reshape((nkern,stacklen)+kshp)
if self.imshp != self.imshp_logical:
# assuming that to get from imshp to imshp logical we insert zeros in missing spots
rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2])))
buf = numpy.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype)
buf[:,:,::rstride, ::cstride] = img2d
img2d = buf
del buf, rstride, cstride
if kshp != kshp_logical:
rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1])))
buf = numpy.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype)
if self.kshp_logical_top_aligned:
roffset=coffset=0
else:
roffset=(kshp_logical[0] - (kshp[0]*rstride) - 1+rstride) % rstride
coffset=(kshp_logical[1] - (kshp[1]*cstride) - 1+cstride) % cstride
assert roffset >= 0
assert coffset >= 0
buf[:,:,roffset::rstride, coffset::cstride] = filtersflipped
filtersflipped = buf
del buf, rstride, cstride
for b in range(bsize):
for n in range(nkern):
zz[b,n,...].fill(0)
for im0 in range(stacklen):
zz[b,n,...] += _convolve2d(\
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
#We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we
#execute the c version which is much faster.
if self.dx>1 or self.dy>1:
zz = zz[:,:,0::self.dx,0::self.dy].copy()
z[0]=zz
def grad(self, (inputs, kerns), (gz,)):
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
raise NotImplementedError('todo')
#if self.dx!=1 or self.dy!=1:
#raise Exception("ERROR: We disable ConvOp.grad now when dx!=1 or "\
#"dy!=1 as we think their is a high probability of bug in it."\
#"We need to raise the error on the gradient to .1!")
all_shape = self.imshp is not None and self.kshp is not None and \
self.nkern is not None and self.bsize is not None
if not all_shape and (self.dx!=1 or self.dy!=1):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\
"the optional shape information")
####### Determine gradient on kernels ########
assert inputs.ndim==4 and kerns.ndim==4
newin = inputs.dimshuffle((1,0,2,3))
newgz = gz.dimshuffle((1,0,2,3))
(bsize, nkern) = None, None
imshp = None
kshp = None
un_p = self.unroll_patch
imshp_logical = None
if self.out_mode == 'valid':
(img, filters) = (newin, newgz)
kshp_logical = self.fulloutshp
kshp_logical_top_aligned=False
if all_shape:
(bsize, nkern) = (self.imshp[0], self.nkern)
imshp = (self.bsize, self.imshp[1], self.imshp[2])
kshp = self.outshp
un_b = self.unroll_batch
un_k = self.unroll_kern
elif self.out_mode == 'full':
(img, filters) = (newgz, newin)
kshp_logical = None
kshp_logical_top_aligned=True
if all_shape:
imshp_logical = (self.bsize, self.fulloutshp[0], self.fulloutshp[1])
(bsize, nkern) = (self.nkern, self.imshp[0])
imshp = (self.bsize, self.outshp[0], self.outshp[1])
kshp = self.imshp[1:]
un_b = self.unroll_kern
un_k = self.unroll_batch
bsize = 1
imshp = (1,) + tuple(image_shape[-2:])
if filters.ndim==3:
nkern = filter_shape[0]
else:
raise NotImplementedError('Only [full,valid] modes are currently supported.')
filters = filters[:,:,::-1,::-1] #flip them
#find good value for the unroll
if all_shape and un_b!=0 and bsize%un_b!=0:
if bsize<un_b:
un_b = bsize
else:
un_b = 1
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!")
if un_k!=0 and nkern%un_k!=0:
if nkern<un_k:
un_k = nkern
else:
un_k = 1
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the kernel. Maybe you can optimize this!")
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid',
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
imshp_logical=imshp_logical,
kshp_logical=kshp_logical,
kshp_logical_top_aligned=kshp_logical_top_aligned,
version=self.version,
verbose=self.verbose)
if hasattr(self,'flops'):
dw.set_flops()
dw = dw(img,filters)
if all_shape:
assert (dw.owner.op.outshp==self.kshp).all()
if self.out_mode == 'valid':
# before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1]
dw = dw.dimshuffle((1,0,2,3))
dw = dw[:,:,::-1,::-1]
####### Determine gradient on inputs ########
mode = 'valid'
if not self.out_mode == 'full':
mode = 'full'
filters = kerns.dimshuffle((1,0,2,3))
filters = filters[:,:,::-1,::-1]
nkern = None
imshp = None
imshp_logical = None
kshp = None
if all_shape:
nkern = self.imshp[0]
imshp = (self.nkern, self.outshp[0], self.outshp[1])
imshp_logical=(self.nkern, self.fulloutshp[0], self.fulloutshp[1])
din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode,
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
imshp_logical=imshp_logical,
kshp_logical=None,
version=-1,#we we change the mode, we don't forward the version.
verbose=self.verbose)
if hasattr(self,'flops'):
din.set_flops()
din = din(gz,filters)
nkern = 1
kshp = filter_shape[-2:]
else:
nkern, kshp = None, None
bsize, imshp = None, None
print 'self.imshp = ', self.imshp
print 'din.owner.op.outshp = ', din.owner.op.outshp
assert (din.owner.op.outshp is None and self.imshp is None) or \
(din.owner.op.outshp is None) or \
(din.owner.op.outshp==self.imshp[1:]).all()
return [din, dw]
### reshape tensors to 4D, for compatibility with ConvOp ###
if input.ndim==3:
sym_bsize = input.shape[0]
else:
sym_bsize = 1
def c_headers(self):
    """Return the C headers needed by the generated code.

    ``numpy/noprefix.h`` gives the un-prefixed NumPy C API used by the
    templates; ``iostream``/``sstream`` are used for debug printing and
    for building error messages.
    """
    return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ]
if filters.ndim==3:
sym_nkern = filters.shape[0]
else:
sym_nkern = 1
def c_code_cache_version(self):
    """Return the version of the generated C code, for the compile cache.

    Theano's module cache expects a *tuple* of ints here.  The previous
    ``return (1)`` evaluated to the plain integer ``1`` (parentheses alone
    do not make a tuple); a trailing comma is required.
    """
    return (1,)
new_input_shape = tensor.join(0, tensor.stack(sym_bsize,1), input.shape[-2:])
input4D = tensor.reshape(input, new_input_shape, ndim=4)
def c_support_code(self):
    """Return support code shared by all the C code templates.

    Defines the STRIDES helper and the FULL/SAME/VALID border-mode
    constants (the templates compare ``mode`` against these), plus a MOD
    macro so ``%%`` never appears literally in the %-substituted
    templates.  The BLAS header text is appended so the gemm-based
    template can declare/call sgemm_/dgemm_.
    """
    return """
#define STRIDES(arr) ((arr)->strides)
#define FULL 2
#define SAME 1
#define VALID 0
#define MOD %
using namespace std;
""" + tensor.blas.blas_header_text()
def c_libraries(self):
    """Return linker flags so the generated code links against the
    BLAS library selected by the Theano configuration."""
    return tensor.blas.ldflags()
def c_code(self, node, name, (img2d, filtersflipped), (z, ), sub):
    """Generate C code for this apply node.

    Builds a substitution dictionary ``d`` and %-formats one of several
    C templates, chosen from the op's configuration:

    * ``_conv_op_code_a`` when logical shapes differ from physical ones
      (strided/dilated case), or as the generic fallback;
    * ``_conv_op_code_unroll_patch`` when ``unroll_patch`` is set;
    * ``gen_conv_code_unroll_batch_kern(...)`` when batch/kernel
      unrolling is requested;
    * ``_conv_op_code_valid_gemm`` for plain 'valid' mode with unit
      subsampling (uses BLAS gemm).

    NOTE: Python 2 tuple-unpacking parameters; ``img2d``/``filtersflipped``
    and ``z`` are the C variable names of the inputs/output.
    """
    # Mixed-precision convolution is not implemented.
    if node.inputs[0].type.dtype != node.inputs[1].type.dtype:
        raise NotImplementedError()
    assert node.inputs[0].type.dtype == node.inputs[1].type.dtype
    # Seed the template-substitution dict with the local names
    # (img2d, filtersflipped, z, ...) then add the compiler-provided
    # entries such as %(fail)s.
    d=locals()
    d.update(sub)
    # True only when every shape parameter was given at op-construction
    # time; enables the fully-static code paths below.
    all_shape = self.imshp is not None and self.kshp is not None and \
                self.nkern is not None and self.bsize is not None
    d["self_out_mode"]=self.out_mode
    d["self_dx"]=self.dx
    d["self_dy"]=self.dy
    d["mode"]=self.out_mode.upper()
    # NOTE(review): duplicate of the previous line — harmless but redundant.
    d["mode"]=self.out_mode.upper()
    # "=" overwrites the output; switched to "+=" when accumulating over
    # an input stack (imshp[0] > 1) or when shapes are dynamic.
    d["affectation"]="="
    if all_shape:
        # Static shapes: bake every dimension into the generated C.
        d["self_bsize"]=self.bsize
        d["self_nkern"]=self.nkern
        d["self_outshp0"]=self.outshp[0]
        d["self_outshp1"]=self.outshp[1]
        d["self_imshp0"]=self.imshp[0]
        d["self_imshp1"]=self.imshp[1]
        d["self_imshp2"]=self.imshp[2]
        d["self_kshp0"]=self.kshp[0]
        d["self_kshp1"]=self.kshp[1]
        d["self_kshp_logical_r"] = self.kshp_logical[0]
        d["self_kshp_logical_c"] = self.kshp_logical[1]
        # Stride between physical elements within the logical kernel/image
        # (ceil of logical/physical extent).
        d["self_kshp_logical_stride_r"] = int(numpy.ceil(self.kshp_logical[0] / float(self.kshp[0])))
        d["self_kshp_logical_stride_c"] = int(numpy.ceil(self.kshp_logical[1] / float(self.kshp[1])))
        d["self_imshp_logical_r"] = self.imshp_logical[1] #numpy.B. 1 not 0
        d["self_imshp_logical_c"] = self.imshp_logical[2]#numpy.B. 2 not 1
        d["self_imshp_logical_stride_r"] = int(numpy.ceil(self.imshp_logical[1] / float(self.imshp[1])))
        d["self_imshp_logical_stride_c"] = int(numpy.ceil(self.imshp_logical[2] / float(self.imshp[2])))
        # With more than one input channel we must accumulate over the stack.
        if not self.imshp[0]==1: d["affectation"]="+="
        d["all_shape"]=1
        d["dim_zz_const"]="const"
    else:
        # Dynamic shapes: read dimensions from the ndarray structs at runtime.
        d["self_bsize"]="%(img2d)s->dimensions[0]"%d
        d["self_nkern"]="%(filtersflipped)s->dimensions[0]"%d
        d["self_outshp0"]="-1"
        d["self_outshp1"]="-1"
        d["self_imshp0"]="%(img2d)s->dimensions[1]"%d
        d["self_imshp1"]="%(img2d)s->dimensions[2]"%d
        d["self_imshp2"]="%(img2d)s->dimensions[3]"%d
        d["self_kshp0"]="%(filtersflipped)s->dimensions[2]"%d
        d["self_kshp1"]="%(filtersflipped)s->dimensions[3]"%d
        d["affectation"]="+="
        d["all_shape"]=0
        d["dim_zz_const"]=""
    if self.kshp_logical_top_aligned:
        d["self_kshp_logical_offset_r"] = 0
        d["self_kshp_logical_offset_c"] = 0
    elif self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
        # Bottom-aligned logical kernel: offset of the first physical tap
        # inside the logical kernel window.
        rstride = d["self_kshp_logical_stride_r"]
        cstride = d["self_kshp_logical_stride_c"]
        d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride
        d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] - (self.kshp[1]*cstride) - 1+cstride) % cstride
        del rstride, cstride
    # Only float32/float64 are supported by the C templates.
    if node.inputs[0].type.dtype=="float32": d["type"]="float"
    elif node.inputs[0].type.dtype=="float64": d["type"]="double"
    else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype)
    d["gemm"]='dgemm_'
    if not d["type"]=="double":d["gemm"]='sgemm_'
    # --- template dispatch, in priority order ---
    if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
        if self.verbose:
            _debug("return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version")
        return _conv_op_code_a % d
    if self.unroll_patch:
        if self.verbose:
            _debug("return unroll patch version. all_shape=", all_shape)
        return _conv_op_code_unroll_patch%d
    if self.unroll_batch>0 or self.unroll_kern>0:
        if self.unroll_batch<=0: self.unroll_batch=1
        if self.unroll_kern<=0: self.unroll_kern=1
        if self.verbose:
            _debug("return unrolled batch (%s) and kern code (%s)",
                   str(self.unroll_batch), str(self.unroll_kern))
        return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
                                               self.unroll_kern)
    #TODO: should we choose the unroll size automatically with the bigger divisor under 5?
    if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
        if self.verbose:
            _debug("return gemm version")
        return _conv_op_code_valid_gemm % d
    else:
        if self.verbose:
            _debug("return no gemm version")
        return _conv_op_code_a % d
_conv_op_code_a = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im_phys[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_im_log[2]={%(self_imshp_logical_r)s,%(self_imshp_logical_c)s};
npy_intp dim_ker_phys[2]={%(self_kshp0)s,%(self_kshp1)s};
npy_intp dim_ker_log[2]={%(self_kshp_logical_r)s,%(self_kshp_logical_c)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
std:stringstream temp;
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
if (!PyArray_ISCONTIGUOUS(filtersflipped)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size));
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size));
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
/// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s; //row position in logical output image
int new_m; //row anchor in logical input image (we will loop upward from here)
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker_log[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s; // current col position in logical output image
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) { // loop over logical rows in kernel
int ind0_log = (new_m-j_log); // ind0_log: row position in logical input image
if ((j_log < %(self_kshp_logical_offset_r)s) || (j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s)
continue;
if (ind0_log MOD %(self_imshp_logical_stride_r)s)
continue;
int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) / %(self_kshp_logical_stride_r)s);
int ind0_phys = (ind0_log / %(self_imshp_logical_stride_r)s);
//std::cerr <<"j_log" << j_log << " j_phys " << j_phys << " " << ind0_phys << "\\n";
if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]]; //This is a pointer to the current row of the kernel
if(ind0_log < 0 || ind0_log >= dim_im_log[0]){
// the current row of the kernel is off the image
}else{
int k = max((int)(pos_n-dim_im_log[1])+1,0);
int max_k=min(pos_n+1,(int)dim_ker_log[1]);
const %(type)s * idx_in=&in[ind0_phys*dim_im_phys[1]];
for (int ind1_log=pos_n-k; k<max_k; k++,ind1_log--) {
if (1)
{
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s)
continue;
if (ind1_log MOD %(self_imshp_logical_stride_c)s)
continue;
}
sum+= idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s] * idx_in[ind1_log / %(self_imshp_logical_stride_c)s];
}
}
}else{
const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
int new_n = (pos_n+dim_ker_log[1]-1);
if (%(self_imshp_logical_stride_c)s != 1) // a general loop
{
for (int k=0,last=new_n; k < dim_ker_log[1]; k++,last--) {
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s)
continue;
else if (last MOD %(self_imshp_logical_stride_c)s)
continue;
else
{
sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s]*idx_in[last/%(self_imshp_logical_stride_c)s];
}
}
}
else // self_imshp_stride_c == 1
{
int offset = %(self_kshp_logical_offset_c)s;
int k_phys=0;
for (int k_log=offset,last=new_n-offset; k_log < dim_ker_log[1]; ) {
sum += idx_hvals[k_phys]*idx_in[last];
++k_phys;
last -= %(self_kshp_logical_stride_c)s;
k_log += %(self_kshp_logical_stride_c)s;
}
}
}
}//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
}//for stack_size
if (0 && (mode==FULL)){
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
std::cout << " " << out[i];
std::cout << "\\n";
}
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
#########
######### ConvOp c_code for valid mode (uses gemm)
#########
_conv_op_code_valid_gemm = """
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *img2d_arr=NULL;
const int NKERN = %(self_nkern)s;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig;
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
std:stringstream temp;
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s;
}
if (NKERN != kerns_dim[0])
{
PyErr_SetString(PyExc_NotImplementedError, "nonsense nkern");
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) {
PyErr_SetString(PyExc_ValueError, "Null argument img2d");
%(fail)s;
}
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
Os[0] = dim_im[0]-dim_ker[0]+1;
Os[1] = dim_im[1]-dim_ker[1]+1;
// allocate a temporary buffer for storing the inner product of each nth kernel row
// with each row of an image
{
%(type)s * kbuf = (%(type)s *)malloc((Os[0] * NKERN + PyArray_Size((PyObject*)%(filtersflipped)s))* (npy_intp)sizeof(%(type)s));
int kbufstride = NKERN;
%(type)s * myfilters = kbuf + Os[0] * NKERN;
//copy out filtersflipped into filters un-flipped format
//std::cerr << "__filling myfilters__\\n";
for(int i=0;i < kerns_dim[0];++i){
for(int j=0;j < kerns_dim[1];++j){
for(int k=0;k < kerns_dim[2];++k){
for(int l=0;l < kerns_dim[3];++l){
%(type)s * ff = ((%(filtersflipped)s)->nd == 3)
? (%(type)s *)PyArray_GETPTR3(%(filtersflipped)s, i, kerns_dim[2]-1-k, kerns_dim[3]-1-l)
: (%(type)s *)PyArray_GETPTR4(%(filtersflipped)s, i, j, kerns_dim[2]-1-k, kerns_dim[3]-1-l);
myfilters[i * (kerns_dim[1]*kerns_dim[2]*kerns_dim[3])
+ j * (kerns_dim[2]*kerns_dim[3])
+ k * (kerns_dim[3])
+ l] = ff[0];
//std::cerr << " " << ff[0];
}
//std::cerr << "\\n";
}
//std::cerr << "(end of stack/batch " <<j << "/" << i << " ) \\n";
}
}
new_filter_shape = tensor.join(0, tensor.stack(sym_nkern,1), filters.shape[-2:])
filters4D = tensor.reshape(filters, new_filter_shape, ndim=4)
//std::cerr << "-----new loop ----\\n";
for(int b=0;b< %(self_bsize)s;b++){
for (int img_col = 0; img_col < Os[1]; ++img_col){
for (int filter_row = 0; filter_row < kerns_dim[2]; ++filter_row){
for (int stackidx = 0; stackidx < %(self_imshp0)s; ++stackidx){
%(type)s * img_colview =
(%(type)s *)(PyArray_GETPTR4(img2d, b, stackidx, filter_row, img_col));
%(type)s * filter_rows = myfilters + stackidx * (kerns_dim[2]*kerns_dim[3]) +
filter_row * kerns_dim[3];
//std::cerr << "filterview offset: " << filter_rows - myfilters << "\\n";
char N = 'N'; char T = 'T';
int Nz0 = Os[0];
int Nz1 = NKERN;
int K = kerns_dim[3];
%(type)s alpha = 1.0;
%(type)s beta = stackidx ? 1.0 : 0.0;
int imgview_stride = dim_im[1];
int filter_rows_stride =kerns_dim[1]*kerns_dim[2]*kerns_dim[3];
//remember, Fortran wants a column-major interpretation
assert(img2d->strides[3] == (npy_intp)sizeof(%(type)s));
if (0){
std::cerr << "b " << b << " img_col " << img_col << " filterrow " << filter_row << " stackidx " <<stackidx << "\\n";
std::cerr << "colview (physical layout) stride: " << imgview_stride << "\\n";
for (int ii = 0; ii < Nz0; ++ii){
for (int jj = 0; jj < K; ++jj){
std::cerr << " " << img_colview[ii * imgview_stride + jj];
}
std::cerr << "\\n";
}
std::cerr << "filterview ("<<filter_row<<"'th rows) stride: " << filter_rows_stride << "\\n";
for (int ii = 0; ii < Nz1; ++ii){
for (int jj = 0; jj < K; ++jj){
std::cerr << " " << filter_rows[ii * filter_rows_stride + jj];
}
std::cerr << "\\n";
}
std::cerr << Nz1 << " " << Nz0 << " " << K << "\\n" ;
}
%(gemm)s(&T, &N,
&Nz1, &Nz0, &K,
&alpha,
filter_rows, &filter_rows_stride,
img_colview, &imgview_stride,
&beta, kbuf, &kbufstride);
if (0){
std::cerr << "z (logical layout) beta" << beta << "\\n";
for (int ii = 0; ii < Nz0; ++ii){
for (int jj = 0; jj < Nz1; ++jj){
std::cerr << " " << kbuf[ii * kbufstride + jj];
}
std::cerr << "\\n";
}
}
}
// now kbuf the sum over the stack, put it into the outbuf
for (int img_row = 0; img_row < Os[0]; ++img_row) {
for (int kernel_idx = 0; kernel_idx < NKERN; ++kernel_idx) {
%(type)s * z_p = (%(type)s *)PyArray_GETPTR4(%(z)s, b, kernel_idx, img_row, img_col);
if (0)
{
if (b >= %(z)s->dimensions[0]) %(fail)s;
if (kernel_idx >= %(z)s->dimensions[1]) %(fail)s;
if (img_row >= %(z)s->dimensions[2]) %(fail)s;
if (img_col >= %(z)s->dimensions[3]) %(fail)s;
}
z_p[0] += kbuf[img_row * kbufstride + kernel_idx];
}
}
}
}
}
free(kbuf);
}
Py_XDECREF(img2d);
"""
def gen_conv_code_unroll_batch_kern(d,unroll_bsize=1, unroll_ksize=1):
    """Build the C code for ConvOp with the batch and kernel loops unrolled.

    :param d: template-substitution dict prepared by ``ConvOp.c_code``;
        it is copied, not mutated.
    :param unroll_bsize: number of batch iterations handled per loop step.
    :param unroll_ksize: number of kernel iterations handled per loop step.
    :returns: the complete C code string (already %-substituted).
    """
    assert unroll_bsize>0 and unroll_ksize>0
    # The unroll_* keys are reserved for the duplication helpers below.
    if d.has_key("unroll_bsize") or d.has_key("unroll_ksize") or d.has_key("unroll_iter") or d.has_key("unroll_biter") or d.has_key("unroll_kiter"):
        raise Exception("We can't use this dictionnary as we will overwrite some of its containt")
    d=d.copy()  # don't mutate the caller's dict
    d["unroll_bsize"]=unroll_bsize
    d["unroll_ksize"]=unroll_ksize
    def my_dup(st,size):
        # Emit `st` `size` times, substituting unroll_iter = 0..size-1.
        s=""
        for i in range(size):
            d["unroll_iter"]=i
            s+=st%d
        return s+"\n"
    def my_dup2(st):
        # Emit `st` once per (batch, kernel) unroll pair, with unroll_biter,
        # unroll_kiter and a flat unroll_iter counter substituted.
        s=""
        iter=0
        for i in range(unroll_bsize):
            d["unroll_biter"]=i
            for j in range(unroll_ksize):
                d["unroll_kiter"]=j
                d["unroll_iter"]=iter
                iter+=1
                s+=st%d
        return s+"\n"
    # --- fixed preamble: input validation, reshaping to 4D, output alloc ---
    ret = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(%(img2d)s->nd==2){
  img2d_dim[3]=%(img2d)s->dimensions[1];
  img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
  img2d_dim[3]=%(img2d)s->dimensions[2];
  img2d_dim[2]=%(img2d)s->dimensions[1];
  img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
  img2d_dim[3]=%(img2d)s->dimensions[3];
  img2d_dim[2]=%(img2d)s->dimensions[2];
  img2d_dim[1]=%(img2d)s->dimensions[1];
  img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
  std:stringstream temp;
  temp << "nddim="<<%(img2d)s->nd;
  std::string param = temp.str();
  PyErr_SetString(PyExc_ValueError,
    ("img don't have a good shape. " + param).c_str());
  %(fail)s;
}
if(%(filtersflipped)s->nd==3){
  kerns_dim[3]=%(filtersflipped)s->dimensions[2];
  kerns_dim[2]=%(filtersflipped)s->dimensions[1];
  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
  kerns_dim[3]=%(filtersflipped)s->dimensions[3];
  kerns_dim[2]=%(filtersflipped)s->dimensions[2];
  kerns_dim[1]=%(filtersflipped)s->dimensions[1];
  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
  PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
  %(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
    Py_DECREF(img2d);
    img2d = contig;
    if (!PyArray_ISCONTIGUOUS(img2d)){
        PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
        %(fail)s;
    }
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
     || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
    Py_DECREF(filtersflipped);
    filtersflipped = contig;
    if (!PyArray_ISCONTIGUOUS(filtersflipped)){
        PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
        %(fail)s;
    }
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
  PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
  || *PyArray_DIMS(%(z)s)!=4
  ||(%(z)s->dimensions[0] != %(self_bsize)s)
  ||(%(z)s->dimensions[1] != %(self_nkern)s)
  ||(%(z)s->dimensions[2] != dim_zz[0])
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
{
  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
  dims[2]=dim_zz[0];
  dims[3]=dim_zz[1];
  %(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
  //PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
    //assertions
    if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
    if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
    if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
    if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
"""%d
    # One output pointer and zero-fill per unrolled (batch, kernel) pair.
    ret+=my_dup2("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));")
    ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;",unroll_bsize*unroll_ksize)
    ret+="""
    for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""%d
    # Per-batch input pointers and per-kernel filter pointers.
    ret+=my_dup("const %(type)s * __restrict__ in%(unroll_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unroll_iter)s,stack_size));", unroll_bsize)
    ret+=my_dup("const %(type)s * __restrict__ hvals%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unroll_iter)s,stack_size));",unroll_ksize)
    ret+="""
      int new_m;
      for (int iter_m=0; iter_m < Os[0]; iter_m++) {
        // Reposition index into input image based on requested output size
        int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
        if (mode == FULL) new_m = pos_m ;
        else new_m = (pos_m+dim_ker[0]-1);
        for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns
          int pos_n=iter_n*%(self_dy)s;
"""%d
    # One accumulator per unrolled (batch, kernel) pair.
    ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
    ret+="""
          // Sum over kernel, if index into image is out of bounds
          // fill with the value
          for (int j=0; j < dim_ker[0]; j++) {
            int ind0 = (new_m-j);
            if(mode==FULL){
"""%d
    ret+=my_dup("const %(type)s * idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
    ret+="""
              if(ind0 < 0 || ind0 >= dim_im[0]){
                if(fill_value!=0)
                  for (int k=0; k < dim_ker[1]; k++) {
"""%d
    ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret+="""
                  }
              }else{
                //do the part where kernel is to the right of the img
                int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
                if(fill_value!=0){
                  for(k=0;k<max_k;k++){
"""%d
    ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret+="""
                  }
                }else {k=max_k;}
                //do the part where the kernel is on the img
                max_k=min(pos_n+1,(int)dim_ker[1]);
"""%d
    ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+="""
                for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
"""%d
    ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
    ret+="""
                }
                //do the part to the left of the img
                if(fill_value!=0)
                  for(;k<dim_ker[1];k++){
"""%d
    ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret+="""
                  }
              }
            }else{//valid mode
"""%d
    ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
    ret+="""
              int new_n = (pos_n+dim_ker[1]-1);
              for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d
    ret+=my_dup2("sum%(unroll_iter)s+=idx_hvals%(unroll_kiter)s[k]*idx_in%(unroll_biter)s[last];")
    ret+="""
              }
            }
          }//for j
"""%d
    # Write back each accumulator (= or += depending on d["affectation"]).
    ret+=my_dup("out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
    ret+="""
        }//for n
      }//for m
    }//for stack_size
  }//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
    return ret
_conv_op_code_unroll_patch = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;//only value of 0 are currently tested and correctly implemented
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
const npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
const npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
%(dim_zz_const)s npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
#if !%(all_shape)s
if (mode == FULL) {
dim_zz[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));
} else {
dim_zz[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));
}
#endif
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_Format(PyExc_ValueError,
"image don't have a good number of dimensions %%d. ", %(filtersflipped)s->nd);
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
PyErr_Format(PyExc_ValueError,
"kernel don't have a good number of dimensions %%d. ", %(filtersflipped)s->nd);
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
if (!PyArray_ISCONTIGUOUS(filtersflipped)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
if(dim_zz[0]<=0 || dim_zz[1]<=0){
PyErr_Format(PyExc_ValueError,
"Output dimensions are not valid %%dx%%d",dim_zz[0],dim_zz[1]);
%(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size));
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size));
int new_m;
for (int iter_m=0; iter_m < dim_zz[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker[0]-1);
### perform actual convolution ###
op = conv.ConvOp(output_mode=border_mode,
dx=subsample[0], dy=subsample[1],
imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize,**kargs)
for (int iter_n=0; iter_n < dim_zz[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
%(type)s sum2=0;
%(type)s sum3=0;
%(type)s sum4=0;
int nb_sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int ind0 = (new_m-j);
output = op(input4D, filters4D)
if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j*dim_ker[1]];
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker[1]; k++) {
sum+= idx_hvals[k] * fill_value;
}
}else{
//do the part where kernel is to the right of the img
int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){
for(k=0;k<max_k;k++){
sum+= idx_hvals[k]*fill_value;
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker[1]);
const %(type)s * idx_in=&in[ind0*dim_im[1]];
# flatten to 3D tensor if convolving with single filter or single image
if input.ndim==2 or filters.ndim==2:
output = tensor.flatten(output.T, outdim=3).T
if(iter_n + 4*%(self_dy)s < dim_zz[1]
&& iter_n>dim_ker[1]-1+3
&& iter_n<dim_im[1]-dim_ker[1]+1-3){
nb_sum=4;
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[ind1+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[ind1+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < dim_zz[1]
&& iter_n>dim_ker[1]-1
&& iter_n<dim_im[1]-dim_ker[1]+1){
nb_sum=2;
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
}
}else{
nb_sum=1;
/*
%(type)s sum_=0;
if((k-max_k) & 0x1 != 0){
sum+= idx_hvals[k] * idx_in[pos_n-k];
}
for (int ind1=pos_n-k; k<max_k; k+=2,ind1-=2) {
sum+= idx_hvals[k] * idx_in[ind1];
sum_+= idx_hvals[k+1] * idx_in[ind1-1];
}
sum+=sum_;
*/
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
}
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker[1];k++) sum+= idx_hvals[k]*fill_value;
}
}else{//valid mode
const %(type)s* idx_in=&in[ind0*dim_im[1]];
const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
if(iter_n + 4*%(self_dy)s < dim_zz[1]){
nb_sum=4;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[im_idx+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[im_idx+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < dim_zz[1]){
nb_sum=2;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
}
}else{
nb_sum=1;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
}
}
}//else valid mode
}//for j
switch(nb_sum){
case 4: out[iter_m*dim_zz[1]+iter_n+3] %(affectation)s sum4;
case 3: out[iter_m*dim_zz[1]+iter_n+2] %(affectation)s sum3;
case 2: out[iter_m*dim_zz[1]+iter_n+1] %(affectation)s sum2;
case 1: out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}
iter_n+=nb_sum-1;
}//for iter_n
}//for iter_m
}//for stack_size
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
return output
......@@ -15,13 +15,13 @@ def max_pool2D(input, ds, ignore_border=False):
Takes as input a N-D tensor, where N >= 2. It downscales the input image by
the specified factor, by keeping only the maximum value of non-overlapping
patches of size (ds[0],ds[1])
:type input: N-D theano tensor of input images.
:param input: input images. Max pooling will be done over the 2 last dimensions.
:type ds: tuple of length 2
:param ds: factor by which to downscale. (2,2) will halve the image in each
dimension.
:param ignore_border: boolean value. When True, (5,5) input with ds=(2,2)
will generate a (2,2) output. (3,3) otherwise.
:param ds: factor by which to downscale. (2,2) will halve the image in each dimension.
:param ignore_border: boolean value. When True, (5,5) input with ds=(2,2) will generate a
(2,2) output. (3,3) otherwise.
"""
if input.ndim < 2:
raise NotImplementedError('max_pool2D requires a dimension >= 2')
......
......@@ -11,169 +11,78 @@ from theano.tensor.signal import conv
from theano.tensor.basic import _allclose
class TestConv2D(unittest.TestCase):
class TestSignalConv2D(unittest.TestCase):
def setUp(self):
utt.seed_rng()
self.input = T.dtensor4('input')
self.filters = T.dtensor4('filters')
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1,1),
N_image_shape=None, N_filter_shape=None,
input=None, filters=None,
unroll_batch=0, unroll_kern=0, unroll_patch=True,
verify_grad=True):
if N_image_shape is None:
N_image_shape = image_shape
if N_filter_shape is None:
N_filter_shape = filter_shape
if not input:
input = self.input
if not filters:
filters = self.filters
def validate(self, image_shape, filter_shape, verify_grad=True):
image_dim = len(image_shape)
filter_dim = len(filter_shape)
input = T.TensorType('float64', [False]*image_dim)()
filters = T.TensorType('float64', [False]*filter_dim)()
bsize = image_shape[0] if image_dim==3 else 1
nkern = filter_shape[0] if filter_dim==3 else 1
############# THEANO IMPLEMENTATION ############
# we create a symbolic function so that verify_grad can work
def sym_conv2d(input, filters):
# define theano graph and function
return conv.conv2d(input, filters, image_shape, filter_shape,
border_mode, subsample, unroll_batch=unroll_batch,
unroll_kern=unroll_kern, unroll_patch=unroll_patch)
return conv.conv2d(input, filters)
output = sym_conv2d(input, filters)
theano_conv = theano.function([input, filters], output)
# initialize input and compute result
image_data = numpy.random.random(N_image_shape)
filter_data = numpy.random.random(N_filter_shape)
image_data = numpy.random.random(image_shape)
filter_data = numpy.random.random(filter_shape)
theano_output = theano_conv(image_data, filter_data)
############# REFERENCE IMPLEMENTATION ############
s = 1. if border_mode is 'full' else -1.
out_shape2d = numpy.array(N_image_shape[-2:]) +\
s*numpy.array(N_filter_shape[-2:]) - s
out_shape2d = numpy.ceil(out_shape2d / numpy.array(subsample))
out_shape = (N_image_shape[0],N_filter_shape[0]) + tuple(out_shape2d)
ref_output = numpy.zeros(out_shape)
out_shape2d = numpy.array(image_shape[-2:]) -\
numpy.array(filter_shape[-2:]) + 1
ref_output = numpy.zeros(tuple(out_shape2d))
# loop over output feature maps
for k in range(N_filter_shape[0]):
# loop over input feature maps
for l in range(N_filter_shape[1]):
# reshape as 3D input tensors to make life easier
image_data3d = image_data.reshape((bsize,)+image_shape[-2:])
filter_data3d = filter_data.reshape((nkern,)+filter_shape[-2:])
# reshape theano output as 4D to make life easier
theano_output4d = theano_output.reshape((bsize,nkern,)+theano_output.shape[-2:])
filter2d = filter_data[k,l,:,:]
# loop over mini-batches (if required)
for b in range(bsize):
# loop over mini-batches
for b in range(N_image_shape[0]):
image2d = image_data[b,l,:,:]
output2d = signal.convolve2d(image2d, filter2d, border_mode)
# loop over filters (if required)
for k in range(nkern):
ref_output[b,k,:,:] +=\
output2d[::subsample[0],::subsample[1]]
image2d = image_data3d[b,:,:]
filter2d = filter_data3d[k,:,:]
output2d = signal.convolve2d(image2d, filter2d, 'valid')
self.failUnless(_allclose(theano_output, ref_output))
self.failUnless(_allclose(theano_output4d[b,k,:,:], output2d))
############# TEST GRADIENT ############
if verify_grad:
utt.verify_grad(sym_conv2d, [image_data, filter_data])
def test_basic(self):
"""
Tests that basic convolutions work for odd and even dimensions of image and filter
shapes, as well as rectangular images and filters.
"""
self.validate((3,2,8,8), (4,2,5,5), 'valid')
self.validate((3,2,7,5), (5,2,2,3), 'valid')
self.validate((3,2,7,5), (5,2,3,2), 'valid')
self.validate((3,2,8,8), (4,2,5,5), 'full')
self.validate((3,2,7,5), (5,2,2,3), 'full')
# test filter same size as input
self.validate((3,2,3,3), (4,2,3,3), 'valid')
def test_unroll_patch_false(self):
"""
unroll_patch is True by default. Test basic convs with False.
"""
self.validate((3,2,7,5), (5,2,2,3), 'valid', unroll_patch=False)
self.validate((3,2,7,5), (5,2,2,3), 'full', unroll_patch=False)
self.validate((3,2,3,3), (4,2,3,3), 'valid', unroll_patch=False)
def test_unroll_special(self):
"""
(unroll_kern, unroll_batch) in (0,1),(1,0) is special case.
"""
self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=1)
def test_unroll_batch(self):
"""
Test mini-batch unrolling for various legal values.
"""
# mini-batch of size 6 is multiple of 2 and 3. Should work.
self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=2, verify_grad=False)
self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=3, verify_grad=False)
def test_unroll_kern(self):
"""
Test kernel unrolling for various legal values.
"""
# 6 filters is a multiple of 2 and 3. Should work.
self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=2, verify_grad=False)
self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=3, verify_grad=False)
def test_subsample(self):
"""
Tests convolution where subsampling != (1,1)
"""
self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,2))
self.validate((3,2,7,5), (5,2,2,3), 'full', subsample=(2,2))
self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,1))
def test_invalid_filter_shape(self):
"""
Tests scenario where filter_shape[1] != input_shape[1]
"""
def f():
self.validate((3,2,8,8), (4,3,5,5), 'valid')
self.failUnlessRaises(AssertionError, f)
def test_missing_info(self):
"""
Test convolutions for various pieces of missing info.
"""
self.validate(None, None,
N_image_shape=(3,2,8,8),
N_filter_shape=(4,2,5,5))
self.validate((3,2,None,None), None,
N_image_shape=(3,2,8,8),
N_filter_shape=(4,2,5,5))
self.validate((None,2,None,None), (None,2,5,5),
N_image_shape=(3,2,8,8),
N_filter_shape=(4,2,5,5))
def test_full_mode(self):
"""
Tests basic convolution in full mode and case where filter
is larger than the input image.
Basic functionality of nnet.conv.ConvOp is already tested by its own test suite. We
just have to test whether or not signal.conv.conv2d can support inputs and filters of
type matrix or tensor3.
"""
self.validate((3,2,5,5), (4,2,8,8), 'full')
def f():
self.validate((3,2,5,5), (4,2,8,8), 'valid')
self.failUnlessRaises(Exception, f)
self.validate((3,7,5), (5,2,3), 'valid')
self.validate((7,5), (5,2,3), 'valid')
self.validate((3,7,5), (2,3), 'valid')
self.validate((7,5), (2,3), 'valid')
def test_wrong_input(self):
def test_fail(self):
"""
Make sure errors are raised when image and kernel are not 4D tensors
Test that conv2d fails for dimensions other than 2 or 3.
"""
try:
self.validate((3,2,8,8), (4,2,5,5), 'valid', input = T.dmatrix())
self.validate((3,2,8,8), (4,2,5,5), 'valid', filters = T.dvector())
self.validate((3,2,8,8), (4,2,5,5), 'valid', input = T.dtensor3())
# should never reach here
conv.conv2d(T.dtensor4(), T.dtensor3())
conv.conv2d(T.dtensor3(), T.dvector())
self.fail()
except:
except:
pass
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论