提交 aaea1d32 authored 作者: Frederic Bastien's avatar Frederic Bastien

fix white space.

上级 a04be875
...@@ -26,7 +26,7 @@ try: ...@@ -26,7 +26,7 @@ try:
imported_scipy_signal = True imported_scipy_signal = True
except ImportError: except ImportError:
pass pass
_logger=logging.getLogger("theano.signal.conv") _logger=logging.getLogger("theano.signal.conv")
def _debug(*msg): def _debug(*msg):
...@@ -51,13 +51,13 @@ def conv2d(input, filters, image_shape=None, filter_shape=None, ...@@ -51,13 +51,13 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
:param border_mode: :param border_mode:
'valid'-- only apply filter to complete patches of the image. Generates 'valid'-- only apply filter to complete patches of the image. Generates
output of shape: image_shape - filter_shape + 1 output of shape: image_shape - filter_shape + 1
'full' -- zero-pads image to multiple of filter shape to generate output of 'full' -- zero-pads image to multiple of filter shape to generate output of
shape: image_shape + filter_shape - 1 shape: image_shape + filter_shape - 1
:type subsample: tuple of len 2 :type subsample: tuple of len 2
:param subsample: factor by which to subsample the output :param subsample: factor by which to subsample the output
:type image_shape: tuple of len 4 of int or Constant variable :type image_shape: tuple of len 4 of int or Constant variable
:param image_shape: (batch size, stack size, nb row, nb col) :param image_shape: (batch size, stack size, nb row, nb col)
Optional, used for optimization. Optional, used for optimization.
...@@ -68,7 +68,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None, ...@@ -68,7 +68,7 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
:param kwargs: kwargs are passed onto ConvOp. Can be used to set the following: :param kwargs: kwargs are passed onto ConvOp. Can be used to set the following:
unroll_batch, unroll_kern, unroll_patch (see ConvOp doc) unroll_batch, unroll_kern, unroll_patch (see ConvOp doc)
:rtype: symbolic 4D tensor :rtype: symbolic 4D tensor
:return: set of feature maps generated by convolutional layer. Tensor is of shape :return: set of feature maps generated by convolutional layer. Tensor is of shape
(batch size, nb filters, output row, output col) (batch size, nb filters, output row, output col)
""" """
...@@ -89,11 +89,11 @@ def conv2d(input, filters, image_shape=None, filter_shape=None, ...@@ -89,11 +89,11 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
filter_shape[i] = int(filter_shape[i]) filter_shape[i] = int(filter_shape[i])
if image_shape and filter_shape: if image_shape and filter_shape:
try: try:
assert image_shape[1]==filter_shape[1] assert image_shape[1]==filter_shape[1]
except: except:
print 'image ', image_shape, ' filters ', filter_shape print 'image ', image_shape, ' filters ', filter_shape
raise raise
if filter_shape is not None: if filter_shape is not None:
nkern = filter_shape[0] nkern = filter_shape[0]
...@@ -117,25 +117,25 @@ class ConvOp(Op): ...@@ -117,25 +117,25 @@ class ConvOp(Op):
""" """
This Op serves a dual purpose: it can implement a vanilla 2D convolution This Op serves a dual purpose: it can implement a vanilla 2D convolution
(as taught in any signal processing class) or implement the (as taught in any signal processing class) or implement the
convolutional layers found in Convolutional Neural Networks. convolutional layers found in Convolutional Neural Networks.
In this setting, a set of 3D images is convolved with a set of 3D kernels, In this setting, a set of 3D images is convolved with a set of 3D kernels,
with the particularity that their leading dimensions are of equal length. with the particularity that their leading dimensions are of equal length.
Vanilla 2D convolution is treated as a special case of this. Vanilla 2D convolution is treated as a special case of this.
The input parameter represents a mini-batch of multiple images. Its shape is: The input parameter represents a mini-batch of multiple images. Its shape is:
batch size x num. input feature maps x image height x image width batch size x num. input feature maps x image height x image width
The kernel parameter represents a set of 3D kernels. Its shape is: The kernel parameter represents a set of 3D kernels. Its shape is:
number of filters x num. input images x filter height x filter width number of filters x num. input images x filter height x filter width
The output of ConvOp is a 4D tensor, generated as follows: The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the convolution where b is the mini-batch index, k the filter index and * is the convolution
operator. operator.
""" """
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode', __attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
'unroll_batch', 'unroll_kern', 'unroll_patch', 'unroll_batch', 'unroll_kern', 'unroll_patch',
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned'] 'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for given inputs""" """These attributes uniquely identify the behaviour of this op for given inputs"""
...@@ -203,7 +203,7 @@ class ConvOp(Op): ...@@ -203,7 +203,7 @@ class ConvOp(Op):
speed_unroll_patch_noshape=[2.0109100341796875, 5.8175678253173828] speed_unroll_patch_noshape=[2.0109100341796875, 5.8175678253173828]
#valid time, full time #valid time, full time
speed_unroll_patch_shape=[1.2967290878295898, 5.5283889770507812] speed_unroll_patch_shape=[1.2967290878295898, 5.5283889770507812]
def c_compile_args(self): def c_compile_args(self):
#when kshp==(1,1), gcc 4.3.0 segfaults during compilation with -O3. #when kshp==(1,1), gcc 4.3.0 segfaults during compilation with -O3.
#This doesn't happen at -O2 #This doesn't happen at -O2
...@@ -223,7 +223,7 @@ class ConvOp(Op): ...@@ -223,7 +223,7 @@ class ConvOp(Op):
""" """
Computes the output dimensions of convolving an image of shape "inshp" Computes the output dimensions of convolving an image of shape "inshp"
with kernels of shape "kshp". with kernels of shape "kshp".
:param inshp: (rows,cols) of input image :param inshp: (rows,cols) of input image
:param kshp: (rows,cols) of filters :param kshp: (rows,cols) of filters
:param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc) :param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
...@@ -236,7 +236,7 @@ class ConvOp(Op): ...@@ -236,7 +236,7 @@ class ConvOp(Op):
numpy.array([dx,dy], dtype='float'))) numpy.array([dx,dy], dtype='float')))
def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None, def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
dx=None, dy=None, dx=None, dy=None,
output_mode='valid', output_mode='valid',
...@@ -269,7 +269,7 @@ class ConvOp(Op): ...@@ -269,7 +269,7 @@ class ConvOp(Op):
For optimizing other architectures, see: For optimizing other architectures, see:
Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
Software, May 2008. Software, May 2008.
Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc. Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.
:type output_mode: string :type output_mode: string
...@@ -325,7 +325,7 @@ class ConvOp(Op): ...@@ -325,7 +325,7 @@ class ConvOp(Op):
if (unroll_batch>0 or unroll_kern>0) and not all_shape: if (unroll_batch>0 or unroll_kern>0) and not all_shape:
raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed") raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed")
if not all_shape: if not all_shape:
unroll_patch = True unroll_patch = True
...@@ -419,7 +419,7 @@ class ConvOp(Op): ...@@ -419,7 +419,7 @@ class ConvOp(Op):
if not self.out_mode in ["valid", "full"]: if not self.out_mode in ["valid", "full"]:
raise Exception("Mode %s not implemented"%self.out_mode) raise Exception("Mode %s not implemented"%self.out_mode)
if all_shape and not (self.outshp > 0).all(): if all_shape and not (self.outshp > 0).all():
raise Exception(("Bad size for the output shape. Verify that [post-"\ raise Exception(("Bad size for the output shape. Verify that [post-"\
"supersampling] input shape (%s) and kern shape(%s) are ok. "\ "supersampling] input shape (%s) and kern shape(%s) are ok. "\
...@@ -501,8 +501,8 @@ class ConvOp(Op): ...@@ -501,8 +501,8 @@ class ConvOp(Op):
for out_col in range(self.outshp[0]):#loop over output col for out_col in range(self.outshp[0]):#loop over output col
for row in range(self.kshp[0]):#loop over kern row for row in range(self.kshp[0]):#loop over kern row
if (row+out_row-self.kshp[0]+1<0 or if (row+out_row-self.kshp[0]+1<0 or
row+out_row-self.kshp[0]+1>=self.imshp[1]): row+out_row-self.kshp[0]+1>=self.imshp[1]):
continue continue
col=0 col=0
...@@ -516,9 +516,9 @@ class ConvOp(Op): ...@@ -516,9 +516,9 @@ class ConvOp(Op):
while col < max_col: #loop over kern col while col < max_col: #loop over kern col
self.flops+=2 self.flops+=2
col+=1 col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0] self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
assert self.flops == self.bsize * self.nkern * self.imshp[0] * \ assert self.flops == self.bsize * self.nkern * self.imshp[0] * \
self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2 self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2
...@@ -545,7 +545,7 @@ class ConvOp(Op): ...@@ -545,7 +545,7 @@ class ConvOp(Op):
bcastable23 = [False, False] bcastable23 = [False, False]
output = tensor.tensor(dtype=_inputs.type.dtype, output = tensor.tensor(dtype=_inputs.type.dtype,
broadcastable=[_inputs.broadcastable[0], broadcastable=[_inputs.broadcastable[0],
_kerns.broadcastable[0]]+bcastable23); _kerns.broadcastable[0]]+bcastable23);
return Apply(self, [_inputs, _kerns], [output]) return Apply(self, [_inputs, _kerns], [output])
...@@ -568,7 +568,7 @@ class ConvOp(Op): ...@@ -568,7 +568,7 @@ class ConvOp(Op):
except TypeError: except TypeError:
raise NotImplementedError() raise NotImplementedError()
outshp = (batch_size,fmo) + tuple(fmshp) outshp = (batch_size,fmo) + tuple(fmshp)
return [outshp] return [outshp]
else: else:
# Haven't implemented this case. imshp and kshp may be symbolic # Haven't implemented this case. imshp and kshp may be symbolic
# and ConvOp.getOutputShape doesn't handle this. In this case # and ConvOp.getOutputShape doesn't handle this. In this case
...@@ -583,7 +583,7 @@ class ConvOp(Op): ...@@ -583,7 +583,7 @@ class ConvOp(Op):
raise theano.gof.utils.MethodNotDefined( raise theano.gof.utils.MethodNotDefined(
"c_headers", type(self), self.__class__.__name__, "c_headers", type(self), self.__class__.__name__,
"Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.") "Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.")
# TODO: move these back out to global scope when they no longer cause an atexit error # TODO: move these back out to global scope when they no longer cause an atexit error
imshp = self.imshp imshp = self.imshp
if imshp is None or any([x is None for x in imshp]): if imshp is None or any([x is None for x in imshp]):
...@@ -597,7 +597,7 @@ class ConvOp(Op): ...@@ -597,7 +597,7 @@ class ConvOp(Op):
nkern = self.nkern nkern = self.nkern
if nkern is None: if nkern is None:
nkern = filtersflipped.shape[0] nkern = filtersflipped.shape[0]
imshp_logical = self.imshp_logical imshp_logical = self.imshp_logical
if imshp_logical is None: if imshp_logical is None:
imshp_logical = imshp imshp_logical = imshp
...@@ -704,13 +704,13 @@ class ConvOp(Op): ...@@ -704,13 +704,13 @@ class ConvOp(Op):
if not all_shape and (self.dx!=1 or self.dy!=1): if not all_shape and (self.dx!=1 or self.dy!=1):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\ raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\
"the optional shape information") "the optional shape information")
####### Determine gradient on kernels ######## ####### Determine gradient on kernels ########
assert inputs.ndim==4 and kerns.ndim==4 assert inputs.ndim==4 and kerns.ndim==4
newin = inputs.dimshuffle((1,0,2,3)) newin = inputs.dimshuffle((1,0,2,3))
newgz = gz.dimshuffle((1,0,2,3)) newgz = gz.dimshuffle((1,0,2,3))
(bsize, nkern) = None, None (bsize, nkern) = None, None
imshp = None imshp = None
kshp = None kshp = None
...@@ -742,7 +742,7 @@ class ConvOp(Op): ...@@ -742,7 +742,7 @@ class ConvOp(Op):
raise NotImplementedError('Only [full,valid] modes are currently supported.') raise NotImplementedError('Only [full,valid] modes are currently supported.')
filters = filters[:,:,::-1,::-1] #flip them filters = filters[:,:,::-1,::-1] #flip them
if 0: #find good value for the unroll if 0: #find good value for the unroll
if all_shape and un_b!=0 and bsize%un_b!=0: if all_shape and un_b!=0 and bsize%un_b!=0:
...@@ -793,7 +793,7 @@ class ConvOp(Op): ...@@ -793,7 +793,7 @@ class ConvOp(Op):
####### Determine gradient on inputs ######## ####### Determine gradient on inputs ########
mode = 'valid' mode = 'valid'
if not self.out_mode == 'full': if not self.out_mode == 'full':
mode = 'full' mode = 'full'
filters = kerns.dimshuffle((1,0,2,3)) filters = kerns.dimshuffle((1,0,2,3))
...@@ -809,7 +809,7 @@ class ConvOp(Op): ...@@ -809,7 +809,7 @@ class ConvOp(Op):
imshp_logical=(self.nkern, self.fulloutshp[0], self.fulloutshp[1]) imshp_logical=(self.nkern, self.fulloutshp[0], self.fulloutshp[1])
if 0: # hard-code c generation parameters if 0: # hard-code c generation parameters
din = ConvOp(imshp, self.kshp, nkern, self.bsize, din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode, 1,1, output_mode=mode,
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
...@@ -817,7 +817,7 @@ class ConvOp(Op): ...@@ -817,7 +817,7 @@ class ConvOp(Op):
version=-1,#we we change the mode, we don't forward the version. version=-1,#we we change the mode, we don't forward the version.
verbose=self.verbose) verbose=self.verbose)
else: # let __init__ figure out the unrolling / patch sizes else: # let __init__ figure out the unrolling / patch sizes
din = ConvOp(imshp, self.kshp, nkern, self.bsize, din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode, 1,1, output_mode=mode,
unroll_batch=None, unroll_kern=None, unroll_patch=None, unroll_batch=None, unroll_kern=None, unroll_patch=None,
imshp_logical=imshp_logical, imshp_logical=imshp_logical,
...@@ -840,7 +840,7 @@ class ConvOp(Op): ...@@ -840,7 +840,7 @@ class ConvOp(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (4) return (4)
def c_support_code(self): def c_support_code(self):
return """ return """
#define STRIDES(arr) ((arr)->strides) #define STRIDES(arr) ((arr)->strides)
...@@ -878,12 +878,12 @@ using namespace std; ...@@ -878,12 +878,12 @@ using namespace std;
if self.use_blas(): if self.use_blas():
return tensor.blas.ldflags(libs=False, libs_dir=True) return tensor.blas.ldflags(libs=False, libs_dir=True)
return [] return []
def c_header_dirs(self): def c_header_dirs(self):
if self.use_blas(): if self.use_blas():
return tensor.blas.ldflags(libs=False, include_dir=True) return tensor.blas.ldflags(libs=False, include_dir=True)
return [] return []
def c_code(self, node, name, (img2d, filtersflipped), (z, ), sub): def c_code(self, node, name, (img2d, filtersflipped), (z, ), sub):
if node.inputs[0].type.dtype != node.inputs[1].type.dtype: if node.inputs[0].type.dtype != node.inputs[1].type.dtype:
raise NotImplementedError() raise NotImplementedError()
...@@ -953,7 +953,7 @@ using namespace std; ...@@ -953,7 +953,7 @@ using namespace std;
d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride
d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] - (self.kshp[1]*cstride) - 1+cstride) % cstride d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] - (self.kshp[1]*cstride) - 1+cstride) % cstride
del rstride, cstride del rstride, cstride
if node.inputs[0].type.dtype=="float32": d["type"]="float" if node.inputs[0].type.dtype=="float32": d["type"]="float"
elif node.inputs[0].type.dtype=="float64": d["type"]="double" elif node.inputs[0].type.dtype=="float64": d["type"]="double"
else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype) else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype)
...@@ -978,7 +978,7 @@ using namespace std; ...@@ -978,7 +978,7 @@ using namespace std;
return gen_conv_code_unroll_batch_kern(d, self.unroll_batch, return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
self.unroll_kern) self.unroll_kern)
#TODO: should we choose the unroll size automatically with the bigger divisor under 5? #TODO: should we choose the unroll size automatically with the bigger divisor under 5?
if self.out_mode == 'valid' and self.dx==0 and self.dy==0: if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
if self.verbose: if self.verbose:
_debug("return gemm version") _debug("return gemm version")
...@@ -1067,7 +1067,7 @@ img2d_arr = (PyArrayObject*)img2d; ...@@ -1067,7 +1067,7 @@ img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER); filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped; filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s)) if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){ || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped)); contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped); Py_DECREF(filtersflipped);
...@@ -1213,7 +1213,7 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1213,7 +1213,7 @@ for(int b=0;b< %(self_bsize)s;b++){
}//for m }//for m
}//for stack_size }//for stack_size
if (0 && (mode==FULL)){ if (0 && (mode==FULL)){
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
std::cout << " " << out[i]; std::cout << " " << out[i];
std::cout << "\\n"; std::cout << "\\n";
} }
...@@ -1224,7 +1224,7 @@ Py_XDECREF(filtersflipped); ...@@ -1224,7 +1224,7 @@ Py_XDECREF(filtersflipped);
""" """
######### #########
######### ConvOp c_code for valid mode (uses gemm) ######### ConvOp c_code for valid mode (uses gemm)
######### #########
...@@ -1293,7 +1293,7 @@ if (NKERN != kerns_dim[0]) ...@@ -1293,7 +1293,7 @@ if (NKERN != kerns_dim[0])
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER); img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d; img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s)) if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){ || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d)); contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d); Py_DECREF(img2d);
...@@ -1337,7 +1337,7 @@ int Os[2]; ...@@ -1337,7 +1337,7 @@ int Os[2];
Os[0] = dim_im[0]-dim_ker[0]+1; Os[0] = dim_im[0]-dim_ker[0]+1;
Os[1] = dim_im[1]-dim_ker[1]+1; Os[1] = dim_im[1]-dim_ker[1]+1;
// allocate a temporary buffer for storing the inner product of each nth kernel row // allocate a temporary buffer for storing the inner product of each nth kernel row
// with each row of an image // with each row of an image
{ {
%(type)s * kbuf = (%(type)s *)malloc((Os[0] * NKERN + PyArray_Size((PyObject*)%(filtersflipped)s))* (npy_intp)sizeof(%(type)s)); %(type)s * kbuf = (%(type)s *)malloc((Os[0] * NKERN + PyArray_Size((PyObject*)%(filtersflipped)s))* (npy_intp)sizeof(%(type)s));
...@@ -1353,7 +1353,7 @@ for(int i=0;i < kerns_dim[0];++i){ ...@@ -1353,7 +1353,7 @@ for(int i=0;i < kerns_dim[0];++i){
%(type)s * ff = ((%(filtersflipped)s)->nd == 3) %(type)s * ff = ((%(filtersflipped)s)->nd == 3)
? (%(type)s *)PyArray_GETPTR3(%(filtersflipped)s, i, kerns_dim[2]-1-k, kerns_dim[3]-1-l) ? (%(type)s *)PyArray_GETPTR3(%(filtersflipped)s, i, kerns_dim[2]-1-k, kerns_dim[3]-1-l)
: (%(type)s *)PyArray_GETPTR4(%(filtersflipped)s, i, j, kerns_dim[2]-1-k, kerns_dim[3]-1-l); : (%(type)s *)PyArray_GETPTR4(%(filtersflipped)s, i, j, kerns_dim[2]-1-k, kerns_dim[3]-1-l);
myfilters[i * (kerns_dim[1]*kerns_dim[2]*kerns_dim[3]) myfilters[i * (kerns_dim[1]*kerns_dim[2]*kerns_dim[3])
+ j * (kerns_dim[2]*kerns_dim[3]) + j * (kerns_dim[2]*kerns_dim[3])
+ k * (kerns_dim[3]) + k * (kerns_dim[3])
+ l] = ff[0]; + l] = ff[0];
...@@ -1370,14 +1370,14 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1370,14 +1370,14 @@ for(int b=0;b< %(self_bsize)s;b++){
for (int img_col = 0; img_col < Os[1]; ++img_col){ for (int img_col = 0; img_col < Os[1]; ++img_col){
for (int filter_row = 0; filter_row < kerns_dim[2]; ++filter_row){ for (int filter_row = 0; filter_row < kerns_dim[2]; ++filter_row){
for (int stackidx = 0; stackidx < %(self_imshp0)s; ++stackidx){ for (int stackidx = 0; stackidx < %(self_imshp0)s; ++stackidx){
%(type)s * img_colview = %(type)s * img_colview =
(%(type)s *)(PyArray_GETPTR4(img2d, b, stackidx, filter_row, img_col)); (%(type)s *)(PyArray_GETPTR4(img2d, b, stackidx, filter_row, img_col));
%(type)s * filter_rows = myfilters + stackidx * (kerns_dim[2]*kerns_dim[3]) + %(type)s * filter_rows = myfilters + stackidx * (kerns_dim[2]*kerns_dim[3]) +
filter_row * kerns_dim[3]; filter_row * kerns_dim[3];
//std::cerr << "filterview offset: " << filter_rows - myfilters << "\\n"; //std::cerr << "filterview offset: " << filter_rows - myfilters << "\\n";
char N = 'N'; char T = 'T'; char N = 'N'; char T = 'T';
int Nz0 = Os[0]; int Nz0 = Os[0];
int Nz1 = NKERN; int Nz1 = NKERN;
int K = kerns_dim[3]; int K = kerns_dim[3];
%(type)s alpha = 1.0; %(type)s alpha = 1.0;
...@@ -1407,11 +1407,11 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1407,11 +1407,11 @@ for(int b=0;b< %(self_bsize)s;b++){
std::cerr << Nz1 << " " << Nz0 << " " << K << "\\n" ; std::cerr << Nz1 << " " << Nz0 << " " << K << "\\n" ;
} }
%(gemm)s(&T, &N, %(gemm)s(&T, &N,
&Nz1, &Nz0, &K, &Nz1, &Nz0, &K,
&alpha, &alpha,
filter_rows, &filter_rows_stride, filter_rows, &filter_rows_stride,
img_colview, &imgview_stride, img_colview, &imgview_stride,
&beta, kbuf, &kbufstride); &beta, kbuf, &kbufstride);
if (0){ if (0){
...@@ -1453,7 +1453,7 @@ def gen_conv_code_unroll_batch_kern(d,unroll_bsize=1, unroll_ksize=1): ...@@ -1453,7 +1453,7 @@ def gen_conv_code_unroll_batch_kern(d,unroll_bsize=1, unroll_ksize=1):
if d.has_key("unroll_bsize") or d.has_key("unroll_ksize") or d.has_key("unroll_iter") or d.has_key("unroll_biter") or d.has_key("unroll_kiter"): if d.has_key("unroll_bsize") or d.has_key("unroll_ksize") or d.has_key("unroll_iter") or d.has_key("unroll_biter") or d.has_key("unroll_kiter"):
raise Exception("We can't use this dictionnary as we will overwrite some of its containt") raise Exception("We can't use this dictionnary as we will overwrite some of its containt")
d=d.copy() d=d.copy()
d["unroll_bsize"]=unroll_bsize d["unroll_bsize"]=unroll_bsize
d["unroll_ksize"]=unroll_ksize d["unroll_ksize"]=unroll_ksize
def my_dup(st,size): def my_dup(st,size):
...@@ -1547,7 +1547,7 @@ if(kerns_dim[0] %% %(self_nkern)s!=0){ ...@@ -1547,7 +1547,7 @@ if(kerns_dim[0] %% %(self_nkern)s!=0){
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER); img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d; img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s)) if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){ || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d)); contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d); Py_DECREF(img2d);
...@@ -1561,7 +1561,7 @@ img2d_arr = (PyArrayObject*)img2d; ...@@ -1561,7 +1561,7 @@ img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER); filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped; filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s)) if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){ || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped)); contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped); Py_DECREF(filtersflipped);
...@@ -1632,7 +1632,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -1632,7 +1632,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
if (mode == FULL) new_m = pos_m ; if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker[0]-1); else new_m = (pos_m+dim_ker[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s; int pos_n=iter_n*%(self_dy)s;
"""%d """%d
ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize) ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
...@@ -1658,15 +1658,15 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -1658,15 +1658,15 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
//do the part where kernel is to the right of the img //do the part where kernel is to the right of the img
int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0); int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){ if(fill_value!=0){
for(k=0;k<max_k;k++){ for(k=0;k<max_k;k++){
"""%d """%d
ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;") ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret+=""" ret+="""
} }
}else {k=max_k;} }else {k=max_k;}
//do the part where the kernel is on the img //do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker[1]); max_k=min(pos_n+1,(int)dim_ker[1]);
"""%d """%d
...@@ -1787,7 +1787,7 @@ if(kerns_dim[0] != %(self_nkern)s){ ...@@ -1787,7 +1787,7 @@ if(kerns_dim[0] != %(self_nkern)s){
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER); img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d; img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != sizeof(%(type)s)) if ((img2d_arr->strides[3] != sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){ || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d)); contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d); Py_DECREF(img2d);
...@@ -1801,7 +1801,7 @@ img2d_arr = (PyArrayObject*)img2d; ...@@ -1801,7 +1801,7 @@ img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER); filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped; filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != sizeof(%(type)s)) if ((filtersflipped_arr->strides[3] != sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){ || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped)); contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped); Py_DECREF(filtersflipped);
...@@ -1897,13 +1897,13 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1897,13 +1897,13 @@ for(int b=0;b< %(self_bsize)s;b++){
}else{ }else{
//do the part where kernel is to the right of the img //do the part where kernel is to the right of the img
int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0); int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){ if(fill_value!=0){
for(k=0;k<max_k;k++){ for(k=0;k<max_k;k++){
sum+= idx_hvals[k]*fill_value; sum+= idx_hvals[k]*fill_value;
} }
}else {k=max_k;} }else {k=max_k;}
//do the part where the kernel is on the img //do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker[1]); max_k=min(pos_n+1,(int)dim_ker[1]);
const %(type)s * idx_in=&in[ind0*dim_im[1]]; const %(type)s * idx_in=&in[ind0*dim_im[1]];
...@@ -1918,7 +1918,7 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1918,7 +1918,7 @@ for(int b=0;b< %(self_bsize)s;b++){
sum3+=idx_hvals[k]*idx_in[ind1+2*%(self_dy)s]; sum3+=idx_hvals[k]*idx_in[ind1+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[ind1+3*%(self_dy)s]; sum4+=idx_hvals[k]*idx_in[ind1+3*%(self_dy)s];
} }
}else if(iter_n + 2*%(self_dy)s < dim_zz[1] }else if(iter_n + 2*%(self_dy)s < dim_zz[1]
&& iter_n>dim_ker[1]-1 && iter_n>dim_ker[1]-1
&& iter_n<dim_im[1]-dim_ker[1]+1){ && iter_n<dim_im[1]-dim_ker[1]+1){
nb_sum=2; nb_sum=2;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论