提交 554ba22f authored 作者: nouiz's avatar nouiz

Merge pull request #871 from delallea/minor

Minor stuff
......@@ -232,7 +232,7 @@ Tips for improving performance on GPU
taking more time than its share, then if you know something about GPU
programming have a look at how it's implemented in theano.sandbox.cuda.
Check the line like 'Spent Xs(X%) in cpu Op, Xs(X%) in gpu Op and Xs(X%) transfert Op'
that can tell you if not enought of your graph is on the gpu or if their
that can tell you if not enough of your graph is on the gpu or if there
is too much memory transfer.
......
......@@ -64,7 +64,7 @@ def test_sum():
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enought thread/block in each dimensions
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
......@@ -880,7 +880,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
((4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0,
-1, -2, -3, -4], False),
]:
# If there is not enought memory on the GPU, skip the test
# If there is not enough memory on the GPU, skip the test
size_needed = numpy.prod(shape) * (4 + 1)
if isinstance(theano.compile.get_default_mode(),
theano.compile.DebugMode):
......@@ -905,7 +905,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# Test with input strided
t = self.adv_sub1()(n[::-1], idx)
#DebugMode do a copy of the input, so we loose the strides.
#DebugMode does a copy of the input, so we lose the strides.
if not isinstance(theano.compile.get_default_mode(),
theano.compile.DebugMode):
t.owner.op.perform_using_take = fast
......
......@@ -310,7 +310,7 @@ def test_downsample():
# The grad is too slow on GT220 GPU
# This causes the computer to freeze...
# Remove this when it get optimized enought
# Remove this when it gets optimized enough
# This only bypasses the last 2 checks
# Those tests were passing in all modes on a GTX470
if shp[0] > 30000 or shp[1] > 30000:
......
......@@ -46,7 +46,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
#we precompute the dot with big shape beforehand so that the test of
#GpuCrossentropySoftmax1HotWithBiasDx does not fail with the error
#(the launch timed out and was terminated) on GPU cards that are not
#powerfull enought. We need the big shape to check for corner
#powerful enough. We need the big shape to check for corner
#case.
dot_result = T.fmatrix('dot_result')
......
......@@ -55,10 +55,10 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
:type subsample: tuple of len 2
:param subsample: factor by which to subsample the output
:type image_shape: tuple of len 4 of int or Contant variable
:type image_shape: tuple of len 4 of int or Constant variable
:param image_shape: (batch size, stack size, nb row, nb col)
Optional, used for optimization.
:type filter_shape: tuple of len 4 of int or Contant variable
:type filter_shape: tuple of len 4 of int or Constant variable
:param filter_shape: (nb filters, stack size, nb row, nb col)
Optional, used for optimization.
......@@ -1744,15 +1744,15 @@ if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
"""%d
ret+=my_dup2("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));")
ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;",unroll_bsize*unroll_ksize)
ret+="""
""" % d
ret += my_dup2("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));")
ret += my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;", unroll_bsize * unroll_ksize)
ret += """
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""%d
ret+=my_dup("const %(type)s * __restrict__ in%(unroll_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unroll_iter)s,stack_size));", unroll_bsize)
ret+=my_dup("const %(type)s * __restrict__ hvals%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unroll_iter)s,stack_size));",unroll_ksize)
ret+="""
""" % d
ret += my_dup("const %(type)s * __restrict__ in%(unroll_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unroll_iter)s,stack_size));", unroll_bsize)
ret += my_dup("const %(type)s * __restrict__ hvals%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unroll_iter)s,stack_size));", unroll_ksize)
ret += """
int new_m;
......@@ -1764,9 +1764,9 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
"""%d
ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
ret+="""
""" % d
ret += my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize * unroll_ksize)
ret += """
// Sum over kernel, if index into image is out of bounds
// fill with the value
......@@ -1774,15 +1774,15 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
int ind0 = (new_m-j);
if(mode==FULL){
"""%d
ret+=my_dup("const %(type)s * idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker1];",unroll_ksize)
ret+="""
""" % d
ret += my_dup("const %(type)s * idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker1];", unroll_ksize)
ret += """
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker1; k++) {
"""%d
ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret+="""
""" % d
ret += my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret += """
}
}else{
//do the part where kernel is to the right of the img
......@@ -1791,49 +1791,49 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
if(fill_value!=0){
for(k=0;k<max_k;k++){
"""%d
ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret+="""
""" % d
ret += my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret += """
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker1);
"""%d
ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret+="""
""" % d
ret += my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret += """
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
"""%d
ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
ret+="""
""" % d
ret += my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
ret += """
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker1;k++){
"""%d
ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret+="""
""" % d
ret += my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret += """
}
}
}else{//valid mode
"""%d
ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker1];",unroll_ksize)
ret+="""
""" % d
ret += my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret += my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker1];", unroll_ksize)
ret += """
int new_n = (pos_n+dim_ker1-1);
for (int k=0,last=new_n; k < dim_ker1; k++,last--) {
"""%d
ret+=my_dup2("sum%(unroll_iter)s+=idx_hvals%(unroll_kiter)s[k]*idx_in%(unroll_biter)s[last];")
ret+="""
""" % d
ret += my_dup2("sum%(unroll_iter)s+=idx_hvals%(unroll_kiter)s[k]*idx_in%(unroll_biter)s[last];")
ret += """
}
}
}//for j
"""%d
ret+=my_dup("out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
ret+="""
""" % d
ret += my_dup("out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize * unroll_ksize)
ret += """
}//for n
}//for m
}//for stack_size
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论