提交 925a4eb6 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged

......@@ -421,7 +421,7 @@ class GpuDownsampleFactorMax(Op):
#def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
def c_code_cache_version(self):
return ()
return (1)
def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail']
ds0, ds1 = self.ds
......
......@@ -521,6 +521,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
}
if (1 && (version==6||version==-1) &&
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
int outsize = CudaNdarray_SIZE(out);
......
import sys, time
import numpy
from nose.plugins.skip import SkipTest
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
import theano
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
......@@ -38,9 +46,23 @@ def py_conv_full_numpy(img, kern):
pad_cols = 2*(kern.shape[3]-1) + img.shape[3]
padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype)
padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img
return py_conv_valid(padded_img, kern)
return py_conv_valid_numpy(padded_img, kern)
def py_conv(img, kern, mode, subsample):
"""
use a scipy or numpy implementation depending is scipy is available.
The scipy version is faster.
"""
if imported_scipy_convolve2d:
return py_conv_scipy(img, kern, mode, subsample)
elif mode=='valid':
return py_conv_valid_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
elif mode=='full':
return py_conv_full_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
else:
raise Exception("Can't execute this kernel.")
def py_conv_scipy(img, kern, mode, subsample):
from scipy.signal import convolve2d
assert img.shape[1] == kern.shape[1]
if mode == 'valid':
outshp = (img.shape[0], kern.shape[0],
......@@ -89,7 +111,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
rval = True
try:
t0 = time.time()
cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time()
i = cuda_tensor4()
k = cuda_tensor4()
......@@ -550,7 +572,7 @@ def _test_dummy():
rval = True
t0 = time.time()
cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time()
gpuval = cuda_ndarray.conv(img, kern, mode, subsample)
t2 = time.time()
......
......@@ -252,13 +252,13 @@ class GpuImages2Neibs(Images2Neibs):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return ()
return (2,)
return (6,)
def c_support_code_apply(self, node, nodename):
if self.mode=="valid":
return """
static __global__ void k_multi_warp_%(nodename)s(
mode = self.mode
return """
//a version that use less register but don't work in all case.
static __global__ void k_multi_warp_less_%(nodename)s(
const int nb_batch,
const int nb_stack,
const int height,
......@@ -274,8 +274,10 @@ class GpuImages2Neibs(Images2Neibs):
float * global_out
)
{
for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d;
int left = tblock/grid_d;
const int a = left%%grid_c;
......@@ -289,12 +291,23 @@ class GpuImages2Neibs(Images2Neibs):
if(a>grid_c)continue;
if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
for (int i = 0; i < c; i++) // loop over c
int i = threadIdx.y; // loop over c
{
int ten4_2 = i + a * step_x;
for (int j = threadIdx.x; j < d; j+=blockDim.x) // loop over d
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
}
int j = threadIdx.x; // loop over d
{
int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
......@@ -307,9 +320,6 @@ class GpuImages2Neibs(Images2Neibs):
}
}
""" % locals()
if self.mode=="wrap_centered":
return """
static __global__ void k_multi_warp_%(nodename)s(
const int nb_batch,
const int nb_stack,
......@@ -329,7 +339,7 @@ class GpuImages2Neibs(Images2Neibs):
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d;
int left = tblock/grid_d;
const int a = left%%grid_c;
......@@ -343,19 +353,23 @@ class GpuImages2Neibs(Images2Neibs):
if(a>grid_c)continue;
if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
for (int i = 0; i < c; i++) // loop over c
for (int i = threadIdx.y; i < c; i+=blockDim.y) // loop over c
{
int ten4_2 = i + a * step_x;
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
}
for (int j = threadIdx.x; j < d; j+=blockDim.x) // loop over d
{
int ten4_3 = j + b * step_y;
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
......@@ -370,7 +384,6 @@ class GpuImages2Neibs(Images2Neibs):
""" % locals()
def c_code(self, node, name, (ten4, neib_shape, neib_step), (z,), sub):
fail = sub['fail']
mode = self.mode
......@@ -473,17 +486,36 @@ class GpuImages2Neibs(Images2Neibs):
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
//Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
//Make bigger block to have better memory access pattern and a higher core utilisation.
//for smaller patch size
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
n_threads.z++;
}
int nb_block;
if (nb_batch %% 32 == 0)
nb_block = nb_batch/32;
if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
else
nb_block = (int)((float)nb_batch/32. + 1.);
dim3 n_blocks(std::min(32*1024,CudaNdarray_HOST_DIMS(%(z)s)[0]),1,1);
dim3 n_threads(32,1,1);
nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
float*, float*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch,
nb_stack,
height, width,
......
......@@ -278,26 +278,30 @@ def test_neibs_wrap_centered_step_manual():
def test_neibs_gpu():
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
shape = (100,40,18,18)
images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
from theano.sandbox.cuda.basic_ops import gpu_from_host
f = function([], images2neibs(images,neib_shape),
mode=mode_with_gpu)
f_gpu = function([], images2neibs(images,neib_shape),
mode=mode_with_gpu)
assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()])
#print images.value
neibs = numpy.asarray(f_gpu())
assert numpy.allclose(neibs,f())
#print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
#print numpy.asarray(g())
assert numpy.allclose(images.value,g())
for shape, pshape in [((100,40,18,18),(2,2)),
((100,40,6,18),(3,2)),
((10,40,66,66),(33,33)),
((10,40,68,66),(34,33))
]:
images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
neib_shape = T.as_tensor_variable(pshape)
from theano.sandbox.cuda.basic_ops import gpu_from_host
f = function([], images2neibs(images,neib_shape),
mode=mode_with_gpu)
f_gpu = function([], images2neibs(images,neib_shape),
mode=mode_with_gpu)
assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()])
#print images.value
neibs = numpy.asarray(f_gpu())
assert numpy.allclose(neibs,f())
#print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
#print numpy.asarray(g())
assert numpy.allclose(images.value,g())
def speed_neibs():
......
......@@ -12,7 +12,7 @@ import numpy, theano
#from copy import copy as python_copy
from theano import gof, shared
from theano.gof import Variable, Op, utils, Type, Constant, Value
from theano.gof import Variable, Op, Type, Constant, Value
from theano.tensor.tsor_apply import Apply
from theano import gradient
......@@ -21,7 +21,7 @@ import elemwise
from theano import scalar as scal
from theano.gof.python25 import partial, any, all
from theano import compile, printing
from theano.printing import pprint, Print
from theano.printing import pprint
### set up the external interface
from elemwise import Elemwise, DimShuffle, CAReduce, Sum
......
......@@ -18,6 +18,16 @@ from theano import gof, Op, tensor, config
from theano.tensor.tsor_apply import Apply
from theano.gof.python25 import any
imported_scipy_signal = False
try:
# TODO: move these back out to global scope when they no longer cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imported_scipy_signal = True
except ImportError:
pass
_logger=logging.getLogger("theano.signal.conv")
def _debug(*msg):
_logger.debug(' '.join([ str(x) for x in msg]))
......@@ -547,9 +557,12 @@ class ConvOp(Op):
"""
By default if len(img2d.shape)==3, we
"""
if not imported_scipy_signal:
raise theano.gof.utils.MethodNotDefined(
"c_headers", type(self), self.__class__.__name__,
"Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.")
# TODO: move these back out to global scope when they no longer cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imshp = self.imshp
if imshp is None or any([x is None for x in imshp]):
imshp = tuple(img2d.shape[1:])
......@@ -584,8 +597,6 @@ class ConvOp(Op):
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
dtype=img2d.dtype)
zz=z[0]
val = _valfrommode(self.out_mode)
bval = _bvalfromboundary('fill')
stacklen = imshp[0]
......@@ -616,6 +627,9 @@ class ConvOp(Op):
filtersflipped = buf
del buf, rstride, cstride
val = _valfrommode(self.out_mode)
bval = _bvalfromboundary('fill')
for b in range(bsize):
for n in range(nkern):
zz[b,n,...].fill(0)
......@@ -623,6 +637,25 @@ class ConvOp(Op):
zz[b,n,...] += _convolve2d(\
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
if False:
if False and self.out_mode=="full":
img2d2 = numpy.zeros((bsize,stacklen,
imshp[1]+2*kshp[0]-2,
imshp[2]+2*kshp[1]-2))
img2d2[:,:,kshp[0]-1:kshp[0]-1+imshp[1],
kshp[1]-1:kshp[1]-1+imshp[2]] = img2d
img2d = img2d2
#N_image_shape = image_data.shape
for b in range(bsize):
for n in range(nkern):
zz[b,n,...].fill(0)
for im0 in range(stacklen):
for row in range(0,zz.shape[2],self.dx):
for col in range(0,zz.shape[3],self.dy):
zz[b,n,row,col] += (img2d[b,im0,row:row+kshp[0],col:col+kshp[1]]*\
filtersflipped[n,im0,::-1,::-1]).sum()
#We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we
......
import sys, time, unittest
import numpy
from scipy import signal
import theano
import theano.tensor as T
......@@ -60,6 +59,7 @@ class TestConv2D(unittest.TestCase):
############# REFERENCE IMPLEMENTATION ############
s = 1.
orig_image_data = image_data
if border_mode is not 'full': s = -1.
out_shape2d = numpy.array(N_image_shape[-2:]) +\
s*numpy.array(N_filter_shape[-2:]) - s
......@@ -68,26 +68,41 @@ class TestConv2D(unittest.TestCase):
ref_output = numpy.zeros(out_shape)
# loop over output feature maps
for k in range(N_filter_shape[0]):
# loop over input feature maps
for l in range(N_filter_shape[1]):
filter2d = filter_data[k,l,:,:]
# loop over mini-batches
for b in range(N_image_shape[0]):
image2d = image_data[b,l,:,:]
output2d = signal.convolve2d(image2d, filter2d, border_mode)
ref_output[b,k,:,:] +=\
output2d[::subsample[0],::subsample[1]]
ref_output.fill(0)
if border_mode=='full':
image_data2 = numpy.zeros((N_image_shape[0],N_image_shape[1],
N_image_shape[2]+2*N_filter_shape[2]-2,
N_image_shape[3]+2*N_filter_shape[3]-2))
image_data2[:,:,N_filter_shape[2]-1:N_filter_shape[2]-1+N_image_shape[2],
N_filter_shape[3]-1:N_filter_shape[3]-1+N_image_shape[3]] = image_data
image_data = image_data2
N_image_shape = image_data.shape
for bb in range(N_image_shape[0]):
for nn in range(N_filter_shape[0]):
for im0 in range(N_image_shape[1]):
filter2d = filter_data[nn,im0,:,:]
image2d = image_data[bb,im0,:,:]
for row in range(ref_output.shape[2]):
irow = row * subsample[0]#image row
for col in range(ref_output.shape[3]):
icol = col * subsample[1]#image col
ref_output[bb,nn,row,col] += (image2d[irow:irow+N_filter_shape[2],
icol:icol+N_filter_shape[3]]*filter2d[::-1,::-1]
).sum()
self.failUnless(_allclose(theano_output, ref_output))
############# TEST GRADIENT ############
if verify_grad:
utt.verify_grad(sym_conv2d, [image_data, filter_data])
utt.verify_grad(sym_conv2d, [orig_image_data, filter_data])
def test_basic1(self):
"""
Tests that basic convolutions work for odd and even dimensions of image and filter
shapes, as well as rectangular images and filters.
"""
self.validate((2,2,3,3), (2,2,2,2), 'valid', verify_grad=False)
def test_basic(self):
"""
......
import sys, time, unittest
import numpy
from scipy import signal
import theano
import theano.tensor as T
......@@ -59,7 +58,13 @@ class TestSignalConv2D(unittest.TestCase):
image2d = image_data3d[b,:,:]
filter2d = filter_data3d[k,:,:]
output2d = signal.convolve2d(image2d, filter2d, 'valid')
output2d = numpy.zeros(ref_output.shape)
for row in range(ref_output.shape[0]):
for col in range(ref_output.shape[1]):
output2d[row,col] += (image2d[row:row+filter2d.shape[0],
col:col+filter2d.shape[1]]*filter2d[::-1,::-1]
).sum()
self.failUnless(_allclose(theano_output4d[b,k,:,:], output2d))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论