提交 998b9bc4 authored 作者: Frederic's avatar Frederic

Reuse the current gpu conv test for gpuconvmm

上级 598f485b
...@@ -21,9 +21,9 @@ from theano import tensor ...@@ -21,9 +21,9 @@ from theano import tensor
from theano.gof.python25 import any from theano.gof.python25 import any
from theano.tests.unittest_tools import seed_rng from theano.tests.unittest_tools import seed_rng
# Skip test if cuda_ndarray is not available. # Skip test if cuda is not available.
import theano.sandbox.cuda as cuda_ndarray from theano.sandbox import cuda
if cuda_ndarray.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
#needed as the gpu conv don't have a perform implementation. #needed as the gpu conv don't have a perform implementation.
...@@ -32,11 +32,11 @@ if theano.config.mode == 'FAST_COMPILE': ...@@ -32,11 +32,11 @@ if theano.config.mode == 'FAST_COMPILE':
else: else:
theano_mode = theano.compile.mode.get_default_mode().including('gpu') theano_mode = theano.compile.mode.get_default_mode().including('gpu')
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4) cuda_tensor4 = cuda.CudaNdarrayType([False] * 4)
device_id = theano.sandbox.cuda.use.device_number device_id = theano.sandbox.cuda.use.device_number
if device_id is None: if device_id is None:
cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32')) cuda.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number device_id = theano.sandbox.cuda.use.device_number
if device_id is None: if device_id is None:
cuda.use("gpu", cuda.use("gpu",
...@@ -126,7 +126,8 @@ def _params_allgood_header(): ...@@ -126,7 +126,8 @@ def _params_allgood_header():
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
kern_stride=(1, 1), version=-1, verbose=0, random=True, kern_stride=(1, 1), version=-1, verbose=0, random=True,
print_=None, id=None, rtol=1e-5, atol=1e-8, print_=None, id=None, rtol=1e-5, atol=1e-8,
nb_iter=0, ones=False, compile_kshp=None): nb_iter=0, ones=False, compile_kshp=None,
theano_mode=None, cls=None):
# #
# This function is the core of several of the big unit-test drivers, # This function is the core of several of the big unit-test drivers,
# but it can also be used very directly on its own to test a specific # but it can also be used very directly on its own to test a specific
...@@ -181,6 +182,9 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -181,6 +182,9 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
verbose=verbose, verbose=verbose,
kshp=compile_kshp)(i, k) kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=theano_mode) f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), f.maker.fgraph.toposort()
gpuval = f(img, kern) gpuval = f(img, kern)
t2 = time.time() t2 = time.time()
for i in range(nb_iter): for i in range(nb_iter):
...@@ -247,7 +251,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -247,7 +251,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
def exec_conv(version, shapes, verbose, random, mode, def exec_conv(version, shapes, verbose, random, mode,
print_=None, rtol=1e-5, ones=False): print_=None, rtol=1e-5, ones=False,
theano_mode=theano_mode, cls=None):
if verbose > 0: if verbose > 0:
_params_allgood_header() _params_allgood_header()
nb_failed = 0 nb_failed = 0
...@@ -273,7 +278,9 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -273,7 +278,9 @@ def exec_conv(version, shapes, verbose, random, mode,
id=id, id=id,
print_=print_, print_=print_,
rtol=rtol, rtol=rtol,
ones=ones) ones=ones,
theano_mode=theano_mode,
cls=cls)
except Exception, e: except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride) print ver, id, (ishape, kshape, subshape, istride, kstride)
print e print e
...@@ -624,11 +631,19 @@ def test_valid(): ...@@ -624,11 +631,19 @@ def test_valid():
if ones: if ones:
random = False random = False
# exec_conv(version, shapes, verbose, random, 'valid',
# print_=print_, ones=ones, rtol=1.1e-5)
mode = theano_mode.including("conv_gemm")
# import pdb;pdb.set_trace()
shapes = [shp for shp in shapes if shp[1][2] == shp[1][3]]
shapes = [shp for shp in shapes if shp[0][2] == shp[0][3]]
exec_conv(version, shapes, verbose, random, 'valid', exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cuda.blas.GpuConvMM)
def test_full(): def test_full(gemm=False):
seed_rng() seed_rng()
shapes = get_basic_shapes() shapes = get_basic_shapes()
shapes += get_shapes2() shapes += get_shapes2()
...@@ -688,7 +703,16 @@ def test_full(): ...@@ -688,7 +703,16 @@ def test_full():
# version=[4] # version=[4]
random = True random = True
exec_conv(version, shapes, verbose, random, 'full') # exec_conv(version, shapes, verbose, random, 'full')
# Test the GpuConvMM version
mode = theano_mode.including("conv_gemm")
shapes = [shp for shp in shapes if shp[1][2] == shp[1][3]]
shapes = [shp for shp in shapes if shp[0][2] == shp[0][3]]
shapes = shapes[0:10]
exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cuda.blas.GpuConvMM)
def test_subsample(): def test_subsample():
......
"""
Tests for Caffe GPU convolution
"""
import sys
import time
import unittest
import numpy
from nose.plugins.skip import SkipTest
imported_scipy_convolve2d = False
try:
from scipy.signal import correlate
imported_scipy_convolve2d = True
except ImportError:
pass
import theano
from theano import tensor
from theano.gof.python25 import any
from theano.tests.unittest_tools import seed_rng
# Skip the whole module if CUDA support is not compiled in.
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
    raise SkipTest('Optional package cuda disabled')

# Needed as the gpu conv ops don't have a perform implementation:
# always run with a GPU-including mode, even under FAST_COMPILE.
if theano.config.mode == 'FAST_COMPILE':
    theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    theano_mode = theano.compile.mode.get_default_mode().including('gpu')

cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
cuda_tensor2 = cuda_ndarray.CudaNdarrayType([False] * 2)

# Make sure some GPU device is initialized before querying its properties.
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
    device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    # BUG FIX: the original called ``cuda.use(...)`` but this module only
    # imports ``theano.sandbox.cuda`` under the name ``cuda_ndarray``, so
    # ``cuda`` was an undefined name (NameError on this code path).
    cuda_ndarray.use("gpu",
                     force=False,
                     default_to_move_computation_to_gpu=False,
                     move_shared_float32_to_gpu=False,
                     enable_cuda=False,
                     test_driver=True)
    device_id = theano.sandbox.cuda.use.device_number

# Rebind to the low-level extension module to read the device properties.
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
def py_corr_scipy(img, kern, mode, subsample):
    """CPU reference correlation computed with scipy.

    Correlates each image of the 4D batch ``img`` (batch, channel, row,
    col) with every filter of ``kern`` (nkern, channel, row, col),
    summing over the channel axis.

    :param img: 4D array of images.
    :param kern: 4D array of filters; must have the same number of
        channels as ``img``.
    :param mode: 'valid' or 'full', forwarded to scipy's ``correlate``.
    :param subsample: accepted for API symmetry with the GPU op, but not
        used by this reference implementation.
    :return: 4D float32 array of shape (batch, nkern, out_row, out_col).
    """
    assert img.shape[1] == kern.shape[1]
    if mode == 'valid':
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] - kern.shape[2] + 1,
                  img.shape[3] - kern.shape[3] + 1)
    else:
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] + kern.shape[2] - 1,
                  img.shape[3] + kern.shape[3] - 1)
    out = numpy.zeros(outshp, dtype='float32')
    # ``range`` instead of the Python-2-only ``xrange``: identical
    # iteration behavior, and the function also runs under Python 3.
    for b in range(out.shape[0]):
        for k in range(out.shape[1]):
            for s in range(img.shape[1]):
                # Accumulate the per-channel 2D correlations.
                out[b, k, :, :] += correlate(img[b, s, :, :],
                                             kern[k, s, :, :],
                                             mode)
    return out
def _params_allgood_header():
print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
# Small fixed problem configuration for a single GpuConvMM sanity run.
kH = 3
kW = 3
nInputPlane = 3 #channels
nOutputPlane = 2
padding = 0
batchSize = 4
inputWidth = 7 #im.shape[1]
inputHeight = 7 #im.shape[0]
# Image layout is (batch, channel, row, col);
# kernel layout is (nkern, channel, row, col).
ishape = (batchSize, nInputPlane, inputHeight, inputWidth)
kshape = (nOutputPlane, nInputPlane, kH, kW)
print 'Image shape', ishape
print 'Kernel shape', kshape
im = numpy.random.rand(*ishape) + 1
#plt.imread('lena.bmp')
img_stride = (1, 1)
kern_stride = (1, 1)
# NOTE(review): Python 2 integer division; exact here since strides are 1.
outputWidth = (inputWidth + 2*padding - kW) / img_stride[1] + 1
outputHeight = (inputHeight + 2*padding - kH) / img_stride[0] + 1
# NOTE(review): a conv output has nOutputPlane channels; nInputPlane looks
# like a typo, but oshape is only printed below, never used in computation.
oshape=(batchSize, nInputPlane, outputHeight, outputWidth)
# Random positive image and negative kernel, float32 for the GPU op.
npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                          dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                           dtype='float32')
img = cuda_ndarray.CudaNdarray(npy_img)
kern = cuda_ndarray.CudaNdarray(npy_kern)
#temporary columns
cshape = (nInputPlane*kW*kH, outputHeight*outputWidth)
print 'Columns shape: ', cshape
oshape=(batchSize, nInputPlane, outputHeight, outputWidth)
print 'Output shape: ', oshape
subsample = 1
mode = 'valid'
# CPU reference result, timed for the speedup report.
t0 = time.time()
cpuval = py_corr_scipy(npy_img, npy_kern, mode, subsample)
t1 = time.time()
# Build and compile the GPU graph using GpuConvMM directly.
i = cuda_tensor4()
k = cuda_tensor4()
# NOTE(review): pad=1 here while the shape math above used padding=0 and
# mode='valid' -- confirm which padding configuration is intended.
op = theano.sandbox.cuda.blas.GpuConvMM(border_mode=mode,
                                        subsample=(subsample, subsample),
                                        version=100,
                                        verbose=2, pad=1)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(img, kern)
t2 = time.time()
# Bring the GPU result back to a numpy array for comparison.
gpuval = numpy.asarray(gpuval)
if gpuval.shape != cpuval.shape:
    print >> sys.stdout, "ERROR: shape mismatch",
    print >> sys.stdout, gpuval.shape, cpuval.shape
print '---------------- INPUT VAL -----------------------'
print npy_img
print '---------------- kernel -----------------------'
print npy_kern
print '---------------- GPU VAL -----------------------'
print gpuval
print '---------------- CPU VAL -----------------------'
print cpuval
# NOTE(review): the allclose result is only printed, never asserted, so a
# numerical mismatch between CPU and GPU will not fail this script; only
# non-finite GPU output does.
rval = numpy.allclose(cpuval, gpuval, rtol=1e-4)
print rval
assert numpy.all(numpy.isfinite(gpuval))
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论