Commit 85447abe authored by James Bergstra

merge

......@@ -52,6 +52,10 @@ Community
* Register and post to `theano-dev`_ if you want to talk to the developers.
* Register and post to `theano-announce`_ if you want to be kept informed of important changes in Theano (low volume).
* Register and post to `theano-buildbot`_ if you want to receive our daily buildbot email.
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
......@@ -77,6 +81,8 @@ Community
.. _theano-dev: http://groups.google.com/group/theano-dev
.. _theano-users: http://groups.google.com/group/theano-users
.. _theano-announce: http://groups.google.com/group/theano-announce
.. _theano-buildbot: http://groups.google.com/group/theano-buildbot
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _LISA: http://www.iro.umontreal.ca/~lisa
......
......@@ -188,7 +188,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
def make_node(self, dy, sm, y_idx):
return Apply(self, [dy, sm, y_idx],[sm.type()])
def c_code_cache_version(self):
return (2,)
return (3,)
#return ()
def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub):
fail = sub['fail']
......@@ -229,7 +229,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1]
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
......@@ -303,7 +303,7 @@ class GpuSoftmax (Op):
return shape
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail']
return """
......@@ -330,7 +330,7 @@ class GpuSoftmax (Op):
kSoftmax_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -362,11 +362,14 @@ class GpuSoftmax (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
......@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op):
return [shape[0]]
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,b), (z,), sub):
fail = sub['fail']
......@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op):
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
#for (int i = blockIdx.x; i < N; i += gridDim.x)
# Tests/benchmarks for theano.sandbox.cuda nnet ops: MLP and convnet
# training on random data, comparing CPU vs GPU results and timings.
import sys, time
import theano
from theano.compile.sharedvalue import shared
from theano.compile.pfunc import pfunc
from theano import tensor
import theano.tensor.nnet
from theano import config
import theano.tensor.nnet.conv as conv
import theano.tensor.signal.downsample as downsample
import numpy

# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
# NOTE(review): this unconditional raise disables the entire module at
# import time; remove it once the referenced alloc/memory-leak bug is fixed.
raise SkipTest('SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!')

import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')

import theano.sandbox.cuda as tcn

import logging
logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO)
def my_rand(*shape):
    """Uniform [0, 1) random float32 array of the given shape."""
    data = numpy.random.rand(*shape)
    return theano._asarray(data, dtype='float32')
def my_randn(*shape):
    """Standard-normal random float32 array of the given shape."""
    data = numpy.random.randn(*shape)
    return theano._asarray(data, dtype='float32')
def my_zeros(*shape):
    """All-zero float32 array; arguments are forwarded to numpy.zeros."""
    data = numpy.zeros(*shape)
    return theano._asarray(data, dtype='float32')
def get_mode(use_gpu):
    """Return the default compilation mode, adjusted for GPU use.

    When profiling is active, a fresh ProfileMode is created so timings
    from different calls are not mixed together.
    """
    mode = theano.compile.get_default_mode()
    if isinstance(mode, theano.compile.ProfileMode):
        mode = theano.compile.ProfileMode()
    return mode.including('gpu') if use_gpu else mode.excluding('gpu')
def print_mode(mode):
    """Print the profiling summary when `mode` is a ProfileMode; else no-op."""
    # `is not None` is the idiomatic identity test (avoids custom __ne__).
    if mode is not None and isinstance(mode, theano.compile.ProfileMode):
        mode.print_summary()
def print_diff_mode(a, b):
    """Print the profiling difference a-vs-b when both are ProfileModes."""
    # `is not None` is the idiomatic identity test (avoids custom __ne__).
    if a is not None and isinstance(a, theano.compile.ProfileMode) \
            and isinstance(b, theano.compile.ProfileMode):
        a.print_diff_summary(b)
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_train=100):
    """Train a 2-layer tanh MLP on fixed random data; return (losses, seconds).

    Plain gradient descent on a squared-error loss for `n_train` steps,
    always on the same random batch.  Parameters live on the GPU when
    `use_gpu` is True, otherwise as regular shared variables.
    """
    if config.mode=='DEBUG_MODE': n_train=1

    if use_gpu:
        w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
        b = tcn.shared_constructor(my_zeros(n_hid), 'b')
        # NOTE(review): v is labelled 'c' (same as c below); the display
        # name is likely meant to be 'v'.  Does not affect computation.
        v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
        c = tcn.shared_constructor(my_zeros(n_out), 'c')
    else:
        w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
        b = shared(my_zeros(n_hid), 'b')
        v = shared(my_zeros((n_hid, n_out)), 'c')
        c = shared(my_zeros(n_out), 'c')

    x = tensor.fmatrix('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    hid = tensor.tanh(tensor.dot(x, w)+b)
    out = tensor.tanh(tensor.dot(hid, v)+c)
    loss = tensor.sum(0.5 * (out-y)**2 * lr)
    if 0: print 'loss type', loss.type

    params = [w, b, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    if 0:
        for i, n in enumerate(train.maker.env.toposort()):
            print i, n

    xval = my_rand(n_batch, n_in)
    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    # Time the training loop only (graph compilation excluded).
    t0 = time.time()
    rval = []
    for i in xrange(n_train):
        rval.append(train(xval, yval, lr))
    dt = time.time() - t0

    print_mode(mode)
    return numpy.asarray(rval), dt
def test_run_nnet():
    """Compare CPU vs GPU MLP training losses over a grid of layer sizes."""
    for n_in in 1024, 2048, 4096:
        for n_hid in 1024, 2048, 4096:
            # Re-seed before each run so CPU and GPU see identical
            # parameters and data.
            numpy.random.seed(23456)
            rval_cpu, tc = run_nnet(False, n_in=n_in, n_hid=n_hid)
            numpy.random.seed(23456)
            rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
            #print "cpu:", rval_cpu
            #print "gpu:", rval_gpu
            print "max abs diff:", numpy.max(numpy.absolute(rval_gpu-rval_cpu))
            print "time cpu: %f, time gpu: %f, speed up %f"%(tc, tg, tc/tg)
            assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_run_nnet_med():
    """Medium-size MLP training run, CPU only (smoke test)."""
    numpy.random.seed(23456)
    result = run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
    """Small MLP training run, CPU only (smoke test, many iterations)."""
    numpy.random.seed(23456)
    result = run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
    """Train a 1-conv-layer + 1-dense-layer net for 10 steps; return final loss."""
    if use_gpu:
        shared_fn = tcn.shared_constructor
    else:
        shared_fn = shared
    n_batch = 16
    n_kern = 20
    shape_img = (n_batch, 1, 32, 32)
    shape_kern = (n_kern, 1, 5, 5)
    n_train=10
    if config.mode=='DEBUG_MODE': n_train=1

    # The 'valid' convolution output size fixes the flattened hidden width.
    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(shape_img[2:],shape_kern[2:], 'valid')
    n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
    n_out = 10

    w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
    b = shared_fn(my_zeros((n_kern,)), 'b')
    v = shared_fn(my_zeros((n_hid, n_out)), 'c')
    c = shared_fn(my_zeros(n_out), 'c')

    # broadcastable=(0,1,0,0): only the single channel axis broadcasts.
    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
    conv_op.set_flops()

    hid = tensor.tanh(conv_op(x, w)+b.dimshuffle((0,'x','x')))
    hid_flat = hid.reshape((n_batch, n_hid))
    out = tensor.tanh(tensor.dot(hid_flat, v)+c)
    loss = tensor.sum(0.5 * (out-y)**2 * lr)
    print 'loss type', loss.type

    params = [w, b, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    #    for i, n in enumerate(train.maker.env.toposort()):
    #        print i, n

    xval = my_rand(*shape_img)
    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    for i in xrange(n_train):
        rval = train(xval, yval, lr)
    print 'training done'
    print_mode(mode)
    return rval
def test_conv_nnet1():
    """Check that CPU and GPU conv-net training give matching losses."""
    numpy.random.seed(23456)
    cpu_loss = run_conv_nnet1(False)
    numpy.random.seed(23456)
    gpu_loss = run_conv_nnet1(True)
    assert numpy.allclose(cpu_loss, gpu_loss, rtol=1e-4, atol=1e-6)
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
    """Train a 2-conv-layer LeNet-like net for 30 steps; return the final loss."""
    if use_gpu:
        shared_fn = tcn.shared_constructor
    else:
        shared_fn = shared
    # Cumulative rounding error affects this comparison of results, so we
    # lower the tolerance.
    # TODO: why do the last two examples see a lower error?  Are we converging?
    # n_train=10, n_batch=3,  n_kern=1,  n_kern1=1,  error seen of 1e-9
    # n_train=10, n_batch=3,  n_kern=10, n_kern1=1,  error seen of -1.27777e-06
    # n_train=10, n_batch=3,  n_kern=10, n_kern1=10, error seen of -6.91377e-05
    # n_train=10, n_batch=30, n_kern=10, n_kern1=10, error seen of -0.00185963
    # n_train=10, n_batch=60, n_kern=10, n_kern1=10, error seen of -5.26905e-05
    # n_train=30, n_batch=60, n_kern=10, n_kern1=10, error seen of -3.8147e-06
    # n_train=30, n_batch=60, n_kern=20, n_kern1=10, error seen of 6.82771e-05
    # n_train=30, n_batch=60, n_kern=20, n_kern1=30, error seen of 0.000231534
    n_batch = 60
    shape_img = (n_batch, 1, 32, 32)
    n_kern = 20
    shape_kern = (n_kern, 1, 5, 5)
    n_kern1 = 10
    shape_kern1 = (n_kern1, n_kern, 5, 5)
    n_train=30
    if config.mode=='DEBUG_MODE': n_train=1

    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(tuple(shape_img[2:]),tuple(shape_kern[2:]), 'valid')
    # The second conv layer sees the first layer's output downsampled by 2.
    logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), tuple(shape_kern1[2:]), 'valid')
    n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
    n_out = 10

    w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
    b0 = shared_fn(my_zeros((n_kern,)), 'b0')
    w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
    b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
    v = shared_fn(my_zeros((n_hid, n_out)), 'c')
    c = shared_fn(my_zeros(n_out), 'c')

    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
    conv_op1 = conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
    conv_op.set_flops()
    conv_op1.set_flops()

    hid = tensor.tanh(conv_op(x, w0)+b0.dimshuffle((0,'x','x')))
    # [:,:,::2,::2] implements the 2x2 downsampling between conv layers.
    hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1.dimshuffle((0,'x','x')))
    hid_flat = hid1.reshape((n_batch, n_hid))
    out = tensor.tanh(tensor.dot(hid_flat, v)+c)
    loss = tensor.sum(0.5 * (out-y)**2 * lr)
    print 'loss type', loss.type

    params = [w0, b0, w1, b1, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    #    for i, n in enumerate(train.maker.env.toposort()):
    #        print i, n

    xval = my_rand(*shape_img)
    yval = my_rand(n_batch,n_out)#int32 make all 0...
    lr = theano._asarray(0.01, dtype='float32')
    for i in xrange(n_train):
        rval = train(xval, yval, lr)
    print_mode(mode)
    return rval
def test_conv_nnet2():
    """Compare run_conv_nnet2 GPU vs CPU (looser atol; see tolerance notes)."""
    numpy.random.seed(23456)
    rval_gpu = run_conv_nnet2(True)
    # Same seed so the CPU run sees identical parameters and data.
    if True:
        numpy.random.seed(23456)
        rval_cpu = run_conv_nnet2(False)
        print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
        assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_train,
                           downsample_ops=True, verbose=0, version=-1):
    """Train a LeNet-like conv/pool/conv/softmax classifier on random data.

    isize: image size, an int (square) or a (rows, cols) tuple.
    ksize: square kernel size used by both conv layers.
    downsample_ops: pool with DownsampleFactorMax when True, otherwise
        pool by ::2 strided slicing.
    Returns (per-step losses, wall-clock seconds, compilation mode).
    """
    if use_gpu:
        shared_fn = tcn.shared_constructor
    else:
        shared_fn = shared

    isize1=isize
    isize2=isize
    if isinstance(isize,(tuple,)):
        isize1=isize[0]
        isize2=isize[1]

    shape_img = (n_batch, 1, isize1, isize2)
    n_kern = 20 # 6 were used in LeNet5
    shape_kern = (n_kern, 1, ksize, ksize)
    n_kern1 = 30 # 16 were used in LeNet5
    shape_kern1 = (n_kern1, n_kern, ksize, ksize)

    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize1, isize2), (ksize, ksize), 'valid')
    # The second layer sees the first layer's output pooled by 2 per dim.
    logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2,
        logical_hid_shape[1]/2), (ksize, ksize), 'valid')
    n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
    n_out = 10

    w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
    b0 = shared_fn(my_zeros((n_kern,)), 'b0')
    w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
    b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
    v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
    c = shared_fn(my_zeros(n_out), 'c')

    print 'ALLOCATING ARCH: w0 shape', w0.value.shape
    print 'ALLOCATING ARCH: w1 shape', w1.value.shape
    print 'ALLOCATING ARCH: v shape', v.value.shape

    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern,
                          n_batch, 1, 1, verbose=verbose, version=version)
    conv_op1 = conv.ConvOp(
        (n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2),
        shape_kern1[2:], n_kern1, n_batch, 1, 1,verbose=verbose, version=version)
    conv_op.set_flops()
    conv_op1.set_flops()
    ds_op = downsample.DownsampleFactorMax((2,2), ignore_border=False)

    if downsample_ops:
        hid = tensor.tanh(ds_op(conv_op(x, w0)+b0.dimshuffle((0,'x','x'))))
    else:
        # Strided slicing keeps the top-left element of each 2x2 block.
        hid = tensor.tanh((conv_op(x, w0)+b0.dimshuffle((0,'x','x')))[:,:,::2,::2])
    hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0,'x','x')))
    hid_flat = hid1.reshape((n_batch, n_hid))
    out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
    print 'loss type', loss.type

    params = [w0, b0, w1, b1, v, c]
    gparams = tensor.grad(loss, params, warn_type=True)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    if False:
        for i, n in enumerate(train.maker.env.toposort()):
            print i, n

    xval = my_rand(*shape_img)
    yval = my_rand(n_batch,n_out)
    lr = theano._asarray(0.01, dtype='float32')

    # Time the training loop only (compilation excluded).
    rvals=my_zeros(n_train)
    t0 = time.time()
    for i in xrange(n_train):
        rvals[i] = train(xval, yval, lr)[0]
    t1 = time.time()
    print_mode(mode)
    return rvals, t1-t0, mode
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
                               ignore_error=False,
                               n_train=10,
                               gpu_only=False,
                               cpu_only=False,
                               float_atol=1e-06,
                               check_isfinite=True,
                               pickle=False,
                               verbose=0,
                               version=-1):
    """Run run_conv_nnet2_classif on GPU and CPU and compare the losses.

    float_atol: None means use the default value; otherwise overrides
        theano's float32 absolute tolerance for the duration of the runs.
    check_isfinite: forwarded to DEBUG_MODE.  For some parameters the
        CrossentropyCategorical1Hot op generates inf when not optimized.
    pickle: when True and profiling is active, dump the CPU profile mode.
    """
    if config.mode=='DEBUG_MODE': n_train=1

    numpy.random.seed(seed)

    import theano.tensor.basic
    import theano.compile.debugmode
    from theano.compile.mode import predefined_modes
    # Save the global tolerance/debug settings so they can be restored
    # even if a run raises.
    orig_float32_atol = theano.tensor.basic.float32_atol
    orig_check_isfinite = predefined_modes["DEBUG_MODE"].check_isfinite

    try:
        predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
        if gpu_only:
            tcn.use()
        if float_atol:
            print "float_atol",float_atol
            theano.tensor.basic.float32_atol=float_atol
        if not cpu_only:
            rval_gpu, tg, gpu_mode = run_conv_nnet2_classif(True,
                isize, ksize, bsize, n_train, verbose=verbose, version=version)
    finally:
        predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
        theano.tensor.basic.float32_atol=orig_float32_atol

    if gpu_only:
        print "time gpu: %.3f"%(tg)
        return

    try:
        predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
        # Re-seed so the CPU run sees the same parameters and data.
        numpy.random.seed(seed)
        rval_cpu, tc, cpu_mode = run_conv_nnet2_classif(False, isize, ksize, bsize, n_train,
            verbose=verbose, version=version)
        if pickle and isinstance(cpu_mode,(theano.compile.ProfileMode,)):
            # NOTE(review): this local import shadows the `pickle` boolean
            # parameter from this point on.
            import pickle
            print "BEGIN GPU profile mode dump"
            #print pickle.dumps(gpu_mode)
            print "END GPU profile mode dump"
            print "BEGIN CPU profile mode dump"
            print pickle.dumps(cpu_mode)
            print "END CPU profile mode dump"
    finally:
        predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
        theano.tensor.basic.float32_atol=orig_float32_atol

    if not cpu_only:
        if verbose or not numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol):
            print "cpu:", rval_cpu
            print "gpu:", rval_gpu
            print "abs diff:", numpy.absolute(rval_gpu-rval_cpu)
        print "time cpu: %.3f, time gpu: %.3f, speed up %f"%(tc, tg, tc/tg)
        print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
        print "estimated time for one pass through MNIST with gpu: %f" % (tg * (60000.0 / (n_train*bsize)))
    else:
        print "time cpu: %.3f"%(tc)
        print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
    if not ignore_error and not cpu_only and not gpu_only:
        assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol)
# Module-level switches read by the test_lenet_* wrappers below; edit by
# hand (or from an interactive session) to change how comparisons run.
gpu_only=False
cpu_only=False
ignore_error=False
verbose=0
version=-1
def test_lenet_28(): #MNIST
    """LeNet-style comparison at MNIST size (28x28, ksize 5, batch 60)."""
    cmp_run_conv_nnet2_classif(23485, 28, 5, 60, n_train=10,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_32(): #CIFAR10 / Shapeset
    """LeNet-style comparison at 32x32 inputs (CIFAR10 / Shapeset size)."""
    # Forward cpu_only like every other test_lenet_* wrapper; it was
    # accidentally omitted here (same default, so behavior is unchanged
    # unless the module-level flag is set).
    cmp_run_conv_nnet2_classif(23485, 32, 5, 60, n_train=10,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_32_long(): #CIFAR10 / Shapeset
    """Longer 32x32 run (smaller batch, more steps).

    This tests the gradient of downsample on the GPU, which does not
    receive specific testing elsewhere.
    """
    cmp_run_conv_nnet2_classif(23485, 32, 5, 30, n_train=50,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_64(): # ???
    """LeNet-style comparison at 64x64 inputs (ksize 7, batch 10)."""
    # float_atol is loosened so the comparison passes in debug mode: the
    # CPU may use extended precision while the GPU does not.
    cmp_run_conv_nnet2_classif(23485, 64, 7, 10, n_train=10,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               float_atol=5e-4, check_isfinite=True, version=version)
def test_lenet_108(): # NORB
    """LeNet-style comparison at NORB size (108x108, ksize 7, batch 5)."""
    cmp_run_conv_nnet2_classif(23485, 108, 7, 5, n_train=4,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version, float_atol=7e-2)
def test_lenet_256(): # ImageNet
    """LeNet-style comparison at ImageNet-ish size (256x256, ksize 9)."""
    cmp_run_conv_nnet2_classif(23485, 256, 9, 2, n_train=5,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version)
# The name is deliberately misspelled ("tes_" instead of "test_") so the
# test collector does not run it automatically: this configuration does
# not work yet.
def tes_lenet_hd(): #HD 720p: 1280(wid)x720(len)
    """LeNet-style comparison at 720p (disabled via deliberate misnaming)."""
    cmp_run_conv_nnet2_classif(23485, (720,1280), 9, 2, n_train=3,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version)
# The name is deliberately misspelled ("tes_" instead of "test_") so the
# test collector does not run it automatically: this configuration does
# not work yet.
def tes_lenet_full_hd(): #HD 1080p: 1920(wid)x1080(len)
    """LeNet-style comparison at 1080p (disabled via deliberate misnaming)."""
    cmp_run_conv_nnet2_classif(23485, (1080,1920), 9, 2, n_train=3,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version)
import sys, time
import theano
from theano.compile.sharedvalue import shared
from theano.compile.pfunc import pfunc
from theano import tensor
import theano.tensor.nnet
from theano import config
import theano.tensor.nnet.conv as conv
import theano.tensor.signal.downsample as downsample
import numpy
import theano, numpy
import theano.tensor as T
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
raise SkipTest('SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!')
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as tcn
import logging
logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO)
def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def my_randn(*shape):
return theano._asarray(numpy.random.randn(*shape),dtype='float32')
def my_zeros(*shape):
return theano._asarray(numpy.zeros(*shape),dtype='float32')
def get_mode(use_gpu):
ret = theano.compile.get_default_mode()
if isinstance(ret, theano.compile.ProfileMode):
ret = theano.compile.ProfileMode()
if use_gpu:
ret = ret.including('gpu')
else:
ret = ret.excluding('gpu')
return ret
def print_mode(mode):
if mode != None and isinstance(mode,(theano.compile.ProfileMode,)):
mode.print_summary()
def print_diff_mode(a,b):
if a != None and isinstance(a,(theano.compile.ProfileMode,)) and isinstance(b,(theano.compile.ProfileMode,)):
a.print_diff_summary(b)
# Compilation modes used by the function-compilation checks below.
# Under FAST_COMPILE the GPU graph optimizations are not applied, so fall
# back to FAST_RUN as the base mode in that case.
if theano.config.mode=='FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_train=100):
if config.mode=='DEBUG_MODE': n_train=1
if use_gpu:
w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(my_zeros(n_hid), 'b')
v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(my_zeros(n_out), 'c')
else:
w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = shared(my_zeros(n_hid), 'b')
v = shared(my_zeros((n_hid, n_out)), 'c')
c = shared(my_zeros(n_out), 'c')
def test_GpuCrossentropySoftmax1HotWithBiasDx():
"""
This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx
x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
hid = tensor.tanh(tensor.dot(x, w)+b)
out = tensor.tanh(tensor.dot(hid, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
if 0: print 'loss type', loss.type
We check that we loop when their is too much threads
TODO: check that we loop when their is too much block(>32*1024)
"""
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
n_in = 1000
batch_size = 4097
n_out = 1250
mode = get_mode(use_gpu)
if theano.config.mode!="DEBUG_MODE":
n_in = 4098
n_out = 4099
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
x = T.fmatrix('x')
y = T.lvector('y')
if 0:
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = my_rand(n_batch, n_in)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
b = T.fvector()
W = T.fmatrix()
t0 = time.time()
rval = []
for i in xrange(n_train):
rval.append(train(xval, yval, lr))
dt = time.time() - t0
print_mode(mode)
return numpy.asarray(rval), dt
p_y_given_x = T.nnet.softmax(T.dot(x,W)+b)
y_pred = T.argmax(p_y_given_x)
loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
dW = T.grad(loss,W)
classify = theano.function( inputs = [x,y,b,W], outputs = [loss,y_pred,dW],
mode = mode_without_gpu)
classify_gpu = theano.function( inputs = [x,y,b,W], outputs = [loss,y_pred,dW],
mode = mode_with_gpu)
xx = numpy.asarray(numpy.random.rand(batch_size,n_in),dtype=numpy.float32)
yy = numpy.ones((batch_size,),dtype='float32')
b_values = numpy.zeros((n_out,),dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in,n_out),dtype='float32')
def test_run_nnet():
for n_in in 1024, 2048, 4096:
for n_hid in 1024, 2048, 4096:
numpy.random.seed(23456)
rval_cpu, tc = run_nnet(False, n_in=n_in, n_hid=n_hid)
numpy.random.seed(23456)
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
print "max abs diff:", numpy.max(numpy.absolute(rval_gpu-rval_cpu))
print "time cpu: %f, time gpu: %f, speed up %f"%(tc, tg, tc/tg)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_run_nnet_med():
numpy.random.seed(23456)
rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
numpy.random.seed(23456)
rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
n_batch = 16
n_kern = 20
shape_img = (n_batch, 1, 32, 32)
shape_kern = (n_kern, 1, 5, 5)
n_train=10
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(shape_img[2:],shape_kern[2:], 'valid')
n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
n_out = 10
w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
b = shared_fn(my_zeros((n_kern,)), 'b')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op.set_flops()
hid = tensor.tanh(conv_op(x, w)+b.dimshuffle((0,'x','x')))
hid_flat = hid.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
print 'training done'
print_mode(mode)
return rval
def test_conv_nnet1():
numpy.random.seed(23456)
rval_cpu = run_conv_nnet1(False)
numpy.random.seed(23456)
rval_gpu = run_conv_nnet1(True)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
#cumulativ rounding error affect this comparaison of result. So we lower the tolerance.
#TODO: why the last two example see the error lower? We are converging?
#n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9
#n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06
#n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05
#n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963
#n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05
#n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06
#n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05
#n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534
n_batch = 60
shape_img = (n_batch, 1, 32, 32)
n_kern = 20
shape_kern = (n_kern, 1, 5, 5)
n_kern1 = 10
shape_kern1 = (n_kern1, n_kern, 5, 5)
n_train=30
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(tuple(shape_img[2:]),tuple(shape_kern[2:]), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), tuple(shape_kern1[2:]), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
conv_op.set_flops()
conv_op1.set_flops()
hid = tensor.tanh(conv_op(x, w0)+b0.dimshuffle((0,'x','x')))
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1.dimshuffle((0,'x','x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)#int32 make all 0...
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
print_mode(mode)
return rval
def test_conv_nnet2():
numpy.random.seed(23456)
rval_gpu = run_conv_nnet2(True)
if True:
numpy.random.seed(23456)
rval_cpu = run_conv_nnet2(False)
print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_train,
downsample_ops=True, verbose=0, version=-1):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
isize1=isize
isize2=isize
if isinstance(isize,(tuple,)):
isize1=isize[0]
isize2=isize[1]
shape_img = (n_batch, 1, isize1, isize2)
n_kern = 20 # 6 were used in LeNet5
shape_kern = (n_kern, 1, ksize, ksize)
n_kern1 = 30 # 16 were used in LeNet5
shape_kern1 = (n_kern1, n_kern, ksize, ksize)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize1, isize2), (ksize, ksize), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2,
logical_hid_shape[1]/2), (ksize, ksize), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
c = shared_fn(my_zeros(n_out), 'c')
print 'ALLOCATING ARCH: w0 shape', w0.value.shape
print 'ALLOCATING ARCH: w1 shape', w1.value.shape
print 'ALLOCATING ARCH: v shape', v.value.shape
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern,
n_batch, 1, 1, verbose=verbose, version=version)
conv_op1 = conv.ConvOp(
(n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2),
shape_kern1[2:], n_kern1, n_batch, 1, 1,verbose=verbose, version=version)
conv_op.set_flops()
conv_op1.set_flops()
ds_op = downsample.DownsampleFactorMax((2,2), ignore_border=False)
if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0)+b0.dimshuffle((0,'x','x'))))
else:
hid = tensor.tanh((conv_op(x, w0)+b0.dimshuffle((0,'x','x')))[:,:,::2,::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0,'x','x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params, warn_type=True)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
assert any([isinstance(node.op,T.nnet.CrossentropySoftmaxArgmax1HotWithBias) for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias) for node in classify_gpu.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx) for node in classify_gpu.maker.env.toposort()])
if False:
for i, n in enumerate(train.maker.env.toposort()):
print i, n
out=classify(xx,yy,b_values,W_values)
gout=classify_gpu(xx,yy,b_values,W_values)
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)
lr = theano._asarray(0.01, dtype='float32')
assert numpy.allclose(out[0],gout[0])
assert numpy.allclose(out[1],gout[1])
assert numpy.allclose(out[2],gout[2],atol=2e-6)
rvals=my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode)
return rvals, t1-t0, mode
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
ignore_error=False,
n_train=10,
gpu_only=False,
cpu_only=False,
float_atol=1e-06,
check_isfinite=True,
pickle=False,
verbose=0,
version=-1):
"""
float_atol: None mean use the default value.
check_isfinite: the debug mode option. We forward this value to debug mode.
For some parameter CrossentropyCategorical1Hot op generate inf when not optimized.
def test_softmax_with_bias():
"""
if config.mode=='DEBUG_MODE': n_train=1
This is basic test for GpuSoftmaxWithBias
numpy.random.seed(seed)
import theano.tensor.basic
import theano.compile.debugmode
from theano.compile.mode import predefined_modes
orig_float32_atol = theano.tensor.basic.float32_atol
orig_check_isfinite = predefined_modes["DEBUG_MODE"].check_isfinite
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
if gpu_only:
tcn.use()
if float_atol:
print "float_atol",float_atol
theano.tensor.basic.float32_atol=float_atol
if not cpu_only:
rval_gpu, tg, gpu_mode = run_conv_nnet2_classif(True,
isize, ksize, bsize, n_train, verbose=verbose, version=version)
finally:
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
theano.tensor.basic.float32_atol=orig_float32_atol
if gpu_only:
print "time gpu: %.3f"%(tg)
return
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
numpy.random.seed(seed)
rval_cpu, tc, cpu_mode = run_conv_nnet2_classif(False, isize, ksize, bsize, n_train,
verbose=verbose, version=version)
if pickle and isinstance(cpu_mode,(theano.compile.ProfileMode,)):
import pickle
print "BEGIN GPU profile mode dump"
#print pickle.dumps(gpu_mode)
print "END GPU profile mode dump"
print "BEGIN CPU profile mode dump"
print pickle.dumps(cpu_mode)
print "END CPU profile mode dump"
finally:
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
theano.tensor.basic.float32_atol=orig_float32_atol
if not cpu_only:
if verbose or not numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol):
print "cpu:", rval_cpu
print "gpu:", rval_gpu
print "abs diff:", numpy.absolute(rval_gpu-rval_cpu)
print "time cpu: %.3f, time gpu: %.3f, speed up %f"%(tc, tg, tc/tg)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
print "estimated time for one pass through MNIST with gpu: %f" % (tg * (60000.0 / (n_train*bsize)))
else:
print "time cpu: %.3f"%(tc)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
"""
x = T.fmatrix('x')
if not ignore_error and not cpu_only and not gpu_only:
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol)
#we need to test n>32*1024 to check that we make the block loop.
n,m=2<<15,5
gpu_only=False
cpu_only=False
ignore_error=False
verbose=0
version=-1
data = numpy.arange(n*m, dtype='float32').reshape(n,m)
def test_lenet_28(): #MNIST
cmp_run_conv_nnet2_classif(23485, 28, 5, 60, n_train=10,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose, version=version)
z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:]))
def test_lenet_32(): #CIFAR10 / Shapeset
cmp_run_conv_nnet2_classif(23485, 32, 5, 60, n_train=10,
ignore_error=ignore_error, gpu_only=gpu_only,
verbose=verbose, version=version)
f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op==T.nnet.softmax_with_bias
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmaxWithBias)
out=f(data)
gout=f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout)
def test_lenet_32_long(): #CIFAR10 / Shapeset
# this tests the gradient of downsample on the GPU,
# which does not recieve specific testing
cmp_run_conv_nnet2_classif(23485, 32, 5, 30, n_train=50,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose, version=version)
def test_softmax():
"""
This is basic test for GpuSoftmax
def test_lenet_64(): # ???
#float_atol need to pass in debug mode
#needed as cpu use extended precision and gpu don't
cmp_run_conv_nnet2_classif(23485, 64, 7, 10, n_train=10,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
float_atol=5e-4, check_isfinite=True, version=version)
We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
"""
x = T.fmatrix('x')
def test_lenet_108(): # NORB
cmp_run_conv_nnet2_classif(23485, 108, 7, 5, n_train=4,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version, float_atol=7e-2)
#we need to test n>32*1024 to check that we make the block loop.
n,m=2<<15,5
def test_lenet_256(): # ImageNet
cmp_run_conv_nnet2_classif(23485, 256, 9, 2, n_train=5,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
data = numpy.arange(n*m, dtype='float32').reshape(n,m)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
def tes_lenet_hd(): #HD 720p: 1280(wid)x720(len)
cmp_run_conv_nnet2_classif(23485, (720,1280), 9, 2, n_train=3,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
z = T.nnet.softmax(x)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
def tes_lenet_full_hd(): #HD 1080p: 1920(wid)x1080(len)
cmp_run_conv_nnet2_classif(23485, (1080,1920), 9, 2, n_train=3,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op==T.nnet.softmax
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmax)
out=f(data)
gout=f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout)
......@@ -254,7 +254,9 @@ class CudaNdarrayType(Type):
return ret
def c_libraries(self):
return ['cudart']
# returning cublas because the cuda_ndarray.cuh header includes calls to SetVector and
# cublasGetError
return ['cudart', 'cublas']
def c_support_code(cls):
return ""
......
......@@ -4,7 +4,7 @@ import theano.tensor as T
from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available
from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
......@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
#return ()
return (super(GpuMultinomial,self).c_code_cache_version(),1)
return ()
#return (super(GpuMultinomial,self).c_code_cache_version(),1)
def c_support_code_apply(self, node, nodename):
return """
......@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
float * global_outs
)
{
int n = 32*blockIdx.x + threadIdx.x;
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
......@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
int nb_block;
if (nb_multi %% 32 == 0)
nb_block = nb_multi/32;
else
nb_block = (int)((float)nb_multi/32. + 1.);
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_block,1,1);
dim3 n_threads(32,1,1);
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
......@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
def use_gpu_multinomial(node):
if node.op == multinomial:
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
if theano.config.device.startswith('gpu'):
if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial)
......@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=[[.5,.5]], ndim=None, dtype='int64'):
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
"""
Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
probabilities pvals.
......@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
`size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc.
todo : adapt multinomial to take that into account
"""
if pvals is None:
raise TypeError("You have to specify pvals")
pvals = as_tensor_variable(pvals)
if n == 1 and pvals.ndim == 2:
pvals = as_tensor_variable(pvals)
unis = self.uniform(size=pvals.shape[0:1], ndim=1)
return cast(multinomial(pvals.T, unis).T, dtype)
else:
raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
......
......@@ -345,7 +345,7 @@ def test_uniform():
#print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU with size=(%s):'%str(size)
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -403,7 +403,7 @@ def test_binomial():
print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input, allow_01=True, target_avg = mean)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean)
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -450,7 +450,7 @@ def test_normal0():
# now with odd number of samples
sample_size = (sample_size[0],sample_size[1]-1)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -465,7 +465,7 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----'
sys.stdout.flush()
basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print ''
......@@ -528,6 +528,7 @@ def test_multinomial():
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
pvals = numpy.asarray(pvals, dtype='float32')
n = R.multinomial(pvals=pvals, dtype='float32')
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论