提交 85447abe authored 作者: James Bergstra's avatar James Bergstra

merge

...@@ -52,6 +52,10 @@ Community ...@@ -52,6 +52,10 @@ Community
* Register and post to `theano-dev`_ if you want to talk to the developers. * Register and post to `theano-dev`_ if you want to talk to the developers.
* Register and post to `theano-announce`_ if you want to be kept informed of important changes to Theano (low volume).
* Register and post to `theano-buildbot`_ if you want to receive our daily buildbot email.
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__ * We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_. * Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
...@@ -77,6 +81,8 @@ Community ...@@ -77,6 +81,8 @@ Community
.. _theano-dev: http://groups.google.com/group/theano-dev .. _theano-dev: http://groups.google.com/group/theano-dev
.. _theano-users: http://groups.google.com/group/theano-users .. _theano-users: http://groups.google.com/group/theano-users
.. _theano-announce: http://groups.google.com/group/theano-announce
.. _theano-buildbot: http://groups.google.com/group/theano-buildbot
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority .. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _LISA: http://www.iro.umontreal.ca/~lisa .. _LISA: http://www.iro.umontreal.ca/~lisa
......
...@@ -188,7 +188,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -188,7 +188,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
def make_node(self, dy, sm, y_idx): def make_node(self, dy, sm, y_idx):
return Apply(self, [dy, sm, y_idx],[sm.type()]) return Apply(self, [dy, sm, y_idx],[sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
#return () #return ()
def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub): def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub):
fail = sub['fail'] fail = sub['fail']
...@@ -229,7 +229,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -229,7 +229,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<< <<<
CudaNdarray_HOST_DIMS(%(dx)s)[0], CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1] std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>( >>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0], CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1], CudaNdarray_HOST_DIMS(%(dx)s)[1],
...@@ -303,7 +303,7 @@ class GpuSoftmax (Op): ...@@ -303,7 +303,7 @@ class GpuSoftmax (Op):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (1,) + inline_softmax.code_version return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,), (z,), sub): def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail'] fail = sub['fail']
return """ return """
...@@ -330,7 +330,7 @@ class GpuSoftmax (Op): ...@@ -330,7 +330,7 @@ class GpuSoftmax (Op):
kSoftmax_%(nodename)s kSoftmax_%(nodename)s
<<< <<<
// todo: cap these at the card limits, implement loops in kernel // todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float) CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>( >>>(
...@@ -362,11 +362,14 @@ class GpuSoftmax (Op): ...@@ -362,11 +362,14 @@ class GpuSoftmax (Op):
body=[ body=[
"extern __shared__ float buf[]", "extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]", "for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]", "buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'), inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]" "sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
]) ])
...@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (1,) + inline_softmax.code_version return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,b), (z,), sub): def c_code(self, node, nodename, (x,b), (z,), sub):
fail = sub['fail'] fail = sub['fail']
...@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op):
kSoftmaxWithBias_%(nodename)s kSoftmaxWithBias_%(nodename)s
<<< <<<
// todo: cap these at the card limits, implement loops in kernel // todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float) CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>( >>>(
...@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op): ...@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op):
body=[ body=[
"extern __shared__ float buf[]", "extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]", "for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]", "buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]", "buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'), inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]" "sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
]) ])
#for (int i = blockIdx.x; i < N; i += gridDim.x)
import sys, time
import theano
from theano.compile.sharedvalue import shared
from theano.compile.pfunc import pfunc
from theano import tensor
import theano.tensor.nnet
from theano import config
import theano.tensor.nnet.conv as conv
import theano.tensor.signal.downsample as downsample
import numpy
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
raise SkipTest('SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!')
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as tcn
import logging
logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO)
def my_rand(*shape):
    """Return a float32 array of ``numpy.random.rand`` samples.

    The dimensions are passed as separate positional arguments and are
    forwarded unchanged to ``numpy.random.rand``.
    """
    samples = numpy.random.rand(*shape)
    return theano._asarray(samples, dtype='float32')
def my_randn(*shape):
    """Return a float32 array of ``numpy.random.randn`` samples.

    The dimensions are passed as separate positional arguments and are
    forwarded unchanged to ``numpy.random.randn``.
    """
    samples = numpy.random.randn(*shape)
    return theano._asarray(samples, dtype='float32')
def my_zeros(*shape):
    """Return a float32 array of zeros.

    The shape may be given as a single int (``my_zeros(3)``), as a single
    tuple (``my_zeros((3, 4))``) or, like ``my_rand``, as separate
    dimensions (``my_zeros(3, 4)``).
    """
    # Forwarding ``*shape`` directly would hand the second dimension to
    # numpy.zeros as its ``dtype`` argument, so multi-argument calls are
    # packed into one shape tuple.  A lone argument (int or tuple) is already
    # a valid numpy shape and is passed through untouched.
    dims = shape[0] if len(shape) == 1 else shape
    return theano._asarray(numpy.zeros(dims), dtype='float32')
def get_mode(use_gpu):
    """Return the default compilation mode with 'gpu' included or excluded.

    When the default mode is a ProfileMode, a new ProfileMode instance is
    created and used instead of the default one.

    :param use_gpu: if true the returned mode includes 'gpu', otherwise it
        excludes it.
    """
    base = theano.compile.get_default_mode()
    if isinstance(base, theano.compile.ProfileMode):
        base = theano.compile.ProfileMode()
    return base.including('gpu') if use_gpu else base.excluding('gpu')
def print_mode(mode):
    """Print the summary of *mode* when it is a ProfileMode; else do nothing."""
    # isinstance() is False for None, so None is rejected here as well.
    if not isinstance(mode, theano.compile.ProfileMode):
        return
    mode.print_summary()
def print_diff_mode(a, b):
    """Print the diff summary of *a* against *b* when both are ProfileModes."""
    profile_cls = theano.compile.ProfileMode
    # isinstance() is False for None, so a None *a* is rejected here as well.
    both_profiled = isinstance(a, profile_cls) and isinstance(b, profile_cls)
    if both_profiled:
        a.print_diff_summary(b)
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_train=100):
if config.mode=='DEBUG_MODE': n_train=1
if use_gpu:
w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(my_zeros(n_hid), 'b')
v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(my_zeros(n_out), 'c')
else:
w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = shared(my_zeros(n_hid), 'b')
v = shared(my_zeros((n_hid, n_out)), 'c')
c = shared(my_zeros(n_out), 'c')
x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
hid = tensor.tanh(tensor.dot(x, w)+b)
out = tensor.tanh(tensor.dot(hid, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
if 0: print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
if 0:
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = my_rand(n_batch, n_in)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
t0 = time.time()
rval = []
for i in xrange(n_train):
rval.append(train(xval, yval, lr))
dt = time.time() - t0
print_mode(mode)
return numpy.asarray(rval), dt
def test_run_nnet():
for n_in in 1024, 2048, 4096:
for n_hid in 1024, 2048, 4096:
numpy.random.seed(23456)
rval_cpu, tc = run_nnet(False, n_in=n_in, n_hid=n_hid)
numpy.random.seed(23456)
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
print "max abs diff:", numpy.max(numpy.absolute(rval_gpu-rval_cpu))
print "time cpu: %f, time gpu: %f, speed up %f"%(tc, tg, tc/tg)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_run_nnet_med():
    """Run the medium-sized network on the CPU path for 10000 steps."""
    numpy.random.seed(23456)
    # The result is not checked; the test only exercises the run.
    run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
    """Run the small network on the CPU path for 100000 steps."""
    numpy.random.seed(23456)
    # The result is not checked; the test only exercises the run.
    run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
n_batch = 16
n_kern = 20
shape_img = (n_batch, 1, 32, 32)
shape_kern = (n_kern, 1, 5, 5)
n_train=10
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(shape_img[2:],shape_kern[2:], 'valid')
n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
n_out = 10
w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
b = shared_fn(my_zeros((n_kern,)), 'b')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op.set_flops()
hid = tensor.tanh(conv_op(x, w)+b.dimshuffle((0,'x','x')))
hid_flat = hid.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
print 'training done'
print_mode(mode)
return rval
def test_conv_nnet1():
    """Check that run_conv_nnet1 gives matching results on CPU and GPU."""
    results = {}
    # CPU first, then GPU, each from the same seed.
    for use_gpu in (False, True):
        numpy.random.seed(23456)
        results[use_gpu] = run_conv_nnet1(use_gpu)
    assert numpy.allclose(results[False], results[True], rtol=1e-4, atol=1e-6)
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
#cumulativ rounding error affect this comparaison of result. So we lower the tolerance.
#TODO: why the last two example see the error lower? We are converging?
#n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9
#n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06
#n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05
#n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963
#n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05
#n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06
#n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05
#n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534
n_batch = 60
shape_img = (n_batch, 1, 32, 32)
n_kern = 20
shape_kern = (n_kern, 1, 5, 5)
n_kern1 = 10
shape_kern1 = (n_kern1, n_kern, 5, 5)
n_train=30
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(tuple(shape_img[2:]),tuple(shape_kern[2:]), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), tuple(shape_kern1[2:]), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
conv_op.set_flops()
conv_op1.set_flops()
hid = tensor.tanh(conv_op(x, w0)+b0.dimshuffle((0,'x','x')))
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1.dimshuffle((0,'x','x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)#int32 make all 0...
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
print_mode(mode)
return rval
def test_conv_nnet2():
numpy.random.seed(23456)
rval_gpu = run_conv_nnet2(True)
if True:
numpy.random.seed(23456)
rval_cpu = run_conv_nnet2(False)
print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_train,
downsample_ops=True, verbose=0, version=-1):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
isize1=isize
isize2=isize
if isinstance(isize,(tuple,)):
isize1=isize[0]
isize2=isize[1]
shape_img = (n_batch, 1, isize1, isize2)
n_kern = 20 # 6 were used in LeNet5
shape_kern = (n_kern, 1, ksize, ksize)
n_kern1 = 30 # 16 were used in LeNet5
shape_kern1 = (n_kern1, n_kern, ksize, ksize)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize1, isize2), (ksize, ksize), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2,
logical_hid_shape[1]/2), (ksize, ksize), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
c = shared_fn(my_zeros(n_out), 'c')
print 'ALLOCATING ARCH: w0 shape', w0.value.shape
print 'ALLOCATING ARCH: w1 shape', w1.value.shape
print 'ALLOCATING ARCH: v shape', v.value.shape
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern,
n_batch, 1, 1, verbose=verbose, version=version)
conv_op1 = conv.ConvOp(
(n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2),
shape_kern1[2:], n_kern1, n_batch, 1, 1,verbose=verbose, version=version)
conv_op.set_flops()
conv_op1.set_flops()
ds_op = downsample.DownsampleFactorMax((2,2), ignore_border=False)
if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0)+b0.dimshuffle((0,'x','x'))))
else:
hid = tensor.tanh((conv_op(x, w0)+b0.dimshuffle((0,'x','x')))[:,:,::2,::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0,'x','x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params, warn_type=True)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
if False:
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)
lr = theano._asarray(0.01, dtype='float32')
rvals=my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode)
return rvals, t1-t0, mode
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
ignore_error=False,
n_train=10,
gpu_only=False,
cpu_only=False,
float_atol=1e-06,
check_isfinite=True,
pickle=False,
verbose=0,
version=-1):
"""
float_atol: None mean use the default value.
check_isfinite: the debug mode option. We forward this value to debug mode.
For some parameter CrossentropyCategorical1Hot op generate inf when not optimized.
"""
if config.mode=='DEBUG_MODE': n_train=1
numpy.random.seed(seed)
import theano.tensor.basic
import theano.compile.debugmode
from theano.compile.mode import predefined_modes
orig_float32_atol = theano.tensor.basic.float32_atol
orig_check_isfinite = predefined_modes["DEBUG_MODE"].check_isfinite
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
if gpu_only:
tcn.use()
if float_atol:
print "float_atol",float_atol
theano.tensor.basic.float32_atol=float_atol
if not cpu_only:
rval_gpu, tg, gpu_mode = run_conv_nnet2_classif(True,
isize, ksize, bsize, n_train, verbose=verbose, version=version)
finally:
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
theano.tensor.basic.float32_atol=orig_float32_atol
if gpu_only:
print "time gpu: %.3f"%(tg)
return
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
numpy.random.seed(seed)
rval_cpu, tc, cpu_mode = run_conv_nnet2_classif(False, isize, ksize, bsize, n_train,
verbose=verbose, version=version)
if pickle and isinstance(cpu_mode,(theano.compile.ProfileMode,)):
import pickle
print "BEGIN GPU profile mode dump"
#print pickle.dumps(gpu_mode)
print "END GPU profile mode dump"
print "BEGIN CPU profile mode dump"
print pickle.dumps(cpu_mode)
print "END CPU profile mode dump"
finally:
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
theano.tensor.basic.float32_atol=orig_float32_atol
if not cpu_only:
if verbose or not numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol):
print "cpu:", rval_cpu
print "gpu:", rval_gpu
print "abs diff:", numpy.absolute(rval_gpu-rval_cpu)
print "time cpu: %.3f, time gpu: %.3f, speed up %f"%(tc, tg, tc/tg)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
print "estimated time for one pass through MNIST with gpu: %f" % (tg * (60000.0 / (n_train*bsize)))
else:
print "time cpu: %.3f"%(tc)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
if not ignore_error and not cpu_only and not gpu_only:
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol)
# Module-level switches that the test_lenet_* / tes_lenet_* functions in this
# file forward to cmp_run_conv_nnet2_classif as keyword arguments.
gpu_only=False
cpu_only=False
ignore_error=False
verbose=0
version=-1
def test_lenet_28(): #MNIST
    """Compare CPU and GPU training on 28x28 images, 5x5 kernels, batch 60."""
    options = dict(n_train=10,
                   ignore_error=ignore_error,
                   gpu_only=gpu_only,
                   cpu_only=cpu_only,
                   verbose=verbose,
                   version=version)
    cmp_run_conv_nnet2_classif(23485, 28, 5, 60, **options)
def test_lenet_32(): #CIFAR10 / Shapeset
    """Compare CPU and GPU training on 32x32 images, 5x5 kernels, batch 60."""
    # cpu_only is forwarded like in every other test_lenet_* function, so the
    # module-level switch applies to this test too.
    cmp_run_conv_nnet2_classif(23485, 32, 5, 60, n_train=10,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               version=version)
def test_lenet_32_long(): #CIFAR10 / Shapeset
    """Longer run (50 steps) on 32x32 images, 5x5 kernels, batch 30."""
    # This tests the gradient of downsample on the GPU,
    # which does not receive specific testing.
    options = dict(n_train=50,
                   ignore_error=ignore_error,
                   gpu_only=gpu_only,
                   cpu_only=cpu_only,
                   verbose=verbose,
                   version=version)
    cmp_run_conv_nnet2_classif(23485, 32, 5, 30, **options)
def test_lenet_64(): # ???
    """Compare CPU and GPU training on 64x64 images, 7x7 kernels, batch 10."""
    # A looser float_atol is needed to pass in debug mode, because the CPU
    # uses extended precision and the GPU does not.
    options = dict(n_train=10,
                   ignore_error=ignore_error,
                   gpu_only=gpu_only,
                   cpu_only=cpu_only,
                   verbose=verbose,
                   float_atol=5e-4,
                   check_isfinite=True,
                   version=version)
    cmp_run_conv_nnet2_classif(23485, 64, 7, 10, **options)
def test_lenet_108(): # NORB
    """Compare CPU and GPU training on 108x108 images, 7x7 kernels, batch 5."""
    options = dict(n_train=4,
                   ignore_error=ignore_error,
                   gpu_only=gpu_only,
                   cpu_only=cpu_only,
                   verbose=verbose,
                   check_isfinite=True,
                   version=version,
                   float_atol=7e-2)
    cmp_run_conv_nnet2_classif(23485, 108, 7, 5, **options)
def test_lenet_256(): # ImageNet
    """Compare CPU and GPU training on 256x256 images, 9x9 kernels, batch 2."""
    options = dict(n_train=5,
                   ignore_error=ignore_error,
                   gpu_only=gpu_only,
                   cpu_only=cpu_only,
                   verbose=verbose,
                   check_isfinite=True,
                   version=version)
    cmp_run_conv_nnet2_classif(23485, 256, 9, 2, **options)
# The name is deliberately misspelled ("tes_" instead of "test_") so that this test is not executed automatically for now, as it does not work yet.
def tes_lenet_hd(): #HD 720p: 1280(wid)x720(len)
    """Compare CPU and GPU training on (720, 1280) images, 9x9 kernels."""
    image_size = (720, 1280)
    options = dict(n_train=3,
                   ignore_error=ignore_error,
                   gpu_only=gpu_only,
                   cpu_only=cpu_only,
                   verbose=verbose,
                   check_isfinite=True,
                   version=version)
    cmp_run_conv_nnet2_classif(23485, image_size, 9, 2, **options)
# The name is deliberately misspelled ("tes_" instead of "test_") so that this test is not executed automatically for now, as it does not work yet.
def tes_lenet_full_hd(): #HD 1080p: 1920(wid)x1080(len)
    """Compare CPU and GPU training on (1080, 1920) images, 9x9 kernels."""
    image_size = (1080, 1920)
    options = dict(n_train=3,
                   ignore_error=ignore_error,
                   gpu_only=gpu_only,
                   cpu_only=cpu_only,
                   verbose=verbose,
                   check_isfinite=True,
                   version=version)
    cmp_run_conv_nnet2_classif(23485, image_size, 9, 2, **options)
import sys, time import theano, numpy
import theano import theano.tensor as T
from theano.compile.sharedvalue import shared
from theano.compile.pfunc import pfunc
from theano import tensor
import theano.tensor.nnet
from theano import config
import theano.tensor.nnet.conv as conv
import theano.tensor.signal.downsample as downsample
import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
raise SkipTest('SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!') import theano.sandbox.cuda as cuda
import theano.sandbox.cuda as cuda_ndarray if cuda.cuda_available == False:
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as tcn if theano.config.mode=='FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
import logging mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO) else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def my_randn(*shape):
return theano._asarray(numpy.random.randn(*shape),dtype='float32')
def my_zeros(*shape):
return theano._asarray(numpy.zeros(*shape),dtype='float32')
def get_mode(use_gpu):
ret = theano.compile.get_default_mode()
if isinstance(ret, theano.compile.ProfileMode):
ret = theano.compile.ProfileMode()
if use_gpu:
ret = ret.including('gpu')
else:
ret = ret.excluding('gpu')
return ret
def print_mode(mode):
if mode != None and isinstance(mode,(theano.compile.ProfileMode,)):
mode.print_summary()
def print_diff_mode(a,b):
if a != None and isinstance(a,(theano.compile.ProfileMode,)) and isinstance(b,(theano.compile.ProfileMode,)):
a.print_diff_summary(b)
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_train=100):
if config.mode=='DEBUG_MODE': n_train=1
if use_gpu:
w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(my_zeros(n_hid), 'b')
v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(my_zeros(n_out), 'c')
else:
w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = shared(my_zeros(n_hid), 'b')
v = shared(my_zeros((n_hid, n_out)), 'c')
c = shared(my_zeros(n_out), 'c')
x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
hid = tensor.tanh(tensor.dot(x, w)+b)
out = tensor.tanh(tensor.dot(hid, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
if 0: print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
if 0:
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = my_rand(n_batch, n_in)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
t0 = time.time()
rval = []
for i in xrange(n_train):
rval.append(train(xval, yval, lr))
dt = time.time() - t0
print_mode(mode)
return numpy.asarray(rval), dt
def test_run_nnet():
for n_in in 1024, 2048, 4096:
for n_hid in 1024, 2048, 4096:
numpy.random.seed(23456)
rval_cpu, tc = run_nnet(False, n_in=n_in, n_hid=n_hid)
numpy.random.seed(23456)
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
print "max abs diff:", numpy.max(numpy.absolute(rval_gpu-rval_cpu))
print "time cpu: %f, time gpu: %f, speed up %f"%(tc, tg, tc/tg)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_run_nnet_med():
numpy.random.seed(23456)
rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
numpy.random.seed(23456)
rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
n_batch = 16
n_kern = 20
shape_img = (n_batch, 1, 32, 32)
shape_kern = (n_kern, 1, 5, 5)
n_train=10
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(shape_img[2:],shape_kern[2:], 'valid')
n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
n_out = 10
w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
b = shared_fn(my_zeros((n_kern,)), 'b')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op.set_flops()
hid = tensor.tanh(conv_op(x, w)+b.dimshuffle((0,'x','x')))
hid_flat = hid.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train): def test_GpuCrossentropySoftmax1HotWithBiasDx():
rval = train(xval, yval, lr) """
print 'training done' This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx
print_mode(mode)
return rval
def test_conv_nnet1():
numpy.random.seed(23456)
rval_cpu = run_conv_nnet1(False)
numpy.random.seed(23456)
rval_gpu = run_conv_nnet1(True)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
#cumulativ rounding error affect this comparaison of result. So we lower the tolerance.
#TODO: why the last two example see the error lower? We are converging?
#n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9
#n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06
#n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05
#n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963
#n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05
#n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06
#n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05
#n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534
n_batch = 60
shape_img = (n_batch, 1, 32, 32)
n_kern = 20
shape_kern = (n_kern, 1, 5, 5)
n_kern1 = 10
shape_kern1 = (n_kern1, n_kern, 5, 5)
n_train=30
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(tuple(shape_img[2:]),tuple(shape_kern[2:]), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), tuple(shape_kern1[2:]), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
conv_op.set_flops()
conv_op1.set_flops()
hid = tensor.tanh(conv_op(x, w0)+b0.dimshuffle((0,'x','x')))
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1.dimshuffle((0,'x','x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)#int32 make all 0...
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
print_mode(mode)
return rval
def test_conv_nnet2():
numpy.random.seed(23456)
rval_gpu = run_conv_nnet2(True)
if True:
numpy.random.seed(23456)
rval_cpu = run_conv_nnet2(False)
print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_train,
downsample_ops=True, verbose=0, version=-1):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
isize1=isize
isize2=isize
if isinstance(isize,(tuple,)):
isize1=isize[0]
isize2=isize[1]
shape_img = (n_batch, 1, isize1, isize2)
n_kern = 20 # 6 were used in LeNet5
shape_kern = (n_kern, 1, ksize, ksize)
n_kern1 = 30 # 16 were used in LeNet5
shape_kern1 = (n_kern1, n_kern, ksize, ksize)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize1, isize2), (ksize, ksize), 'valid') We check that we loop when their is too much threads
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, TODO: check that we loop when their is too much block(>32*1024)
logical_hid_shape[1]/2), (ksize, ksize), 'valid') """
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
n_in = 1000
batch_size = 4097
n_out = 1250
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0') if theano.config.mode!="DEBUG_MODE":
b0 = shared_fn(my_zeros((n_kern,)), 'b0') n_in = 4098
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1') n_out = 4099
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
c = shared_fn(my_zeros(n_out), 'c')
print 'ALLOCATING ARCH: w0 shape', w0.value.shape x = T.fmatrix('x')
print 'ALLOCATING ARCH: w1 shape', w1.value.shape y = T.lvector('y')
print 'ALLOCATING ARCH: v shape', v.value.shape
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, b = T.fvector()
n_batch, 1, 1, verbose=verbose, version=version) W = T.fmatrix()
conv_op1 = conv.ConvOp(
(n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2),
shape_kern1[2:], n_kern1, n_batch, 1, 1,verbose=verbose, version=version)
conv_op.set_flops()
conv_op1.set_flops()
ds_op = downsample.DownsampleFactorMax((2,2), ignore_border=False) p_y_given_x = T.nnet.softmax(T.dot(x,W)+b)
if downsample_ops: y_pred = T.argmax(p_y_given_x)
hid = tensor.tanh(ds_op(conv_op(x, w0)+b0.dimshuffle((0,'x','x')))) loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
else: dW = T.grad(loss,W)
hid = tensor.tanh((conv_op(x, w0)+b0.dimshuffle((0,'x','x')))[:,:,::2,::2]) classify = theano.function( inputs = [x,y,b,W], outputs = [loss,y_pred,dW],
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0,'x','x'))) mode = mode_without_gpu)
hid_flat = hid1.reshape((n_batch, n_hid)) classify_gpu = theano.function( inputs = [x,y,b,W], outputs = [loss,y_pred,dW],
out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c) mode = mode_with_gpu)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c] xx = numpy.asarray(numpy.random.rand(batch_size,n_in),dtype=numpy.float32)
gparams = tensor.grad(loss, params, warn_type=True) yy = numpy.ones((batch_size,),dtype='float32')
b_values = numpy.zeros((n_out,),dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in,n_out),dtype='float32')
mode = get_mode(use_gpu)
print 'building pfunc ...' assert any([isinstance(node.op,T.nnet.CrossentropySoftmaxArgmax1HotWithBias) for node in classify.maker.env.toposort()])
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)]) assert any([isinstance(node.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias) for node in classify_gpu.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx) for node in classify_gpu.maker.env.toposort()])
if False: out=classify(xx,yy,b_values,W_values)
for i, n in enumerate(train.maker.env.toposort()): gout=classify_gpu(xx,yy,b_values,W_values)
print i, n
xval = my_rand(*shape_img) assert numpy.allclose(out[0],gout[0])
yval = my_rand(n_batch,n_out) assert numpy.allclose(out[1],gout[1])
lr = theano._asarray(0.01, dtype='float32') assert numpy.allclose(out[2],gout[2],atol=2e-6)
rvals=my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode)
return rvals, t1-t0, mode
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, def test_softmax_with_bias():
ignore_error=False,
n_train=10,
gpu_only=False,
cpu_only=False,
float_atol=1e-06,
check_isfinite=True,
pickle=False,
verbose=0,
version=-1):
""" """
float_atol: None mean use the default value. This is basic test for GpuSoftmaxWithBias
check_isfinite: the debug mode option. We forward this value to debug mode.
For some parameter CrossentropyCategorical1Hot op generate inf when not optimized.
"""
if config.mode=='DEBUG_MODE': n_train=1
numpy.random.seed(seed)
import theano.tensor.basic
import theano.compile.debugmode
from theano.compile.mode import predefined_modes
orig_float32_atol = theano.tensor.basic.float32_atol
orig_check_isfinite = predefined_modes["DEBUG_MODE"].check_isfinite
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
if gpu_only:
tcn.use()
if float_atol:
print "float_atol",float_atol
theano.tensor.basic.float32_atol=float_atol
if not cpu_only:
rval_gpu, tg, gpu_mode = run_conv_nnet2_classif(True,
isize, ksize, bsize, n_train, verbose=verbose, version=version)
finally:
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
theano.tensor.basic.float32_atol=orig_float32_atol
if gpu_only: We check that we loop when their is too much block
print "time gpu: %.3f"%(tg) TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
return """
x = T.fmatrix('x')
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
numpy.random.seed(seed)
rval_cpu, tc, cpu_mode = run_conv_nnet2_classif(False, isize, ksize, bsize, n_train,
verbose=verbose, version=version)
if pickle and isinstance(cpu_mode,(theano.compile.ProfileMode,)):
import pickle
print "BEGIN GPU profile mode dump"
#print pickle.dumps(gpu_mode)
print "END GPU profile mode dump"
print "BEGIN CPU profile mode dump"
print pickle.dumps(cpu_mode)
print "END CPU profile mode dump"
finally: #we need to test n>32*1024 to check that we make the block loop.
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite n,m=2<<15,5
theano.tensor.basic.float32_atol=orig_float32_atol
if not cpu_only: data = numpy.arange(n*m, dtype='float32').reshape(n,m)
if verbose or not numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol):
print "cpu:", rval_cpu
print "gpu:", rval_gpu
print "abs diff:", numpy.absolute(rval_gpu-rval_cpu)
print "time cpu: %.3f, time gpu: %.3f, speed up %f"%(tc, tg, tc/tg)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
print "estimated time for one pass through MNIST with gpu: %f" % (tg * (60000.0 / (n_train*bsize)))
else:
print "time cpu: %.3f"%(tc)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
if not ignore_error and not cpu_only and not gpu_only: z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:]))
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol)
gpu_only=False f = theano.function([x],z, mode=mode_without_gpu)
cpu_only=False f_gpu = theano.function([x],z, mode=mode_with_gpu)
ignore_error=False assert f.maker.env.toposort()[-1].op==T.nnet.softmax_with_bias
verbose=0 assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmaxWithBias)
version=-1
def test_lenet_28(): #MNIST out=f(data)
cmp_run_conv_nnet2_classif(23485, 28, 5, 60, n_train=10, gout=f_gpu(data)
ignore_error=ignore_error, gpu_only=gpu_only, assert numpy.allclose(out,gout),numpy.absolute(out-gout)
cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_32(): #CIFAR10 / Shapeset def test_softmax():
cmp_run_conv_nnet2_classif(23485, 32, 5, 60, n_train=10, """
ignore_error=ignore_error, gpu_only=gpu_only, This is basic test for GpuSoftmax
verbose=verbose, version=version)
def test_lenet_32_long(): #CIFAR10 / Shapeset We check that we loop when their is too much block
# this tests the gradient of downsample on the GPU, TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
# which does not recieve specific testing """
cmp_run_conv_nnet2_classif(23485, 32, 5, 30, n_train=50, x = T.fmatrix('x')
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_64(): # ??? #we need to test n>32*1024 to check that we make the block loop.
#float_atol need to pass in debug mode n,m=2<<15,5
#needed as cpu use extended precision and gpu don't
cmp_run_conv_nnet2_classif(23485, 64, 7, 10, n_train=10,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
float_atol=5e-4, check_isfinite=True, version=version)
def test_lenet_108(): # NORB data = numpy.arange(n*m, dtype='float32').reshape(n,m)
cmp_run_conv_nnet2_classif(23485, 108, 7, 5, n_train=4,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version, float_atol=7e-2)
def test_lenet_256(): # ImageNet z = T.nnet.softmax(x)
cmp_run_conv_nnet2_classif(23485, 256, 9, 2, n_train=5,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work f = theano.function([x],z, mode=mode_without_gpu)
def tes_lenet_hd(): #HD 720p: 1280(wid)x720(len) f_gpu = theano.function([x],z, mode=mode_with_gpu)
cmp_run_conv_nnet2_classif(23485, (720,1280), 9, 2, n_train=3, assert f.maker.env.toposort()[-1].op==T.nnet.softmax
ignore_error=ignore_error, gpu_only=gpu_only, assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmax)
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work out=f(data)
def tes_lenet_full_hd(): #HD 1080p: 1920(wid)x1080(len) gout=f_gpu(data)
cmp_run_conv_nnet2_classif(23485, (1080,1920), 9, 2, n_train=3, assert numpy.allclose(out,gout),numpy.absolute(out-gout)
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
...@@ -254,7 +254,9 @@ class CudaNdarrayType(Type): ...@@ -254,7 +254,9 @@ class CudaNdarrayType(Type):
return ret return ret
def c_libraries(self): def c_libraries(self):
return ['cudart'] # returning cublas because the cuda_ndarray.cuh header includes calls to SetVector and
# cublasGetError
return ['cudart', 'cublas']
def c_support_code(cls): def c_support_code(cls):
return "" return ""
......
...@@ -4,7 +4,7 @@ import theano.tensor as T ...@@ -4,7 +4,7 @@ import theano.tensor as T
from theano.tensor.opt import register_specialize from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
...@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial): ...@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals) raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType): if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis) raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()]) return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return ()
return (super(GpuMultinomial,self).c_code_cache_version(),1) #return (super(GpuMultinomial,self).c_code_cache_version(),1)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
...@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial): ...@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
float * global_outs float * global_outs
) )
{ {
int n = 32*blockIdx.x + threadIdx.x; int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi) if (n < nb_multi)
{ {
...@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial): ...@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0]; int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1]; int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
int nb_block; //TODO : change this for a beautiful constant
if (nb_multi %% 32 == 0) int max_nb_blocks = 2<<15 - 1;
nb_block = nb_multi/32; int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else else
nb_block = (int)((float)nb_multi/32. + 1.); nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_block,1,1); dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(32,1,1); dim3 n_threads(nb_threads,1,1);
int n_shared = 0; int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
...@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial() ...@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
def use_gpu_multinomial(node): def use_gpu_multinomial(node):
if node.op == multinomial: if node.op == multinomial:
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))] return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
if theano.config.device.startswith('gpu'): if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial) register_specialize(use_gpu_multinomial)
...@@ -685,7 +685,7 @@ class MRG_RandomStreams(object): ...@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
else: else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1") raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=[[.5,.5]], ndim=None, dtype='int64'): def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
""" """
Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
probabilities pvals. probabilities pvals.
...@@ -696,13 +696,12 @@ class MRG_RandomStreams(object): ...@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
`size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc. `size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc.
todo : adapt multinomial to take that into account todo : adapt multinomial to take that into account
""" """
if pvals is None:
raise TypeError("You have to specify pvals")
pvals = as_tensor_variable(pvals) pvals = as_tensor_variable(pvals)
if n == 1 and pvals.ndim == 2: if n == 1 and pvals.ndim == 2:
pvals = as_tensor_variable(pvals)
unis = self.uniform(size=pvals.shape[0:1], ndim=1) unis = self.uniform(size=pvals.shape[0:1], ndim=1)
return cast(multinomial(pvals.T, unis).T, dtype) return cast(multinomial(pvals.T, unis).T, dtype)
else: else:
raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2") raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
......
...@@ -345,7 +345,7 @@ def test_uniform(): ...@@ -345,7 +345,7 @@ def test_uniform():
#print 'random?[-1,-10:]\n', out[-1,-10:] #print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input) basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input)
if mode!='FAST_COMPILE': if mode!='FAST_COMPILE' and cuda_available:
print '' print ''
print 'ON GPU with size=(%s):'%str(size) print 'ON GPU with size=(%s):'%str(size)
R = MRG_RandomStreams(234, use_cuda=True) R = MRG_RandomStreams(234, use_cuda=True)
...@@ -403,7 +403,7 @@ def test_binomial(): ...@@ -403,7 +403,7 @@ def test_binomial():
print 'random?[-1,-10:]\n', out[-1,-10:] print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input, allow_01=True, target_avg = mean) basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input, allow_01=True, target_avg = mean)
if mode!='FAST_COMPILE': if mode!='FAST_COMPILE' and cuda_available:
print '' print ''
print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean) print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean)
R = MRG_RandomStreams(234, use_cuda=True) R = MRG_RandomStreams(234, use_cuda=True)
...@@ -450,7 +450,7 @@ def test_normal0(): ...@@ -450,7 +450,7 @@ def test_normal0():
# now with odd number of samples # now with odd number of samples
sample_size = (sample_size[0],sample_size[1]-1) sample_size = (sample_size[0],sample_size[1]-1)
if mode!='FAST_COMPILE': if mode!='FAST_COMPILE' and cuda_available:
print '' print ''
print 'ON GPU:' print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True) R = MRG_RandomStreams(234, use_cuda=True)
...@@ -465,7 +465,7 @@ def test_normal0(): ...@@ -465,7 +465,7 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10] print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----' print '----'
sys.stdout.flush() sys.stdout.flush()
basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True) basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print '' print ''
...@@ -528,6 +528,7 @@ def test_multinomial(): ...@@ -528,6 +528,7 @@ def test_multinomial():
print '' print ''
print 'ON GPU:' print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True) R = MRG_RandomStreams(234, use_cuda=True)
pvals = numpy.asarray(pvals, dtype='float32')
n = R.multinomial(pvals=pvals, dtype='float32') n = R.multinomial(pvals=pvals, dtype='float32')
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out( f = theano.function([], theano.Out(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论