提交 168c502e authored 作者: Pascal Lamblin's avatar Pascal Lamblin

merge

...@@ -182,14 +182,14 @@ class ProfileMode(Mode): ...@@ -182,14 +182,14 @@ class ProfileMode(Mode):
print 'local_time %fs (Time spent running thunks)'% local_time print 'local_time %fs (Time spent running thunks)'% local_time
if print_apply: if print_apply:
print 'Apply-wise summary: <% of local_time spent at this position> <total of local_time spent at this position> <nb_call> <Apply position> <Apply Op name>' print 'Apply-wise summary: <% of local_time spent at this position> <cumulative seconds> <apply time> <time per call> <nb_call> <Apply position> <Apply Op name>'
atimes = [(t/local_time, t, (a[0], str(a[1])), apply_call[a]) for a, t in apply_time.items()] atimes = [(t/local_time, t, (a[0], str(a[1])), apply_call[a]) for a, t in apply_time.items()]
atimes.sort() atimes.sort()
atimes.reverse() atimes.reverse()
tot=0 tot=0
for f,t,a,nb_call in atimes[:n_apply_to_print]: for f,t,a,nb_call in atimes[:n_apply_to_print]:
tot+=t tot+=t
print ' %4.1f%% %.3fs %.3fs %i %i %s' % (f*100, tot, t, nb_call, a[0], a[1]) print ' %4.1f%% %.3fs %.3fs %.2es %i %i %s' % (f*100, tot, t, t/nb_call,nb_call, a[0], a[1])
print ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\ print ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-n_apply_to_print), %(max(0, len(atimes)-n_apply_to_print),
sum(f for f, t, a, nb_call in atimes[n_apply_to_print:])*100, sum(f for f, t, a, nb_call in atimes[n_apply_to_print:])*100,
...@@ -199,7 +199,7 @@ class ProfileMode(Mode): ...@@ -199,7 +199,7 @@ class ProfileMode(Mode):
if op_flops: if op_flops:
flops_msg=' <MFlops/s>' flops_msg=' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!' print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print '\nOp-wise summary: < of local_time spent on this kind of Op> <cumulative seconds> <self seconds>%s <nb_call> <Op name>'%(flops_msg) print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> %s <nb_call> <Op name>'%(flops_msg)
otimes = [(t/local_time, t, a, op_cimpl[a], op_call[a]) for a, t in op_time.items()] otimes = [(t/local_time, t, a, op_cimpl[a], op_call[a]) for a, t in op_time.items()]
otimes.sort() otimes.sort()
...@@ -212,9 +212,9 @@ class ProfileMode(Mode): ...@@ -212,9 +212,9 @@ class ProfileMode(Mode):
else: else:
msg = ' ' msg = ' '
if op_flops: if op_flops:
print ' %4.1f%% %.3fs %.3fs %s %7.1f %d %s' % (f*100, tot, t, msg, op_flops.get(a,-1), nb_call, a) print ' %4.1f%% %.3fs %.3fs %.2es %s %7.1f %d %s' % (f*100, tot, t, t/nb_call, msg, op_flops.get(a,-1), nb_call, a)
else: else:
print ' %4.1f%% %.3fs %.3fs %s %d %s' % (f*100, tot, t, msg, nb_call, a) print ' %4.1f%% %.3fs %.3fs %.2es %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
print ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\ print ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-n_ops_to_print), %(max(0, len(otimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call in otimes[n_ops_to_print:])*100, sum(f for f, t, a, ci, nb_call in otimes[n_ops_to_print:])*100,
...@@ -231,7 +231,7 @@ class ProfileMode(Mode): ...@@ -231,7 +231,7 @@ class ProfileMode(Mode):
sop_c.setdefault(type(a),True) sop_c.setdefault(type(a),True)
sop_c[type(a)]=sop_c[type(a)] and op_cimpl[a] sop_c[type(a)]=sop_c[type(a)] and op_cimpl[a]
sop_call[type(a)]=sop_call.get(type(a),0)+op_call[a] sop_call[type(a)]=sop_call.get(type(a),0)+op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <nb_call> <Op name>' print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> <nb_call> <Op name>'
sotimes = [(t/local_time, t, a, sop_c[a], sop_call[a]) for a, t in sop_time.items()] sotimes = [(t/local_time, t, a, sop_c[a], sop_call[a]) for a, t in sop_time.items()]
sotimes.sort() sotimes.sort()
sotimes.reverse() sotimes.reverse()
...@@ -242,7 +242,7 @@ class ProfileMode(Mode): ...@@ -242,7 +242,7 @@ class ProfileMode(Mode):
msg = '*' msg = '*'
else: else:
msg = ' ' msg = ' '
print ' %4.1f%% %.3fs %.3fs %s %d %s' % (f*100, tot, t, msg, nb_call, a) print ' %4.1f%% %.3fs %.3fs %.2es %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\ print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print), %(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call in sotimes[n_ops_to_print:])*100, sum(f for f, t, a, ci, nb_call in sotimes[n_ops_to_print:])*100,
......
...@@ -8,6 +8,10 @@ default_={ ...@@ -8,6 +8,10 @@ default_={
'ProfileMode.n_ops_to_print':20, 'ProfileMode.n_ops_to_print':20,
'tensor_opt.local_elemwise_fusion':False, 'tensor_opt.local_elemwise_fusion':False,
'lib.amdlibm':False, 'lib.amdlibm':False,
'op.set_flops':False,#currently used only in ConvOp. The profile mode will print the flops/s for the op.
'nvcc.fastmath':False,
'scalar.floatX':'float64',
'gpuelemwise.sync':True, #when true, wait that the gpu fct finished and check it error code.
} }
#default value taked from env variable #default value taked from env variable
...@@ -38,6 +42,8 @@ THEANO_DEBUGMODE_CHECK_PY = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_PY', 1))) ...@@ -38,6 +42,8 @@ THEANO_DEBUGMODE_CHECK_PY = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_PY', 1)))
THEANO_DEBUGMODE_CHECK_FINITE = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_FINITE', 1))) THEANO_DEBUGMODE_CHECK_FINITE = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_FINITE', 1)))
THEANO_DEBUGMODE_CHECK_STRIDES = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_STRIDES', 1))) THEANO_DEBUGMODE_CHECK_STRIDES = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_STRIDES', 1)))
THEANO_FLAGS=os.getenv("THEANO_FLAGS","")
class TheanoConfig(object): class TheanoConfig(object):
"""Return the value for a key after parsing ~/.theano.cfg and """Return the value for a key after parsing ~/.theano.cfg and
the THEANO_FLAGS environment variable. the THEANO_FLAGS environment variable.
...@@ -72,7 +78,7 @@ class TheanoConfig(object): ...@@ -72,7 +78,7 @@ class TheanoConfig(object):
#user config file override the default value #user config file override the default value
self.config.read(['theano.cfg', os.path.expanduser('~/.theano.cfg')]) self.config.read(['theano.cfg', os.path.expanduser('~/.theano.cfg')])
self.env_flags=os.getenv("THEANO_FLAGS","") self.env_flags=THEANO_FLAGS
#The value in the env variable THEANO_FLAGS override the previous value #The value in the env variable THEANO_FLAGS override the previous value
for flag in self.env_flags.split(','): for flag in self.env_flags.split(','):
if not flag: if not flag:
...@@ -88,16 +94,17 @@ class TheanoConfig(object): ...@@ -88,16 +94,17 @@ class TheanoConfig(object):
self.config.set(sp[0],sp[1],val) self.config.set(sp[0],sp[1],val)
else: else:
found=0 found=0
sp=sp[0].lower()#the ConfigParser seam to use only lower letter.
for sec in self.config.sections(): for sec in self.config.sections():
for opt in self.config.options(sec): for opt in self.config.options(sec):
if opt == sp[0]: if opt == sp:
found+=1 found+=1
section=sec section=sec
option=opt option=opt
if found==1: if found==1:
self.config.set(section,option,val) self.config.set(section,option,val)
elif found>1: elif found>1:
raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp[0])) raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp))
def __getitem__(self, key): def __getitem__(self, key):
""":returns: a str with the value associated to the key""" """:returns: a str with the value associated to the key"""
...@@ -142,3 +149,5 @@ class TheanoConfig(object): ...@@ -142,3 +149,5 @@ class TheanoConfig(object):
config = TheanoConfig() config = TheanoConfig()
if config.get('scalar.floatX') not in ['float32', 'float64']:
raise Exception("the configuration scalar.floatX must have value float32 or float64")
import numpy as N import numpy as N
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano import gof, Op, tensor from theano import gof, Op, tensor, config
from theano.printing import Print from theano.printing import Print
def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'): def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
...@@ -131,6 +131,8 @@ class ConvOp(Op): ...@@ -131,6 +131,8 @@ class ConvOp(Op):
"'valid' mode)")%(self.imshp_logical,self.kshp_logical)) "'valid' mode)")%(self.imshp_logical,self.kshp_logical))
self._rehash() self._rehash()
if config.config.getboolean('op.set_flops'):
self.set_flops()
def __eq__(self, other): def __eq__(self, other):
if type(self) != type(other): if type(self) != type(other):
...@@ -177,11 +179,12 @@ class ConvOp(Op): ...@@ -177,11 +179,12 @@ class ConvOp(Op):
col=-img_col col=-img_col
img_col+=col img_col+=col
while col < max_col: #loop over kern col while col < max_col: #loop over kern col
self.flops+=1 self.flops+=2
col+=1 col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0] self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
assert self.flops==self.bsize * self.nkern * self.imshp[0] * self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2
def make_node(self, inputs, kerns): def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09) # TODO: find a way to make ConvOp work for N-D (after NIPS09)
......
...@@ -2,7 +2,7 @@ import StringIO, sys ...@@ -2,7 +2,7 @@ import StringIO, sys
import numpy import numpy
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar from theano import tensor, scalar, config
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.type_support import filter as type_support_filter from theano.sandbox.cuda.type_support import filter as type_support_filter
...@@ -67,7 +67,7 @@ class GpuElemwise(Op): ...@@ -67,7 +67,7 @@ class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern): def __init__(self, scalar_op, inplace_pattern, sync=None):
## ##
# TODO: implement inplace operations. # TODO: implement inplace operations.
# It's ok that we set the DestroyMap to something but then don't actually destroy # It's ok that we set the DestroyMap to something but then don't actually destroy
...@@ -77,6 +77,7 @@ class GpuElemwise(Op): ...@@ -77,6 +77,7 @@ class GpuElemwise(Op):
# the amount of loading and storing to global memory that we would have to do. # the amount of loading and storing to global memory that we would have to do.
# That's why it isn't implemented yet. # That's why it isn't implemented yet.
# #
sync = config.config.getboolean('gpuelemwise.sync',sync)
self.scalar_op = scalar_op self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items()) self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
...@@ -86,7 +87,8 @@ class GpuElemwise(Op): ...@@ -86,7 +87,8 @@ class GpuElemwise(Op):
self.ufunc = None self.ufunc = None
self._rehash() self._rehash()
self.src_generator = NaiveAlgo(self.scalar_op) self.src_generator = NaiveAlgo(self.scalar_op, sync=sync)
self.sync = sync
def __getstate__(self): def __getstate__(self):
d = copy.copy(self.__dict__) d = copy.copy(self.__dict__)
......
...@@ -51,7 +51,6 @@ def local_gpu_elemwise_0(node): ...@@ -51,7 +51,6 @@ def local_gpu_elemwise_0(node):
if numpy.any([hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]): if numpy.any([hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.any([o.type.dtype == 'float64' for o in node.outputs]): if numpy.any([o.type.dtype == 'float64' for o in node.outputs]):
print 'WARNING: THERE ARE STILL float64s in your graph local_gpu_elemwise_0', node print 'WARNING: THERE ARE STILL float64s in your graph local_gpu_elemwise_0', node
import pdb; pdb.set_trace()
else: else:
# move the add to a GpuAdd # move the add to a GpuAdd
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern) new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
......
...@@ -172,7 +172,7 @@ def speed_elemwise_collapse2(): ...@@ -172,7 +172,7 @@ def speed_elemwise_collapse2():
t2=time.time() t2=time.time()
def test_elemwise_collapse(): def test_elemwise_collapse():
""" used to test if the case where all inputs are broadcast """ """ Test when all inputs have one(and the same) broadcastable dimension """
shape = (4,5,60) shape = (4,5,60)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
...@@ -186,14 +186,16 @@ def test_elemwise_collapse(): ...@@ -186,14 +186,16 @@ def test_elemwise_collapse():
v = numpy.asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32') v = numpy.asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()): if False:
print id, n for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors #let debugmode catch errors
f(v) out=f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
print "Expected collapse of all dimensions" print "Expected collapse of all dimensions"
def test_elemwise_collapse2(): def test_elemwise_collapse2():
""" used to test if the case where one inputs have a broadcast """ """ Test when only one inputs have one broadcastable dimension """
shape = (4,5,60) shape = (4,5,60)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
...@@ -207,14 +209,16 @@ def test_elemwise_collapse2(): ...@@ -207,14 +209,16 @@ def test_elemwise_collapse2():
v = numpy.asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32') v = numpy.asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()): if False:
print id, n for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors #let debugmode catch errors
f(v) out=f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
print "Expected collapse to 3 dimensions" print "Expected collapse to 3 dimensions"
def test_elemwise_collapse3(): def test_elemwise_collapse3():
""" used to test if the case where one inputs have 2 broadcast dimensions at each ends.""" """ Test when only one inputs have two broadcastable dimension at each ends """
shape = (4,5) shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
...@@ -228,14 +232,16 @@ def test_elemwise_collapse3(): ...@@ -228,14 +232,16 @@ def test_elemwise_collapse3():
v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32') v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()): if False:
print id, n for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors #let debugmode catch errors
f(v) out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v)
print "Expected collapse to 3 dimensions" print "Expected collapse to 3 dimensions"
def test_elemwise_collapse4(): def test_elemwise_collapse4():
""" used to test if the case where one inputs have 2 broadcast dimensions at each ends and a scalar""" """ Test when only one inputs have two broadcastable dimension at each ends and we add a scalar"""
shape = (4,5) shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
...@@ -249,8 +255,74 @@ def test_elemwise_collapse4(): ...@@ -249,8 +255,74 @@ def test_elemwise_collapse4():
v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32') v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()): if False:
print id, n for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors #let debugmode catch errors
f(v) out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v+2)
print "Expected collapse to 3 dimensions" print "Expected collapse to 3 dimensions"
def test_elemwise_collapse5():
""" Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1)
b = tcn.CudaNdarrayType((False, False, False, False))()
c = (a3+b+2)
f = pfunc([b], [c])
v = numpy.asarray(numpy.random.rand(5,4,shape[0],shape[1]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v+2)
print "Expected collapse to 2 dimensions"
def test_elemwise_collapse6():
""" Test when all inputs have two broadcastable dimension at the beginning"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1)
b = tcn.CudaNdarrayType((True, True, False, False))()
f = pfunc([b], [a3+b])
v = numpy.asarray(numpy.random.rand(1,1,shape[0],shape[1]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v)
print "Expected collapse to c contiguous"
def test_elemwise_collapse7(atol=1e-6):
""" Test when one input have one broadcastable dimension and the other is a scalar"""
shape = (5,4,1)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a.copy(), 'a')
a3 = a2.dimshuffle(0, 'x', 1, 2)
f = pfunc([], [a3+2])
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f()[0]
ans=(a+2).reshape(shape[0],1,shape[1],shape[2])
assert numpy.allclose(out,ans, atol=atol)
print "Expected collapse to c contiguous"
...@@ -228,6 +228,11 @@ class Kouh2008(object): ...@@ -228,6 +228,11 @@ class Kouh2008(object):
class Config(object): class Config(object):
use_gpu = True use_gpu = True
dtype='float32' dtype='float32'
dtype2=dtype
if dtype2=='floatX':
import theano.config as c
dtype2 = c.config.get('scalar.floatX')
rng_seed = 23498 rng_seed = 23498
n_hid = 300 n_hid = 300
...@@ -296,7 +301,7 @@ def test_bench_elemwise(n_iter=1000, **kwargs): ...@@ -296,7 +301,7 @@ def test_bench_elemwise(n_iter=1000, **kwargs):
xval = numpy.asarray( xval = numpy.asarray(
rng.uniform(size=(conf.ft_batchsize, x.type.shape[1])), rng.uniform(size=(conf.ft_batchsize, x.type.shape[1])),
dtype=conf.dtype, dtype=conf.dtype2,
) )
yval = numpy.arange(conf.ft_batchsize) yval = numpy.arange(conf.ft_batchsize)
for i in xrange(n_iter): for i in xrange(n_iter):
......
...@@ -261,11 +261,17 @@ class CudaNdarrayType(Type): ...@@ -261,11 +261,17 @@ class CudaNdarrayType(Type):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
#no need to put nvcc.fastmath in the tuple as the c_compile_args is put in the key.
return (2,) # with assertion about refcounts return (2,) # with assertion about refcounts
def c_compiler(self): def c_compiler(self):
return nvcc_module_compile_str return nvcc_module_compile_str
def c_compile_args(self):
ret = []
if config.config.getboolean('nvcc.fastmath'):
ret.append('-use_fast_math')
return ret
# THIS WORKS # THIS WORKS
# But CudaNdarray instances don't compare equal to one another, and what about __hash__ ? # But CudaNdarray instances don't compare equal to one another, and what about __hash__ ?
......
...@@ -56,6 +56,8 @@ def constant(x): ...@@ -56,6 +56,8 @@ def constant(x):
class Scalar(Type): class Scalar(Type):
def __init__(self, dtype): def __init__(self, dtype):
if dtype=='floatX':
dtype=config.config.get('floatX')
self.dtype = dtype self.dtype = dtype
self.dtype_specs() # error checking self.dtype_specs() # error checking
...@@ -238,8 +240,8 @@ class Scalar(Type): ...@@ -238,8 +240,8 @@ class Scalar(Type):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
# no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (4,) #explicit T given in specialization of operator= lines. This makes it compile with open64 return (4,) #explicit T given in specialization of operator= lines. This makes it compile with open64
#2,
int8 = Scalar('int8') int8 = Scalar('int8')
...@@ -252,6 +254,7 @@ uint32 = Scalar('uint32') ...@@ -252,6 +254,7 @@ uint32 = Scalar('uint32')
uint64 = Scalar('uint64') uint64 = Scalar('uint64')
float32 = Scalar('float32') float32 = Scalar('float32')
float64 = Scalar('float64') float64 = Scalar('float64')
floatX = Scalar(config.config.get('scalar.floatX'))
complex64 = Scalar('complex64') complex64 = Scalar('complex64')
complex128 = Scalar('complex128') complex128 = Scalar('complex128')
...@@ -934,6 +937,7 @@ convert_to_uint32 = Cast(uint32, name='convert_to_uint32') ...@@ -934,6 +937,7 @@ convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
convert_to_uint64 = Cast(uint64, name='convert_to_uint64') convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
convert_to_float32 = Cast(float32, name='convert_to_float32') convert_to_float32 = Cast(float32, name='convert_to_float32')
convert_to_float64 = Cast(float64, name='convert_to_float64') convert_to_float64 = Cast(float64, name='convert_to_float64')
convert_to_floatX = Cast(floatX, name='convert_to_floatX')
convert_to_complex64 = Cast(complex64, name='convert_to_complex64') convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
convert_to_complex128 = Cast(complex128, name='convert_to_complex128') convert_to_complex128 = Cast(complex128, name='convert_to_complex128')
...@@ -948,10 +952,13 @@ _cast_mapping = { ...@@ -948,10 +952,13 @@ _cast_mapping = {
'uint64': convert_to_uint64, 'uint64': convert_to_uint64,
'float32': convert_to_float32, 'float32': convert_to_float32,
'float64': convert_to_float64, 'float64': convert_to_float64,
'floatX': convert_to_floatX,
'complex64': convert_to_complex64, 'complex64': convert_to_complex64,
'complex128': convert_to_complex128} 'complex128': convert_to_complex128}
def cast(x, dtype): def cast(x, dtype):
"""Symbolically cast `x` to a Scalar of given `dtype`.""" """Symbolically cast `x` to a Scalar of given `dtype`."""
if dtype=='floatX': dtype = config.config.get('scalar.floatX')
_x = as_scalar(x) _x = as_scalar(x)
if _x.type.dtype == dtype: if _x.type.dtype == dtype:
return _x return _x
......
...@@ -217,10 +217,17 @@ def _wrap_tensor_into_member(x): ...@@ -217,10 +217,17 @@ def _wrap_tensor_into_member(x):
return compile.module.Member(constant(x)) return compile.module.Member(constant(x))
compile.module.register_wrapper(_obj_is_wrappable_as_tensor, _wrap_tensor_into_member) compile.module.register_wrapper(_obj_is_wrappable_as_tensor, _wrap_tensor_into_member)
if int(config.THEANO_CMP_SLOPPY): if int(config.THEANO_CMP_SLOPPY)>1:
# This environment variable is a quick-and-dirty way to get low-precision comparisons. # This environment variable is a quick-and-dirty way to get low-precision comparisons.
# For a more precise setting of these tolerances set them explicitly in your user code by # For a more precise setting of these tolerances set them explicitly in your user code by
# assigning, for example, "theano.tensor.basic.float32_atol = ..." # assigning, for example, "theano.tensor.basic.float32_atol = ..."
#when THEANO_CMP_SLOPPY>1 we are even more sloppy. This is usefull to test the gpu as they don't use extended precision and this cause some difference bigger then the normal sloppy.
float32_atol = 5e-4
float32_rtol = 1e-3
float64_rtol = 1e-4
float64_atol = 1e-3
elif int(config.THEANO_CMP_SLOPPY):
float32_atol = 1e-4 float32_atol = 1e-4
float32_rtol = 1e-3 float32_rtol = 1e-3
float64_rtol = 1e-4 float64_rtol = 1e-4
...@@ -275,6 +282,8 @@ class TensorType(Type): ...@@ -275,6 +282,8 @@ class TensorType(Type):
Optional name for this type. Optional name for this type.
""" """
self.dtype = str(dtype) self.dtype = str(dtype)
if self.dtype=='floatX':
self.dtype=config.config.get('scalar.floatX')
self.broadcastable = tuple(broadcastable) self.broadcastable = tuple(broadcastable)
self.dtype_specs() # error checking is done there self.dtype_specs() # error checking is done there
self.name = name self.name = name
...@@ -601,6 +610,7 @@ cscalar = TensorType('complex64', ()) ...@@ -601,6 +610,7 @@ cscalar = TensorType('complex64', ())
zscalar = TensorType('complex128', ()) zscalar = TensorType('complex128', ())
fscalar = TensorType('float32', ()) fscalar = TensorType('float32', ())
dscalar = TensorType('float64', ()) dscalar = TensorType('float64', ())
xscalar = TensorType('floatX',())
bscalar = TensorType('int8', ()) bscalar = TensorType('int8', ())
wscalar = TensorType('int16', ()) wscalar = TensorType('int16', ())
iscalar = TensorType('int32', ()) iscalar = TensorType('int32', ())
...@@ -621,6 +631,7 @@ cvector = TensorType('complex64', (False, )) ...@@ -621,6 +631,7 @@ cvector = TensorType('complex64', (False, ))
zvector = TensorType('complex128', (False, )) zvector = TensorType('complex128', (False, ))
fvector = TensorType('float32', (False, )) fvector = TensorType('float32', (False, ))
dvector = TensorType('float64', (False, )) dvector = TensorType('float64', (False, ))
xvector = TensorType('floatX', (False, ))
bvector = TensorType('int8', (False,)) bvector = TensorType('int8', (False,))
wvector = TensorType('int16', (False,)) wvector = TensorType('int16', (False,))
ivector = TensorType('int32', (False, )) ivector = TensorType('int32', (False, ))
...@@ -638,6 +649,7 @@ cmatrix = TensorType('complex64', (False, False)) ...@@ -638,6 +649,7 @@ cmatrix = TensorType('complex64', (False, False))
zmatrix = TensorType('complex128', (False, False)) zmatrix = TensorType('complex128', (False, False))
fmatrix = TensorType('float32', (False, False)) fmatrix = TensorType('float32', (False, False))
dmatrix = TensorType('float64', (False, False)) dmatrix = TensorType('float64', (False, False))
xmatrix = TensorType('floatX', (False, False))
bmatrix = TensorType('int8', (False, False)) bmatrix = TensorType('int8', (False, False))
wmatrix = TensorType('int16', (False, False)) wmatrix = TensorType('int16', (False, False))
imatrix = TensorType('int32', (False, False)) imatrix = TensorType('int32', (False, False))
...@@ -655,6 +667,7 @@ crow = TensorType('complex64', (True, False)) ...@@ -655,6 +667,7 @@ crow = TensorType('complex64', (True, False))
zrow = TensorType('complex128', (True, False)) zrow = TensorType('complex128', (True, False))
frow = TensorType('float32', (True, False)) frow = TensorType('float32', (True, False))
drow = TensorType('float64', (True, False)) drow = TensorType('float64', (True, False))
xrow = TensorType('floatX', (True, False))
brow = TensorType('int8', (True, False)) brow = TensorType('int8', (True, False))
wrow = TensorType('int16', (True, False)) wrow = TensorType('int16', (True, False))
irow = TensorType('int32', (True, False)) irow = TensorType('int32', (True, False))
...@@ -668,6 +681,7 @@ ccol = TensorType('complex64', (False, True)) ...@@ -668,6 +681,7 @@ ccol = TensorType('complex64', (False, True))
zcol = TensorType('complex128', (False, True)) zcol = TensorType('complex128', (False, True))
fcol = TensorType('float32', (False, True)) fcol = TensorType('float32', (False, True))
dcol = TensorType('float64', (False, True)) dcol = TensorType('float64', (False, True))
xcol = TensorType('floatX', (False, True))
bcol = TensorType('int8', (False, True)) bcol = TensorType('int8', (False, True))
wcol = TensorType('int16', (False, True)) wcol = TensorType('int16', (False, True))
icol = TensorType('int32', (False, True)) icol = TensorType('int32', (False, True))
...@@ -681,6 +695,7 @@ ctensor3 = TensorType('complex64', (False,)*3) ...@@ -681,6 +695,7 @@ ctensor3 = TensorType('complex64', (False,)*3)
ztensor3 = TensorType('complex128', (False,)*3) ztensor3 = TensorType('complex128', (False,)*3)
ftensor3 = TensorType('float32', (False,)*3) ftensor3 = TensorType('float32', (False,)*3)
dtensor3 = TensorType('float64', (False,)*3) dtensor3 = TensorType('float64', (False,)*3)
xtensor3 = TensorType('floatX', (False,)*3)
btensor3 = TensorType('int8', (False,)*3) btensor3 = TensorType('int8', (False,)*3)
wtensor3 = TensorType('int16', (False,)*3) wtensor3 = TensorType('int16', (False,)*3)
itensor3 = TensorType('int32', (False,)*3) itensor3 = TensorType('int32', (False,)*3)
...@@ -690,6 +705,7 @@ ctensor4 = TensorType('complex64', (False,)*4) ...@@ -690,6 +705,7 @@ ctensor4 = TensorType('complex64', (False,)*4)
ztensor4 = TensorType('complex128', (False,)*4) ztensor4 = TensorType('complex128', (False,)*4)
ftensor4 = TensorType('float32', (False,)*4) ftensor4 = TensorType('float32', (False,)*4)
dtensor4 = TensorType('float64', (False,)*4) dtensor4 = TensorType('float64', (False,)*4)
xtensor4 = TensorType('floatX', (False,)*4)
btensor4 = TensorType('int8', (False,)*4) btensor4 = TensorType('int8', (False,)*4)
wtensor4 = TensorType('int16', (False,)*4) wtensor4 = TensorType('int16', (False,)*4)
itensor4 = TensorType('int32', (False,)*4) itensor4 = TensorType('int32', (False,)*4)
...@@ -1086,6 +1102,9 @@ _convert_to_float32 = _conversion(elemwise.Elemwise(scal.convert_to_float32), 'f ...@@ -1086,6 +1102,9 @@ _convert_to_float32 = _conversion(elemwise.Elemwise(scal.convert_to_float32), 'f
_convert_to_float64 = _conversion(elemwise.Elemwise(scal.convert_to_float64), 'float64') _convert_to_float64 = _conversion(elemwise.Elemwise(scal.convert_to_float64), 'float64')
"""Cast to double-precision floating point""" """Cast to double-precision floating point"""
_convert_to_floatX = _conversion(elemwise.Elemwise(scal.convert_to_floatX), 'floatX')
"""Cast to floatX floating point"""
_convert_to_complex64 = _conversion(elemwise.Elemwise(scal.convert_to_complex64), 'complex64') _convert_to_complex64 = _conversion(elemwise.Elemwise(scal.convert_to_complex64), 'complex64')
"""Cast to single-precision complex""" """Cast to single-precision complex"""
...@@ -1103,11 +1122,14 @@ _cast_mapping = { ...@@ -1103,11 +1122,14 @@ _cast_mapping = {
'uint64': _convert_to_uint64, 'uint64': _convert_to_uint64,
'float32': _convert_to_float32, 'float32': _convert_to_float32,
'float64': _convert_to_float64, 'float64': _convert_to_float64,
'floatX': _convert_to_floatX,
'complex64': _convert_to_complex64, 'complex64': _convert_to_complex64,
'complex128': _convert_to_complex128} 'complex128': _convert_to_complex128}
@constructor @constructor
def cast(x, dtype): def cast(x, dtype):
"""Symbolically cast `x` to a Tensor of type `dtype`.""" """Symbolically cast `x` to a Tensor of type `dtype`."""
if dtype=='floatX': dtype = config.config.get('scalar.floatX')
_x = as_tensor_variable(x) _x = as_tensor_variable(x)
if _x.type.dtype == dtype: if _x.type.dtype == dtype:
return _x return _x
...@@ -2462,7 +2484,7 @@ def get_vector_length(v): ...@@ -2462,7 +2484,7 @@ def get_vector_length(v):
return join.vec_length(v) return join.vec_length(v)
except ValueError: except ValueError:
pass pass
if v.owner and v.owner.op == _shape: if v.owner and isinstance(v.owner.op, Shape):
return v.owner.inputs[0].type.ndim return v.owner.inputs[0].type.ndim
raise ValueError("length not known") raise ValueError("length not known")
......
...@@ -1929,6 +1929,63 @@ def test_default_state(): ...@@ -1929,6 +1929,63 @@ def test_default_state():
assert f(1) == 4.8 assert f(1) == 4.8
assert f(2.2) == 7 assert f(2.2) == 7
def test_cast_floatX():
    """Check that cast(x, 'floatX') resolves to the configured scalar.floatX.

    Contract being tested:
      * casting a variable to the dtype it already has must compile to an
        empty graph (no Elemwise cast op inserted);
      * casting between floatX and a concrete float dtype must insert
        exactly one op, unless the configured floatX equals that dtype.
    """
    floatx = config.config.get('scalar.floatX')

    def _nb_ops(make_vector, target_dtype):
        # Build a graph casting a fresh vector to `target_dtype` and return
        # the number of nodes left after compilation (0 means the cast was
        # recognized as a no-op and optimized away).
        x = make_vector('x')
        f = function([x], [cast(x, target_dtype)])
        # print f.maker.env.toposort()
        return len(f.maker.env.toposort())

    # Casting a variable to its own dtype should not generate an op.
    assert _nb_ops(dvector, 'float64') == 0
    assert _nb_ops(fvector, 'float32') == 0
    assert _nb_ops(xvector, 'floatX') == 0

    # floatX vector cast to a concrete dtype: a real cast op appears only
    # when the resolved floatX differs from that dtype.
    assert _nb_ops(xvector, 'float64') == (0 if floatx == 'float64' else 1)
    assert _nb_ops(xvector, 'float32') == (0 if floatx == 'float32' else 1)

    # Concrete-dtype vector cast to 'floatX': symmetric expectation.
    assert _nb_ops(dvector, 'floatX') == (0 if floatx == 'float64' else 1)
    assert _nb_ops(fvector, 'floatX') == (0 if floatx == 'float32' else 1)
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) >= 2 and sys.argv[1] == 'OPT': if len(sys.argv) >= 2 and sys.argv[1] == 'OPT':
default_mode = compile.Mode(linker = 'c&py', default_mode = compile.Mode(linker = 'c&py',
......
...@@ -1059,34 +1059,42 @@ class test_fusion(unittest.TestCase): ...@@ -1059,34 +1059,42 @@ class test_fusion(unittest.TestCase):
print "time", self.do(mode, shared, shp=(1000,1000),gpu=False, assert_len_topo=False,slice=s, nb_repeat=100) print "time", self.do(mode, shared, shp=(1000,1000),gpu=False, assert_len_topo=False,slice=s, nb_repeat=100)
def tes_memory_leak(self, mode=compile.mode.predefined_modes['FAST_RUN'], shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None): def tes_memory_leak(self, mode=compile.mode.Mode('c', 'merge'), shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None):
""" """
param shared_fn: if None, will use compile.function param shared_fn: if None, will use compile.function
verify that the elemwise fusion work verify that the elemwise fusion work
Test with and without DimShuffle Test with and without DimShuffle
""" """
#TODO: disable the canonizer? #TODO: disable the canonizer?
fx, fy = fmatrices('xy') fx = fmatrices('x')
fxv = numpy.zeros(shp, dtype='float32')+ 2 fxv = numpy.zeros(shp, dtype='float32')+ 2
fyv = numpy.zeros(shp, dtype='float32')+ 3
cases = [ cases = [
(fx+fy,(fx,fy),(fxv,fyv),1,fxv+fyv,'float32'),#1 (fx,(fx),(fxv),'float32'),#1
] ]
import gc, pdb, objgraph, weakref import gc, pdb, objgraph, weakref
d={} d={}
dl=[] dl=[]
v1=None v1=None
for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases): mode=compile.mode.Mode('c', 'merge')
for id, [g, sym_inputs, val_inputs, out_dtype] in enumerate(cases):
for zzzz in range(nb_repeat): for zzzz in range(nb_repeat):
v=numpy.zeros(shp, dtype=out_dtype) v=numpy.zeros(shp, dtype=out_dtype)
gc.collect();gc.collect();gc.collect() gc.collect();gc.collect();gc.collect()
print 'v1',v1 # print 'v1',v1
v1=weakref.ref(v) # v1=weakref.ref(v)
out=shared_fn(v,'out') out=shared_fn(v,'out')
f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode) pdb.set_trace()
# f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
# f = pfunc([fx],[],updates=[(out,out+fx)],mode=mode)
# f = pfunc([fx],out+fx,mode=mode)
# f = compile.function([fx,out],[out+fx],mode=mode)#no memory leak.
f = compile.function([fx,compile.In(variable=out, value=out.container, mutable=None)],
[out+fx],mode=mode)#if mutable is True or False, their is a memory leak
del v del v
gc.collect();gc.collect();gc.collect()
pdb.set_trace()
if True: if False:
gc.collect();gc.collect();gc.collect() gc.collect();gc.collect();gc.collect()
nd=objgraph.typestats() nd=objgraph.typestats()
print 'key, old val, new val, diff' print 'key, old val, new val, diff'
...@@ -1097,7 +1105,7 @@ class test_fusion(unittest.TestCase): ...@@ -1097,7 +1105,7 @@ class test_fusion(unittest.TestCase):
d=nd d=nd
# pdb.set_trace() # pdb.set_trace()
if True: if False:
gc.collect();gc.collect();gc.collect() gc.collect();gc.collect();gc.collect()
ndl=objgraph.by_type('list') ndl=objgraph.by_type('list')
ll=[] ll=[]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论