Commit 168c502e authored by Pascal Lamblin

merge

......@@ -182,14 +182,14 @@ class ProfileMode(Mode):
print 'local_time %fs (Time spent running thunks)'% local_time
if print_apply:
print 'Apply-wise summary: <% of local_time spent at this position> <total of local_time spent at this position> <nb_call> <Apply position> <Apply Op name>'
print 'Apply-wise summary: <% of local_time spent at this position> <cumulative seconds> <apply time> <time per call> <nb_call> <Apply position> <Apply Op name>'
atimes = [(t/local_time, t, (a[0], str(a[1])), apply_call[a]) for a, t in apply_time.items()]
atimes.sort()
atimes.reverse()
tot=0
for f,t,a,nb_call in atimes[:n_apply_to_print]:
tot+=t
print ' %4.1f%% %.3fs %.3fs %i %i %s' % (f*100, tot, t, nb_call, a[0], a[1])
print ' %4.1f%% %.3fs %.3fs %.2es %i %i %s' % (f*100, tot, t, t/nb_call,nb_call, a[0], a[1])
print ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-n_apply_to_print),
sum(f for f, t, a, nb_call in atimes[n_apply_to_print:])*100,
......@@ -199,7 +199,7 @@ class ProfileMode(Mode):
if op_flops:
flops_msg=' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print '\nOp-wise summary: < of local_time spent on this kind of Op> <cumulative seconds> <self seconds>%s <nb_call> <Op name>'%(flops_msg)
print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> %s <nb_call> <Op name>'%(flops_msg)
otimes = [(t/local_time, t, a, op_cimpl[a], op_call[a]) for a, t in op_time.items()]
otimes.sort()
......@@ -212,9 +212,9 @@ class ProfileMode(Mode):
else:
msg = ' '
if op_flops:
print ' %4.1f%% %.3fs %.3fs %s %7.1f %d %s' % (f*100, tot, t, msg, op_flops.get(a,-1), nb_call, a)
print ' %4.1f%% %.3fs %.3fs %.2es %s %7.1f %d %s' % (f*100, tot, t, t/nb_call, msg, op_flops.get(a,-1), nb_call, a)
else:
print ' %4.1f%% %.3fs %.3fs %s %d %s' % (f*100, tot, t, msg, nb_call, a)
print ' %4.1f%% %.3fs %.3fs %.2es %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
print ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call in otimes[n_ops_to_print:])*100,
......@@ -231,7 +231,7 @@ class ProfileMode(Mode):
sop_c.setdefault(type(a),True)
sop_c[type(a)]=sop_c[type(a)] and op_cimpl[a]
sop_call[type(a)]=sop_call.get(type(a),0)+op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <nb_call> <Op name>'
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> <nb_call> <Op name>'
sotimes = [(t/local_time, t, a, sop_c[a], sop_call[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
......@@ -242,7 +242,7 @@ class ProfileMode(Mode):
msg = '*'
else:
msg = ' '
print ' %4.1f%% %.3fs %.3fs %s %d %s' % (f*100, tot, t, msg, nb_call, a)
print ' %4.1f%% %.3fs %.3fs %.2es %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call in sotimes[n_ops_to_print:])*100,
......
......@@ -8,6 +8,10 @@ default_={
'ProfileMode.n_ops_to_print':20,
'tensor_opt.local_elemwise_fusion':False,
'lib.amdlibm':False,
'op.set_flops':False,#currently used only in ConvOp. The profile mode will print the flops/s for the op.
'nvcc.fastmath':False,
'scalar.floatX':'float64',
'gpuelemwise.sync':True, #when true, wait that the gpu fct finished and check it error code.
}
#default values taken from environment variables
......@@ -38,6 +42,8 @@ THEANO_DEBUGMODE_CHECK_PY = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_PY', 1)))
THEANO_DEBUGMODE_CHECK_FINITE = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_FINITE', 1)))
THEANO_DEBUGMODE_CHECK_STRIDES = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_STRIDES', 1)))
THEANO_FLAGS=os.getenv("THEANO_FLAGS","")
class TheanoConfig(object):
"""Return the value for a key after parsing ~/.theano.cfg and
the THEANO_FLAGS environment variable.
......@@ -72,7 +78,7 @@ class TheanoConfig(object):
#user config file override the default value
self.config.read(['theano.cfg', os.path.expanduser('~/.theano.cfg')])
self.env_flags=os.getenv("THEANO_FLAGS","")
self.env_flags=THEANO_FLAGS
#The value in the env variable THEANO_FLAGS override the previous value
for flag in self.env_flags.split(','):
if not flag:
......@@ -88,16 +94,17 @@ class TheanoConfig(object):
self.config.set(sp[0],sp[1],val)
else:
found=0
sp=sp[0].lower()#the ConfigParser seam to use only lower letter.
for sec in self.config.sections():
for opt in self.config.options(sec):
if opt == sp[0]:
if opt == sp:
found+=1
section=sec
option=opt
if found==1:
self.config.set(section,option,val)
elif found>1:
raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp[0]))
raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp))
def __getitem__(self, key):
""":returns: a str with the value associated to the key"""
......@@ -142,3 +149,5 @@ class TheanoConfig(object):
config = TheanoConfig()
if config.get('scalar.floatX') not in ['float32', 'float64']:
raise Exception("the configuration scalar.floatX must have value float32 or float64")
import numpy as N
import theano
import theano.tensor as T
from theano import gof, Op, tensor
from theano import gof, Op, tensor, config
from theano.printing import Print
def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
......@@ -131,6 +131,8 @@ class ConvOp(Op):
"'valid' mode)")%(self.imshp_logical,self.kshp_logical))
self._rehash()
if config.config.getboolean('op.set_flops'):
self.set_flops()
def __eq__(self, other):
if type(self) != type(other):
......@@ -177,11 +179,12 @@ class ConvOp(Op):
col=-img_col
img_col+=col
while col < max_col: #loop over kern col
self.flops+=1
self.flops+=2
col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
assert self.flops==self.bsize * self.nkern * self.imshp[0] * self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2
def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09)
......
......@@ -2,7 +2,7 @@ import StringIO, sys
import numpy
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar
from theano import tensor, scalar, config
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.type_support import filter as type_support_filter
......@@ -67,7 +67,7 @@ class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern):
def __init__(self, scalar_op, inplace_pattern, sync=None):
##
# TODO: implement inplace operations.
# It's ok that we set the DestroyMap to something but then don't actually destroy
......@@ -77,6 +77,7 @@ class GpuElemwise(Op):
# the amount of loading and storing to global memory that we would have to do.
# That's why it isn't implemented yet.
#
sync = config.config.getboolean('gpuelemwise.sync',sync)
self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
......@@ -86,7 +87,8 @@ class GpuElemwise(Op):
self.ufunc = None
self._rehash()
self.src_generator = NaiveAlgo(self.scalar_op)
self.src_generator = NaiveAlgo(self.scalar_op, sync=sync)
self.sync = sync
def __getstate__(self):
d = copy.copy(self.__dict__)
......
......@@ -210,12 +210,19 @@ class RecAlgo(object):
return self.c_src_kernel(node, nodename) + self.c_src_callkernel(node, nodename)
class NaiveAlgo(object):
verbose = 0 # 1 or 2 for more verbose output.
verbose = 0 # 1, 2 or 3 for more verbose output.
cache_version = ()
cache_version = ('debug', 6, verbose)
cache_version = ('debug', 7, verbose)
def __init__(self, scalar_op):
def __init__(self, scalar_op, sync=True):
"""
:param scalar_op: the scalar operation to execute on each element.
:param sync: if True, wait after the kernel launch and check for errors.
"""
self.scalar_op = scalar_op
self.sync = sync
if not self.sync:
self.cache_version+=('nosync',)
def c_src_kernel(self, node, nodename, nd):
sio = StringIO.StringIO()
......@@ -248,9 +255,6 @@ class NaiveAlgo(object):
if _logical_scalar(i):
print >> sio, " const float ii_i%i_value = i%i_data[0];" % (ipos, ipos)
#TODO: insert code to check for strides of 1, and use a different loop
#loop over the elements to be treated by this kernel call
print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"
# calculate the data pointers for all arguments
......@@ -286,9 +290,6 @@ class NaiveAlgo(object):
print >> sio, " ", task_code
print >> sio, " }"
#TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
......@@ -635,8 +636,9 @@ class NaiveAlgo(object):
# like
# float *, int, int, int ...
#
# The second is to recognize when trailing (right-most in numpy) dimensions can be collapsed as
# being contiguous... (confusing... read code)
# The second is to recognize when any dimensions can be collapsed as
# being contiguous. That means we can merge those dimensions with another
# one for all inputs/outputs and get the same results (confusing... read code)
#
# The third is to make a special case for scalar elements. We allow collapsing them.
# In the ccontiguous and not contiguous case, we use registers to lower the number of memory access.
......@@ -644,6 +646,8 @@ class NaiveAlgo(object):
#TODO: make a special case for broadcasting, to store the data in shared memory.
nd = node.outputs[0].type.ndim
nb_inputs = len(node.inputs)
nb_outputs = len(node.outputs)
id_self = id(self)
d = dict()
#input_params and output_params go into the function declaration/definition
......@@ -668,11 +672,7 @@ class NaiveAlgo(object):
{
//can we collapse dims[i] and dims[i-1]
for(int i=nd-1;i>0;i--){
if(false && dims[i]==1 && strides[i]==0){//
collapse[i]=1;
}else if(false && dims[i-1]==1 && strides[i-1]==0){
collapse[i]=1;
}else if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd
if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd
collapse[i]=1;
}else collapse[i]=0;
}
......@@ -704,9 +704,85 @@ class NaiveAlgo(object):
std::cerr << " %(ipos)s data strides" <<
""" %locals() + " << ' ' << ".join(["o%s_data"%ipos]
+ list("o%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; '''
# collapse dimensions that are broadcast in all inputs.
# This needs to be done before the contiguous collapse, as that would break it.
# Handle both the dimensions and the strides.
print >> sio, """
int local_dims[%(nd)s];
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s];
int nd_collapse = %(nd)s;
for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i];
}
"""%locals()
for ipos in xrange(len(node.inputs)):
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_str[%(ipos)s][i]=i%(ipos)s_str[i];
}
"""%locals()
for ipos in xrange(len(node.outputs)):
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
}
"""%locals()
if self.verbose>2:
print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
print >> sio, 'std::cerr << "local_dims";'
for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)):
print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for ipos in xrange(len(node.outputs)):
print >> sio, 'std::cerr << " local_ostr inputs %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
print >> sio, """
for(int id=0;id<nd_collapse;id++){
bool all_broadcast=true;
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
if(all_broadcast){
for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_str[input_id][j-1]=local_str[input_id][j];
}
}
for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_ostr[output_id][j-1]=local_ostr[output_id][j];
}
}
nd_collapse--; id--;
}
}
"""%locals()
if self.verbose>2:
print >>sio, 'std::cerr <<"after broadcast collapse\\n";'
print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
print >> sio, 'std::cerr << "local_dims";'
for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)):
print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for ipos in xrange(len(node.outputs)):
print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
# collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
# this is a good idea because [we assume that] the output has been allocated c_contiguous
# this is a good idea because we make less index calculation in the gpu.
print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in range(nd)]) +"};"
for ipos in xrange(len(node.inputs)):
......@@ -714,8 +790,8 @@ class NaiveAlgo(object):
print >> sio, """
int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in range(nd)]) +"};"
print >> sio, """
can_collapse_%(nodename)s(%(nd)s, dims, i%(ipos)s_str, nd_collapse_%(ipos)s);
for(int i=0;i<%(nd)s;i++){
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){
if(nd_collapse_%(ipos)s[i]==0)
nd_collapse_[i]=0;
}
......@@ -731,79 +807,69 @@ nd_collapse_[i]=0;
"""%locals()
print >>sio, ' << " " << '.join(["nd_collapse_["%locals()+str(i)+"]" for i in range(nd)])
print >>sio, '<< "\\n";'
print >> sio, """
int nd_collapse=%(nd)s;
for(int i=1;i<%(nd)s;i++){
if(nd_collapse_[i]==1)nd_collapse--;
}
if(nd_collapse==1 && """%locals()
print >> sio, " && ".join([ "i%(ipos)s_str[%(nd)s-1]==1 "%locals()for x in range(len(node.inputs))])
print >> sio,"""){nd_collapse=0;} """
if self.verbose:
print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ %locals()
# set the new dims.
print >> sio, "int local_dims[%(nd)s];"%locals()
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i];
}
for(int i=%(nd)s-1;i>0;i--){
if(nd_collapse_[i]==1){
local_dims[i-1]*=local_dims[i];//set new dims
for(int j=i+1;j<%(nd)s;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
}
}
"""%locals()
if self.verbose>1:
for d in xrange(nd):
print >> sio, 'std::cerr << "local_dims %(d)s " << local_dims[%(d)s] << "\\n"; '%locals()
# set the new stride.
# update the local stride.
for ipos in xrange(len(node.inputs)):
print >> sio, """
int local_i%(ipos)s_str[%(nd)s];
"""%locals()
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_i%(ipos)s_str[i]=i%(ipos)s_str[i];
}
for(int i=%(nd)s-1;i>0;i--){
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_i%(ipos)s_str[i-1]=local_i%(ipos)s_str[i];//set new strides
for(int j=i+1;j<%(nd)s;j++)//remove stride i from the array
local_i%(ipos)s_str[j-1]=local_i%(ipos)s_str[j];
local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
}
}
"""%locals()
for ipos in xrange(len(node.outputs)):
print >> sio, "int local_o%(ipos)s_str[%(nd)s];"%locals()
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_o%(ipos)s_str[i]=o%(ipos)s_str[i];
}
for(int i=%(nd)s-1;i>0;i--){
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_o%(ipos)s_str[i-1]=local_o%(ipos)s_str[i];//set new strides
for(int j=i+1;j<%(nd)s;j++)//remove stride i from the array
local_o%(ipos)s_str[j-1]=local_o%(ipos)s_str[j];
local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
}
}
"""%locals()
# update the local dims.
print >> sio, """
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_dims[i-1]*=local_dims[i];//set new dims
for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
}
}
"""%locals()
#update the new number of dim
print >> sio, """
for(int i=1, end=nd_collapse;i<end;i++){
if(nd_collapse_[i]==1)nd_collapse--;
}
if(nd_collapse == 1 """%locals()
l=["local_str[%(ipos)s][nd_collapse-1]==1 "%locals()for ipos in range(len(node.inputs)) if not _logical_scalar(node.inputs[ipos])]
l+=["local_ostr[%(ipos)s][nd_collapse-1]==1 "%locals()for ipos in range(len(node.outputs)) if not _logical_scalar(node.outputs[ipos])]
if len(l)>0:
print >> sio," && "," && ".join(l)
print >> sio,"""){nd_collapse=0;} """
if self.verbose:
print >> sio, 'std::cerr <<"after can_collapse\\n";'
print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ %locals()
if self.verbose>1:
for ipos in ["i"+ str(x) for x in xrange(len(node.inputs))]+["o"+ str(x) for x in xrange(len(node.outputs))]:
print >> sio, 'std::cerr << " local_%(ipos)s_str " <<'%locals()+' << " " << '.join(["local_%(ipos)s_str[%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)):
print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for ipos in xrange(len(node.outputs)):
print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
def launch_Ccontiguous(nodename, id_self, scalar_op):
def launch_Ccontiguous(nodename, id_self, scalar_op, sync=True):
kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos)
......@@ -819,6 +885,9 @@ nd_collapse_[i]=0;
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//std::cerr << "calling callkernel returned\\n";
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -830,18 +899,20 @@ nd_collapse_[i]=0;
%(verb)s
return 0;
""" %locals()
else:
print >> sio, " return 0; " %locals()
def launch_General(nodename, id_self, scalar_op, force_nd):
def launch_General(nodename, id_self, scalar_op, force_nd, sync=True):
# kernel_call_args are used to invoke the cuda kernel
local="local_"
kernel_call_args = ["numEls"]
kernel_call_args.extend(local+"dims[%i]"%di for di in xrange(force_nd))
for ipos in xrange(len(node.inputs)):
kernel_call_args+=["i%i_data"%ipos] + list(local+"i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
kernel_call_args+=["i%i_data"%ipos] + list(local+"str[%i][%i]"%(ipos, di) for di in xrange(force_nd))
#strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
#kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)):
kernel_call_args+=["o%i_data"%ipos] + list(local+"o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
kernel_call_args+=["o%i_data"%ipos] + list(local+"ostr[%i][%i]"%(ipos, di) for di in xrange(force_nd))
#strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
if self.verbose:
......@@ -857,6 +928,9 @@ nd_collapse_[i]=0;
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -867,14 +941,15 @@ nd_collapse_[i]=0;
}
return 0;
""" %locals()
else:
print >> sio, " return 0; " %locals()
print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals()
print >> sio, "case 0: {"
launch_Ccontiguous(nodename, id_self, scalar_op)
launch_Ccontiguous(nodename, id_self, scalar_op, self.sync)
print >> sio, " } break;"
for i in range(1, nd+1):
print >> sio, "case "+str(i)+": {"
launch_General(nodename, id_self, scalar_op, i)
launch_General(nodename, id_self, scalar_op, i, self.sync)
print >> sio, " } break;"
print >> sio, "}"#end case
......
......@@ -51,7 +51,6 @@ def local_gpu_elemwise_0(node):
if numpy.any([hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.any([o.type.dtype == 'float64' for o in node.outputs]):
print 'WARNING: THERE ARE STILL float64s in your graph local_gpu_elemwise_0', node
import pdb; pdb.set_trace()
else:
# move the add to a GpuAdd
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
......
......@@ -172,7 +172,7 @@ def speed_elemwise_collapse2():
t2=time.time()
def test_elemwise_collapse():
""" used to test if the case where all inputs are broadcast """
""" Test when all inputs have one(and the same) broadcastable dimension """
shape = (4,5,60)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -186,14 +186,16 @@ def test_elemwise_collapse():
v = numpy.asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
print "Expected collapse of all dimensions"
def test_elemwise_collapse2():
""" used to test if the case where one inputs have a broadcast """
""" Test when only one inputs have one broadcastable dimension """
shape = (4,5,60)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -207,14 +209,16 @@ def test_elemwise_collapse2():
v = numpy.asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
print "Expected collapse to 3 dimensions"
def test_elemwise_collapse3():
""" used to test if the case where one inputs have 2 broadcast dimensions at each ends."""
""" Test when only one inputs have two broadcastable dimension at each ends """
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -228,14 +232,16 @@ def test_elemwise_collapse3():
v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v)
print "Expected collapse to 3 dimensions"
def test_elemwise_collapse4():
""" used to test if the case where one inputs have 2 broadcast dimensions at each ends and a scalar"""
""" Test when only one inputs have two broadcastable dimension at each ends and we add a scalar"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -249,8 +255,74 @@ def test_elemwise_collapse4():
v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v+2)
print "Expected collapse to 3 dimensions"
def test_elemwise_collapse5():
""" Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1)
b = tcn.CudaNdarrayType((False, False, False, False))()
c = (a3+b+2)
f = pfunc([b], [c])
v = numpy.asarray(numpy.random.rand(5,4,shape[0],shape[1]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v+2)
print "Expected collapse to 2 dimensions"
def test_elemwise_collapse6():
""" Test when all inputs have two broadcastable dimension at the beginning"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1)
b = tcn.CudaNdarrayType((True, True, False, False))()
f = pfunc([b], [a3+b])
v = numpy.asarray(numpy.random.rand(1,1,shape[0],shape[1]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v)
print "Expected collapse to c contiguous"
def test_elemwise_collapse7(atol=1e-6):
""" Test when one input have one broadcastable dimension and the other is a scalar"""
shape = (5,4,1)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a.copy(), 'a')
a3 = a2.dimshuffle(0, 'x', 1, 2)
f = pfunc([], [a3+2])
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f()[0]
ans=(a+2).reshape(shape[0],1,shape[1],shape[2])
assert numpy.allclose(out,ans, atol=atol)
print "Expected collapse to c contiguous"
......@@ -228,6 +228,11 @@ class Kouh2008(object):
class Config(object):
use_gpu = True
dtype='float32'
dtype2=dtype
if dtype2=='floatX':
import theano.config as c
dtype2 = c.config.get('scalar.floatX')
rng_seed = 23498
n_hid = 300
......@@ -296,7 +301,7 @@ def test_bench_elemwise(n_iter=1000, **kwargs):
xval = numpy.asarray(
rng.uniform(size=(conf.ft_batchsize, x.type.shape[1])),
dtype=conf.dtype,
dtype=conf.dtype2,
)
yval = numpy.arange(conf.ft_batchsize)
for i in xrange(n_iter):
......
......@@ -261,11 +261,17 @@ class CudaNdarrayType(Type):
def c_code_cache_version(self):
#return ()
#no need to put nvcc.fastmath in the tuple as the c_compile_args is put in the key.
return (2,) # with assertion about refcounts
def c_compiler(self):
return nvcc_module_compile_str
def c_compile_args(self):
ret = []
if config.config.getboolean('nvcc.fastmath'):
ret.append('-use_fast_math')
return ret
# THIS WORKS
# But CudaNdarray instances don't compare equal to one another, and what about __hash__ ?
......
......@@ -56,6 +56,8 @@ def constant(x):
class Scalar(Type):
def __init__(self, dtype):
if dtype=='floatX':
dtype=config.config.get('floatX')
self.dtype = dtype
self.dtype_specs() # error checking
......@@ -238,8 +240,8 @@ class Scalar(Type):
def c_code_cache_version(self):
#return ()
# no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (4,) #explicit T given in specialization of operator= lines. This makes it compile with open64
#2,
int8 = Scalar('int8')
......@@ -252,6 +254,7 @@ uint32 = Scalar('uint32')
uint64 = Scalar('uint64')
float32 = Scalar('float32')
float64 = Scalar('float64')
floatX = Scalar(config.config.get('scalar.floatX'))
complex64 = Scalar('complex64')
complex128 = Scalar('complex128')
......@@ -934,6 +937,7 @@ convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
convert_to_float32 = Cast(float32, name='convert_to_float32')
convert_to_float64 = Cast(float64, name='convert_to_float64')
convert_to_floatX = Cast(floatX, name='convert_to_floatX')
convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
convert_to_complex128 = Cast(complex128, name='convert_to_complex128')
......@@ -948,10 +952,13 @@ _cast_mapping = {
'uint64': convert_to_uint64,
'float32': convert_to_float32,
'float64': convert_to_float64,
'floatX': convert_to_floatX,
'complex64': convert_to_complex64,
'complex128': convert_to_complex128}
def cast(x, dtype):
"""Symbolically cast `x` to a Scalar of given `dtype`."""
if dtype=='floatX': dtype = config.config.get('scalar.floatX')
_x = as_scalar(x)
if _x.type.dtype == dtype:
return _x
......
......@@ -217,10 +217,17 @@ def _wrap_tensor_into_member(x):
return compile.module.Member(constant(x))
compile.module.register_wrapper(_obj_is_wrappable_as_tensor, _wrap_tensor_into_member)
if int(config.THEANO_CMP_SLOPPY):
if int(config.THEANO_CMP_SLOPPY)>1:
# This environment variable is a quick-and-dirty way to get low-precision comparisons.
# For a more precise setting of these tolerances set them explicitly in your user code by
# assigning, for example, "theano.tensor.basic.float32_atol = ..."
#when THEANO_CMP_SLOPPY>1 we are even more sloppy. This is usefull to test the gpu as they don't use extended precision and this cause some difference bigger then the normal sloppy.
float32_atol = 5e-4
float32_rtol = 1e-3
float64_rtol = 1e-4
float64_atol = 1e-3
elif int(config.THEANO_CMP_SLOPPY):
float32_atol = 1e-4
float32_rtol = 1e-3
float64_rtol = 1e-4
......@@ -275,6 +282,8 @@ class TensorType(Type):
Optional name for this type.
"""
self.dtype = str(dtype)
if self.dtype=='floatX':
self.dtype=config.config.get('scalar.floatX')
self.broadcastable = tuple(broadcastable)
self.dtype_specs() # error checking is done there
self.name = name
......@@ -601,6 +610,7 @@ cscalar = TensorType('complex64', ())
# 0-d (scalar) tensor types.  Prefix encodes dtype: z=complex128,
# f=float32, d=float64, x=floatX (resolved by TensorType.__init__ to the
# configured 'scalar.floatX' dtype), b=int8, w=int16, i=int32.
zscalar = TensorType('complex128', ())
fscalar = TensorType('float32', ())
dscalar = TensorType('float64', ())
xscalar = TensorType('floatX',())
bscalar = TensorType('int8', ())
wscalar = TensorType('int16', ())
iscalar = TensorType('int32', ())
......@@ -621,6 +631,7 @@ cvector = TensorType('complex64', (False, ))
# 1-d (vector) tensor types: one non-broadcastable dimension.
zvector = TensorType('complex128', (False, ))
fvector = TensorType('float32', (False, ))
dvector = TensorType('float64', (False, ))
xvector = TensorType('floatX', (False, ))
bvector = TensorType('int8', (False,))
wvector = TensorType('int16', (False,))
ivector = TensorType('int32', (False, ))
......@@ -638,6 +649,7 @@ cmatrix = TensorType('complex64', (False, False))
# 2-d (matrix) tensor types: two non-broadcastable dimensions.
zmatrix = TensorType('complex128', (False, False))
fmatrix = TensorType('float32', (False, False))
dmatrix = TensorType('float64', (False, False))
xmatrix = TensorType('floatX', (False, False))
bmatrix = TensorType('int8', (False, False))
wmatrix = TensorType('int16', (False, False))
imatrix = TensorType('int32', (False, False))
......@@ -655,6 +667,7 @@ crow = TensorType('complex64', (True, False))
# Row types: 2-d with the first dimension broadcastable (shape (1, n)).
zrow = TensorType('complex128', (True, False))
frow = TensorType('float32', (True, False))
drow = TensorType('float64', (True, False))
xrow = TensorType('floatX', (True, False))
brow = TensorType('int8', (True, False))
wrow = TensorType('int16', (True, False))
irow = TensorType('int32', (True, False))
......@@ -668,6 +681,7 @@ ccol = TensorType('complex64', (False, True))
# Column types: 2-d with the second dimension broadcastable (shape (n, 1)).
zcol = TensorType('complex128', (False, True))
fcol = TensorType('float32', (False, True))
dcol = TensorType('float64', (False, True))
xcol = TensorType('floatX', (False, True))
bcol = TensorType('int8', (False, True))
wcol = TensorType('int16', (False, True))
icol = TensorType('int32', (False, True))
......@@ -681,6 +695,7 @@ ctensor3 = TensorType('complex64', (False,)*3)
# 3-d tensor types: three non-broadcastable dimensions.
ztensor3 = TensorType('complex128', (False,)*3)
ftensor3 = TensorType('float32', (False,)*3)
dtensor3 = TensorType('float64', (False,)*3)
xtensor3 = TensorType('floatX', (False,)*3)
btensor3 = TensorType('int8', (False,)*3)
wtensor3 = TensorType('int16', (False,)*3)
itensor3 = TensorType('int32', (False,)*3)
......@@ -690,6 +705,7 @@ ctensor4 = TensorType('complex64', (False,)*4)
# 4-d tensor types: four non-broadcastable dimensions.
ztensor4 = TensorType('complex128', (False,)*4)
ftensor4 = TensorType('float32', (False,)*4)
dtensor4 = TensorType('float64', (False,)*4)
xtensor4 = TensorType('floatX', (False,)*4)
btensor4 = TensorType('int8', (False,)*4)
wtensor4 = TensorType('int16', (False,)*4)
itensor4 = TensorType('int32', (False,)*4)
......@@ -1086,6 +1102,9 @@ _convert_to_float32 = _conversion(elemwise.Elemwise(scal.convert_to_float32), 'f
# Elemwise (tensor-level) wrappers around the scalar convert_to_* ops,
# built through the module's _conversion helper.
_convert_to_float64 = _conversion(elemwise.Elemwise(scal.convert_to_float64), 'float64')
"""Cast to double-precision floating point"""
_convert_to_floatX = _conversion(elemwise.Elemwise(scal.convert_to_floatX), 'floatX')
"""Cast to floatX floating point"""
_convert_to_complex64 = _conversion(elemwise.Elemwise(scal.convert_to_complex64), 'complex64')
"""Cast to single-precision complex"""
......@@ -1103,11 +1122,14 @@ _cast_mapping = {
'uint64': _convert_to_uint64,
'float32': _convert_to_float32,
'float64': _convert_to_float64,
'floatX': _convert_to_floatX,
'complex64': _convert_to_complex64,
'complex128': _convert_to_complex128}
@constructor
def cast(x, dtype):
"""Symbolically cast `x` to a Tensor of type `dtype`."""
if dtype=='floatX': dtype = config.config.get('scalar.floatX')
_x = as_tensor_variable(x)
if _x.type.dtype == dtype:
return _x
......@@ -2462,7 +2484,7 @@ def get_vector_length(v):
return join.vec_length(v)
except ValueError:
pass
if v.owner and v.owner.op == _shape:
if v.owner and isinstance(v.owner.op, Shape):
return v.owner.inputs[0].type.ndim
raise ValueError("length not known")
......
......@@ -1929,6 +1929,63 @@ def test_default_state():
assert f(1) == 4.8
assert f(2.2) == 7
def test_cast_floatX():
    """Check that cast() emits a node only when the dtypes really differ.

    Casting a variable to its own dtype (including floatX when it
    resolves to the variable's dtype) must optimize away entirely;
    otherwise exactly one node must remain in the compiled graph.
    """
    floatx = config.config.get('scalar.floatX')

    def n_nodes(make_input, target_dtype):
        # Number of nodes left after compiling a cast of one input.
        var = make_input('x')
        fn = function([var], [cast(var, target_dtype)])
        # print fn.maker.env.toposort()
        return len(fn.maker.env.toposort())

    # Casting to the variable's own concrete dtype is a no-op.
    assert n_nodes(dvector, 'float64') == 0
    assert n_nodes(fvector, 'float32') == 0

    # floatX input cast to a concrete dtype: no-op iff floatX matches it.
    assert n_nodes(xvector, 'float64') == (0 if floatx == 'float64' else 1)
    assert n_nodes(xvector, 'float32') == (0 if floatx == 'float32' else 1)

    # Concrete input cast to 'floatX': no-op iff floatX matches it.
    assert n_nodes(dvector, 'floatX') == (0 if floatx == 'float64' else 1)
    assert n_nodes(fvector, 'floatX') == (0 if floatx == 'float32' else 1)

    # floatX cast to floatX is always a no-op.
    assert n_nodes(xvector, 'floatX') == 0
if __name__ == '__main__':
if len(sys.argv) >= 2 and sys.argv[1] == 'OPT':
default_mode = compile.Mode(linker = 'c&py',
......
......@@ -1059,34 +1059,42 @@ class test_fusion(unittest.TestCase):
print "time", self.do(mode, shared, shp=(1000,1000),gpu=False, assert_len_topo=False,slice=s, nb_repeat=100)
def tes_memory_leak(self, mode=compile.mode.predefined_modes['FAST_RUN'], shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None):
def tes_memory_leak(self, mode=compile.mode.Mode('c', 'merge'), shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None):
"""
param shared_fn: if None, will use compile.function
verify that the elemwise fusion work
Test with and without DimShuffle
"""
#TODO: disable the canonizer?
fx, fy = fmatrices('xy')
fx = fmatrices('x')
fxv = numpy.zeros(shp, dtype='float32')+ 2
fyv = numpy.zeros(shp, dtype='float32')+ 3
cases = [
(fx+fy,(fx,fy),(fxv,fyv),1,fxv+fyv,'float32'),#1
(fx,(fx),(fxv),'float32'),#1
]
import gc, pdb, objgraph, weakref
d={}
dl=[]
v1=None
for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
mode=compile.mode.Mode('c', 'merge')
for id, [g, sym_inputs, val_inputs, out_dtype] in enumerate(cases):
for zzzz in range(nb_repeat):
v=numpy.zeros(shp, dtype=out_dtype)
gc.collect();gc.collect();gc.collect()
print 'v1',v1
v1=weakref.ref(v)
# print 'v1',v1
# v1=weakref.ref(v)
out=shared_fn(v,'out')
f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
pdb.set_trace()
# f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
# f = pfunc([fx],[],updates=[(out,out+fx)],mode=mode)
# f = pfunc([fx],out+fx,mode=mode)
# f = compile.function([fx,out],[out+fx],mode=mode)#no memory leak.
f = compile.function([fx,compile.In(variable=out, value=out.container, mutable=None)],
[out+fx],mode=mode)#if mutable is True or False, their is a memory leak
del v
gc.collect();gc.collect();gc.collect()
pdb.set_trace()
if True:
if False:
gc.collect();gc.collect();gc.collect()
nd=objgraph.typestats()
print 'key, old val, new val, diff'
......@@ -1097,7 +1105,7 @@ class test_fusion(unittest.TestCase):
d=nd
# pdb.set_trace()
if True:
if False:
gc.collect();gc.collect();gc.collect()
ndl=objgraph.by_type('list')
ll=[]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论