提交 edf434c0 authored 作者: Frederic Bastien's avatar Frederic Bastien

implemented the new flag THEANO_FLAGS=gpuelemwise.sync that default to True.…

implemented the new flag THEANO_FLAGS=gpuelemwise.sync that default to True. When false it won't wait for the gpu fct to finish and won't check the return status. cuda/tests pass in debug mode. Should we add a sync at the end of all the theano graph? How to do so?
上级 194d4ca2
......@@ -11,6 +11,7 @@ default_={
'op.set_flops':False,#currently used only in ConvOp. The profile mode will print the flops/s for the op.
'nvcc.fastmath':False,
'scalar.floatX':'float64',
'gpuelemwise.sync':True, #when true, wait that the gpu fct finished and check it error code.
}
#default value taked from env variable
......
......@@ -2,7 +2,7 @@ import StringIO, sys
import numpy
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar
from theano import tensor, scalar, config
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.type_support import filter as type_support_filter
......@@ -67,7 +67,7 @@ class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern):
def __init__(self, scalar_op, inplace_pattern, sync=None):
##
# TODO: implement inplace operations.
# It's ok that we set the DestroyMap to something but then don't actually destroy
......@@ -77,6 +77,7 @@ class GpuElemwise(Op):
# the amount of loading and storing to global memory that we would have to do.
# That's why it isn't implemented yet.
#
sync = config.config.getboolean('gpuelemwise.sync',sync)
self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
......@@ -86,7 +87,8 @@ class GpuElemwise(Op):
self.ufunc = None
self._rehash()
self.src_generator = NaiveAlgo(self.scalar_op)
self.src_generator = NaiveAlgo(self.scalar_op, sync=sync)
self.sync = sync
def __getstate__(self):
d = copy.copy(self.__dict__)
......
......@@ -214,8 +214,15 @@ class NaiveAlgo(object):
cache_version = ()
cache_version = ('debug', 7, verbose)
def __init__(self, scalar_op):
def __init__(self, scalar_op, sync=True):
"""
:param scalar_op: the scalar operation to execute on each element.
:param sync: if True, will wait after the kernel launch and check for error call.
"""
self.scalar_op = scalar_op
self.sync = sync
if not self.sync:
self.cache_version+=('nosync',)
def c_src_kernel(self, node, nodename, nd):
sio = StringIO.StringIO()
......@@ -860,7 +867,7 @@ nd_collapse_[i]=0;
print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
def launch_Ccontiguous(nodename, id_self, scalar_op):
def launch_Ccontiguous(nodename, id_self, scalar_op, sync=True):
kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos)
......@@ -876,6 +883,9 @@ nd_collapse_[i]=0;
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//std::cerr << "calling callkernel returned\\n";
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -887,8 +897,10 @@ nd_collapse_[i]=0;
%(verb)s
return 0;
""" %locals()
else:
print >> sio, " return 0; " %locals()
def launch_General(nodename, id_self, scalar_op, force_nd):
def launch_General(nodename, id_self, scalar_op, force_nd, sync=True):
# kernel_call_args are used to invoke the cuda kernel
local="local_"
kernel_call_args = ["numEls"]
......@@ -914,6 +926,9 @@ nd_collapse_[i]=0;
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -924,14 +939,15 @@ nd_collapse_[i]=0;
}
return 0;
""" %locals()
else:
print >> sio, " return 0; " %locals()
print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals()
print >> sio, "case 0: {"
launch_Ccontiguous(nodename, id_self, scalar_op)
launch_Ccontiguous(nodename, id_self, scalar_op, self.sync)
print >> sio, " } break;"
for i in range(1, nd+1):
print >> sio, "case "+str(i)+": {"
launch_General(nodename, id_self, scalar_op, i)
launch_General(nodename, id_self, scalar_op, i, self.sync)
print >> sio, " } break;"
print >> sio, "}"#end case
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论