提交 edf434c0 authored 作者: Frederic Bastien's avatar Frederic Bastien

implemented the new flag THEANO_FLAGS=gpuelemwise.sync that defaults to True.…

implemented the new flag THEANO_FLAGS=gpuelemwise.sync that defaults to True. When False, it won't wait for the GPU function to finish and won't check the return status. cuda/tests pass in debug mode. Should we add a sync at the end of all Theano graphs? How to do so?
上级 194d4ca2
...@@ -11,6 +11,7 @@ default_={ ...@@ -11,6 +11,7 @@ default_={
'op.set_flops':False,#currently used only in ConvOp. The profile mode will print the flops/s for the op. 'op.set_flops':False,#currently used only in ConvOp. The profile mode will print the flops/s for the op.
'nvcc.fastmath':False, 'nvcc.fastmath':False,
'scalar.floatX':'float64', 'scalar.floatX':'float64',
'gpuelemwise.sync':True, #when true, wait that the gpu fct finished and check it error code.
} }
#default value taked from env variable #default value taked from env variable
......
...@@ -2,7 +2,7 @@ import StringIO, sys ...@@ -2,7 +2,7 @@ import StringIO, sys
import numpy import numpy
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar from theano import tensor, scalar, config
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.type_support import filter as type_support_filter from theano.sandbox.cuda.type_support import filter as type_support_filter
...@@ -67,7 +67,7 @@ class GpuElemwise(Op): ...@@ -67,7 +67,7 @@ class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern): def __init__(self, scalar_op, inplace_pattern, sync=None):
## ##
# TODO: implement inplace operations. # TODO: implement inplace operations.
# It's ok that we set the DestroyMap to something but then don't actually destroy # It's ok that we set the DestroyMap to something but then don't actually destroy
...@@ -77,6 +77,7 @@ class GpuElemwise(Op): ...@@ -77,6 +77,7 @@ class GpuElemwise(Op):
# the amount of loading and storing to global memory that we would have to do. # the amount of loading and storing to global memory that we would have to do.
# That's why it isn't implemented yet. # That's why it isn't implemented yet.
# #
sync = config.config.getboolean('gpuelemwise.sync',sync)
self.scalar_op = scalar_op self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items()) self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
...@@ -86,7 +87,8 @@ class GpuElemwise(Op): ...@@ -86,7 +87,8 @@ class GpuElemwise(Op):
self.ufunc = None self.ufunc = None
self._rehash() self._rehash()
self.src_generator = NaiveAlgo(self.scalar_op) self.src_generator = NaiveAlgo(self.scalar_op, sync=sync)
self.sync = sync
def __getstate__(self): def __getstate__(self):
d = copy.copy(self.__dict__) d = copy.copy(self.__dict__)
......
...@@ -214,8 +214,15 @@ class NaiveAlgo(object): ...@@ -214,8 +214,15 @@ class NaiveAlgo(object):
cache_version = () cache_version = ()
cache_version = ('debug', 7, verbose) cache_version = ('debug', 7, verbose)
def __init__(self, scalar_op): def __init__(self, scalar_op, sync=True):
"""
:param scalar_op: the scalar operation to execute on each element.
:param sync: if True, will wait after the kernel launch and check for error call.
"""
self.scalar_op = scalar_op self.scalar_op = scalar_op
self.sync = sync
if not self.sync:
self.cache_version+=('nosync',)
def c_src_kernel(self, node, nodename, nd): def c_src_kernel(self, node, nodename, nd):
sio = StringIO.StringIO() sio = StringIO.StringIO()
...@@ -860,7 +867,7 @@ nd_collapse_[i]=0; ...@@ -860,7 +867,7 @@ nd_collapse_[i]=0;
print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
def launch_Ccontiguous(nodename, id_self, scalar_op): def launch_Ccontiguous(nodename, id_self, scalar_op, sync=True):
kernel_call_args = ["numEls"] kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos) kernel_call_args.append("i%i_data"%ipos)
...@@ -876,6 +883,9 @@ nd_collapse_[i]=0; ...@@ -876,6 +883,9 @@ nd_collapse_[i]=0;
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s); kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//std::cerr << "calling callkernel returned\\n"; //std::cerr << "calling callkernel returned\\n";
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -887,8 +897,10 @@ nd_collapse_[i]=0; ...@@ -887,8 +897,10 @@ nd_collapse_[i]=0;
%(verb)s %(verb)s
return 0; return 0;
""" %locals() """ %locals()
else:
print >> sio, " return 0; " %locals()
def launch_General(nodename, id_self, scalar_op, force_nd): def launch_General(nodename, id_self, scalar_op, force_nd, sync=True):
# kernel_call_args are used to invoke the cuda kernel # kernel_call_args are used to invoke the cuda kernel
local="local_" local="local_"
kernel_call_args = ["numEls"] kernel_call_args = ["numEls"]
...@@ -914,6 +926,9 @@ nd_collapse_[i]=0; ...@@ -914,6 +926,9 @@ nd_collapse_[i]=0;
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s); kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -924,14 +939,15 @@ nd_collapse_[i]=0; ...@@ -924,14 +939,15 @@ nd_collapse_[i]=0;
} }
return 0; return 0;
""" %locals() """ %locals()
else:
print >> sio, " return 0; " %locals()
print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals() print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals()
print >> sio, "case 0: {" print >> sio, "case 0: {"
launch_Ccontiguous(nodename, id_self, scalar_op) launch_Ccontiguous(nodename, id_self, scalar_op, self.sync)
print >> sio, " } break;" print >> sio, " } break;"
for i in range(1, nd+1): for i in range(1, nd+1):
print >> sio, "case "+str(i)+": {" print >> sio, "case "+str(i)+": {"
launch_General(nodename, id_self, scalar_op, i) launch_General(nodename, id_self, scalar_op, i, self.sync)
print >> sio, " } break;" print >> sio, " } break;"
print >> sio, "}"#end case print >> sio, "}"#end case
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论