提交 80b4304a，作者：Frederic Bastien

Make GpuElemwise generate code that works in place when the destroy_map says so.

上级 e023c7ee
...@@ -74,15 +74,6 @@ class GpuElemwise(Op): ...@@ -74,15 +74,6 @@ class GpuElemwise(Op):
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern = {}, sync=None): def __init__(self, scalar_op, inplace_pattern = {}, sync=None):
##
# TODO: implement inplace operations.
# It's ok that we set the DestroyMap to something but then don't actually destroy
# anything. It's just a bit of a waste of memory.
#
# As current GPUs don't have cache, this probably doesn't make any difference to
# the amount of loading and storing to global memory that we would have to do.
# That's why it isn't implemented yet.
#
sync = config.gpuelemwise.sync sync = config.gpuelemwise.sync
self.scalar_op = scalar_op self.scalar_op = scalar_op
...@@ -93,7 +84,8 @@ class GpuElemwise(Op): ...@@ -93,7 +84,8 @@ class GpuElemwise(Op):
self._rehash() self._rehash()
self.src_generator = NaiveAlgo(self.scalar_op, sync=sync) self.src_generator = NaiveAlgo(self.scalar_op, sync=sync,
inplace_pattern = self.inplace_pattern)
def __getstate__(self): def __getstate__(self):
d = copy.copy(self.__dict__) d = copy.copy(self.__dict__)
...@@ -129,14 +121,10 @@ class GpuElemwise(Op): ...@@ -129,14 +121,10 @@ class GpuElemwise(Op):
return self._hashval return self._hashval
def __str__(self): def __str__(self):
if 0: if self.inplace_pattern:
# TODO: items = self.inplace_pattern.items()
# Current implementation does not use inplace pattern items.sort()
# although since memory on card is precious... it should! return "GpuElemwise{%s}%s" % (self.scalar_op.__class__.__name__, str(items))
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return "GpuElemwise{%s}%s" % (self.scalar_op.__class__.__name__, str(items))
#return "GpuElemwise{%s}" % (self.scalar_op.__class__.__name__) #return "GpuElemwise{%s}" % (self.scalar_op.__class__.__name__)
return "GpuElemwise{%s}" % (self.scalar_op) return "GpuElemwise{%s}" % (self.scalar_op)
......
...@@ -37,15 +37,16 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', data_str='ii_i%i ...@@ -37,15 +37,16 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', data_str='ii_i%i
class NaiveAlgo(object): class NaiveAlgo(object):
verbose = 0 # 1, 2 or 3 for more verbose output. verbose = 0 # 1, 2 or 3 for more verbose output.
cache_version = () cache_version = ()
cache_version = ('debug', 12, verbose) cache_version = ('debug', 13, verbose)
def __init__(self, scalar_op, sync=True): def __init__(self, scalar_op, sync=True, inplace_pattern={}):
""" """
:param scalar_op: the scalar operation to execute on each element. :param scalar_op: the scalar operation to execute on each element.
:param sync: if True, will wait after the kernel launch and check for error call. :param sync: if True, will wait after the kernel launch and check for error call.
""" """
self.scalar_op = scalar_op self.scalar_op = scalar_op
self.sync = sync self.sync = sync
self.inplace_pattern = inplace_pattern
def c_src_kernel(self, node, nodename, nd): def c_src_kernel(self, node, nodename, nd):
sio = StringIO.StringIO() sio = StringIO.StringIO()
...@@ -875,8 +876,9 @@ nd_collapse_[i]=0; ...@@ -875,8 +876,9 @@ nd_collapse_[i]=0;
emitted_inames[iname] = True emitted_inames[iname] = True
#check that all outputs have valid dimensions #check that all outputs have valid dimensions
for oname in outputs: for idx,oname in enumerate(outputs):
print >> sio, """ if idx not in self.inplace_pattern.keys():
print >> sio, """
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{ {
...@@ -903,6 +905,25 @@ nd_collapse_[i]=0; ...@@ -903,6 +905,25 @@ nd_collapse_[i]=0;
//std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals() """ % locals()
else:
input_idx = self.inplace_pattern[idx]
iname = inputs[input_idx]
print >> sio, """
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_INCREF(%(oname)s);
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{
Py_DECREF(%(oname)s);
%(oname)s = NULL;
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals()
print >> sio, """ print >> sio, """
{ {
//new block so that failure gotos don't skip over variable initialization //new block so that failure gotos don't skip over variable initialization
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论