testgroup / pytensor — Commit 80b4304a
Authored Oct 28, 2010 by Frederic Bastien
make GpuElemwise generate code that works inplace when the destroy_map tells it to.

Parent: e023c7ee
Showing 2 changed files with 26 additions and 17 deletions:

theano/sandbox/cuda/basic_ops.py  +2 -14
theano/sandbox/cuda/elemwise.py   +24 -3
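For context before the diffs: a Theano Op's destroy_map maps each output index to the list of input indices whose storage that output overwrites, and GpuElemwise's inplace_pattern carries the same per-output information. The snippet below is a minimal, illustrative sketch of that contract in plain NumPy, not Theano's actual Op API:

    import numpy as np

    # Illustrative sketch of the destroy_map contract; InplaceAdd is a
    # made-up class, not Theano's Op API. Keys are output indices, values
    # list the input indices whose storage that output destroys.
    class InplaceAdd(object):
        destroy_map = {0: [0]}  # output 0 overwrites input 0's buffer

        def perform(self, inputs, output_storage):
            a, b = inputs
            a += b                 # write the result into input 0's storage
            output_storage[0] = a  # output 0 aliases input 0

    x = np.ones(3)
    out = [None]
    InplaceAdd().perform([x, np.full(3, 2.0)], out)
    assert out[0] is x             # no new buffer was allocated
    assert (x == 3.0).all()

Once the map is declared, the scheduler knows input 0's old contents are destroyed, so no other node may still depend on them.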
theano/sandbox/cuda/basic_ops.py

@@ -74,15 +74,6 @@ class GpuElemwise(Op):
     nout = property(lambda self: self.scalar_op.nout)

     def __init__(self, scalar_op, inplace_pattern={}, sync=None):
-        ##
-        # TODO: implement inplace operations.
-        # It's ok that we set the DestroyMap to something but then don't actually destroy
-        # anything. It's just a bit of a waste of memory.
-        #
-        # As current GPUs don't have cache, this probably doesn't make any difference to
-        # the amount of loading and storing to global memory that we would have to do.
-        # That's why it isn't implemented yet.
-        #
         sync = config.gpuelemwise.sync
         self.scalar_op = scalar_op

@@ -93,7 +84,8 @@ class GpuElemwise(Op):
         self._rehash()
-        self.src_generator = NaiveAlgo(self.scalar_op, sync=sync)
+        self.src_generator = NaiveAlgo(self.scalar_op, sync=sync,
+                                       inplace_pattern=self.inplace_pattern)

     def __getstate__(self):
         d = copy.copy(self.__dict__)

@@ -129,10 +121,6 @@ class GpuElemwise(Op):
         return self._hashval

     def __str__(self):
-        if 0:
-            # TODO:
-            # Current implementation does not use inplace pattern
-            # although since memory on card is precious... it should!
         if self.inplace_pattern:
             items = self.inplace_pattern.items()
             items.sort()
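The behavioural change in basic_ops.py is small: the long-standing TODO comments come out, and the constructor now forwards the op's inplace pattern to the code generator. For illustration, here is a hedged sketch (the helper name is mine, not from this commit) of how a destroy_map-style mapping flattens into the {output_index: input_index} dict that NaiveAlgo receives; the generated code below aliases exactly one input buffer per in-place output:

    # Hypothetical helper, not part of the commit: flatten a
    # destroy_map-style {out_idx: [in_idx, ...]} into the {out_idx: in_idx}
    # shape that NaiveAlgo's inplace_pattern argument uses.
    def destroy_map_to_inplace_pattern(destroy_map):
        pattern = {}
        for out_idx, in_idxs in destroy_map.items():
            assert len(in_idxs) == 1  # each output may reuse exactly one input
            pattern[out_idx] = in_idxs[0]
        return pattern

    print(destroy_map_to_inplace_pattern({0: [0]}))  # -> {0: 0}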
theano/sandbox/cuda/elemwise.py

@@ -37,15 +37,16 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', data_str='ii_i%i
 class NaiveAlgo(object):
     verbose = 0  # 1, 2 or 3 for more verbose output.
     cache_version = ()
-    cache_version = ('debug', 12, verbose)
+    cache_version = ('debug', 13, verbose)

-    def __init__(self, scalar_op, sync=True):
+    def __init__(self, scalar_op, sync=True, inplace_pattern={}):
         """
         :param scalar_op: the scalar operation to execute on each element.
         :param sync: if True, will wait after the kernel launch and check for error call.
         """
         self.scalar_op = scalar_op
         self.sync = sync
+        self.inplace_pattern = inplace_pattern

     def c_src_kernel(self, node, nodename, nd):
         sio = StringIO.StringIO()

@@ -875,7 +876,8 @@ nd_collapse_[i]=0;
             emitted_inames[iname] = True
         #check that all outputs have valid dimensions
-        for oname in outputs:
+        for idx, oname in enumerate(outputs):
+            if idx not in self.inplace_pattern.keys():
                 print >> sio, """
         for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
             if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])

@@ -903,6 +905,25 @@ nd_collapse_[i]=0;
         //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
         //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
         """ % locals()
+            else:
+                input_idx = self.inplace_pattern[idx]
+                iname = inputs[input_idx]
+                print >> sio, """
+        Py_XDECREF(%(oname)s);
+        %(oname)s = %(iname)s;
+        Py_INCREF(%(oname)s);
+        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
+            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
+            {
+                Py_DECREF(%(oname)s);
+                %(oname)s = NULL;
+                %(fail)s;
+            }
+        }
+        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
+        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
+        """ % locals()
         print >> sio, """
     {
         //new block so that failure gotos don't skip over variable initialization
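Taken together, the new else branch makes the generated C reuse the input's device buffer whenever an output index appears in inplace_pattern: it drops any stale output, points the output variable at the input CudaNdarray, takes a reference, and still validates the dimensions. The sketch below expands the template the same way the generator does, via % locals(); the names out0/in0/nd and the fail body are stand-ins made up for the example:

    # Illustrative expansion of the new in-place branch. oname/iname/nd/fail
    # are stand-ins; the real generator substitutes node-specific names.
    oname, iname, nd = "out0", "in0", "2"
    fail = "{ err = 1; goto fail_label; }"  # placeholder for Theano's %(fail)s

    print("""
            Py_XDECREF(%(oname)s);
            %(oname)s = %(iname)s;
            Py_INCREF(%(oname)s);
            for (int i = 0; (i < %(nd)s) && (%(oname)s); ++i) {
                if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
                {
                    Py_DECREF(%(oname)s);
                    %(oname)s = NULL;
                    %(fail)s;
                }
            }
    """ % locals())

Aliasing is safe for an elementwise kernel because each output element depends only on the corresponding input elements, so reading and writing the same device buffer cannot race; the dimension check still guards against an in-place output whose shape no longer matches. This is what cashes in the removed TODO's observation that "memory on card is precious".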