Commit 4031c7b6 authored by James Bergstra

cuda.blas - added inplace option to GpuGemm (and optimizations)

Parent: a73ce6b6
@@ -153,14 +153,24 @@ class GpuGemm(Op):
Need to check al least refcount. Need to check al least refcount.
""" """
def __init__(self, inplace):
    """Build a gemm op that either overwrites or preserves its z input.

    :param inplace: when True the op writes its result into input 0 (z),
        which is declared to the optimizer via ``destroy_map``.
    """
    self.inplace = inplace
    if self.inplace:
        # Output 0 destroys input 0 -- required so Theano never reuses z.
        self.destroy_map = {0: [0]}
def __str__(self):
    # Name the variant explicitly so debug printouts of a graph show
    # whether the destructive or the safe op was selected.
    return 'GpuGemm{inplace}' if self.inplace else 'GpuGemm{no_inplace}'
def __eq__(self, other):
    # Two ops are interchangeable only if they are the same class AND
    # agree on in-placeness (the optimizer must not merge the variants).
    if type(self) != type(other):
        return False
    return self.inplace == other.inplace
def __hash__(self):
    # Consistent with __eq__: both the concrete type and the inplace
    # flag contribute to the hash.
    type_hash = hash(type(self))
    return type_hash ^ hash(self.inplace)
def __setstate__(self, dct):
    """Restore pickled state.

    Pickles written before the ``inplace`` flag existed carry no such
    key; those ops behaved destructively, so default to ``True``.

    Bug fixed: unpickling bypasses ``__init__``, so ``destroy_map`` was
    never re-declared for inplace instances -- the optimizer would then
    treat a destructive op as pure.  Re-declare it here, and keep any
    other pickled attributes instead of dropping them.
    """
    self.__dict__.update(dct)
    self.inplace = dct.get('inplace', True)
    if self.inplace:
        self.destroy_map = {0: [0]}
def make_node(self, z, a, x, y, b): def make_node(self, z, a, x, y, b):
# the more complicated error checking performed by tensor.gemm is assumed to already # the more complicated error checking performed by tensor.gemm is assumed to already
...@@ -168,13 +178,16 @@ class GpuGemm(Op): ...@@ -168,13 +178,16 @@ class GpuGemm(Op):
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
    # Version 2: bumped when the inplace/no-inplace code paths were
    # introduced, so stale compiled modules are not reused.
    version = (2,)
    return version
def c_code(self, node, name, inputs, outputs, sub):
    """Return C code computing ``z_out = a * dot(x, y) + b * z_in`` on GPU.

    ``inputs`` is ``(z_in, a, x, y, b)`` and ``outputs`` is ``(z_out,)``.
    When ``self.inplace`` the output takes a reference to ``z_in`` and the
    gemm overwrites it; otherwise ``z_in`` is first copied into ``z_out``
    (reallocating if the shape does not match) so the caller's z survives.

    Bug fixed: the non-inplace branch emitted ``if (!(z_out)s)`` -- the
    missing ``%`` escaped the ``% locals()`` substitution and left the
    literal text ``(z_out)s`` in the generated C, which cannot compile.
    The string is assembled with a list + join (works on Python 2 and 3,
    unlike ``print >> sio``); the C semantics are unchanged.
    """
    z_in, a, x, y, b = inputs
    z_out, = outputs
    fail = sub['fail']
    chunks = []
    # Read the scalar coefficients a and b from their 0-d host arrays,
    # accepting either float32 or float64 storage.
    chunks.append("""
    #define REAL float
    float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
            ? (REAL)(((float*)%(a)s->data)[0])
            : (REAL)(((double*)%(a)s->data)[0]);
    float %(name)s_b = (%(b)s->descr->type_num == PyArray_FLOAT)
            ? (REAL)(((float*)%(b)s->data)[0])
            : (REAL)(((double*)%(b)s->data)[0]);
    #undef REAL
    """)
    if self.inplace:
        # Inplace: z_out is simply a new reference to z_in.
        chunks.append("""
    Py_XDECREF(%(z_out)s);
    %(z_out)s = %(z_in)s;
    Py_INCREF(%(z_out)s);
    """)
    else:
        # Not inplace: reuse z_out if it already has the right 2-d shape
        # (copy z_in into it), otherwise allocate a fresh copy of z_in.
        chunks.append("""
    if (!%(z_out)s
        || (%(z_out)s->nd != 2)
        || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
        || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]))
    {
        Py_XDECREF(%(z_out)s);
        %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
        if (!%(z_out)s)
        {
            %(fail)s;
        }
    }
    else
    {
        if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
        {
            %(fail)s;
        }
    }
    """)
    # Either way z_out now holds z_in's data; accumulate the gemm into it.
    chunks.append("""
    if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
    {
        %(fail)s;
    }
    """)
    return '\n'.join(chunks) % locals()
# The two public gemm singletons.  Graph-building code should use the
# non-destructive variant; an optimizer may later swap in the inplace one.
gpu_gemm_inplace = GpuGemm(inplace=True)
gpu_gemm_no_inplace = GpuGemm(inplace=False)
## ##
# Not really a BLAS operation, but whatever. # Not really a BLAS operation, but whatever.
......
...@@ -7,7 +7,8 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to ...@@ -7,7 +7,8 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to
from theano.sandbox.cuda.basic_ops import * from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm, GpuConv from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace,
gpu_gemm_no_inplace, GpuConv)
from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
...@@ -191,18 +192,21 @@ def local_gpu_gemm(node): ...@@ -191,18 +192,21 @@ def local_gpu_gemm(node):
def local_gpu_gemm(node):
    """
    gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
    """
    # Map each host gemm variant to the GPU op with the same in-placeness.
    gemms = {tensor.blas.gemm_inplace: gpu_gemm_inplace,
             tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
    if node.op == gpu_from_host:
        # gpu_from_host(gemm(...)) -> run the gemm directly on the GPU.
        producer = node.inputs[0].owner
        if producer and producer.op in gemms:
            gpu_op = gemms[producer.op]
            z, a, x, y, b = producer.inputs
            return [gpu_op(gpu_from_host(z), a,
                           gpu_from_host(x), gpu_from_host(y), b)]
    if node.op in gemms:
        # gemm with any GPU-resident tensor input -> move the whole gemm
        # to the GPU and transfer the result back.
        z, a, x, y, b = node.inputs
        any_on_gpu = any(v.owner and v.owner.op == host_from_gpu
                         for v in (x, y, z))
        if any_on_gpu:
            gpu_op = gemms[node.op]
            return [host_from_gpu(gpu_op(gpu_from_host(z), a,
                                         gpu_from_host(x),
                                         gpu_from_host(y), b))]
    return False
@register_opt() @register_opt()
......
Markdown format supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment