testgroup / pytensor / Commits

Commit c5f98527, authored Jun 04, 2010 by Frederic Bastien
implemented GpuGemm not inplace version
Parent: 6658cf32

Showing 2 changed files with 76 additions and 9 deletions:

  theano/sandbox/cuda/blas.py   +60  -5
  theano/sandbox/cuda/opt.py    +16  -4
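For orientation: both variants compute z_out = alpha * dot(x, y) + beta * z_in. The inplace variant overwrites z_in (declaring destroy_map = {0: [0]}), while the new not-inplace variant first copies z_in to z_out. A minimal NumPy sketch of the two behaviours (my illustration, not the commit's CUDA code):

    import numpy as np

    def gemm_no_inplace(z_in, alpha, x, y, beta):
        # Work on a copy; the caller's z_in is left untouched.
        z_out = z_in.copy()
        z_out *= beta
        z_out += alpha * np.dot(x, y)
        return z_out

    def gemm_inplace(z_in, alpha, x, y, beta):
        # Overwrite z_in; the returned array aliases the input.
        z_in *= beta
        z_in += alpha * np.dot(x, y)
        return z_in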
theano/sandbox/cuda/blas.py

@@ -153,14 +153,29 @@ class GpuGemm(Op):
     Need to check at least refcount.
     """
-    destroy_map = {0: [0]}
+    def __init__(self, inplace):
+        self.__setstate__({'inplace': inplace})
+
     def __str__(self):
-        return 'GpuGemm'
+        if self.inplace:
+            inplace_str = 'inplace'
+        else:
+            inplace_str = 'no_inplace'
+        return '%s{%s}' % (self.__class__.__name__, inplace_str)
+
     def __eq__(self, other):
-        return type(self) == type(other)
+        return (type(self) == type(other) \
+                and self.inplace == other.inplace)
+
     def __hash__(self):
-        return hash(type(self))
+        return hash(type(self)) ^ hash(self.inplace)
+
+    def __setstate__(self, dct):
+        inplace = dct.get('inplace', True)
+        if inplace:
+            self.destroy_map = {0: [0]}
+        self.inplace = inplace
+
+    def __getstate__(self):
+        return dict(inplace=self.inplace)
+
     def make_node(self, z, a, x, y, b):
         # the more complicated error checking performed by tensor.gemm is assumed to already

@@ -171,9 +186,13 @@ class GpuGemm(Op):
         return (2,)

     def c_code(self, node, name, inputs, outputs, sub):
         #z_out = alpha * dot(x,y) + beta * z_in
+        #inplace version: set z_out = z_in
+        #not inplace version: we copy z_in to z_out.
         z_in, a, x, y, b = inputs
         z_out, = outputs
         fail = sub['fail']
-        return """
+        if self.inplace:
+            return """
         #define REAL float

@@ -194,7 +213,43 @@ class GpuGemm(Op):
         %(z_out)s = %(z_in)s;
         Py_INCREF(%(z_out)s);
         """ % locals()
+        else:
+            return """
+        #define REAL float
+        float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
+        ? (REAL)(((float*)%(a)s->data)[0])
+        : (REAL)(((double*)%(a)s->data)[0]);
+        float %(name)s_b = (%(b)s->descr->type_num == PyArray_FLOAT) ?
+        (REAL)(((float*)%(b)s->data)[0])
+        : (REAL)(((double*)%(b)s->data)[0]);
+        #undef REAL
+
+        if ((NULL == %(z_out)s)
+            || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
+            || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]))
+        {
+            Py_XDECREF(%(z_out)s);
+            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
+            if(!%(z_out)s) {
+                PyErr_SetString(PyExc_MemoryError, "failed to alloc GpuGemm{no_inplace} output");
+                %(fail)s
+            }
+        }else{
+            if(CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)){
+                PyErr_SetString(PyExc_MemoryError, "failed to copy input in GpuGemm{no_inplace}");
+                %(fail)s
+            }
+        }
+
+        if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
+        {
+            %(fail)s;
+        }
+        """ % locals()

-gpu_gemm = GpuGemm()
+gpu_gemm_inplace = GpuGemm(True)
+gpu_gemm_no_inplace = GpuGemm(False)

 ##
 # Not really a BLAS operation, but whatever.
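The __setstate__ default of inplace=True keeps old pickles working: instances serialized before this commit carry no 'inplace' entry in their state, and the pre-commit GpuGemm was always inplace. A standalone sketch of just the state handling from the diff (the Op machinery is omitted):

    class GpuGemmSketch(object):
        def __init__(self, inplace):
            self.__setstate__({'inplace': inplace})

        def __setstate__(self, dct):
            inplace = dct.get('inplace', True)  # old pickles default to inplace
            if inplace:
                self.destroy_map = {0: [0]}     # output 0 destroys input 0
            self.inplace = inplace

        def __getstate__(self):
            return dict(inplace=self.inplace)

    # Restoring an old pickle (empty state dict) reproduces the
    # pre-commit, always-inplace behaviour:
    op = GpuGemmSketch.__new__(GpuGemmSketch)
    op.__setstate__({})
    assert op.inplace and op.destroy_map == {0: [0]}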
theano/sandbox/cuda/opt.py

@@ -7,7 +7,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to
 from theano.sandbox.cuda.basic_ops import *
 from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm, GpuConv
+from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv
 from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
 from theano.sandbox.cuda.nnet import (
         GpuCrossentropySoftmaxArgmax1HotWithBias,

@@ -187,6 +187,8 @@ def local_gpu_dot22scalar(node):
 @local_optimizer([])
 def local_gpu_gemm(node):
     """
+    work for inplace and not inplace gemm
+
     gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
     gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)

@@ -195,14 +197,24 @@ def local_gpu_gemm(node):
         host_input = node.inputs[0]
         if host_input.owner and host_input.owner.op == tensor.blas.gemm_inplace:
             z, a, x, y, b = host_input.owner.inputs
-            return [gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
-    if node.op == tensor.blas.gemm_inplace:
+            return [gpu_gemm_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
+        elif host_input.owner and host_input.owner.op == tensor.blas.gemm_no_inplace:
+            z, a, x, y, b = host_input.owner.inputs
+            return [gpu_gemm_no_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
+    elif node.op == tensor.blas.gemm_inplace:
+        z, a, x, y, b = node.inputs
+        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
+        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
+        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
+        if x_on_gpu or y_on_gpu or z_on_gpu:
+            return [host_from_gpu(gpu_gemm_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
+    elif node.op == tensor.blas.gemm_no_inplace:
         z, a, x, y, b = node.inputs
         x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
         y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
         z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
         if x_on_gpu or y_on_gpu or z_on_gpu:
-            return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
+            return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
     return False

 @register_opt()
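Note how the optimizer now preserves inplace-ness across the host-to-GPU transfer: gemm_inplace maps to gpu_gemm_inplace, gemm_no_inplace to gpu_gemm_no_inplace, and anything else falls through to "return False". A self-contained sketch of that dispatch shape, with hypothetical stand-in names rather than Theano's graph API:

    # Hypothetical names for illustration; the real optimizer rewrites
    # Theano graph nodes instead of mapping strings.
    GPU_VARIANT = {
        'gemm_inplace': 'gpu_gemm_inplace',
        'gemm_no_inplace': 'gpu_gemm_no_inplace',
    }

    def pick_gpu_gemm(host_op_name):
        # None mirrors local_gpu_gemm's "return False" (no rewrite applies).
        return GPU_VARIANT.get(host_op_name)

    assert pick_gpu_gemm('gemm_no_inplace') == 'gpu_gemm_no_inplace'
    assert pick_gpu_gemm('dot22') is None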