testgroup / pytensor
Commit c050562f, authored May 21, 2011 by Arnaud Bergeron

Gpu implementation of tensor.basic.outer

Parent: 46baf3f2

Showing 5 changed files with 170 additions and 2 deletions (+170 -2)
theano/sandbox/cuda/blas.py               +75 -0
theano/sandbox/cuda/cuda_ndarray.cu       +42 -0
theano/sandbox/cuda/cuda_ndarray.cuh       +4 -1
theano/sandbox/cuda/opt.py                +24 -1
theano/sandbox/cuda/tests/test_blas.py    +25 -0
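For reference, tensor.basic.outer computes the outer product of two vectors,
A[i, j] = x[i] * y[j]. A minimal NumPy sketch of the behaviour this commit
ports to the GPU (the concrete shapes here are illustrative, not from the
commit):

    import numpy

    x = numpy.arange(3, dtype='float32')  # shape (3,)
    y = numpy.arange(4, dtype='float32')  # shape (4,)

    A = numpy.outer(x, y)                 # shape (3, 4), A[i, j] == x[i] * y[j]
    assert A.shape == (3, 4)
    assert numpy.allclose(A, x[:, None] * y[None, :])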
theano/sandbox/cuda/blas.py
...
@@ -247,6 +247,81 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuOuter(Op):
    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
        assert (x.type.ndim == 1 and y.type.ndim == 1
                and x.type.dtype == 'float32' and y.type.dtype == 'float32')

        bz = [x.type.broadcastable[0], y.type.broadcastable[0]]
        outputs = [CudaNdarrayType(dtype='float32', broadcastable=bz)()]
        return Apply(self, [x, y], outputs)

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        # A = x * y'
        x, y = inputs
        A, = outputs
        fail = sub['fail']

        return """
        CudaNdarray *%(name)sx = NULL, *%(name)sy = NULL;
        int %(name)sres;

        if (CudaNdarray_HOST_STRIDES(%(x)s)[0] < 0) {
            %(name)sx = (CudaNdarray *)CudaNdarray_Copy(%(x)s);
            if (!%(name)sx) {
                %(fail)s;
            }
        } else {
            %(name)sx = %(x)s;
            Py_INCREF(%(name)sx);
        }
        if (CudaNdarray_HOST_STRIDES(%(y)s)[0] < 0) {
            %(name)sy = (CudaNdarray *)CudaNdarray_Copy(%(y)s);
            if (!%(name)sy) {
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        } else {
            %(name)sy = %(y)s;
            Py_INCREF(%(name)sy);
        }

        if (!(%(A)s
              && CudaNdarray_HOST_DIMS(%(A)s)[0] == CudaNdarray_HOST_DIMS(%(x)s)[0]
              && CudaNdarray_HOST_DIMS(%(A)s)[1] == CudaNdarray_HOST_DIMS(%(y)s)[0]
              && CudaNdarray_is_c_contiguous(%(A)s))) {
            Py_XDECREF(%(A)s);
            int dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[0];
            %(A)s = (CudaNdarray *)CudaNdarray_ZEROS(2, dims);
            if (!%(A)s) {
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }

        %(name)sres = CudaNdarray_sger(1.0, %(name)sx, %(name)sy, %(A)s);
        Py_DECREF(%(name)sy);
        Py_DECREF(%(name)sx);
        if (%(name)sres) {
            %(fail)s;
        }
        """ % dict(x=x, y=y, A=A, fail=fail, name=name)


gpu_outer = GpuOuter()
##
# Not really a BLAS operation, but whatever.
#
...
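GpuOuter launches no kernel of its own: it delegates to CudaNdarray_sger, a
wrapper around the BLAS rank-1 update A <- alpha * x * y^T + A. Because the
output is freshly allocated with CudaNdarray_ZEROS, the update with
alpha = 1.0 leaves exactly the outer product in A. A NumPy sketch of that
identity (sger_like is an illustrative stand-in, not part of the commit):

    import numpy

    def sger_like(alpha, x, y, A):
        # BLAS ?ger semantics: in-place rank-1 update A <- alpha * outer(x, y) + A.
        A += alpha * numpy.outer(x, y)
        return A

    x = numpy.random.rand(8).astype('float32')
    y = numpy.random.rand(6).astype('float32')
    A = numpy.zeros((8, 6), dtype='float32')  # GpuOuter starts from a zeroed output

    sger_like(1.0, x, y, A)
    assert numpy.allclose(A, numpy.outer(x, y))

The copy-on-negative-stride preamble in the generated C exists because
CudaNdarray_sger (next file) only accepts non-negative strides on both
vectors.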
theano/sandbox/cuda/cuda_ndarray.cu
...
@@ -2717,6 +2717,48 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
    if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
    if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
    if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }

    if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
        || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
        PyErr_Format(PyExc_ValueError,
                     "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
                     CudaNdarray_HOST_DIMS(x)[0],
                     CudaNdarray_HOST_DIMS(y)[0],
                     CudaNdarray_HOST_DIMS(A)[0],
                     CudaNdarray_HOST_DIMS(A)[1]);
        return -1;
    }

    // Maybe this could work, but be safe for now
    if (!CudaNdarray_is_c_contiguous(A)) {
        PyErr_SetString(PyExc_NotImplementedError, "non-c contiguous A in sger");
        return -1;
    }
    // Same for this, be safe
    assert(CudaNdarray_HOST_STRIDES(x)[0] >= 0);
    assert(CudaNdarray_HOST_STRIDES(y)[0] >= 0);

    // Since Sger expects A in col-major, we invert x and y to fake this.
    cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
               CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0],
               CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0],
               CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
    CNDA_THREAD_SYNC;
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)", cudaGetErrorString(err));
        return -1;
    }
    return 0;
}
/**
 *
 * Precondition:
...
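The argument swap in the cublasSger call is the usual row-major/column-major
trick: a C-contiguous m-by-n buffer, reinterpreted in Fortran (column-major)
order, is the n-by-m transpose, and outer(y, x) is the transpose of
outer(x, y). A NumPy sketch of the equivalence being exploited (illustrative
only):

    import numpy

    x = numpy.random.rand(5).astype('float32')
    y = numpy.random.rand(3).astype('float32')

    # What cublasSger writes: a column-major (len(y), len(x)) matrix outer(y, x)...
    col_major = numpy.asfortranarray(numpy.outer(y, x))

    # ...whose buffer, read back in row-major order, is exactly outer(x, y).
    row_major_view = col_major.T  # no copy, just swapped strides
    assert row_major_view.flags['C_CONTIGUOUS']
    assert numpy.allclose(row_major_view, numpy.outer(x, y))

The leading dimension passed for A, CudaNdarray_HOST_DIMS(A)[1], is the row
length of the C-contiguous output, which is exactly the column stride of the
faked column-major matrix.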
theano/sandbox/cuda/cuda_ndarray.cuh
...
@@ -478,13 +478,16 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self);

PyObject * CudaNdarray_ZEROS(int n, int * dims);

/**
 * True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
 */
bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
...
theano/sandbox/cuda/opt.py
...
@@ -14,7 +14,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
-       gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv)
+       gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
        GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import (
...
@@ -378,6 +378,29 @@ def local_gpu_gemm(node):
            return [host_from_gpu(gemms[node.op](gpu_from_host(z), a,
                                                 gpu_from_host(x),
                                                 gpu_from_host(y), b))]
    return False
@register_opt()
@local_optimizer([])
def local_gpu_outer(node):
    """
    gpu_from_host(outer) -> gpu_outer(gpu_from_host)
    outer(host_from_gpu) -> host_from_gpu(gpu_outer)
    """
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.basic.outer:
            x, y = host_input.owner.inputs
            # gpu_outer will refuse to work with float64 so future-proof
            if x.type.dtype == 'float32' and y.type.dtype == 'float32':
                return [gpu_outer(gpu_from_host(x), gpu_from_host(y))]
    if node.op == tensor.basic.outer:
        x, y = node.inputs
        x_on_gpu = (x.owner and x.owner.op == host_from_gpu
                    and x.type.dtype == 'float32')
        y_on_gpu = (y.owner and y.owner.op == host_from_gpu
                    and y.type.dtype == 'float32')
        if x_on_gpu or y_on_gpu:
            return [host_from_gpu(gpu_outer(as_cuda_ndarray_variable(x),
                                            as_cuda_ndarray_variable(y)))]
    return False
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
...
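A quick way to confirm the rewrite fires is to compile under a GPU-enabled
mode and scan the optimized graph for the new op. A sketch against the
2011-era Theano API (the mode construction, tcn.shared_constructor, and the
f.maker.env access are assumptions based on the test suite of the time, not
part of this commit):

    import numpy
    import theano
    import theano.tensor as tensor
    import theano.sandbox.cuda as tcn
    from theano.sandbox.cuda.blas import GpuOuter

    # Assumed way to get a mode with the GPU optimizations enabled.
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')

    x = tcn.shared_constructor(numpy.random.rand(8).astype('float32'), 'x')
    y = tcn.shared_constructor(numpy.random.rand(6).astype('float32'), 'y')

    f = theano.function([], tensor.outer(x, y), mode=mode_with_gpu)

    # local_gpu_outer should have replaced the host outer with GpuOuter.
    assert any(isinstance(n.op, GpuOuter) for n in f.maker.env.toposort())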
theano/sandbox/cuda/tests/test_blas.py
...
@@ -117,6 +117,31 @@ def test_gemm_no_inplace():
    assert numpy.allclose(numpy.dot(a0, bval) + cval, a.get_value())
    assert numpy.allclose(numpy.dot(a0, bval2) + cval, rval)
def test_outer():
    x = tcn.shared_constructor(my_rand(8,), 'x')
    y = tcn.shared_constructor(my_rand(6,), 'y')

    x_val = x.get_value().copy()
    y_val = y.get_value().copy()

    f = pfunc([], tensor.outer(x, y), mode=mode_with_gpu)
    assert numpy.allclose(numpy.outer(x_val, y_val), f())
    f = pfunc([], tensor.outer(x[::2], y), mode=mode_with_gpu)
    assert numpy.allclose(numpy.outer(x_val[::2], y_val), f())
    f = pfunc([], tensor.outer(x, y[::3]), mode=mode_with_gpu)
    assert numpy.allclose(numpy.outer(x_val, y_val[::3]), f())
    f = pfunc([], tensor.outer(x[::2], y[::3]), mode=mode_with_gpu)
    assert numpy.allclose(numpy.outer(x_val[::2], y_val[::3]), f())
    f = pfunc([], tensor.outer(x[::-1], y), mode=mode_with_gpu)
    assert numpy.allclose(numpy.outer(x_val[::-1], y_val), f())
    f = pfunc([], tensor.outer(x, y[::-1]), mode=mode_with_gpu)
    assert numpy.allclose(numpy.outer(x_val, y_val[::-1]), f())
if 0:
    # This is commented out because it doesn't make sense...
    # tcn.blas has no op called DownsampleFactorMax
...
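The strided and reversed slices are the interesting cases here:
CudaNdarray_sger asserts non-negative strides, so the generated c_code copies
any reversed input first, while positive non-unit strides are passed straight
through as the BLAS increments. A NumPy illustration of the stride
situations the test covers (values illustrative):

    import numpy

    x = numpy.arange(8, dtype='float32')

    assert x[::2].strides == (8,)    # positive stride: handed to sger as-is
    assert x[::-1].strides == (-4,)  # negative stride: GpuOuter copies first
    assert numpy.allclose(numpy.outer(x[::-1], x), numpy.outer(x, x)[::-1])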