提交 c050562f authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Gpu implementation of tensor.basic.outer

上级 46baf3f2
......@@ -247,6 +247,81 @@ class GpuGemm(Op):
# Module-level singleton instances of GpuGemm (class defined above):
# the in-place variant may overwrite its first input, the no-inplace
# variant always allocates a fresh output.
gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuOuter(Op):
    """GPU outer product: A = x * y' for 1-d float32 CudaNdarray vectors.

    The generated C code delegates to CudaNdarray_sger (cublasSger).
    Inputs with negative strides are copied first, since sger does not
    accept them.
    """

    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
        assert (x.type.ndim == 1 and y.type.ndim == 1 and
                x.type.dtype == 'float32' and y.type.dtype == 'float32')

        bz = [x.type.broadcastable[0], y.type.broadcastable[0]]
        outputs = [CudaNdarrayType(dtype='float32', broadcastable=bz)()]
        return Apply(self, [x, y], outputs)

    def __eq__(self, other):
        # Stateless op: all instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def c_code_cache_version(self):
        # Bumped to 4: reused output buffers are now zeroed before sger.
        return (4,)

    def c_code(self, node, name, inputs, outputs, sub):
        # A = x * y'
        x, y = inputs
        A, = outputs
        fail = sub['fail']

        return """
        CudaNdarray *%(name)sx = NULL, *%(name)sy = NULL;
        int %(name)sres;

        // sger cannot handle negative strides: copy such inputs.
        if (CudaNdarray_HOST_STRIDES(%(x)s)[0] < 0) {
            %(name)sx = (CudaNdarray *)CudaNdarray_Copy(%(x)s);
            if (!%(name)sx) {
                %(fail)s;
            }
        } else {
            %(name)sx = %(x)s;
            Py_INCREF(%(name)sx);
        }

        if (CudaNdarray_HOST_STRIDES(%(y)s)[0] < 0) {
            %(name)sy = (CudaNdarray *)CudaNdarray_Copy(%(y)s);
            if (!%(name)sy) {
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        } else {
            %(name)sy = %(y)s;
            Py_INCREF(%(name)sy);
        }

        if (!(%(A)s &&
              CudaNdarray_HOST_DIMS(%(A)s)[0] == CudaNdarray_HOST_DIMS(%(x)s)[0] &&
              CudaNdarray_HOST_DIMS(%(A)s)[1] == CudaNdarray_HOST_DIMS(%(y)s)[0] &&
              CudaNdarray_is_c_contiguous(%(A)s))) {
            Py_XDECREF(%(A)s);
            int dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[0];
            %(A)s = (CudaNdarray *)CudaNdarray_ZEROS(2, dims);
            if (!%(A)s) {
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        } else {
            // BUG FIX: sger accumulates into A (A += alpha * x * y'), so a
            // reused output buffer must be cleared or stale values leak in.
            if (cudaMemset(CudaNdarray_DEV_DATA(%(A)s), 0,
                           CudaNdarray_HOST_DIMS(%(A)s)[0] *
                           CudaNdarray_HOST_DIMS(%(A)s)[1] *
                           sizeof(float)) != cudaSuccess) {
                PyErr_SetString(PyExc_RuntimeError,
                                "GpuOuter: cudaMemset on reused output failed");
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }

        %(name)sres = CudaNdarray_sger(1.0, %(name)sx, %(name)sy, %(A)s);
        Py_DECREF(%(name)sy);
        Py_DECREF(%(name)sx);
        if (%(name)sres) {
            %(fail)s;
        }
        """ % dict(x=x, y=y, A=A, fail=fail, name=name)


gpu_outer = GpuOuter()
##
# Not really a BLAS operation, but whatever.
#
......
......@@ -2717,6 +2717,48 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return 0;
}
/**
 * Rank-1 update via cublasSger: A <- alpha * x * y' + A.
 *
 * x, y: 1-d float32 CudaNdarrays with non-negative strides.
 * A:    2-d c-contiguous float32 CudaNdarray of shape (len(x), len(y)).
 *
 * Returns 0 on success, -1 with a Python exception set on error.
 * NOTE: this *accumulates* into A; callers wanting a plain outer
 * product must zero A first.
 */
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
    if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
    if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
    if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }

    if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
        || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
        PyErr_Format(PyExc_ValueError,
                     "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
                     CudaNdarray_HOST_DIMS(x)[0],
                     CudaNdarray_HOST_DIMS(y)[0],
                     CudaNdarray_HOST_DIMS(A)[0],
                     CudaNdarray_HOST_DIMS(A)[1]);
        return -1;
    }

    // Maybe this could work, but be safe for now
    if (!CudaNdarray_is_c_contiguous(A)) {
        PyErr_SetString(PyExc_NotImplementedError, "non-c contiguous A in sger");
        return -1;
    }
    // Same for negative strides.  Report a real error instead of assert()
    // so the check is not stripped out by NDEBUG builds.
    if (CudaNdarray_HOST_STRIDES(x)[0] < 0
        || CudaNdarray_HOST_STRIDES(y)[0] < 0) {
        PyErr_SetString(PyExc_NotImplementedError, "negative stride arg to sger");
        return -1;
    }

    // Sger expects A in col-major order; a row-major A is the col-major
    // transpose A', and A' = y * x', so we swap x and y to fake this.
    cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
               CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0],
               CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0],
               CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
    CNDA_THREAD_SYNC;

    cudaError_t err = cudaGetLastError();
    // BUG FIX: err is a cudaError_t, so compare against cudaSuccess, not
    // CUBLAS_STATUS_SUCCESS (a cublasStatus_t value from a different enum;
    // the old check only worked because both happen to be 0).
    if (cudaSuccess != err)
    {
        PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)", cudaGetErrorString(err));
        return -1;
    }

    return 0;
}
/**
*
* Precondition:
......
......@@ -478,13 +478,16 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
/* Convert to a host-side object; presumably a numpy array — confirm in the
 * .cu implementation. */
PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self);

/* Allocate a new n-dimensional CudaNdarray of the given dims, zero-filled. */
PyObject *
CudaNdarray_ZEROS(int n, int * dims);

/**
 * True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
 */
bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);

/* C <- alpha * A dot B + beta * C (cublas GEMM wrapper); 0 on success, -1
 * with a Python exception set on error. */
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);

/* A <- alpha * x * y' + A (cublas SGER wrapper); note this accumulates
 * into A.  0 on success, -1 with a Python exception set on error. */
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);

int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
......
......@@ -14,7 +14,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv)
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import (
......@@ -378,6 +378,29 @@ def local_gpu_gemm(node):
return [host_from_gpu(gemms[node.op](gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return False
@register_opt()
@local_optimizer([])
def local_gpu_outer(node):
    """Move tensor.outer onto the GPU.

    gpu_from_host(outer) -> gpu_outer(gpu_from_host)
    outer(host_from_gpu) -> host_from_gpu(gpu_outer)

    Returns a one-element replacement list on success, False otherwise.
    """
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.basic.outer:
            x, y = host_input.owner.inputs
            # gpu_outer will refuse to work with float64 so future-proof
            if x.type.dtype == 'float32' and y.type.dtype == 'float32':
                return [gpu_outer(gpu_from_host(x), gpu_from_host(y))]
    if node.op == tensor.basic.outer:
        x, y = node.inputs
        # BUG FIX: the original tested x.type.dtype twice (copy-paste), so a
        # float64 y could be shipped to the float32-only gpu_outer.  Require
        # both dtypes to be float32 before considering either operand's
        # location.
        if x.type.dtype == 'float32' and y.type.dtype == 'float32':
            x_on_gpu = x.owner and x.owner.op == host_from_gpu
            y_on_gpu = y.owner and y.owner.op == host_from_gpu
            if x_on_gpu or y_on_gpu:
                return [host_from_gpu(gpu_outer(as_cuda_ndarray_variable(x),
                                                as_cuda_ndarray_variable(y)))]
    return False
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
......
......@@ -117,6 +117,31 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
def test_outer():
    """Check tensor.outer on GPU shared vectors against numpy.outer,
    including strided and reversed views of either operand."""
    x = tcn.shared_constructor(my_rand(8,), 'x')
    y = tcn.shared_constructor(my_rand(6,), 'y')

    x_val = x.get_value().copy()
    y_val = y.get_value().copy()

    # (x-slice, y-slice) pairs; None means "use the variable unsliced".
    cases = (
        (None, None),
        (slice(None, None, 2), None),
        (None, slice(None, None, 3)),
        (slice(None, None, 2), slice(None, None, 3)),
        (slice(None, None, -1), None),
        (None, slice(None, None, -1)),
    )
    for sx, sy in cases:
        xs = x if sx is None else x[sx]
        ys = y if sy is None else y[sy]
        f = pfunc([], tensor.outer(xs, ys), mode=mode_with_gpu)

        expected_x = x_val if sx is None else x_val[sx]
        expected_y = y_val if sy is None else y_val[sy]
        assert numpy.allclose(numpy.outer(expected_x, expected_y), f())
if 0:
# This is commented out because it doesn't make sense...
# tcn.blas has no op called DownsampleFactorMax
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论