提交 f85856b2 authored 作者: James Bergstra's avatar James Bergstra 提交者: Frederic

added GpuGemv and GpuGer

上级 1b01d9b9
...@@ -251,6 +251,188 @@ class GpuGemm(Op): ...@@ -251,6 +251,188 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False) gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True) gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuGemv(Op):
    """
    implement gemv on the gpu.

    Computes z_out = alpha * dot(A, x) + beta * z_in on the GPU by
    emitting C code that calls CudaNdarray_sgemv.  With inplace=True the
    output reuses the storage of the first input (declared via
    ``destroy_map``); otherwise the input vector is copied first.
    """
    def __init__(self, inplace):
        # All state is set through __setstate__ so that construction and
        # unpickling share one code path.
        self.__setstate__({'inplace':inplace})
    def __str__(self):
        if self.inplace:
            return 'GpuGemv{inplace}'
        else:
            return 'GpuGemv{no_inplace}'
    def __eq__(self, other):
        # Two GpuGemv instances are interchangeable iff they agree on
        # inplace-ness.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
    def __setstate__(self, dct):
        # NOTE(review): defaults to inplace=True for pickles that lack the
        # 'inplace' key.  destroy_map tells Theano that output 0 overwrites
        # input 0 (z).
        inplace = dct.get('inplace', True)
        if inplace:
            self.destroy_map = {0: [0]}
        self.inplace = inplace
    def __getstate__(self):
        return dict(inplace=self.inplace)
    def make_node(self, z, a, x, y, b):
        # the more complicated error checking performed by tensor.gemv is
        # assumed to already have been done
        return Apply(self, [z, a, x, y, b], [z.type()])
    def c_code_cache_version(self):
        # Empty tuple: disables caching of the generated C code.
        return ()
    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * dot(x, y) + beta * z_in
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y, b = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # NOTE: the templates below are expanded with ``% locals()`` at the
        # end, so the %(...)s keys must match the local names above.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input vector z_in.
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Allocate (or reuse, if the shape already matches) an output
            # buffer and copy z_in into it.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 1)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        # The actual BLAS call; accumulates into z_out in both variants.
        print >> sio, """
        if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()
# Module-level singleton instances of GpuGemv (non-destructive and in-place).
gpu_gemv_no_inplace = GpuGemv(inplace=False)
gpu_gemv_inplace = GpuGemv(inplace=True)
class GpuGer(Op):
    """
    implement ger on the gpu.

    Computes the rank-1 update z_out = z_in + alpha * outer(x, y) on the
    GPU by emitting C code that calls CudaNdarray_sger.  With inplace=True
    the output reuses the storage of the first input (declared via
    ``destroy_map``); otherwise the input matrix is copied first.
    """
    def __init__(self, inplace):
        # All state is set through __setstate__ so that construction and
        # unpickling share one code path.
        self.__setstate__({'inplace':inplace})
    def __str__(self):
        if self.inplace:
            return 'GpuGer{inplace}'
        else:
            return 'GpuGer{no_inplace}'
    def __eq__(self, other):
        # Two GpuGer instances are interchangeable iff they agree on
        # inplace-ness.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
    def __setstate__(self, dct):
        # NOTE(review): defaults to inplace=True for pickles that lack the
        # 'inplace' key.  destroy_map tells Theano that output 0 overwrites
        # input 0 (z).
        inplace = dct.get('inplace', True)
        if inplace:
            self.destroy_map = {0: [0]}
        self.inplace = inplace
    def __getstate__(self):
        return dict(inplace=self.inplace)
    def make_node(self, z, a, x, y):
        # the more complicated error checking performed by tensor.ger is
        # assumed to already have been done
        return Apply(self, [z, a, x, y], [z.type()])
    def c_code_cache_version(self):
        # Empty tuple: disables caching of the generated C code.
        return ()
    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = z_in + alpha * outer(x, y)   (ger has no beta scaling)
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # NOTE: the templates below are expanded with ``% locals()`` at the
        # end, so the %(...)s keys must match the local names above.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input matrix z_in.
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Allocate (or reuse, if the shape already matches) an output
            # buffer and copy z_in into it.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 2)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        # The actual BLAS call; accumulates into z_out in both variants.
        print >> sio, """
        if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()
# Module-level singleton instances of GpuGer (non-destructive and in-place).
gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(Op): class GpuOuter(Op):
def make_node(self, x, y): def make_node(self, x, y):
# we suppose type checking has been done, but make sure. # we suppose type checking has been done, but make sure.
......
...@@ -2892,6 +2892,86 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -2892,6 +2892,86 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return 0; return 0;
} }
int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
// We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
|| (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
{
PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1],
CudaNdarray_HOST_DIMS(B)[0],
CudaNdarray_HOST_DIMS(C)[0]);
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg"); return -1; }
// I don't know if cudablas handles negative strides
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(C)[0]);
return -1;
}
/* create appropriate strides for malformed matrices that are row or column
* vectors
*/
int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1];
int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1];
if (sa_0 == 1)
{
cublasSgemv('N',
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha,
CudaNdarray_DEV_DATA(A), sa_1,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else if (sa_1 == 1)
{
cublasSgemv('T',
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
alpha,
CudaNdarray_DEV_DATA(A), sa_0,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else
{
PyErr_SetString(PyExc_NotImplementedError, "too many strides strides in sgemv");
return -1;
}
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublassGemv failed (%s)",cudaGetErrorString(err));
return -1;
}
return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) { int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; } if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; } if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
......
...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self); ...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self); DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A); DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
......
...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import * ...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv) gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
...@@ -375,47 +379,82 @@ def local_gpu_dot22scalar(node): ...@@ -375,47 +379,82 @@ def local_gpu_dot22scalar(node):
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemv_as_gemm(node): def local_gpu_gemv(node):
""" """
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host) gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
gemm(host_from_gpu) -> host_from_gpu(gpu_gemv) gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
This optimization solves the vector-matrix multiplication issue by
transforming the vector into a matrix, apply gpudot22 and reshaping
the output.
A more suitable solution would be to use the right cublas call
""" """
gemvs = {tensor.blas.gemv_inplace: gpu_gemm_inplace, gemvs = {tensor.blas.gemv_inplace: gpu_gemv_inplace,
tensor.blas.gemv_no_inplace: gpu_gemm_no_inplace} tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=True): gpu_gemv_inplace,
tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
}
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemvs: if host_input.owner and host_input.owner.op in gemvs:
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [ return [gemvs[op](
GpuDimShuffle((False,True),[0])(gemvs[op]( gpu_from_host(z)
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z))
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b))] , b)]
if node.op in gemvs: if node.op in gemvs:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(GpuDimShuffle((False,True),[0])( return [host_from_gpu(
gemvs[node.op]( gemvs[node.op](
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z)) gpu_from_host(z)
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b)))] , b))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_ger(node):
    """
    gpu_from_host(ger) -> gpu_ger(gpu_from_host)

    ger(host_from_gpu) -> host_from_gpu(gpu_ger)
    """
    # Map the CPU CGer ops to their GPU counterparts, preserving
    # destructiveness (in-place vs. not).
    gers = {
        tensor.blas_c.CGer(destructive=True): gpu_ger_inplace,
        tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace,
        }
    if node.op == gpu_from_host:
        # Case 1: a CPU ger result is being transferred to the GPU --
        # compute it on the GPU instead and transfer only the inputs.
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op in gers:
            op = host_input.owner.op
            z, a, x, y = host_input.owner.inputs
            # a (alpha) is a scalar and stays on the host.
            return [gers[op](
                gpu_from_host(z)
                , a
                , gpu_from_host(x)
                , gpu_from_host(y)
                )]
    if node.op in gers:
        # Case 2: a CPU ger consumes at least one value that already lives
        # on the GPU -- move the whole computation to the GPU.
        z, a, x, y = node.inputs
        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(
                gers[node.op](
                    gpu_from_host(z)
                    , a
                    , gpu_from_host(x)
                    , gpu_from_host(y)
                    ))]
    # No rewrite applies.
    return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemm(node): def local_gpu_gemm(node):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论