Commit 85159185 authored by Xavier Bouthillier

Merge pull request #2893 from sebastien-j/batched_gemm

GPU batched gemm
...@@ -14,6 +14,192 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, ...@@ -14,6 +14,192 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
class BatchedDotOp(GpuOp):
    """GPU batched matrix multiplication.

    Given two float32 3-tensors ``inp1`` of shape (batch, a, b) and
    ``inp2`` of shape (batch, b, c), computes a (batch, a, c) output
    where ``out[i] = dot(inp1[i], inp2[i])`` for every batch index,
    performed with a single ``cublasSgemmBatched`` call.
    """

    __props__ = ()

    def make_node(self, inp1, inp2):
        """Build the Apply node.

        Both inputs must be float32 3-tensors; they are forced to be
        c-contiguous on the GPU because the generated C code addresses
        the per-batch matrices with plain pointer strides.
        """
        inp1 = gpu_contiguous(as_cuda_ndarray_variable(inp1))
        inp2 = gpu_contiguous(as_cuda_ndarray_variable(inp2))

        assert inp1.dtype == "float32"
        assert inp2.dtype == "float32"
        assert inp1.ndim == 3  # (batch, a, b)
        assert inp2.ndim == 3

        return theano.Apply(self, [inp1, inp2],
                            [self.output_type(inp1, inp2)()])

    def output_type(self, inp1, inp2):
        # Output shape is (batch, a, c).  The batch dimension of the
        # output is broadcastable when it is broadcastable in either
        # input.
        return CudaNdarrayType(
            (inp1.type.broadcastable[0] or inp2.type.broadcastable[0],
             inp1.type.broadcastable[1], inp2.type.broadcastable[2]))

    def c_code(self, node, name, input_names, output_names, sub):
        bx, by = input_names
        bz, = output_names
        fail = sub['fail']
        return """
        float alpha = 1.0;
        float beta = 0.0;
        int i, x_dim0, x_dim1, x_dim2, y_dim0, y_dim1, y_dim2;
        int x_stride, y_stride, z_stride, total_size;
        /* One pointer per batch entry, for x, y and z each. */
        int ptr_array_size = 3 * CudaNdarray_HOST_DIMS(%(bx)s)[0] * sizeof(float *);
        int out_dim[3];
        cublasStatus_t err;
        cudaError_t err1;
        float **host_x = NULL;
        float **host_z = NULL;
        float **host_y = NULL;
        float **gpu_x = NULL;
        float **gpu_y = NULL;
        float **gpu_z = NULL;

        x_dim0 = CudaNdarray_HOST_DIMS(%(bx)s)[0];
        x_dim1 = CudaNdarray_HOST_DIMS(%(bx)s)[1];
        x_dim2 = CudaNdarray_HOST_DIMS(%(bx)s)[2];

        y_dim0 = CudaNdarray_HOST_DIMS(%(by)s)[0];
        y_dim1 = CudaNdarray_HOST_DIMS(%(by)s)[1];
        y_dim2 = CudaNdarray_HOST_DIMS(%(by)s)[2];

        if (x_dim0 != y_dim0)
        {
            /* Report the two *batch* sizes (x_dim0 and y_dim0); the
               original code mistakenly printed x_dim1 here. */
            PyErr_Format(PyExc_RuntimeError,
                         "The batchsizes (%%d, %%d) don't match.\\n",
                         x_dim0, y_dim0);
            %(fail)s;
        }

        if (x_dim2 != y_dim1)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "Shape mismatch. (%%d, %%d, %%d) (%%d, %%d, %%d)\\n",
                         x_dim0, x_dim1, x_dim2, y_dim0, y_dim1, y_dim2);
            %(fail)s;
        }

        out_dim[0] = x_dim0;
        out_dim[1] = x_dim1;
        out_dim[2] = y_dim2;

        /* Reuse the output buffer only when it already has the right
           shape and layout; otherwise allocate a fresh one. */
        if ( !(%(bz)s
               && %(bz)s->nd==3
               && CudaNdarray_is_c_contiguous(%(bz)s)
               && CudaNdarray_HOST_DIMS(%(bz)s)[0]==out_dim[0]
               && CudaNdarray_HOST_DIMS(%(bz)s)[1]==out_dim[1]
               && CudaNdarray_HOST_DIMS(%(bz)s)[2]==out_dim[2]))
        {
            Py_XDECREF(%(bz)s);
            %(bz)s = (CudaNdarray*)CudaNdarray_NewDims(3,out_dim);

            if (NULL == %(bz)s)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Failed to allocate output of %%d x %%d x %%d",
                             out_dim[0], out_dim[1], out_dim[2]);
                %(fail)s;
            }
        }

        if (x_dim0 != 0 && y_dim0 != 0 &&
            x_dim1 != 0 && y_dim1 != 0 &&
            x_dim2 != 0 && y_dim2 != 0)
        {
            x_stride = CudaNdarray_HOST_STRIDES(%(bx)s)[0];
            y_stride = CudaNdarray_HOST_STRIDES(%(by)s)[0];
            z_stride = CudaNdarray_HOST_STRIDES(%(bz)s)[0];

            /* Build the per-batch pointer arrays on the host in one
               allocation, then copy them to the device in one memcpy. */
            host_x = (float **) malloc (ptr_array_size);
            if (host_x == NULL)
            {
                CLEANUP();
                PyErr_Format(PyExc_RuntimeError,
                             "%%s", "malloc failure");
                %(fail)s;
            }
            host_y = &host_x[x_dim0];
            host_z = &host_y[x_dim0];

            host_x[0] = CudaNdarray_DEV_DATA(%(bx)s);
            host_y[0] = CudaNdarray_DEV_DATA(%(by)s);
            host_z[0] = CudaNdarray_DEV_DATA(%(bz)s);

            for (i = 1; i < out_dim[0]; i++)
            {
                host_x[i] = host_x[i - 1] + x_stride;
                host_y[i] = host_y[i - 1] + y_stride;
                host_z[i] = host_z[i - 1] + z_stride;
            }

            err1 = cudaMalloc((void **)&gpu_x, ptr_array_size);
            if (err1 != cudaSuccess)
            {
                CLEANUP();
                PyErr_Format(PyExc_RuntimeError,
                             "%%s", "cudaMalloc failure");
                %(fail)s;
            }
            gpu_y = &gpu_x[x_dim0];
            gpu_z = &gpu_y[x_dim0];

            err1 = cudaMemcpy(gpu_x, host_x, ptr_array_size, cudaMemcpyHostToDevice);
            if (err1 != cudaSuccess)
            {
                CLEANUP();
                PyErr_Format(PyExc_RuntimeError,
                             "%%s", "cudaMemcpy failure");
                %(fail)s;
            }

            /* cuBLAS is column-major; swapping the operands computes
               z^T = y^T x^T per batch, i.e. z = x y in row-major. */
            err = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                     y_dim2, x_dim1, x_dim2, &alpha,
                                     (const float **) gpu_y, y_dim2,
                                     (const float **) gpu_x, x_dim2, &beta,
                                     gpu_z, y_dim2, x_dim0);

            CLEANUP();

            if (CUBLAS_STATUS_SUCCESS != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "cublasSgemmBatched failed (%%i) %%s",
                             err, cublasGetErrorString(err));
                %(fail)s;
            }
        }
        else
        {
            /* Any dimension of size 0: the mathematical result is all
               zeros (or empty); skip the gemm and just zero-fill. */
            total_size = x_dim0 * x_dim1 * y_dim2 * sizeof(float);
            if (cudaSuccess != cudaMemset(CudaNdarray_DEV_DATA(%(bz)s), 0, total_size))
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Failed to fill output with zeros");
                %(fail)s;
            }
        }
        """ % locals()

    def c_support_code(self):
        # CLEANUP releases the host and device pointer arrays; it is
        # safe to call with either pointer still NULL.
        return """
        #define CLEANUP() \
        do \
        { \
            if (host_x) free (host_x); \
            if (gpu_x) cudaFree(gpu_x); \
        } while (0)
        """


batched_dot = BatchedDotOp()
class GpuDot22(GpuOp): class GpuDot22(GpuOp):
""" """
......
...@@ -23,7 +23,7 @@ import theano.compile.mode ...@@ -23,7 +23,7 @@ import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
from theano.sandbox.cuda.blas import batched_dot
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -43,6 +43,68 @@ mode_without_gpu.check_py_code = False ...@@ -43,6 +43,68 @@ mode_without_gpu.check_py_code = False
def my_rand(*shape): def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape), dtype='float32') return theano._asarray(numpy.random.rand(*shape), dtype='float32')
class TestBatchedDot(TestCase):
    """Tests for the GPU batched_dot op."""

    def test_batched_dot_correctness(self):
        def check(shape_a, shape_b):
            a = numpy.random.randn(*shape_a).astype(numpy.float32)
            b = numpy.random.randn(*shape_b).astype(numpy.float32)

            x = tensor.ftensor3()
            y = tensor.ftensor3()
            f = theano.function([x, y], batched_dot(x, y),
                                mode=mode_with_gpu)

            # Run once with host ndarrays and once with CudaNdarrays.
            out_host = numpy.asarray(f(a, b))
            dev_a = cuda_ndarray.CudaNdarray(a)
            dev_b = cuda_ndarray.CudaNdarray(b)
            out_dev = numpy.asarray(f(dev_a, dev_b))

            # Reference: per-batch matrix product computed with numpy
            # broadcasting.
            expected = numpy.sum(a[:, :, :, None] * b[:, None, :, :],
                                 axis=-2)

            assert numpy.allclose(out_host, expected)
            assert numpy.allclose(out_dev, expected)

        check((5, 4, 3), (5, 3, 2))
        check((5, 3, 3), (5, 3, 3))
        check((5, 2, 6), (5, 6, 3))

        # Test dimensions of 0
        check((0, 2, 6), (0, 6, 3))
        check((5, 0, 3), (5, 3, 2))
        check((5, 4, 0), (5, 0, 2))
        check((5, 4, 3), (5, 3, 0))
        check((0, 0, 0), (0, 0, 0))

        # Test dimensions of 1
        check((1, 2, 6), (1, 6, 3))
        check((5, 1, 3), (5, 3, 2))
        check((5, 4, 1), (5, 1, 2))
        check((5, 4, 3), (5, 3, 1))

    def test_batched_dot_errors(self):
        def should_fail(shape_a, shape_b):
            a = numpy.random.randn(*shape_a).astype(numpy.float32)
            b = numpy.random.randn(*shape_b).astype(numpy.float32)
            x = tensor.ftensor3()
            y = tensor.ftensor3()
            f = theano.function([x, y], batched_dot(x, y),
                                mode=mode_with_gpu)
            f(a, b)

        # Different batch size
        self.assertRaises(RuntimeError, should_fail, (5, 4, 3), (6, 3, 2))

        # Shape mismatch
        self.assertRaises(RuntimeError, should_fail, (5, 4, 3), (5, 2, 2))
def test_dot22(): def test_dot22():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Finish editing this comment first!
Register or sign in to comment