Commit 452595b6 authored by Tim Cooijmans

GpuBatchedDot: configure stream threshold through constructor argument

Parent 9ded03f3
@@ -18,6 +18,9 @@ from theano.tensor import as_tensor_variable
 class GpuBatchedDot(GpuOp):
     __props__ = ()

+    def __init__(self, stream_threshold=128):
+        self.stream_threshold = stream_threshold
+
     def make_node(self, inp1, inp2):
         inp1 = gpu_contiguous(as_cuda_ndarray_variable(inp1))
         inp2 = gpu_contiguous(as_cuda_ndarray_variable(inp2))
@@ -39,6 +42,7 @@ class GpuBatchedDot(GpuOp):
         bx, by = input_names
         bz, = output_names
         fail = sub['fail']
+        threshold = self.stream_threshold
         return ("""
         float alpha = 1.0;
         float beta = 0.0;
@@ -59,7 +63,7 @@ class GpuBatchedDot(GpuOp):
             y_dim2 = CudaNdarray_HOST_DIMS(%(by)s)[2];

             // use parallel cublasSgemm calls rather than cublasSgemmBatched for large products
-            bool use_cublas_sgemm_batched = x_dim1 * x_dim2 * y_dim2 < 128 * 128 * 128;
+            bool use_cublas_sgemm_batched = x_dim1 * x_dim2 * y_dim2 < %(threshold)s * %(threshold)s * %(threshold)s;

             if (x_dim0 != y_dim0)
             {
...
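The change lifts the hard-coded 128 cutoff into a constructor argument: when the per-sample product size x_dim1 * x_dim2 * y_dim2 reaches stream_threshold cubed, the op switches from a single cublasSgemmBatched call to parallel per-sample cublasSgemm calls on separate streams. A minimal usage sketch follows, assuming GpuBatchedDot is imported from the module in this diff (the exact import path depends on your checkout) and that Theano's old CUDA backend is active:

    import theano
    import theano.tensor as T
    # Hypothetical import; adjust to wherever this module lives in your tree.
    # from gpu_batched_dot import GpuBatchedDot

    # Raise the cutoff so products up to 256^3 elements still go through a
    # single cublasSgemmBatched call instead of per-sample streamed sgemms.
    batched_dot = GpuBatchedDot(stream_threshold=256)

    x = T.tensor3("x")  # shape (batch, n, k)
    y = T.tensor3("y")  # shape (batch, k, m)
    z = batched_dot(x, y)  # shape (batch, n, m)

    f = theano.function([x, y], z)

Note that __props__ stays empty in this commit, so two instances with different thresholds still compare equal under Theano's op-merging; whether that matters depends on how the op is instantiated in practice.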