Commit 8289fc6b, authored by Thomas George

now using a cuda kernel through GpuKernel for triu/tril

Parent: 9d8c72f8
...@@ -5,8 +5,12 @@ import theano ...@@ -5,8 +5,12 @@ import theano
import warnings import warnings
from theano import Op from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType from theano.gpuarray import basic_ops, GpuArrayType
from pygpu.gpuarray import GpuKernel, GpuArray
from string import Template
import numpy as np import numpy as np
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
...@@ -19,7 +23,6 @@ cusolver_available = False ...@@ -19,7 +23,6 @@ cusolver_available = False
try: try:
import skcuda import skcuda
from skcuda import cusolver from skcuda import cusolver
from skcuda import linalg
cusolver_available = True cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound): except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass pass
...@@ -53,13 +56,51 @@ if cusolver_available: ...@@ -53,13 +56,51 @@ if cusolver_available:
cusolver.cusolverCheckStatus(status) cusolver.cusolverCheckStatus(status)
def attach_handle_to_context(ctx): def attach_cusolver_handle_to_context(ctx):
handle = getattr(ctx, 'cusolver_handle', None) handle = getattr(ctx, 'cusolver_handle', None)
if handle is None: if handle is None:
with ctx: with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate() ctx.cusolver_handle = cusolver.cusolverDnCreate()
def _zero_triangle(A, ctx, kernel_name, cmp_op):
    """Build and launch a GPU kernel that zeroes one strict triangle of *A*
    in place.

    Parameters
    ----------
    A : pygpu GpuArray
        float32 matrix. Assumed C-contiguous (row-major): the kernel maps the
        linear index to row ``ix = idx // ncols`` and column
        ``iy = idx % ncols``.  NOTE(review): the GpuCholesky caller appears to
        pick tril/triu based on contiguity flags — confirm every caller
        guarantees row-major layout.
    ctx : gpuarray context
        Context the kernel is compiled in (must be A's context).
    kernel_name : str
        Name of the generated kernel function ('tril' or 'triu').
    cmp_op : str
        C comparison operator substituted into the kernel: '<' zeroes
        elements above the diagonal, '>' zeroes elements below it.

    Returns
    -------
    The result of the GpuKernel call; A is modified in place.
    """
    tmpl = Template("""
            KERNEL void ${name}(GLOBAL_MEM ga_float *a, ga_uint N) {
                unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+
                                   blockIdx.x*blockDim.x+threadIdx.x;
                unsigned int ix = idx/${cols};
                unsigned int iy = idx%${cols};
                if (idx < N) {
                    if (ix ${op} iy)
                        a[idx] = 0.0;
                }
            }
            """)
    # BUGFIX: the row/col decomposition needs the number of *columns*
    # (A.shape[1]); the original used A.shape[0], which is only correct for
    # square matrices.
    src = tmpl.substitute(name=kernel_name, cols=A.shape[1], op=cmp_op)
    spec = [GpuArray, 'uint32']
    k = GpuKernel(src, kernel_name, spec, context=ctx)
    n_elems = A.shape[0] * A.shape[1]
    # BUGFIX: launch one work item per element.  The original passed
    # n=A.shape[0], so only the first shape[0] threads ran and most of the
    # triangle was left untouched (the kernel itself guards idx < N with
    # N = total element count).
    return k(A, n_elems, n=n_elems)


def tril(A, ctx):
    """Zero the strictly upper triangle of A in place (keep the lower part)."""
    return _zero_triangle(A, ctx, "tril", "<")


def triu(A, ctx):
    """Zero the strictly lower triangle of A in place (keep the upper part)."""
    return _zero_triangle(A, ctx, "triu", ">")
class GpuCusolverSolve(Op): class GpuCusolverSolve(Op):
""" """
CUSOLVER GPU solver OP. CUSOLVER GPU solver OP.
...@@ -109,7 +150,7 @@ class GpuCusolverSolve(Op): ...@@ -109,7 +150,7 @@ class GpuCusolverSolve(Op):
def prepare_node(self, node, storage_map, compute_map, impl): def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context ctx = node.inputs[0].type.context
attach_handle_to_context(ctx) attach_cusolver_handle_to_context(ctx)
def check_dev_info(self, dev_info): def check_dev_info(self, dev_info):
val = np.asarray(dev_info)[0] val = np.asarray(dev_info)[0]
...@@ -268,7 +309,7 @@ class GpuCholesky(Op): ...@@ -268,7 +309,7 @@ class GpuCholesky(Op):
def prepare_node(self, node, storage_map, compute_map, impl): def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context ctx = node.inputs[0].type.context
attach_handle_to_context(ctx) attach_cusolver_handle_to_context(ctx)
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
context = inputs[0][0].context context = inputs[0][0].context
...@@ -306,32 +347,29 @@ class GpuCholesky(Op): ...@@ -306,32 +347,29 @@ class GpuCholesky(Op):
workspace_size = cusolver.cusolverDnSpotrf_bufferSize( workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
context.cusolver_handle, l_parameter, n, L_ptr, lda) context.cusolver_handle, l_parameter, n, L_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype='float32',
context=context) context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
with context:
cusolver.cusolverDnSpotrf( cusolver.cusolverDnSpotrf(
context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr, context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr) workspace_size, dev_info_ptr)
val_dev_info = np.asarray(dev_info)[0]
if val_dev_info > 0:
raise LinAlgError('Cholesky decomposition failed (is A SPD?)')
# cusolver leaves the elements in the matrix outside the considered # cusolver leaves the elements in the matrix outside the considered
# upper or lower triangle unchanged, so we need to put zeros outside # upper or lower triangle unchanged, so we need to put zeros outside
# the triangle # the triangle
""" # Note : we should probably check for c or f order in triu instead of here
with context: if self.lower and L.flags['C_CONTIGUOUS'] or (not self.lower and L.flags['F_CONTIGUOUS']):
if self.lower: tril(L, context)
linalg.tril(L, overwrite=True, handle=context.cudnn_handle)
else:
linalg.triu(L, overwrite=True, handle=context.cudnn_handle)
"""
if self.lower:
L.write(numpy.tril(L))
else: else:
L.write(numpy.triu(L)) triu(L, context)
outputs[0][0] = L outputs[0][0] = L
...@@ -196,11 +196,7 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -196,11 +196,7 @@ class TestGpuCholesky(unittest.TestCase):
A_val = M_val.dot(M_val.T) A_val = M_val.dot(M_val.T)
self.compare_gpu_cholesky_to_numpy(A_val, lower=False, inplace=True) self.compare_gpu_cholesky_to_numpy(A_val, lower=False, inplace=True)
class nothing:
def test_invalid_input_fail_non_symmetric(self): def test_invalid_input_fail_non_symmetric(self):
pass
""" Invalid Cholesky input test with non-symmetric input. """ Invalid Cholesky input test with non-symmetric input.
(Non-symmetric real input must also be non-positive definite). """ (Non-symmetric real input must also be non-positive definite). """
A_val = numpy.random.normal(size=(3, 3)).astype("float32") A_val = numpy.random.normal(size=(3, 3)).astype("float32")
...@@ -208,14 +204,13 @@ class nothing: ...@@ -208,14 +204,13 @@ class nothing:
# not being the case even with finite precision should be negligible # not being the case even with finite precision should be negligible
assert not numpy.allclose(A_val, A_val.T) assert not numpy.allclose(A_val, A_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(cula.cula.culaError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
def test_invalid_input_fail_negative_definite(self): def test_invalid_input_fail_negative_definite(self):
pass
""" Invalid Cholesky input test with negative-definite input. """ """ Invalid Cholesky input test with negative-definite input. """
M_val = numpy.random.normal(size=(3, 3)).astype("float32") M_val = numpy.random.normal(size=(3, 3)).astype("float32")
# A = -M.dot(M) will be negative definite for all non-singular M # A = -M.dot(M) will be negative definite for all non-singular M
A_val = -M_val.dot(M_val.T) A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(cula.cula.culaError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论