提交 aca733c8 authored 作者: Pascal Lamblin 提交者: GitHub

Merge pull request #5774 from nouiz/less_gpuelemwise

Don't move scalar float* elemwise unless the result is needed on the GPU.
......@@ -50,9 +50,8 @@ class GpuGemv(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
assert alpha.ndim == 0
assert beta.ndim == 0
......@@ -60,6 +59,13 @@ class GpuGemv(BlasOp):
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype
# float16 not supported
expected = A.dtype
assert theano.scalar.upcast(alpha.dtype,
beta.dtype, expected) == expected
alpha = alpha.astype(expected)
beta = beta.astype(expected)
return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage):
......@@ -163,15 +169,30 @@ class GpuGemm(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
if not (A.dtype == B.dtype == C.dtype):
raise TypeError(theano.tensor.blas.Gemm.E_mixed,
(A.dtype, B.dtype, C.dtype,
alpha.dtype, beta.dtype))
if not A.dtype.startswith('float'):
raise TypeError(theano.tensor.blas.Gemm.E_float, (A.dtype))
if A.dtype == 'float16':
expected = 'float32'
else:
expected = A.dtype
assert theano.scalar.upcast(alpha.dtype,
beta.dtype, expected) == expected
alpha = alpha.astype(expected)
beta = beta.astype(expected)
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
assert B.ndim == 2
assert C.ndim == 2
assert A.dtype == B.dtype == C.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs):
......@@ -244,13 +265,17 @@ class GpuGer(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
alpha = as_tensor_variable(alpha)
if not(A.dtype == x.dtype == y.dtype):
raise TypeError('ger requires matching dtypes',
(A.dtype, alpha.dtype, x.dtype, y.dtype))
assert theano.scalar.upcast(alpha.dtype, A.dtype) == A.dtype
alpha = alpha.astype(A.dtype)
assert alpha.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype
return Apply(self, [A, alpha, x, y], [A.type()])
def perform(self, node, inp, out):
......@@ -383,15 +408,14 @@ class GpuGemmBatch(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 3
assert B.ndim == 3
assert C.ndim == 3
assert A.dtype == B.dtype == C.dtype
assert A.dtype == B.dtype == C.dtype == alpha.dtype == beta.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def c_headers(self):
......
......@@ -702,6 +702,7 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
name = 'Gpu' + name
if len(outputs) > 1:
return
have_cuda = False
have_opencl = False
if inputs and isinstance(inputs[0].type, GpuArrayType):
......@@ -1162,6 +1163,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float32', 'float64']:
return
if op.inplace:
return gpugemv_inplace
else:
......@@ -1172,6 +1175,8 @@ def local_gpua_gemv(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Gemm])
@register_opt2([tensor.blas.Gemm], 'fast_compile')
def local_gpua_gemm(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
if op.inplace:
return gpugemm_inplace
else:
......@@ -1182,9 +1187,12 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.BatchedDot])
@register_opt2([tensor.blas.BatchedDot], 'fast_compile')
def local_gpua_gemmbatch(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float32', 'float64']:
return
a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
return gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=a.dtype),
a, b, np.asarray(0.0, dtype=a.dtype))
@register_opt()
......@@ -1215,6 +1223,8 @@ def local_gpua_gemmbatch_output_merge(node, *inputs):
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
@register_opt2([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer], 'fast_compile')
def local_gpua_ger(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float32', 'float64']:
return
return GpuGer(inplace=op.destructive)
......
......@@ -234,7 +234,8 @@ def gpu_alloc_expected(x, *shp):
GpuAllocTester = makeTester(
name="GpuAllocTester",
op=alloc,
# The +1 is there to allow the lift to the GPU.
op=lambda *args: alloc(*args) + 1,
gpu_op=GpuAlloc(test_ctx_name),
cases=dict(
correct01=(rand(), np.int32(7)),
......
......@@ -15,7 +15,8 @@ from .config import mode_with_gpu
from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemmbatch_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
......@@ -23,16 +24,51 @@ from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
GpuGemvTester = makeTester(
'GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# It doesn't support float16
cases=dict(dot_vv=[rand(1), 1., rand(1, 2), rand(2), 0.],
dot_vm=[rand(3), 1., rand(3, 2), rand(2), 0.],
float32=[rand(3).astype('float32'), np.float32(1),
rand(3, 2).astype('float32'),
rand(2).astype('float32'), np.float32(0)],
float64=[rand(3).astype('float64'), np.float64(1),
rand(3, 2).astype('float64'),
rand(2).astype('float64'), np.float64(0)],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
test_stride=[rand(3)[::-1], 1., rand(3, 2)[::-1], rand(2)[::-1], 0.],
)
)
def test_float16():
    """
    Check that GpuGemm and gpu_dot22 run on float16 storage.

    The gemm scalars (alpha, beta) are given as float32; the op is expected
    to accept them alongside float16 operands (it upcasts internally to
    float32 -- presumably, per the make_node logic; confirm in blas.py).
    """
    # gemm
    # NOTE(review): gpuarray_shared_constructor must accept float16 here.
    float16_data = [rand(3, 3).astype('float16'),
                    np.asarray(1, dtype=np.float32),
                    rand(3, 3).astype('float16'),
                    rand(3, 3).astype('float16'),
                    np.asarray(0.5, dtype=np.float32)]
    float16_shared = [gpuarray_shared_constructor(val)
                      for val in float16_data]
    o = gpugemm_no_inplace(*float16_shared)
    f = theano.function([], o)
    # Inputs are ordered (C, alpha, A, B, beta); here named for the
    # y := alpha*dot(A, x) + beta*y formulation of the reference check.
    y, alpha, A, x, beta = float16_data
    out = f()
    # Compare against the NumPy reference computed from the host copies.
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    # dot22
    float16_data = [rand(3, 3).astype('float16'),
                    rand(3, 3).astype('float16')]
    float16_shared = [gpuarray_shared_constructor(val)
                      for val in float16_data]
    o = gpu_dot22(*float16_shared)
    f = theano.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
mode = mode_with_gpu
dtype = 'float32'
......@@ -51,6 +87,7 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
GpuGemmTester = makeTester(
'GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
# float16 tested in test_float16
cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
......@@ -59,7 +96,12 @@ GpuGemmTester = makeTester(
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
float32=[rand(3, 4).astype('float32'), np.float32(-1.0),
rand(3, 5).astype('float32'),
rand(5, 4).astype('float32'), np.float32(-1.1)],
float64=[rand(3, 4).astype('float64'), np.float64(-1.0),
rand(3, 5).astype('float64'),
rand(5, 4).astype('float64'), np.float64(-1.1)],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
......@@ -68,14 +110,29 @@ GpuGemmTester = makeTester(
)
gemm_batched_tests = dict(
("test_b%im%ik%in%i" % (b, m, k, n),
[rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()])
for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4))
# float16 not supported
gemm_batched_tests['float32'] = [rand(3, 4, 7).astype('float32'),
rand().astype('float32'),
rand(3, 4, 4).astype('float32'),
rand(3, 4, 7).astype('float32'),
rand().astype('float32')]
gemm_batched_tests['float64'] = [rand(3, 4, 7).astype('float64'),
rand().astype('float64'),
rand(3, 4, 4).astype('float64'),
rand(3, 4, 7).astype('float64'),
rand().astype('float64')]
GpuGemmBatchTester = makeTester(
'GpuGemmBatchTester',
op=lambda z, alpha, x, y, beta: alpha * batched_dot(x, y) + beta * z,
gpu_op=gpugemmbatch_no_inplace,
cases=dict(
("test_b%im%ik%in%i" % (b, m, k, n),
[rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()])
for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)))
cases=gemm_batched_tests
)
class TestGpuSger(TestGer):
......
......@@ -493,6 +493,27 @@ def test_many_arg_elemwise():
utt.assert_allclose(results_gpu, results_cpu)
def test_not_useless_scalar_gpuelemwise():
    # We don't want to move elemwise on scalar on the GPU when the
    # result will not be used on the GPU!
    #
    # Trains one SGD-style update so the graph contains a scalar
    # elemwise (lr * grad scaling) feeding a gemm; that scalar should
    # stay on the CPU as a tensor.Elemwise, not become a GpuElemwise.
    with theano.configparser.change_flags(warn_float64='ignore'):
        X = tensor.fmatrix()
        x = np.random.randn(32, 32).astype(np.float32)
        m1 = theano.shared(np.random.randn(32, 32).astype(np.float32))
        # L2 reconstruction-style loss, just to get a gradient graph.
        loss = (X - tensor.dot(X, m1)).norm(L=2)
        lr = theano.shared(np.asarray(.001, dtype=np.float32))
        grad = tensor.grad(loss, m1)
        train = theano.function(inputs=[X], updates=[(m1, m1 - lr * grad)],
                                mode=mode_with_gpu)
        train(x)
        topo = train.maker.fgraph.toposort()
        gemms = [app for app in topo if isinstance(app.op, GpuGemm)]
        assert len(gemms) == 2
        # inputs[1] of a GpuGemm is alpha; it must be produced by a CPU
        # (tensor) Elemwise, i.e. the scalar computation was not lifted
        # to the GPU.  NOTE(review): assumes gemm input order
        # (C, alpha, A, B, beta) -- confirm against GpuGemm.make_node.
        assert isinstance(gemms[1].inputs[1].owner.op, tensor.Elemwise)
def test_local_lift_abstractconv_gpu_shape():
prev = theano.config.on_opt_error
try:
......
......@@ -24,11 +24,25 @@ except ImportError:
_context_reg = {}
def gpu_supported(data):
    """
    Can this data be stored on the GPU at all?

    Only complex dtypes are currently unsupported.

    Parameters
    ----------
    data : numpy.ndarray or TensorVariable
        (it must have dtype and ndim parameter)

    """
    dtype_name = str(data.dtype)
    return dtype_name not in tensor.basic.complex_dtypes
def move_to_gpu(data):
"""
Do we want to move this computation to the GPU?
Currently, we don't move complex and scalar int.
Currently, we don't move complex and scalar.
Parameters
----------
......@@ -36,10 +50,10 @@ def move_to_gpu(data):
(it must have dtype and ndim parameter)
"""
# We don't support complex on the GPU
if str(data.dtype) in tensor.basic.complex_dtypes:
if not gpu_supported(data):
return False
# We don't want scalar int on the GPU.
if data.ndim == 0 and str(data.dtype) in tensor.basic.discrete_dtypes:
# We don't want scalars on the GPU.
if data.ndim == 0:
return False
return True
......@@ -637,7 +651,7 @@ def gpuarray_shared_constructor(value, name=None, strict=False,
if target is notset:
target = None
if not move_to_gpu(value):
if not gpu_supported(value):
raise TypeError('We do not move that data by default to the GPU')
try:
get_context(target)
......
......@@ -317,7 +317,7 @@ class Ger(Op):
y = T.as_tensor_variable(y)
x = T.as_tensor_variable(x)
alpha = T.as_tensor_variable(alpha)
if len(set([A.dtype, alpha.dtype, x.dtype, y.dtype])) != 1:
if not(A.dtype == x.dtype == y.dtype == alpha.dtype):
raise TypeError('ger requires matching dtypes',
(A.dtype, alpha.dtype, x.dtype, y.dtype))
if alpha.ndim != 0:
......@@ -852,9 +852,6 @@ class Gemm(GemmRelated):
(self, len(inputs)))
z, a, x, y, b = inputs
# For the consistency check we don't want z to be a cached constant.
if getattr(z, 'cached', False):
z = copy.copy(z)
zr, xr, yr = [set(view_roots(i)) for i in (z, x, y)]
# We want the gemm to be inplace. When this op is inplace, it
......@@ -867,10 +864,11 @@ class Gemm(GemmRelated):
# think there is another mechanism that would prevent this,
# but I don't what to modify old code and have chance to break
# something.
if zr.intersection(xr):
raise InconsistencyError(Gemm.E_z_uniq, (z, x))
if zr.intersection(yr):
raise InconsistencyError(Gemm.E_z_uniq, (z, y))
if self.inplace:
if zr.intersection(xr):
raise InconsistencyError(Gemm.E_z_uniq, (z, x))
if zr.intersection(yr):
raise InconsistencyError(Gemm.E_z_uniq, (z, y))
if z.ndim != 2:
raise TypeError(Gemm.E_rank, z)
......
......@@ -105,7 +105,7 @@ class t_gemm(TestCase):
def test0a(self):
Gemm.debug = True
try:
g = gemm_inplace([1.], 1., [1.], [1.], 1.)
g = gemm_no_inplace([1.], 1., [1.], [1.], 1.)
except TypeError as e:
if exc_message(e) is Gemm.E_rank:
return
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论