提交 658bf2ef authored 作者: Xavier Bouthillier's avatar Xavier Bouthillier

Add optimizations and relative tests

上级 ed4e0095
...@@ -21,6 +21,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB, ...@@ -21,6 +21,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, toolbox) Optimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import LocalMetaOptimizer
from theano.sandbox.cuda import as_cuda_ndarray_variable from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.opt import register_meta_opt
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
gpu_eye, gpu_contiguous, gpu_eye, gpu_contiguous,
gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu, gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
...@@ -32,9 +33,9 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -32,9 +33,9 @@ from theano.sandbox.cuda.basic_ops import (
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve from theano.sandbox.cuda.cula import gpu_solve
...@@ -43,7 +44,10 @@ from theano.sandbox.cuda.blas import gpu_gemv_no_inplace ...@@ -43,7 +44,10 @@ from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad) GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
...@@ -266,8 +270,8 @@ def local_gpu_elemwise_0(node): ...@@ -266,8 +270,8 @@ def local_gpu_elemwise_0(node):
'uint16']) 'uint16'])
# case 1 - all inputs are already float32 # case 1 - all inputs are already float32
if all([i.type.dtype == 'float32' for i in node.inputs]): if all([i.type.dtype == 'float32' for i in node.inputs]):
# TODO: change this when fusion makes Elemwise with multiple # TODO: change this when fusion makes Elemwise with
# outputs # multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs)) for i in node.inputs))
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
...@@ -648,7 +652,7 @@ def local_gpu_gemv(node): ...@@ -648,7 +652,7 @@ def local_gpu_gemv(node):
""" """
gemvs = (tensor.blas.Gemv, gemvs = (tensor.blas.Gemv,
tensor.blas_c.CGemv, tensor.blas_c.CGemv,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, gemvs): if host_input.owner and isinstance(host_input.owner.op, gemvs):
...@@ -688,7 +692,7 @@ def local_gpu_ger(node): ...@@ -688,7 +692,7 @@ def local_gpu_ger(node):
gers = (tensor.blas_c.CGer, gers = (tensor.blas_c.CGer,
tensor.blas.Ger, tensor.blas.Ger,
tensor.blas_scipy.ScipyGer, tensor.blas_scipy.ScipyGer,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -741,11 +745,11 @@ def local_gpu_gemm(node): ...@@ -741,11 +745,11 @@ def local_gpu_gemm(node):
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu)) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu)) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm_no_inplace(as_cuda_ndarray_variable(z), return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
a, a,
as_cuda_ndarray_variable(x), gpu_from_host(x),
as_cuda_ndarray_variable(y), gpu_from_host(y),
b))] b))]
return False return False
...@@ -996,7 +1000,8 @@ def local_gpu_advanced_subtensor1(node): ...@@ -996,7 +1000,8 @@ def local_gpu_advanced_subtensor1(node):
if node.op.__class__ is tensor.AdvancedSubtensor1: if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32": if (x.owner and isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))] return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
return False return False
...@@ -1396,19 +1401,19 @@ def local_gpu_conv(node): ...@@ -1396,19 +1401,19 @@ def local_gpu_conv(node):
# print op.kshp, op.imshp[1:3] # print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw # print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode, ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy), subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw, logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical, logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned, logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp, kshp=op.kshp,
version=op.version, version=op.version,
direction_hint=op.direction_hint, direction_hint=op.direction_hint,
verbose=op.verbose, verbose=op.verbose,
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern, nkern=op.nkern,
bsize=op.bsize, bsize=op.bsize,
fft_opt=op.fft_opt fft_opt=op.fft_opt
) )
if op.imshp_logical is not None: if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3] logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]: if logical_img_hw != op.imshp[1:3]:
...@@ -2420,4 +2425,101 @@ optdb.register('gpu_scanOp_make_inplace', ...@@ -2420,4 +2425,101 @@ optdb.register('gpu_scanOp_make_inplace',
'inplace', 'inplace',
'scan') 'scan')
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @register_opt()
# @alpha_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, GpuSparseBlockOuter) -> GpuSparseBlockOuter(..., alpha=lr)
# """
# return [gpu_sparse_block_outer(*inputs)]
# @register_opt()
# @output_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [gpu_sparse_block_outer(*inputs)]
def _owner_isinstance(inp, test_class):
    """
    Return True when `inp` is produced by an apply node whose op is an
    instance of `test_class`, False otherwise (including free inputs
    with no owner).
    """
    producer = inp.owner
    if not producer:
        return False
    return isinstance(producer.op, test_class)
def _clear_host_from_gpu(inputs):
    """
    Return a copy of `inputs` where every variable produced by a
    HostFromGpu transfer is replaced by the transfer's input (i.e. the
    variable that already lives on the GPU). Other inputs pass through
    unchanged.
    """
    return [inp.owner.inputs[0] if _owner_isinstance(inp, HostFromGpu)
            else inp
            for inp in inputs]
@register_meta_opt(SparseBlockGemv, ["gpu_opt", "gpu_local_optimizations"],
                   0., 'fast_run', 'fast_compile', 'gpu')
@local_optimizer([SparseBlockGemv, GpuFromHost])
def gpu_sparse_block_gemv_opt(node):
    """
    Move SparseBlockGemv computations to the GPU.

    SparseBlockGemv(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockGemv(input))
    or
    GpuFromHost(SparseBlockGemv) -> GpuSparseBlockGemv

    Returns a one-element replacement list, or None (implicitly) when
    neither pattern matches.
    """
    # Case 1: the CPU op consumes at least one value that already lives
    # on the GPU.  Lift the op to the GPU and transfer its result back,
    # so downstream CPU consumers are unaffected.
    if isinstance(node.op, SparseBlockGemv) and \
            any(_owner_isinstance(inp, HostFromGpu) for inp in node.inputs):
        inputs = _clear_host_from_gpu(node.inputs)
        # The CPU op's inplace flag is preserved on the GPU op.
        return [host_from_gpu(GpuSparseBlockGemv(node.op.inplace)(*inputs))]
    # Case 2: the op's output is transferred to the GPU.  Replace the
    # transfer + CPU-op pair by the GPU op directly.
    elif isinstance(node.op, GpuFromHost) and \
            _owner_isinstance(node.inputs[0], SparseBlockGemv):
        meta_node = node.inputs[0].owner
        inputs = _clear_host_from_gpu(meta_node.inputs)
        return [GpuSparseBlockGemv(meta_node.op.inplace)(*inputs)]
@register_meta_opt(SparseBlockOuter, ["gpu_opt", "gpu_local_optimizations"],
                   0., 'fast_run', 'fast_compile', 'gpu')
@local_optimizer([SparseBlockOuter, GpuFromHost])
def gpu_sparse_block_outer_opt(node):
    """
    Move SparseBlockOuter computations to the GPU.

    SparseBlockOuter(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockOuter(input))
    or
    GpuFromHost(SparseBlockOuter) -> GpuSparseBlockOuter

    Returns a one-element replacement list, or None (implicitly) when
    neither pattern matches.
    """
    # Case 1: at least one input already lives on the GPU -- lift the op
    # and transfer the result back for CPU consumers.
    if isinstance(node.op, SparseBlockOuter) and \
            any(_owner_isinstance(inp, HostFromGpu) for inp in node.inputs):
        inputs = _clear_host_from_gpu(node.inputs)
        # The CPU op's inplace flag is preserved on the GPU op.
        return [host_from_gpu(GpuSparseBlockOuter(node.op.inplace)(*inputs))]
    # Case 2: the op's output is immediately moved to the GPU -- replace
    # the transfer + CPU-op pair by the GPU op directly.
    elif isinstance(node.op, GpuFromHost) and \
            _owner_isinstance(node.inputs[0], SparseBlockOuter):
        meta_node = node.inputs[0].owner
        inputs = _clear_host_from_gpu(meta_node.inputs)
        return [GpuSparseBlockOuter(meta_node.op.inplace)(*inputs)]
import theano.sandbox.cuda.extra_ops import theano.sandbox.cuda.extra_ops
...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops ...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
...@@ -740,6 +743,37 @@ def test_local_gpu_dot_to_dot22dot(): ...@@ -740,6 +743,37 @@ def test_local_gpu_dot_to_dot22dot():
cmp((3, 4), (4,)) cmp((3, 4), (4,))
def test_blocksparse_gpu_gemv_opt():
    """The GPU optimizer must lift sparse_block_dot to GpuSparseBlockGemv."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fct = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)
    # The last node is the HostFromGpu transfer; the gemv sits right
    # before it in topological order.
    assert isinstance(fct.maker.fgraph.toposort()[-2].op, GpuSparseBlockGemv)
def test_blocksparse_gpu_outer_opt():
    """
    The gradient of sparse_block_dot wrt W must be lifted to
    GpuSparseBlockOuter by the GPU optimizer.
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)
    # Build the gradient once and reuse it; the previous version also
    # dumped the graph with theano.printing.debugprint, a leftover debug
    # call that polluted the test output.
    grad_W = tensor.grad(o.sum(), wrt=W)
    f = theano.function([W, h, iIdx, b, oIdx], [o, grad_W],
                        mode=mode_with_gpu)
    # The last node is the HostFromGpu transfer; the outer op sits right
    # before it in topological order.
    assert isinstance(f.maker.fgraph.toposort()[-2].op, GpuSparseBlockOuter)
class test_diag(theano.tensor.tests.test_nlinalg.test_diag): class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
mode = mode_with_gpu mode = mode_with_gpu
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
...@@ -751,6 +785,7 @@ class test_diag(theano.tensor.tests.test_nlinalg.test_diag): ...@@ -751,6 +785,7 @@ class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
self).__init__(name) self).__init__(name)
if __name__ == '__main__': if __name__ == '__main__':
test_gpualloc() test_gpualloc()
test_opt_gpujoin_onlyajoin() test_opt_gpujoin_onlyajoin()
......
"""
Optimizations addressing the ops in sandbox root directory
"""
import bisect
import logging
from theano.compile import optdb
from theano.gof import local_optimizer, EquilibriumDB
from theano.tensor.opt import register_specialize
from theano.sandbox.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv,
sparse_block_outer,
sparse_block_gemv_inplace,
sparse_block_outer_inplace,
CpuSparseBlockGemv,
CpuSparseBlockOuter)
_logger = logging.getLogger('theano.sandbox.opt')
def _db_exists(db, db_name):
    """
    Test whether the full path from `db_name[0]` down to
    `db_name[-1]` exists inside `db`.

    Parameters
    ----------
    db: `theano.gof.optdb.DB`
        A database of optimizations or sub-databases.
    db_name: list or tuple of strings
        Names of nested databases, outermost first, e.g.
        ['level_1_dataset', 'level_2_dataset'].
    """
    head, tail = db_name[0], db_name[1:]
    if head not in db._names:
        return False
    if not tail:
        return True
    # Descend one level and keep checking the remainder of the path.
    return _db_exists(db[head], tail)
def _db_register(db, db_name, *args):
    """
    Register an object in the database reached by walking `db_name`
    down from `db` (`db` itself when `db_name` is empty).

    Parameters
    ----------
    db: `theano.gof.optdb.DB`
        A database of optimizations or sub-databases.
    db_name: list or tuple of strings
        Names of nested databases, outermost first, e.g.
        ['level_1_dataset', 'level_2_dataset'].
    *args
        Arguments forwarded to `DB.register`.
    """
    target = db
    for name in db_name:
        target = target[name]
    return target.register(*args)
def _db_positions(db, db_name, positions=()):
    """
    Return the tuple of positions of every database along the path
    `db_name`, walked down from `db` (`db_name[0]` is looked up in
    `db`, `db_name[1]` in `db[db_name[0]]`, and so on).

    Parameters
    ----------
    db: `theano.gof.optdb.DB`
        A database of optimizations or sub-databases.
    db_name: list or tuple of strings
        Names of nested databases, outermost first, e.g.
        ['level_1_dataset', 'level_2_dataset'].
    positions: tuple
        Accumulated positions prepended to the result.
    """
    result = positions
    current = db
    for name in db_name:
        # A database missing from __position__ defaults to position 0.
        result = result + (current.__position__.get(name, 0.),)
        current = current[name]
    return result
def register_meta_opt(op_class, db_name, position, *args):
    """
    Register a local optimization for the meta Op `op_class` and record
    it in `op_class.registered_opts`, kept sorted by database position.

    Parameters
    ----------
    op_class: `theano.gof.Op`
        A meta Op which has multiple implementations available for
        optimization.  Must expose a `registered_opts` list.
    db_name: string, list or tuple of strings
        A string if the optimization is inserted in
        `theano.compile.optdb` directly.  A list/tuple names a path of
        nested optimization databases, outermost first.
    position: int or float
        Position of the optimization in the target database.  When the
        target database does not exist yet, it is created (as an
        EquilibriumDB) at this position in its parent.
    *args
        Extra arguments (tags) used to register the optimization.

    Returns
    -------
    callable
        A decorator that registers the given local optimizer and
        returns it unchanged.
    """
    if isinstance(db_name, str):
        db_name = [db_name]

    def call(local_meta_opt):
        if not _db_exists(optdb, db_name):
            # Create the missing target database in its *direct* parent
            # (db_name[:-1]); the previous db_name[:-2] skipped a level
            # and registered it in the grandparent.
            # TODO: Would another default DB be better?
            _db_register(optdb, db_name[:-1],
                         db_name[-1], EquilibriumDB(), position, *args)
        _db_register(optdb, db_name,
                     local_meta_opt.__name__, local_meta_opt, *args)
        positions = _db_positions(optdb, db_name)
        # Keep registered_opts sorted by (positions, name).  Note
        # bisect_left(a, x) takes the sorted list first and the probe
        # item second; the probe must also match what is inserted.
        entry = (positions, local_meta_opt.__name__)
        idx = bisect.bisect_left(op_class.registered_opts, entry)
        op_class.registered_opts.insert(idx, entry)
        return local_meta_opt
    return call
@register_meta_opt(SparseBlockGemv, ["meta_cpu"], 51.0,
                   "fast_run", "fast_compile")
@local_optimizer([SparseBlockGemv])
def cpu_sparse_block_gemv_opt(node):
    """
    Replace the abstract SparseBlockGemv by its CPU implementation,
    preserving the inplace flag.
    """
    cpu_op = CpuSparseBlockGemv(node.op.inplace)
    return [cpu_op(*node.inputs)]
@register_meta_opt(SparseBlockOuter, ["meta_cpu"], 51.0,
                   "fast_run", "fast_compile")
@local_optimizer([SparseBlockOuter])
def cpu_sparse_block_outer_opt(node):
    """
    Replace the abstract SparseBlockOuter by its CPU implementation,
    preserving the inplace flag.
    """
    cpu_op = CpuSparseBlockOuter(node.op.inplace)
    return [cpu_op(*node.inputs)]
@register_specialize
@local_optimizer([sparse_block_gemv], inplace=True)
def local_inplace_block_sparse_gemv(node):
    """
    Swap the non-inplace SparseBlockGemv for its inplace variant.
    """
    replacement = sparse_block_gemv_inplace(*node.inputs)
    return [replacement]
@register_specialize
@local_optimizer([sparse_block_outer], inplace=True)
def local_inplace_block_sparse_outer(node):
    """
    Swap the non-inplace SparseBlockOuter for its inplace variant.
    """
    replacement = sparse_block_outer_inplace(*node.inputs)
    return [replacement]
import theano
from theano import tensor
from theano.sandbox.blocksparse import CpuSparseBlockGemv, CpuSparseBlockOuter, sparse_block_dot
def test_blocksparse_cpu_gemv_opt():
    """sparse_block_dot must be specialized to CpuSparseBlockGemv."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fct = theano.function([W, h, iIdx, b, oIdx], out)
    # With no GPU transfer involved, the gemv is the final node.
    assert isinstance(fct.maker.fgraph.toposort()[-1].op, CpuSparseBlockGemv)
def test_blocksparse_cpu_outer_opt():
    """
    The gradient of sparse_block_dot wrt W must be specialized to
    CpuSparseBlockOuter.
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)
    # Build the gradient once and reuse it; the previous version also
    # dumped the graph with theano.printing.debugprint, a leftover debug
    # call that polluted the test output.
    grad_W = tensor.grad(o.sum(), wrt=W)
    f = theano.function([W, h, iIdx, b, oIdx], [o, grad_W])
    # With no GPU transfer involved, the outer op is the final node.
    assert isinstance(f.maker.fgraph.toposort()[-1].op, CpuSparseBlockOuter)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论