Commit aeb8c035 authored by Xavier Bouthillier

Fix optimizations

Parent 76b71018
@@ -84,7 +84,19 @@ class SparseBlockGemv(Op):
         return Apply(self, [o, W, h, inputIdx, outputIdx], [output])
 
     def perform(self, node, inp, out_):
-        raise NotImplementedError('Optimization of SparseBlockGemv failed.')
+        o, W, h, iIdx, oIdx = inp[:5]
+
+        if not self.inplace:
+            o = o.copy()
+
+        for b in range(o.shape[0]):
+            for j in range(o.shape[1]):
+                outputIdx = oIdx[b, j]
+                for i in range(h.shape[1]):
+                    inputIdx = iIdx[b, i]
+                    w = W[inputIdx, outputIdx]
+                    o[b, j, :] += numpy.dot(h[b, i], w)
+        out_[0][0] = o
 
     def grad(self, inputs, grads):
         o, W, h, inputIdx, outputIdx = inputs
@@ -160,50 +172,6 @@ class SparseBlockOuter(Op):
         return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                      [output])
 
-    def perform(self, node, inp, out_):
-        raise NotImplementedError('Optimization of SparseBlockOuter failed.')
-
-    def grad(self, inputs, output_gradients):
-        raise NotImplementedError("SparseBlockOuter has no gradient "
-                                  "implemented")
-
-
-class CpuSparseBlockGemv(SparseBlockGemv):
-    """
-    CPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
-    information.
-
-    This should not be directly called since the interface is subject
-    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
-    function for a stable interface.
-    """
-    def perform(self, node, inp, out_):
-        o, W, h, iIdx, oIdx = inp[:5]
-
-        if not self.inplace:
-            o = o.copy()
-
-        for b in range(o.shape[0]):
-            for j in range(o.shape[1]):
-                outputIdx = oIdx[b, j]
-                for i in range(h.shape[1]):
-                    inputIdx = iIdx[b, i]
-                    w = W[inputIdx, outputIdx]
-                    o[b, j, :] += numpy.dot(h[b, i], w)
-        out_[0][0] = o
-
-
-class CpuSparseBlockOuter(SparseBlockOuter):
-    """
-    CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
-    information.
-
-    This op should not be called directly since its interface is
-    subject to change without notice. It is involved in the gradient
-    of GpuSparseBlockGemv. The gradient is not implemented.
-    """
     def perform(self, node, inp, out_):
         o, x, y, xIdx, yIdx, alpha = inp[:6]
@@ -223,11 +191,6 @@ sparse_block_gemv_inplace = SparseBlockGemv(True)
 sparse_block_outer = SparseBlockOuter(False)
 sparse_block_outer_inplace = SparseBlockOuter(True)
 
-cpu_sparse_block_gemv = CpuSparseBlockGemv(False)
-cpu_sparse_block_gemv_inplace = CpuSparseBlockGemv(True)
-cpu_sparse_block_outer = CpuSparseBlockOuter(False)
-cpu_sparse_block_outer_inplace = CpuSparseBlockOuter(True)
-
 def sparse_block_dot(W, h, inputIdx, b, outputIdx):
     """
...
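The first hunk gives SparseBlockGemv a real CPU perform(): the body that previously lived only in the now-deleted CpuSparseBlockGemv subclass moves into the base op, replacing the old NotImplementedError. For readers, here is a standalone NumPy sketch of the same computation; every shape and name below is illustrative only, not part of the commit:

import numpy

# Illustrative sizes: W holds (nInBlocks x nOutBlocks) weight blocks of
# shape (inSize, outSize); h holds the active input blocks per batch item;
# o is the preloaded output/bias tensor that the op accumulates into.
batch, nActiveIn, nActiveOut = 2, 3, 4
nInBlocks, nOutBlocks, inSize, outSize = 5, 6, 7, 8

rng = numpy.random.RandomState(0)
W = rng.randn(nInBlocks, nOutBlocks, inSize, outSize).astype('float32')
h = rng.randn(batch, nActiveIn, inSize).astype('float32')
o = rng.randn(batch, nActiveOut, outSize).astype('float32')
iIdx = rng.randint(0, nInBlocks, size=(batch, nActiveIn))
oIdx = rng.randint(0, nOutBlocks, size=(batch, nActiveOut))

out = o.copy()  # the non-inplace path copies o, exactly as in perform()
for b in range(out.shape[0]):          # batch items
    for j in range(out.shape[1]):      # active output blocks
        for i in range(h.shape[1]):    # active input blocks
            w = W[iIdx[b, i], oIdx[b, j]]          # (inSize, outSize)
            out[b, j, :] += numpy.dot(h[b, i], w)  # adds an (outSize,) row

print(out.shape)  # (2, 4, 8)

Each output block j accumulates the products of every active input block i with the weight block W[iIdx[b, i], oIdx[b, j]], which is exactly the triple loop added above.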
@@ -18,10 +18,9 @@ import theano.ifelse
 from six.moves import reduce, xrange
 
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
-                        Optimizer, toolbox)
+                        Optimizer, TopoOptimizer, toolbox)
 from theano.gof.opt import LocalMetaOptimizer
 from theano.sandbox.cuda import as_cuda_ndarray_variable
-from theano.sandbox.opt import register_meta_opt
 from theano.sandbox.cuda.basic_ops import (
     gpu_eye, gpu_contiguous,
     gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
@@ -32,8 +31,8 @@ from theano.sandbox.cuda.basic_ops import (
     GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
 from theano.sandbox.cuda.type import CudaNdarrayType
 
-from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
-                                      gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
+from theano.sandbox.cuda.blas import (
+    gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
     GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
     GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
@@ -43,11 +42,17 @@ from theano.sandbox.cuda.cula import gpu_solve
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace
 from theano.sandbox.cuda.blas import gpu_ger_no_inplace
-from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
-                                      GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
+from theano.sandbox.cuda.blas import (
+    GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad,
+    GpuDownsampleFactorMaxGradGrad)
 from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
-from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
+from theano.sandbox.cuda.blocksparse import (
+    GpuSparseBlockGemv,
+    GpuSparseBlockOuter,
+    gpu_sparse_block_gemv_inplace,
+    gpu_sparse_block_outer_inplace)
 
 from theano.sandbox.cuda.nnet import (
     GpuCrossentropySoftmaxArgmax1HotWithBias,
@@ -84,7 +89,8 @@ gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
                     'fast_run', 'fast_compile', 'inplace', 'gpu')
 gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
                     'fast_run', 'fast_compile', 'gpu')
-# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS enable the GPU!
+# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS
+# enable the GPU!
 optdb.register('gpu_opt',
                gpu_seqopt,
                optdb.__position__.get('add_destroy_handler', 49.5) - 1,
@@ -350,8 +356,8 @@ def local_gpu_split(node):
             any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
                  in outs_clients])):
             new_op = GpuSplit(node.op.len_splits)
-            split_res = new_op(as_cuda_ndarray_variable(input), *node.inputs[1:],
-                               return_list=True)
+            split_res = new_op(as_cuda_ndarray_variable(input),
+                               *node.inputs[1:], return_list=True)
             return [host_from_gpu(o) for o in split_res]
     return False
@@ -378,7 +384,8 @@ def local_gpu_dimshuffle_0(node):
             dimshuffle_node = host_input.owner
             new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
                                    dimshuffle_node.op.new_order)
-            return [new_op(as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
+            return [new_op(
+                as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
     return False
@@ -393,8 +400,8 @@ def local_gpu_specifyShape_0(node):
     if isinstance(node.op, tensor.SpecifyShape):
         input = node.inputs[0]
         if input.owner and isinstance(input.owner.op, HostFromGpu):
-            return [host_from_gpu(tensor.specify_shape(as_cuda_ndarray_variable(input),
-                                                       *node.inputs[1:]))]
+            return [host_from_gpu(tensor.specify_shape(
+                as_cuda_ndarray_variable(input), *node.inputs[1:]))]
     if isinstance(node.op, GpuFromHost):
         host_input = node.inputs[0]
         if host_input.owner and isinstance(host_input.owner.op,
@@ -471,11 +478,15 @@ def local_gpu_dot_to_dot22(node):
                                shape_out))]
     return False
 
 
 @local_optimizer(None)
 def local_assert_no_cpu_op(node):
-    if not isinstance(node.op, GpuOp) and all([var.owner and isinstance(var.owner.op,
-            HostFromGpu) for var in node.inputs]) and any([[c for c in var.clients
-            if isinstance(c[0].op, GpuFromHost)] for var in node.outputs]):
+    if (not isinstance(node.op, GpuOp) and
+            all([var.owner and isinstance(var.owner.op, HostFromGpu)
+                 for var in node.inputs]) and
+            any([[c for c in var.clients if isinstance(c[0].op, GpuFromHost)]
+                 for var in node.outputs])):
         if config.assert_no_cpu_op == "warn":
             _logger.warning(("CPU op %s is detected in the computational"
                              " graph") % node)
@@ -576,7 +587,8 @@ def local_gpu_dot22(node):
         if host_input.owner and isinstance(host_input.owner.op,
                                            tensor.blas.Dot22):
             x, y = host_input.owner.inputs
-            return [gpu_dot22(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))]
+            return [gpu_dot22(as_cuda_ndarray_variable(x),
+                              as_cuda_ndarray_variable(y))]
     if isinstance(node.op, tensor.blas.Dot22):
         if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                 for i in node.inputs]):
@@ -601,7 +613,8 @@ def local_gpu_dot22scalar(node):
                 isinstance(host_input.owner.op,
                            tensor.blas.Dot22Scalar)):
             x, y, scalar = host_input.owner.inputs
-            return [gpu_dot22scalar(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y),
+            return [gpu_dot22scalar(as_cuda_ndarray_variable(x),
+                                    as_cuda_ndarray_variable(y),
                                     tensor.blas._as_scalar(scalar))]
     if isinstance(node.op, tensor.blas.Dot22Scalar):
         if any([i.owner and isinstance(i.owner.op, HostFromGpu)
@@ -629,7 +642,8 @@ def local_gpu_solve(node):
                 isinstance(host_input.owner.op,
                            slinalg.Solve)):
             x, y = host_input.owner.inputs
-            return [gpu_solve(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))]
+            return [gpu_solve(as_cuda_ndarray_variable(x),
+                              as_cuda_ndarray_variable(y))]
     if isinstance(node.op, slinalg.Solve):
         if any([i.owner and isinstance(i.owner.op, HostFromGpu)
@@ -715,8 +729,7 @@ def local_gpu_ger(node):
                 as_cuda_ndarray_variable(z),
                 a,
                 as_cuda_ndarray_variable(x),
-                as_cuda_ndarray_variable(y)
-                ))]
+                as_cuda_ndarray_variable(y)))]
     return False
@@ -745,10 +758,11 @@ def local_gpu_gemm(node):
         y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
         z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
         if x_on_gpu or y_on_gpu or z_on_gpu:
-            return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
+            return [host_from_gpu(gpu_gemm_no_inplace(
+                as_cuda_ndarray_variable(z),
                 a,
-                gpu_from_host(x),
-                gpu_from_host(y),
+                as_cuda_ndarray_variable(x),
+                as_cuda_ndarray_variable(y),
                 b))]
     return False
@@ -886,8 +900,8 @@ def local_gpu_elemwise_careduce(node):
             # automatically add more case, as some like trigonometic
             # operation with some reduction pattern will probably result
             # to slow down.
-            isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)
-            ):
+            isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
         op = node.op
         inp = node.inputs[0].owner.inputs[0]
         return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)]
@@ -902,7 +916,8 @@ def local_gpu_reshape(node):
                 isinstance(host_input.owner.op, tensor.Reshape):
             rshp = host_input.owner.op
             x, shp = host_input.owner.inputs
-            gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x), shp)
+            gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x),
+                                                shp)
             if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
                 # this can happen as we always return False for all broadcast
                 # dim in GpuReshape but not for Reshape
@@ -961,23 +976,27 @@ def local_gpu_subtensor(node):
                 # to the GPU in that case.
                 return
             coords = host_input.owner.inputs[1:]
-            return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x), *coords)]
+            return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x),
+                                                *coords)]
     if isinstance(node.op, tensor.Subtensor):
         x = node.inputs[0]
         if (x.owner and
                 isinstance(x.owner.op, HostFromGpu) and
                 x.dtype == "float32"):
             gpu_x = x.owner.inputs[0]
             if (gpu_x.owner and
                     isinstance(gpu_x.owner.op, GpuFromHost) and
                     # And it is a shared var or an input of the graph.
                     not gpu_x.owner.inputs[0].owner):
                 if len(x.clients) == 1:
                     if any([n == 'output' or isinstance(n.op, GpuOp)
                             for n, _ in node.outputs[0].clients]):
                         return
                     else:
-                        return [host_from_gpu(as_cuda_ndarray_variable(node.outputs[0]))]
+                        return [host_from_gpu(as_cuda_ndarray_variable(
+                            node.outputs[0]))]
                 return
 
         gpu_x, = x.owner.inputs
@@ -996,7 +1015,8 @@ def local_gpu_advanced_subtensor1(node):
                 host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
             x = host_input.owner.inputs[0]
             coords = host_input.owner.inputs[1:]
-            return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x), *coords)]
+            return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x),
+                                            *coords)]
     if node.op.__class__ is tensor.AdvancedSubtensor1:
         x = node.inputs[0]
         coords = node.inputs[1:]
@@ -1032,12 +1052,14 @@ def local_gpu_advanced_incsubtensor1(node):
         if (compute_capability < 2 or
                 x.ndim != 2 or
                 y.ndim != 2):
             gpu_op = GpuAdvancedIncSubtensor1(
                 set_instead_of_inc=set_instead_of_inc)
         else:
             gpu_op = GpuAdvancedIncSubtensor1_dev20(
                 set_instead_of_inc=set_instead_of_inc)
-        return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
+        return [gpu_op(as_cuda_ndarray_variable(x),
+                       as_cuda_ndarray_variable(y), *coords)]
 
     # Should not execute for GpuAdvancedIncSubtensor1
     if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
@@ -1188,7 +1210,7 @@ def local_gpu_pdbbreakpoint_op(node):
         nb_monitored_vars = len(node.outputs)
         for i in range(nb_monitored_vars):
-            inp = old_inputs[i+1]
+            inp = old_inputs[i + 1]
             out = old_outputs[i]
 
             input_is_from_gpu = (inp.owner and
@@ -1256,8 +1278,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
                             tensor.basic._convert_to_int32,
                             tensor.basic._convert_to_int8,
                             tensor.basic._convert_to_int16,
-                            tensor.basic._convert_to_int64,
-                            )
+                            tensor.basic._convert_to_int64)
             while y.owner and y.owner.op in int_cast_ops:
                 y = y.owner.inputs[0]
             gpu_nll, gpu_sm, gpu_am = \
@@ -1307,7 +1328,8 @@ def local_gpu_softmax_with_bias(node):
         x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
         b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
         if x_on_gpu or b_on_gpu:
-            gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(b))
+            gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x),
+                                          as_cuda_ndarray_variable(b))
             return [host_from_gpu(gpu_sm)]
     return False
@@ -1324,6 +1346,7 @@ def _gpu_conv_to_fftconv(node):
     if (node.op.imshp is not None and
             node.op.imshp[-1] is not None and
             node.op.imshp[-1] % 2 == 1):
         kwargs['pad_last_dim'] = True
     # If the user supplied the full nonsymbolic image_shape and
     # filter_shape in conv2d(), we can pass it on to conv2d_fft().
@@ -1337,7 +1360,8 @@ def _gpu_conv_to_fftconv(node):
             (node.op.nkern is not None) and
             (len(node.op.imshp) == 3) and
             (node.op.imshp[0] is not None)):
-        kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + node.op.kshp
+        kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + \
+            node.op.kshp
     rval = conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
     if node.outputs[0].broadcastable != rval.broadcastable:
         # With given shape information, conv2d_fft may return a different
@@ -1353,6 +1377,7 @@ def local_conv_fft_valid(node):
         if (node.op.border_mode == 'valid' and
                 node.op.subsample == (1, 1) and
                 node.op.fft_opt):
             return [_gpu_conv_to_fftconv(node)]
     return False
@@ -1363,6 +1388,7 @@ def local_conv_fft_full(node):
         if (node.op.border_mode == 'full' and
                 node.op.subsample == (1, 1) and
                 node.op.fft_opt):
             return [_gpu_conv_to_fftconv(node)]
     return
@@ -1476,6 +1502,7 @@ def local_gpu_conv(node):
 def local_conv_gemm(node):
     if (isinstance(node.op, GpuConv) and
             node.op.border_mode in ['full', 'valid']):
         img, kern = node.inputs
         border_mode = node.op.border_mode
         subsample = node.op.subsample
@@ -1599,10 +1626,12 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
             if ((var in inputs) and
                     (shape is not None) and
                     not any(s is None for s in shape)):
                 result[var] = theano.shared(
-                    # TODO: Use var.type.filter when cuda_ndarray.filter supports non-strict casts
-                    # var.type.filter(numpy.random.randn(*shape),
-                    #                 allow_downcast=True),
+                    # TODO: Use var.type.filter when cuda_ndarray.filter
+                    # supports non-strict casts
+                    # var.type.filter(numpy.random.randn(*shape),
+                    #                 allow_downcast=True),
                     numpy.require(numpy.random.randn(*shape),
                                   dtype=var.dtype),
                     var.name,
@@ -1616,7 +1645,8 @@ conv_metaopt = ConvMetaOptimizer(
     conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts)
 # Then we add some optimizers that try less obvious options
 conv_metaopt.register(dnn.local_conv_dnn_alternative)
-# Finally, we register the metaoptimizer as the first optimizer in conv_groupopt
+# Finally, we register the metaoptimizer as the first optimizer in
+# conv_groupopt
 conv_groupopt.register('conv_meta', conv_metaopt, 0)
@@ -1661,6 +1691,7 @@ def local_convgrad3d_fft(node):
         return False
     if (isinstance(node.op, ConvGrad3D) and
             (stride_x, stride_y, stride_z) == (1, 1, 1)):
         # we import conv3d_fft locally to avoid pycuda warnings
         from theano.sandbox.cuda.fftconv import conv3d_fft
         # Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t)
@@ -1747,8 +1778,8 @@ def local_convgrad3d_gemm(node):
         f = node.inputs[3]
         f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
-        rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(x, f,
-                                                               shape=node.inputs[2][1:4])
+        rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(
+            x, f, shape=node.inputs[2][1:4])
         # Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic)
         return [rval.dimshuffle(0, 2, 3, 4, 1)]
@@ -1770,7 +1801,8 @@ def local_convtransp3d_gemm(node):
         # Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t)
         f = node.inputs[3]
         f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
-        rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x, topgrad=f)
+        rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x,
+                                                              topgrad=f)
         # Shuffle from (ic, b, 0, 1, t) to (b, 0, 1, t, ic)
         return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[1]]
@@ -1786,6 +1818,7 @@ import theano.tensor.signal.downsample as downsample
 def local_gpu_downsample_factor_max(node):
     if (isinstance(node.op, downsample.DownsampleFactorMax)
             and node.op.ds == node.op.st):
         assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                      'mode')
         if node.op.padding != (0, 0) or node.op.mode != 'max':
@@ -1801,11 +1834,13 @@ def local_gpu_downsample_factor_max(node):
 def local_gpu_downsample_factor_max_grad(node):
     if (isinstance(node.op, downsample.MaxPoolGrad) and
             node.op.ds == node.op.st):
         assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                      'mode')
         if (node.op.padding != (0, 0) or
                 node.op.mode != 'max' or
                 node.op.st != node.op.ds):
             return
         x, z, gz = node.inputs
         if (x.owner and isinstance(x.owner.op, HostFromGpu)):
@@ -1876,7 +1911,8 @@ def local_gpu_join(node):
         # print "OPT: axis_and_tensors=", axis_and_tensors
-        matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or
+        matches = [(t.owner is not None and
+                    isinstance(t.owner.op, HostFromGpu)) or
                    isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
         # print "OPT: matches =", matches
@@ -1884,7 +1920,8 @@ def local_gpu_join(node):
         if all(matches):
             # the extra gpu_from_host introduced here will
            # be removed by further optimizations
-            new_tensors = [as_cuda_ndarray_variable(t) for t in axis_and_tensors[1:]]
+            new_tensors = [as_cuda_ndarray_variable(t)
+                           for t in axis_and_tensors[1:]]
             new_a_and_t = [axis_and_tensors[0]] + new_tensors
 
             replacement_node = host_from_gpu(gpu_join(*new_a_and_t))
@@ -1941,7 +1978,6 @@ optdb.register('InplaceGpuBlasOpt',
 def get_device_type_sizes():
     """
-
     Returns
     -------
     tuple
@@ -1962,7 +1998,8 @@ def get_device_type_sizes():
         del gpu_int_size
         del t
     except Exception as e:
-        _logger.warning(("Optimization Warning: "
+        _logger.warning((
+            "Optimization Warning: "
             "Got the following error, but you can ignore it. "
            "This could cause less GpuElemwise fused together.\n"
             "%s") % e)
@@ -2037,11 +2074,11 @@ def split_huge_add_or_mul(node):
 # GpuElemwise fusion
 gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
-    GpuElemwise,
-    max_inputs_to_GpuElemwise)
+    GpuElemwise, max_inputs_to_GpuElemwise)
 if config.gpu.local_elemwise_fusion:
     _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
-    # Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
+    # Must be after cpu fusion at 40, gpu at 48.5 and before
+    # AddDestroyHandler at 49.5
    optdb.register('gpu_elemwise_fusion',
                    tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
                    49, 'fast_run', 'fusion',
@@ -2069,7 +2106,8 @@ gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
     tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
 )
 register_opt()(gpu_elemwise_alloc)
-register_opt()(tensor.opt.local_useless_elemwise)  # needed by gpu_elemwise_alloc
+# needed by gpu_elemwise_alloc
+register_opt()(tensor.opt.local_useless_elemwise)
 tensor.opt.register_specialize_device(gpu_elemwise_alloc)
@@ -2115,8 +2153,7 @@ def local_gpualloc(node):
                                        new_out.type.broadcastable):
             assert b_new or (not b_old)
         new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
-    # if old_out.type != new_out.type:
-        #import pdb; pdb.set_trace()
     return [new_out]
@@ -2139,12 +2176,14 @@ def local_gpualloc_memset_0(node):
         if (isinstance(inp, CudaNdarrayConstant) and
                 inp.data.size == 1 and
                 (numpy.asarray(inp.data) == 0).all()):
             new_out = GpuAlloc(memset_0=True)(*node.inputs)
             old_bcast = node.outputs[0].type.broadcastable
             if new_out.type.broadcastable != old_bcast:
-                # check that we did not try discarding a broadcastable dimension
-                assert not any(b_old and not b_new for b_old, b_new in zip(
-                    old_bcast, new_out.type.broadcastable))
+                # check that we did not try discarding a broadcastable
+                # dimension
+                assert not any(b_old and not b_new for b_old, b_new in
+                               zip(old_bcast, new_out.type.broadcastable))
                 # force old broadcasting pattern; we must not change it here
                 new_out = tensor.patternbroadcast(new_out, old_bcast)
             return [new_out]
@@ -2177,6 +2216,7 @@ def local_gpu_eye(node):
         if (host_input.owner and
                 isinstance(host_input.owner.op, tensor.Eye) and
                 host_input.owner.op.dtype == "float32"):
             return [gpu_eye(*host_input.owner.inputs)]
     if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
         if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
@@ -2188,6 +2228,7 @@ def local_gpu_eye(node):
 def safe_to_gpu(x):
     if (isinstance(x.type, tensor.TensorType) and
             x.type.dtype == 'float32'):
         return as_cuda_ndarray_variable(x)
     else:
         return x
@@ -2242,6 +2283,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
 def tensor_to_cuda(x):
     if (isinstance(x.type, tensor.TensorType) and
             x.type.dtype == 'float32'):
         y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
         if x.name:
             y.name = x.name + '[cuda]'
@@ -2264,7 +2306,8 @@ def local_gpu_extract_diagonal(node):
                    theano.tensor.TensorType)):
         inp = node.inputs[0]
         if inp.owner and isinstance(inp.owner.op, HostFromGpu):
-            return [host_from_gpu(nlinalg.extract_diag(as_cuda_ndarray_variable(inp)))]
+            return [host_from_gpu(nlinalg.extract_diag(
+                as_cuda_ndarray_variable(inp)))]
     if isinstance(node.op, GpuFromHost):
         host_input = node.inputs[0]
         if (host_input.owner and
@@ -2300,6 +2343,7 @@ def gpuScanOptimization(node):
                 isinstance(host_input.owner.op, scan_op.Scan) and
                 not host_input.owner.op.info['gpu'] and
                 len(host_input.owner.outputs) == 1):
             # Note that we are not doing the right thing here !!
             # This is because the local optimizer expects only one
             # output that corresponds to the input of ``node``
@@ -2353,6 +2397,7 @@ def gpuScanOptimization(node):
     # scan(host_from_gpu) -> host_from_gpu(GPUscan)
     if (type(node.op) == scan_op.Scan
             and not node.op.info['gpu']):
         if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                 for i in node.inputs]):
@@ -2434,7 +2479,8 @@ optdb.register('gpu_scanOp_make_inplace',
 # @alpha_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=?, nd=4)
 # def local_merge_blocksparse_alpha(node, *inputs):
 #     """
-#     GpuElemwise{mul}(lr, GpuSparseBlockOuter) -> GpuSparseBlockOuter(..., alpha=lr)
+#     GpuElemwise{mul}(lr, GpuSparseBlockOuter) ->
+#     GpuSparseBlockOuter(..., alpha=lr)
 #     """
 #     return [gpu_sparse_block_outer(*inputs)]
@@ -2465,8 +2511,7 @@ def _clear_host_from_gpu(inputs):
     return clean_inputs
 
-@register_meta_opt(SparseBlockGemv, ["gpu_opt", "gpu_local_optimizations"],
-                   0., 'fast_run', 'fast_compile', 'gpu')
+@register_opt()
 @local_optimizer([SparseBlockGemv, GpuFromHost])
 def gpu_sparse_block_gemv_opt(node):
     """
@@ -2493,8 +2538,7 @@ def gpu_sparse_block_gemv_opt(node):
     return [GpuSparseBlockGemv(meta_node.op.inplace)(*inputs)]
 
-@register_meta_opt(SparseBlockOuter, ["gpu_opt", "gpu_local_optimizations"],
-                   0., 'fast_run', 'fast_compile', 'gpu')
+@register_opt()
 @local_optimizer([SparseBlockOuter, GpuFromHost])
 def gpu_sparse_block_outer_opt(node):
     """
@@ -2522,4 +2566,36 @@ def gpu_sparse_block_outer_opt(node):
     return [GpuSparseBlockOuter(meta_node.op.inplace)(*inputs)]
 
+
+@local_optimizer([GpuSparseBlockGemv], inplace=True)
+def local_inplace_gpu_sparse_block_gemv(node):
+    """
+    GpuSparseBlockGemv(inplace=False) -> GpuSparseBlockGemv(inplace=True)
+    """
+    if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace:
+        new_node = gpu_sparse_block_gemv_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_gpu_sparse_block_gemv',
+                       TopoOptimizer(
+                           local_inplace_gpu_sparse_block_gemv,
+                           failure_callback=TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
+
+
+@local_optimizer([GpuSparseBlockOuter], inplace=True)
+def local_inplace_gpu_sparse_block_outer(node):
+    """
+    GpuSparseBlockOuter(inplace=False) -> GpuSparseBlockOuter(inplace=True)
+    """
+    if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace:
+        new_node = gpu_sparse_block_outer_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_gpu_sparse_block_outer',
+                       TopoOptimizer(
+                           local_inplace_gpu_sparse_block_outer,
+                           failure_callback=TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
+
 import theano.sandbox.cuda.extra_ops
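The tail of this file replaces the register_meta_opt registrations with plain register_opt() transfers and adds two TopoOptimizer passes that swap in the inplace GPU ops. A quick way to see the result is to compile a small graph and print it. The snippet below is an illustrative sketch only, not part of the commit; it assumes a CUDA-enabled Theano, and the integer index types are an assumption since their construction is elided in the hunks above:

import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot

b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()  # assumed index dtype
oIdx = tensor.lmatrix()

o = sparse_block_dot(W, h, iIdx, b, oIdx)
f = theano.function([W, h, iIdx, b, oIdx], o)

# On a CUDA device with fast_run, the compiled graph should now contain
# GpuSparseBlockGemv with inplace=True instead of the host op.
theano.printing.debugprint(f)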
@@ -2,170 +2,42 @@
 Optimizations addressing the ops in sandbox root directory
 """
 
-import bisect
-import logging
-
-from theano.compile import optdb
-from theano.gof import local_optimizer, EquilibriumDB
-from theano.tensor.opt import register_specialize
+from theano import compile  # to register the optimizer built by this file
+from theano import gof
 
 from theano.sandbox.blocksparse import (
     SparseBlockGemv,
     SparseBlockOuter,
-    sparse_block_gemv,
-    sparse_block_outer,
     sparse_block_gemv_inplace,
-    sparse_block_outer_inplace,
-    CpuSparseBlockGemv,
-    CpuSparseBlockOuter)
-
-_logger = logging.getLogger('theano.sandbox.opt')
-
-
-def _db_exists(db, db_name):
-    """
-    Tests whether the full path from `db_name[0]` down to
-    `db_name[-1]` exists.
-
-    Parameters
-    ----------
-    db: `theano.gof.optdb.DB`
-        A dataset of optimisations or sub-datasets.
-    db_name: list or tuple of strings
-        Names of datasets from given one `db[db_name[0]]` down
-        to the dataset of interest where to register.
-        ex: ['level_1_dataset', 'level_2_dataset']
-    """
-    if len(db_name) == 1:
-        return db_name[0] in db._names
-    return db_name[0] in db._names and _db_exists(db[db_name[0]], db_name[1:])
-
-
-def _db_register(db, db_name, *args):
-    """
-    Registers an object in last datasets given in db_name. `db_name[-1]`
-    is deep in the hierarchy of `db`.
-
-    Parameters
-    ----------
-    db: `theano.gof.optdb.DB`
-        A dataset of optimisations or sub-datasets.
-    db_name: list or tuple of strings
-        Names of datasets from given one `db[db_name[0]]` down
-        to the dataset of interest where to register.
-        ex: ['level_1_dataset', 'level_2_dataset']
-    """
-    if len(db_name) == 0:
-        return db.register(*args)
-    return _db_register(db[db_name[0]], db_name[1:], *args)
-
-
-def _db_positions(db, db_name, positions=()):
-    """
-    Returns the list of positions of all databases from `db_name[0]`
-    down to `db_name[-1]`. The path is hierarchical, hence `db_name[0]`
-    is in `db`, `db_name[1]` is in `db[db_name[0]]`, etc.
-
-    Parameters
-    ----------
-    db: `theano.gof.optdb.DB`
-        A dataset of optimisations or sub-datasets.
-    db_name: list or tuple of strings
-        Names of datasets from given one `db[db_name[0]]` down
-        to the dataset of interests.
-        ex: ['level_1_dataset', 'level_2_dataset']
-    """
-    if len(db_name) == 0:
-        return positions
-    db_position = db.__position__.get(db_name[0], 0.)
-    return _db_positions(db[db_name[0]], db_name[1:],
-                         positions + (db_position, ))
-
-
-def register_meta_opt(op_class, db_name, position, *args):
-    """
-    Registers a given optimization under given database name and saves
-    optimization information in `op_class.registered_opts`.
-
-    Parameters
-    ----------
-    op_class: `theano.gof.Op`
-        A meta Op which have multiple implementations available
-        for optimization.
-    db_name: string, list or tuple of strings
-        A string if optimization is inserted in `theano.compile.optdb`
-        directly. List is used to insert an optimization deep inside a
-        hierarchy of optimization databases.
-    position: int or float
-        Position of the optimisation in the target dataset.
-        (Position in deep database if not optdb)
-    *args
-        Arguments to register the optimization.
-    """
-    if isinstance(db_name, str):
-        db_name = [db_name]
-
-    def call(local_meta_opt):
-        if not _db_exists(optdb, db_name):
-            # TODO: Would another default DB be better?
-            _db_register(optdb, db_name[:-2],
-                         db_name[-1], EquilibriumDB(), position, *args)
-        _db_register(optdb, db_name,
-                     local_meta_opt.__name__, local_meta_opt, *args)
-        positions = _db_positions(optdb, db_name)
-        idx = bisect.bisect_left((positions, local_meta_opt),
-                                 op_class.registered_opts)
-        op_class.registered_opts.insert(idx,
-                                        (positions, local_meta_opt.__name__))
-        return local_meta_opt
-    return call
-
-
-@register_meta_opt(SparseBlockGemv, ["meta_cpu"], 51.0,
-                   "fast_run", "fast_compile")
-@local_optimizer([SparseBlockGemv])
-def cpu_sparse_block_gemv_opt(node):
-    """
-    SparseBlockGemv -> CpuSparseBlockGemv
-    """
-    return [CpuSparseBlockGemv(node.op.inplace)(*node.inputs)]
-
-
-@register_meta_opt(SparseBlockOuter, ["meta_cpu"], 51.0,
-                   "fast_run", "fast_compile")
-@local_optimizer([SparseBlockOuter])
-def cpu_sparse_block_outer_opt(node):
-    """
-    SparseBlockOuter -> CpuSparseBlockOuter
-    """
-    return [CpuSparseBlockOuter(node.op.inplace)(*node.inputs)]
+    sparse_block_outer_inplace)
 
-@register_specialize
-@local_optimizer([sparse_block_gemv], inplace=True)
-def local_inplace_block_sparse_gemv(node):
+@gof.local_optimizer([SparseBlockGemv], inplace=True)
+def local_inplace_sparse_block_gemv(node):
     """
     SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
     """
-    return [sparse_block_gemv_inplace(*node.inputs)]
+    if isinstance(node.op, SparseBlockGemv) and not node.op.inplace:
+        new_node = sparse_block_gemv_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_sparse_block_gemv',
+                       gof.TopoOptimizer(
+                           local_inplace_sparse_block_gemv,
+                           failure_callback=gof.TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace')  # DEBUG
 
-@register_specialize
-@local_optimizer([sparse_block_outer], inplace=True)
-def local_inplace_block_sparse_outer(node):
+@gof.local_optimizer([SparseBlockOuter], inplace=True)
+def local_inplace_sparse_block_outer(node):
     """
     SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
     """
-    return [sparse_block_outer_inplace(*node.inputs)]
+    if isinstance(node.op, SparseBlockOuter) and not node.op.inplace:
+        new_node = sparse_block_outer_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_sparse_block_outer',
+                       gof.TopoOptimizer(
+                           local_inplace_sparse_block_outer,
+                           failure_callback=gof.TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace')  # DEBUG
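With perform() implemented on the base ops, the block-sparse graph now runs on the CPU as well, and these TopoOptimizer passes merely switch it to the destructive variants under fast_run. A hedged end-to-end sketch, not from the commit: it assumes b is the per-output-block bias of shape (nOutBlocks, outSize) and reuses the illustrative shapes from the NumPy example above:

import numpy
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot

W = tensor.ftensor4()
h = tensor.ftensor3()
b = tensor.fmatrix()
iIdx = tensor.lmatrix()  # assumed index dtype
oIdx = tensor.lmatrix()

o = sparse_block_dot(W, h, iIdx, b, oIdx)
# Excluding the 'inplace' tag keeps the copying op, which is handy when
# debugging destructive updates.
mode = theano.compile.get_default_mode().excluding('inplace')
f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode)

rng = numpy.random.RandomState(0)
out = f(rng.randn(5, 6, 7, 8).astype('float32'),   # W
        rng.randn(2, 3, 7).astype('float32'),      # h
        rng.randint(0, 5, size=(2, 3)),            # iIdx
        rng.randn(6, 8).astype('float32'),         # b (shape assumed)
        rng.randint(0, 6, size=(2, 4)))            # oIdx
print(out.shape)  # (2, 4, 8)
assert not f.maker.fgraph.toposort()[-1].op.inplace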
@@ -11,7 +11,7 @@ from theano import tensor
 import theano.tests.unittest_tools as utt
 
 from theano.sandbox.blocksparse import sparse_block_dot, \
-    cpu_sparse_block_gemv, cpu_sparse_block_outer
+    sparse_block_gemv, sparse_block_outer
 
 class BlockSparse_Gemv_and_Outer(unittest.TestCase):
@@ -24,8 +24,8 @@ class BlockSparse_Gemv_and_Outer(unittest.TestCase):
         self.mode = theano.compile.get_default_mode().excluding(
             'constant_folding'
         )
-        self.gemv_op = cpu_sparse_block_gemv
-        self.outer_op = cpu_sparse_block_outer
+        self.gemv_op = sparse_block_gemv
+        self.outer_op = sparse_block_outer
 
     @staticmethod
     def gemv_data():
...
 import theano
 from theano import tensor
-from theano.sandbox.blocksparse import CpuSparseBlockGemv, \
-    CpuSparseBlockOuter, sparse_block_dot
+from theano.sandbox.blocksparse import sparse_block_dot
 
-def test_blocksparse_cpu_gemv_opt():
+def test_blocksparse_inplace_gemv_opt():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
     h = tensor.ftensor3()
@@ -15,10 +14,13 @@ def test_blocksparse_inplace_gemv_opt():
     f = theano.function([W, h, iIdx, b, oIdx], o)
 
-    assert isinstance(f.maker.fgraph.toposort()[-1].op, CpuSparseBlockGemv)
+    if theano.config.mode == "FAST_COMPILE":
+        assert not f.maker.fgraph.toposort()[-1].op.inplace
+    else:
+        assert f.maker.fgraph.toposort()[-1].op.inplace
 
-def test_blocksparse_cpu_outer_opt():
+def test_blocksparse_inplace_outer_opt():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
     h = tensor.ftensor3()
@@ -32,4 +34,7 @@ def test_blocksparse_inplace_outer_opt():
     f = theano.function([W, h, iIdx, b, oIdx],
                         [o, tensor.grad(o.sum(), wrt=W)])
 
-    assert isinstance(f.maker.fgraph.toposort()[-1].op, CpuSparseBlockOuter)
+    if theano.config.mode == "FAST_COMPILE":
+        assert not f.maker.fgraph.toposort()[-1].op.inplace
+    else:
+        assert f.maker.fgraph.toposort()[-1].op.inplace