Commit 88599bc3 authored by Brandon T. Willard

Apply isort to theano.gpuarray and remove circular references

Two new modules were made in order to remove the circular references: `theano.gpuarray.dnn_opt` (i.e. optimizations specific to `theano.gpuarray.dnn`) and `theano.gpuarray.optdb` (i.e. optimization databases for gpuarray `Op`s).
Parent b4fbaa2e
import sys
import os
import logging import logging
import os
import sys
import warnings import warnings
import theano import theano
from theano import config from theano import config
from theano.compile import optdb from theano.compile import optdb
from theano.tensor.basic import register_transfer from theano.tensor.basic import register_transfer
_logger_name = "theano.gpuarray" _logger_name = "theano.gpuarray"
_logger = logging.getLogger(_logger_name) _logger = logging.getLogger(_logger_name)
...@@ -24,19 +24,20 @@ try: ...@@ -24,19 +24,20 @@ try:
except ImportError: except ImportError:
pygpu = None pygpu = None
from . import ctc, dnn, extra_ops, fft, multinomial, opt, reduction, rng_mrg, sort
from .basic_ops import as_gpuarray_variable
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import ( from .type import (
GpuArrayType, ContextNotDefined,
GpuArrayVariable,
GpuArrayConstant, GpuArrayConstant,
GpuArraySharedVariable, GpuArraySharedVariable,
GpuArrayType,
GpuArrayVariable,
get_context,
gpuarray_shared_constructor, gpuarray_shared_constructor,
reg_context, reg_context,
get_context,
ContextNotDefined,
) )
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, sort, rng_mrg, ctc
def transfer(x, target): def transfer(x, target):
...@@ -292,7 +293,7 @@ if pygpu: ...@@ -292,7 +293,7 @@ if pygpu:
host_from_gpu, host_from_gpu,
) )
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .subtensor import GpuSubtensor, GpuIncSubtensor, GpuAdvancedIncSubtensor1 from .subtensor import GpuAdvancedIncSubtensor1, GpuIncSubtensor, GpuSubtensor
else: else:
if ( if (
......
import os
import copy import copy
import os
import re import re
import numpy as np
import theano
from collections import deque from collections import deque
import numpy as np
from six import string_types from six import string_types
from theano import Op, Apply, Type, Variable import theano
from theano import tensor, config from theano import Apply, Op, Type, Variable, config, tensor
from theano.gof import COp, HideC, ParamsType
from theano.gof.opt import copy_stack_trace
from theano.gof.utils import MethodNotDefined
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.scalar import bool as bool_t, int32 as int32_t from theano.scalar import bool as bool_t
from theano.tensor.basic import Alloc, AllocEmpty, alloc_validate_shape, Join, Split from theano.scalar import int32 as int32_t
from theano.tensor.basic import Alloc, AllocEmpty, Join, Split, alloc_validate_shape
from theano.gof import HideC, COp, ParamsType
from theano.gof.utils import MethodNotDefined
from theano.gof.opt import copy_stack_trace
try: try:
import pygpu import pygpu
...@@ -26,15 +23,15 @@ try: ...@@ -26,15 +23,15 @@ try:
except ImportError: except ImportError:
pass pass
from .fp16_help import write_w
from .type import ( from .type import (
GpuArrayType, EQ_MAP,
ContextNotDefined,
GpuArrayConstant, GpuArrayConstant,
gpu_context_type, GpuArrayType,
get_context, get_context,
ContextNotDefined, gpu_context_type,
EQ_MAP,
) )
from .fp16_help import write_w
def as_gpuarray_variable(x, context_name): def as_gpuarray_variable(x, context_name):
...@@ -1000,7 +997,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -1000,7 +997,7 @@ class GpuAlloc(HideC, Alloc):
return (4,) return (4,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
from . import subtensor, blas from . import blas, subtensor
for client in node.outputs[0].clients: for client in node.outputs[0].clients:
if client[0] == "output": if client[0] == "output":
......
...@@ -2,7 +2,6 @@ from six import integer_types ...@@ -2,7 +2,6 @@ from six import integer_types
import theano import theano
from theano import Apply, Op from theano import Apply, Op
from theano.compile import optdb from theano.compile import optdb
from theano.gof import LocalOptGroup, ParamsType from theano.gof import LocalOptGroup, ParamsType
from theano.scalar import bool as bool_t from theano.scalar import bool as bool_t
...@@ -10,15 +9,16 @@ from theano.tensor.basic import as_tensor_variable ...@@ -10,15 +9,16 @@ from theano.tensor.basic import as_tensor_variable
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
from .basic_ops import ( from .basic_ops import (
GpuArrayType,
CGpuKernelBase, CGpuKernelBase,
GpuArrayType,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous, gpu_contiguous,
infer_context_name,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt_util import inplace_allocempty from .opt_util import inplace_allocempty
try: try:
import pygpu import pygpu
from pygpu import blas from pygpu import blas
......
import logging import logging
import numpy as np import numpy as np
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof import COp, ParamsType from theano.gof import COp, ParamsType
from theano.tensor import discrete_dtypes, as_tensor_variable
from theano.scalar import bool as bool_t
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.scalar import bool as bool_t
from theano.tensor import as_tensor_variable, discrete_dtypes
from .basic_ops import as_gpuarray_variable, gpuarray_helper_inc_dir, infer_context_name
from .type import gpu_context_type from .type import gpu_context_type
from .basic_ops import as_gpuarray_variable, infer_context_name, gpuarray_helper_inc_dir
_logger = logging.getLogger("theano.gpuarray.blocksparse") _logger = logging.getLogger("theano.gpuarray.blocksparse")
......
...@@ -4,22 +4,20 @@ import sys ...@@ -4,22 +4,20 @@ import sys
import theano import theano
import theano.tensor as tt import theano.tensor as tt
import theano.tensor.nnet.ctc import theano.tensor.nnet.ctc
from theano import config, gof from theano import config, gof
from theano.gof import local_optimizer
from theano.gpuarray import pygpu
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
gpu_contiguous,
as_gpuarray_variable, as_gpuarray_variable,
infer_context_name, gpu_contiguous,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.gpuarray.elemwise import GpuDimShuffle from theano.gpuarray.elemwise import GpuDimShuffle
from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.gof import local_optimizer
from theano.tensor.opt import register_canonicalize
from theano.tensor.nnet.ctc import ctc_available from theano.tensor.nnet.ctc import ctc_available
from theano.tensor.opt import register_canonicalize
from theano.gpuarray import pygpu
class GpuConnectionistTemporalClassification(gof.COp): class GpuConnectionistTemporalClassification(gof.COp):
......
...@@ -18,6 +18,7 @@ Currently supported cuDNN APIs: ...@@ -18,6 +18,7 @@ Currently supported cuDNN APIs:
from theano.gof import CEnumType from theano.gof import CEnumType
HALF, FLOAT, DOUBLE = ("float16", "float32", "float64") HALF, FLOAT, DOUBLE = ("float16", "float32", "float64")
TRUE_HALF_CONFIG = (HALF, HALF) TRUE_HALF_CONFIG = (HALF, HALF)
PSEUDO_HALF_CONFIG = (HALF, FLOAT) PSEUDO_HALF_CONFIG = (HALF, FLOAT)
......
...@@ -2,85 +2,51 @@ import ctypes ...@@ -2,85 +2,51 @@ import ctypes
import os import os
import sys import sys
import warnings import warnings
from functools import reduce
import numpy as np import numpy as np
from six import integer_types
import theano import theano
import theano.pathparse import theano.pathparse
from theano import Apply, Op, Variable, config, tensor
from functools import reduce from theano.compile.ops import shape_i, shape_i_op
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from six import integer_types from theano.gof import COp, EnumList, ParamsType
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import (
as_scalar,
constant,
Log,
get_scalar_type,
int32 as int_t,
bool as bool_t,
uint32 as uint32_t,
)
from theano.tensor import as_tensor_variable, Argmax
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.gof.opt import inherit_stack_trace from theano.gpuarray import cudnn_defs, pygpu
from theano.tensor.opt import Assert from theano.gpuarray.basic_ops import (
from theano.compile import optdb GpuAllocEmpty,
from theano.compile.ops import shape_i, shape_i_op GpuArrayType,
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad HostFromGpu,
as_gpuarray_variable,
empty_like,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from theano.gpuarray.type import GpuArraySharedVariable, get_context, gpu_context_type
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.scalar import as_scalar
from theano.scalar import bool as bool_t
from theano.scalar import constant, get_scalar_type
from theano.scalar import int32 as int_t
from theano.scalar import uint32 as uint32_t
from theano.tensor.basic import as_tensor_variable
from theano.tensor.extra_ops import cpu_contiguous
from theano.tensor.nnet.abstract_conv import ( from theano.tensor.nnet.abstract_conv import (
AbstractConv2d, AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs, AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
AbstractConv3d, AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs, AbstractConv3d_gradInputs,
get_conv_output_shape, AbstractConv3d_gradWeights,
assert_conv_shape, assert_conv_shape,
get_conv_output_shape,
) )
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad from theano.tensor.opt import Assert
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from . import pygpu, cudnn_defs
from .type import get_context, gpu_context_type, list_contexts, GpuArraySharedVariable
from .basic_ops import (
as_gpuarray_variable,
infer_context_name,
gpuarray_helper_inc_dir,
gpu_contiguous,
GpuAllocEmpty,
empty_like,
GpuArrayType,
HostFromGpu,
)
from .elemwise import GpuElemwise, GpuCAReduceCuda
from .reduction import GpuMaxAndArgmax
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import (
gpu_seqopt,
register_opt,
pool_db,
pool_db2,
op_lifter,
register_opt2,
register_inplace,
)
from .opt_util import (
alpha_merge,
output_merge,
inplace_allocempty,
pad_dims,
unpad_dims,
)
DNN_CONV_ALGO_CHOOSE_ONCE = ["guess_once", "time_once"] DNN_CONV_ALGO_CHOOSE_ONCE = ["guess_once", "time_once"]
DNN_CONV_ALGO_CHOOSE_TIME = ["time_once", "time_on_shape_change"] DNN_CONV_ALGO_CHOOSE_TIME = ["time_once", "time_on_shape_change"]
...@@ -3923,779 +3889,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3923,779 +3889,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
return [rval] return [rval]
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node):
    """Lift an abstract 2d/3d convolution on GPU inputs to its cuDNN graph.

    Bails out (returns None) for non-GPU inputs, unshared convolutions,
    and asymmetric tuple padding, none of which are handled here.
    """
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    op = node.op
    if op.unshared:
        return None
    border = op.border_mode
    if isinstance(border, tuple):
        # Asymmetric padding not yet supported
        if any(isinstance(p, tuple) for p in border):
            return None
    if isinstance(op, AbstractConv2d):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return builder(op, context, node.inputs, node.outputs)
@local_optimizer(
    [AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]
)
def local_abstractconv_cudnn_alt(node):
    """Alternative cuDNN lifting for abstract 2d convolution ops.

    Rather than mapping each abstract op to its direct cuDNN counterpart,
    this rewrite realizes the forward pass through a backprop direction
    hint, and the gradients through a forward conv on transposed tensors,
    for the shapes/modes where that is valid.  Returns None (no
    replacement) whenever the configuration is unsupported.
    """
    if not isinstance(
        node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)
    ):
        return
    # Dilated convolutions need cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv2d):
        # Forward conv expressed through a backprop kernel: only valid for
        # "full" (-> bprop inputs) or undilated "valid" (-> bprop weights).
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1)
            and filter_dilation == (1, 1)
            and num_groups == 1
        ):
            # Weight gradient computed as a forward "cross" conv between
            # the transposed image and the transposed output gradient.
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            # num_groups is guaranteed to be 1 by the guard above, so the
            # descriptor does not need it (unlike the 3d variant).
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the kernel flip implied by "conv" mode.
                conv = conv[:, :, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            # Input gradient computed as a "full" forward conv with
            # swapped kernel axes and the conv mode flipped.
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer(
    [AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs]
)
def local_abstractconv3d_cudnn_alt(node):
    """3d counterpart of ``local_abstractconv_cudnn_alt``.

    Realizes each abstract 3d conv op through an alternative cuDNN kernel
    (forward via a backprop direction hint, gradients via a forward conv
    on transposed tensors).  Returns None when unsupported.
    """
    if not isinstance(
        node.op, (AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs)
    ):
        return
    # Dilation requires cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv3d):
        # Forward conv through a backprop kernel: only "full" or
        # undilated "valid" border modes apply.
        if border_mode == "half" or subsample != (1, 1, 1) or num_groups > 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv3d(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
        )
    elif isinstance(op, AbstractConv3d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1, 1)
            and filter_dilation == (1, 1, 1)
            and num_groups == 1
        ):
            # Weight gradient as a forward "cross" conv on transposed
            # image / output-gradient tensors.
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                num_groups=num_groups,
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the kernel flip implied by "conv" mode.
                conv = conv[:, :, ::-1, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv3d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1, 1) and num_groups == 1:
            # Input gradient as a "full" forward conv with swapped kernel
            # axes and the conv mode flipped.
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                num_groups=num_groups,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
    """Lift an abstract conv weight-gradient on GPU inputs to its cuDNN graph.

    Skips non-GPU inputs, unshared convolutions, and asymmetric padding.
    """
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    op = node.op
    if op.unshared:
        return None
    border = op.border_mode
    if isinstance(border, tuple):
        # Asymmetric padding not yet supported
        if any(isinstance(p, tuple) for p in border):
            return None
    if isinstance(op, AbstractConv2d_gradWeights):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradWeights):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return builder(op, context, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node):
    """Lift an abstract conv input-gradient on GPU inputs to its cuDNN graph.

    Skips non-GPU inputs, unshared convolutions, and asymmetric padding.
    """
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    op = node.op
    if op.unshared:
        return None
    border = op.border_mode
    if isinstance(border, tuple):
        # Asymmetric padding not yet supported
        if any(isinstance(p, tuple) for p in border):
            return None
    if isinstance(op, AbstractConv2d_gradInputs):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradInputs):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return builder(op, context, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    # Rebuild the convolution as an in-place op over the AllocEmpty output.
    op = node.op
    inplace_op = GpuDnnConv(algo=op.algo, inplace=True, num_groups=op.num_groups)
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    # Rebuild the weight gradient as an in-place op over the AllocEmpty output.
    op = node.op
    inplace_op = GpuDnnConvGradW(algo=op.algo, inplace=True, num_groups=op.num_groups)
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    # Rebuild the input gradient as an in-place op over the AllocEmpty output.
    op = node.op
    inplace_op = GpuDnnConvGradI(algo=op.algo, inplace=True, num_groups=op.num_groups)
    return [inplace_op(*inputs)]
# Register the three in-place convolution rewrites as one in2out pass,
# late in the pipeline (position 70.0) and only under the listed tags.
optdb.register(
    "local_dnna_conv_inplace",
    tensor.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    70.0,
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
)
@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    # Re-emit the conv with the merged alpha/beta scaling inputs.
    op = node.op
    return [GpuDnnConv(algo=op.algo, num_groups=op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    # Re-emit the weight gradient with the merged alpha/beta inputs.
    op = node.op
    return [GpuDnnConvGradW(algo=op.algo, num_groups=op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    # Re-emit the input gradient with the merged alpha/beta inputs.
    op = node.op
    return [GpuDnnConvGradI(algo=op.algo, num_groups=op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    # The merged output buffer must be made contiguous for cuDNN.
    merged = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    op = node.op
    return [GpuDnnConv(algo=op.algo, num_groups=op.num_groups)(*merged)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    # The merged output buffer must be made contiguous for cuDNN.
    merged = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    op = node.op
    return [GpuDnnConvGradW(algo=op.algo, num_groups=op.num_groups)(*merged)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    # The merged output buffer must be made contiguous for cuDNN.
    merged = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    op = node.op
    return [GpuDnnConvGradI(algo=op.algo, num_groups=op.num_groups)(*merged)]
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
    """Replace a Pool op by cuDNN pooling (2d/3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    # Reshape to 4D/5D with 2 non-pooling dimensions, pool, then restore.
    padded = pad_dims(img, 2, nd)
    pooled = dnn_pool(padded, ws, stride=stride, pad=pad, mode=mode)
    return unpad_dims(pooled, img, 2, nd)
# Register the cuDNN pooling rewrite in both pooling databases (pool_db
# wraps it in op_lifter, pool_db2 as a plain local optimizer) with
# position=0 so it is tried first.
pool_db.register(
    "local_gpua_pool_dnn_alternative",
    op_lifter([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_alternative",
    local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Replace a max-pool gradient by GpuDnnPoolGrad (2d/3d, ignore_border)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    def _prep(v):
        # Move the variable to this context and make it contiguous.
        return gpu_contiguous(as_gpuarray_variable(v, ctx_name))

    inp = _prep(inp)
    out = _prep(out)
    out_grad = _prep(out_grad)
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return grad_op(inp, out, out_grad, ws, stride, pad)
    # Reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    result = grad_op(
        pad_dims(inp, 2, nd),
        pad_dims(out, 2, nd),
        pad_dims(out_grad, 2, nd),
        ws,
        stride,
        pad,
    )
    return unpad_dims(result, inp, 2, nd)
# Register the max-pool-gradient rewrite in both pooling databases,
# first in line (position=0), under the same tags as the forward rewrite.
pool_db.register(
    "local_gpua_pool_dnn_grad_stride",
    op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_grad_stride",
    local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Replace an average-pool gradient by GpuDnnPoolGrad (2d/3d, ignore_border)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    def _prep(v):
        # Move the variable to this context and make it contiguous.
        return gpu_contiguous(as_gpuarray_variable(v, ctx_name))

    inp = _prep(inp)
    out_grad = _prep(out_grad)
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return grad_op(inp, out_grad, out_grad, ws, stride, pad)
    # Reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    inp_padded = pad_dims(inp, 2, nd)
    grad_padded = pad_dims(out_grad, 2, nd)
    result = grad_op(inp_padded, grad_padded, grad_padded, ws, stride, pad)
    return unpad_dims(result, inp, 2, nd)
# Register the average-pool-gradient rewrite in both pooling databases,
# first in line (position=0), under the same tags as the other pool rewrites.
pool_db.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
@register_opt("cudnn", "fast_compile")
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
    """Swap GpuSoftmax for cuDNN's softmax via a temporary 4d view."""
    if not isinstance(node.op, GpuSoftmax):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    # GpuDnnSoftmax works on 4d input; append two broadcastable dims.
    expanded = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    result = GpuDnnSoftmax("accurate", "channel")(expanded)
    result = as_gpuarray_variable(result.dimshuffle(0, 1), result.type.context_name)
    return [result]
@register_opt("cudnn", "stabilize")
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
    """Fuse log(GpuDnnSoftmax(x)) into a single log-mode GpuDnnSoftmax."""
    # Matching a GpuDnnSoftmax producer already guarantees cuDNN is present.
    if not (isinstance(node.op, GpuElemwise) and isinstance(node.op.scalar_op, Log)):
        return
    producer = node.inputs[0].owner
    if producer is None or not isinstance(producer.op, GpuDnnSoftmax):
        return
    # Only fuse when the softmax output feeds nothing but this log.
    if len(node.inputs[0].clients) != 1:
        return
    log_softmax = GpuDnnSoftmax("log", producer.op.mode)
    return [log_softmax(producer.inputs[0])]
@register_opt("cudnn", "fast_compile")
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], "fast_compile", "cudnn")
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    """Lift a 2d LogSoftmax onto the GPU as cuDNN's log-mode softmax."""
    x = inputs[0]
    if x.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return
    # GpuDnnSoftmax wants a 4d input; append two broadcastable axes.
    x = x.dimshuffle(0, 1, "x", "x")
    x.tag.context_name = ctx_name
    result = GpuDnnSoftmax("log", "channel")(gpu_contiguous(x))
    return [result.dimshuffle(0, 1)]
@register_opt("cudnn", "fast_compile")
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], "cudnn", "fast_compile")
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """Lift SoftmaxGrad to cuDNN's softmax gradient (2d inputs only)."""
    if not dnn_available(ctx_name):
        return
    reshaped = []
    for inp in inputs:
        var = as_gpuarray_variable(inp, ctx_name)
        if var.ndim != 2:
            return
        # GpuDnnSoftmaxGrad wants 4d input in "instance" layout.
        reshaped.append(var.dimshuffle(0, "x", 1, "x"))
    grad = GpuDnnSoftmaxGrad("accurate", "instance")(
        gpu_contiguous(reshaped[0]), gpu_contiguous(reshaped[1])
    )
    return [grad.dimshuffle(0, 2)]
@register_opt("cudnn")
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(node):
    """Replace a GpuCAReduceCuda by cuDNN's tensor reduction when legal.

    Requires cuDNN >= v6, at most 8 input dimensions, float dtypes, an
    accumulator dtype cuDNN can provide, and a reduction operator that
    maps onto cudnnReduceTensorOp_t.
    """
    if not isinstance(node.op, GpuCAReduceCuda):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return
    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)
    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # Reject accumulator dtypes that do not match what cuDNN uses for
    # each input dtype.
    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return
    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return
    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        return a

    def _square(a):
        return GpuElemwise(theano.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity

    # A pre-scalar op means the reduction is really a norm:
    # sum(sqr(x)) -> norm2 (re-square the result afterwards),
    # sum(abs(x)) -> norm1, max(abs(x)) -> absmax.
    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and isinstance(
            node.op.pre_scalar_op, theano.scalar.basic.Abs
        ):
            scal = "absmax"
        else:
            return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return

    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype, False)(
            node.inputs[0]
        )
        return [post(ret)]
@register_opt("cudnn")
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(node):
    """Compute GpuMaxAndArgmax with cuDNN's indexed reduction.

    Applies only with cuDNN >= v6, at most 8 dimensions, matching float
    input/output dtypes, and axes given in sorted order (the axis order
    changes the meaning of the returned indices).
    """
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return
    if node.inputs[0].dtype != node.outputs[0].dtype:
        return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", node.op.axis, node.outputs[0].dtype, node.outputs[0].dtype, True
    )(node.inputs[0])
    # cudnn can only return int32 indices; widen to the int64 the
    # original op produces.
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"), node.outputs[1].type.context_name),
    )
@register_opt("cudnn", "fast_compile")
@op_lifter([Argmax])
@register_opt2([Argmax], "fast_compile", "cudnn")
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift Argmax to cuDNN's indexed reduction (cuDNN >= v6 only).

    Requires at most 8 dimensions, a float dtype, and axes in sorted
    order.  Only the index output of the reduction is used.
    """
    if not dnn_available(ctx_name):
        return
    if version(raises=False) < 6000:
        return
    if inputs[0].ndim > 8:
        return
    if inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", op.axis, inputs[0].dtype, inputs[0].dtype, True
    )(*inputs)
    # cuDNN returns int32 indices; convert to int64 for the Argmax contract.
    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
class NoCuDNNRaise(Optimizer):
    """Global optimizer that fails loudly when cuDNN is unavailable."""

    def apply(self, fgraph):
        """
        Raise a error if cudnn can't be used.
        """
        for ctx in list_contexts():
            if dnn_available(ctx):
                continue
            # Raise immediately so dnn_available.msg still describes this
            # failing context; we want Theano to fail, not just skip
            # this optimization.
            raise AssertionError(
                "cuDNN optimization was enabled, but Theano was not able "
                "to use it for context "
                + str(ctx)
                + ". We got this error: \n"
                + dnn_available.msg
            )
# Registered at position 0 under the "cudnn" tag so the availability
# check runs before the other GPU optimizations in the sequence.
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, "cudnn")
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs): def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5] x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None running_mean = inputs[5] if len(inputs) > 5 else None
...@@ -4740,60 +3933,6 @@ def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs): ...@@ -4740,60 +3933,6 @@ def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
return results return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(node):
    """Rebuild a GpuDnnBatchNorm so its output overwrites the input buffer."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=op.inplace_running_var,
        inplace_output=True,
    )
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(node):
    """Rebuild a GpuDnnBatchNorm so the running mean is updated in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_mean:
        return
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=True,
        inplace_running_var=op.inplace_running_var,
        inplace_output=op.inplace_output,
    )
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(node):
    """Rebuild a GpuDnnBatchNorm so the running variance is updated in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_var:
        return
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=True,
        inplace_output=op.inplace_output,
    )
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(node):
    """Make the cuDNN batch-norm inference op write its output in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return
    return [GpuDnnBatchNormInference(mode=op.mode, inplace=True)(*node.inputs)]
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs): def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
x, dy, scale, x_mean, x_invstd, epsilon = inputs x, dy, scale, x_mean, x_invstd, epsilon = inputs
......
import theano
from theano.compile import optdb
from theano.compile.ops import shape_i_op
from theano.gof.opt import Optimizer, inherit_stack_trace, local_optimizer
from theano.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuArrayType,
as_gpuarray_variable,
gpu_contiguous,
infer_context_name,
)
from theano.gpuarray.dnn import (
GpuDnnBatchNorm,
GpuDnnBatchNormInference,
GpuDnnConv,
GpuDnnConvDesc,
GpuDnnConvGradI,
GpuDnnConvGradW,
GpuDnnPoolGrad,
GpuDnnReduction,
GpuDnnSoftmax,
GpuDnnSoftmaxGrad,
cudnn,
dnn_available,
dnn_conv,
dnn_conv3d,
dnn_pool,
get_precision,
local_abstractconv3d_cudnn_graph,
local_abstractconv_cudnn_graph,
version,
)
from theano.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise
from theano.gpuarray.nnet import GpuSoftmax
from theano.gpuarray.opt_util import (
alpha_merge,
inplace_allocempty,
op_lifter,
output_merge,
pad_dims,
unpad_dims,
)
from theano.gpuarray.optdb import (
gpu_seqopt,
pool_db,
pool_db2,
register_inplace,
register_opt,
register_opt2,
)
from theano.gpuarray.reduction import GpuMaxAndArgmax
from theano.gpuarray.type import list_contexts
from theano.scalar import Log
from theano.tensor import Argmax
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (
AbstractConv2d,
AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
AbstractConv3d,
AbstractConv3d_gradInputs,
AbstractConv3d_gradWeights,
assert_conv_shape,
get_conv_output_shape,
)
from theano.tensor.signal.pool import AveragePoolGrad, MaxPoolGrad, Pool
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node):
    """Replace an abstract 2d/3d convolution on GPU data with its cuDNN graph."""
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    # Unshared (locally connected) convolutions have no cuDNN implementation.
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d):
        with inherit_stack_trace(node.outputs):
            return local_abstractconv_cudnn_graph(
                node.op, ctx, node.inputs, node.outputs
            )
    elif isinstance(node.op, AbstractConv3d):
        with inherit_stack_trace(node.outputs):
            return local_abstractconv3d_cudnn_graph(
                node.op, ctx, node.inputs, node.outputs
            )
@local_optimizer(
    [AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]
)
def local_abstractconv_cudnn_alt(node):
    """Alternative cuDNN lowering for 2d convolutions.

    Expresses a forward convolution as a cuDNN backprop (and vice versa)
    when a direction hint suggests that formulation is faster.  Returns a
    one-element list with the replacement variable, or None to skip.
    """
    if not isinstance(
        node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)
    ):
        return
    # Dilated convolutions need cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    # filter_flip selects true convolution vs. cross-correlation.
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1)
            and filter_dilation == (1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            # Swap batch and channel axes so the weight gradient can be
            # computed as a forward cross-correlation.
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the implicit filter flip of true convolution.
                conv = conv[:, :, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            # The input gradient is a full-mode convolution with the
            # opposite conv/cross mode.
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer(
    [AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs]
)
def local_abstractconv3d_cudnn_alt(node):
    """3d counterpart of ``local_abstractconv_cudnn_alt``.

    Re-expresses a 3d convolution (or one of its gradients) as a cuDNN call
    in the direction hinted to be faster.  Returns a one-element list with
    the replacement variable, or None to skip.
    """
    if not isinstance(
        node.op, (AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs)
    ):
        return
    # Dilated convolutions need cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    # filter_flip selects true convolution vs. cross-correlation.
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv3d):
        if border_mode == "half" or subsample != (1, 1, 1) or num_groups > 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv3d(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
        )
    elif isinstance(op, AbstractConv3d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1, 1)
            and filter_dilation == (1, 1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            # Swap batch and channel axes so the weight gradient can be
            # computed as a forward cross-correlation.
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                num_groups=num_groups,
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the implicit filter flip of true convolution.
                conv = conv[:, :, ::-1, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv3d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            # The input gradient is a full-mode convolution with the
            # opposite conv/cross mode.
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                num_groups=num_groups,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
    """Lower an abstract gradient-of-weights convolution to its cuDNN graph."""
    op = node.op
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if op.unshared:
        return None
    # A per-border tuple inside border_mode means asymmetric padding,
    # which cuDNN does not support yet.
    if isinstance(op.border_mode, tuple) and any(
        isinstance(pad, tuple) for pad in op.border_mode
    ):
        return None
    if isinstance(op, AbstractConv2d_gradWeights):
        build_graph = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradWeights):
        build_graph = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return build_graph(op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node):
    """Lower an abstract gradient-of-inputs convolution to its cuDNN graph."""
    op = node.op
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if op.unshared:
        return None
    # A per-border tuple inside border_mode means asymmetric padding,
    # which cuDNN does not support yet.
    if isinstance(op.border_mode, tuple) and any(
        isinstance(pad, tuple) for pad in op.border_mode
    ):
        return None
    if isinstance(op, AbstractConv2d_gradInputs):
        build_graph = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradInputs):
        build_graph = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return build_graph(op, ctx, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    """Rebuild GpuDnnConv with inplace=True so it reuses its output buffer."""
    new_op = GpuDnnConv(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [new_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Rebuild GpuDnnConvGradW with inplace=True so it reuses its output buffer."""
    new_op = GpuDnnConvGradW(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [new_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Rebuild GpuDnnConvGradI with inplace=True so it reuses its output buffer."""
    new_op = GpuDnnConvGradI(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [new_op(*inputs)]
# Register the in-place conv rewrites late (position 70) so they run after
# the graph has been fully lowered to cuDNN ops.
optdb.register(
    "local_dnna_conv_inplace",
    theano.tensor.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    70.0,
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
)
@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    # Fold surrounding scalar alpha/beta multipliers into the cuDNN conv call.
    return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    # Fold surrounding scalar alpha/beta multipliers into the weight-grad call.
    return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    # Fold surrounding scalar alpha/beta multipliers into the input-grad call.
    return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    # Merge an addition into the conv's output buffer; cuDNN needs it contiguous.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    # Merge an addition into the weight-grad output buffer (made contiguous).
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    # Merge an addition into the input-grad output buffer (made contiguous).
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
    """Lift a Pool op to cuDNN pooling (2d or 3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    # cuDNN pooling always ignores partial windows at the border.
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        img_padded = pad_dims(img, 2, nd)
        ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
        return unpad_dims(ret_padded, img, 2, nd)
# Register at position 0 so the cuDNN pooling lifter is tried before the
# generic GPU pooling optimizations (both lifter and local-opt forms).
pool_db.register(
    "local_gpua_pool_dnn_alternative",
    op_lifter([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_alternative",
    local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Lift a MaxPoolGrad op to GpuDnnPoolGrad (2d or 3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    # cuDNN pooling always ignores partial windows at the border.
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return GpuDnnPoolGrad(mode=mode)(inp, out, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_padded = pad_dims(out, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(
            inp_padded, out_padded, out_grad_padded, ws, stride, pad
        )
        return unpad_dims(ret_padded, inp, 2, nd)
# Register at position 0 so the cuDNN max-pool gradient lifter is tried
# before the generic GPU pooling optimizations.
pool_db.register(
    "local_gpua_pool_dnn_grad_stride",
    op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_grad_stride",
    local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Lift an AveragePoolGrad op to GpuDnnPoolGrad (2d or 3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    # cuDNN pooling always ignores partial windows at the border.
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(
            inp_padded, out_grad_padded, out_grad_padded, ws, stride, pad
        )
        return unpad_dims(ret_padded, inp, 2, nd)
# Register at position 0 so the cuDNN average-pool gradient lifter is tried
# before the generic GPU pooling optimizations.
pool_db.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
@register_opt("cudnn", "fast_compile")
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
    """Swap a GpuSoftmax for the cuDNN softmax when cuDNN is usable."""
    if not isinstance(node.op, GpuSoftmax):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    # cuDNN softmax operates on 4D tensors; append two broadcastable dims.
    x = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    softmaxed = GpuDnnSoftmax("accurate", "channel")(x)
    result = as_gpuarray_variable(
        softmaxed.dimshuffle(0, 1), softmaxed.type.context_name
    )
    return [result]
@register_opt("cudnn", "stabilize")
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
    """Fuse log(GpuDnnSoftmax(x)) into a single numerically-stable log-softmax."""
    # This looks for GpuDnnSoftmax so we know that we have cudnn.
    if (
        isinstance(node.op, GpuElemwise)
        and isinstance(node.op.scalar_op, Log)
        and node.inputs[0].owner
        and isinstance(node.inputs[0].owner.op, GpuDnnSoftmax)
        # Only fuse when the softmax output has no other consumers.
        and len(node.inputs[0].clients) == 1
    ):
        softmax_node = node.inputs[0].owner
        new_softmax = GpuDnnSoftmax("log", softmax_node.op.mode)
        return [new_softmax(softmax_node.inputs[0])]
@register_opt("cudnn", "fast_compile")
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], "fast_compile", "cudnn")
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    """Lift a CPU LogSoftmax on a 2D input to cuDNN's log-softmax."""
    # Transform the input in the format expected by GpuDnnSoftmax
    inp = inputs[0]
    if inp.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return
    # cuDNN softmax operates on 4D tensors; append two broadcastable dims.
    inp = inp.dimshuffle(0, 1, "x", "x")
    inp.tag.context_name = ctx_name
    # Apply GpuDnnSoftmax and return the result
    out = GpuDnnSoftmax("log", "channel")(gpu_contiguous(inp))
    return [out.dimshuffle(0, 1)]
@register_opt("cudnn", "fast_compile")
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], "cudnn", "fast_compile")
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """Lift a CPU SoftmaxGrad on 2D inputs to GpuDnnSoftmaxGrad."""
    if not dnn_available(ctx_name):
        return
    reshaped = []
    for inp in inputs:
        var = as_gpuarray_variable(inp, ctx_name)
        if var.ndim != 2:
            return
        # cuDNN expects 4D tensors; insert broadcastable dims.
        reshaped.append(var.dimshuffle(0, "x", 1, "x"))
    grad = GpuDnnSoftmaxGrad("accurate", "instance")(
        gpu_contiguous(reshaped[0]), gpu_contiguous(reshaped[1])
    )
    return [grad.dimshuffle(0, 2)]
@register_opt("cudnn")
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(node):
    """Replace a GpuCAReduceCuda with a cuDNN tensor reduction when supported.

    Maps Theano scalar reduce ops (and their pre-scalar ops) onto cuDNN's
    reduce-tensor operations, e.g. sum-of-squares -> norm2, sum-of-abs ->
    norm1, max-of-abs -> absmax.
    """
    if not isinstance(node.op, GpuCAReduceCuda):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    # cuDNN tensor reductions require v6+.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if node.inputs[0].ndim > 8:
        return
    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)
    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # cuDNN constrains which accumulator dtype goes with each input dtype.
    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return
    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return
    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return
    def _identity(a):
        return a
    def _square(a):
        return GpuElemwise(theano.scalar.basic.sqr)(a)
    scal = node.op.scalar_op.name
    post = _identity
    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
                # sum(sqr(x)) == norm2(x)**2, so square the result afterwards.
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and isinstance(
            node.op.pre_scalar_op, theano.scalar.basic.Abs
        ):
            scal = "absmax"
        else:
            return
    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return
    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype, False)(
            node.inputs[0]
        )
        return [post(ret)]
@register_opt("cudnn")
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(node):
    """Replace GpuMaxAndArgmax with a cuDNN max-reduction returning indices."""
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    # GpuDnnReduction with indices requires cuDNN v6 or newer.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if node.inputs[0].ndim > 8:
        return
    if node.inputs[0].dtype != node.outputs[0].dtype:
        return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", node.op.axis, node.outputs[0].dtype, node.outputs[0].dtype, True
    )(node.inputs[0])
    # cudnn can only return int32 indices
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"), node.outputs[1].type.context_name),
    )
@register_opt("cudnn", "fast_compile")
@op_lifter([Argmax])
@register_opt2([Argmax], "fast_compile", "cudnn")
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift a CPU ``Argmax`` to a cuDNN max-reduction, keeping only the indices."""
    if not dnn_available(ctx_name):
        return
    # GpuDnnReduction with indices requires cuDNN v6 or newer.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if inputs[0].ndim > 8:
        return
    if inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", op.axis, inputs[0].dtype, inputs[0].dtype, True
    )(*inputs)
    # cuDNN emits int32 indices; cast to int64 to match Argmax's output type.
    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
class NoCuDNNRaise(Optimizer):
    """Global optimizer that fails loudly when cuDNN was requested but unusable."""

    def apply(self, fgraph):
        """
        Raise an error if cuDNN can't be used.
        """
        for c in list_contexts():
            if not dnn_available(c):
                # Make an assert error as we want Theano to fail, not
                # just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Theano was not able "
                    "to use it for context "
                    + str(c)
                    + ". We got this error: \n"
                    + dnn_available.msg
                )
# Position 0 so the availability check runs before any other GPU optimization.
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, "cudnn")
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(node):
    # Rebuild the op with inplace_output=True so the normalized output
    # reuses its buffer instead of allocating a new one.
    if isinstance(node.op, GpuDnnBatchNorm) and not node.op.inplace_output:
        return GpuDnnBatchNorm(
            mode=node.op.mode,
            running_averages=node.op.running_averages,
            inplace_running_mean=node.op.inplace_running_mean,
            inplace_running_var=node.op.inplace_running_var,
            inplace_output=True,
        )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(node):
    # Rebuild the op with inplace_running_mean=True so the running mean
    # buffer is updated in place (only when running averages are tracked).
    if (
        isinstance(node.op, GpuDnnBatchNorm)
        and node.op.running_averages
        and not node.op.inplace_running_mean
    ):
        return GpuDnnBatchNorm(
            mode=node.op.mode,
            running_averages=node.op.running_averages,
            inplace_running_mean=True,
            inplace_running_var=node.op.inplace_running_var,
            inplace_output=node.op.inplace_output,
        )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(node):
    # Rebuild the op with inplace_running_var=True so the running variance
    # buffer is updated in place (only when running averages are tracked).
    if (
        isinstance(node.op, GpuDnnBatchNorm)
        and node.op.running_averages
        and not node.op.inplace_running_var
    ):
        return GpuDnnBatchNorm(
            mode=node.op.mode,
            running_averages=node.op.running_averages,
            inplace_running_mean=node.op.inplace_running_mean,
            inplace_running_var=True,
            inplace_output=node.op.inplace_output,
        )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(node):
    # Rebuild the inference op with inplace=True so it reuses its output buffer.
    if isinstance(node.op, GpuDnnBatchNormInference) and not node.op.inplace:
        return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
import copy import copy
import numpy as np import numpy as np
import theano
from six.moves import StringIO from six.moves import StringIO
from theano import Apply, scalar, Op import theano
from theano import Apply, Op, scalar
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.scalar import Scalar, Composite from theano.scalar import Composite, Scalar
from theano.tensor.elemwise import Elemwise, DimShuffle, CAReduceDtype from theano.scalar.basic import complex_types, upgrade_to_float_no_complex
from theano.scalar.basic_scipy import Erfinv, Erfcinv from theano.scalar.basic_scipy import Erfcinv, Erfinv
from theano.scalar.basic import upgrade_to_float_no_complex, complex_types from theano.tensor.elemwise import CAReduceDtype, DimShuffle, Elemwise
try: try:
import pygpu import pygpu
from pygpu import gpuarray from pygpu import gpuarray
from pygpu.tools import ArrayArg
from pygpu.reduction import ReductionKernel
from pygpu.gpuarray import dtype_to_typecode from pygpu.gpuarray import dtype_to_typecode
from pygpu.reduction import ReductionKernel
from pygpu.tools import ArrayArg
except ImportError: except ImportError:
pass pass
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
HideC,
GpuKernelBase, GpuKernelBase,
HideC,
Kernel, Kernel,
as_gpuarray_variable,
infer_context_name, infer_context_name,
) )
from .type import GpuArrayType, gpu_context_type
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
from .type import GpuArrayType, gpu_context_type
def make_argument(v, name): def make_argument(v, name):
......
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumOp from theano.tensor.extra_ops import CumOp
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
import theano.scalar as scalar
from theano.gof import ParamsType
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
GpuKernelBase, GpuKernelBase,
Kernel,
GpuReshape, GpuReshape,
infer_context_name, Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt import register_opt, op_lifter, register_opt2 from .opt import op_lifter, register_opt, register_opt2
from .type import gpu_context_type from .type import gpu_context_type
from theano.gof import ParamsType
import theano.scalar as scalar
class GpuCumOp(GpuKernelBase, Op): class GpuCumOp(GpuKernelBase, Op):
......
...@@ -2,18 +2,17 @@ import numpy as np ...@@ -2,18 +2,17 @@ import numpy as np
import theano import theano
import theano.tensor as tt import theano.tensor as tt
from theano import Op from theano import Op
from theano.gradient import DisconnectedType
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
gpu_contiguous,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous,
infer_context_name, infer_context_name,
) )
from theano.gpuarray.opt import op_lifter, register_opt, register_opt2
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gradient import DisconnectedType
from theano.tensor.fft import IRFFTOp from theano.tensor.fft import IRFFTOp
from theano.gpuarray.opt import register_opt, op_lifter, register_opt2
try: try:
import pygpu import pygpu
......
import warnings import warnings
import pkg_resources
import numpy as np import numpy as np
import pkg_resources
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import Op, config, tensor from theano import Op, config, tensor
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType from theano.gpuarray.basic_ops import (
from .basic_ops import (
CGpuKernelBase, CGpuKernelBase,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous, gpu_contiguous,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name, infer_context_name,
) )
from .type import gpu_context_type from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.scalar import bool as bool_t
try: try:
import pygpu import pygpu
from pygpu.basic import triu, tril from pygpu.basic import tril, triu
pygpu_available = True pygpu_available = True
except ImportError: except ImportError:
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import warnings import warnings
try: try:
import pygpu import pygpu
except ImportError: except ImportError:
...@@ -11,20 +12,20 @@ import theano ...@@ -11,20 +12,20 @@ import theano
import theano.sandbox.multinomial import theano.sandbox.multinomial
from theano import Apply from theano import Apply
from theano.gof import Op from theano.gof import Op
from theano.scalar import as_scalar
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
infer_context_name,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
from .elemwise import GpuDimShuffle from .elemwise import GpuDimShuffle
from theano.scalar import as_scalar from .fp16_help import load_w, work_dtype, write_w
from .fp16_help import write_w, load_w, work_dtype from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType
class GPUAMultinomialFromUniform(GpuKernelBase, Op): class GPUAMultinomialFromUniform(GpuKernelBase, Op):
......
import theano.tensor as tt import theano.tensor as tt
from theano import Apply, Op
from theano import Op, Apply
from theano.gof import ParamsType from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
as_gpuarray_variable,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
as_gpuarray_variable,
infer_context_name, infer_context_name,
) )
from theano.gpuarray.type import GpuArrayType, gpu_context_type from theano.gpuarray.type import GpuArrayType, gpu_context_type
......
import numpy as np import numpy as np
from theano import Op, Apply
from six import StringIO from six import StringIO
from theano import Apply, Op
try: try:
import pygpu import pygpu
from pygpu import gpuarray from pygpu import gpuarray
...@@ -10,14 +11,14 @@ except ImportError: ...@@ -10,14 +11,14 @@ except ImportError:
pass pass
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name, infer_context_name,
) )
from .fp16_help import load_w, work_dtype, write_w
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
......
import copy import copy
import logging import logging
import pdb import pdb
import time
import sys import sys
import time
from collections import Counter
import numpy as np import numpy as np
import theano import theano
from theano import tensor, scalar, gof, config
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (
local_optimizer,
EquilibriumDB,
TopoOptimizer,
LocalGroupDB,
SequenceDB,
Optimizer,
DB,
toolbox,
graph,
)
from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace, inherit_stack_trace
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scalar.basic import log, neg, true_div
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet import bn, conv3d2d
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (
BaseAbstractConv,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs,
get_conv_output_shape,
)
from theano.tensor.nnet.neighbours import Images2Neibs
from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
import theano.tensor.nlinalg as nlinalg import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
from collections import Counter from theano import config, gof, scalar, tensor
from theano.breakpoint import PdbBreakpoint from theano.breakpoint import PdbBreakpoint
from theano.compile import optdb
from .type import ( from theano.compile.ops import shape_i
GpuArrayType, from theano.gof import Optimizer, graph, local_optimizer, toolbox
GpuArrayConstant, from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace, inherit_stack_trace
get_context, from theano.gpuarray.basic_ops import (
ContextNotDefined,
move_to_gpu,
)
from .basic_ops import (
as_gpuarray_variable,
infer_context_name,
host_from_gpu,
GpuToGpu,
HostFromGpu,
GpuFromHost,
GpuSplit,
GpuContiguous,
gpu_contiguous,
GpuAlloc, GpuAlloc,
GpuAllocEmpty, GpuAllocEmpty,
GpuReshape, GpuContiguous,
GpuEye, GpuEye,
GpuFromHost,
GpuJoin,
GpuReshape,
GpuSplit,
GpuToGpu,
GpuTri, GpuTri,
HostFromGpu,
as_gpuarray_variable,
gpu_contiguous,
gpu_join, gpu_join,
GpuJoin, host_from_gpu,
infer_context_name,
) )
from .blas import ( from theano.gpuarray.blas import (
gpu_dot22, GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
GpuCorrMM,
GpuCorrMM_gradInputs,
GpuCorrMM_gradWeights,
GpuGemm, GpuGemm,
GpuGer,
GpuGemmBatch, GpuGemmBatch,
gpugemm_no_inplace, GpuGer,
gpu_dot22,
gpugemm_inplace, gpugemm_inplace,
gpugemm_no_inplace,
gpugemmbatch_no_inplace, gpugemmbatch_no_inplace,
gpugemv_no_inplace,
gpugemv_inplace, gpugemv_inplace,
GpuCorrMM, gpugemv_no_inplace,
GpuCorrMM_gradInputs,
GpuCorrMM_gradWeights,
GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
)
from .pool import (
GpuPool,
GpuMaxPoolGrad,
GpuAveragePoolGrad,
GpuMaxPoolRop,
GpuDownsampleFactorMaxGradGrad,
) )
from .blocksparse import ( from theano.gpuarray.blocksparse import (
GpuSparseBlockGemv, GpuSparseBlockGemv,
GpuSparseBlockOuter, GpuSparseBlockOuter,
gpu_sparse_block_outer,
gpu_sparse_block_outer_inplace,
gpu_sparse_block_gemv, gpu_sparse_block_gemv,
gpu_sparse_block_gemv_inplace, gpu_sparse_block_gemv_inplace,
gpu_sparse_block_outer,
gpu_sparse_block_outer_inplace,
) )
from .nnet import ( from theano.gpuarray.ctc import GpuConnectionistTemporalClassification
gpu_crossentropy_softmax_1hot_with_bias_dx, from theano.gpuarray.dnn_opt import (
gpu_crossentropy_softmax_argmax_1hot_with_bias, local_abstractconv3d_cudnn_alt,
gpu_softmax_with_bias, local_abstractconv_cudnn,
gpu_softmax, local_abstractconv_cudnn_alt,
local_abstractconv_gi_cudnn,
local_abstractconv_gw_cudnn,
) )
from .elemwise import ( from theano.gpuarray.elemwise import (
GpuElemwise,
GpuDimShuffle,
GpuCAReduceCuda,
GpuCAReduceCPY, GpuCAReduceCPY,
gpu_erfinv, GpuCAReduceCuda,
GpuDimShuffle,
GpuElemwise,
gpu_erfcinv, gpu_erfcinv,
gpu_erfinv,
max_inputs_to_GpuElemwise, max_inputs_to_GpuElemwise,
) )
from .subtensor import ( from theano.gpuarray.linalg import (
GpuIncSubtensor, MATRIX_STRUCTURES_SOLVE,
GpuSubtensor, GpuCholesky,
GpuAdvancedSubtensor, GpuCublasTriangularSolve,
GpuAdvancedSubtensor1, GpuCusolverSolve,
GpuMagmaCholesky,
GpuMagmaEigh,
GpuMagmaMatrixInverse,
cublas_available,
cusolver_available,
gpu_qr,
gpu_svd,
)
from theano.gpuarray.neighbours import GpuImages2Neibs
from theano.gpuarray.nnet import (
gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_crossentropy_softmax_argmax_1hot_with_bias,
gpu_softmax,
gpu_softmax_with_bias,
)
from theano.gpuarray.opt_util import (
alpha_merge,
op_lifter,
output_merge,
pad_dims,
safe_to_cpu,
safe_to_gpu,
unpad_dims,
)
from theano.gpuarray.optdb import (
GraphToGPUDB,
abstract_batch_norm_db,
abstract_batch_norm_db2,
abstract_batch_norm_groupopt,
abstractconv_groupopt,
gpu_cut_copies,
gpu_optimizer,
gpu_seqopt,
matrix_ops_db,
matrix_ops_db2,
pool_db,
pool_db2,
register_inplace,
register_opt,
register_opt2,
)
from theano.gpuarray.pool import (
GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad,
GpuMaxPoolGrad,
GpuMaxPoolRop,
GpuPool,
)
from theano.gpuarray.reduction import GpuMaxAndArgmax
from theano.gpuarray.subtensor import (
GpuAdvancedIncSubtensor, GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedSubtensor,
GpuAdvancedSubtensor1,
GpuAllocDiag, GpuAllocDiag,
GpuExtractDiag, GpuExtractDiag,
GpuIncSubtensor,
GpuSubtensor,
) )
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from theano.gpuarray.type import (
from .reduction import GpuMaxAndArgmax ContextNotDefined,
from .linalg import ( GpuArrayConstant,
GpuCusolverSolve, GpuArrayType,
MATRIX_STRUCTURES_SOLVE, get_context,
GpuCholesky, move_to_gpu,
cusolver_available,
GpuMagmaMatrixInverse,
gpu_svd,
GpuMagmaCholesky,
gpu_qr,
GpuMagmaEigh,
GpuCublasTriangularSolve,
cublas_available,
) )
from .neighbours import GpuImages2Neibs from theano.ifelse import IfElse
from .ctc import GpuConnectionistTemporalClassification from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Cast, Pow, Scalar, log, neg, true_div
_logger = logging.getLogger("theano.gpuarray.opt") from theano.scalar.basic_scipy import Erfcinv, Erfinv
from theano.scan_module import scan_op, scan_opt, scan_utils
from theano.tensor.nnet import bn, conv3d2d
gpu_optimizer = EquilibriumDB() from theano.tensor.nnet.abstract_conv import (
gpu_cut_copies = EquilibriumDB() AbstractConv2d,
AbstractConv2d_gradInputs,
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB. AbstractConv2d_gradWeights,
gpu_optimizer2 = EquilibriumDB() AbstractConv3d,
AbstractConv3d_gradInputs,
AbstractConv3d_gradWeights,
class GraphToGPUDB(DB): BaseAbstractConv,
""" get_conv_output_shape,
Retrieves the list local optimizers based on the optimizer flag's value )
from EquilibriumOptimizer by calling the method query. from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.conv import ConvOp
""" from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
from theano.tensor.nnet.neighbours import Images2Neibs
    def query(self, *tags, **kwtags):
        """Build a ``GraphToGPU`` optimizer from the local optimizers that
        ``gpu_optimizer2`` returns for the given tags."""
        opt = gpu_optimizer2.query(*tags, **kwtags)
        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
_logger = logging.getLogger("theano.gpuarray.opt")
gpu_seqopt = SequenceDB()
gpu_seqopt.register( gpu_seqopt.register(
"gpuarray_graph_optimization", "gpuarray_graph_optimization",
...@@ -200,68 +196,6 @@ gpu_seqopt.register( ...@@ -200,68 +196,6 @@ gpu_seqopt.register(
"gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_compile", "fast_run", "gpuarray" "gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_compile", "fast_run", "gpuarray"
) )
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register(
"gpuarray_opt",
gpu_seqopt,
optdb.__position__.get("add_destroy_handler", 49.5) - 1,
"gpuarray",
)
def register_opt(*tags, **kwargs):
    """Decorator that registers a local optimizer in ``gpu_optimizer``.

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under (in addition to
        ``"fast_run"`` and ``"gpuarray"``).
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # Use pop() with a default: the previous
        # `(kwargs and kwargs.pop("name"))` form raised KeyError whenever
        # kwargs was non-empty but did not contain "name".
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.
    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.

    """

    def f(local_opt):
        # pop() with a default avoids a KeyError when kwargs holds other
        # keys but no "name" (the old `kwargs and kwargs.pop("name")` did not).
        name = kwargs.pop("name", None) or local_opt.__name__
        if isinstance(local_opt, theano.gof.DB):
            # A whole optimizer database can be registered as-is.
            opt = local_opt
        else:
            opt = theano.gof.local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_inplace(*tags, **kwargs):
    """Decorator that registers an in-place local optimizer in the global
    ``optdb`` (position 60, with a warning callback on failed replacements).

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # pop() with a default: the previous `kwargs and kwargs.pop("name")`
        # raised KeyError for non-empty kwargs without a "name" key.
        name = kwargs.pop("name", None) or local_opt.__name__
        optdb.register(
            name,
            TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60,
            "fast_run",
            "inplace",
            "gpuarray",
            *tags,
        )
        return local_opt

    return f
register_opt("fast_compile")(theano.tensor.opt.local_track_shape_i) register_opt("fast_compile")(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name="gpua_constant_folding")(tensor.opt.constant_folding) register_opt(final_opt=True, name="gpua_constant_folding")(tensor.opt.constant_folding)
gpu_optimizer.register( gpu_optimizer.register(
...@@ -269,108 +203,11 @@ gpu_optimizer.register( ...@@ -269,108 +203,11 @@ gpu_optimizer.register(
) )
# Define a few operations to use in optimizations,
# in order to avoid introducin new CPU Ops, or useless ones.
def safe_to_gpu(x, ctx_name):
    """Move `x` to the GPU context `ctx_name` when it is a CPU tensor;
    anything else is returned unchanged."""
    if not isinstance(x.type, tensor.TensorType):
        # Not a host tensor (e.g. already a GPU variable): leave as-is.
        return x
    return GpuFromHost(ctx_name)(x)
def safe_to_cpu(x):
    """Transfer `x` back to the host when it lives on a GPU; otherwise
    return it unchanged."""
    return x.transfer("cpu") if isinstance(x.type, GpuArrayType) else x
gpu_log = GpuElemwise(log) gpu_log = GpuElemwise(log)
gpu_neg = GpuElemwise(neg) gpu_neg = GpuElemwise(neg)
gpu_true_div = GpuElemwise(true_div) gpu_true_div = GpuElemwise(true_div)
def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))

    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Parameters
    ----------
    OP : list of Op classes
        The CPU Op types this lifter matches (checked with ``type(node.op) in OP``).
    cuda_only : bool
        If True, only lift when the target context is a CUDA context.

    Returns
    -------
    callable
        A decorator turning a ``maker(op, context_name, inputs, outputs)``
        function into a registered local optimizer.
    """

    def f(maker):
        def local_opt(node):
            if type(node.op) in OP:

                # Either one of our inputs is on the gpu or
                # all of our clients are on the gpu
                replace = False
                # TODO: Maybe set context_name with infer_context_name()?
                context_name = None
                # We replace if any input is a host_from_gpu
                for i in node.inputs:
                    if i.owner and i.owner.op == host_from_gpu and move_to_gpu(i):
                        context_name = i.owner.inputs[0].type.context_name
                        replace = True
                        break

                if not replace:
                    # We replace if *all* clients are on the GPU
                    clients = [c for o in node.outputs for c in o.clients]
                    replace = len(clients) != 0
                    for c, idx in clients:
                        if c == "output" or not isinstance(c.op, GpuFromHost):
                            replace = False
                    # TODO: check that the clients want the same context?
                    if replace:
                        # All clients are GpuFromHost and we have at least one
                        context_name = clients[0][0].op.context_name

                # Check if we should replace
                if (
                    not replace
                    or (cuda_only and get_context(context_name).kind != b"cuda")
                    or any(["complex" in getattr(i, "dtype", "") for i in node.inputs])
                ):
                    return False

                # tag the inputs with the context in case
                # the context was derived from the outputs
                for i in node.inputs:
                    i.tag.context_name = context_name

                new_op = maker(node.op, context_name, node.inputs, node.outputs)

                # This is needed as sometimes new_op inherits from OP.
                if new_op and new_op != node.op:
                    if isinstance(new_op, theano.Op):
                        new_outputs = new_op(*node.inputs, return_list=True)
                        to_cpu_fn = safe_to_cpu
                    elif isinstance(new_op, (tuple, list)):
                        new_outputs = new_op
                        to_cpu_fn = safe_to_cpu
                    else:  # suppose it is a variable on the GPU
                        new_outputs = [new_op]

                        def to_cpu_fn(x):
                            return x.transfer("cpu")

                    # copy stack traces onto gpu outputs
                    # also copy the stack traces onto HostFromGpu outputs
                    on_cpu = []
                    for old_output, new_output in zip(node.outputs, new_outputs):
                        copy_stack_trace(old_output, new_output)
                        cpu = to_cpu_fn(new_output)
                        on_cpu.append(cpu)
                        copy_stack_trace(old_output, cpu)
                    return on_cpu
            return False

        local_opt.__name__ = maker.__name__
        return local_optimizer(OP)(local_opt)

    return f
class InputToGpuOptimizer(Optimizer): class InputToGpuOptimizer(Optimizer):
""" """
Transfer the input to the gpu to start the rolling wave. Transfer the input to the gpu to start the rolling wave.
...@@ -409,7 +246,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -409,7 +246,6 @@ class InputToGpuOptimizer(Optimizer):
raise raise
# If there is no context tag and no default context # If there is no context tag and no default context
# then it stays on the CPU # then it stays on the CPU
pass
gpu_seqopt.register( gpu_seqopt.register(
...@@ -2507,9 +2343,6 @@ def local_gpu_pool(op, ctx_name, inputs, outputs): ...@@ -2507,9 +2343,6 @@ def local_gpu_pool(op, ctx_name, inputs, outputs):
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
lifter = op_lifter([pool.Pool])(local_gpu_pool) lifter = op_lifter([pool.Pool])(local_gpu_pool)
pool_db.register( pool_db.register(
"local_gpu_pool", lifter, "gpuarray", "fast_compile", "fast_run", position=1 "local_gpu_pool", lifter, "gpuarray", "fast_compile", "fast_run", position=1
...@@ -2903,10 +2736,6 @@ def local_gpu_cholesky(op, context_name, inputs, outputs): ...@@ -2903,10 +2736,6 @@ def local_gpu_cholesky(op, context_name, inputs, outputs):
return op return op
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for # For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
# matrices of size 1000). Thus, cusolver is prioritized during graph # matrices of size 1000). Thus, cusolver is prioritized during graph
# optimizations. To explicitly use magma, you should disable cusolver using # optimizations. To explicitly use magma, you should disable cusolver using
...@@ -3094,24 +2923,6 @@ optdb.register( ...@@ -3094,24 +2923,6 @@ optdb.register(
"scan", "scan",
) )
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt("fast_compile")(abstractconv_groupopt)
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import ( # noqa: E402
local_abstractconv_cudnn,
local_abstractconv_gw_cudnn,
local_abstractconv_gi_cudnn,
local_abstractconv_cudnn_alt,
local_abstractconv3d_cudnn_alt,
)
abstractconv_groupopt.register( abstractconv_groupopt.register(
"local_abstractconv_dnn", "local_abstractconv_dnn",
local_abstractconv_cudnn, local_abstractconv_cudnn,
...@@ -3240,19 +3051,15 @@ abstractconv_groupopt.register("conv_metaopt", conv_metaopt, "conv_meta", positi ...@@ -3240,19 +3051,15 @@ abstractconv_groupopt.register("conv_metaopt", conv_metaopt, "conv_meta", positi
# We import these opts here instead of at the top of this file # We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn # to avoid a circular dependency problem with dnn
from .dnn import ( # noqa: E402 from theano.gpuarray.dnn import ( # noqa: E402
local_abstract_batch_norm_inference_cudnn,
local_abstract_batch_norm_train_cudnn, local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn, local_abstract_batch_norm_train_grad_cudnn,
local_abstract_batch_norm_inference_cudnn,
) )
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt("fast_compile")(abstract_batch_norm_groupopt) register_opt("fast_compile")(abstract_batch_norm_groupopt)
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
register_opt("fast_compile", name="abstract_batch_norm_db")(abstract_batch_norm_db) register_opt("fast_compile", name="abstract_batch_norm_db")(abstract_batch_norm_db)
register_opt2( register_opt2(
[ [
......
...@@ -2,13 +2,38 @@ from functools import wraps ...@@ -2,13 +2,38 @@ from functools import wraps
import numpy as np import numpy as np
from theano import tensor, scalar as scal, Constant from theano import Constant
from theano import scalar as scal
from theano import tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.gof.opt import inherit_stack_trace from theano.gof.op import Op
from theano.tensor import DimShuffle, get_scalar_constant_value, NotScalarConstantError from theano.gof.opt import copy_stack_trace, inherit_stack_trace
from theano.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuFromHost,
GpuReshape,
HostFromGpu,
host_from_gpu,
)
from theano.gpuarray.elemwise import GpuDimShuffle, GpuElemwise
from theano.gpuarray.type import GpuArrayType, get_context, move_to_gpu
from theano.tensor import DimShuffle, NotScalarConstantError, get_scalar_constant_value
# Define a few operations to use in optimizations,
# in order to avoid introducin new CPU Ops, or useless ones.
def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType):
return GpuFromHost(ctx_name)(x)
else:
return x
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape def safe_to_cpu(x):
from .elemwise import GpuDimShuffle, GpuElemwise if isinstance(x.type, GpuArrayType):
return x.transfer("cpu")
else:
return x
def grab_cpu_scalar(v, nd): def grab_cpu_scalar(v, nd):
...@@ -420,3 +445,84 @@ def unpad_dims(output, input, leftdims, rightdims): ...@@ -420,3 +445,84 @@ def unpad_dims(output, input, leftdims, rightdims):
# restore the output to the original shape # restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:]) outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp) return GpuReshape(input.ndim)(output, outshp)
def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))

    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Parameters
    ----------
    OP : list of Op classes
        The CPU Op types this lifter matches (checked with ``type(node.op) in OP``).
    cuda_only : bool
        If True, only lift when the target context is a CUDA context.

    Returns
    -------
    callable
        A decorator turning a ``maker(op, context_name, inputs, outputs)``
        function into a registered local optimizer.
    """

    def f(maker):
        def local_opt(node):
            if type(node.op) in OP:

                # Either one of our inputs is on the gpu or
                # all of our clients are on the gpu
                replace = False
                # TODO: Maybe set context_name with infer_context_name()?
                context_name = None
                # We replace if any input is a host_from_gpu
                for i in node.inputs:
                    if i.owner and i.owner.op == host_from_gpu and move_to_gpu(i):
                        context_name = i.owner.inputs[0].type.context_name
                        replace = True
                        break

                if not replace:
                    # We replace if *all* clients are on the GPU
                    clients = [c for o in node.outputs for c in o.clients]
                    replace = len(clients) != 0
                    for c, idx in clients:
                        if c == "output" or not isinstance(c.op, GpuFromHost):
                            replace = False
                    # TODO: check that the clients want the same context?
                    if replace:
                        # All clients are GpuFromHost and we have at least one
                        context_name = clients[0][0].op.context_name

                # Check if we should replace
                if (
                    not replace
                    or (cuda_only and get_context(context_name).kind != b"cuda")
                    or any(["complex" in getattr(i, "dtype", "") for i in node.inputs])
                ):
                    return False

                # tag the inputs with the context in case
                # the context was derived from the outputs
                for i in node.inputs:
                    i.tag.context_name = context_name

                new_op = maker(node.op, context_name, node.inputs, node.outputs)

                # This is needed as sometimes new_op inherits from OP.
                if new_op and new_op != node.op:
                    if isinstance(new_op, Op):
                        new_outputs = new_op(*node.inputs, return_list=True)
                        to_cpu_fn = safe_to_cpu
                    elif isinstance(new_op, (tuple, list)):
                        new_outputs = new_op
                        to_cpu_fn = safe_to_cpu
                    else:  # suppose it is a variable on the GPU
                        new_outputs = [new_op]

                        def to_cpu_fn(x):
                            return x.transfer("cpu")

                    # copy stack traces onto gpu outputs
                    # also copy the stack traces onto HostFromGpu outputs
                    on_cpu = []
                    for old_output, new_output in zip(node.outputs, new_outputs):
                        copy_stack_trace(old_output, new_output)
                        cpu = to_cpu_fn(new_output)
                        on_cpu.append(cpu)
                        copy_stack_trace(old_output, cpu)
                    return on_cpu
            return False

        local_opt.__name__ = maker.__name__
        return local_optimizer(OP)(local_opt)

    return f
from theano.compile import optdb
from theano.gof.opt import GraphToGPULocalOptGroup, TopoOptimizer, local_optimizer
from theano.gof.optdb import DB, EquilibriumDB, LocalGroupDB, SequenceDB
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()
gpu_seqopt = SequenceDB()
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register(
"gpuarray_opt",
gpu_seqopt,
optdb.__position__.get("add_destroy_handler", 49.5) - 1,
"gpuarray",
)
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
abstract_batch_norm_groupopt = LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
def register_opt(*tags, **kwargs):
    """Decorator that registers a local optimizer in ``gpu_optimizer``.

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under (in addition to
        ``"fast_run"`` and ``"gpuarray"``).
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # pop() with a default: `kwargs and kwargs.pop("name")` raised
        # KeyError when kwargs was non-empty but had no "name" key.
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.
    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.

    """

    def f(local_opt):
        # pop() with a default avoids a KeyError when kwargs holds other
        # keys but no "name" (the old `kwargs and kwargs.pop("name")` did not).
        name = kwargs.pop("name", None) or local_opt.__name__
        if isinstance(local_opt, DB):
            # A whole optimizer database can be registered as-is.
            opt = local_opt
        else:
            opt = local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_inplace(*tags, **kwargs):
    """Decorator that registers an in-place local optimizer in the global
    ``optdb`` (position 60, with a warning callback on failed replacements).

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # pop() with a default: the previous `kwargs and kwargs.pop("name")`
        # raised KeyError for non-empty kwargs without a "name" key.
        name = kwargs.pop("name", None) or local_opt.__name__
        optdb.register(
            name,
            TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60,
            "fast_run",
            "inplace",
            "gpuarray",
            *tags,
        )
        return local_opt

    return f
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt("fast_compile")(abstractconv_groupopt)
class GraphToGPUDB(DB):
    """
    Retrieves the list local optimizers based on the optimizer flag's value
    from EquilibriumOptimizer by calling the method query.

    """

    def query(self, *tags, **kwtags):
        """Return a ``GraphToGPU`` optimizer built from the local optimizers
        that ``gpu_optimizer2`` yields for the given tags."""
        # Imported here to avoid a circular dependency between
        # theano.gpuarray.optdb and theano.gpuarray.opt.
        from theano.gpuarray.opt import GraphToGPU

        opt = gpu_optimizer2.query(*tags, **kwtags)
        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
...@@ -5,14 +5,15 @@ from theano.scalar import bool as bool_t ...@@ -5,14 +5,15 @@ from theano.scalar import bool as bool_t
from theano.tensor.basic import as_tensor_variable from theano.tensor.basic import as_tensor_variable
from theano.tensor.signal.pool import Pool, PoolingMode_t from theano.tensor.signal.pool import Pool, PoolingMode_t
from .type import gpu_context_type
from .basic_ops import ( from .basic_ops import (
CGpuKernelBase, CGpuKernelBase,
infer_context_name,
gpuarray_helper_inc_dir,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous, gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
) )
from .type import gpu_context_type
try: try:
import pygpu import pygpu
......
from theano.gof import Op, Apply from theano.gof import Apply, Op
from theano.gof.type import Generic from theano.gof.type import Generic
from .basic_ops import infer_context_name, as_gpuarray_variable, gpuarray_helper_inc_dir from .basic_ops import as_gpuarray_variable, gpuarray_helper_inc_dir, infer_context_name
from .type import GpuArrayType from .type import GpuArrayType
try: try:
import pygpu import pygpu
except ImportError: except ImportError:
......
...@@ -9,21 +9,21 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html ...@@ -9,21 +9,21 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform from theano.sandbox.rng_mrg import mrg_uniform, mrg_uniform_base
from theano.tensor import as_tensor_variable, get_vector_length
from theano.scalar import int32 as int_t from theano.scalar import int32 as int_t
from theano.tensor import as_tensor_variable, get_vector_length
from .basic_ops import ( from .basic_ops import (
GpuFromHost,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
infer_context_name,
GpuFromHost,
host_from_gpu,
as_gpuarray_variable, as_gpuarray_variable,
host_from_gpu,
infer_context_name,
) )
from .type import GpuArrayType, gpu_context_type
from .fp16_help import write_w from .fp16_help import write_w
from .opt import register_opt, register_opt2 from .opt import register_opt, register_opt2
from .type import GpuArrayType, gpu_context_type
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
......
...@@ -11,13 +11,14 @@ from theano.tensor.sort import TopKOp ...@@ -11,13 +11,14 @@ from theano.tensor.sort import TopKOp
from .basic_ops import ( from .basic_ops import (
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
infer_context_name,
as_gpuarray_variable, as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt import register_opt, op_lifter, register_opt2 from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
try: try:
import pygpu import pygpu
import pygpu.gpuarray as ga import pygpu.gpuarray as ga
......
import numpy as np import numpy as np
import theano.tensor as tt
from six import integer_types from six import integer_types
from six.moves import StringIO from six.moves import StringIO
from theano import gof, Op import theano.tensor as tt
from theano import Op, gof
from theano.gof import ParamsType from theano.gof import ParamsType
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
from theano.scalar import bool as bool_t
from theano.scalar import int32 as int_t
from theano.scalar import uint32 as size_t
from theano.tensor import AllocDiag from theano.tensor import AllocDiag
from theano.tensor.subtensor import ( from theano.tensor.subtensor import (
IncSubtensor,
AdvancedSubtensor,
Subtensor,
AdvancedIncSubtensor, AdvancedIncSubtensor,
AdvancedSubtensor,
AdvancedSubtensor1, AdvancedSubtensor1,
IncSubtensor,
Subtensor,
get_idx_list, get_idx_list,
) )
from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
try: try:
import pygpu import pygpu
...@@ -25,16 +26,17 @@ try: ...@@ -25,16 +26,17 @@ try:
except ImportError: except ImportError:
pass pass
from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
as_gpuarray_variable,
HideC,
GpuKernelBase, GpuKernelBase,
HideC,
Kernel, Kernel,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name, infer_context_name,
gpu_contiguous,
) )
from theano.gpuarray.type import GpuArrayType, gpu_context_type
iadd_reg = {} iadd_reg = {}
......
import sys
import os import os
import sys
import warnings import warnings
import six.moves.copyreg as copyreg
import numpy as np import numpy as np
import six.moves.copyreg as copyreg
import theano import theano
from theano import Constant, Type, Variable, config, scalar, tensor
from theano.compile import SharedVariable
from theano.tensor.type import TensorType from theano.tensor.type import TensorType
from theano.tensor.var import _tensor_py_operators from theano.tensor.var import _tensor_py_operators
from theano import Type, Variable, Constant, tensor, config, scalar
from theano.compile import SharedVariable
# Make sure this is importable even if pygpu is absent # Make sure this is importable even if pygpu is absent
# (it will not work though) # (it will not work though)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论