Commit 2a5fc594 authored by Maxim Kochurov, committed by Brandon T. Willard

Remove aesara.gpuarray

Parent b3ce3640
import logging
import os
import sys
import warnings
import aesara
from aesara.compile import optdb
from aesara.configdefaults import config
from aesara.tensor.basic import register_transfer
# Logger shared by the whole ``aesara.gpuarray`` package.
_logger_name = "aesara.gpuarray"
_logger = logging.getLogger(_logger_name)
error = _logger.error
info = _logger.info
# Set to True once a GPU context has been successfully initialized.
pygpu_activated = False
# Used to skip initialization checking when we are in the same process.
aesara_gpu_is_already_active = False
try:
    import pygpu
    import pygpu.gpuarray
except ImportError:
    # pygpu is optional: the module still imports, but GPU support is off.
    pygpu = None
from aesara.gpuarray import (
ctc,
dnn,
extra_ops,
fft,
multinomial,
opt,
reduction,
rng_mrg,
sort,
)
from aesara.gpuarray.basic_ops import as_gpuarray_variable
# This is for documentation not to depend on the availability of pygpu
from aesara.gpuarray.type import (
ContextNotDefined,
GpuArrayConstant,
GpuArraySharedVariable,
GpuArrayType,
GpuArrayVariable,
get_context,
gpuarray_shared_constructor,
reg_context,
)
def transfer(x, target):
    """
    Transfer hook registered with `register_transfer`.

    Moves `x` to the GPU context named `target` when such a context has
    been registered; returns None otherwise so other transfer handlers
    can be tried.
    """
    try:
        get_context(target)
        return as_gpuarray_variable(x, target)
    except ContextNotDefined:
        pass


register_transfer(transfer)
def pygpu_parse_version(version_string):
    """
    Split a pygpu version string into its numeric components.

    Returns a ``version_type`` namedtuple with integer ``major``,
    ``minor`` and ``patch`` fields plus the normalized ``fullversion``
    string.  The third component may carry a "+<git-commit>" or ".devN"
    suffix, which is kept in ``fullversion`` but excluded from ``patch``.
    """
    from collections import namedtuple

    version_type = namedtuple(
        "version_type", ("major", "minor", "patch", "fullversion")
    )
    pieces = version_string.split(".", 2)
    assert len(pieces) == 3, version_string
    major, minor = int(pieces[0]), int(pieces[1])
    tail = pieces[2]
    # The patch number is whatever precedes a "+git" or ".devN" suffix.
    separator = "+" if "+" in tail else "."
    patch = int(tail.split(separator, 1)[0])
    fullversion = f"{major}.{minor}.{tail}"
    return version_type(major=major, minor=minor, patch=patch, fullversion=fullversion)
def init_dev(dev, name=None, preallocate=None):
    """
    Initialize the GPU device `dev` and register its context under `name`.

    Parameters
    ----------
    dev : str
        Device specifier, e.g. "cuda", "cudaN" or "openclM:N".
    name : str or None
        Context name mapped to the device (None is the default context).
    preallocate : number or None
        Memory-preallocation setting; falls back to
        ``config.gpuarray__preallocate`` when None.  Values in (0, 1] are
        a fraction of total GPU memory, values > 1 are megabytes, and
        negative values disable the allocation cache.
    """
    global pygpu_activated
    global aesara_gpu_is_already_active
    # Refuse to initialize from a subprocess when the parent already did:
    # the driver state does not survive a fork.
    if (
        not aesara_gpu_is_already_active
        and os.environ.get("AESARA_GPU_IS_ALREADY_ACTIVE", "") == "Yes"
    ):
        raise RuntimeError(
            "You can't initialize the GPU in a subprocess if the parent process already did it"
        )
    if not config.cxx:
        raise RuntimeError("The new gpu-backend need a c++ compiler.")
    # Only the pygpu 0.7.x series is supported.
    pygpu_version = pygpu_parse_version(pygpu.__version__)
    if pygpu_version.major != 0 or pygpu_version.minor != 7 or pygpu_version.patch < 0:
        raise ValueError(
            "Your installed version of pygpu(%s) is too old, please upgrade to 0.7.0 or later (but below 0.8.0)"
            % pygpu_version.fullversion
        )
    # This is for the C headers API, we need to match the exact version.
    gpuarray_version_major_supported = 2
    gpuarray_version_major_detected = pygpu.gpuarray.api_version()[0]
    if gpuarray_version_major_detected != gpuarray_version_major_supported:
        raise ValueError(
            "Your installed version of libgpuarray is not in sync with the current Aesara"
            f" version. The installed libgpuarray version supports API version {int(gpuarray_version_major_detected)},"
            f" while current Aesara supports API version {int(gpuarray_version_major_supported)}. Change the version of"
            " libgpuarray or Aesara to fix this problem.",
        )
    if dev not in init_dev.devmap:
        # First time this device is seen: create and configure a context.
        args = dict()
        if config.gpuarray__cache_path != "":
            args["kernel_cache_path"] = config.gpuarray__cache_path
        if preallocate is None:
            preallocate = config.gpuarray__preallocate
        if preallocate < 0:
            args["max_cache_size"] = 0
        else:
            args["initial_cache_size"] = preallocate
        context = pygpu.init(
            dev,
            sched=config.gpuarray__sched,
            single_stream=config.gpuarray__single_stream,
            **args,
        )
        # Mark the GPU as active so that forked children refuse to re-init.
        os.environ["AESARA_GPU_IS_ALREADY_ACTIVE"] = "Yes"
        aesara_gpu_is_already_active = True
        context.dev = dev
        init_dev.devmap[dev] = context
        reg_context(name, context)
        MB = 1024 * 1024
        if dev.startswith("cuda"):
            avail = dnn.dnn_available(name)
            # If we try to enable cudnn and there isn't enough GPU
            # memory, there will be an unclear error message. So do
            # not even try a clear error.
            if avail and context.free_gmem < 75 * MB:
                raise RuntimeError(
                    f"Can not enable cuDNN as there is only {int(context.free_gmem / MB)} MB of free GPU memory."
                )
            elif avail:
                context.cudnn_handle = dnn._make_handle(context)
            elif config.dnn__enabled == "True":
                raise RuntimeError(
                    "You enabled cuDNN, but we aren't able to use it: %s"
                    % dnn.dnn_available.msg
                )
            if config.print_active_device:
                if avail:
                    print(
                        f"Using cuDNN version {int(dnn.version())} on context {name}",
                        file=sys.stderr,
                    )
                else:
                    print(
                        f"Can not use cuDNN on context {name}: {dnn.dnn_available.msg}",
                        file=sys.stderr,
                    )
        if preallocate < 0:
            print(f"Disabling allocation cache on {dev}")
        elif preallocate > 0:
            # <= 1 is a fraction of total memory (capped at 95%); > 1 is MB.
            if preallocate <= 1:
                gmem = min(preallocate, 0.95) * context.total_gmem
            else:
                gmem = preallocate * MB
            if gmem > context.free_gmem:
                raise RuntimeError(
                    f"Trying to preallocate {int(gmem / MB)} MB of GPU memory while only"
                    f" {int(context.free_gmem / MB)} MB are available."
                )
            elif gmem > context.free_gmem - 50 * MB:
                warnings.warn(
                    "Preallocating too much memory can prevent cudnn and cublas from working properly"
                )
            # This will allocate and immediately free an object of size gmem
            # which will reserve that amount of memory on the GPU.
            pygpu.empty((gmem,), dtype="int8", context=context)
            if config.print_active_device:
                print(
                    f"Preallocating {int(gmem // MB)}/{int(context.total_gmem // MB)} Mb ({gmem / context.total_gmem}) on {dev}",
                    file=sys.stderr,
                )
        # Initialise the blas kernels. We do this after the
        # preallocation to not fragment the heap accidentally.
        tmp = pygpu.empty((2, 2), dtype="float32", context=context)
        if dev.startswith("cuda"):
            # In OpenCL, BLAS isn't always available
            pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
        del tmp
    else:
        context = init_dev.devmap[dev]
    # This will map the context name to the real context object.
    if config.print_active_device:
        try:
            unique_id = "(" + context.unique_id + ")"
        except pygpu.gpuarray.UnsupportedException:
            unique_id = ""
        print(
            f"Mapped name {name} to device {dev}: {context.devname} {unique_id}",
            file=sys.stderr,
        )
    pygpu_activated = True


# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
def use(
    device,
    force=False,
    default_to_move_computation_to_gpu=True,
    move_shared_to_gpu=True,
    preallocate=None,
):
    """
    Error and warning about CUDA should be displayed only when this
    function is called. We need to be able to load this module only
    to check if it is available!

    Parameters
    ----------
    device : string
        "cuda", "cuda0", "cudaN", "" (N is the device number to use).
        "" means do all the rest and don't init a device.
    force
        Will always raise an exception if we can't use the gpu.
    default_to_move_computation_to_gpu
        If gpu init succeeded, enable by default optimizations to move
        computations to the gpu.
    move_shared_to_gpu
        If gpu init succeeded, put new shared variables on the gpu.
    preallocate
        If specified, will use this value for preallocation instead of
        gpuarray__preallocate.

    """
    if force:
        if not (device.startswith("cuda") or device.startswith("opencl")):
            raise Exception("forced the init and bad device provided: " + device)
        else:
            # If we force, the device should not already be initialized.
            assert device not in init_dev.devmap
    if device:
        init_dev(device, preallocate=preallocate)
    if default_to_move_computation_to_gpu:
        # Enable the GPU graph optimizations under the standard modes.
        optdb.add_tags("gpuarray_opt", "fast_run", "fast_compile")
        optdb.add_tags("gpua_scanOp_make_inplace", "fast_run")
    if move_shared_to_gpu:
        # New shared variables will be constructed on the GPU by default.
        import aesara.compile

        aesara.compile.shared_constructor(gpuarray_shared_constructor)
if pygpu:
    try:
        # Initialize devices according to the user's configuration flags.
        if config.device.startswith("cuda") or config.device.startswith("opencl"):
            use(config.device)
        elif config.init_gpu_device.startswith(
            "cuda"
        ) or config.init_gpu_device.startswith("opencl"):
            # init_gpu_device only initializes the device; computations are
            # not moved there by default, so device must stay "cpu".
            if config.device != "cpu":
                raise ValueError("you must set device=cpu to use init_gpu_device.")
            if config.contexts != "":
                print(
                    "Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want."
                )
            init_dev(config.init_gpu_device)
        if config.contexts != "":
            # config.contexts has the form "name0->dev0;name1->dev1;...".
            for n, d in (c.split("->") for c in config.contexts.split(";")):
                init_dev(d.strip(), n.strip())
            # To have shared var default on the GPU and opt to move to the GPU.
            use("")
    except Exception:
        error("Could not initialize pygpu, support disabled", exc_info=True)
    from .basic_ops import (
        GpuAlloc,
        GpuAllocEmpty,
        GpuContiguous,
        GpuEye,
        GpuFromHost,
        GpuJoin,
        GpuReshape,
        GpuSplit,
        HostFromGpu,
        host_from_gpu,
    )
    from .elemwise import GpuElemwise
    from .subtensor import GpuAdvancedIncSubtensor1, GpuIncSubtensor, GpuSubtensor
else:
    # pygpu could not be imported: complain only if the configuration
    # actually asked for a GPU.
    if (
        config.init_gpu_device.startswith("cuda")
        or config.init_gpu_device.startswith("opencl")
        or config.device.startswith("opencl")
        or config.device.startswith("cuda")
        or config.contexts != ""
    ):
        error(
            "pygpu was configured but could not be imported or is too old (version 0.7 or higher required)",
            exc_info=True,
        )
import copy
import os
import re
from collections import deque
from typing import Union
import numpy as np
import aesara
import aesara.tensor as at
from aesara.configdefaults import config
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply, Variable
from aesara.graph.op import Op, _NoPythonOp
from aesara.graph.opt import copy_stack_trace
from aesara.graph.utils import MethodNotDefined
from aesara.link.c.interface import HideC
from aesara.link.c.op import COp, ExternalCOp
from aesara.link.c.params_type import ParamsType
from aesara.link.c.type import CType
from aesara.scalar import bool as bool_t
from aesara.scalar import int32 as int32_t
from aesara.tensor.basic import Alloc, AllocEmpty, Join, Split, infer_broadcastable
from aesara.tensor.shape import Reshape
from aesara.tensor.type import TensorType, values_eq_approx_always_true
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.fp16_help import write_w
from aesara.gpuarray.type import (
EQ_MAP,
ContextNotDefined,
GpuArrayConstant,
GpuArrayType,
GpuContextType,
get_context,
gpu_context_type,
)
def as_gpuarray_variable(x, context_name):
    """
    This will attempt to convert `x` into a variable on the GPU.

    It can take either a value or another variable. If `x` is already
    suitable, it will be returned as-is.

    Parameters
    ----------
    x
        Object to convert
    context_name : str or None
        target context name for the result
    """
    # If this is already some form of variable, try to avoid an extra transfer
    if isinstance(x, Variable):
        while True:
            # If we are already a GpuArrayVariable in the right context
            # then there is nothing to do.
            if isinstance(x.type, GpuArrayType) and x.type.context_name == context_name:
                return x
            # If x is the result of a transfer, try to dig through.
            if getattr(x, "owner", None):
                if isinstance(x.owner.op, HostFromGpu):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuFromHost):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuToGpu):
                    x = x.owner.inputs[0]
                    continue
            # If none of the conditions were met, then continue with
            # the rest of the body
            break
        # If we couldn't deal with transfers, then maybe it's a tensor
        if isinstance(x.type, TensorType):
            return copy_stack_trace(x, GpuFromHost(context_name)(x))
    # Try _as_GpuArrayVariable if possible
    if hasattr(x, "_as_GpuArrayVariable"):
        return copy_stack_trace(x, x._as_GpuArrayVariable(context_name))
    # If it didn't work try for a constant
    ctx = get_context(context_name)
    if isinstance(x, gpuarray.GpuArray):
        # Move raw GpuArray data to the requested context if needed.
        if x.context.ptr != ctx.ptr:
            x = x.transfer(ctx)
    x = gpuarray.asarray(x, context=ctx)
    bcast = [(s == 1) for s in x.shape]
    return GpuArrayConstant(
        GpuArrayType(dtype=x.dtype, broadcastable=bcast, context_name=context_name), x
    )
def infer_context_name(*vars):
    """
    Infer the GPU context name to use from the given variables.

    Performs a breadth-first search over the variables (and single-input
    parents); the first context name found wins.  Falls back to the
    default (None) context when it is defined and raises ``ValueError``
    otherwise.
    """
    # extendleft + pop(right) makes the deque behave as a FIFO queue that
    # preserves the order of the given variables.
    queue = deque()
    queue.extendleft(vars)
    while queue:
        var = queue.pop()
        if isinstance(var.type, GpuArrayType):
            return var.type.context_name
        if hasattr(var.tag, "context_name"):
            return var.tag.context_name
        if var.owner:
            if isinstance(var.owner.op, HostFromGpu):
                return var.owner.inputs[0].type.context_name
            # Only dig through single-input nodes to keep the search cheap.
            if len(var.owner.inputs) == 1:
                queue.extendleft(var.owner.inputs)
    # If we can't find a context try None if it exists
    try:
        get_context(None)
    except ContextNotDefined:
        raise ValueError("Could not infer context from inputs")
    return None
def gpuarray_helper_inc_dir():
    """Return the directory holding this module's C support headers."""
    here = os.path.dirname(__file__)
    return os.path.join(here, "c_code")
class Kernel:
    """
    This class groups together all the attributes of a gpu kernel.

    `params` should contain the data type for each argument. Buffer
    arguments should use the GpuArray class as the data type and
    scalar should use their equivalent numpy dtype. For ga_size and
    ga_ssize, use gpuarray.SIZE and gpuarray.SSIZE.

    If the `ctypes` flags is set to `True` then it should be a C
    string which represent the typecode to use.

    `flags` can contain the following keys whose values are booleans:

        have_double
            the kernel uses double-typed variables somewhere
        have_small
            the kernel uses variables whose type takes less than 4
            bytes somewhere
        have_complex
            the kernel uses complex values somewhere
        have_half
            the kernel uses half-floats somewhere
        ctypes
            the `params` list consists of C typecodes

    It can also have the key `cflags` which is a string of C flag
    values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.

    Parameters
    ----------
    code: str
        The source code of the kernel.
    params: list
        list of parameter types.
    name: str
        the name of the kernel function in the source.
    flags: dict
        dictionary of flags
    codevar: str
        the name of the variable for the code object.
        (defaults to `kcode_` + name)
    objvar: str
        the name of the variable for the kernel object.
        (defaults to `k_` + name)
    fname: str
        the name of the function wrapper.
        (defaults to name + `_call`)
    sname: str
        the name of the scheduled call function
        (defaults to name + `_scall`)

    """

    def __init__(
        self,
        code,
        params,
        name,
        flags,
        codevar=None,
        objvar=None,
        fname=None,
        sname=None,
    ):
        self.code = code
        self.params = params
        self.name = name
        self.flags = flags
        # Derive the default C-level identifiers from the kernel name.
        self.codevar = codevar if codevar is not None else "kcode_" + name
        self.objvar = objvar if objvar is not None else "k_" + name
        self.fname = fname if fname is not None else name + "_call"
        self.sname = sname if sname is not None else name + "_scall"

    @staticmethod
    def get_flags(*types):
        """Compute the flags dict implied by the given argument types."""

        def as_dtype(t):
            if isinstance(t, str):
                return np.dtype(t)
            if isinstance(t, CType):
                return t.dtype
            if isinstance(t, Variable):
                return t.type.dtype
            raise TypeError(f"can't get a dtype from {type(t)}")

        dtypes = [as_dtype(t) for t in types]
        flags = {}
        if any(d == np.float64 for d in dtypes):
            flags["have_double"] = True
        if any(d.itemsize < 4 for d in dtypes):
            flags["have_small"] = True
        if any(d.kind == "c" for d in dtypes):
            flags["have_complex"] = True
        if any(d == np.float16 for d in dtypes):
            flags["have_half"] = True
        return flags

    def _get_c_flags(self):
        # Collect the C flag names in a fixed order: explicit cflags first,
        # then one entry per boolean feature flag.
        chunks = []
        if self.flags.get("cflags", "") != "":
            chunks.append(self.flags["cflags"])
        for key, cname in (
            ("have_double", "GA_USE_DOUBLE"),
            ("have_small", "GA_USE_SMALL"),
            ("have_complex", "GA_USE_COMPLEX"),
            ("have_half", "GA_USE_HALF"),
        ):
            if self.flags.get(key, False):
                chunks.append(cname)
        joined = "|".join(chunks)
        return joined if joined else "0"

    def _get_py_flags(self):
        # Translate any C-style cflags back into the boolean flag keys.
        res = dict(self.flags)
        cflags = res.pop("cflags", "")
        c_to_py = {
            "GA_USE_DOUBLE": "have_double",
            "GA_USE_SMALL": "have_small",
            "GA_USE_COMPLEX": "have_complex",
            "GA_USE_HALF": "have_half",
        }
        for fl in cflags.split("|"):
            key = c_to_py.get(fl.strip())
            if key is not None:
                res[key] = True
        return res

    def _get_c_types(self):
        # Map each parameter to its libgpuarray typecode.
        def typecode(t):
            if t == gpuarray.GpuArray:
                return "GA_BUFFER"
            return str(gpuarray.dtype_to_typecode(t))

        return ", ".join(typecode(t) for t in self.params)
def get_ctype(dtype):
    """Map a kernel parameter type to the matching C type name."""
    if dtype is gpuarray.GpuArray:
        return "gpudata *"
    if isinstance(dtype, np.dtype):
        return "npy_" + dtype.name
    if dtype == gpuarray.SIZE:
        return "size_t"
    if dtype == gpuarray.SSIZE:
        return "ssize_t"
    # Anything else should be coercible to a numpy dtype.
    return "npy_" + np.dtype(dtype).name
class GpuKernelBase:
    """
    Base class for operations that need to compile kernels.

    It is not mandatory to use this class, but it helps with a lot of
    the small things that you have to pay attention to.
    """

    # Ops needing extra parameters can override this with a ParamsType
    # that includes gpu_context_type.
    params_type: Union[ParamsType, GpuContextType] = gpu_context_type

    def get_params(self, node):
        # Default implementation, suitable for most sub-classes.
        # To be necessarily overridden in a subclass that uses a ParamsType.
        assert (
            self.params_type is gpu_context_type
            and node.inputs
            and isinstance(node.inputs[0].type, GpuArrayType)
        )
        return node.inputs[0].type.context

    def get_gpu_context(self, node):
        # Private method used to retrieve GPU context, instead of
        # directly using self.get_params(node), as this latter may be overridden.
        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(
            gpu_context_type
        ):
            # Get field name of gpu_context_type into ParamsType object.
            gpu_context_field = self.params_type.get_field(gpu_context_type)
            # Get Params object (self.get_params() should have been overridden).
            wrap = self.get_params(node)
            # Get GPU context from Params object.
            return getattr(wrap, gpu_context_field)
        assert self.params_type is gpu_context_type
        return self.get_params(node)

    def get_gpu_context_c_name(self, params_c_name):
        # Private method used to retrieve C name of GPU context variable,
        # instead of directly using sub['params'], as params may not be a GPU context
        # (e.g. for sub-classes that use ParamsType).
        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(
            gpu_context_type
        ):
            return f"({params_c_name}->{self.params_type.get_field(gpu_context_type)})"
        assert self.params_type is gpu_context_type
        return params_c_name

    def gpu_kernels(self, node, name):
        """
        This is the method to override. This should return an iterable
        of Kernel objects that describe the kernels this op will need.
        """
        raise MethodNotDefined("gpu_kernels")

    def c_headers(self, **kwargs):
        try:
            o = super().c_headers(**kwargs)
        except MethodNotDefined:
            o = []
        return o + ["gpuarray/types.h", "numpy/npy_common.h"]

    def c_header_dirs(self, **kwargs):
        try:
            o = super().c_header_dirs(**kwargs)
        except MethodNotDefined:
            o = []
        # We rely on the input types for the directory to gpuarray includes
        return o + [np.get_include()]

    def _generate_kernel_code(self, k):
        # Escape the kernel source into a C string literal: newlines become
        # literal "\n" sequences and double quotes are backslash-escaped.
        code = "\\n".join(l for l in k.code.split("\n"))
        code = code.replace('"', '\\"')
        return """static const char *%(cname)s_unsigned = "%(code)s";
static const char *%(cname)s = (char *)%(cname)s_unsigned;
""" % dict(
            cname=k.codevar, code=code
        )

    def _generate_kernel_vars(self, k):
        return f"""GpuKernel {k.objvar};"""

    def _generate_kernel_wrap(self, k):
        # Emit a C wrapper that sets the kernel arguments and launches it,
        # plus a "scheduled" variant that lets libgpuarray pick the launch
        # dimensions for 1-d problems.
        args = []
        setargs = []
        for i, p in enumerate(k.params):
            args.append(f"{get_ctype(p)} arg{i}")
            # Buffers are passed by value, scalars by address.
            if p is gpuarray.GpuArray:
                setarg = "GpuKernel_setarg(&{0}, {1}, arg{1});"
            else:
                setarg = "GpuKernel_setarg(&{0}, {1}, &arg{1});"
            setargs.append(setarg.format(k.objvar, i))
        args = ", ".join(args)
        setargs = "\n ".join(setargs)
        return """
int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
            {args}) {{
  {setargs}
  return GpuKernel_call(&{kname}, _nd, _gdim, _ldim, _shared, NULL);
}}
int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
  size_t _gs = 0;
  size_t _ls = 0;
  int _err;
  if (_nd != 1) return GA_UNSUPPORTED_ERROR;
  _err = GpuKernel_sched(&{kname}, _n[0], &_gs, &_ls);
  if (_err != GA_NO_ERROR)
    return _err;
  {setargs}
  return GpuKernel_call(&{kname}, 1, &_gs, &_ls, _shared, NULL);
}}
""".format(
            args=args, fname=k.fname, setargs=setargs, sname=k.sname, kname=k.objvar
        )

    def c_support_code_apply(self, node, name):
        kernels = self.gpu_kernels(node, name)
        codes = "\n".join(self._generate_kernel_code(k) for k in kernels)
        return codes

    def c_support_code_struct(self, node, name):
        kernels = self.gpu_kernels(node, name)
        kvars = "\n".join(self._generate_kernel_vars(k) for k in kernels)
        wrappers = "\n".join(self._generate_kernel_wrap(k) for k in kernels)
        return kvars + "\n" + wrappers

    def _generate_zeros(self, k):
        # Zero the kernel struct so cleanup is safe even if init fails.
        return f"""memset(&{k.objvar}, 0, sizeof({k.objvar}));"""

    def _generate_kernel_init(self, k, fail, ctx):
        return """{
  int err;
  int types[%(numargs)u] = {%(types)s};
  if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ctx, 1,
                            &%(cname)s, NULL, "%(kname)s", %(numargs)u,
                            types, %(flags)s, NULL)) != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
                 err, gpucontext_error(%(ctx)s->ctx, err));
    %(fail)s
  }
}""" % dict(
            numargs=len(k.params),
            types=k._get_c_types(),
            ovar=k.objvar,
            kname=k.name,
            cname=k.codevar,
            flags=k._get_c_flags(),
            fail=fail,
            ctx=ctx,
        )

    def c_init_code_struct(self, node, name, sub):
        ctx = self.get_gpu_context_c_name(sub["params"])
        kernels = self.gpu_kernels(node, name)
        inits_0 = "\n".join(self._generate_zeros(k) for k in kernels)
        inits = "\n".join(
            self._generate_kernel_init(k, sub["fail"], ctx) for k in kernels
        )
        return "\n".join([inits_0, inits])

    def _generate_kernel_cleanup(self, k):
        return f"GpuKernel_clear(&{k.objvar});"

    def c_cleanup_code_struct(self, node, name):
        kernels = self.gpu_kernels(node, name)
        cleanups = "\n".join(self._generate_kernel_cleanup(k) for k in kernels)
        return cleanups

    # This is a shorthand for if your op only has a fixed version
    # You can reimplement it, but make sure to call kernel_version()
    def c_code_cache_version_apply(self, node):
        v = self.c_code_cache_version()
        if not v:
            return ()
        return (v, self.kernel_version(node))

    def kernel_version(self, node):
        """
        If you override :meth:`c_code_cache_version_apply`, call this
        method to have the version of the kernel support code.

        Parameters
        ----------
        node : apply node
            The node that we need the cache version for.
        """
        return (9,)
class GpuKernelBaseCOp(GpuKernelBase, COp):
    """`COp` with GPU kernel compilation support."""

    pass
class GpuKernelBaseExternalCOp(GpuKernelBase, ExternalCOp):
    """`ExternalCOp` with GPU kernel compilation support."""

    pass
def forward_string_meth(name):
    """
    Build a method `name` that concatenates `GpuKernelBase`'s output for
    that method with `ExternalCOp`'s (when the latter defines it).

    Used by `CGpuKernelBase` to merge the generated-kernel C code with the
    user-provided C sections.
    """

    def f(*args):
        out = getattr(GpuKernelBase, name)(*args)
        try:
            # Append the ExternalCOp section when it exists.
            out = out + "\n" + getattr(ExternalCOp, name)(*args)
        except MethodNotDefined:
            pass
        return out

    f.__name__ = name
    return f
def get_dtype(s):
    """
    Translate a type string from a ``#kernel`` spec into a Kernel
    parameter type: "*" is a GPU buffer, "size"/"ssize" are the
    libgpuarray size types, anything else is a numpy dtype name.
    """
    if s == "*":
        return gpuarray.GpuArray
    if s == "size":
        return gpuarray.SIZE
    if s == "ssize":
        return gpuarray.SSIZE
    return np.dtype(s)
class CGpuKernelBase(GpuKernelBaseExternalCOp, _NoPythonOp):
    """
    Class to combine GpuKernelBase and ExternalCOp.

    It adds a new section type 'kernels' where you can define kernels
    with the '#kernel' tag
    """

    SECTIONS = copy.copy(ExternalCOp.SECTIONS)
    SECTIONS.add("kernels")
    # Matches "#kernel name:types:flags" lines inside the kernels section.
    kernel_re = re.compile(r"^#kernel ([a-zA-Z_].*?)$", re.MULTILINE)
    get_params = GpuKernelBase.get_params
    # Merge GpuKernelBase's generated code with the user's C sections.
    c_support_code_apply = forward_string_meth("c_support_code_apply")
    c_support_code_struct = forward_string_meth("c_support_code_struct")
    c_init_code_struct = forward_string_meth("c_init_code_struct")
    c_cleanup_code_struct = forward_string_meth("c_cleanup_code_struct")

    def c_code_cache_version_apply(self, node):
        return GpuKernelBase.c_code_cache_version_apply(self, node)

    def _type_macros(self, node):
        # Build #define/#undef pairs exposing input/output dtypes to the
        # kernel source as DTYPE_INPUT_i / DTYPE_OUTPUT_i.
        define_template = "#define %s %s\n"
        undef_template = "#undef %s\n"
        define_macros = []
        undef_macros = []
        for i, v in enumerate(node.inputs):
            if isinstance(v.type, GpuArrayType):
                macro_name = f"DTYPE_INPUT_{i}"
                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
                define_macros.append(define_template % (macro_name, macro_value))
                undef_macros.append(undef_template % macro_name)
        for i, v in enumerate(node.outputs):
            if isinstance(v.type, GpuArrayType):
                macro_name = f"DTYPE_OUTPUT_{i}"
                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
                define_macros.append(define_template % (macro_name, macro_value))
                undef_macros.append(undef_template % macro_name)
        return "".join(define_macros), "".join(undef_macros)

    def gpu_kernels(self, node, name):
        if hasattr(self, "_cached_kernels"):
            return self._cached_kernels
        if "kernels" in self.code_sections:
            code = self.code_sections["kernels"]
            # kernel_re.split yields [prefix, spec0, body0, spec1, body1, ...]
            split = self.kernel_re.split(code)
            if split[0].strip() != "":
                raise ValueError(
                    "Stray code in kernels section before the "
                    "first #kernel statement."
                )
            def_macros, undef_macros = self._type_macros(node)
            n = 1
            res = []
            while n < len(split):
                kspec = split[n]
                kcode = split[n + 1]
                splt2 = kspec.split(":")
                if len(splt2) != 3:
                    raise ValueError(f"Bad kernel spec: {kspec}")
                kname = splt2[0].strip()
                ktypes = [get_dtype(s.strip()) for s in splt2[1].split(",")]
                kflags = splt2[2].strip()
                # Wrap the kernel body in the dtype macros.
                kcode = def_macros + "\n" + kcode + "\n" + undef_macros
                res.append(Kernel(kcode, ktypes, kname, flags=dict(cflags=kflags)))
                n += 2
            self._cached_kernels = res
            return res
        else:
            return GpuKernelBase.gpu_kernels(self, node, name)
class HostFromGpu(COp):
    """
    Transfer data to CPU.
    """

    __props__ = ()
    # This op can handle float16 data.
    _f16_ok = True

    def __str__(self):
        return "HostFromGpu(gpuarray)"

    def make_node(self, x):
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        out_var = TensorType(dtype=x.dtype, broadcastable=x.broadcastable)()
        # Keep the special comparison if there is one.
        values_eq_approx = getattr(x.tag, "values_eq_approx", None)
        if values_eq_approx:
            out_var.tag.values_eq_approx = EQ_MAP.get(
                values_eq_approx, values_eq_approx
            )
        return Apply(self, [x], [out_var])

    def perform(self, node, inp, out):
        (x,) = inp
        (z,) = out
        z[0] = np.asarray(x)

    def c_code(self, node, name, inputs, outputs, sub):
        # Copies the device buffer into a freshly-allocated ndarray, going
        # through a contiguous GPU-side copy when the input is not a single
        # memory segment.
        return """
        GpuArray %(name)s_ga_s;
        GpuArray *%(name)s_ga = NULL;
        int %(name)serr;
        PyArray_Descr *%(name)s_dtype;
        if (!GpuArray_ISONESEGMENT(&%(inp)s->ga)) {
          if (GpuArray_copy(&%(name)s_ga_s, &%(inp)s->ga, GA_C_ORDER) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Can't make contiguous copy");
            %(fail)s;
          }
          %(name)s_ga = &%(name)s_ga_s;
        } else {
          %(name)s_ga = &%(inp)s->ga;
        }
        %(name)s_dtype = typecode_to_dtype(%(name)s_ga->typecode);
        Py_XDECREF(%(out)s);
        // PyArray_Empty below steals a reference to the dtype we pass it
        // so we need an extra one to spare.
        Py_INCREF(%(name)s_dtype);
        %(out)s = (PyArrayObject *)PyArray_Empty(%(inp)s->ga.nd,
                                (npy_intp *)%(inp)s->ga.dimensions,
                                %(name)s_dtype,
                                (%(inp)s->ga.flags & GA_F_CONTIGUOUS) &&
                                !(%(inp)s->ga.flags & GA_C_CONTIGUOUS));
        if (%(out)s == NULL) {
          if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
          %(fail)s
        }
        Py_BEGIN_ALLOW_THREADS
        %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
                                    PyArray_NBYTES(%(out)s),
                                    %(name)s_ga);
        Py_END_ALLOW_THREADS
        if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
        if (%(name)serr != GA_NO_ERROR) {
          PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
          %(fail)s
        }
        """ % {
            "name": name,
            "fail": sub["fail"],
            "inp": inputs[0],
            "out": outputs[0],
        }

    def c_code_cache_version(self):
        return (2,)

    def grad(self, inputs, grads):
        (gz,) = grads
        # The gradient of a GPU->host transfer is a host->GPU transfer on
        # the same context.
        return [GpuFromHost(inputs[0].type.context_name)(gz)]

    def R_op(self, inputs, eval_points):
        (ev,) = eval_points
        return [self(ev)]

    def infer_shape(self, fgraph, node, xshp):
        return xshp


# Singleton instance; HostFromGpu has no parameters.
host_from_gpu = HostFromGpu()
class GpuFromHost(COp):
    """
    Transfer data to GPU.
    """

    __props__ = ("context_name",)
    # This op can handle float16 data.
    _f16_ok = True
    params_type = gpu_context_type

    def __init__(self, context_name):
        self.context_name = context_name

    def __str__(self):
        return f"GpuFromHost<{self.context_name}>"

    def make_node(self, x):
        if not isinstance(x.type, TensorType):
            raise TypeError(x)
        if "complex" in x.dtype:
            raise TypeError("complex not supported in the new gpuarray back-end.", x)
        out_var = GpuArrayType(
            broadcastable=x.broadcastable, context_name=self.context_name, dtype=x.dtype
        )()
        # Keep the special comparison if there is one.
        values_eq_approx = getattr(x.tag, "values_eq_approx", None)
        if values_eq_approx:
            out_var.tag.values_eq_approx = EQ_MAP.get(
                values_eq_approx, values_eq_approx
            )
        return Apply(self, [x], [out_var])

    def get_params(self, node):
        return get_context(self.context_name)

    def perform(self, node, inp, out, ctx):
        (x,) = inp
        (z,) = out
        z[0] = gpuarray.array(x, context=ctx)

    def grad(self, inputs, grads):
        (gz,) = grads
        # The gradient of a host->GPU transfer is a GPU->host transfer.
        return [
            as_gpuarray_variable(gz, context_name=self.context_name).transfer("cpu")
        ]

    def R_op(self, inputs, eval_points):
        (ev,) = eval_points
        return [self(ev)]

    def infer_shape(self, fgraph, node, xshp):
        return xshp

    def c_headers(self, **kwargs):
        return ["gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        # Reuses the output buffer when it already has the right shape,
        # typecode and C-contiguity; otherwise allocates a fresh one.
        return """
        PyArrayObject *%(name)s_tmp;
        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
        int err;
        if (%(name)s_tmp == NULL)
          %(fail)s
        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
            !aesara_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
                               (size_t *)PyArray_DIMS(%(name)s_tmp),
                               get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
          Py_XDECREF(%(out)s);
          %(out)s = pygpu_empty(PyArray_NDIM(%(name)s_tmp),
                                (size_t *)PyArray_DIMS(%(name)s_tmp),
                                get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
                                GA_C_ORDER, %(ctx)s, Py_None);
          if (%(out)s == NULL) {
            Py_DECREF(%(name)s_tmp);
            %(fail)s;
          }
        }
        Py_BEGIN_ALLOW_THREADS
        err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
                             PyArray_NBYTES(%(name)s_tmp));
        Py_END_ALLOW_THREADS
        Py_DECREF(%(name)s_tmp);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
          %(fail)s;
        }
        """ % {
            "name": name,
            "inp": inputs[0],
            "ctx": sub["params"],
            "out": outputs[0],
            "fail": sub["fail"],
        }

    def c_code_cache_version(self):
        return (10,)
class GpuToGpu(COp):
    """
    Transfer data between GPUs.

    Parameters
    ----------
    context_name : str
        Name of the destination GPU context.
    """

    __props__ = ("context_name",)
    # This op can handle float16 data.
    _f16_ok = True
    params_type = gpu_context_type

    def __init__(self, context_name):
        self.context_name = context_name

    def __str__(self):
        return f"GpuToGpu<{self.context_name}>"

    def make_node(self, x):
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        return Apply(
            self,
            [x],
            [
                GpuArrayType(
                    broadcastable=x.broadcastable,
                    context_name=self.context_name,
                    dtype=x.dtype,
                )()
            ],
        )

    def get_params(self, node):
        return get_context(self.context_name)

    def perform(self, node, inp, out, ctx):
        (x,) = inp
        (z,) = out
        z[0] = x.transfer(ctx)

    def grad(self, inputs, grads):
        (gz,) = grads
        # Transfer the gradient back to the input's context.
        return [GpuToGpu(inputs[0].type.context_name)(gz)]

    def R_op(self, inputs, eval_points):
        # Fix: R_op must return a *list* of output perturbations (one per
        # output), matching HostFromGpu.R_op and GpuFromHost.R_op above.
        # Previously a bare Variable was returned, which breaks callers
        # (e.g. aesara.gradient.Rop) that index the result as a list.
        return [self(eval_points[0])]

    def infer_shape(self, fgraph, node, xshp):
        return xshp

    def c_code(self, node, name, inputs, outputs, sub):
        # Allocates an output on the destination context (preserving the
        # input's memory order) and issues a device-to-device transfer.
        return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_empty(%(inp)s->ga.nd,
                              %(inp)s->ga.dimensions,
                              %(inp)s->ga.typecode,
                              GpuArray_IS_C_CONTIGUOUS(&(%(inp)s->ga)) ? GA_C_ORDER:GA_F_ORDER,
                              %(ctx)s, Py_None);
        if (%(out)s == NULL) {
          %(fail)s
        }
        if (pygpu_transfer(%(out)s, %(inp)s)) {
          %(fail)s
        }
        """ % {
            "inp": inputs[0],
            "ctx": sub["params"],
            "out": outputs[0],
            "fail": sub["fail"],
        }

    def c_code_cache_version(self):
        return (1,)
class GpuAlloc(HideC, Alloc):
    """
    Allocate initialized memory on the GPU.

    Parameters
    ----------
    context_name : str
        The name of the context in which to allocate memory
    memset_0 : bool
        It's only an optimized version. True, it means the
        value is always 0, so the c code call memset as it is faster.

    """

    __props__ = ("memset_0", "context_name")
    _f16_ok = True
    params_type = ParamsType(context=gpu_context_type, memset_0=bool_t)

    def __init__(self, context_name, memset_0=False):
        self.context_name = context_name
        self.memset_0 = memset_0

    def get_params(self, node):
        # Resolve the context lazily (by name) so the op itself stays
        # hashable/picklable.
        return self.params_type.get_params(
            context=get_context(self.context_name), memset_0=self.memset_0
        )

    def __str__(self):
        # Hide the memset parameter when not used to prevent confusion.
        if self.memset_0:
            m = "{memset_0=True}"
        else:
            m = ""
        return f"{self.__class__.__name__}<{self.context_name}>{m}"

    def make_node(self, value, *shape):
        value = as_gpuarray_variable(value, context_name=self.context_name)
        sh, bcast = infer_broadcastable(shape)
        if value.ndim > len(sh):
            # BUG FIX: the original code constructed this TypeError but never
            # raised it, so an over-dimensioned value silently slipped through.
            raise TypeError(
                "The GpuAlloc value to use has more dimensions "
                "than the specified shape",
                value.ndim,
                len(sh),
            )
        otype = value.type.clone(broadcastable=bcast)
        return Apply(self, [value] + sh, [otype()])

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>"]

    def perform(self, node, inputs, outs, params):
        (out,) = outs
        v = inputs[0]
        sh = tuple(map(int, inputs[1:]))
        if out[0] is None or out[0].shape != sh:
            if self.memset_0:
                # All-zero fill: pygpu.zeros is faster than empty + broadcast.
                out[0] = gpuarray.zeros(sh, dtype=v.dtype, context=params.context)
            else:
                out[0] = gpuarray.empty(sh, dtype=v.dtype, context=params.context)
                out[0][...] = v
        else:
            # Reuse the previously allocated buffer; just refill it.
            out[0][...] = v

    def c_code(self, node, name, inp, out, sub):
        vv = inp[0]
        ndim = len(inp[1:])
        (zz,) = out
        # Read each scalar shape input from its host-side ndarray.
        code = """
        int i;
        size_t %(name)s_shape[%(ndim)s];
        """ % dict(
            name=name, ndim=ndim
        )
        for i, shp_i in enumerate(inp[1:]):
            code += """
        %(name)s_shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
        """ % dict(
                name=name, i=i, shp_i=shp_i
            )
        code += """
        int need_new_out = (NULL == %(zz)s || %(zz)s->ga.nd != %(ndim)s);
        if (!need_new_out)
            for (i = 0; i < %(ndim)s; i++)
                need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
        if (need_new_out && (%(params)s->memset_0)) {
            //pygpu_zeros can be faster then empty followed by memset.
            Py_XDECREF(%(zz)s);
            %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
                                 %(vv)s->ga.typecode, GA_C_ORDER,
                                 %(params)s->context, Py_None);
            if (!%(zz)s) {
                %(fail)s
            }
        } else {
            if (need_new_out) {
                Py_XDECREF(%(zz)s);
                %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
                                     %(vv)s->ga.typecode, GA_C_ORDER,
                                     %(params)s->context, Py_None);
                if (!%(zz)s) {
                    %(fail)s
                }
            }
            if (%(params)s->memset_0 && GpuArray_ISONESEGMENT(&%(zz)s->ga))
            {
                int err = GpuArray_memset(&%(zz)s->ga, 0);
                if (err != GA_NO_ERROR)
                {
                    PyErr_Format(PyExc_MemoryError,
                                 "GpuAlloc: Error memsetting %%llu"
                                 " element of device memory to 0.",
                                 (unsigned long long)PyGpuArray_SIZE(%(zz)s));
                    %(fail)s;
                }
            }
            else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) !=
                     GA_NO_ERROR) {
                PyErr_SetString(PyExc_ValueError, "setarray failed");
                %(fail)s
            }
        }
        """ % dict(
            name=name, ndim=ndim, zz=zz, vv=vv, params=sub["params"], fail=sub["fail"]
        )
        return code

    def c_code_cache_version(self):
        return (4,)

    def do_constant_folding(self, fgraph, node):
        from . import blas, subtensor

        for client in fgraph.clients[node.outputs[0]]:
            if client[0] == "output":
                # If the output is a constant, it will have to be deepcopied
                # each time the function is called. So we do not fold.
                return False
            # The following ops work inplace of their input id 0.
            elif (
                client[1] == 0
                and
                # Ops that will work inplace on the Alloc. So if they
                # get constant_folded, they would copy the
                # constant and this is less efficients.
                # Not doing the constant folding could also lower
                # the peak memory usage, as we the "constant" won't
                # always exists.
                isinstance(
                    client[0].op,
                    (
                        subtensor.GpuIncSubtensor,
                        subtensor.GpuAdvancedIncSubtensor1,
                        subtensor.GpuAdvancedIncSubtensor1_dev20,
                        subtensor.GpuAdvancedIncSubtensor,
                        blas.GpuGemm,
                        blas.GpuGemv,
                        blas.GpuGer,
                    ),
                )
            ):
                return False
            # If the clients is a transfer, we don't want to fold. We
            # let the moving opt finish before deciding what to do.
            elif isinstance(client[0].op, HostFromGpu):
                return False
        return True
class GpuAllocEmpty(HideC, AllocEmpty):
    """
    Allocate uninitialized memory on the GPU.

    """

    __props__ = ("dtype", "context_name")
    _f16_ok = True
    params_type = ParamsType(context=gpu_context_type, typecode=int32_t)

    def __init__(self, dtype, context_name):
        self.dtype = dtype
        self.context_name = context_name

    @property
    def typecode(self):
        # libgpuarray typecode corresponding to self.dtype.
        return gpuarray.dtype_to_typecode(self.dtype)

    def get_params(self, node):
        return self.params_type.get_params(
            context=get_context(self.context_name), typecode=self.typecode
        )

    def make_node(self, *shape):
        sh, bcast = infer_broadcastable(shape)
        output = GpuArrayType(
            dtype=self.dtype, broadcastable=bcast, context_name=self.context_name
        )()
        output.tag.values_eq_approx = values_eq_approx_always_true
        # The output can contain nan/inf.
        output.type.filter_checks_isfinite = False
        output.tag.nan_guard_mode_check = False
        return Apply(self, sh, [output])

    def debug_perform(self, node, inputs, out_, params):
        # Fill with a sentinel so reads of "uninitialized" memory are obvious
        # in DebugMode.
        self.perform(node, inputs, out_, params)
        out_[0][0][:] = -123456789

    def perform(self, node, inputs, out_, params):
        out = out_[0]
        # BUG FIX: ``out[0].shape`` is a tuple; the original compared it to a
        # list, which is never equal, so the buffer was reallocated every call.
        sh = tuple(int(i) for i in inputs)
        if out[0] is None or out[0].shape != sh:
            out[0] = pygpu.empty(sh, dtype=self.dtype, context=params.context)
        # if out[0] is the right shape, we just return it

    def c_headers(self, **kwargs):
        return ["<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inp, out, sub):
        ndim = len(inp)
        zz = out[0]
        fail = sub["fail"]
        code = [
            f"""
        int i;
        size_t shape[{ndim}];
        """
        ]
        # Read each scalar shape input from its host-side ndarray.
        for i, shp_i in enumerate(inp):
            code.append(
                """
        shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
        """
                % dict(i=i, shp_i=shp_i)
            )
        # aesara_prep_output reuses the existing buffer when compatible.
        code.append(
            """
        if (aesara_prep_output(&%(zz)s, %(ndim)s, shape, %(params)s->typecode, GA_C_ORDER,
                               %(params)s->context)) {
            %(fail)s
        }
        """
            % dict(zz=zz, ndim=ndim, fail=fail, params=sub["params"])
        )
        return "".join(code)

    def c_code_cache_version(self):
        return (2,)

    def do_constant_folding(self, fgraph, node):
        # Folding would materialize (and copy) uninitialized memory: pointless.
        return False

    def infer_shape(self, fgraph, node, input_shapes):
        # The inputs *are* the output shape.
        return [node.inputs]

    def grad(self, *args):
        # Don't reuse the grad implementation from Alloc
        raise NotImplementedError("grad disabled")
def empty_like(var):
    """Return an uninitialized GPU array shaped like *var* (same dtype/context)."""
    vtype = var.type
    alloc_op = GpuAllocEmpty(vtype.dtype, vtype.context_name)
    return alloc_op(*var.shape)
class GpuContiguous(Op):
    """
    Return a C contiguous version of the input.

    This may either pass the object as-is (if already C contiguous) or
    make a copy.

    """

    __props__ = ()
    # Output may alias input (the pass-through case), hence the view_map.
    view_map = {0: [0]}
    _f16_ok = True

    def grad(self, inputs, dout):
        # Identity gradient: just move dout onto the input's context.
        (x,) = inputs
        (dout,) = dout
        dout = as_gpuarray_variable(dout, context_name=infer_context_name(x))
        return [dout]

    def make_node(self, input):
        input = as_gpuarray_variable(input, context_name=infer_context_name(input))
        return Apply(self, [input], [input.type()])

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return ["<gpuarray_helper.h>"]

    def c_code_cache_version(self):
        return (4,)

    def c_code(self, node, name, inp, out, sub):
        # Three cases: input already contiguous -> alias it; no reusable
        # output -> fresh C-ordered copy; compatible output -> in-place move.
        return """
        {
            if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))) {
                Py_XDECREF(%(z)s);
                %(z)s = %(input)s;
                Py_INCREF(%(z)s);
            } else if (NULL == %(z)s
                || !aesara_size_check(%(z)s, PyGpuArray_NDIM(%(input)s), PyGpuArray_DIMS(%(input)s),
                                      %(input)s->ga.typecode)
                || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = pygpu_copy(%(input)s, GA_C_ORDER);
                if (!%(z)s)
                {
                    %(fail)s;
                }
            } else if(pygpu_move(%(z)s, %(input)s) == -1) {
                %(fail)s;
            }
        }
        """ % dict(
            input=inp[0], z=out[0], fail=sub["fail"]
        )

    def perform(self, node, inp, out_):
        (x,) = inp
        (out,) = out_
        # pygpu returns x itself when already contiguous (matches view_map).
        out[0] = pygpu.ascontiguousarray(x)
# Module-level singleton: GpuContiguous has no parameters (__props__ = ()).
gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, Reshape):
    """
    Reshape for GPU variables.

    """

    _f16_ok = True

    # __hash__, __eq__, __str__ come from Reshape
    def make_node(self, x, shp):
        ctx_name = infer_context_name(x)
        x = as_gpuarray_variable(x, context_name=ctx_name)
        shp = at.as_tensor_variable(shp)
        # Delegate dtype/broadcastable inference to the CPU Reshape by
        # building a throw-away CPU graph, then rebuild the type on the GPU.
        res = x.transfer("cpu").reshape(shp, ndim=self.ndim)
        otype = GpuArrayType(
            dtype=res.dtype, broadcastable=res.broadcastable, context_name=ctx_name
        )
        return Apply(self, [x, shp], [otype()])

    def perform(self, node, inp, out_, params):
        x, shp = inp
        (out,) = out_
        if len(shp) != self.ndim:
            raise ValueError(
                "shape argument to GpuReshape.perform"
                " has incorrect length %i"
                ", should be %i" % (len(shp), self.ndim),
                shp,
            )
        if shp.prod() != x.size:
            # We need to do check here to raise the same error as NumPy.
            # We should make pygpu do the same.
            ss = 1
            nb_m1 = 0
            for i in shp:
                if i == -1:
                    nb_m1 += 1
                else:
                    ss *= i
            if nb_m1 > 1:
                raise ValueError("Only one -1 is accepted in the new shape")
            elif nb_m1 == 1:
                if (x.size % ss) != 0:
                    raise ValueError(
                        "When using -1 in new shape, the computed new shape must be an multiple of the original shape."
                    )
            else:
                raise ValueError("total size of new array must be unchanged")
        out[0] = x.reshape(tuple(shp))

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        x, shape = inputs
        (output,) = outputs
        sdtype = node.inputs[1].type.dtype_specs()[1]
        just_fail = sub["fail"]
        # "fail" frees new_dims first; "just_fail" is for paths where it was
        # never allocated (or is freed by the caller path).
        fail = """{
            free(new_dims);
            %(just_fail)s
        }""" % dict(
            just_fail=just_fail
        )
        params = sub["params"]
        return (
            """
        size_t old_size = 1, new_size = 1;
        size_t* new_dims = NULL;
        int compute_axis = -1;

        assert (PyArray_NDIM(%(shape)s) == 1);
        if (PyArray_DIM(%(shape)s, 0) != %(params)s->ndim)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: given shape is of incorrect "
                         "length (%%d should be %%d).",
                         PyArray_DIM(%(shape)s, 0), %(params)s->ndim);
            %(just_fail)s;
        }

        new_dims = (size_t*) malloc(sizeof(size_t) * %(params)s->ndim);
        if (new_dims == NULL) {
            PyErr_NoMemory();
            %(just_fail)s
        }

        for (size_t i = 0; i < %(x)s->ga.nd; ++i)
            old_size *= %(x)s->ga.dimensions[i];

        for (size_t i = 0; i < %(params)s->ndim; ++i)
        {
            new_dims[i] = ((%(sdtype)s*)(
                    PyArray_BYTES(%(shape)s) +
                    i * PyArray_STRIDES(%(shape)s)[0]))[0];
            if (new_dims[i] == -1)
            {
                if (compute_axis != -1)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuReshape: only one -1 is accepted "
                                 "in the new shape, but got two at "
                                 "indices %%d and %%zu.",
                                 compute_axis, i);
                    %(fail)s;
                }
                compute_axis = i;
            }
            else
                new_size *= new_dims[i];
        }

        if (compute_axis == -1 && new_size != old_size)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: trying to reshape an array of "
                         "total size %%zu into an array of total size "
                         "%%zu.", old_size, new_size);
            %(fail)s;
        }
        else if (compute_axis != -1 && old_size %% new_size != 0)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: -1 axis found at index %%d in "
                         "new shape but the total size of the array "
                         "(%%zu) is not divisible by the given shapes "
                         "(%%zu).", compute_axis, old_size, new_size);
            %(fail)s;
        }

        Py_XDECREF(%(output)s);
        %(output)s = pygpu_reshape(%(x)s, %(params)s->ndim, new_dims,
                                   GA_C_ORDER, 0, compute_axis);
        free(new_dims);
        if (%(output)s == NULL)
        {
            %(just_fail)s;
        }
        """
            % locals()
        )
class GpuJoin(HideC, Join):
    """
    Join for GPU.

    """

    _f16_ok = True
    __props__ = ("view",)
    params_type = gpu_context_type

    def __init__(self, view=-1):
        # view >= 0 means: if every *other* tensor is empty along the join
        # axis, return tensor `view` as a view instead of concatenating.
        self.view = view
        if view != -1:
            # since the first input is always the axis, the tensors
            # start from index 1.
            self.view_map = {0: [1 + view]}

    def __str__(self):
        return Join.__str__(self)

    def make_node(self, axis, *tensors):
        # Reuse the CPU Join's validation/inference, then move the tensors
        # onto a common GPU context.
        node = Join.make_node(self, axis, *tensors)

        ctx_name = infer_context_name(*tensors)

        def agv(v):
            return as_gpuarray_variable(v, context_name=ctx_name)

        return Apply(
            self,
            [node.inputs[0]] + list(map(agv, tensors)),
            [
                GpuArrayType(
                    broadcastable=node.outputs[0].broadcastable,
                    dtype=node.outputs[0].dtype,
                    context_name=ctx_name,
                )()
            ],
        )

    def get_params(self, node):
        return node.outputs[0].type.context

    def perform(self, node, axis_and_tensors, out_, ctx):
        (out,) = out_
        view = self.view
        axis = int(axis_and_tensors[0])
        tensors = axis_and_tensors[1:]

        if axis < -axis_and_tensors[1].ndim:
            raise IndexError
        if axis < 0:
            axis += axis_and_tensors[1].ndim
        # we check these tensors for being empty.
        if (view != -1) and np.all(
            [
                tensor.shape[axis] == 0
                for tensor in tensors[0:view] + tensors[view + 1 :]
            ]
        ):
            out[0] = tensors[view]
        else:
            out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
                node.outputs[0].dtype
            )

    def c_code_cache_version(self):
        return (3,)

    def c_support_code(self, **kwargs):
        return """
        #if PY_MAJOR_VERSION >= 3
        #define PyInt_AsLong PyLong_AsLong
        #endif
        """

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>"]

    def c_code(self, node, name, inputs, out_, sub):
        axis, tensors = inputs[0], inputs[1:]
        copy_to_list = []
        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        view = self.view
        # NOTE(review): with view == -1 this picks tensors[-1], but the C code
        # only uses it inside the `view != -1` guard — confirm it stays dead.
        non_empty_tensor = tensors[view]
        for i, inp in enumerate(tensors):
            copy_to_list.append(f"als[{i}] = &{inp}->ga;")

        n = len(tensors)
        fail = sub["fail"]
        out = out_[0]
        copy_inputs_to_list = "\n".join(copy_to_list)
        ctx = sub["params"]

        code = (
            """
        const GpuArray **als = (const GpuArray **)PyMem_Malloc(sizeof(GpuArray *) *
                                                               %(n)s);
        if (als == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }
        %(copy_inputs_to_list)s
        Py_XDECREF(%(out)s);
        {
            int axis = PyInt_AsLong((PyObject *)%(axis)s);
            if (axis < 0) {
                if (axis == -1 && PyErr_Occurred()) {
                    %(fail)s
                }
                axis += als[0]->nd;
                if (axis < 0) {
                    PyErr_SetString(PyExc_IndexError, "invalid axis");
                    %(fail)s
                }
            }

            int tensors_lens_sum;
            if(%(view)s != -1) {
                tensors_lens_sum = 0;
                for(int i=0; i < %(n)s; i++){
                    tensors_lens_sum += als[i]->dimensions[axis];
                }
                tensors_lens_sum -= PyGpuArray_DIM(%(non_empty_tensor)s, axis);
            }

            if(%(view)s != -1 && tensors_lens_sum == 0) {
                Py_INCREF(%(non_empty_tensor)s);
                %(out)s = %(non_empty_tensor)s;
            }else{
                %(out)s = pygpu_concatenate(als, %(n)s, axis,
                                            %(restype)s, (PyObject *)&PyGpuArrayType,
                                            %(ctx)s);
            }
        }
        PyMem_Free(als);
        if (%(out)s == NULL)
            %(fail)s

        """
            % locals()
        )
        return code
# Module-level singleton with the default (no-view) behavior.
gpu_join = GpuJoin()
class GpuSplit(HideC, Split, _NoPythonOp):
    """
    Split for GPU.

    """

    _f16_ok = True

    def __init__(self, len_splits):
        super().__init__(len_splits)
        # The GPU version of Split returns splits as views of the input.
        self.view_map = {}
        for i in range(self.len_splits):
            self.view_map[i] = [0]

    def make_node(self, x, axis, splits):
        # Reuse the CPU Split's validation, then retype outputs on the GPU.
        node = Split.make_node(self, x, axis, splits)
        x = as_gpuarray_variable(x, infer_context_name(x))
        outs = [
            GpuArrayType(
                dtype=o.dtype,
                broadcastable=o.broadcastable,
                context_name=x.type.context_name,
            )()
            for o in node.outputs
        ]
        return Apply(self, [x] + node.inputs[1:], outs)

    # we reuse the perform of the CPU op, which is suitable

    def c_code_cache_version(self):
        return (2,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        if self.len_splits == 0:
            # There are no outputs, then nothing to do.
            return ""

        # outputs_pointers lists the addresses of the pointers to the outputs.
        outputs_pointers = "&" + (", &".join(outputs))
        x, axis, splits = inputs
        fail = sub["fail"]
        splits_dtype = node.inputs[2].type.dtype_specs()[1]
        axis_dtype = node.inputs[1].type.dtype_specs()[1]
        expected_splits_count = self.len_splits

        # NOTE(review): the error messages below print size_t values with
        # %%ld; %%zu would be the portable specifier — confirm before reuse.
        main_code = """
        int ndim = PyGpuArray_NDIM(%(x)s);
        int axis = (int)(*(%(axis_dtype)s*)PyArray_GETPTR1(%(axis)s, 0));
        int splits_count = PyArray_DIM(%(splits)s, 0);
        size_t len_along_axis, sum_of_splits = 0;
        %(splits_dtype)s current_split_length;
        size_t* split_points = NULL;
        GpuArray* split_views = NULL;
        GpuArray** split_views_pointers = NULL;
        int i, j;
        PyGpuArrayObject** outputs[] = {%(outputs_pointers)s};

        /* Check inputs. */

        if (splits_count != %(expected_splits_count)s) {
            PyErr_Format(PyExc_ValueError,
                "GpuSplit: splits count (%%d) != expected count (%%d).", splits_count, %(expected_splits_count)s);
            %(fail)s
        }
        if (axis < 0) {
            axis += ndim;
        }
        if (axis < 0 || axis >= ndim) {
            PyErr_Format(PyExc_IndexError, "GpuSplit: invalid axis %%d for a %%d-D array.", axis, ndim);
            %(fail)s
        }
        len_along_axis = PyGpuArray_DIM(%(x)s, axis);
        for (i = 0; i < splits_count; ++i) {
            current_split_length = *(%(splits_dtype)s*)PyArray_GETPTR1(%(splits)s, i);
            if (current_split_length < 0) {
                PyErr_Format(PyExc_ValueError,
                    "GpuSplit: you try to take a negative number (%%ld) of elements.", current_split_length);
                %(fail)s
            }
            sum_of_splits += current_split_length;
        }
        if (sum_of_splits != len_along_axis) {
            PyErr_Format(PyExc_ValueError, "GpuSplit: the splits sums to %%ld, expected %%ld.", sum_of_splits, len_along_axis);
            %(fail)s
        }

        /* Compute splits views. */

        split_points = (size_t*) malloc((splits_count - 1) * sizeof(size_t));
        if (split_points == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }
        split_points[0] = (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, 0) );
        for(i = 1; i < splits_count - 1; ++i) {
            split_points[i] = split_points[i - 1] + (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, i) );
        }
        split_views = (GpuArray*) malloc(splits_count * sizeof(GpuArray));
        split_views_pointers = (GpuArray**) malloc(splits_count * sizeof(GpuArray*));
        if (split_views == NULL || split_views_pointers == NULL) {
            PyErr_NoMemory();
            free(split_views_pointers);
            free(split_views);
            free(split_points);
            %(fail)s
        }
        for (i = 0; i < splits_count; ++i) {
            split_views_pointers[i] = split_views + i;
        }
        if (GpuArray_split(split_views_pointers, &%(x)s->ga, splits_count - 1, split_points, axis) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to compute split.");
            for (i = 0; i < splits_count; ++i) {
                GpuArray_clear(split_views_pointers[i]);
            }
            free(split_views_pointers);
            free(split_views);
            free(split_points);
            %(fail)s
        }

        /* Put split views into outputs. */
        for (i = 0; i < splits_count; ++i) {
            PyGpuArrayObject** output = outputs[i];
            Py_XDECREF(*output);
            *output = pygpu_fromgpudata(
                split_views[i].data,
                split_views[i].offset,
                split_views[i].typecode,
                split_views[i].nd,
                split_views[i].dimensions,
                split_views[i].strides,
                %(x)s->context,
                1, // output is writable
                Py_None, Py_None
            );
            if (*output == NULL) {
                PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to update an output from a split view.");
                for (j = 0; j < splits_count; ++j) {
                    GpuArray_clear(split_views_pointers[j]);
                }
                free(split_views_pointers);
                free(split_views);
                free(split_points);
                %(fail)s
            }
        }

        /* Free memory. */
        for (i = 0; i < splits_count; ++i) {
            GpuArray_clear(split_views_pointers[i]);
        }
        free(split_views_pointers);
        free(split_views);
        free(split_points);
        """

        return main_code % locals()
@aesara.compile.profiling.register_profiler_printer
def profile_printer(
    message, compile_time, fct_call_time, apply_time, apply_cimpl, outputs_size, file
):
    """
    Print GPU-specific profiling information (cpu/gpu/transfer time split,
    float64 inputs, and applies that introduce float64 outputs).

    Only prints anything when at least one profiled apply is a GPU op.
    """
    # Idiom fix: use generator expressions instead of materialized lists.
    if any(
        x.op.__class__.__name__.lower().startswith("gpu")
        for (fgraph, x) in apply_time.keys()
    ):
        local_time = sum(apply_time.values())
        print("", file=file)
        print("Some info useful for gpu:", file=file)

        fgraphs = {fgraph for fgraph, node in apply_time.keys()}

        cpu = 0
        gpu = 0
        trans = 0
        for (fgraph, node), t in apply_time.items():
            if isinstance(node.op, (HostFromGpu, GpuFromHost)):
                trans += t
            elif node.op.__class__.__name__.lower().startswith("gpu"):
                gpu += t
            else:
                cpu += t
        print("", file=file)
        # NOTE(review): local_time could theoretically be 0 — kept as-is.
        print(
            # Typo fix: "transfert" -> "transfer".
            "    Spent %.3fs(%.2f%%) in cpu Op, %.3fs(%.2f%%) in gpu Op and %.3fs(%.2f%%) transfer Op"
            % (
                cpu,
                cpu / local_time * 100,
                gpu,
                gpu / local_time * 100,
                trans,
                trans / local_time * 100,
            ),
            file=file,
        )

        print("", file=file)
        print("    Aesara function input that are float64", file=file)
        print("    <fct name> <input name> <input type> <str input>", file=file)
        for fg in fgraphs:
            for i in fg.inputs:
                if hasattr(i.type, "dtype") and i.type.dtype == "float64":
                    print("        ", fg.name, i.name, i.type, i, file=file)

        print("", file=file)
        print(
            "    List of apply that don't have float64 as input but have float64 in outputs",
            file=file,
        )
        print(
            "    (Useful to know if we forgot some cast when using floatX=float32 or gpu code)",
            file=file,
        )
        print(
            "    <Apply> <Apply position> <fct name> <inputs type> <outputs type>",
            file=file,
        )
        for fg in fgraphs:
            for idx, node in enumerate(fg.toposort()):
                if any(
                    hasattr(i, "dtype") and i.dtype == "float64" for i in node.outputs
                ) and not any(
                    hasattr(i, "dtype") and i.dtype == "float64" for i in node.inputs
                ):
                    print("        ", str(node), idx, fg.name, end=" ", file=file)
                    print(
                        str([getattr(i, "dtype", None) for i in node.inputs]),
                        end=" ",
                        file=file,
                    )
                    print(
                        str([getattr(i, "dtype", None) for i in node.outputs]),
                        file=file,
                    )
        print("", file=file)
class GpuEye(GpuKernelBaseCOp, _NoPythonOp):
    """
    Eye for GPU.

    """

    __props__ = ("dtype", "context_name")
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        # n, m: output dimensions; k: diagonal offset (scalars).
        n = at.as_tensor_variable(n)
        m = at.as_tensor_variable(m)
        k = at.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )

        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, fgraph, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # Integer/index inputs: gradient is undefined everywhere.
        return [grad_undefined(self, i, inp[i]) for i in range(3)]

    def gpu_kernels(self, node, name):
        # One work-group walks the diagonal with stride LDIM_0.
        code = """#include "cluda.h"

KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    ga_size nb = (ga_size) min(n - roff, m - coff);
    for (ga_size i = LID_0; i < nb; i += LDIM_0) {
        a[(i + roff)*m + i + coff] = %(write_a)s(1);
    }
}""" % dict(
            ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name,
            write_a=write_w(self.dtype),
        )
        return [
            Kernel(
                code=code,
                name="eye",
                params=[
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(self.dtype),
                objvar="k_eye_" + name,
            )
        ]

    def c_code(self, node, name, inp, out, sub):
        # NOTE(review): make_node always supplies 3 inputs; the 2-input branch
        # below would leave %(k)s as a plain int and break the template.
        if len(inp) == 2:
            n, m = inp
            k = 0
        elif len(inp) == 3:
            n, m, k = inp

        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = (
            """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        size_t col_off;
        size_t row_off;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        /* Launch only when the diagonal intersects the output. */
        col_off = (size_t) (k > 0?k:0);
        row_off = (size_t) (k < 0?-k:0);
        if (row_off < dims[0] && col_off < dims[1]) {
            err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                           dims[0], dims[1], k);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: kEye: %%s. n=%%lu, m=%%lu.",
                             GpuKernel_error(&%(kname)s, err),
                             (unsigned long)dims[0], (unsigned long)dims[1]);
                %(fail)s;
            }
        }

        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        # BUG FIX above: error message printed "n%lu" (missing '='); bumped
        # the version since the generated C changed.
        return (11,)
class GpuTri(GpuKernelBaseCOp, _NoPythonOp):
    """
    Tri for GPU.

    """

    __props__ = ("dtype", "context_name")
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        # n, m: output dimensions; k: diagonal offset (scalars).
        n = at.as_tensor_variable(n)
        m = at.as_tensor_variable(m)
        k = at.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )

        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, fgraph, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # Integer/index inputs: gradient is undefined everywhere.
        return [grad_undefined(self, i, inp[i]) for i in range(3)]

    def gpu_kernels(self, node, name):
        # One work-group fills rows with stride LDIM_0; each row writes ones
        # up to (and including) its diagonal column.
        code = """#include "cluda.h"

KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
            a[(i + roff)*m + j] = %(write_a)s(1);
        }
    }
}""" % dict(
            ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name,
            write_a=write_w(self.dtype),
        )
        return [
            Kernel(
                code=code,
                name="tri",
                params=[
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(self.dtype),
                objvar="k_tri_" + name,
            )
        ]

    def c_code(self, node, name, inp, out, sub):
        # NOTE(review): make_node always supplies 3 inputs; the 2-input branch
        # below would leave %(k)s as a plain int and break the template.
        if len(inp) == 2:
            n, m = inp
            k = 0
        elif len(inp) == 3:
            n, m, k = inp

        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = (
            """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                       dims[0], dims[1], k);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: kTri: %%s. n=%%lu, m=%%lu.",
                         GpuKernel_error(&%(kname)s, err),
                         (unsigned long)dims[0], (unsigned long)dims[1]);
            %(fail)s;
        }

        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        # BUG FIX above: error message printed "n%lu" (missing '='); bumped
        # the version since the generated C changed.
        return (2,)
import aesara
from aesara.compile import optdb
from aesara.gpuarray.basic_ops import (
CGpuKernelBase,
GpuArrayType,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.opt_util import inplace_allocempty
from aesara.graph.basic import Apply
from aesara.graph.opt import LocalOptGroup, in2out
from aesara.link.c.op import _NoPythonCOp
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor.basic import as_tensor_variable
try:
import pygpu
from pygpu import blas
except ImportError:
# To make sure aesara is importable
pass
class BlasOp(_NoPythonCOp):
    """Base class for GPU BLAS ops: shared C headers and pygpu BLAS init."""

    def c_headers(self, **kwargs):
        return ["<blas_api.h>", "<numpy_compat.h>", "<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_init_code(self, **kwargs):
        # Imports the pygpu BLAS C API (pygpu_blas_r* functions) at
        # module-init time so the generated C can call it.
        return ["import_pygpu__blas();"]
class GpuGemv(BlasOp):
    """
    Gemv on the GPU.

    Computes ``out = alpha * dot(A, x) + beta * y`` (BLAS GEMV).
    """

    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output overwrites input 0 (y).
            self.destroy_map = {0: [0]}

    def make_node(self, y, alpha, A, x, beta):
        ctx_name = infer_context_name(y, A, x)
        A = as_gpuarray_variable(A, ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 2
        assert x.ndim == 1
        assert y.ndim == 1
        assert A.dtype == x.dtype == y.dtype
        # float16 not supported
        expected = A.dtype
        assert aesara.scalar.upcast(alpha.dtype, beta.dtype, expected) == expected
        alpha = alpha.astype(expected)
        beta = beta.astype(expected)
        return Apply(self, [y, alpha, A, x, beta], [y.type()])

    def perform(self, node, inputs, out_storage, params):
        y, alpha, A, x, beta = inputs
        inplace = params.inplace
        # Negative strides are not safe to overwrite in place.
        if inplace and y.strides[0] < 0:
            inplace = False
        if A.shape[1] == 0:
            # Empty inner dimension: result is defined to be all zeros.
            out_storage[0][0] = pygpu.zeros(y.shape, dtype=y.dtype, context=y.context)
        else:
            out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, overwrite_y=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(
            out=out[0],
            y=inp[0],
            alpha=inp[1],
            A=inp[2],
            x=inp[3],
            beta=inp[4],
            fail=sub["fail"],
            name=name,
            params=sub["params"],
        )
        # First: select output buffer (copy of y, or y itself when inplace).
        code = (
            """
        if (!%(params)s->inplace || %(y)s->ga.strides[0] <= 0) {
            %(out)s = aesara_try_copy(%(out)s, %(y)s);
            if (%(out)s == NULL) {
                %(fail)s
            }
        } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(y)s;
            Py_INCREF(%(out)s);
        }
        """
            % vars
        )
        # in case of possible speed up using blas dot,
        # temporary hack A to 1D for vector-vector dot
        # (A's nd/dims/strides are mutated, used, then restored — the
        # statement order here is load-bearing).
        code += (
            """
        if (PyGpuArray_DIM(%(A)s, 1) == 0) {
            int code;
            code = GpuArray_memset(&%(out)s->ga, 0);
            if (code != GA_NO_ERROR) {
                PyErr_SetString(PyExc_RuntimeError, "Memset failed");
                %(fail)s
            }
        } else if ( PyGpuArray_DIM(%(A)s, 0) == 1
            &&((dtype_%(alpha)s*)PyArray_DATA(%(alpha)s))[0] == (dtype_%(alpha)s)1.
            &&((dtype_%(beta)s*)PyArray_DATA(%(beta)s))[0] == (dtype_%(beta)s)0.
        ) {
            %(out)s->ga.nd = 0;
            %(A)s->ga.nd = 1;
            %(A)s->ga.dimensions[0] = %(A)s->ga.dimensions[1];
            ssize_t a_stride0 = %(A)s->ga.strides[0];
            %(A)s->ga.strides[0] = %(A)s->ga.strides[1];
            if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) {
                %(fail)s
            }
            %(A)s->ga.strides[0] = a_stride0;
            %(out)s->ga.nd = 1;
            %(A)s->ga.nd = 2;
            %(A)s->ga.dimensions[0] = 1;
        } else if (
            pygpu_blas_rgemv(cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(x)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (10,)
# Module-level singletons: the graph optimizer swaps the no_inplace variant
# for the inplace one when safe.
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp):
    """
    Gemm on the GPU.

    Computes ``out = alpha * dot(A, B) + beta * C`` (BLAS GEMM).
    """

    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)
    _f16_ok = True

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output overwrites input 0 (C).
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
        if not (A.dtype == B.dtype == C.dtype):
            raise TypeError(
                aesara.tensor.blas.Gemm.E_mixed,
                (A.dtype, B.dtype, C.dtype, alpha.dtype, beta.dtype),
            )
        if not A.dtype.startswith("float"):
            raise TypeError(aesara.tensor.blas.Gemm.E_float, (A.dtype))
        # float16 matrices use float32 scalars for alpha/beta.
        if A.dtype == "float16":
            expected = "float32"
        else:
            expected = A.dtype
        assert aesara.scalar.upcast(alpha.dtype, beta.dtype, expected) == expected
        alpha = alpha.astype(expected)
        beta = beta.astype(expected)
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 2
        assert B.ndim == 2
        assert C.ndim == 2
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs, params):
        C, alpha, A, B, beta = inputs
        inplace = params.inplace
        # Only overwrite C when it is one contiguous segment.
        if inplace and not C.flags.forc:
            inplace = False
        outputs[0][0] = blas.gemm(alpha, A, B, beta, C, overwrite_c=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(
            out=out[0],
            C=inp[0],
            alpha=inp[1],
            A=inp[2],
            B=inp[3],
            beta=inp[4],
            fail=sub["fail"],
            name=name,
            params=sub["params"],
        )
        # Select output buffer (copy of C, or C itself when inplace), then
        # run GEMM into it.
        code = (
            """
        if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(C)s->ga)) {
            %(out)s = aesara_try_copy(%(out)s, %(C)s);
            if (%(out)s == NULL) {
                %(fail)s
            }
        } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(C)s;
            Py_INCREF(%(out)s);
        }
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(B)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (7,)
# Module-level singletons; the inplace variant is substituted by optimization.
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp):
    """
    Ger on the GPU.

    Computes ``out = A + alpha * outer(x, y)`` (BLAS GER rank-1 update).
    """

    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output overwrites input 0 (A).
            self.destroy_map = {0: [0]}

    def make_node(self, A, alpha, x, y):
        ctx_name = infer_context_name(A, x, y)
        A = as_gpuarray_variable(A, ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        alpha = as_tensor_variable(alpha)
        if not (A.dtype == x.dtype == y.dtype):
            raise TypeError(
                "ger requires matching dtypes", (A.dtype, alpha.dtype, x.dtype, y.dtype)
            )
        assert aesara.scalar.upcast(alpha.dtype, A.dtype) == A.dtype
        alpha = alpha.astype(A.dtype)
        assert alpha.ndim == 0
        assert A.ndim == 2
        assert x.ndim == 1
        assert y.ndim == 1
        return Apply(self, [A, alpha, x, y], [A.type()])

    def perform(self, node, inp, out, params):
        A, alpha, x, y = inp
        inplace = params.inplace
        # Only overwrite A when it is one contiguous segment.
        if inplace and not A.flags.forc:
            inplace = False
        out[0][0] = blas.ger(alpha, x, y, A, overwrite_a=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(
            out=out[0],
            A=inp[0],
            alpha=inp[1],
            x=inp[2],
            y=inp[3],
            fail=sub["fail"],
            name=name,
            params=sub["params"],
        )
        # Select output buffer (copy of A, or A itself when inplace), then
        # run GER into it.
        code = (
            """
        if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(A)s->ga)) {
            %(out)s = aesara_try_copy(%(out)s, %(A)s);
            if (%(out)s == NULL) {
                %(fail)s
            }
        } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(A)s;
            Py_INCREF(%(out)s);
        }
        if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                            %(x)s, %(y)s, %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (5,)
# Module-level singletons; the inplace variant is substituted by optimization.
gpuger_no_inplace = GpuGer(inplace=False)
gpuger_inplace = GpuGer(inplace=True)
class GpuDot22(BlasOp):
    """
    Dot22 on the GPU.

    Matrix-matrix product of two 2d arrays: ``out = dot(x, y)``
    (GEMM with alpha=1, beta=0).
    """

    _f16_ok = True
    __props__ = ()

    def make_node(self, x, y):
        ctx_name = infer_context_name(x, y)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        assert x.ndim == 2
        assert y.ndim == 2
        assert x.dtype == y.dtype
        otype = x.type.clone(
            broadcastable=(x.type.broadcastable[0], y.type.broadcastable[1])
        )
        return Apply(self, [x, y], [otype()])

    def perform(self, node, inputs, outputs):
        x, y = inputs

        out = pygpu.empty((x.shape[0], y.shape[1]), dtype=x.dtype, context=x.context)
        # GEMM with beta=0 overwrites the uninitialized buffer.
        outputs[0][0] = blas.gemm(1.0, x, y, 0.0, out, overwrite_c=True)

    def c_code(self, node, name, inputs, outputs, sub):
        dtype = node.inputs[0].dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        vars = dict(
            A=inputs[0],
            B=inputs[1],
            dtype=dtype,
            out=outputs[0],
            typecode=typecode,
            fail=sub["fail"],
            name=name,
        )
        code = (
            """
        double one = 1.;
        double zero = 0.;

        size_t dims[] = {0, 0};
        dims[0] = PyGpuArray_DIMS(%(A)s)[0];
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];

        if (aesara_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
                               %(A)s->context)) {
            %(fail)s
        }

        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             one,
                             %(A)s, %(B)s,
                             zero,
                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (5,)
gpu_dot22 = GpuDot22()
class GpuGemmBatch(BlasOp, _NoPythonCOp):
    """
    Batched GEMM on the GPU: for each index ``i`` along the leading (batch)
    dimension, ``out[i] = beta * C[i] + alpha * dot(A[i], B[i])``.
    """
    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)
    _f16_ok = True
    def __init__(self, inplace=False):
        # When `inplace`, output 0 reuses (destroys) input 0, i.e. C.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}
    def make_node(self, C, alpha, A, B, beta):
        """
        Build the Apply node. `A`, `B` and `C` must be 3D with a common
        dtype; `alpha` and `beta` are scalars (float16 scalars are upcast
        to float32, matching the assertions below).
        """
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        if alpha.dtype == "float16":
            alpha = alpha.astype("float32")
        beta = as_tensor_variable(beta)
        if beta.dtype == "float16":
            beta = beta.astype("float32")
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 3
        assert B.ndim == 3
        assert C.ndim == 3
        assert A.dtype == B.dtype == C.dtype
        if A.dtype in ("float32", "float64"):
            assert A.dtype == alpha.dtype == beta.dtype
        else:
            # Non-float32/64 data (e.g. float16) still uses float32 scalars.
            assert "float32" == alpha.dtype == beta.dtype
        return Apply(self, [C, alpha, A, B, beta], [C.type()])
    def c_headers(self, **kwargs):
        return super().c_headers(**kwargs) + ["<gpuarray/blas.h>"]
    def c_code(self, node, name, inp, out, sub):
        # Substitution mapping for the C template below.
        vars = dict(
            out=out[0],
            C=inp[0],
            alpha=inp[1],
            A=inp[2],
            B=inp[3],
            beta=inp[4],
            params=sub["params"],
            fail=sub["fail"],
            name=name,
        )
        code = (
            """
        int err;
        if (%(params)s->inplace){
          if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
            %(out)s = aesara_try_copy(%(out)s, %(C)s);
            if (%(out)s == NULL) {
              %(fail)s
            }
          } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(C)s;
            Py_INCREF(%(out)s);
          }
        } else {
          %(out)s = aesara_try_copy(%(out)s, %(C)s);
          if (%(out)s == NULL) {
            %(fail)s
          }
        }
        err = GpuArray_rgemmBatch_3d(
          cb_no_trans, cb_no_trans,
          ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
          &%(A)s->ga, &%(B)s->ga,
          ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
          &%(out)s->ga, 0);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "%%s", GpuArray_error(&%(A)s->ga, err));
          %(fail)s;
        }
        """
            % vars
        )
        return code
    def c_code_cache_version(self):
        # Bump whenever the C template above changes.
        return (4,)
# Module-level GpuGemmBatch instances: out-of-place and in-place variants.
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
class BaseGpuCorrMM(CGpuKernelBase):
    """
    Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
    `GpuCorrMM_gradInputs`. Cannot be used directly.
    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer,
        a pair of integers, or two pairs of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1)).
    num_groups :
        Divides the image, kernel and output tensors into num_groups
        separate groups. Each which carry out convolutions separately (default : 1).
    unshared
        Perform unshared correlation (default: False)
    """
    check_broadcast = False
    __props__ = (
        "border_mode",
        "subsample",
        "filter_dilation",
        "num_groups",
        "unshared",
    )
    _f16_ok = True
    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        # A single non-negative int means symmetric padding on both axes.
        if isinstance(border_mode, int):
            if border_mode < 0:
                raise ValueError(
                    "invalid border_mode {}, which must be a "
                    "non-negative integer".format(border_mode)
                )
            border_mode = ((border_mode, border_mode),) * 2
        elif isinstance(border_mode, tuple):
            if len(border_mode) != 2:
                raise ValueError(
                    "invalid border_mode {} which must be a "
                    "tuple of length 2".format(border_mode)
                )
            # Normalize each axis entry to an explicit (begin, end) pad pair.
            border = ()
            for mode in border_mode:
                if isinstance(mode, tuple) and len(mode) == 2 and min(mode) >= 0:
                    border += ((int(mode[0]), int(mode[1])),)
                elif mode >= 0:
                    border += ((int(mode), int(mode)),)
                else:
                    raise ValueError(
                        "invalid border mode {}. The tuple can only contain "
                        "integers or tuples of length 2".format(border_mode)
                    )
            border_mode = border
        elif border_mode not in ("valid", "full", "half"):
            raise ValueError(
                "invalid border_mode {}, which must be either "
                '"valid", "full", "half", an integer or a tuple '
                "of length 2".format(border_mode)
            )
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        if len(filter_dilation) != 2:
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups
        # The GEMM-based correlation kernel lives in this C support file.
        CGpuKernelBase.__init__(self, ["c_code/corr_gemm.c"])
        self.unshared = unshared
    @property
    def pad(self):
        # Explicit per-axis (begin, end) padding; 'valid' means no padding.
        if self.border_mode != "valid":
            return self.border_mode
        return ((0, 0),) * 2
    def __str__(self):
        # Readable Op signature including all hyper-parameters.
        return "{}{{{}, {}, {}, {}, {}}}".format(
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation),
            str(self.num_groups),
            str(self.unshared),
        )
    def __setstate__(self, d):
        # Unpickling support: states pickled before `num_groups` existed
        # get the backward-compatible default of 1.
        self.__dict__.update(d)
        if not hasattr(self, "num_groups"):
            self.num_groups = 1
    def flops(self, inp, outp):
        """
        Useful with the hack in profilemode to print the MFlops.
        """
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        (outputs,) = outp
        assert inputs[1] == (filters[1] * self.num_groups)
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups
        return flops
    def c_headers(self, **kwargs):
        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]
    def c_code_cache_version(self):
        # Raise this whenever modifying the C code (including the file).
        return (12,)
    def c_code_helper(
        self, bottom, weights, top, direction, sub, height=None, width=None
    ):
        """
        This generates the C code for GpuCorrMM (direction="forward"),
        GpuCorrMM_gradWeights (direction="backprop weights"), and
        GpuCorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.
        Parameters
        ----------
        bottom
            Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        weights
            Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        top
            Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        direction : {'forward', 'backprop weights', 'backprop inputs'}
            "forward" to correlate bottom with weights and store results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        sub
            Dictionary of substitutions usable to help generating the C code.
        height
            Required if self.subsample[0] != 1, a variable giving the height of
            the filters for direction="backprop weights" or the height of the
            input images for direction="backprop inputs".
            Required if self.border_mode == 'half', a variable giving the height
            of the filters for direction="backprop weights".
            Not required otherwise, but if a value is given this will be checked.
        width
            Required if self.subsample[1] != 1, a variable giving the width of
            the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            Required if self.border_mode == 'half', a variable giving the width
            of the filters for direction="backprop weights".
            Not required otherwise, but if a value is given this will be checked.
        """
        dH, dW = self.subsample
        dilH, dilW = self.filter_dilation
        numgroups = self.num_groups
        unshared = int(self.unshared)
        # Padding is encoded for the C code as: -1 = "half", -2 = "full",
        # otherwise the explicit non-negative per-side amounts.
        if self.border_mode == "half":
            padH_l = padH_r = padW_l = padW_r = -1
        elif self.border_mode == "full":
            padH_l = padH_r = padW_l = padW_r = -2
        elif isinstance(self.border_mode, tuple):
            (padH_l, padH_r), (padW_l, padW_r) = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH_l = padH_r = padW_l = padW_r = 0
        # Map direction to the integer code used by the C template and pick
        # which of the three variables receives the Op's output.
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError(
                "direction must be one of 'forward', "
                "'backprop weights', 'backprop inputs'"
            )
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when pad="half", we cannot infer the weight size.
        if height:
            height = f"(*(npy_int*)(PyArray_DATA({height})))"
        else:
            if ((direction != 0) and (dH != 1)) or (
                (direction == 1) and (padH_l == -1 or padH_r == -1)
            ):
                raise ValueError(
                    "height must be given for backprop with vertical sampling or pad='half'"
                )
            height = "-1"
        if width:
            width = f"(*(npy_int*)(PyArray_DATA({width})))"
        else:
            if ((direction != 0) and (dW != 1)) or (
                (direction == 1) and (padW_l == -1 or padW_r == -1)
            ):
                raise ValueError(
                    "width must be given for backprop with horizontal sampling or pad='half'"
                )
            width = "-1"
        # Expose all locals (dH, dW, pad*, out, ...) to the C template below.
        sub = sub.copy()
        sub.update(locals())
        return (
            """
        // Mandatory args
        int direction = %(direction)s; // forward, bprop weights, bprop inputs
        // Optional args
        size_t dH = %(dH)s;
        size_t dW = %(dW)s;
        size_t dilH = %(dilH)s;
        size_t dilW = %(dilW)s;
        int padH_l = %(padH_l)s;
        int padH_r = %(padH_r)s;
        int padW_l = %(padW_l)s;
        int padW_r = %(padW_r)s;
        int numgroups = %(numgroups)s;
        int unshared = %(unshared)s;
        PyGpuArrayObject * bottom = %(bottom)s;
        PyGpuArrayObject * weights = %(weights)s;
        PyGpuArrayObject * top = %(top)s;
        PyGpuArrayObject * out2 = NULL;
        int wdim, odim;
        wdim = unshared ? 6 : 4;
        odim = 4; //Can be set to 6 later for unshared backprop wrt weights
        // Obtain or infer kernel width and height
        // (we need to know it early to be able to handle auto-padding)
        size_t kH, kW, dil_kH, dil_kW;
        if (direction != 1) {
            // weight is an input variable, we can just read its shape
            kH = PyGpuArray_DIMS(weights)[wdim-2];
            kW = PyGpuArray_DIMS(weights)[wdim-1];
        }
        else {
            if (%(height)s != -1) {
                // kernel height is specified (perhaps vertical subsampling or half padding)
                kH = %(height)s;
            }
            else if (padH_l == -2 || padH_r == -2) {
                // vertical full padding, we can infer the kernel height
                kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
            }
            else {
                // explicit padding, we can infer the kernel height
                kH = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
            }
            if (%(width)s != -1) {
                kW = %(width)s;
            }
            else if (padW_l == -2 || padW_r == -2) {
                kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
            }
            else {
                kW = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
            }
        }
        // Implicit dilated kernel size
        dil_kH = (kH - 1) * dilH + 1;
        dil_kW = (kW - 1) * dilW + 1;
        // Auto-padding if requested
        if (padH_l == -1 || padH_r == -1) { // vertical half padding
            padH_l = padH_r = dil_kH / 2;
        }
        else if (padH_l == -2 || padH_r == -2) { // vertical full padding
            padH_l = padH_r = dil_kH - 1;
        }
        else if (padH_l < 0 || padH_r < 0) {
            PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
            %(fail)s
        }
        if (padW_l == -1 || padW_r == -1) { // horizontal half padding
            padW_l = padW_r = dil_kW / 2;
        }
        else if (padW_l == -2 || padW_r == -2) { // horizontal full padding
            padW_l = padW_r = dil_kW - 1;
        }
        else if (padW_l < 0 || padW_r < 0) {
            PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
            %(fail)s
        }
        // Infer output shape and type
        // The inferred shape can be negative.
        long long out_dim[6];
        size_t out_dim_size[6];
        out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
        out_dim_size[4] = out_dim_size[5] = 0; //Same
        int out_typecode;
        PyGpuContextObject *out_context;
        switch(direction) {
        case 0: // forward pass
            // output is top: (batchsize, num_filters, height, width)
            // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
            out_dim[0] = PyGpuArray_DIMS(bottom)[0];
            out_dim[1] = PyGpuArray_DIMS(weights)[0];
            out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
            out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
            out_typecode = bottom->ga.typecode;
            out_context = bottom->context;
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                if (unshared) {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
                    %(fail)s
                }
                else {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
                    %(fail)s
                }
            }
            break;
        case 1: // backprop wrt. weights
            // output is weights: (num_filters, num_channels, height, width) or
            // (num_filters, top_height, top_width, num_channels, height, width) -> for unshared
            // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
            out_dim[0] = PyGpuArray_DIMS(top)[1];
            if (unshared){
                odim = 6;
                out_dim[1] = PyGpuArray_DIMS(top)[2];
                out_dim[2] = PyGpuArray_DIMS(top)[3];
            }
            out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups;
            out_dim[wdim-2] = kH; // already inferred further above
            out_dim[wdim-1] = kW; // how convenient
            out_typecode = top->ga.typecode;
            out_context = top->context;
            if (unshared) {
                if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
                        || out_dim[4] <= 0 || out_dim[5] <= 0){
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 out_dim[4], out_dim[5],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            else {
                if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            break;
        case 2: // backprop wrt. inputs
            // output is bottom: (batchsize, num_channels, height, width)
            // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
            out_dim[0] = PyGpuArray_DIMS(top)[0];
            out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
            out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r;
            out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r;
            out_typecode = top->ga.typecode;
            out_context = top->context;
            if (unshared) {
                if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            else {
                if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weight shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
            %(fail)s
        }
        out_dim_size[0] = (size_t)out_dim[0];
        out_dim_size[1] = (size_t)out_dim[1];
        out_dim_size[2] = (size_t)out_dim[2];
        out_dim_size[3] = (size_t)out_dim[3];
        if (odim == 6) {
            out_dim_size[4] = (size_t)out_dim[4];
            out_dim_size[5] = (size_t)out_dim[5];
        }
        // Prepare output array
        if (aesara_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
        {
            if (odim == 4) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
                             out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
            }
            if (odim == 6) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
                             out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]);
            }
            %(fail)s
        }
        if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
            PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
            %(fail)s
        }
        // Call GPU code
        out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW,
                      padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
        if (out2==NULL){
           %(fail)s
        }
        assert (out2 == %(out)s);
        """
            % sub
        )
class GpuCorrMM(BaseGpuCorrMM, _NoPythonCOp):
    """
    GEMM-based correlation on the GPU (forward pass).

    Parameters
    ----------
    border_mode
        Implicit zero-padding of the input: a string shortcut
        (``'valid'`` = ``(0, 0)``, i.e. no padding; ``'full'`` =
        ``(kernel_rows - 1, kernel_columns - 1)``; ``'half'`` =
        ``(kernel_rows // 2, kernel_columns // 2)``, "same" convolution
        for odd-sized kernels), a single integer used on all sides, a
        pair of per-axis widths each applied to both sides, or two pairs
        of integers giving the exact ((left, right), (top, bottom))
        padding.
    subsample
        Pair of output strides; ``(sv, sh)`` is equivalent to
        ``GpuCorrMM(...)(...)[:, :, ::sv, ::sh]`` but faster.
        ``(1, 1)`` disables subsampling.
    filter_dilation
        Pair of filter dilation factors; ``(1, 1)`` disables dilation.
    num_groups
        Integer number of groups the image and kernel are divided into
        for grouped convolution; ``1`` disables grouping.
    unshared
        Perform unshared correlation (default: False)

    Notes
    -----
    The Op currently requires inputs, filters and outputs to be
    C-contiguous; wrap arguments with :func:`gpu_contiguous
    <aesara.gpuarray.basic_ops.gpu_contiguous>` if needed.
    Either enable the Aesara flag ``optimizer_including=conv_gemm`` to
    substitute this Op (and its gradients) for convolution operations
    automatically, or call ``GpuCorrMM(subsample=...)(image, filters)``
    directly as a replacement for :func:`conv2d
    <aesara.tensor.nnet.conv.conv2d>`. Note that it computes a
    *correlation*; for a convolution, flip the filters as
    ``filters[:, :, ::-1, ::-1]``.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, img, kern):
        """Validate ranks and build the Apply node for the forward pass."""
        ctx = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx)
        kern = as_gpuarray_variable(kern, ctx)
        if img.type.ndim != 4:
            raise TypeError("img must be 4D tensor")
        # Unshared correlation carries per-position filters -> 6D kernel.
        if self.unshared:
            if kern.type.ndim != 6:
                raise TypeError("kern must be 6D tensor")
        elif kern.type.ndim != 4:
            raise TypeError("kern must be 4D tensor")
        out_bcast = [
            img.type.broadcastable[0],
            kern.type.broadcastable[0],
            False,
            False,
        ]
        out_type = GpuArrayType(
            dtype=img.dtype, context_name=ctx, broadcastable=out_bcast
        )
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        (top,) = out_
        return super().c_code_helper(bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        """Gradients wrt images and filters via the two backprop Ops."""
        bottom, weights = inp
        (top,) = grads
        top = gpu_contiguous(top)
        conv_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )
        d_bottom = GpuCorrMM_gradInputs(*conv_args)(weights, top, bottom.shape[-2:])
        d_weights = GpuCorrMM_gradWeights(*conv_args)(bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM, _NoPythonCOp):
    """
    Gradient of `GpuCorrMM` with respect to the filters.

    Notes
    -----
    This Op is not meant to be called directly; Aesara's automatic
    differentiation or graph optimizations insert it where needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, img, topgrad, shape=None):
        """Build the Apply node; `shape` supplies the filter height/width
        when they cannot be inferred (strided or 'half'-padded case)."""
        ctx = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx)
        topgrad = as_gpuarray_variable(topgrad, ctx)
        if img.type.ndim != 4:
            raise TypeError("img must be 4D tensor")
        if topgrad.type.ndim != 4:
            raise TypeError("topgrad must be 4D tensor")
        if shape is None:
            if self.subsample != (1, 1) or self.border_mode == "half":
                raise ValueError(
                    'shape must be given if subsample != (1, 1) or border_mode == "half"'
                )
            height_width = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        if self.unshared:
            out_bcast = [
                topgrad.type.broadcastable[1],
                False,
                False,
                img.type.broadcastable[1],
                False,
                False,
            ]
        else:
            out_bcast = [
                topgrad.type.broadcastable[1],
                img.type.broadcastable[1],
                False,
                False,
            ]
        out_type = GpuArrayType(
            dtype=img.dtype, context_name=ctx, broadcastable=out_bcast
        )
        return Apply(self, [img, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width = inp[2:] or (None, None)
        (weights,) = out_
        return super().c_code_helper(
            bottom, weights, top, "backprop weights", sub, height, width
        )

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        (weights,) = grads
        weights = gpu_contiguous(weights)
        conv_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )
        d_bottom = GpuCorrMM_gradInputs(*conv_args)(weights, top, bottom.shape[-2:])
        d_top = GpuCorrMM(*conv_args)(bottom, weights)
        if len(inp) == 4:
            # The optional height/width inputs are pure shape information.
            disc = aesara.gradient.DisconnectedType()
            return (d_bottom, d_top, disc(), disc())
        return (d_bottom, d_top)

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # No gradient flows through the height/width shape inputs.
        return [[1], [1], [0], [0]]
class GpuCorrMM_gradInputs(BaseGpuCorrMM, _NoPythonCOp):
    """
    Gradient of `GpuCorrMM` with respect to the input images.

    Notes
    -----
    This Op is not meant to be called directly; Aesara's automatic
    differentiation or graph optimizations insert it where needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, kern, topgrad, shape=None):
        """Build the Apply node; `shape` supplies the input height/width
        when subsampling makes them ambiguous."""
        ctx = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx)
        topgrad = as_gpuarray_variable(topgrad, ctx)
        # Unshared correlation carries per-position filters -> 6D kernel.
        if self.unshared:
            if kern.type.ndim != 6:
                raise TypeError("kern must be 6D tensor")
        elif kern.type.ndim != 4:
            raise TypeError("kern must be 4D tensor")
        if topgrad.type.ndim != 4:
            raise TypeError("topgrad must be 4D tensor")
        if shape is None:
            if self.subsample != (1, 1):
                raise ValueError("shape must be given if subsample != (1, 1)")
            height_width = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        if self.num_groups > 1:
            out_bcast = [topgrad.type.broadcastable[0], False, False, False]
        else:
            out_bcast = [
                topgrad.type.broadcastable[0],
                kern.type.broadcastable[-3],
                False,
                False,
            ]
        out_type = GpuArrayType(
            dtype=topgrad.dtype,
            context_name=ctx,
            broadcastable=out_bcast,
        )
        return Apply(self, [kern, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width = inp[2:] or (None, None)
        (bottom,) = out_
        return super().c_code_helper(
            bottom, weights, top, "backprop inputs", sub, height, width
        )

    def grad(self, inp, grads):
        weights, top = inp[:2]
        (bottom,) = grads
        bottom = gpu_contiguous(bottom)
        conv_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )
        d_weights = GpuCorrMM_gradWeights(*conv_args)(bottom, top, weights.shape[-2:])
        d_top = GpuCorrMM(*conv_args)(bottom, weights)
        if len(inp) == 4:
            # The optional height/width inputs are pure shape information.
            disc = aesara.gradient.DisconnectedType()
            return (d_weights, d_top, disc(), disc())
        return (d_weights, d_top)

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # No gradient flows through the height/width shape inputs.
        return [[1], [1], [0], [0]]
class BaseGpuCorr3dMM(CGpuKernelBase, _NoPythonCOp):
"""
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a pair of integers
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately (default : 1).
"""
check_broadcast = False
__props__ = ("border_mode", "subsample", "filter_dilation", "num_groups")
_f16_ok = True
def __init__(
self,
border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
num_groups=1,
):
if isinstance(border_mode, int):
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w, pad_d = map(int, border_mode)
border_mode = (pad_h, pad_w, pad_d)
if not (
(isinstance(border_mode, tuple) and min(border_mode) >= 0)
or border_mode in ("valid", "full", "half")
):
raise ValueError(
"invalid border_mode {}, which must be either "
'"valid", "full", "half", an integer or a tuple of'
" three integers".format(border_mode)
)
self.border_mode = border_mode
if len(subsample) != 3:
raise ValueError("subsample must have three elements")
if len(filter_dilation) != 3:
raise ValueError("filter_dilation must have three elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
if num_groups < 1:
raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups
CGpuKernelBase.__init__(self, ["c_code/corr3d_gemm.c"])
@property
def pad(self):
if self.border_mode != "valid":
return self.border_mode
return (0, 0, 0)
def __str__(self):
return "{}{{{}, {}, {}, {}}}".format(
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation),
str(self.num_groups),
)
    def __setstate__(self, d):
        # Unpickling support: states pickled before `num_groups` existed
        # get the backward-compatible default of 1.
        self.__dict__.update(d)
        if not hasattr(self, "num_groups"):
            self.num_groups = 1
def flops(self, inp, outp):
"""
Useful with the hack in profilemode to print the MFlops.
"""
# if the output shape is correct, then this gives the correct
# flops for any direction, sampling, padding, and border mode
inputs, filters = inp
(outputs,) = outp
assert inputs[1] == (filters[1] * self.num_groups)
# nb mul and add by output pixel
flops = filters[2] * filters[3] * filters[4] * 2
# nb flops by output image
flops *= outputs[2] * outputs[3] * outputs[4]
# nb patch multiplied
flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups
return flops
def c_headers(self, **kwargs):
return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
    def c_header_dirs(self, **kwargs):
        # Directory that contains the gpuarray_helper.h listed in c_headers.
        return [gpuarray_helper_inc_dir()]
def c_code_cache_version(self):
# raise this whenever modifying the code below.
return (8,)
def c_code_helper(
    self, bottom, weights, top, direction, sub, height=None, width=None, depth=None
):
    """
    This generates the C code for GpuCorr3dMM (direction="forward"),
    GpuCorr3dMM_gradWeights (direction="backprop weights"), and
    GpuCorr3dMM_gradInputs (direction="backprop inputs").
    Depending on the direction, one of bottom, weights, top will
    receive the output, while the other two serve as inputs.

    Parameters
    ----------
    bottom
        Variable name of the input images in the forward pass,
        or the gradient of the input images in backprop wrt. inputs
    weights
        Variable name of the filters in the forward pass,
        or the gradient of the filters in backprop wrt. weights
    top
        Variable name of the output images / feature maps in the
        forward pass, or the gradient of the outputs in the backprop passes
    direction : {'forward', 'backprop weights', 'backprop inputs'}
        "forward" to correlate bottom with weights and store results in top,
        "backprop weights" to do a valid convolution of bottom with top
        (swapping the first two dimensions) and store results in weights,
        and "backprop inputs" to do a full convolution of top with weights
        (swapping the first two dimensions) and store results in bottom.
    sub
        Dictionary of substitutions usable to help generating the C code.
    height
        Required if self.subsample[0] != 1, a variable giving the height of
        the filters for direction="backprop weights" or the height of the
        input images for direction="backprop inputs".
        Required if self.border_mode == 'half', a variable giving the height
        of the filters for direction="backprop weights".
        Not required otherwise, but if a value is given this will be checked.
    width
        Required if self.subsample[1] != 1, a variable giving the width of
        the filters for direction="backprop weights" or the width of the
        input images for direction="backprop inputs".
        Required if self.border_mode == 'half', a variable giving the width
        of the filters for direction="backprop weights".
        Not required otherwise, but if a value is given this will be checked.
    depth
        Required if self.subsample[2] != 1, a variable giving the depth of
        the filters for direction="backprop weights" or the depth of the
        input images for direction="backprop inputs".
        Required if self.border_mode == 'half', a variable giving the depth
        of the filters for direction="backprop weights".
        Not required otherwise, but if a value is given this will be checked.

    Raises
    ------
    ValueError
        If `direction` is not one of the three accepted strings, or if a
        required `height`/`width`/`depth` variable is missing.
    """
    # These local names are substituted into the C template via
    # ``sub.update(locals())`` below — do not rename them.
    dH, dW, dD = self.subsample
    dilH, dilW, dilD = self.filter_dilation
    numgroups = self.num_groups
    # Negative pad values are sentinels resolved at runtime in the C code:
    # -1 means "half" padding, -2 means "full" padding.
    if self.border_mode == "half":
        padH = padW = padD = -1
    elif self.border_mode == "full":
        padH = padW = padD = -2
    elif isinstance(self.border_mode, tuple):
        padH, padW, padD = self.border_mode
    else:
        assert self.border_mode == "valid"
        padH = padW = padD = 0
    if direction == "forward":
        direction = 0
        out = top
    elif direction == "backprop weights":
        direction = 1
        out = weights
    elif direction == "backprop inputs":
        direction = 2
        out = bottom
    else:
        raise ValueError(
            "direction must be one of 'forward', "
            "'backprop weights', 'backprop inputs'"
        )
    # When subsampling, we cannot unambiguously infer the height and width
    # of bottom and weights from top, so we require them to be given.
    # Similarly, when pad="half", we cannot infer the weight size.
    if height:
        height = f"(*(npy_int*)(PyArray_DATA({height})))"
    else:
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            raise ValueError(
                "height must be given for backprop with vertical sampling or pad='half'"
            )
        height = "-1"
    if width:
        width = f"(*(npy_int*)(PyArray_DATA({width})))"
    else:
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            raise ValueError(
                "width must be given for backprop with horizontal sampling or pad='half'"
            )
        width = "-1"
    if depth:
        depth = f"(*(npy_int*)(PyArray_DATA({depth})))"
    else:
        if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
            # BUG FIX: this message previously said "horizontal sampling",
            # copy-pasted from the width branch; this branch is about depth.
            raise ValueError(
                "depth must be given for backprop with depth sampling or pad='half'"
            )
        depth = "-1"
    sub = sub.copy()
    sub.update(locals())
    # NOTE: the allocation-failure message below previously said
    # "BaseGpuCorrMM"; fixed to "BaseGpuCorr3dMM" for consistency with the
    # other messages emitted by this class.
    return (
        """
// Mandatory args
int direction = %(direction)s; // forward, bprop weights, bprop inputs
// Optional args
size_t dH = %(dH)s;
size_t dW = %(dW)s;
size_t dD = %(dD)s;
size_t dilH = %(dilH)s;
size_t dilW = %(dilW)s;
size_t dilD = %(dilD)s;
int padH = %(padH)s;
int padW = %(padW)s;
int padD = %(padD)s;
int numgroups = %(numgroups)s;
PyGpuArrayObject * bottom = %(bottom)s;
PyGpuArrayObject * weights = %(weights)s;
PyGpuArrayObject * top = %(top)s;
PyGpuArrayObject * out2 = NULL;
// Obtain or infer kernel height, width and depth
// (we need to know it early to be able to handle auto-padding)
size_t kH, kW, kD, dil_kH, dil_kW, dil_kD;
if (direction != 1) {
    // weight is an input variable, we can just read its shape
    kH = PyGpuArray_DIMS(weights)[2];
    kW = PyGpuArray_DIMS(weights)[3];
    kD = PyGpuArray_DIMS(weights)[4];
}
else {
    if (%(height)s != -1) {
        // kernel height is specified (perhaps vertical subsampling or half padding)
        kH = %(height)s;
    }
    else if (padH == -2) {
        // vertical full padding, we can infer the kernel height
        kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
    }
    else {
        // explicit padding, we can infer the kernel height
        kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
    }
    if (%(width)s != -1) {
        kW = %(width)s;
    }
    else if (padW == -2) {
        kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
    }
    else {
        kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
    }
    if (%(depth)s != -1) {
        kD = %(depth)s;
    }
    else if (padD == -2) {
        kD = (2 - PyGpuArray_DIMS(bottom)[4] + (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
    }
    else {
        kD = (PyGpuArray_DIMS(bottom)[4] + 2*padD - (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
    }
}
// Implicit dilated kernel size
dil_kH = (kH - 1) * dilH + 1;
dil_kW = (kW - 1) * dilW + 1;
dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
    padH = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
    padH = dil_kH - 1;
}
else if (padH < 0) {
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2");
    %(fail)s
}
if (padW == -1) { // horizontal half padding
    padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
    padW = dil_kW - 1;
}
else if (padW < 0) {
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2");
    %(fail)s
}
if (padD == -1) { // depth half padding
    padD = dil_kD / 2;
}
else if (padD == -2) { // depth full padding
    padD = dil_kD - 1;
}
else if (padD < 0) {
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2");
    %(fail)s
}
// Infer output shape and type
// The inferred shape can be negative.
long long out_dim[5];
size_t out_dim_size[5];
int out_typecode;
PyGpuContextObject *out_context;
switch(direction) {
case 0:  // forward pass
    // output is top: (batchsize, num_filters, height, width, depth)
    // height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
    out_dim[0] = PyGpuArray_DIMS(bottom)[0];
    out_dim[1] = PyGpuArray_DIMS(weights)[0];
    out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
    out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
    out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
    out_typecode = bottom->ga.typecode;
    out_context = bottom->context;
    if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM: impossible output shape\\n"
                     "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                     PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                     PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                     PyGpuArray_DIMS(bottom)[4],
                     PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                     PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                     PyGpuArray_DIMS(weights)[4],
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
        %(fail)s
    }
    break;
case 1:  // backprop wrt. weights
    // output is weights: (num_filters, num_channels, height, width, depth)
    // height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
    out_dim[0] = PyGpuArray_DIMS(top)[1];
    out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
    out_dim[2] = kH;  // already inferred further above
    out_dim[3] = kW;  // how convenient
    out_dim[4] = kD;
    out_typecode = top->ga.typecode;
    out_context = top->context;
    if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM backprop wrt. weights: impossible output shape\\n"
                     "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                     PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                     PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                     PyGpuArray_DIMS(bottom)[4],
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
                     PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                     PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                     PyGpuArray_DIMS(top)[4]);
        %(fail)s
    }
    break;
case 2:  // backprop wrt. inputs
    // output is bottom: (batchsize, num_channels, height, width, depth)
    // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
    out_dim[0] = PyGpuArray_DIMS(top)[0];
    out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
    out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
    out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
    out_dim[4] = (%(depth)s != -1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
    out_typecode = top->ga.typecode;
    out_context = top->context;
    if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM backprop wrt. inputs: impossible output shape\\n"
                     "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
                     PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                     PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                     PyGpuArray_DIMS(weights)[4],
                     PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                     PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                     PyGpuArray_DIMS(top)[4]);
        %(fail)s
    }
    break;
default:
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
    %(fail)s
}
out_dim_size[0] = (size_t)out_dim[0];
out_dim_size[1] = (size_t)out_dim[1];
out_dim_size[2] = (size_t)out_dim[2];
out_dim_size[3] = (size_t)out_dim[3];
out_dim_size[4] = (size_t)out_dim[4];
// Prepare output array
if (aesara_prep_output(&%(out)s, 5, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
{
    PyErr_Format(PyExc_RuntimeError,
                 "BaseGpuCorr3dMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
                 out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
    %(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
    %(fail)s
}
// Call GPU code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
                dH, dW, dD, dilH, dilW, dilD, padH, padW, padD, numgroups);
if (out2==NULL){
    %(fail)s
}
assert (out2 == %(out)s);
"""
        % sub
    )
class GpuCorr3dMM(BaseGpuCorr3dMM, _NoPythonCOp):
    """
    GPU 3D correlation implementation using matrix multiplication (GEMM).

    Parameters
    ----------
    border_mode
        The width of a border of implicit zeros to pad the
        input with. Must be a tuple with 3 elements giving the width of
        the padding on each side, or a single integer to pad the same
        on all sides, or a string shortcut setting the padding at runtime:
        ``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
        for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
        (full convolution), ``'half'`` for ``(kernel_rows // 2,
        kernel_columns // 2, kernel_depth // 2)`` (same convolution for
        odd-sized kernels). Note that the three widths are each
        applied twice, once per side (left and right, top and bottom, front
        and back).
    subsample
        The subsample operation applied to each output image. Should be a tuple
        with 3 elements. `(sv, sh, sl)` is equivalent to
        `GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
        Set to `(1, 1, 1)` to disable subsampling.
    filter_dilation
        The filter dilation operation applied to each input image.
        Should be a tuple with 3 elements.
        Set to `(1, 1, 1)` to disable filter dilation.
    num_groups
        The number of distinct groups the image and kernel must be
        divided into; an int. Set to 1 to disable grouped convolution.

    Notes
    -----
    Currently, the Op requires the inputs, filters and outputs to be
    C-contiguous. Use :func:`gpu_contiguous
    <aesara.gpuarray.basic_ops.gpu_contiguous>` on these arguments
    if needed.

    You can either enable the Aesara flag `optimizer_including=conv_gemm`
    to automatically replace all convolution operations with `GpuCorr3dMM`
    or one of its gradients, or you can use it as a replacement for
    :func:`conv2d <aesara.tensor.nnet.conv.conv2d>`, called as
    `GpuCorr3dMM(subsample=...)(image, filters)`. The latter is currently
    faster, but note that it computes a correlation -- if you need to
    compute a convolution, flip the filters as `filters[:,:,::-1,::-1,::-1]`.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups)

    def make_node(self, img, kern):
        """Build the Apply node correlating `img` (5D) with `kern` (5D)."""
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 5:
            raise TypeError("img must be 5D tensor")
        if kern.type.ndim != 5:
            raise TypeError("kern must be 5D tensor")
        # Batch and filter axes keep their broadcast flags; the three
        # spatial output axes are never broadcastable.
        broadcastable = [
            img.type.broadcastable[0],
            kern.type.broadcastable[0],
        ] + [False] * 3
        out_type = GpuArrayType(
            dtype=img.dtype, context_name=ctx_name, broadcastable=broadcastable
        )
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        (top,) = out_
        return super().c_code_helper(bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        (top,) = grads
        top = gpu_contiguous(top)
        op_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
        )
        d_bottom = GpuCorr3dMM_gradInputs(*op_args)(weights, top, bottom.shape[-3:])
        d_weights = GpuCorr3dMM_gradWeights(*op_args)(bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM, _NoPythonCOp):
    """
    Gradient wrt. filters for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Aesara's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups)

    def make_node(self, img, topgrad, shape=None):
        """Build the Apply node computing the filter gradient from `img` and `topgrad` (both 5D)."""
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 5:
            raise TypeError("img must be 5D tensor")
        if topgrad.type.ndim != 5:
            raise TypeError("topgrad must be 5D tensor")
        if shape is None:
            # Without subsampling (and not "half" padding), the kernel shape
            # can be inferred in the C code, so no shape inputs are needed.
            if self.subsample != (1, 1, 1) or self.border_mode == "half":
                raise ValueError(
                    "shape must be given if subsample != (1, 1, 1)"
                    ' or border_mode == "half"'
                )
            height_width_depth = []
        else:
            # Three scalar (0-d) variables giving the filter's spatial extents.
            height_width_depth = [shape[0], shape[1], shape[2]]
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            assert shape[2].ndim == 0
        # Output is the filters: (num_filters, num_channels, kH, kW, kD).
        broadcastable = [
            topgrad.type.broadcastable[1],
            img.type.broadcastable[1],
            False,
            False,
            False,
        ]
        return Apply(
            self,
            [img, topgrad] + height_width_depth,
            [
                GpuArrayType(
                    dtype=img.dtype, context_name=ctx_name, broadcastable=broadcastable
                )()
            ],
        )

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        # inp[2:] is either empty (shape not given) or the three shape scalars.
        height, width, depth = inp[2:] or (None, None, None)
        (weights,) = out_
        direction = "backprop weights"
        return super().c_code_helper(
            bottom, weights, top, direction, sub, height, width, depth
        )

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        (weights,) = grads
        weights = gpu_contiguous(weights)
        d_bottom = GpuCorr3dMM_gradInputs(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(weights, top, bottom.shape[-3:])
        d_top = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(bottom, weights)
        # The optional shape inputs carry no gradient.
        d_height_width_depth = (
            (aesara.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        )
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM, _NoPythonCOp):
    """
    Gradient wrt. inputs for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Aesara's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups)

    def make_node(self, kern, topgrad, shape=None):
        """Build the Apply node computing the image gradient from `kern` and `topgrad` (both 5D)."""
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if kern.type.ndim != 5:
            raise TypeError("kern must be 5D tensor")
        if topgrad.type.ndim != 5:
            raise TypeError("topgrad must be 5D tensor")
        if shape is None:
            # Without subsampling, the input's spatial shape can be
            # inferred in the C code, so no shape inputs are needed.
            if self.subsample != (1, 1, 1):
                raise ValueError("shape must be given if subsample != (1, 1, 1)")
            height_width_depth = []
        else:
            # Three scalar (0-d) variables giving the input image's spatial extents.
            height_width_depth = [shape[0], shape[1], shape[2]]
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            assert shape[2].ndim == 0
        if self.num_groups > 1:
            # With grouped convolution the output channel count is kern's
            # channel dim times num_groups, so its broadcast flag cannot be
            # reused.
            broadcastable = [topgrad.type.broadcastable[0], False, False, False, False]
        else:
            broadcastable = [
                topgrad.type.broadcastable[0],
                kern.type.broadcastable[-4],
                False,
                False,
                False,
            ]
        return Apply(
            self,
            [kern, topgrad] + height_width_depth,
            [
                GpuArrayType(
                    dtype=topgrad.dtype,
                    context_name=ctx_name,
                    broadcastable=broadcastable,
                )()
            ],
        )

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        # inp[2:] is either empty (shape not given) or the three shape scalars.
        height, width, depth = inp[2:] or (None, None, None)
        (bottom,) = out_
        direction = "backprop inputs"
        return super().c_code_helper(
            bottom, weights, top, direction, sub, height, width, depth
        )

    def grad(self, inp, grads):
        weights, top = inp[:2]
        (bottom,) = grads
        bottom = gpu_contiguous(bottom)
        d_weights = GpuCorr3dMM_gradWeights(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(bottom, top, weights.shape[-3:])
        d_top = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(bottom, weights)
        # The optional shape inputs carry no gradient.
        d_height_width_depth = (
            (aesara.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        )
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Local rewrite: replace a ``GpuGemv`` node with its in-place variant."""
    replacement = gpugemv_inplace(*inputs)
    return [replacement]
@inplace_allocempty(GpuGemm, 0)
def local_inplace_gpuagemm(node, inputs):
    """Local rewrite: replace a ``GpuGemm`` node with its in-place variant."""
    replacement = gpugemm_inplace(*inputs)
    return [replacement]
@inplace_allocempty(GpuGer, 0)
def local_inplace_gpuager(node, inputs):
    """Local rewrite: replace a ``GpuGer`` node with its in-place variant."""
    replacement = gpuger_inplace(*inputs)
    return [replacement]
@inplace_allocempty(GpuGemmBatch, 0)
def local_inplace_gpuagemmbatch(node, inputs):
    """Local rewrite: replace a ``GpuGemmBatch`` node with its in-place variant."""
    replacement = gpugemmbatch_inplace(*inputs)
    return [replacement]
# Bundle the four in-place BLAS rewrites above into a single in2out pass.
gpuablas_opt_inplace = in2out(
    LocalOptGroup(
        local_inplace_gpuagemv,
        local_inplace_gpuagemm,
        local_inplace_gpuager,
        local_inplace_gpuagemmbatch,
    ),
    name="gpuablas_opt_inplace",
)
# Register the pass with the optimizer database under the usual tags;
# position 70.0 controls where it runs relative to other passes.
optdb.register(
    "InplaceGpuaBlasOpt",
    gpuablas_opt_inplace,
    "fast_run",
    "inplace",
    "gpuarray",
    position=70.0,
)
import logging
import numpy as np
from aesara import tensor as at
from aesara.gpuarray.basic_ops import (
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import gpu_context_type
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.link.c.op import _NoPythonExternalCOp
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor import as_tensor_variable
from aesara.tensor.type import discrete_dtypes
_logger = logging.getLogger("aesara.gpuarray.blocksparse")
class GpuSparseBlockGemv(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.
    def __init__(self, inplace=False):
        """Wrap the external C implementation; `inplace` reuses the first input's storage."""
        super().__init__("c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
        self.inplace = inplace
        if inplace:
            # Output 0 destroys input 0.
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        # The GPU context is taken from the node's first (GPU) input.
        ctx = node.inputs[0].type.context
        return self.params_type.get_params(self, context=ctx)

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]

    def make_node(self, o, W, h, inputIdx, outputIdx):
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)
        # Expected ranks: o and h are 3D, W is 4D, and the two index
        # arrays are 2D integer matrices.
        assert o.ndim == 3
        assert W.ndim == 4
        assert h.ndim == 3
        assert inputIdx.ndim == 2
        assert outputIdx.ndim == 2
        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes
        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        # Output has the same shape as the first input `o`.
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]
        Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gpu_sparse_block_gemv(
            h.zeros_like(), W.dimshuffle((1, 0, 3, 2)), go, outputIdx, inputIdx
        )
        # The index inputs are integer-valued, so their gradient is undefined.
        idx_grads = [
            grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"),
        ]
        return [go, Wgrad, hgrad] + idx_grads
# Module-level singletons: the non-destructive op for general use and the
# in-place variant (substituted by the in-place optimizations).
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        """Wrap the external C implementation; `inplace` reuses the first input's storage."""
        super().__init__(["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
        self.inplace = inplace
        if self.inplace:
            # Output 0 destroys input 0.
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        # The GPU context is taken from the node's first (GPU) input.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """Build the Apply node; `alpha` defaults to a float32 constant 1.0."""
        ctx = infer_context_name(o, x, y)
        one = at.constant(np.asarray(1.0, dtype="float32"))
        o = as_gpuarray_variable(o, ctx)
        x = as_gpuarray_variable(x, ctx)
        y = as_gpuarray_variable(y, ctx)
        xIdx = as_tensor_variable(xIdx)
        yIdx = as_tensor_variable(yIdx)
        if alpha is None:
            alpha = one
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        # Output has the same shape as the first input `o`.
        return [input_shapes[0]]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]
# Module-level singletons: the non-destructive op for general use and the
# in-place variant (substituted by the in-place optimizations).
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
#section support_code_apply
/* Batched block-sparse gemv.
 *
 * Accumulates, for every batch element b and output block j (alpha = beta = 1
 * in the gemvBatch calls below):
 *   out[b, j] = o[b, j] + sum_i W[inputIdx[b, i], outputIdx[b, j]] * h[b, i]
 * by building flat pointer/offset lists for every (i, j, b) combination and
 * issuing a single batched BLAS call.
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 */
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
  if (params->inplace) {
    /* Accumulate directly into o's storage. */
    Py_XDECREF(out);
    out = o;
    Py_INCREF(out);
  } else {
    /* Work on a copy so o itself is left untouched. */
    out = aesara_try_copy(out, o);
    if (out == NULL) {
      // Error already set
      return -1;
    }
  }
  gpudata **W_list = NULL;
  gpudata **inp_list = NULL;
  gpudata **out_list = NULL;
  size_t *offW = NULL;
  size_t *offInp = NULL;
  size_t *offOut = NULL;
  int err;
  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }
  /* Prepare lists for the batch */
  /* One gemv per (input block i, output block j, batch b) triple. */
  size_t maxi = PyGpuArray_DIMS(h)[1];
  size_t maxj = PyGpuArray_DIMS(out)[1];
  size_t maxb = PyGpuArray_DIMS(out)[0];
  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];
  W_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offW = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  inp_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offInp = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  out_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  if (W_list == NULL || offW == NULL ||
      inp_list == NULL || offInp == NULL ||
      out_list == NULL || offOut == NULL) {
    /* free(NULL) is a no-op, so unconditional frees are safe here. */
    free(W_list);
    free(offW);
    free(inp_list);
    free(offInp);
    free(out_list);
    free(offOut);
    PyErr_NoMemory();
    return -1;
  }
  /* Fill the per-gemv buffer/offset lists. Offsets combine the arrays'
     base offsets with their strides; W's block is selected through the
     per-(batch, block) entries of inputIdx/outputIdx. */
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        inp_list[p] = h->ga.data;
        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
        out_list[p] = out->ga.data;
        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
        W_list[p] = W->ga.data;
        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
                  *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
                  W->ga.offset;
      }
    }
  }
  /* Pick the leading dimension from whichever of W's last two axes is
     non-unit; the transpose flag compensates for the layout. */
  cb_transpose transA = cb_no_trans;
  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
  if (lda == 1) {
    transA = cb_trans;
    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
  }
  if (out->ga.typecode == GA_FLOAT) {
    err = gpublas_sgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = gpublas_dgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_HALF) {
    /* NOTE(review): half-precision data is dispatched to the float32
       sgemvBatch here, while blockger below uses hgerBatch for GA_HALF —
       confirm this asymmetry is intentional. */
    err = gpublas_sgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else {
    err = GA_INVALID_ERROR;
  }
  free(W_list);
  free(offW);
  free(inp_list);
  free(offInp);
  free(out_list);
  free(offOut);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
    return -1;
  }
  *_out = out;
  return 0;
}
#section support_code_apply
/* Batched block-sparse rank-1 update (ger).
 *
 * For every batch element b and every (i, j) block pair, accumulates the
 * scaled outer product of x[b, i] and y[b, j] into the output block selected
 * by xIdx[b, i] / yIdx[b, j], via a single batched BLAS gerBatch call.
 * The output starts as o (or as o's storage itself when inplace).
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 */
int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
  gpudata **o_list = NULL;
  gpudata **x_list = NULL;
  gpudata **y_list = NULL;
  size_t *offOut = NULL;
  size_t *offX = NULL;
  size_t *offY = NULL;
  int err;
  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }
  if (params->inplace) {
    /* Accumulate directly into o's storage. */
    Py_XDECREF(out);
    out = o;
    Py_INCREF(out);
  } else {
    /* Work on a copy so o itself is left untouched. */
    out = aesara_try_copy(out, o);
    if (out == NULL)
      return -1;
  }
  /* One ger per (input block i, output block j, batch b) triple. */
  size_t maxi = PyGpuArray_DIMS(x)[1];
  size_t maxj = PyGpuArray_DIMS(y)[1];
  size_t maxb = PyGpuArray_DIMS(x)[0];
  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  o_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  x_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offX = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  y_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offY = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  if (o_list == NULL || offOut == NULL ||
      x_list == NULL || offX == NULL ||
      y_list == NULL || offY == NULL) {
    /* free(NULL) is a no-op, so unconditional frees are safe here. */
    free(o_list);
    free(offOut);
    free(x_list);
    free(offX);
    free(y_list);
    free(offY);
    PyErr_NoMemory();
    return -1;
  }
  /* Fill the per-ger buffer/offset lists; the output block is selected
     through the per-(batch, block) entries of xIdx/yIdx. */
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        x_list[p] = x->ga.data;
        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
        y_list[p] = y->ga.data;
        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
        o_list[p] = out->ga.data;
        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 + *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 + out->ga.offset;
      }
    }
  }
  /* Element (not byte) strides along the innermost axis for BLAS. */
  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
  if (out->ga.typecode == GA_FLOAT) {
    err = gpublas_sgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(float *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = gpublas_dgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(double *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_HALF) {
    /* Half-precision variant; alpha is still read as float32. */
    err = gpublas_hgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(float *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else {
    err = GA_INVALID_ERROR;
  }
  free(o_list);
  free(offOut);
  free(x_list);
  free(offX);
  free(y_list);
  free(offY);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
    return -1;
  }
  *_out = out;
  return 0;
}
#section support_code_apply
/* Set the group count on a cuDNN convolution descriptor.
 *
 * Grouped convolution requires cuDNN 7+; on older versions this is a no-op
 * (the compile-time guard below drops the call entirely).
 * Returns 0 on success, -1 with a Python exception set on cuDNN error. */
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  cudnnStatus_t err = cudnnSetConvolutionGroupCount(desc, groups);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting groups for convolution : %s",
                 cudnnGetErrorString(err));
    return -1;
  }
#endif
  return 0;
}
/* Build a cuDNN Nd convolution descriptor from the filter shape and the
 * op parameters (padding, strides, dilation, border mode, groups).
 *
 * filt_shp: 1-D npy_int64 array holding the full filter shape; entries
 *           2..(nb_dims+1) are the spatial kernel sizes.
 * desc:     output; on success holds a newly created descriptor owned by
 *           the caller.  On failure -1 is returned with a Python exception
 *           set and nothing is leaked.
 */
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
                              cudnnConvolutionDescriptor_t *desc,
                              PARAMS_TYPE* params) {
  cudnnStatus_t err;
  int pad[3] = {params->pad0, params->pad1, params->pad2};
  int strides[3] = {params->sub0, params->sub1, params->sub2};
  int dilation[3] = {params->dil0, params->dil1, params->dil2};

  /* BUG FIX: validate the filter-shape length BEFORE dereferencing its
   * entries — the border-mode branches below read indices 2..4 of
   * filt_shp, which were out-of-bounds reads for a malformed shape when
   * this check came last.  Also report consistent values (both sides of
   * the comparison are total dimension counts). */
  if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
    PyErr_Format(PyExc_ValueError, "Filter shape has the wrong number of "
                 "dimensions: expected %d, got %lld.", params->nb_dims + 2,
                 (long long)PyArray_DIM(filt_shp, 0));
    return -1;
  }

  if (params->bmode == BORDER_MODE_FULL) {
    /* "full" border: pad by the dilated kernel extent minus one. */
    pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
    pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
    if (params->nb_dims > 2) {
      pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
    }
  } else if (params->bmode == BORDER_MODE_HALF) {
    /* "half" border: pad by half the dilated kernel extent. */
    pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
    pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
    if (params->nb_dims > 2) {
      pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
    }
  }

  err = cudnnCreateConvolutionDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
                 "descriptor: %s", cudnnGetErrorString(err));
    return -1;
  }
  err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
                                        dilation, params->conv_mode, params->precision);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not set convolution "
                 "descriptor: %s", cudnnGetErrorString(err));
    /* BUG FIX: do not leak the descriptor we just created. */
    cudnnDestroyConvolutionDescriptor(*desc);
    return -1;
  }
  if (c_set_groups_for_conv(*desc, params->num_groups) == -1) {
    /* BUG FIX: same — release the descriptor on failure. */
    cudnnDestroyConvolutionDescriptor(*desc);
    return -1;
  }
  return 0;
}
#section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
/* im2col for 3-D inputs with dilation: one grid-stride work item per
 * element of the (channels, height_col, width_col, depth_col) grid; each
 * item copies the full set of kernel taps for its output position into
 * data_col, zero-filling taps that land in the padding. */
KERNEL void dilated_im3d2col_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
                                    const ga_size offset_im,
                                    const ga_size data_im_offset,
                                    // offset_im is the pointer offset for data_im.
                                    // data_im_offset is an offset of elements in the array
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
                                    const ga_size offset_col) {
  /* Apply the raw byte offsets to both buffer pointers. */
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    /* Decompose the flat index:
     * index = ((c_im * height_col + h_col) * width_col + w_col) * depth_col + d_col */
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    /* First col-row written for this channel's kernel taps. */
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    /* Top-left-front corner of the receptive field (may fall in padding). */
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
                    h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
                   h_offset * (width * depth) + w_offset * depth + d_offset;
    /* Walk the kernel taps; taps in the padding write 0.
     * NOTE(review): ga_size looks like an unsigned type, which would make
     * the `>= 0` tests below vacuous — left-side out-of-range coordinates
     * would instead wrap around and fail the `< height/width/depth`
     * tests.  Confirm against the gpuarray typedef. */
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i * dilation_h;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j * dilation_w;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k * dilation_d;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
            data_im_ptr[i * dilation_h * (width * depth) +
                        j * dilation_w * depth +
                        k * dilation_d] : 0;
          /* Successive taps are one full col-plane apart. */
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
/* im2col for 3-D inputs without dilation — same layout and indexing as
 * dilated_im3d2col_kernel with all dilations fixed at 1. */
KERNEL void im3d2col_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
                            const ga_size offset_im,
                            const ga_size data_im_offset,
                            // offset_im is the pointer offset for data_im.
                            // data_im_offset is an offset of elements in the array
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_INPUT_0 * data_col,
                            const ga_size offset_col) {
  /* Apply the raw byte offsets to both buffer pointers. */
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    /* index = ((c_im * height_col + h_col) * width_col + w_col) * depth_col + d_col */
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    /* First col-row written for this channel's kernel taps. */
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    /* Top-left-front corner of the receptive field (may fall in padding). */
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
                    h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
                   h_offset * (width * depth) + w_offset * depth + d_offset;
    /* Copy kernel taps; taps in the padding write 0.
     * NOTE(review): ga_size looks unsigned, making the `>= 0` tests
     * vacuous; out-of-range-left is presumably caught by wraparound
     * failing the `<` tests — confirm against the gpuarray typedef. */
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
            data_im_ptr[i * (width * depth) + j * depth + k] : 0;
          /* Successive taps are one full col-plane apart. */
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
/* col2im for 3-D with dilation: one grid-stride work item per INPUT
 * element; each item gathers (sums) every data_col entry whose receptive
 * field covered that element, avoiding atomics entirely. */
KERNEL void dilated_col2im3d_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
                                    const ga_size offset_col,
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size channels,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
                                    const ga_size offset_im,
                                    const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    /* Padded coordinates of this input element
     * (index = ((c_im * height + h) * width + w) * depth + d). */
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    /* Effective ("dilated") kernel extents. */
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    ga_size kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
    // compute the start and end of the output
    const ga_size d_col_start =
      (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          /* Kernel-tap displacement for this output position; it only
           * contributes when it lands exactly on the dilation grid. */
          ga_size h_k = (h_im - h_col * stride_h);
          ga_size w_k = (w_im - w_col * stride_w);
          ga_size d_k = (d_im - d_col * stride_d);
          if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
            h_k /= dilation_h;
            w_k /= dilation_w;
            d_k /= dilation_d;
            /* Row = ((c_im*kH + h_k)*kW + w_k)*kD + d_k; column =
             * (h_col*width_col + w_col)*depth_col + d_col. */
            ga_size data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
              h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
              w_k * kernel_d * height_col * width_col * depth_col +
              d_k * height_col * width_col * depth_col +
              h_col * width_col * depth_col +
              w_col * depth_col +
              d_col;
            val += data_col[data_col_index];
          }
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
/* col2im for 3-D without dilation: one grid-stride work item per INPUT
 * element gathers all contributing data_col entries (no atomics).  Uses
 * Caffe's closed-form col indexing via a base offset plus per-axis
 * coefficients instead of recomputing the full index each iteration. */
KERNEL void col2im3d_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
                            const ga_size offset_col,
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size channels,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_INPUT_0 * data_im,
                            const ga_size offset_im,
                            const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    /* Padded coordinates of this input element. */
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    // compute the start and end of the output
    const ga_size d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    /* Base col index for (h_col, w_col, d_col) = (0, 0, 0); the coeff_*
     * terms fold "advance one output position = retreat `stride` kernel
     * taps" into a single multiply per axis (borrowed from Caffe). */
    ga_size offset =
      (c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
       w_im * kernel_d + d_im) * height_col * width_col * depth_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
    ga_size coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
    ga_size coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code
/* Dispatch a GEMM on raw GpuArray buffers by element type.
 * offA/offB/offC are extra offsets in elements; each array's own byte
 * offset is converted to elements and folded in.  All three operands are
 * assumed to share A's typecode.  Returns a gpublas error code;
 * GA_UNSUPPORTED_ERROR for element types other than half/float/double. */
int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
          size_t M, size_t N, size_t K, double alpha,
          GpuArray *A, size_t offA, size_t lda,
          GpuArray *B, size_t offB, size_t ldb,
          double beta, GpuArray *C, size_t offC, size_t ldc) {
  size_t elsize;
  switch (A->typecode) {
  case GA_FLOAT:  elsize = 4; break;
  case GA_DOUBLE: elsize = 8; break;
  case GA_HALF:   elsize = 2; break;
  default:
    return GA_UNSUPPORTED_ERROR;
  }
  /* Convert the arrays' byte offsets to element offsets once. */
  const size_t a_off = (A->offset / elsize) + offA;
  const size_t b_off = (B->offset / elsize) + offB;
  const size_t c_off = (C->offset / elsize) + offC;
  if (A->typecode == GA_FLOAT)
    return gpublas_sgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, a_off, lda,
                         B->data, b_off, ldb,
                         beta,
                         C->data, c_off, ldc);
  if (A->typecode == GA_DOUBLE)
    return gpublas_dgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, a_off, lda,
                         B->data, b_off, ldb,
                         beta,
                         C->data, c_off, ldc);
  /* GA_HALF is the only remaining possibility. */
  return gpublas_hgemm(o, tA, tB,
                       M, N, K, alpha,
                       A->data, a_off, lda,
                       B->data, b_off, ldb,
                       beta,
                       C->data, c_off, ldc);
}
#section support_code_struct
/* Unfold one image (a single batch element) of a 3-D input into the
 * column buffer so a correlation can be computed as one GEMM.
 * Dispatches to the dilated or the plain kernel as appropriate.
 * Returns GA_NO_ERROR on success; on failure sets a Python RuntimeError
 * and returns the gpuarray error code. */
int im3d2col(
    GpuArray *data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
    const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
    const size_t pad_h, const size_t pad_w, const size_t pad_d,
    const size_t stride_h, const size_t stride_w, const size_t stride_d,
    GpuArray *data_col) {
  /* Effective ("dilated") kernel extents and resulting output sizes. */
  const size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  const size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  const size_t dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
  const size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  const size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  const size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
  /* One work item per element of the unfolded (c, h, w, d) output grid. */
  size_t num_kernels = channels * height_col * width_col * depth_col;
  const int dilated =
      (dilation_h != 1 || dilation_w != 1 || dilation_d != 1);
  int status;
  if (dilated) {
    status = dilated_im3d2col_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_im->data, data_im->offset,
        data_im_offset, height, width, depth,
        kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
        pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
        width_col, depth_col, data_col->data, data_col->offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im3d2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im3d2col_kernel, status));
  } else {
    /* Specialized kernel without the per-tap dilation multiplies. */
    status = im3d2col_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_im->data, data_im->offset,
        data_im_offset, height, width, depth,
        kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
        stride_h, stride_w, stride_d, height_col, width_col, depth_col,
        data_col->data, data_col->offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im3d2col_kernel: %s.",
                   GpuKernel_error(&k_im3d2col_kernel, status));
  }
  return status;
}
/* Fold a column buffer back into one image (a single batch element) of a
 * 3-D array, summing overlapping contributions.  One work item is
 * launched per destination element so no atomic adds are needed.
 * Returns GA_NO_ERROR on success; on failure sets a Python RuntimeError
 * and returns the gpuarray error code. */
int col2im3d(GpuArray *data_col, const size_t channels,
             const size_t height, const size_t width, const size_t depth,
             const size_t patch_h, const size_t patch_w, const size_t patch_d,
             const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
             const size_t pad_h, const size_t pad_w, const size_t pad_d,
             const size_t stride_h, const size_t stride_w, const size_t stride_d,
             GpuArray *data_im, const size_t data_im_offset) {
  /* Effective ("dilated") patch extents and the col-buffer grid sizes. */
  const size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  const size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  const size_t dil_patch_d = (patch_d - 1) * dilation_d + 1;
  const size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  const size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  const size_t depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
  /* One work item per element of the destination image. */
  size_t num_kernels = channels * height * width * depth;
  const int dilated =
      (dilation_h != 1 || dilation_w != 1 || dilation_d != 1);
  int status;
  if (dilated) {
    status = dilated_col2im3d_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_col->data, data_col->offset,
        height, width, depth, channels, patch_h, patch_w,
        patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
        stride_h, stride_w, stride_d, height_col, width_col, depth_col,
        data_im->data, data_im->offset, data_im_offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im3d_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im3d_kernel, status));
  } else {
    /* Specialized kernel without the dilation divisibility checks. */
    status = col2im3d_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_col->data, data_col->offset,
        height, width, depth, channels, patch_h, patch_w,
        patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
        height_col, width_col, depth_col,
        data_im->data, data_im->offset, data_im_offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im3d_kernel: %s.",
                   GpuKernel_error(&k_col2im3d_kernel, status));
  }
  return status;
}
// Aesara op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
/* 3-D correlation implemented as im2col/col2im plus GEMM (Caffe-style).
 *
 * direction: 0 = forward            (bottom, weight -> top)
 *            1 = gradient wrt. weights (bottom, top -> weight)
 *            2 = gradient wrt. inputs  (top, weight -> bottom)
 * All three arrays must be C-contiguous and 5-D; the destination array
 * for the chosen direction must be pre-allocated with the correct shape.
 * The returned pointer aliases that destination (no refcount change here;
 * see the note at the bottom).  Returns NULL with a Python exception set
 * on error.
 */
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0,
                           const size_t numgroups = 1)
{
  if (PyGpuArray_NDIM(bottom) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0],
                 PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2],
                 PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0],
                 PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2],
                 PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0],
                 PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2],
                 PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }
  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1] * numgroups) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  if ((nFilters % numgroups) != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "CorrMM the number of filters must be divisible by the number of groups\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
  const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
  const size_t topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
  // the above values might be negative so we need to use Python-like
  // flooring integer division to be compatible with get_conv_output.
  // note: this macro implements Python's // for negative x only
  // NOTE(review): the operands are size_t (unsigned), so the x < 0 branch
  // can never trigger as written; an "overly negative" extent wraps to a
  // huge value and is caught by the top-shape consistency check below.
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
  const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
  const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
  const size_t topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
  // BUG FIX: the original #undef'd the non-existent name _CONV_FLOORDIV,
  // leaving _CONV_FLOORDIV_X defined past this function.
#undef _CONV_FLOORDIV_X
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4]) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 "  bottom shape: %ld %ld %ld %ld %ld\n"
                 "  weight shape: %ld %ld %ld %ld %ld\n"
                 "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels / numgroups, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }
  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }
  // Create temporary columns
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                         bottom->ga.typecode,
                                                         GA_C_ORDER,
                                                         bottom->context,
                                                         Py_None);
  if (NULL == col)
  {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }
  // Define some useful variables
  const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
  const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
  const size_t K_ = col_dim[0] / numgroups;
  const size_t N_ = col_dim[1];
  const size_t group_col_stride = (K_ * N_);
  const size_t M_ = nFilters / numgroups;
  PyGpuArrayObject *output;
  if (direction == 0) {  // forward pass
    output = top;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      // Degenerate problem: the output is all zeros.
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM could not fill the output with zeros: %d", err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(
          &bottom->ga, n * batch_bottom_stride, nChannels, bottomHeight,
          bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
          padH, padW, padD, dH, dW, dD, &col->ga);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      for (size_t g = 0; g < numgroups; ++g) {
        // Second, gemm
        err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
                    N_, M_, K_, 1,
                    &col->ga, g * group_col_stride, N_,
                    &weight->ga, g * group_weight_stride, K_,
                    0,
                    &top->ga, n * batch_top_stride + g * group_top_stride, N_);
        // BUG FIX: check every group's status; the original tested err
        // only after the loop, so a failure in any group but the last
        // was silently overwritten.
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuCorr3dMM forward encountered an error running gemm.");
          Py_DECREF(col);
          return NULL;
        }
      }
    }
  }
  else if (direction == 1) {  // backprop wrt. weights
    output = weight;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d", err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid convolution: im3col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(
          &bottom->ga, n * batch_bottom_stride, nChannels, bottomHeight,
          bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
          padH, padW, padD, dH, dW, dD, &col->ga);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      for (size_t g = 0; g < numgroups; ++g) {
        err = rgemm(cb_fortran, cb_trans, cb_no_trans,
                    K_, M_, N_, 1,
                    &col->ga, g * group_col_stride, N_,
                    &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                    (n == 0) ? 0 : 1,
                    &weight->ga, g * group_weight_stride, K_);
        // BUG FIX: per-group status check (see forward pass).
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuCorr3dMM grad weights encountered an error running gemm.");
          Py_DECREF(col);
          return NULL;
        }
      }
    }
    // (The original had an unreachable `if (batchSize == 0)` memset here;
    // the degenerate-shape early return above already covers that case.)
  }
  else if (direction == 2) {  // backprop wrt. inputs
    output = bottom;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d", err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      for (size_t g = 0; g < numgroups; ++g) {
        err = rgemm(cb_fortran, cb_no_trans, cb_trans,
                    N_, K_, M_, 1,
                    &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                    &weight->ga, g * group_weight_stride, K_,
                    0,
                    &col->ga, g * group_col_stride, N_);
        // BUG FIX: per-group status check (see forward pass).
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuCorr3dMM grad inputs encountered an error running gemm.");
          Py_DECREF(col);
          return NULL;
        }
      }
      // col2im3d back to the data
      err = col2im3d(&col->ga, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                     dH, dW, dD, &bottom->ga, n * batch_bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else {
    // BUG FIX: the original fell through with `output` uninitialized for
    // an unknown direction and returned a garbage pointer.
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM: invalid direction %lu",
                 (unsigned long)direction);
    Py_DECREF(col);
    return NULL;
  }
  // Free temporary columns
  Py_DECREF(col);
  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
#section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
/* im2col for 2-D inputs with dilation: one grid-stride work item per
 * element of the (channels, height_col, width_col) grid; each item copies
 * all kernel taps for its output position into data_col, zero-filling
 * taps that land in the padding. */
KERNEL void dilated_im2col_kernel(const ga_size n,
                                  GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
                                  const ga_size offset_im,
                                  const ga_size data_im_offset,
                                  // offset_im is the pointer offset for data_im.
                                  // data_im_offset is an offset of elements in the array
                                  const ga_size height, const ga_size width,
                                  const ga_size kernel_h, const ga_size kernel_w,
                                  const ga_size dilation_h, const ga_size dilation_w,
                                  const ga_size pad_hl, const ga_size pad_wl,
                                  const ga_size stride_h, const ga_size stride_w,
                                  const ga_size height_col, const ga_size width_col,
                                  GLOBAL_MEM DTYPE_INPUT_0 * data_col,
                                  const ga_size offset_col) {
  /* Apply the raw byte offsets to both buffer pointers. */
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    /* index = (c_im * height_col + h_col) * width_col + w_col */
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    /* First col-row written for this channel's kernel taps. */
    const ga_size c_col = c_im * kernel_h * kernel_w;
    /* Top-left corner of the receptive field (may fall in padding). */
    const ga_size h_offset = h_col * stride_h - pad_hl;
    const ga_size w_offset = w_col * stride_w - pad_wl;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    /* Copy kernel taps; taps in the padding write 0.
     * NOTE(review): ga_size looks unsigned, making the `>= 0` tests
     * vacuous; out-of-range-left is presumably caught by wraparound
     * failing the `<` tests — confirm against the gpuarray typedef. */
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i * dilation_h;
        ga_size w_im = w_offset + j * dilation_w;
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
          data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        /* Successive taps are one full col-plane apart. */
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// im2col without dilation: same layout as dilated_im2col_kernel but with
// unit tap spacing (dilation_h == dilation_w == 1 is assumed by the caller).
KERNEL void im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
    const ga_size data_im_offset,
    // offset_im is the pointer offset for data_im.
    // data_im_offset is an offset of elements in the array
    const ga_size height, const ga_size width,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_hl, const ga_size pad_wl,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
    const ga_size offset_col) {
  // Apply the raw byte offsets of the two buffers before element indexing.
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, output row, output column).
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w;
    // Top-left input coordinate of this patch (may land in the padding).
    const ga_size h_offset = h_col * stride_h - pad_hl;
    const ga_size w_offset = w_col * stride_w - pad_wl;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i ;
        ga_size w_im = w_offset + j ;
        // NOTE(review): ga_size appears to be unsigned, so "h_im >= 0" is
        // always true; padding taps wrap to huge values and are rejected by
        // the "< height"/"< width" comparisons instead.
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * width + j] : 0;
        // Successive kernel taps go to successive rows of the column matrix.
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
// col2im with dilation: one work-item per *input* element accumulates every
// column entry that was sampled from it, so no atomics are needed.
KERNEL void dilated_col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size dilation_h, const ga_size dilation_w,
    const ga_size pad_hl, const ga_size pad_wl,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
    const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    // Padded input coordinates of this element.
    const ga_size w_im = index % width + pad_wl;
    const ga_size h_im = (index / width) % height + pad_hl;
    const ga_size c_im = index / (width * height);
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the start and end of the output
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        // Offset inside the patch; only taps on the dilation grid contribute.
        ga_size h_k = (h_im - h_col * stride_h);
        ga_size w_k = (w_im - w_col * stride_w);
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          ga_size data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
                                    height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
// col2im without dilation: one work-item per *input* element accumulates
// every column entry sampled from it (gather instead of scatter, no atomics).
KERNEL void col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_hl, const ga_size pad_wl,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
    const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    // Padded input coordinates of this element.
    const ga_size w_im = index % width + pad_wl;
    const ga_size h_im = (index / width) % height + pad_hl;
    const ga_size c_im = index / (width * height);
    // compute the start and end of the output
    const ga_size w_col_start =
      (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // equivalent implementation, no dilation
    // Closed-form indexing: walking (h_col, w_col) with these coefficients
    // visits the same data_col entries as the explicit tap arithmetic above.
    ga_size offset =
      (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    ga_size coeff_w_col = (1 - stride_w * height_col * width_col);
    for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code
// Dispatch a GEMM (C = alpha * op(A) * op(B) + beta * C) to the gpublas
// routine matching the element type of A. The GpuArray byte offsets are
// converted to element offsets by dividing by the element size (4 for
// float32, 8 for float64, 2 for float16) before adding the caller-supplied
// element offsets. Returns a GA_* status code; GA_UNSUPPORTED_ERROR for any
// other typecode.
int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
          size_t M, size_t N, size_t K, double alpha,
          GpuArray *A, size_t offA, size_t lda,
          GpuArray *B, size_t offB, size_t ldb,
          double beta, GpuArray *C, size_t offC, size_t ldc) {
  if (A->typecode == GA_FLOAT)
    return gpublas_sgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, (A->offset / 4) + offA, lda,
                         B->data, (B->offset / 4) + offB, ldb,
                         beta,
                         C->data, (C->offset / 4) + offC, ldc);
  if (A->typecode == GA_DOUBLE)
    return gpublas_dgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, (A->offset / 8) + offA, lda,
                         B->data, (B->offset / 8) + offB, ldb,
                         beta,
                         C->data, (C->offset / 8) + offC, ldc);
  if (A->typecode == GA_HALF)
    return gpublas_hgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, (A->offset / 2) + offA, lda,
                         B->data, (B->offset / 2) + offB, ldb,
                         beta,
                         C->data, (C->offset / 2) + offC, ldc);
  return GA_UNSUPPORTED_ERROR;
}
#section support_code_struct
// Host-side launcher for the im2col kernels: unfolds one image of
// `data_im` (starting at element data_im_offset) into the column matrix
// `data_col`. Picks the dilated kernel only when a dilation factor differs
// from 1. Returns the GA_* kernel status; on failure a Python RuntimeError
// is also set.
int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
           const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_hl, const size_t pad_hr,
           const size_t pad_wl, const size_t pad_wr,
           const size_t stride_h, const size_t stride_w,
           GpuArray *data_col) {
  // Effective (dilated) kernel extent and the resulting output resolution.
  const size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  const size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  const size_t height_col = (height + pad_hl + pad_hr - dil_kernel_h) / stride_h + 1;
  const size_t width_col = (width + pad_wl + pad_wr - dil_kernel_w) / stride_w + 1;
  // One work-item per (channel, output row, output column) triple.
  size_t num_kernels = channels * height_col * width_col;
  int err;
  if (dilation_h == 1 && dilation_w == 1) {
    err = im2col_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_im->data, data_im->offset, data_im_offset,
      height, width, kernel_h, kernel_w,
      pad_hl, pad_wl, stride_h, stride_w, height_col,
      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im2col_kernel: %s.",
                   GpuKernel_error(&k_im2col_kernel, err));
  } else {
    err = dilated_im2col_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_im->data, data_im->offset, data_im_offset,
      height, width, kernel_h, kernel_w,
      dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w, height_col,
      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im2col_kernel, err));
  }
  return err;
}
// Host-side launcher for the col2im kernels: folds the column matrix
// `data_col` back into one image of `data_im` (starting at element
// data_im_offset), summing overlapping contributions. One work-item is
// launched per input element, so the kernels gather rather than scatter and
// no atomic operations are needed. Returns the GA_* kernel status; on
// failure a Python RuntimeError is also set.
int col2im(GpuArray *data_col, const size_t channels,
           const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_hl, const size_t pad_hr, const size_t pad_wl, const size_t pad_wr,
           const size_t stride_h, const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
  // Effective (dilated) patch extent and the column-matrix resolution.
  const size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  const size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  const size_t height_col = (height + pad_hl + pad_hr - dil_patch_h) / stride_h + 1;
  const size_t width_col = (width + pad_wl + pad_wr - dil_patch_w) / stride_w + 1;
  size_t num_kernels = channels * height * width;
  int err;
  if (dilation_h == 1 && dilation_w == 1) {
    err = col2im_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_col->data, data_col->offset,
      height, width, channels, patch_h, patch_w,
      pad_hl, pad_wl, stride_h, stride_w,
      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im_kernel: %s.",
                   GpuKernel_error(&k_col2im_kernel, err));
  } else {
    err = dilated_col2im_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_col->data, data_col->offset,
      height, width, channels, patch_h, patch_w,
      dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w,
      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im_kernel, err));
  }
  return err;
}
// Aesara op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// GEMM-based 2d correlation (the GpuCorrMM family of Ops), following the
// Caffe/torch approach of im2col + gemm (+ col2im for the input gradient).
//
// direction selects what is computed and which argument is written to:
//   0: forward pass           -> writes into `top`
//   1: gradient wrt. weights  -> writes into `weight`
//   2: gradient wrt. inputs   -> writes into `bottom`
// dH/dW are the strides, dilH/dilW the dilation factors, padH_l/padH_r/
// padW_l/padW_r the asymmetric paddings, numgroups the number of grouped
// convolutions and unshared selects locally-connected (6d) weights.
//
// Returns the output array (an alias of bottom, weight or top — no refcount
// change, see the note at the end) or NULL with a Python exception set.
PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         PyGpuArrayObject *const weight,
                         PyGpuArrayObject *const top,
                         const size_t direction,
                         const size_t dH = 1,
                         const size_t dW = 1,
                         const size_t dilH = 1,
                         const size_t dilW = 1,
                         const size_t padH_l = 0,
                         const size_t padH_r = 0,
                         const size_t padW_l = 0,
                         const size_t padW_r = 0,
                         const size_t numgroups = 1,
                         const size_t unshared = 0)
{
    // All three arrays must be 4d (weights 6d when unshared) and
    // C-contiguous, since im2col/col2im and rgemm index them directly.
    if (PyGpuArray_NDIM(bottom) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorrMM requires bottom to be C-contiguous, "
                     "but strides are: %ld %ld %ld %ld\n",
                     PyGpuArray_STRIDES(bottom)[0],
                     PyGpuArray_STRIDES(bottom)[1],
                     PyGpuArray_STRIDES(bottom)[2],
                     PyGpuArray_STRIDES(bottom)[3]);
        return NULL;
    }
    if (PyGpuArray_NDIM(weight) != (unshared ? 6 : 4))
    {
        PyErr_Format(PyExc_ValueError, "GpuCorrMM requires weight of %dD", unshared ? 6 : 4);
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        if (unshared) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM requires weight to be C-contiguous, "
                         "but strides are: %ld %ld %ld %ld %ld %ld\n",
                         PyGpuArray_STRIDES(weight)[0],
                         PyGpuArray_STRIDES(weight)[1],
                         PyGpuArray_STRIDES(weight)[2],
                         PyGpuArray_STRIDES(weight)[3],
                         PyGpuArray_STRIDES(weight)[4],
                         PyGpuArray_STRIDES(weight)[5]);
            return NULL;
        }
        else {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM requires weight to be C-contiguous, "
                         "but strides are: %ld %ld %ld %ld\n",
                         PyGpuArray_STRIDES(weight)[0],
                         PyGpuArray_STRIDES(weight)[1],
                         PyGpuArray_STRIDES(weight)[2],
                         PyGpuArray_STRIDES(weight)[3]);
            return NULL;
        }
    }
    if (PyGpuArray_NDIM(top) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorrMM requires top to be C-contiguous, "
                     "but strides are: %ld %ld %ld %ld\n",
                     PyGpuArray_STRIDES(top)[0],
                     PyGpuArray_STRIDES(top)[1],
                     PyGpuArray_STRIDES(top)[2],
                     PyGpuArray_STRIDES(top)[3]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    // or (nFilters, out_rows, out_columns, nChannels, rows, columns) -> for unshared
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[unshared ? 4 : 2];
    const size_t kW = PyGpuArray_DIMS(weight)[unshared ? 5 : 3];
    if (nChannels != PyGpuArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
        PyErr_SetString(PyExc_ValueError,
                        "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    if ((nFilters % numgroups) != 0) {
        PyErr_SetString(PyExc_ValueError,
                        "GPUCorrMM the number of filters must be divisible by the number of groups\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
    // These intermediate values can be negative (when the dilated kernel is
    // larger than the padded input), so they must be computed in a *signed*
    // type: with a size_t declaration the (x < 0) branch of the
    // floor-division macro below is dead code and a negative value wraps
    // around to a huge unsigned number instead of flooring like Python's //.
    const ssize_t topHeightNoDH = (ssize_t)(bottomHeight + padH_l + padH_r) - (ssize_t)dil_kH;
    const ssize_t topWidthNoDW = (ssize_t)(bottomWidth + padW_l + padW_r) - (ssize_t)dil_kW;
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    // The divisors are cast to ssize_t so the division stays signed.
    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, (ssize_t)dH) + 1;
    const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, (ssize_t)dW) + 1;
#undef _CONV_FLOORDIV_X
    if (unshared) {
        // Unshared weights carry the output resolution in dims 1 and 2.
        if (topHeight != PyGpuArray_DIMS(weight)[1] ||
                topWidth != PyGpuArray_DIMS(weight)[2]) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM regions in kernel must match output regions:\n"
                         "  bottom shape: %ld %ld %ld %ld\n"
                         "  weight shape: %ld %ld %ld %ld %ld %ld"
                         " (expected %ld %ld %ld %ld %ld %ld)\n"
                         "  top shape(calculated): %ld %ld %ld %ld\n",
                         batchSize, nChannels, bottomHeight, bottomWidth,
                         nFilters, PyGpuArray_DIMS(weight)[1],
                         PyGpuArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
                         nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
                         batchSize, nFilters, topHeight, topWidth);
            return NULL;
        }
        if (batchSize != PyGpuArray_DIMS(top)[0] ||
                nFilters != PyGpuArray_DIMS(top)[1] ||
                topHeight != PyGpuArray_DIMS(top)[2] ||
                topWidth != PyGpuArray_DIMS(top)[3]) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM shape inconsistency:\n"
                         "  bottom shape: %ld %ld %ld %ld\n"
                         "  weight shape: %ld %ld %ld %ld %ld %ld\n"
                         "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
                         batchSize, nChannels, bottomHeight, bottomWidth,
                         nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                         batchSize, nFilters, topHeight, topWidth);
            return NULL;
        }
    }
    else{
        if (batchSize != PyGpuArray_DIMS(top)[0] ||
                nFilters != PyGpuArray_DIMS(top)[1] ||
                topHeight != PyGpuArray_DIMS(top)[2] ||
                topWidth != PyGpuArray_DIMS(top)[3]) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM shape inconsistency:\n"
                         "  bottom shape: %ld %ld %ld %ld\n"
                         "  weight shape: %ld %ld %ld %ld\n"
                         "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
                         batchSize, nChannels, bottomHeight, bottomWidth,
                         nFilters, nChannels / numgroups, kH, kW,
                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                         batchSize, nFilters, topHeight, topWidth);
            return NULL;
        }
    }
    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }
    // Create temporary columns: one unfolded image of shape
    // (nChannels * kW * kH, topHeight * topWidth), reused for every batch item.
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH;
    col_dim[1] = topHeight * topWidth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorrMM failed to allocate working memory of %ld x %ld\n",
                     col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables (byte strides converted to element strides).
    const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
    const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
    // GEMM sizes per group: M_ filters, K_ taps, N_ output positions.
    const size_t K_ = col_dim[0] / numgroups;
    const size_t N_ = col_dim[1];
    const size_t group_col_stride = (K_ * N_);
    const size_t M_ = nFilters / numgroups;
    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        // Degenerate shapes produce an all-zero output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(&bottom->ga, n * batch_bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            if (unshared) {
                // Unshared: one 1 x M_ x K_ gemm per output position, each
                // with its own weight slice.
                for (size_t g = 0; g < numgroups; ++g) {
                    for (size_t reg = 0; reg < N_; ++reg){
                        err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                    1, M_, K_, 1,
                                    &col->ga, g * group_col_stride + reg, N_,
                                    &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
                                    0,
                                    &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_);
                        if (err != GA_NO_ERROR) {
                            PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
                            Py_DECREF(col);
                            return NULL;
                        }
                    }
                }
            }
            else {
                for (size_t g = 0; g < numgroups; ++g){
                    err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                N_, M_, K_, 1,
                                &col->ga, g * group_col_stride, N_,
                                &weight->ga, g * group_weight_stride, K_,
                                0,
                                &top->ga, n * batch_top_stride + g * group_top_stride, N_);
                    if (err != GA_NO_ERROR) {
                        PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
                        Py_DECREF(col);
                        return NULL;
                    }
                }
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad wrt. weights could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(&bottom->ga, n * batch_bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            if (unshared) {
                for (size_t g = 0; g < numgroups; ++g) {
                    for (size_t reg = 0; reg < N_; ++reg){
                        err = rgemm(cb_fortran, cb_trans, cb_no_trans,
                                    K_, M_, 1, 1,
                                    &col->ga, g * group_col_stride + reg, N_,
                                    &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
                                    (n == 0) ? 0 : 1,
                                    &weight->ga, g * group_weight_stride + reg * K_, K_ * N_);
                        if (err != GA_NO_ERROR) {
                            PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
                            Py_DECREF(col);
                            return NULL;
                        }
                    }
                }
            }
            else{
                for(size_t g = 0; g < numgroups; g++){
                    err = rgemm(cb_fortran, cb_trans, cb_no_trans,
                                K_, M_, N_, 1,
                                &col->ga, g * group_col_stride, N_,
                                &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                                (n == 0) ? 0 : 1,
                                &weight->ga, g * group_weight_stride, K_);
                    if (err != GA_NO_ERROR) {
                        PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
                        Py_DECREF(col);
                        return NULL;
                    }
                }
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad wrt. inputs could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns
            if (unshared) {
                for (size_t g = 0; g < numgroups; ++g){
                    for (size_t reg = 0; reg < N_; ++reg) {
                        err = rgemm(cb_fortran, cb_no_trans, cb_trans,
                                    1, K_, M_, 1,
                                    &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
                                    &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
                                    0,
                                    &col->ga, g * group_col_stride + reg, N_);
                        if (err != GA_NO_ERROR) {
                            PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
                            Py_DECREF(col);
                            return NULL;
                        }
                    }
                }
            }
            else {
                for (size_t g = 0; g < numgroups; ++g){
                    err = rgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                                &weight->ga, g * group_weight_stride, K_,
                                0,
                                &col->ga, g * group_col_stride, N_);
                    if (err != GA_NO_ERROR) {
                        PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
                        Py_DECREF(col);
                        return NULL;
                    }
                }
            }
            // col2im back to the data
            err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH_l, padH_r, padW_l, padW_r,
                         dH, dW, &bottom->ga, n * batch_bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else {
        // Defensive: previously an unknown direction fell through and
        // returned an *uninitialized* output pointer (undefined behavior).
        PyErr_Format(PyExc_ValueError,
                     "GpuCorrMM: direction must be 0, 1 or 2, got %ld",
                     (long)direction);
        Py_DECREF(col);
        return NULL;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
#section init_code
// Initialize the CUDA-specific pygpu extension API before any of the code
// below runs (presumably resolves the cuda_* / extension entry points used
// by this module — confirm against pygpu's gpuarray_api headers).
setup_ext_cuda();
#section support_code
// Per-call state for the warp-ctc (GPU) loss computation. All pointers are
// owned by the context and released in ctc_context_destroy().
typedef struct ctc_context {
    struct ctcOptions options;  // warp-ctc options; loc/stream set in ctc_context_init()
    gpudata * workspace;        // device scratch buffer sized by get_workspace_size()
    int * input_lengths;        // host copy of per-sample input lengths (malloc'd)
    int * flat_labels;          // host labels flattened with negative padding removed
    int * label_lengths;        // host per-sample label counts (parallel to flat_labels)
} ctc_context_t;
// Zero-initialize a ctc_context_t and configure warp-ctc to run on the GPU
// using the CUDA stream of the given pygpu context.
void ctc_context_init(ctc_context_t * context, PyGpuContextObject * gpu_context)
{
    memset(&(context->options), 0, sizeof(struct ctcOptions));
    context->options.loc = CTC_GPU;
    // Get CUDA function pointer to obtain stream
    // NOTE(review): the extension lookup result is not checked for NULL; if
    // "cuda_get_stream" were ever missing this would crash — confirm the
    // extension is guaranteed present when this op is compiled.
    CUstream (*getstream_func_ptr)(void *) = (CUstream (*)(void *)) gpuarray_get_extension( "cuda_get_stream" );
    context->options.stream = getstream_func_ptr(gpu_context->ctx);
    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
// Release every resource held by the context. Safe to call on a context
// that was only partially populated (all members start as NULL in
// ctc_context_init and this is invoked from several early error paths).
void ctc_context_destroy(ctc_context_t * context)
{
    // free() is NULL-safe, but gpudata_release() is not documented as such,
    // so guard the workspace explicitly (it is still NULL whenever destroy
    // runs before the workspace allocation succeeds).
    if ( NULL != context->workspace )
        gpudata_release( context->workspace );
    free( context->input_lengths );
    free( context->flat_labels );
    free( context->label_lengths );
}
// Translate a warp-ctc status code into a Python RuntimeError.
// Returns 0 on success; on failure sets the exception (prefixed with `msg`
// and the library's own status string) and returns 1.
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if ( retcode == CTC_STATUS_SUCCESS )
        return 0;
    // Get error message from underlying library
    PyErr_Format( PyExc_RuntimeError,
                  "GpuConnectionistTemporalClassification: %s CTC error: %s",
                  msg,
                  ctcGetStatusString( retcode ) );
    return 1;
}
// Copy the 1d input-lengths array into a freshly malloc'd contiguous int
// buffer (*input_lengths), as required by warp-ctc. On allocation failure
// *input_lengths is left NULL for the caller to detect.
// Assumes the array's element type is npy_int (int32) — TODO confirm the
// op's make_node enforces this dtype.
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
                                      int ** input_lengths )
{
    // Use npy_intp for dimensions and indices: PyArray_DIMS holds npy_intp
    // values, which the previous npy_int declaration truncated on LP64.
    const npy_intp num_elements = PyArray_DIMS( input_lengths_arr )[0];
    *input_lengths = (int *) malloc( num_elements * sizeof(int) );
    if ( NULL == (*input_lengths) )
        return;
    for ( npy_intp elem_idx = 0; elem_idx < num_elements; ++elem_idx )
    {
        (*input_lengths)[elem_idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, elem_idx ) );
    }
}
// Flatten a (rows x cols) padded label matrix into the layout warp-ctc
// expects: *flat_labels receives all non-negative labels concatenated
// row-by-row, and *label_lengths receives the per-row count of kept labels.
// Negative entries are treated as padding and skipped. On allocation
// failure both outputs are left NULL for the caller to detect.
// Assumes the matrix's element type is npy_int (int32) — TODO confirm.
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
                         int ** label_lengths )
{
    // Use npy_intp for dimensions and indices: PyArray_DIMS holds npy_intp
    // values, which the previous npy_int declaration truncated on LP64.
    const npy_intp rows = PyArray_DIMS( label_matrix )[0];
    const npy_intp cols = PyArray_DIMS( label_matrix )[1];
    *flat_labels = (int *) calloc( rows * cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;
    *label_lengths = (int *) calloc( rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }
    npy_intp label_index = 0;
    for ( npy_intp row_idx = 0; row_idx < rows; ++row_idx )
    {
        int label_length = 0;
        for ( npy_intp col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 )  // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
// Compute the CTC loss (and optionally its gradient) on the GPU via warp-ctc.
//
// in_activations: (time, batch, alphabet) float32 activations on the GPU
//                 (only GA_FLOAT is accepted, see the switch below).
// in_labels:      padded label matrix (negative entries are padding).
// in_input_lengths: per-sample number of valid time steps.
// out_costs:      allocated here, one cost per batch element.
// out_gradients:  allocated here with the activations' shape, or NULL when
//                 gradient computation is disabled.
// Returns 0 on success, 1 with a Python exception set on failure. Every
// exit path tears down the local ctc_context and leaves the CUDA context
// (cuda_enter/cuda_exit are strictly paired).
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject   *  in_activations,
                                 PyArrayObject      *  in_labels,
                                 PyArrayObject      *  in_input_lengths,
                                 PyGpuArrayObject   ** out_costs,
                                 PyGpuArrayObject   ** out_gradients,
                                 PyGpuContextObject *  gpu_context)
{
    // Stack-allocated context; destroyed on every exit path below.
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;
    size_t gpu_workspace_size;
    int ctc_error = 0;
    const size_t num_activations = PyGpuArray_DIMS( in_activations )[0];
    const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1];
    const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2];
    const size_t cost_size = minibatch_size;
    const size_t grad_dims[3] = { num_activations, minibatch_size, alphabet_size };
    float * costs = NULL,
          * activations = NULL,
          * gradients = NULL;
    cuda_enter( gpu_context->ctx );
    ctc_context_init( context, gpu_context );
    // warp-ctc only supports float32 activations here.
    switch (in_activations->ga.typecode)
    {
    case GA_FLOAT:
        activations = (float *) PyGpuArray_DEV_DATA( in_activations );
        break;
    default:
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_SetString( PyExc_TypeError,
            "GpuConnectionistTemporalClassification: Unsupported type for activations." );
        return 1;
    }
    // Host-side copy of the input lengths in the int layout warp-ctc wants.
    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );
    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for input lengths." );
        return 1;
    }
    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );
    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for labels and their lengths." );
        return 1;
    }
    // Allocate (or reuse) the costs output and zero it.
    if ( aesara_prep_output( out_costs, 1, &cost_size, in_activations->ga.typecode,
                             GA_C_ORDER, gpu_context ) != 0 )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        return 1;
    }
    GpuArray_memset( &((*out_costs)->ga), 0 );
    costs = (float *) PyGpuArray_DEV_DATA( *out_costs );
    if ( NULL != out_gradients )  // if gradient computation is not disabled
    {
        if ( aesara_prep_output( out_gradients, 3, grad_dims, in_activations->ga.typecode,
                                 GA_C_ORDER, gpu_context ) != 0 )
        {
            ctc_context_destroy( context );
            cuda_exit( gpu_context->ctx );
            return 1;
        }
        GpuArray_memset( &((*out_gradients)->ga), 0 );
        gradients = (float *) PyGpuArray_DEV_DATA( *out_gradients );
    }
    // Query the scratch size warp-ctc needs for these shapes, then allocate
    // it on the device.
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &gpu_workspace_size ),
        "Failed to obtain CTC workspace size." );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        return 1;
    }
    context->workspace = gpudata_alloc( gpu_context->ctx, gpu_workspace_size, NULL, 0, NULL );
    if ( NULL == context->workspace )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }
    // Synchronize with pygpu's stream bookkeeping around the library call
    // (wait before, record after, on every buffer warp-ctc touches).
    cuda_wait( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_wait( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, *(void **)context->workspace,
        context->options ), "Failed to compute CTC loss function." );
    cuda_record( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_record( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        return 1;
    }
    ctc_context_destroy( context );
    cuda_exit( gpu_context->ctx );
    return 0;
}
// Compatibility shims so the cuDNN-dependent code compiles against a range
// of cuDNN versions (and degrades gracefully when version macros are absent).
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
#ifndef CUDNN_VERSION
// Very old headers define no version macro: report -1 from both the macro
// and the runtime query so version checks elsewhere take the "old" path.
#define CUDNN_VERSION -1
static inline int cudnnGetVersion() {
  return -1;
}
#endif
#if CUDNN_MAJOR < 7
// cudnnMathType_t (tensor-core selection) only exists from cuDNN 7 on;
// provide the two values so code can mention them unconditionally.
enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
#endif
/* a common struct for all 3 CUDNN enums */
// Cached algorithm selection: the chosen algo, its workspace requirement,
// and the math mode it was benchmarked with.
struct AlgoRec {
  int algo;
  size_t wsSize;
  cudnnMathType_t mathType;
};
#endif
#section support_code_apply
// GpuDimShuffle: permute the input's dimensions according to
// params->transposition, then reshape so that -1 entries in
// params->_new_order become new broadcastable (length-1) axes.
// Writes the result into *out (previous contents dropped); when
// params->inplace is false the result is a fresh copy instead of a view.
// Returns 0 on success, 1 with a Python exception set on failure.
int APPLY_SPECIFIC(gpu_dimshuffle)(PyGpuArrayObject* input, PyGpuArrayObject** out, PARAMS_TYPE* params) {
    PyGpuArrayObject *tmp = NULL;
    npy_intp nd_in = PyArray_SIZE(params->input_broadcastable);
    npy_intp nd_out = PyArray_SIZE(params->_new_order);
    npy_int64* new_order = NULL;
    unsigned int* transposition = NULL;
    size_t* sh = NULL;
    int e;
    if (input->ga.nd != nd_in) {
        PyErr_SetString(PyExc_TypeError, "input nd");
        return 1;
    }
    if (!PyArray_IS_C_CONTIGUOUS(params->_new_order)) {
        // Message made consistent with the "GpuDimShuffle:" prefix used for
        // the transposition check below (was "DimShuffle:").
        PyErr_SetString(PyExc_RuntimeError, "GpuDimShuffle: param _new_order must be C-contiguous.");
        return 1;
    }
    if (!PyArray_IS_C_CONTIGUOUS(params->transposition)) {
        PyErr_SetString(PyExc_RuntimeError, "GpuDimShuffle: param transposition must be C-contiguous.");
        return 1;
    }
    Py_XDECREF(*out);
    /** Do shuffle. **/
    new_order = (npy_int64*) PyArray_DATA(params->_new_order);
    /* Type of params->transposition (npy_uint32) should be an alias of unsigned int
     * on platforms supported by Aesara. */
    transposition = (unsigned int*) PyArray_DATA(params->transposition);
    sh = (size_t*) malloc(nd_out * sizeof(size_t));
    if (sh == NULL) {
        PyErr_NoMemory();
        return 1;
    }
    // First permute the existing axes ...
    tmp = pygpu_transpose(input, transposition);
    if (!tmp) {
        free(sh);
        return 1;
    }
    // ... then build the output shape: -1 in new_order inserts a length-1
    // axis, any other entry consumes the next transposed dimension.
    e = 0;
    for (npy_intp i = 0; i < nd_out; ++i) {
        if (new_order[i] == -1) {
            sh[i] = 1;
        } else {
            sh[i] = tmp->ga.dimensions[e];
            ++e;
        }
    }
    *out = pygpu_reshape(tmp, nd_out, sh, GA_ANY_ORDER, 1, -1);
    Py_DECREF(tmp);
    free(sh);
    if (*out == NULL) {
        return 1;
    }
    /** End shuffle. **/
    if (!params->inplace) {
        // Replace the view with an independent copy.
        tmp = pygpu_copy(*out, GA_ANY_ORDER);
        Py_DECREF(*out);
        if (!tmp) {
            *out = NULL;
            return 1;
        }
        *out = tmp;
    }
    return 0;
}
#section support_code
/* Fill an existing cuDNN tensor descriptor `desc` from `var` for use in a
   (possibly grouped) convolution.  Strides are converted from bytes to
   elements, with a C-contiguous fallback stride for size-1 axes; tensors
   with fewer than 3 dims are padded to 3 as cuDNN requires; the channel
   dimension (dim 1) is divided by `groups`.
   Returns 0 on success, -1 on error with a Python exception set. */
static int
c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);
  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);
  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }
  /* Walk dims from last to first so default_stride accumulates the
     C-contiguous stride for axes whose real stride is unusable (size 1). */
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }
  /* Tensors can't be smaller than 3d for cudnn so we pad the
   * descriptor if they are */
  for (unsigned int i = nd; i < 3; i++) {
    strs[i] = 1;
    dims[i] = 1;
  }
  //only for grouped convolution i.e when groups > 1
  dims[1] = dims[1] / groups;
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
/* Convenience wrapper: set a tensor descriptor for the non-grouped case
   (groups == 1).  Same return convention as c_set_tensor_for_conv. */
static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  return c_set_tensor_for_conv(var, desc, 1);
}
/* Create a cuDNN tensor descriptor and initialize it from `var`.
   On success *desc holds a live descriptor the caller must destroy.
   Returns 0 on success, -1 on error with a Python exception set. */
static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  /* Don't leak the descriptor if filling it fails. */
  if (c_set_tensorNd(var, *desc) != 0) {
    cudnnDestroyTensorDescriptor(*desc);
    return -1;
  }
  return 0;
}
/* Fill an existing cuDNN filter descriptor `desc` from `var`.  Filters
   must be C-contiguous (cuDNN requirement); fewer than 3 dims are padded
   to 3; the output-channel dimension (dim 0) is divided by `groups`.
   Returns 0 on success, -1 on error with a Python exception set. */
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  cudnnStatus_t err;
  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }
  int dims[8];
  unsigned int nd = PyGpuArray_NDIM(var);
  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }
  /* Filters can't be less than 3d so we pad */
  for (unsigned int i = nd; i < 3; i++)
    dims[i] = 1;
  /* Grouped convolution: each group sees 1/groups of the output channels. */
  dims[0] = dims[0] / groups;
  if (nd < 3)
    nd = 3;
  err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s.",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
/* Create a cuDNN filter descriptor and initialize it from `var`
   (non-grouped case: group count 1).  On success *desc holds a live
   descriptor the caller must destroy.
   Returns 0 on success, -1 on error with a Python exception set. */
static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateFilterDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Fixed: the message previously said "tensor descriptor" although a
       filter descriptor failed here, which made the error misleading. */
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create filter descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  /* Don't leak the descriptor if filling it fails. */
  if (c_set_filter(var, *desc, 1) != 0) {
    cudnnDestroyFilterDescriptor(*desc);
    return -1;
  }
  return 0;
}
#section init_code
/* Runs once at module initialization; helper defined by the host framework
   (presumably registers the CUDA extension hooks — confirm against the
   COp harness). */
setup_ext_cuda();
#section support_code_struct
/* cuDNN batch normalization, training mode (forward pass).
   Computes *outp = batchnorm(inp; scale, bias) and also returns the batch
   mean (*x_mean) and inverse std (*x_invstd) needed by the backward pass.
   When running averages are supplied they are updated with
   `running_average_factor` (optionally in place, per params).
   Returns 0 on success, 1 on error with a Python exception set. */
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, npy_float64 epsilon,
                     npy_float64 running_average_factor,
                     PyGpuArrayObject *in_running_mean, // may be NULL
                     PyGpuArrayObject *in_running_var, // may be NULL
                     PyGpuArrayObject **outp,
                     PyGpuArrayObject **x_mean,
                     PyGpuArrayObject **x_invstd,
                     PyGpuArrayObject **out_running_mean, // may be NULL
                     PyGpuArrayObject **out_running_var, // may be NULL
                     PARAMS_TYPE* params) {
  /* Note: based on Python code, in_running_mean, in_running_var, out_running_mean and out_running_var
     are together NULL (or not NULL) at same time, so we just need to check only one of them. */
  bool running_averages = (in_running_mean != NULL);
  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;
  /* cuDNN rejects epsilon below CUDNN_BN_MIN_EPSILON (1e-5). */
  if (epsilon < 1e-5) {
    PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
    return 1;
  }
  if (params->inplace_output) {
    Py_XDECREF(*outp);
    *outp = inp;
    Py_INCREF(*outp);
  } else if (aesara_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0) {
    return 1;
  }
  /* Saved mean / inverse std have the same shape as the scale parameter. */
  if (aesara_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (aesara_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;
  PyGpuArrayObject *running_mean = NULL;
  PyGpuArrayObject *running_var = NULL;
  if (running_averages) {
    /* In place: reuse the input buffers; otherwise copy the inputs into
       (possibly recycled) output buffers before cuDNN updates them. */
    if (params->inplace_running_mean) {
      Py_XDECREF(*out_running_mean);
      running_mean = in_running_mean;
      Py_INCREF(running_mean);
    } else {
      running_mean = *out_running_mean;
      running_mean = aesara_try_copy(running_mean, in_running_mean);
      if (running_mean == NULL) {
        return 1;
      }
    }
    if (params->inplace_running_var) {
      Py_XDECREF(*out_running_var);
      running_var = in_running_var;
      Py_INCREF(running_var);
    } else {
      running_var = *out_running_var;
      running_var = aesara_try_copy(running_var, in_running_var);
      if (running_var == NULL) {
        return 1;
      }
    }
  }
  {
    /* cuDNN takes alpha/beta scaling factors typed to match the data:
       double for float64 tensors, float otherwise (incl. half). */
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
      params->handle,
      params->mode,
      alpha,
      beta,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_output,
      PyGpuArray_DEV_DATA(*outp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
      running_averages ? running_average_factor : 0,
      running_averages ? PyGpuArray_DEV_DATA(running_mean) : NULL,
      running_averages ? PyGpuArray_DEV_DATA(running_var): NULL,
      epsilon,
      PyGpuArray_DEV_DATA(*x_mean),
      PyGpuArray_DEV_DATA(*x_invstd)
    );
    if (err != CUDNN_STATUS_SUCCESS) {
      /* NOTE(review): running_mean/running_var references are not released
         on this path — confirm whether that leak is acceptable upstream. */
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
    if (running_averages) {
      *out_running_mean = running_mean;
      *out_running_var = running_var;
    }
  }
  return 0;
}
#section init_code_struct
/* Per-op-instance setup: create the three tensor descriptors used by the
   batchnorm calls.  FAIL is the COp error macro. */
{
  cudnnStatus_t err;
  bn_input = NULL;
  bn_params = NULL;
  bn_output = NULL;
  if ((err = cudnnCreateTensorDescriptor(&bn_input)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_input): %s", cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&bn_params)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_params): %s", cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&bn_output)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_output): %s", cudnnGetErrorString(err));
    FAIL;
  }
}
#section cleanup_code_struct
/* Mirror of init: destroy whatever descriptors were created. */
if (bn_input != NULL)
  cudnnDestroyTensorDescriptor(bn_input);
if (bn_params != NULL)
  cudnnDestroyTensorDescriptor(bn_params);
if (bn_output != NULL)
  cudnnDestroyTensorDescriptor(bn_output);
#section support_code_struct
/* Struct members backing the descriptors above. */
cudnnTensorDescriptor_t bn_input;
cudnnTensorDescriptor_t bn_params;
cudnnTensorDescriptor_t bn_output;
#section init_code_struct
/* Extra descriptor for the gradient-of-output tensor (backward pass). */
{
  cudnnStatus_t err;
  bn_doutput = NULL;
  if ((err = cudnnCreateTensorDescriptor(&bn_doutput)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_doutput): %s", cudnnGetErrorString(err));
    FAIL;
  }
}
#section cleanup_code_struct
if (bn_doutput != NULL)
  cudnnDestroyTensorDescriptor(bn_doutput);
#section support_code_struct
cudnnTensorDescriptor_t bn_doutput;
/* cuDNN batch normalization, backward pass.  Given the forward inputs,
   the gradient w.r.t. the output (`doutp`) and the saved batch statistics
   (`x_mean`, `x_invstd`), computes gradients w.r.t. the input (*dinp),
   scale (*dscale) and bias (*dbias).
   Returns 0 on success, 1 on error with a Python exception set. */
int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
                       PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
                       PyGpuArrayObject *x_invstd, npy_float64 epsilon,
                       PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
                       PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(doutp, bn_doutput) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;
  /* cuDNN rejects epsilon below CUDNN_BN_MIN_EPSILON (1e-5). */
  if (epsilon < 1e-5) {
    PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
    return 1;
  }
  if (aesara_prep_output(dinp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  /* Scale/bias gradients share the scale parameter's shape. */
  if (aesara_prep_output(dscale, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (aesara_prep_output(dbias, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  /* bn_output descriptor is reused here for the input-gradient tensor. */
  if (c_set_tensorNd(*dinp, bn_output) != 0)
    return 1;
  {
    /* Separate alpha/beta pairs for the data gradient and the parameter
       gradients, typed to match the tensor dtype. */
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alphaData;
    void *betaData;
    void *alphaParam;
    void *betaParam;
    if (inp->ga.typecode == GA_DOUBLE) {
      alphaData = (void *)&dalpha;
      betaData = (void *)&dbeta;
      alphaParam = (void *)&dalpha;
      betaParam = (void *)&dbeta;
    } else {
      alphaData = (void *)&falpha;
      betaData = (void *)&fbeta;
      alphaParam = (void *)&falpha;
      betaParam = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationBackward(
      params->handle,
      params->mode,
      alphaData,
      betaData,
      alphaParam,
      betaParam,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_doutput,
      PyGpuArray_DEV_DATA(doutp),
      bn_output,
      PyGpuArray_DEV_DATA(*dinp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(*dscale),
      PyGpuArray_DEV_DATA(*dbias),
      epsilon,
      PyGpuArray_DEV_DATA(x_mean),
      PyGpuArray_DEV_DATA(x_invstd)
    );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
#section support_code_struct
/* cuDNN batch normalization, inference mode.  Normalizes `inp` with the
   precomputed estimated statistics (`est_mean`, `est_var`) instead of
   batch statistics; optionally operates in place on the input.
   Returns 0 on success, 1 on error with a Python exception set. */
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
                     PyGpuArrayObject *est_var, npy_float64 epsilon,
                     PyGpuArrayObject **outp, PARAMS_TYPE* params) {
  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;
  /* cuDNN rejects epsilon below CUDNN_BN_MIN_EPSILON (1e-5). */
  if (epsilon < 1e-5) {
    PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
    return 1;
  }
  if (params->inplace) {
    Py_XDECREF(*outp);
    *outp = inp;
    Py_INCREF(*outp);
  } else {
    if (aesara_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
  }
  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;
  {
    /* alpha/beta scaling factors typed to match the tensor dtype. */
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
      params->handle,
      params->mode,
      alpha,
      beta,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_output,
      PyGpuArray_DEV_DATA(*outp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
      PyGpuArray_DEV_DATA(est_mean),
      PyGpuArray_DEV_DATA(est_var),
      epsilon
    );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
#section support_code_struct
/* Per-apply convolution descriptors (input/output tensors and kernel
   filter), created in init_code_struct below. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
/* Reconcile the requested group count with the convolution descriptor.
   With cuDNN >= 7 grouping is handled by the descriptor itself: verify the
   descriptor already carries `groups` and return 1, so callers perform a
   single (natively grouped) convolution.  With older cuDNN, return `groups`
   unchanged so callers loop over the groups manually.
   Returns -1 on error with a Python exception set. */
static int c_get_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  int desc_groups;
  if (groups > 1) {
    cudnnStatus_t err = cudnnGetConvolutionGroupCount(desc, &desc_groups);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting groups for convolution : %s",
                   cudnnGetErrorString(err));
      return -1;
    }
    if (groups != desc_groups) {
      PyErr_SetString(PyExc_MemoryError,
                      "groups specified different from convolution descriptor");
      return -1;
    }
  }
  return 1;
#else
  return groups;
#endif
}
/* Set the math type (e.g. tensor-core ops) on a convolution descriptor.
   No-op on cuDNN < 7, which has no math-type API.
   Returns 0 on success, -1 on error with a Python exception set. */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc, cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  // CUDNN7: need to set math type
  cudnnStatus_t err = cudnnSetConvolutionMathType(desc, mathtype);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting math type for convolution : %s",
                 cudnnGetErrorString(err));
    return -1;
  }
#endif
  return 0;
}
#section init_code_struct
/* Create the per-apply convolution descriptors declared above. */
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
               cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
#section cleanup_code_struct
/* Mirror of init: destroy whatever descriptors were created. */
if (APPLY_SPECIFIC(input) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
  cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code
#include <sstream>
#include <string>
/* Pre-C++11 toolchains (except Apple's) only ship unordered_map in TR1. */
#if __cplusplus < 201103L && !defined(__APPLE__)
#include <tr1/unordered_map>
typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;
#else
#include <unordered_map>
typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif
#include "pthread.h"
#line 87 "dnn_conv_base.c"
#ifdef DEBUG
#if __cplusplus < 201103L
const char* const _cppver = "No timing available: C++11 or later is required.";
#else
#define DEBUG_TIMING
#include <chrono>
const char* const _cppver = NULL;
/* Simple wall-clock stopwatch used for DEBUG timing of algorithm selection. */
struct AesaraTimer {
  double milliseconds;  // elapsed time of the last start()/end() pair
  std::chrono::steady_clock::time_point base;
  void start() {base = std::chrono::steady_clock::now();}
  void end() {
    milliseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::steady_clock::now() - base
      ).count() / 1000000.0;
  }
};
#endif
#endif
/* Process-wide cache of previously selected convolution algorithms, keyed
   by a textual shape/config hash; all access goes through algoMutex. */
pthread_mutex_t algoMutex;
AlgoCache algoCache;
/* Report a cuDNN failure as a Python RuntimeError prefixed with `msg`.
   The status is returned unchanged so calls can be chained. */
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
{
  if (err == CUDNN_STATUS_SUCCESS)
    return err;
  PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
               msg, cudnnGetErrorString(err));
  return err;
}
/* Query the size in bytes of the largest free GPU memory block, used to
   bound workspace allocations.  On query failure a Python exception is set
   but a 4 MiB fallback is still returned — callers are expected to check
   PyErr_Occurred() afterwards. */
static size_t
c_get_largest_free_block_size(PyGpuContextObject *c)
{
  size_t maxfree = 0;
  int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree);
  if (err2 != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                 "memory information on the GPU");
  }
  // Guess 4Mb if the info is not available
  if (maxfree == 0) maxfree = 4 * 1024 * 1024;
  return maxfree;
}
/** Check if convolution output tensor has expected dimensions
    depending on given inputs and number of groups.
    return 0 if everything is ok, non-0 on error.
**/
/* NOTE(review): the %ld format specifiers below are applied to size_t
   values from PyGpuArray_DIMS; %zu would be the portable choice on LLP64
   platforms — confirm against supported targets before changing. */
static int dnn_check_convolution_output(cudnnConvolutionDescriptor_t convDesc,
                                        cudnnTensorDescriptor_t inputDesc,
                                        cudnnFilterDescriptor_t filterDesc,
                                        size_t tensorNdim,
                                        PyGpuArrayObject* output,
                                        int groups) {
  int expected_output_dims[5] = {0};
  cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
                                                            tensorNdim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  /* cuDNN reports per-group channel counts, so the channel dim of the real
     output is compared after dividing by `groups`. */
  if (tensorNdim == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1] * groups,
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      return 1;
    }
  } else if (tensorNdim == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1] * groups,
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      return 1;
    }
  }
  return 0;
}
/* Render `size` ints from `res` as a comma-separated string ("1,2,3");
   an empty string when size <= 0. */
static std::string shape(int* res, int size)
{
  std::ostringstream out;
  for (int i = 0; i < size; ++i) {
    if (i)
      out << ',';
    out << res[i];
  }
  return out.str();
}
/* Serialize a tensor descriptor as "dims,strides" (both comma-joined) for
   use in the algorithm-cache key.  Returns "" (with a Python exception
   set) on failure. */
static std::string shape(cudnnTensorDescriptor_t t)
{
  // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
  int res[CUDNN_DIM_MAX];
  int stride[CUDNN_DIM_MAX];
  int nbDims;
  cudnnDataType_t type;
  checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &type, &nbDims, res, stride),
                   "error getting tensor description");
  if (PyErr_Occurred()) return "";
  return shape(res, nbDims) + "," + shape(stride, nbDims);
};
/* Serialize a filter descriptor's dimensions for the cache key and report
   its data type through `type`.  Returns "" (with a Python exception set)
   on failure. */
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
  cudnnTensorFormat_t format;
  int res[CUDNN_DIM_MAX];
  int outDims;
  checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type, &format, &outDims, res),
                   "error getting filter description");
  if (PyErr_Occurred()) return "";
  return shape(res, outDims);
};
/* Serialize a convolution descriptor (mode, padding, strides, dilation and
   a " -<dtype><precision>" suffix) for the cache key.  `dataTypecode` is
   the gpuarray typecode of the data tensors.  Returns "" (with a Python
   exception set) on failure. */
static std::string shape(cudnnConvolutionDescriptor_t convDesc, int dataTypecode)
{
  int nDim;
  cudnnConvolutionMode_t mode;
  cudnnDataType_t computeType;
  int padA[5];
  int strideA[5];
  int dilationA[5];
  /* Data type configuration. Format: " -<dtype><precision>" with dtype and precision in {h, f, d},
   * h for half (float16), f for float (float32), d for double (float64). */
  char data_type_configuration[5];
  checkCudnnStatus(
    cudnnGetConvolutionNdDescriptor( convDesc, 5,
                                     &nDim,
                                     &padA[0],
                                     &strideA[0],
                                     &dilationA[0],
                                     &mode,
                                     &computeType ),
    "error getting convolution description");
  if (PyErr_Occurred()) return "";
  /* Build data type configuration string. */
  data_type_configuration[0] = ' ';
  data_type_configuration[1] = '-';
  switch (dataTypecode) {
  case GA_HALF: data_type_configuration[2] = 'h'; break;
  case GA_FLOAT: data_type_configuration[2] = 'f'; break;
  case GA_DOUBLE: data_type_configuration[2] = 'd'; break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type in convolution.");
    return "";
  }
  switch (computeType) {
  case CUDNN_DATA_HALF: data_type_configuration[3] = 'h'; break;
  case CUDNN_DATA_FLOAT: data_type_configuration[3] = 'f'; break;
  case CUDNN_DATA_DOUBLE: data_type_configuration[3] = 'd'; break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported precision in convolution.");
    return "";
  }
  data_type_configuration[4] = '\0';
  return (std::string("-mode ") +
          ((mode == CUDNN_CONVOLUTION) ? "conv" : "cross") +
          " -pad " +
          shape(padA, nDim) +
          " -subsample " +
          shape(strideA, nDim) +
          " -dilation " +
          shape(dilationA, nDim) +
          data_type_configuration);
}
/* True when the input, output and filter device pointers are all 16-byte
   aligned; alignment affects which cuDNN algorithms are usable, so cache
   entries must distinguish the two cases.  `type` is currently unused but
   kept for interface stability. */
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
  const size_t alignMask = 0xF;
  // there have to be entries for both aligned and not
  size_t combined = (size_t)in;
  combined |= (size_t)out;
  combined |= (size_t)filter;
  return (combined & alignMask) == 0;
}
/* Build the full textual cache key for a convolution configuration:
   groups, input dims/strides, filter dims, conv descriptor settings and an
   "[unaligned]" marker when device pointers are not 16-byte aligned.
   Validates the output shape first.  Returns "" (with a Python exception
   set) on failure. */
static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayObject* input,
                                  cudnnFilterDescriptor_t filterDesc, PyGpuArrayObject* filter,
                                  cudnnConvolutionDescriptor_t convDesc,
                                  PyGpuArrayObject* output, int groups)
{
  cudnnDataType_t dType;
  std::ostringstream s;
  int expected_output_dims[5] = {0};  /* unused here; dims are checked by the helper below */
  if (dnn_check_convolution_output(convDesc, inputDesc, filterDesc, PyGpuArray_NDIM(filter), output, groups) != 0)
    return "";
  std::string shapeInput = shape(inputDesc);
  std::string shapeFilter = shape(filterDesc, &dType);
  std::string shapeConvDesc = shape(convDesc, input->ga.typecode);
  if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
    return "";
  s << "-g " << groups << " -dim " << shapeInput << " -filt " <<
    shapeFilter << " " << shapeConvDesc;
  // there have to be entries for both aligned and not.
  if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
  {
    s << " [unaligned]";
  }
  return s.str();
}
/* Insert or overwrite the cached algorithm choice for `hash`,
   serialized by algoMutex. */
static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
  pthread_mutex_lock(&algoMutex);
  algoCache[hash] = rec;
  pthread_mutex_unlock(&algoMutex);
}
/* Look up a cached algorithm choice; returns NULL when absent.  The lookup
   itself is serialized by algoMutex, but the returned pointer references
   the cache entry directly — entries are never erased, only overwritten. */
static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
  pthread_mutex_lock(&algoMutex);
  const AlgoRec* ret = 0;
  AlgoCache::iterator hit = algoCache.find(hash);
  if (hit != algoCache.end())
    ret = &hit->second;
  pthread_mutex_unlock(&algoMutex);
  return ret;
}
#section support_code
/* Create a cuDNN dropout descriptor with probability `dropout` seeded by
   `seed`, plus the GPU state buffer it requires.  On success *odesc owns
   the descriptor and *ostates a new GA_UBYTE array holding the RNG state.
   Returns 0 on success, -1 on error with a Python exception set. */
int dnn_dropout_desc(float dropout, unsigned long long seed,
                     PyGpuContextObject *c,
                     cudnnDropoutDescriptor_t *odesc,
                     PyGpuArrayObject **ostates,
                     cudnnHandle_t _handle) {
  PyGpuArrayObject *states;
  cudnnDropoutDescriptor_t desc;
  size_t states_sz;
  cudnnStatus_t err;
  cuda_enter(c->ctx);
  err = cudnnCreateDropoutDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
    cuda_exit(c->ctx);
    return -1;
  }
  /* Can't fail according to docs */
  cudnnDropoutGetStatesSize(_handle, &states_sz);
  states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
  if (states == NULL) {
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  /* Seeds the RNG state buffer on the GPU. */
  err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
                                  PyGpuArray_DEV_DATA(states),
                                  states_sz, seed);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
    Py_DECREF((PyObject *)states);
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  /* Transfer ownership to the caller only on full success. */
  *odesc = desc;
  *ostates = states;
  return 0;
}
#section support_code
/* Apply cuDNN dropout forward to `x`: allocates *y for the output, passes
   the RNG `state` array through to *ostate (new reference) and returns the
   reserve-space buffer required by the backward pass in *reserve (owned by
   the caller).  Returns 0 on success, -1 on error with a Python exception
   set.
   Fixes vs. previous revision:
     - local context declared as PyGpuContextObject (the type used
       everywhere else in this file); PyGpuArrayContext does not exist;
     - `y` is an output pointer and must be dereferenced before being
       passed to c_make_tensorNd / PyGpuArray_DEV_DATA;
     - cudnnDropoutGetReserveSpaceSize takes the *input tensor* descriptor
       (xdesc), not the dropout descriptor;
     - typo `res_zs` -> `res_sz` (did not compile);
     - missing `return -1` after a failed gpudata_alloc let execution
       continue with res == NULL;
     - the dropout descriptor parameter is a pointer here, so it is
       dereferenced when handed to cudnnDropoutForward. */
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t *desc,
                    PyGpuArrayObject *state,
                    PyGpuArrayObject **y,
                    PyGpuArrayObject **ostate,
                    gpudata **reserve,
                    cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc;
  cudnnTensorDescriptor_t ydesc;
  gpudata *res;
  size_t res_sz;
  cudnnStatus_t err;
  if (c_make_tensorNd(x, &xdesc))
    return -1;
  if (aesara_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
                         GA_C_ORDER, c)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  if (c_make_tensorNd(*y, &ydesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  *ostate = state;
  Py_INCREF((PyObject *)state);
  /* This can't fail according to the docs */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);
  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
  *reserve = res;
  cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyTensorDescriptor(ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run dropout: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct
/* Seed the per-instance algorithm state: start from the configured algo
   with default math, nothing reused yet. */
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
};
#endif
#section support_code_struct
#line 22 "dnn_fwd.c"
/* Per-instance state for forward-convolution algorithm selection. */
int reuse_algo;           /* nonzero once an algo has been picked and can be reused */
AlgoRec prev_algo;        /* last selected algorithm + workspace + math type */
std::string hash_prefix;  /* cache-key prefix identifying the direction/GPU */
/* These members contain C++ objects, so the struct must not be memset. */
#define AESARA_DONT_MEMSET_STRUCT
#ifdef DEBUG
char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error. **/
int dnn_conv_fwd_fallback(cudnnConvolutionFwdAlgo_t* _algo,
                          const PyGpuArrayObject* input,
                          const PyGpuArrayObject* kerns,
                          cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionFwdAlgo_t algo = *_algo;
  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
  {
#ifdef DEBUG
    if (0 != aesara_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
      return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }
  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  // NB: In cuDNN V7, issue is resolved for 2D convolutionss only.
  if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536)
  {
#ifdef DEBUG
    fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
            "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  /* NB:
     TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
     New conditions apply and may depend on number of dimensions (2D or 3D)
     e.g. for FFT_TILING.
     TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft_tiling with none)\n");
#endif
      }
    }
  }
  /* Write the (possibly replaced) algorithm back to the caller. */
  *_algo = algo;
  return 0;
}
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
AesaraTimer timer;
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
"images and kernel must have the same stack size");
return 1;
}
if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) {
PyErr_SetString(PyExc_ValueError,
"Number of filters must be divisible by number of groups");
return 1;
}
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
if (params->inplace) {
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
} else {
if (aesara_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
int groups = c_get_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1;
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1;
if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), groups) == -1)
return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey;
cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) {
cuda_exit(c->ctx);
return 1;
}
if (params->choose_algo) {
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
// check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
if (reuse_algo || use_cached) {
algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) {
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate GPU memory for FindEx");
cuda_exit(c->ctx);
return -1;
}
// set the 'tensor math ok' flag
if (input->ga.typecode == GA_HALF)
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
/* cudnnFindConvolutionForwardAlgorithmEx() may write to output.
We don't want that if output is used in computation (ie. if beta != 0). */
PyGpuArrayObject* o = *output;
if (beta != 0) {
o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None);
}
#ifdef DEBUG_TIMING
timer.start();
#endif
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
1, &count, &choice, *(void **)tmpmem,
maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem);
if (beta != 0) {
Py_XDECREF(o);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed FWD algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
algo = choice.algo;
worksize = choice.memory;
#if CUDNN_MAJOR >= 7
if (input->ga.typecode == GA_HALF)
mathtype = choice.mathType;
#endif
} else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
}
}
if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != aesara_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
if (params->choose_algo) {
#ifdef DEBUG
if (0 != aesara_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor_op)" : "",
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize,
hashkey.c_str()
);
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime.
// `timer` still contains timing about selection step.
fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, timer.milliseconds);
if (n_selections > 1) {
fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
n_selections, total_selection_time, total_selection_time / n_selections);
}
}
#endif
if (!reuse_algo) {
// save for next time/cache
prev_algo.algo = algo;
prev_algo.wsSize = worksize;
prev_algo.mathType = mathtype;
// Add to the cache if we choose on shape change, or first time if
// we choose once.
if (!use_cached)
dnn_conv_update_cache(hashkey, prev_algo);
if (params->choose_once)
reuse_algo = 1;
}
} // params->choose_algo
{
gpudata *workspace = 0;
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
}
if (worksize != 0)
cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga);
timer.start();
#endif
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionForward(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
}
if (worksize != 0) {
cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
gpudata_release(workspace);
}
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
}
#ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv FWD operation: %s",
cudnnGetErrorString(err));
return 1;
}
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0;
}
#section init_code_struct
// Per-Op-instance initialization: seed the saved algorithm record with the
// statically requested algorithm and reset the selection/caching state.
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
// Prefix for the algorithm-cache hash key; the device PCI id and the shape
// signature are appended at run time.
hash_prefix = std::string("GI|GPU#");
#ifdef DEBUG_TIMING
// Zero the cumulative timing counters used by the DEBUG_TIMING reports.
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
};
#endif
#section support_code_struct
#line 22 "dnn_gi.c"
// Algorithm-selection state kept across calls on the same Op instance.
int reuse_algo;        // non-zero once an algorithm has been locked in (choose_once)
AlgoRec prev_algo;     // last selected algorithm, workspace size and math type
std::string hash_prefix;  // cache-key prefix, set in init_code_struct
// These members are initialized explicitly above, so skip the struct memset.
#define AESARA_DONT_MEMSET_STRUCT
#ifdef DEBUG
char algorithm_name[128];  // scratch buffer for human-readable algo names
#endif
#ifdef DEBUG_TIMING
// Cumulative statistics printed by the DEBUG_TIMING instrumentation.
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Validate the chosen backward-data algorithm against the current inputs
    and convolution descriptor, replacing it in place with the safe default
    (CUDNN_CONVOLUTION_BWD_DATA_ALGO_0) when the FFT-based variants cannot
    handle them.
    The FFT implementation does not support strides, 1x1 filters or inputs
    with a spatial dimension larger than 1024; the tiled-FFT implementation
    does not support strides. Both exist only for 2d filters, hence the
    4-dim check below.
    Return 0 on success, non-0 on error. **/
int dnn_conv_gi_fallback(cudnnConvolutionBwdDataAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdDataAlgo_t chosen = *_algo;
  const bool is_fft = (chosen == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT);
  const bool is_fft_tiling = (chosen == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING);
  if ((is_fft || is_fft_tiling) && PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    const bool strided = (stride[0] != 1 || stride[1] != 1);
    if (is_fft) {
      const bool big_input = (PyGpuArray_DIM(input, 2) > 1024 ||
                              PyGpuArray_DIM(input, 3) > 1024);
      const bool unit_filter = (PyGpuArray_DIM(kerns, 2) == 1 &&
                                PyGpuArray_DIM(kerns, 3) == 1);
      if (strided || big_input || unit_filter) {
        chosen = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft with none)\n");
#endif
      }
    } else if (strided) {
      // chosen == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      chosen = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
      fprintf(stderr, "(replacing gradinput algo fft_tiling with none)\n");
#endif
    }
  }
  *_algo = chosen;
  return 0;
}
/** Compute the gradient of a cuDNN convolution with respect to its inputs
    (cudnnConvolutionBackwardData): *input = alpha * grad + beta * im.

    kerns  : filters of the forward convolution.
    output : gradient w.r.t. the forward convolution's output.
    im     : array whose shape/dtype define the result; accumulation source
             when beta != 0, and the in-place target when params->inplace.
    desc   : pre-configured cuDNN convolution descriptor.
    input  : receives the result buffer.

    Returns 0 on success, non-zero after setting a Python exception. **/
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                        PyGpuArrayObject *im,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **input,
                        PARAMS_TYPE* params) {
  PyGpuContextObject *c = kerns->context;
  void *alpha_p;
  void *beta_p;
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
#ifdef DEBUG
  if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
  AesaraTimer timer;
#endif
  // Shape checks for grouped convolution.
  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
                    "stack size");
    return 1;
  }
  if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "Number of filters must be divisible by number of groups");
    return 1;
  }
  // cuDNN expects double scaling factors for double data, float otherwise.
  switch (im->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }
  // Prepare the result buffer: reuse `im` in-place, or allocate a fresh
  // array (copying `im` in when beta contributes to the result).
  if (params->inplace) {
    Py_XDECREF(*input);
    *input = im;
    Py_INCREF(*input);
  } else {
    if (aesara_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
                           im->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*input, im))
      return 1;
  }
  // Degenerate sizes (empty batch or empty filters): the gradient is all
  // zeros, no need to call cuDNN.
  if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
    int err2 = GpuArray_memset(&(*input)->ga, 0);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. inputs could not fill the output with zeros: %d", err2);
      return 1;
    }
    return 0;
  }
  // Configure the cuDNN tensor/filter descriptors for grouped convolution.
  int groups = c_get_groups_for_conv(desc, params->num_groups);
  if (groups == -1)
    return 1;
  if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
    return 1;
  if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), groups) == -1)
    return 1;
  if (0 != dnn_check_convolution_output(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                        PyGpuArray_NDIM(kerns), output, groups))
    return 1;
  // Byte offsets between consecutive groups within each array.
  size_t input_offset = PyGpuArray_STRIDE(*input, 0) / groups;
  size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
  size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
  cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
  size_t worksize = 0;
  cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
  std::string hashkey;
  cuda_enter(c->ctx);
  size_t maxfree = c_get_largest_free_block_size(c);
  if (PyErr_Occurred()) {
    cuda_exit(c->ctx);
    return 1;
  }
  // Algorithm selection: try the shape cache first, then either benchmark
  // (choose_time -> FindEx) or query cuDNN's heuristic.
  if (params->choose_algo) {
    if (!reuse_algo) {
      char pci_id[16];
      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
      if (hashkey.empty()) {
        cuda_exit(c->ctx);
        return 1;
      }
      hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
      const AlgoRec* cached = dnn_conv_check_cache(hashkey);
      if (cached) {
        prev_algo = *cached;
        use_cached = 1;
      }
    }
    if (reuse_algo || use_cached) {
      algo = (cudnnConvolutionBwdDataAlgo_t)prev_algo.algo;
      worksize = prev_algo.wsSize;
      mathtype = prev_algo.mathType;
    } else {
      if (params->choose_time) {
        int count;
        cudnnConvolutionBwdDataAlgoPerf_t choice;
        gpudata *tmpmem;
        // set the 'tensor math ok' flag
        if (im->ga.typecode == GA_HALF)
          c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
        tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
        if (tmpmem == NULL) {
          PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
          cuda_exit(c->ctx);
          // NOTE(review): this path returns -1 while every other failure
          // path in this function returns 1; callers appear to test for
          // non-zero — confirm before normalizing.
          return -1;
        }
        /* cudnnFindConvolutionBackwardDataAlgorithmEx() may write to output (input).
           We don't want that if output is used in computation (ie. if beta != 0). */
        PyGpuArrayObject* ip = *input;
        if (beta != 0) {
          ip = pygpu_empty(PyGpuArray_NDIM(*input), PyGpuArray_DIMS(*input), (*input)->ga.typecode, GA_C_ORDER, c, Py_None);
        }
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnFindConvolutionBackwardDataAlgorithmEx(
          params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(ip),
          1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        gpudata_release(tmpmem);
        if (beta != 0) {
          Py_XDECREF(ip);
        }
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
#ifdef DEBUG
        if (count == 0) {
          PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
          cuda_exit(c->ctx);
          return 1;
        } else if (choice.status != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error getting best-timed gradinput algo: %s",
                       cudnnGetErrorString(choice.status));
          cuda_exit(c->ctx);
          return 1;
        } // Else, count is necessarly 1 for current implementation.
#endif
        algo = choice.algo;
        worksize = choice.memory;
#if CUDNN_MAJOR >= 7
        if (im->ga.typecode == GA_HALF)
          mathtype = choice.mathType;
#endif
      } else {
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnGetConvolutionBackwardDataAlgorithm(
          params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
          desc, APPLY_SPECIFIC(input),
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
      }
#ifdef DEBUG_TIMING
      total_selection_time += timer.milliseconds;
      ++n_selections;
#endif
    }
  }
  // Apply the chosen math type and fall back to a safe algorithm if the
  // selected one cannot handle the current shapes/strides.
  if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
      dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
    cuda_exit(c->ctx);
    return 1;
  }
  // if FindEx was used (choose_time), workspace size is set.
  if (!(reuse_algo || use_cached || params->choose_time))
  {
    err = cudnnGetConvolutionBackwardDataWorkspaceSize(
      params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(input), algo, &worksize);
    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported
#ifdef DEBUG
      if (0 != aesara_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
        cuda_exit(c->ctx);
        return 1;
      }
      fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)\n",
              algorithm_name);
#endif
      algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      err = cudnnGetConvolutionBackwardDataWorkspaceSize(
        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
        APPLY_SPECIFIC(input), algo, &worksize);
    }
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
  } // !(reuse_algo || use_cached || params->choose_time)
  // Record the selection for reuse and/or the shape cache.
  if (params->choose_algo) {
#ifdef DEBUG
    if (0 != aesara_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
      cuda_exit(c->ctx);
      return 1;
    }
    fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
            algorithm_name,
            mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor_op)" : "",
            params->choose_time ? "(timed)": "" ,
            reuse_algo ? "(reused)" : "",
            use_cached ? "(cache)": "",
            worksize,
            hashkey.c_str()
    );
#endif
#ifdef DEBUG_TIMING
    if (!(reuse_algo || use_cached)) {
      // We have selected an algorithm at runtime.
      // `timer` still contains timing about selection step.
      fprintf(stderr, "\t(selected %s gradinput algo in %g milliseconds)\n", selection_name, timer.milliseconds);
      if (n_selections > 1) {
        fprintf(stderr, "\t(selected %lu gradinput algos in %g milliseconds (average: %g milliseconds per selection))\n",
                n_selections, total_selection_time, total_selection_time / n_selections);
      }
    }
#endif
    if (!reuse_algo) {
      // save for next time/cache
      prev_algo.algo = algo;
      prev_algo.wsSize = worksize;
      prev_algo.mathType = mathtype;
      // Add to the cache
      if (!use_cached)
        dnn_conv_update_cache(hashkey, prev_algo);
      if (params->choose_once)
        reuse_algo = 1;
    }
  } // params->choose_algo
  // Allocate the workspace, order the stream, and run one backward-data
  // call per group at the proper byte offsets.
  gpudata *workspace = 0;
  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }
  if (worksize != 0)
    cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
  cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*input)->ga);
  timer.start();
#endif
  for ( int g = 0; g < groups; g++) {
    err = cudnnConvolutionBackwardData(
      params->handle,
      alpha_p,
      APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
      APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(output)) + output_offset * g,
      desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(*input)) + input_offset * g);
  }
  if (worksize != 0) {
    cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
    gpudata_release(workspace);
  }
  cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*input)->ga);
  timer.end();
  total_computation_time += timer.milliseconds;
  ++n_computations;
#endif
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradinput operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
#ifdef DEBUG_TIMING
  fprintf(stderr, "\t(ran gradinput algo in %g milliseconds)\n", timer.milliseconds);
  if (n_computations > 1) {
    fprintf(stderr, "\t(ran %lu gradinput computations in %g milliseconds (average: %g milliseconds per call))\n",
            n_computations, total_computation_time, total_computation_time / n_computations);
  }
#endif
  return 0;
}
#section init_code_struct
// Per-Op-instance initialization: seed the saved algorithm record with the
// statically requested algorithm and reset the selection/caching state.
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
// Prefix for the algorithm-cache hash key; the device PCI id and the shape
// signature are appended at run time.
hash_prefix = std::string("GW|GPU#");
#ifdef DEBUG_TIMING
// Zero the cumulative timing counters used by the DEBUG_TIMING reports.
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
};
#endif
#section support_code_struct
#line 22 "dnn_gw.c"
// Algorithm-selection state kept across calls on the same Op instance.
int reuse_algo;        // non-zero once an algorithm has been locked in (choose_once)
AlgoRec prev_algo;     // last selected algorithm, workspace size and math type
std::string hash_prefix;  // cache-key prefix, set in init_code_struct
// These members are initialized explicitly above, so skip the struct memset.
#define AESARA_DONT_MEMSET_STRUCT
#ifdef DEBUG
char algorithm_name[128];  // scratch buffer for human-readable algo names
#endif
#ifdef DEBUG_TIMING
// Cumulative statistics printed by the DEBUG_TIMING instrumentation.
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Validate the chosen backward-filter algorithm against the current inputs
    and convolution descriptor, replacing it in place with the safe default
    (CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0) when the FFT variant cannot handle
    them.
    The FFT implementation does not support strides, 1x1 filters or inputs
    with a spatial dimension larger than 1024, and exists only for 2d
    filters, hence the 4-dim check below.
    Return 0 on success, non-0 on error. **/
int dnn_conv_gw_fallback(cudnnConvolutionBwdFilterAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  // Only the 2d FFT algorithm needs validation; anything else passes through.
  if (*_algo != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT ||
      PyGpuArray_NDIM(input) != 4)
    return 0;
  // Extract the properties of the convolution descriptor.
  int nd;
  int pad[2];
  int stride[2];
  int upscale[2];
  cudnnConvolutionMode_t mode;
  cudnnDataType_t data_type;
  cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  const bool unsupported =
      stride[0] != 1 || stride[1] != 1 ||
      PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
      (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1);
  if (unsupported) {
    *_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
#ifdef DEBUG
    fprintf(stderr, "(replacing gradweight algo fft with none)\n");
#endif
  }
  return 0;
}
/** Compute the gradient of a cuDNN convolution with respect to its filters
    (cudnnConvolutionBackwardFilter): *kerns = alpha * grad + beta * km.

    input  : images fed to the forward convolution.
    output : gradient w.r.t. the forward convolution's output.
    km     : array whose shape/dtype define the result; accumulation source
             when beta != 0, and the in-place target when params->inplace.
    desc   : pre-configured cuDNN convolution descriptor.
    kerns  : receives the result buffer.

    Returns 0 on success, non-zero after setting a Python exception. **/
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
                        PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  void *alpha_p;
  void *beta_p;
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
#ifdef DEBUG
  if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
  AesaraTimer timer;
#endif
  // Shape checks for grouped convolution.
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }
  if ((PyGpuArray_DIMS(output)[1] % params->num_groups) != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "Number of output channels must be divisible by number of groups");
    return 1;
  }
  // cuDNN expects double scaling factors for double data, float otherwise.
  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }
  // Prepare the result buffer: reuse `km` in-place, or allocate a fresh
  // array (copying `km` in when beta contributes to the result).
  if (params->inplace) {
    Py_XDECREF(*kerns);
    *kerns = km;
    Py_INCREF(*kerns);
  } else {
    if (aesara_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                           km->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*kerns, km))
      return 1;
  }
  // Degenerate sizes (empty batch or empty filters): the gradient is all
  // zeros, no need to call cuDNN.
  if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
    int err2 = GpuArray_memset(&(*kerns)->ga, 0);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. weights could not fill the output with zeros: %d", err2);
      return 1;
    }
    return 0;
  }
  // Configure the cuDNN tensor/filter descriptors for grouped convolution.
  int groups = c_get_groups_for_conv(desc, params->num_groups);
  if (groups == -1)
    return 1;
  if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
    return 1;
  if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
    return 1;
  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1)
    return 1;
  if (0 != dnn_check_convolution_output(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                        PyGpuArray_NDIM(*kerns), output, groups))
    return 1;
  // Byte offsets between consecutive groups within each array.
  size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
  size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups;
  size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
  cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
  size_t worksize = 0;
  cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
  std::string hashkey ;
  cuda_enter(c->ctx);
  size_t maxfree = c_get_largest_free_block_size(c);
  if (PyErr_Occurred()) {
    cuda_exit(c->ctx);
    return 1;
  }
  // Algorithm selection: try the shape cache first, then either benchmark
  // (choose_time -> FindEx) or query cuDNN's heuristic.
  if (params->choose_algo) {
    if (!reuse_algo) {
      char pci_id[16];
      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
      if (hashkey.empty()) {
        cuda_exit(c->ctx);
        return 1;
      }
      hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
      const AlgoRec* cached = dnn_conv_check_cache(hashkey);
      if (cached) {
        prev_algo = *cached;
        use_cached = 1;
      }
    }
    if (reuse_algo || use_cached) {
      algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
      worksize = prev_algo.wsSize;
      mathtype = prev_algo.mathType;
    } else {
      if (params->choose_time) {
        int count;
        cudnnConvolutionBwdFilterAlgoPerf_t choice;
        gpudata *tmpmem;
        // set the 'tensor math ok' flag
        if (input->ga.typecode == GA_HALF)
          c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
        tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
        if (tmpmem == NULL) {
          PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
          cuda_exit(c->ctx);
          // NOTE(review): this path returns -1 while every other failure
          // path in this function returns 1; callers appear to test for
          // non-zero — confirm before normalizing.
          return -1;
        }
        /* cudnnFindConvolutionBackwardFilterAlgorithmEx() may write to kernels output (kerns).
           We don't want that if output is used in computation (ie. if beta != 0). */
        PyGpuArrayObject* k = *kerns;
        if (beta != 0) {
          k = pygpu_empty(PyGpuArray_NDIM(*kerns), PyGpuArray_DIMS(*kerns), (*kerns)->ga.typecode, GA_C_ORDER, c, Py_None);
        }
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(k),
          1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        gpudata_release(tmpmem);
        if (beta != 0) {
          Py_XDECREF(k);
        }
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
#ifdef DEBUG
        if (count == 0) {
          PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
          cuda_exit(c->ctx);
          return 1;
        } else if (choice.status != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error getting best-timed gradweight algo: %s",
                       cudnnGetErrorString(choice.status));
          cuda_exit(c->ctx);
          return 1;
        } // Else, count is necessarly 1 for current implementation.
#endif
        algo = choice.algo;
        worksize = choice.memory;
#if CUDNN_MAJOR >= 7
        if (input->ga.typecode == GA_HALF)
          mathtype = choice.mathType;
#endif
      } else {
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnGetConvolutionBackwardFilterAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
          desc, APPLY_SPECIFIC(kerns),
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
      }
#ifdef DEBUG_TIMING
      total_selection_time += timer.milliseconds;
      ++n_selections;
#endif
    }
  } /* choose_algo */
  // Apply the chosen math type and fall back to a safe algorithm if the
  // selected one cannot handle the current shapes/strides.
  if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
      dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
    cuda_exit(c->ctx);
    return 1;
  }
  // if FindEx was used (choose_time), workspace size is set.
  if (!(reuse_algo || use_cached || params->choose_time))
  {
    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
      params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), algo, &worksize);
    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported
#ifdef DEBUG
      if (0 != aesara_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
        cuda_exit(c->ctx);
        return 1;
      }
      fprintf(stderr, "(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)\n",
              algorithm_name);
#endif
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
        APPLY_SPECIFIC(kerns), algo, &worksize);
    }
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
  }
  // Record the selection for reuse and/or the shape cache.
  if (params->choose_algo) {
#ifdef DEBUG
    if (0 != aesara_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
      cuda_exit(c->ctx);
      return 1;
    }
    fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
            algorithm_name,
            mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor_op)" : "",
            params->choose_time ? "(timed)": "" ,
            reuse_algo ? "(reused)" : "",
            use_cached ? "(cache)": "",
            worksize,
            hashkey.c_str()
    );
#endif
#ifdef DEBUG_TIMING
    if (!(reuse_algo || use_cached)) {
      // We have selected an algorithm at runtime.
      // `timer` still contains timing about selection step.
      fprintf(stderr, "\t(selected %s gradweight algo in %g milliseconds)\n", selection_name, timer.milliseconds);
      if (n_selections > 1) {
        fprintf(stderr, "\t(selected %lu gradweight algos in %g milliseconds (average: %g milliseconds per selection))\n",
                n_selections, total_selection_time, total_selection_time / n_selections);
      }
    }
#endif
    if (!reuse_algo) {
      // save for next time/cache
      prev_algo.algo = algo;
      prev_algo.wsSize = worksize;
      prev_algo.mathType = mathtype;
      // Add to the cache if we choose on shape change, or first time if
      // we choose once.
      if (!use_cached)
        dnn_conv_update_cache(hashkey, prev_algo);
      if (params->choose_once)
        reuse_algo = 1;
    }
  } // params->choose_algo
  // Allocate the workspace, order the stream, and run one backward-filter
  // call per group at the proper byte offsets.
  gpudata *workspace = 0;
  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }
  if (worksize != 0)
    cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*kerns)->ga);
  timer.start();
#endif
  for ( int g = 0; g < groups; g++) {
    err = cudnnConvolutionBackwardFilter(
      params->handle,
      alpha_p,
      APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g ,
      APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(output)) + output_offset * g,
      desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(*kerns)) + kern_offset * g);
  }
  if (worksize != 0) {
    cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
    gpudata_release(workspace);
  }
  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*kerns)->ga);
  timer.end();
  total_computation_time += timer.milliseconds;
  ++n_computations;
#endif
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradweight operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
#ifdef DEBUG_TIMING
  fprintf(stderr, "\t(ran gradweight algo in %g milliseconds)\n", timer.milliseconds);
  if (n_computations > 1) {
    fprintf(stderr, "\t(ran %lu gradweight computations in %g milliseconds (average: %g milliseconds per call))\n",
            n_computations, total_computation_time, total_computation_time / n_computations);
  }
#endif
  return 0;
}
#section support_code_struct
// Per-apply cuDNN descriptors used by the pooling-forward op (dnn_pool below).
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
// NULL-initialize first so cleanup_code_struct only destroys descriptors
// that were actually created (init may FAIL part-way through).
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(pool) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
               "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
#section cleanup_code_struct
// Destroy only what init created; NULL checks make cleanup safe after
// a partial init failure.
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
/**
 * Forward pooling through cudnnPoolingForward.
 *
 * img    : contiguous GPU input (leading batch and channel dims, then
 *          ndims spatial dims; ndims is taken from len(ws), 2 or 3).
 * ws     : window sizes per spatial dim (npy_intp vector).
 * stride : strides per spatial dim.
 * pad    : paddings per spatial dim.
 * out    : output, (re)allocated via aesara_prep_output.
 * params : op params (cuDNN handle and pooling mode).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Fix vs. previous revision: removed the unused local
 * `cudnnPoolingMode_t mode;` — the pooling mode actually used is
 * params->mode, passed directly to cudnnSetPoolingNdDescriptor.
 */
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             PyArrayObject *ws,
                             PyArrayObject *stride,
                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
                             PARAMS_TYPE* params) {
  PyGpuContextObject *c = img->context;
  size_t dims[5];
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  int w[3];
  int p[3];
  int s[3];
  // Number of pooled (spatial) dimensions comes from the window vector.
  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
  for(int i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
  }
  for(int i = 0; i < ndims; i++) {
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
  }
  for(int i = 0; i < ndims; i++) {
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }

  // Standard pooling output size: (in + 2*pad - window) / stride + 1.
  dims[0] = PyGpuArray_DIM(img, 0);
  dims[1] = PyGpuArray_DIM(img, 1);
  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
  if (ndims == 3)
    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;

  if (aesara_prep_output(out, ndims+2, dims, img->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  // if input batch is empty, we return the empty output without calling cuDNN
  // (which will fail on zero batch size).
  if (PyGpuArray_DIM(*out, 0) == 0)
    return 0;

  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
    return 1;
  }

  {
    // cuDNN scaling factors must match the data type: float for
    // GA_FLOAT/GA_HALF, double for GA_DOUBLE.
    const float alphaf = 1;
    const float betaf = 0;
    const double alphad = 1;
    const double betad = 0;
    void *alpha, *beta;
    switch (img->ga.typecode) {
    case GA_DOUBLE:
      alpha = (void *)&alphad;
      beta = (void *)&betad;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha = (void *)&alphaf;
      beta = (void *)&betaf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling");
      return 1;
    }

    cuda_enter(c->ctx);
    cuda_wait(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnPoolingForward(
      params->handle, APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));

    cuda_record(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct
// Per-apply cuDNN descriptors used by the pooling-gradient op
// (dnn_pool_grad below).
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
// NULL-initialize first so the cleanup section only destroys descriptors
// that were actually created.
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;
APPLY_SPECIFIC(pool) = NULL;
{
  cudnnStatus_t err;
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
                 "(pool): %s", cudnnGetErrorString(err));
    FAIL;
  }
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
/**
 * Backward pooling through cudnnPoolingBackward.
 *
 * inp      : contiguous GPU input of the forward pass.
 * out      : contiguous forward-pass output.
 * out_grad : contiguous gradient w.r.t. the forward output.
 * ws/stride/pad : pooling geometry (same meaning as in dnn_pool).
 * inp_grad : gradient w.r.t. inp, (re)allocated via aesara_prep_output
 *            with the same shape/dtype as inp.
 * params   : op params (cuDNN handle and pooling mode).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Fix vs. previous revision: a failed cudnnSetPoolingNdDescriptor set a
 * Python exception but fell through and still called cudnnPoolingBackward
 * with an unconfigured descriptor; it now returns 1 immediately, matching
 * the error handling in dnn_pool.
 */
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **inp_grad,
                                  PARAMS_TYPE* params) {
  PyGpuContextObject *c = inp->context;
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous output gradients are supported.");
    return 1;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
    return 1;
  }

  // Gradient w.r.t. the input has the input's shape and dtype.
  if (aesara_prep_output(inp_grad, PyGpuArray_NDIM(inp),
                         PyGpuArray_DIMS(inp), inp->ga.typecode,
                         GA_C_ORDER, c) != 0) {
    return 1;
  }

  // if input batch is empty, we return the empty output without calling cuDNN
  // (which will fail on zero batch size).
  if (PyGpuArray_DIM(*inp_grad, 0) == 0)
    return 0;

  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
    return 1;
  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  int w[3];
  int p[3];
  int s[3];
  // Number of pooled (spatial) dimensions comes from the window vector.
  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
  for(int i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
  }
  for(int i = 0; i < ndims; i++) {
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
  }
  for(int i = 0; i < ndims; i++) {
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }

  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
    // BUGFIX: previously fell through and called cudnnPoolingBackward anyway.
    return 1;
  }

  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;

  {
    // cuDNN scaling factors must match the data type: float for
    // GA_FLOAT/GA_HALF, double for GA_DOUBLE.
    const float alphaf = 1;
    const float betaf = 0;
    const double alphad = 1;
    const double betad = 0;
    void *alpha, *beta;
    switch (inp->ga.typecode) {
    case GA_DOUBLE:
      alpha = (void *)&alphad;
      beta = (void *)&betad;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha = (void *)&alphaf;
      beta = (void *)&betaf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling gradient");
      return 1;
    }

    cuda_enter(c->ctx);
    cuda_wait(out->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(out_grad->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(inp->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnPoolingBackward(
      params->handle, APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
      beta,
      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
    );

    cuda_record(out->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(out_grad->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(inp->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct
// Per-apply state for the reduction op (dnn_redux below): cuDNN tensor and
// reduction descriptors, plus a lazily-created GpuElemwise used to apply
// abs() on the shortcut path for AMAX/NORM1/NORM2 reductions.
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red);
GpuElemwise* elemwise;
gpuelemwise_arg arg;
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
// NULL-initialize first so the cleanup section only destroys what was created.
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(red) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate reduction descriptor"
               "(red): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
// Created on demand in dnn_redux; freed in cleanup.
elemwise = NULL;
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(red) != NULL) { cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red)); }
if (elemwise) {
  GpuElemwise_free(elemwise);
  elemwise = NULL;
}
#section support_code_struct
/**
 * Tensor reduction through cudnnReduceTensor.
 *
 * input   : contiguous GPU input.
 * output  : reduced result; shape is input's shape with the axes selected
 *           by the params->c_axis bitmask removed.
 * indices : optional (may be NULL); when given, receives flattened
 *           32-bit argmin/argmax-style indices as GA_UINT.
 * params  : op params (handle, c_axis bitmask, red_op, acc_dtype).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Two paths:
 *  - Shortcut: when no axis is reduced or every reduced axis has size 1,
 *    the result is just a reshaped copy of the input (cuDNN up to 7004
 *    cannot reduce over size-1-only areas).  For AMAX/NORM1/NORM2 an
 *    elementwise abs() is applied to match cuDNN semantics.
 *  - General: temporarily rewrites the output's nd/dims/strides so cuDNN
 *    sees an output with the same rank as the input (reduced dims = 1),
 *    then restores the output object.
 */
int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
                              PyGpuArrayObject **output,
                              PyGpuArrayObject **indices,
                              PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  gpudata *workspace = NULL;
  size_t worksize = 0;
  size_t indsize = 0;
  size_t *tdims;
  ssize_t *tstrs;
  size_t dims[8];
  ssize_t strs[8];
  size_t rsz;
  void *alpha;
  void *beta;
  cudnnStatus_t err;
  unsigned int p;
  int e;
  // Static scaling factors: cuDNN expects float scalars for float/half
  // data and double scalars for double data.
  static float falpha = 1.0f;
  static double dalpha = 1.0;
  static float fbeta = 0.0f;
  static double dbeta = 0.0;

  if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
    return 1;

  // p = number of kept dims; rsz = total size of the reduced area.
  p = 0;
  rsz = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (!(params->c_axis & (1U << i))) {
      dims[p] = PyGpuArray_DIM(input, i);
      p++;
    } else {
      rsz *= PyGpuArray_DIM(input, i);
    }
  }

  if (indices != NULL) {
    if (aesara_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
      return 1;
    // 4 bytes per index: only CUDNN_32BIT_INDICES is used below.
    indsize = PyGpuArray_SIZE(*indices) * 4;
  }

  if (p == input->ga.nd || rsz == 1) {
    // Shortcut: nothing (or only size-1 axes) to reduce; the output is a
    // reshaped copy of the input.
    int err;
    Py_XDECREF(*output);
    *output = pygpu_copy(input, GA_C_ORDER);
    if (*output == NULL)
      return 1;
    err = GpuArray_reshape_inplace(&(*output)->ga, p, dims, GA_C_ORDER);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err));
      return 1;
    }
    if (rsz == 1) {
      /* We must reduce some dimensions which have all size 1.
       * cuDNN (up to 7004) does not support this case. Let's use GpuElemwise. */
      switch (params->red_op) {
      // Nothing to do for following cases.
      case CUDNN_REDUCE_TENSOR_ADD: break;
      case CUDNN_REDUCE_TENSOR_MUL: break;
      case CUDNN_REDUCE_TENSOR_MIN: break;
      case CUDNN_REDUCE_TENSOR_MAX: break;
      case CUDNN_REDUCE_TENSOR_AVG: break;
      /* Work to do for following cases.
         AMAX (maximum on absolute values) => apply abs(output)
         NORM1 (addition of absolute values) => apply abs(output)
         NORM2 (square root of sum of squares) => sqroot(output^2) => abs(output)
         So, we must apply abs(output) for all following cases.
      */
      case CUDNN_REDUCE_TENSOR_AMAX:
      case CUDNN_REDUCE_TENSOR_NORM1:
      case CUDNN_REDUCE_TENSOR_NORM2:
      {
        // Lazily build the abs() kernel once per apply; reused afterwards.
        if (elemwise == NULL) {
          arg.name = "out";
          arg.typecode = (*output)->ga.typecode;
          arg.flags = GE_READ | GE_WRITE;
          elemwise = GpuElemwise_new(c->ctx, "", "out = (out < 0 ? -out : out)", 1, &arg, p, GE_CONVERT_F16);
          if (!elemwise) {
            PyErr_SetString(PyExc_RuntimeError, "Unable to create GpuElemwise for output.");
            return 1;
          }
        }
        void* args[1] = { (void*)&(*output)->ga };
        int err = GpuElemwise_call(elemwise, args, 0);
        if (err != GA_NO_ERROR) {
          PyErr_SetString(PyExc_RuntimeError, "Unable to call GpuElemwise on output.");
          return 1;
        };
      }
      break;
      default: break;
      }
    }
    if (indices != NULL) {
      // All indices will be 0 since the size of the reduced area is 1.
      err = GpuArray_memset(&(*indices)->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "GpuArray_memset: %s", GpuArray_error(&(*indices)->ga, err));
        return 1;
      }
    }
    // This is a shortcut path.
    return 0;
  }

  if (aesara_prep_output(output, p, dims, input->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  // cuDNN expect that the output has the same number of dimension as
  // the input, but the dimensions to reduce are of size 1 in the output.
  // We have to do some trickery to be able to pass it what it need.
  p = 0;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (params->c_axis & (1U << i)) {
      dims[i] = 1;
      strs[i] = 0;
    } else {
      dims[i] = PyGpuArray_DIM(input, i);
      strs[i] = PyGpuArray_STRIDE(*output, p);
      p++;
    }
  }

  // Perform horrible surgery to be able to reuse c_set_tensorNd()
  tdims = (*output)->ga.dimensions;
  tstrs = (*output)->ga.strides;
  (*output)->ga.dimensions = dims;
  (*output)->ga.strides = strs;
  (*output)->ga.nd = input->ga.nd;
  // Delay error checking to avoid exposing a broken object
  e = c_set_tensorNd(*output, APPLY_SPECIFIC(output));
  // Undo our horrible surgery
  (*output)->ga.nd = p;
  (*output)->ga.dimensions = tdims;
  (*output)->ga.strides = tstrs;
  if (e != 0)
    return 1;

  // Back to normal, no more horrible things
  // Note that only CUDNN_32BIT_INDICES is implemented
  err = cudnnSetReduceTensorDescriptor(
    APPLY_SPECIFIC(red), params->red_op,
    params->acc_dtype, CUDNN_PROPAGATE_NAN,
    indices == NULL ? CUDNN_REDUCE_TENSOR_NO_INDICES : CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
    CUDNN_32BIT_INDICES);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set reduce descriptor: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  switch (input->ga.typecode) {
  case GA_FLOAT:
  case GA_HALF:
    alpha = &falpha;
    beta = &fbeta;
    break;
  case GA_DOUBLE:
    alpha = &dalpha;
    beta = &dbeta;
    break;
  default:
    PyErr_SetString(PyExc_RuntimeError, "Unsupported dtype in dnn reduce");
    return 1;
  }

  err = cudnnGetReductionWorkspaceSize(params->handle,
                                       APPLY_SPECIFIC(red),
                                       APPLY_SPECIFIC(input),
                                       APPLY_SPECIFIC(output),
                                       &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not get reduce workspace size: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, &e);
    if (workspace == NULL) {
      PyErr_Format(PyExc_RuntimeError, "gpudata_alloc: %s",
                   gpucontext_error(c->ctx, e));
      return 1;
    }
  }

  err = cudnnReduceTensor(params->handle, APPLY_SPECIFIC(red),
                          indices ? PyGpuArray_DEV_DATA(*indices) : NULL, indsize,
                          worksize ? *((void **)workspace) : NULL, worksize,
                          alpha,
                          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
                          beta,
                          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
  if (workspace != NULL)
    gpudata_release(workspace);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not run reduction: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code
/**
 * Create and configure a cuDNN RNN descriptor.
 *
 * hidden_size/num_layers : RNN geometry.
 * ddesc                  : a pre-built dropout descriptor.
 * input_mode/direction_mode/rnn_mode : int values cast to the matching
 *                          cuDNN enums.
 * dtype                  : gpuarray typecode (GA_FLOAT/GA_DOUBLE/GA_HALF).
 * odesc                  : out; receives the new descriptor on success.
 * _handle                : cuDNN handle (only used by the v7+ API).
 *
 * Returns 0 on success, -1 on failure with a Python exception set.  The
 * descriptor is destroyed on the failure path, so the caller owns it only
 * on success.
 */
int dnn_rnn_desc(int hidden_size, int num_layers,
                 cudnnDropoutDescriptor_t ddesc,
                 int input_mode, int direction_mode, int rnn_mode,
                 int dtype, cudnnRNNDescriptor_t *odesc,
                 cudnnHandle_t _handle) {
  cudnnRNNDescriptor_t desc;
  cudnnDataType_t data_type;
  cudnnStatus_t err;

  switch (dtype) {
  case GA_FLOAT:
    data_type = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    data_type = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    data_type = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateRNNDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
    return -1;
  }

// cuDNN 7 changed cudnnSetRNNDescriptor: it takes the handle and an
// explicit algorithm selection (standard algo used here).
#if CUDNN_MAJOR < 7
  err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
                              (cudnnRNNInputMode_t)input_mode,
                              (cudnnDirectionMode_t)direction_mode,
                              (cudnnRNNMode_t)rnn_mode, data_type);
#else
  err = cudnnSetRNNDescriptor(_handle, desc, hidden_size, num_layers, ddesc,
                              (cudnnRNNInputMode_t)input_mode,
                              (cudnnDirectionMode_t)direction_mode,
                              (cudnnRNNMode_t)rnn_mode, CUDNN_RNN_ALGO_STANDARD, data_type);
#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyRNNDescriptor(desc);
    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
    return -1;
  }
  *odesc = desc;
  return 0;
}
#section support_code
/**
 * RNN forward training pass through cudnnRNNForwardTraining.
 *
 * desc     : configured RNN descriptor (see dnn_rnn_desc).
 * numDirs  : 1 or 2 (bidirectional) — scales the output feature size.
 * w        : flat weights; x : input (seqLength, miniBatch, inputSize);
 * hx/cx    : initial hidden/cell state (cx may be NULL for non-LSTM).
 * reserve  : out; training reserve buffer needed by the backward passes.
 * y/hy/cy  : outputs, (re)allocated via aesara_prep_output.
 * _handle  : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.  All
 * temporary descriptors and the workspace are released on both paths via
 * the fail label.
 * NOTE(review): on failure after *reserve was allocated, the reserve
 * buffer is not released here — presumably the caller cleans it up;
 * verify against callers.
 */
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc, uint32_t numDirs,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);  // NOTE(review): unused below
  size_t hiddenSize = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  // Per-timestep descriptor: (miniBatch, inputSize, 1) packed C-order.
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  // Output: (seqLength, miniBatch, hiddenSize * numDirs).
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSize * numDirs;
  if (aesara_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (aesara_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (aesara_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  // cuDNN's pre-packed RNN API wants one descriptor pointer per timestep;
  // all timesteps share the same descriptor here.
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  // Shared cleanup for both success and failure paths.
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/**
 * RNN gradient w.r.t. inputs through cudnnRNNBackwardData.
 *
 * desc     : configured RNN descriptor.
 * xshp     : input feature size of the forward pass (last dim of x).
 * y/dy     : forward output and its gradient (same shape).
 * w/hx/cx  : forward weights and initial states (cx may be NULL).
 * reserve  : reserve buffer from dnn_rnn_fwd (copied, not consumed).
 * dhy/dcy  : optional gradients on the final states (may be NULL).
 * oreserve : out; a copy of the reserve, updated by the backward pass,
 *            for later use by dnn_rnn_gw.
 * dx/dhx/dcx : output gradients, (re)allocated via aesara_prep_output.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
               PyGpuArrayObject *y, PyGpuArrayObject *dy,
               PyGpuArrayObject *w, PyGpuArrayObject *hx,
               gpudata *reserve, PyGpuArrayObject *cx,
               PyGpuArrayObject *dhy, PyGpuArrayObject *dcy,
               gpudata **oreserve, PyGpuArrayObject **dx,
               PyGpuArrayObject **dhx, PyGpuArrayObject **dcx,
               cudnnHandle_t _handle) {
  PyGpuContextObject *c = y->context;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t dhydesc = NULL;
  cudnnTensorDescriptor_t dcydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t dxdesc = NULL;
  cudnnTensorDescriptor_t dhxdesc = NULL;
  cudnnTensorDescriptor_t dcxdesc = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  cudnnTensorDescriptor_t *dxl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(y, 0);
  size_t miniBatch = PyGpuArray_DIM(y, 1);
  size_t inputSize = xshp;
  size_t shape[3];
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (y->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
    return -1;
  }

  // Entered early to match the cuda_exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (dhy != NULL)
    if (c_make_tensorNd(dhy, &dhydesc) != 0)
      goto fail;

  if (dcy != NULL)
    if (c_make_tensorNd(dcy, &dcydesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  // dx has the forward input's shape: (seqLength, miniBatch, inputSize).
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = inputSize;
  if (aesara_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&dxdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (aesara_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
                         GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
    goto fail;

  // dcx is only prepared when a cell state exists (LSTM).
  // NOTE(review): the cudnnRNNBackwardData call below guards the dcx data
  // pointer on `dcx` (the out-param pointer) rather than `cx` — confirm
  // callers always pass dcx == NULL when cx == NULL.
  if (cx != NULL) {
    if (aesara_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
                           GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
      goto fail;
  }

  // One descriptor pointer per timestep; all share the same descriptor.
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (dxl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    dxl[i] = dxdesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       dxl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  // Work on a copy of the reserve so the original stays valid for other
  // gradient computations.
  *oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*oreserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
    goto fail;
  }

  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
                             yl, PyGpuArray_DEV_DATA(y),
                             /* y and dy are the same shape */
                             yl, PyGpuArray_DEV_DATA(dy),
                             dhydesc, dhy ? PyGpuArray_DEV_DATA(dhy) : NULL,
                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
                             wdesc, PyGpuArray_DEV_DATA(w),
                             hxdesc, PyGpuArray_DEV_DATA(hx),
                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                             dxl, PyGpuArray_DEV_DATA(*dx),
                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
                             dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
                             *(void **)workspace, worksize,
                             *(void **)(*oreserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad inputs: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  // Shared cleanup for both success and failure paths.
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dhydesc != NULL)
    cudnnDestroyTensorDescriptor(dhydesc);
  if (dcydesc != NULL)
    cudnnDestroyTensorDescriptor(dcydesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (dxdesc != NULL)
    cudnnDestroyTensorDescriptor(dxdesc);
  if (dhxdesc != NULL)
    cudnnDestroyTensorDescriptor(dhxdesc);
  if (dcxdesc != NULL)
    cudnnDestroyTensorDescriptor(dcxdesc);
  free(yl);
  free(dxl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/**
 * RNN gradient w.r.t. weights through cudnnRNNBackwardWeights.
 *
 * desc    : configured RNN descriptor.
 * _wsize  : flat weight buffer size (element count for dw).
 * x/hx/y  : forward input, initial hidden state, forward output.
 * reserve : reserve buffer updated by dnn_rnn_gi.
 * dw      : out; gradient w.r.t. the weights, zero-filled first because
 *           cudnnRNNBackwardWeights accumulates into it.
 * _handle : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
               PyGpuArrayObject *x, PyGpuArrayObject *hx,
               PyGpuArrayObject *y, gpudata *reserve,
               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnFilterDescriptor_t dwdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t iters = PyGpuArray_DIM(x, 0);
  size_t wsize = _wsize;
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (aesara_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  // Zero dw before the call: cudnnRNNBackwardWeights accumulates.
  // NOTE(review): the memset return value is not checked here.
  GpuArray_memset(&(*dw)->ga, 0);

  if (c_make_filter(*dw, &dwdesc) != 0)
    goto fail;

  // One descriptor pointer per timestep; all share the same descriptor.
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                yl, PyGpuArray_DEV_DATA(y),
                                *(void **)workspace, worksize,
                                dwdesc, PyGpuArray_DEV_DATA(*dw),
                                *(void **)reserve, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad weights: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  // Shared cleanup for both success and failure paths.
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dwdesc != NULL)
    cudnnDestroyFilterDescriptor(dwdesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code

/* Compute the size in bytes of the parameter (weights + biases) buffer
 * that cuDNN requires for the RNN described by `desc`.
 *
 * isize: 1-D array of length 2 read as npy_uint64 values
 *        (batch size, input size); only used to build a dummy input
 *        descriptor for the size query.
 *        NOTE(review): values are truncated into C ints for the
 *        descriptor -- assumes they fit; confirm against the caller.
 * typecode: gpuarray typecode of the data (float/double/half).
 * oparam_size: out-parameter receiving the required size in bytes.
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 */
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
                      PyArrayObject *isize,
                      npy_int32 typecode,
                      npy_uint64 *oparam_size,
                      cudnnHandle_t _handle) {
  cudnnTensorDescriptor_t xdesc;
  size_t param_size;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int shape[3];
  int strides[3];

  if (PyArray_DIM(isize, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    return -1;
  }
  switch (typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }
  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
    return -1;
  }
  /* Minimal fully-packed (batch, input, 1) descriptor; cuDNN only needs
     its shape and data type to answer the size query. */
  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
  shape[2] = 1;
  strides[0] = shape[2] * shape[1];
  strides[1] = shape[2];
  strides[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    /* Fix: the descriptor used to leak on this error path. */
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Include the cuDNN error string, consistent with the other error
       reports in this file (was a bare PyErr_SetString). */
    PyErr_Format(PyExc_RuntimeError, "Could not get parameter size: %s",
                 cudnnGetErrorString(err));
    /* Fix: the descriptor used to leak on this error path. */
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  cudnnDestroyTensorDescriptor(xdesc);
  *oparam_size = param_size;
  return 0;
}
#section support_code_struct
/* Per-apply cuDNN tensor descriptors reused by the softmax forward op. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
/* Forward softmax on the GPU via cudnnSoftmaxForward.
 * x: input array; *out: output, (re)allocated to x's shape and dtype.
 * wrapper supplies the cuDNN handle, algorithm and mode.
 * Returns 0 on success, 1 on failure with a Python exception set. */
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out,
PARAMS_TYPE* wrapper) {
PyGpuContextObject *c = x->context;
cudnnStatus_t err;
if (aesara_prep_output(out, PyGpuArray_NDIM(x),
PyGpuArray_DIMS(x), x->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
// Directly return the output if any of the dimensions is 0.
// (cuDNN does not support zero-length dimensions.)
if (PyGpuArray_SIZE(*out) == 0)
return 0;
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
return 1;
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
{
// cuDNN scaling factors are host pointers whose C type must match the
// data type; half-precision data uses float factors.
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (x->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax");
return 1;
}
cuda_enter(c->ctx);
// Synchronize: wait on x for reading and *out for writing before launch.
cuda_wait(x->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxForward(
wrapper->handle,
wrapper->algo,
wrapper->mode,
alpha,
APPLY_SPECIFIC(input),
PyGpuArray_DEV_DATA(x),
beta,
APPLY_SPECIFIC(output),
PyGpuArray_DEV_DATA(*out)
);
cuda_record(x->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx);
}
// err was assigned inside the block above; the unsupported-type case
// returned early, so it is always initialized here.
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
/* Per-apply cuDNN tensor descriptors for the softmax gradient op:
   dy = incoming gradient, sm = softmax output, dx = computed gradient. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dx);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(dy) = NULL;
APPLY_SPECIFIC(sm) = NULL;
APPLY_SPECIFIC(dx) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dx));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(dy) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
if (APPLY_SPECIFIC(sm) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
if (APPLY_SPECIFIC(dx) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dx));
/* Gradient of softmax via cudnnSoftmaxBackward.
 * dy: gradient w.r.t. the softmax output; sm: the softmax output itself;
 * *dx: output gradient w.r.t. the softmax input, (re)allocated to dy's
 * shape and dtype. Returns 0 on success, 1 on failure. */
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx,
PARAMS_TYPE* wrapper) {
PyGpuContextObject *c = dy->context;
cudnnStatus_t err;
if (aesara_prep_output(dx, PyGpuArray_NDIM(dy),
PyGpuArray_DIMS(dy), dy->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
// Directly return the output if any of the dimensions is 0.
// (cuDNN does not support zero-length dimensions.)
if (PyGpuArray_SIZE(*dx) == 0)
return 0;
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
return 1;
if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
return 1;
if (c_set_tensorNd(*dx, APPLY_SPECIFIC(dx)) != 0)
return 1;
{
// cuDNN scaling factors are host pointers whose C type must match the
// data type; half-precision data uses float factors.
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (sm->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax gradient");
return 1;
}
cuda_enter(c->ctx);
// Wait on both inputs for reading and the output for writing.
cuda_wait(sm->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(dy->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxBackward(
wrapper->handle,
wrapper->algo,
wrapper->mode,
alpha,
APPLY_SPECIFIC(sm),
PyGpuArray_DEV_DATA(sm),
APPLY_SPECIFIC(dy),
PyGpuArray_DEV_DATA(dy),
beta,
APPLY_SPECIFIC(dx),
PyGpuArray_DEV_DATA(*dx)
);
cuda_record(sm->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(dy->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx);
}
// err was assigned inside the block above; the unsupported-type case
// returned early, so it is always initialized here.
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
/* Per-apply descriptors for GpuDnnTransformerGradI: the spatial
   transformer descriptor plus tensor descriptors for the input (xdesc),
   the input gradient (dxdesc) and the incoming gradient (dydesc). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dxdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dydesc);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(dxdesc) = NULL;
APPLY_SPECIFIC(dydesc) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
if (err != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerGradI: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dxdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dxdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dydesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dydesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
cudnnDestroySpatialTransformerDescriptor( APPLY_SPECIFIC(sptf) );
if ( APPLY_SPECIFIC(xdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(dxdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dxdesc) );
if ( APPLY_SPECIFIC(dydesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dydesc) );
#section support_code_struct
/* Gradients of the spatial transformer sampler with respect to the
 * input image and the sampling grid (cudnnSpatialTfSamplerBackward).
 *
 * input: image batch that was sampled; grid: sampling coordinates;
 * dy: gradient w.r.t. the sampler output.
 * *input_grad / *grid_grad: outputs, (re)allocated to the shapes of
 * input and grid respectively.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gi)(PyGpuArrayObject * input,
                            PyGpuArrayObject * grid,
                            PyGpuArrayObject * dy,
                            PyGpuArrayObject ** input_grad,
                            PyGpuArrayObject ** grid_grad,
                            cudnnHandle_t _handle)
{
  PyGpuContextObject * gpu_ctx = input->context;
  void * alpha_p;
  void * beta_p;
  double alpha = 1.0, beta = 0.0;
  float af = alpha, bf = beta;
  int out_dims[4];
  cudnnDataType_t dt;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  /* Pick host scaling-factor pointers and the cuDNN data type from the
     input's typecode (half-precision data uses float factors). */
  switch (input->ga.typecode)
  {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_FLOAT:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGradI: unsupported type for input in spatial transformer gradients" );
    return 1;
  }

  if ( grid->ga.typecode != GA_FLOAT &&
       grid->ga.typecode != GA_DOUBLE &&
       grid->ga.typecode != GA_HALF )
  {
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGradI: unsupported data type for grid in spatial transformer gradients." );
    return 1;
  }

  /* Gradients have the same shapes as the corresponding inputs. */
  if ( aesara_prep_output( input_grad, PyGpuArray_NDIM( input ),
                           PyGpuArray_DIMS( input ), input->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
    return 1;

  if ( aesara_prep_output( grid_grad, PyGpuArray_NDIM( grid ),
                           PyGpuArray_DIMS( grid ), grid->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
    return 1;

  // Directly return the outputs if any of the dimensions is 0.
  // (cuDNN does not support zero-length dimensions.)
  // Fix: this check used to run *after* the descriptor setup below, but
  // cuDNN rejects zero-length dimensions in the descriptors themselves,
  // so the intended graceful early return was unreachable. The check is
  // now done first, matching the softmax ops above.
  if ( PyGpuArray_SIZE( *input_grad ) == 0 || PyGpuArray_SIZE( *grid_grad ) == 0 )
    return 0;

  // Obtain output dimensions to setup descriptor
  out_dims[0] = (int) PyGpuArray_DIM(input, 0); // num_images
  out_dims[1] = (int) PyGpuArray_DIM(input, 1); // num_channels
  out_dims[2] = (int) PyGpuArray_DIM(grid, 1); // grid height
  out_dims[3] = (int) PyGpuArray_DIM(grid, 2); // grid width

  // Currently, only the bilinear sampler is supported by cuDNN,
  // so the sampler method is currently not available as a parameter
  err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                               dt, 4, out_dims );
  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_MemoryError,
                  "GpuDnnTransformerGradI: could not initialize descriptor (sptf): %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
    return 1;
  if ( c_set_tensorNd( dy, APPLY_SPECIFIC(dydesc) ) != 0 )
    return 1;
  if ( c_set_tensorNd( *input_grad, APPLY_SPECIFIC(dxdesc) ) != 0 )
    return 1;

  cuda_enter( gpu_ctx->ctx );

  cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_wait( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  /* Computes both gradients in one call; the grid gradient has no tensor
     descriptor -- cuDNN takes a raw device pointer for it. */
  err = cudnnSpatialTfSamplerBackward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
                                       APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), beta_p,
                                       APPLY_SPECIFIC(dxdesc), PyGpuArray_DEV_DATA( *input_grad ), alpha_p,
                                       APPLY_SPECIFIC(dydesc), PyGpuArray_DEV_DATA( dy ), PyGpuArray_DEV_DATA( grid ),
                                       beta_p, PyGpuArray_DEV_DATA( *grid_grad ) );

  cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_record( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  cuda_exit( gpu_ctx->ctx );

  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGradI: failed to compute gradients of the inputs: %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  return 0;
}
#section support_code_struct
/* Per-apply spatial transformer descriptor for GpuDnnTransformerGrid. */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
#section init_code_struct
/* Start at NULL so the cleanup section can tell whether it was created. */
APPLY_SPECIFIC(sptf) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerGrid: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy the descriptor only if it was successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL) { cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf)); }
#section support_code_struct

/* Generate the sampling grid of a spatial transformer from the affine
 * parameters theta (cudnnSpatialTfGridGeneratorForward).
 *
 * theta: (num_images, 2, 3) affine transformation parameters.
 * out_dims: 1-D array of 4 elements (num_images, num_channels, height,
 *           width) describing the transformed output shape.
 *           NOTE(review): elements are read as npy_int64 -- assumes the
 *           caller always passes an int64 array; verify in the Python op.
 * *grid: output, (num_images, height, width, 2) sampling coordinates.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_grid)(PyGpuArrayObject * theta,
                              PyArrayObject * out_dims,
                              PyGpuArrayObject ** grid,
                              cudnnHandle_t _handle)
{
  PyGpuContextObject * gpu_ctx = theta->context;
  size_t grid_dims[4];
  int num_images, num_channels, height, width;
  int desc_dims[4];
  cudnnDataType_t dt;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  switch(theta->ga.typecode)
  {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGrid: unsupported data type for theta in spatial transformer." );
    return 1;
  }

  if ( PyArray_NDIM( out_dims ) != 1 || PyArray_SIZE( out_dims ) != 4 )
  {
    PyErr_SetString( PyExc_MemoryError,
                     "GpuDnnTransformerGrid: out_dims must have 4 elements." );
    return 1;
  }

  // Obtain output dimensions
  num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 0 ) );
  num_channels = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 1 ) );
  height = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 2 ) );
  width = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 3 ) );

  if ( PyGpuArray_DIM( theta, 0 ) != num_images ||
       PyGpuArray_DIM( theta, 1 ) != 2 || PyGpuArray_DIM( theta, 2 ) != 3 )
  {
    // Fix: the dimension values are size_t; passing them to %d is
    // undefined behavior on LP64 platforms, so cast them to int.
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGrid: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
                  num_images, 2, 3, (int) PyGpuArray_DIMS( theta )[0],
                  (int) PyGpuArray_DIMS( theta )[1], (int) PyGpuArray_DIMS( theta )[2] );
    return 1;
  }

  // Set transformed output dimensions to setup the descriptor
  desc_dims[0] = num_images;
  desc_dims[1] = num_channels;
  desc_dims[2] = height;
  desc_dims[3] = width;

  // Set sampling grid dimensions: (N, H, W, 2), last axis holds (x, y)
  grid_dims[0] = num_images;
  grid_dims[1] = height;
  grid_dims[2] = width;
  grid_dims[3] = 2;

  // Currently, only the bilinear sampler is supported by cuDNN,
  // so the sampler method is currently not available as a parameter
  err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                               dt, 4, desc_dims );
  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_MemoryError,
                  "GpuDnnTransformerGrid: could not initialize descriptor (sptf): %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  if ( aesara_prep_output( grid, 4, grid_dims, theta->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
  {
    PyErr_SetString( PyExc_RuntimeError,
                     "GpuDnnTransformerGrid: could not allocate memory for grid of coordinates" );
    return 1;
  }

  cuda_enter( gpu_ctx->ctx );
  cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  err = cudnnSpatialTfGridGeneratorForward( _handle, APPLY_SPECIFIC(sptf),
                                            PyGpuArray_DEV_DATA( theta ), PyGpuArray_DEV_DATA( *grid ) );

  cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_exit( gpu_ctx->ctx );

  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGrid: could not create grid of coordinates: %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  return 0;
}
#section support_code_struct
/* Per-apply spatial transformer descriptor for GpuDnnTransformerGradT. */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
#section init_code_struct
/* Start at NULL so the cleanup section can tell whether it was created. */
APPLY_SPECIFIC(sptf) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerGradT: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy the descriptor only if it was successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
#section support_code_struct

/* Gradient of the spatial transformer with respect to the affine
 * parameters theta (cudnnSpatialTfGridGeneratorBackward).
 *
 * dgrid: (num_images, height, width, 2) gradient w.r.t. the sampling grid.
 * *dtheta: output, (num_images, 2, 3) gradient w.r.t. theta.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gt)(PyGpuArrayObject * dgrid,
                            PyGpuArrayObject ** dtheta,
                            cudnnHandle_t _handle)
{
  PyGpuContextObject * gpu_ctx = dgrid->context;
  int num_images, height, width;
  int desc_dims[4];
  size_t dtheta_dims[3];
  cudnnDataType_t dt;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  switch(dgrid->ga.typecode)
  {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGradT: unsupported data type for dgrid in spatial transformer." );
    return 1;
  }

  num_images = (int) PyGpuArray_DIM( dgrid, 0 );
  height = (int) PyGpuArray_DIM( dgrid, 1 );
  width = (int) PyGpuArray_DIM( dgrid, 2 );

  /* theta gradients always have shape (num_images, 2, 3). */
  dtheta_dims[0] = num_images;
  dtheta_dims[1] = 2;
  dtheta_dims[2] = 3;

  if ( aesara_prep_output( dtheta, 3, dtheta_dims, dgrid->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
    return 1;

  desc_dims[0] = num_images;
  // Assume number of channels is 1, because the information is not
  // available or relevant here
  desc_dims[1] = 1;
  desc_dims[2] = height;
  desc_dims[3] = width;

  // Currently, only the bilinear sampler is supported by cuDNN,
  // so the sampler method is currently not available as a parameter
  err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                               dt, 4, desc_dims );
  if ( CUDNN_STATUS_SUCCESS != err )
  {
    // Fix: the error message said "GpuDnnTransformerGrid" -- a copy-paste
    // from the grid op; this is the GradT op.
    PyErr_Format( PyExc_MemoryError,
                  "GpuDnnTransformerGradT: could not initialize descriptor (sptf): %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  cuda_enter( gpu_ctx->ctx );
  cuda_wait( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  err = cudnnSpatialTfGridGeneratorBackward( _handle, APPLY_SPECIFIC(sptf),
                                             PyGpuArray_DEV_DATA( dgrid ), PyGpuArray_DEV_DATA( *dtheta ) );

  cuda_record( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_exit( gpu_ctx->ctx );

  if ( err != CUDNN_STATUS_SUCCESS )
  {
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGradT: could not compute gradients of the affine transformation: %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  return 0;
}
#section support_code_struct
/* Per-apply descriptors for GpuDnnTransformerSampler: the spatial
   transformer descriptor plus input (xdesc) and output (ydesc) tensors. */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(ydesc);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(ydesc) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
if (err != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerSampler: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString( err ));
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor xdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(ydesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor ydesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
if ( APPLY_SPECIFIC(xdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(ydesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(ydesc) );
#section support_code_struct
/* Forward pass of the spatial transformer sampler
 * (cudnnSpatialTfSamplerForward): sample `input` at the coordinates in
 * `grid`, writing a (num_images, num_channels, grid_h, grid_w) output.
 * Returns 0 on success, 1 on failure with a Python exception set. */
int
APPLY_SPECIFIC(dnn_sptf_sampler)(PyGpuArrayObject * input,
PyGpuArrayObject * grid,
PyGpuArrayObject ** output,
cudnnHandle_t _handle)
{
PyGpuContextObject * gpu_ctx = input->context;
void * alpha_p;
void * beta_p;
double alpha = 1.0, beta = 0.0;
float af = alpha, bf = beta;
size_t out_dims[4];
int desc_dims[4];
cudnnDataType_t dt;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
/* Pick host scaling-factor pointers and the cuDNN data type from the
   input's typecode (half-precision data uses float factors). */
switch (input->ga.typecode)
{
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
dt = CUDNN_DATA_DOUBLE;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
dt = CUDNN_DATA_FLOAT;
break;
case GA_HALF:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformer: unsupported type for input in spatial transformer." );
return 1;
}
/* Output shape: batch/channels from the input, spatial size from the grid. */
out_dims[0] = (size_t) PyGpuArray_DIM(input, 0); // num_images
out_dims[1] = (size_t) PyGpuArray_DIM(input, 1); // num_channels
out_dims[2] = (size_t) PyGpuArray_DIM(grid, 1); // grid height
out_dims[3] = (size_t) PyGpuArray_DIM(grid, 2); // grid width
// Set output dimensions for the descriptor setup
desc_dims[0] = (int) out_dims[0];
desc_dims[1] = (int) out_dims[1];
desc_dims[2] = (int) out_dims[2];
desc_dims[3] = (int) out_dims[3];
// Zero-length dimensions are rejected here (not silently returned as in
// the softmax ops) because cuDNN cannot describe them.
if ( out_dims[0] == 0 || out_dims[1] == 0 || out_dims[2] == 0 || out_dims[3] == 0 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformerSampler: one of the sampler dimensions is zero" );
return 1;
}
if ( aesara_prep_output( output, 4, out_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformerSampler: could not allocate memory for grid sampler" );
return 1;
}
// Currently, only the bilinear sampler is supported by cuDNN,
// so the sampler method is currently not available as a parameter
err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
dt, 4, desc_dims );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerSampler: could not initialize descriptor: %s",
cudnnGetErrorString( err ) );
return 1;
}
if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
return 1;
if ( c_set_tensorNd( *output, APPLY_SPECIFIC(ydesc) ) != 0 )
return 1;
cuda_enter( gpu_ctx->ctx );
// Wait on both inputs for reading and the output for writing.
cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
// The grid has no tensor descriptor; cuDNN takes its raw device pointer.
err = cudnnSpatialTfSamplerForward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( grid ),
beta_p, APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );
cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_exit( gpu_ctx->ctx );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformerSampler: could not create grid sampler: %s",
cudnnGetErrorString( err ) );
return 1;
}
return 0;
}
#ifndef AESARA_GPUARRAY_HELPER
#define AESARA_GPUARRAY_HELPER
#include <string.h>
#include <gpuarray_api.h>
#include <numpy_compat.h>
#include <gpuarray/util.h>

/* Return nonzero iff `a` already has the requested rank, typecode and
 * exact shape. */
static int aesara_size_check(PyGpuArrayObject *a, unsigned int nd,
                             const size_t *dims, int typecode) {
  if (a->ga.nd != nd)
    return 0;
  if (a->ga.typecode != typecode)
    return 0;
  return memcmp(a->ga.dimensions, dims, nd * sizeof(size_t)) == 0;
}

/* Ensure *out is a C-ordered array of the given shape/typecode, reusing
 * the existing array when it already matches and allocating a fresh one
 * otherwise. Returns 0 on success, 1 on allocation failure. */
static int aesara_prep_output(PyGpuArrayObject **out, unsigned int nd,
                              const size_t *dims, int typecode, ga_order ord,
                              PyGpuContextObject *c) {
  if (*out != NULL && aesara_size_check(*out, nd, dims, typecode))
    return 0;
  Py_XDECREF(*out);
  *out = pygpu_empty(nd, dims, typecode, ord, c, Py_None);
  if (*out == NULL)
    return 1;
  return 0;
}

/* Copy V into `out` in place when `out` is a matching C-contiguous
 * array; otherwise drop `out` and return a fresh C-ordered copy of V.
 * Returns NULL on failure (with the old `out` released). */
static PyGpuArrayObject *aesara_try_copy(PyGpuArrayObject *out,
                                         PyGpuArrayObject *V) {
  int reuse = (out != NULL) &&
              GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
              aesara_size_check(out, PyGpuArray_NDIM(V),
                                PyGpuArray_DIMS(V),
                                V->ga.typecode);
  if (!reuse) {
    Py_XDECREF(out);
    return pygpu_copy(V, GA_C_ORDER);
  }
  if (pygpu_move(out, V)) {
    Py_XDECREF(out);
    return NULL;
  }
  return out;
}

static inline void *PyGpuArray_DEV_DATA(PyGpuArrayObject *a) {
  /* This is guaranteed to work and return the raw CUDA/OpenCL object on
   * all recent (as of June 2015) version of libgpuarray. This is also
   * promised to keep working in future versions. */
  char *base = *(char **)a->ga.data;
  /* This only works on cuda since we have a real pointer. */
  return (void *)(base + a->ga.offset);
}
#endif
#section kernels
#kernel tril_kernel : size, size, size, *:
#include "cluda.h"
/* Zero the strictly upper-triangular part of a row-major ncols x ncols
 * matrix `a` (byte offset a_off), leaving the lower triangle intact. */
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Fix: keep row/column indices in ga_size -- the former unsigned int
    // would truncate index / ncols once nthreads exceeds 2**32.
    ga_size ix = index / ncols;
    ga_size iy = index % ncols;
    if (ix < iy) {
      a[index] = 0.0;
    }
  }
}
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
/* Zero the strictly lower-triangular part of a row-major ncols x ncols
 * matrix `a` (byte offset a_off), leaving the upper triangle intact. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Fix: keep row/column indices in ga_size -- the former unsigned int
    // would truncate index / ncols once nthreads exceeds 2**32.
    ga_size ix = index / ncols;
    ga_size iy = index % ncols;
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Cholesky factorization of a square float32 C-contiguous matrix using
 * MAGMA's spotrf. When params->inplace, A is overwritten and returned as
 * *L; otherwise *L receives a copy. After the factorization the unused
 * triangle is zeroed with the tril/triu kernels.
 * Returns 0 on success, -1 on failure with a Python exception set. */
int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
PARAMS_TYPE *params) {
const size_t *dims;
size_t N, n2;
magma_uplo_t ul;
int res = -1, info;
if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError,
"GpuMagmaCholesky: unsupported data type");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx);
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuMagmaCholesky: requires data to be C-contiguous");
goto fail;
}
if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix rank error");
goto fail;
}
dims = PyGpuArray_DIMS(A);
if (dims[0] != dims[1]) {
PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix is not square");
goto fail;
}
if (params->inplace) {
// Output aliases the input; take a new reference for *L.
Py_XDECREF(*L);
*L = A;
Py_INCREF(*L);
} else {
*L = aesara_try_copy(*L, A);
if (*L == NULL) {
PyErr_SetString(
PyExc_RuntimeError,
"GpuMagmaCholesky: failed to allocate memory for the output");
goto fail;
}
}
// magma matrix cholesky
N = dims[0];
n2 = N * N;
// Magma requires column-major order for the matrix A. Instead of changing
// matrix order which requires copying data, we can compute cholesky
// decomposition where we change parameters lower to upper and upper to
// lower.
if (params->lower) {
ul = MagmaUpper;
}
else {
ul = MagmaLower;
}
magma_spotrf_gpu(ul, N, (float *)PyGpuArray_DEV_DATA(*L), N, &info);
if (info > 0) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: the leading minor of "
"order %d is not positive definite",
info);
goto fail;
} else if (info < 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaCholesky: magma_spotrf_gpu argument %d has an illegal value",
-info);
goto fail;
}
// Zero out the triangle that spotrf left untouched so the result is a
// clean triangular factor.
if (params->lower) {
res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.",
GpuKernel_error(&k_tril_kernel, res));
goto fail;
}
} else {
res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
GpuKernel_error(&k_triu_kernel, res));
goto fail;
}
}
res = 0;
fail:
cuda_exit(params->context->ctx);
return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct

/* Eigendecomposition of a symmetric float32 matrix via MAGMA's ssyevd.
 *
 * A_: (n, n) float32 C-contiguous input (copied to column-major order).
 * *D: output eigenvalues, length n.
 * *V: output eigenvectors; only written when params->compute_v.
 * params->lower selects which triangle of A_ is read.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }
  /* MAGMA expects column-major data; work on a Fortran-ordered copy. */
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    // Fix: this used to `return -1` directly, skipping cuda_exit() and
    // leaving the context entered; route through the common cleanup.
    goto fail;
  }
  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);
  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  // query for workspace size (lwork/liwork are returned by the -1 call)
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an didn't converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  d_dims[0] = N;
  if (aesara_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  // NOTE(review): w_data is pinned *host* memory, yet the copy kind is
  // cudaMemcpyDeviceToDevice -- this relies on unified addressing; confirm.
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (params->compute_v) {
    /* ssyevd overwrote A with the eigenvectors; hand them out as *V. */
    *V = aesara_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct

/* Matrix inverse of a square float32 C-contiguous matrix via MAGMA's
 * LU factorization (sgetrf) followed by sgetri. When params->inplace,
 * A is overwritten; otherwise *A_inv receives a copy that is inverted.
 * Returns 0 on success, -1 on failure with a Python exception set. */
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
                              PARAMS_TYPE *params) {
  const size_t *dims;
  magma_int_t N, ldwork, info;
  magma_int_t *piv = NULL;
  gpudata *dwork = NULL;
  int res = -1;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaMatrixInverse: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix is not square");
    goto fail;
  }
  if (params->inplace) {
    // Output aliases the input; take a new reference for *A_inv.
    Py_XDECREF(*A_inv);
    *A_inv = A;
    Py_INCREF(*A_inv);
  } else {
    *A_inv = aesara_try_copy(*A_inv, A);
    if (*A_inv == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: failed to allocate memory for the output");
      goto fail;
    }
  }
  // magma matrix inverse
  N = dims[0];
  /* Device workspace for sgetri, sized per MAGMA's blocking factor. */
  ldwork = N * magma_get_sgetri_nb(N);
  dwork = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (dwork == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaMatrixInverse: failed to allocate working memory");
    goto fail;
  }
  if (magma_imalloc_cpu(&piv, N)) {
    PyErr_SetString(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: failed to allocate memory for the pivot array");
    goto fail;
  }
  magma_sgetrf_gpu(N, N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetrf_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  magma_sgetri_gpu(N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv,
                   *(float **)dwork, ldwork, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetri_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  res = 0;
fail:
  if (piv != NULL)
    // Fix: piv was allocated with magma_imalloc_cpu, so it must be
    // released with magma_free_cpu (magma_free is for device memory);
    // matches how iwork_data is handled in magma_eigh above.
    magma_free_cpu(piv);
  if (dwork != NULL)
    gpudata_release(dwork);
  cuda_exit(params->context->ctx);
  return res;
}
#section kernels
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
// Zero the strictly lower-triangular part of a row-major matrix stored as a
// flat buffer of `nthreads` elements with `ncols` columns. `a_off` is the
// byte offset of the data within the GPU allocation `a`.
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix > iy) {  // strictly below the diagonal -> clear
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code
// Return a C-contiguous copy of `src` with dimension `dim` shrunk to `size`
// (keeping the leading `size` entries along that axis). Works by taking a
// view, shrinking the view's dimension in place, refreshing the flags, and
// copying. NOTE(review): the view's dimension is mutated without restoring
// it, but the view is a fresh object local to this call, so `src` itself is
// not affected.
static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
                                      size_t size) {
  PyGpuArrayObject *src_view = pygpu_view(src, Py_None);
  src_view->ga.dimensions[dim] = size;
  GpuArray_fix_flags(&src_view->ga);
  return pygpu_copy(src_view, GA_C_ORDER);
}
#section support_code_struct
/* GpuMagmaQR: QR decomposition of a float32 matrix using MAGMA.
 *
 * A_    -- input matrix; must be GA_FLOAT and C-contiguous. It is copied
 *          to column-major order before calling MAGMA, so A_ is unchanged.
 * R     -- output, the K x N upper-triangular factor (K = min(M, N)).
 * Q     -- output, the M x K orthonormal factor; only computed (and only
 *          valid) when params->complete is set.
 * params -- op parameters (GPU context, complete flag).
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
                             PyGpuArrayObject **R,
                             PyGpuArrayObject **Q, // may be NULL
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t M, N, K, nb, ldwork;
  size_t n2;
  float *tau_data = NULL;      /* pinned host buffer for Householder scalars */
  gpudata *work_data = NULL;   /* device workspace for geqrf/orgqr */
  int res = -1, info;
  A = A_;
  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaQR: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaQR: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
    goto fail;
  }
  // MAGMA expects column-major storage.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to change to column-major order");
    goto fail;
  }
  // magma matrix qr
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
  K = M < N ? M : N;
  // NOTE(review): only K scalars are needed for tau; N * N matches the
  // historical allocation and is kept for safety.
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  nb = magma_get_sgeqrf_nb(M, N);
  ldwork = (2 * K + magma_roundup(N, 32)) * nb;
  work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (work_data == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  // compute R
  magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  // R is the upper triangle of the first K rows of the factored matrix.
  *R = pygpu_narrow(A, 0, K);
  if (*R == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
    goto fail;
  }
  n2 = K * N;
  res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
  if (res != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
                 GpuKernel_error(&k_triu_kernel, res));
    goto fail;
  }
  if (params->complete) {
    // compute Q: refactor a fresh column-major copy, then build the
    // orthonormal factor with sorgqr.
    Py_XDECREF(A);
    A = pygpu_copy(A_, GA_F_ORDER);
    if (A == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaQR: failed to change to column-major order");
      // BUGFIX: was `return -1;`, which leaked tau_data/work_data and
      // skipped cuda_exit(); route through the common cleanup instead.
      goto fail;
    }
    magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, nb, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    // Q is the first K columns of the orthonormal matrix.
    *Q = pygpu_narrow(A, 1, K);
    if (*Q == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
      goto fail;
    }
  }
  res = 0;
fail:
  if (tau_data != NULL)
    magma_free_pinned(tau_data);
  if (work_data != NULL)
    gpudata_release(work_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* GpuMagmaSVD: singular value decomposition of a float32 matrix using
 * MAGMA's divide-and-conquer driver magma_sgesdd.
 *
 * A      -- input matrix; must be GA_FLOAT and C-contiguous.
 * S      -- output, the K = min(M, N) singular values.
 * U, VT  -- outputs for the singular vectors; both NULL when only the
 *           singular values are wanted.
 * params -- op parameters (GPU context, full_matrices flag).
 *
 * MAGMA works on column-major data, so the row-major input is treated as
 * its transpose: M/N are swapped below, and the roles of the computed U
 * and VT buffers are exchanged when writing the outputs (see the comments
 * near the end) to match numpy.linalg.svd.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
                           PyGpuArrayObject **S,
                           PyGpuArrayObject **U, // may be NULL
                           PyGpuArrayObject **VT, // may be NULL
                           PARAMS_TYPE *params) {
  bool compute_uv = (U != NULL);
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  // All working buffers are pinned host memory.
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];  // receives the optimal workspace size from the query call
  int res = -1, lwork;
  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }
  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = M < N ? M : N;
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // NOTE(review): a_data is pinned host memory; this relies on unified
  // addressing for cudaMemcpyDeviceToDevice to resolve the host pointer —
  // confirm, or consider cudaMemcpyDefault.
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (compute_uv) {
    if (params->full_matrices) {
      jobz = MagmaAllVec;
    } else {
      jobz = MagmaSomeVec;
    }
    // Sizes of the vector outputs depend on full vs. reduced SVD.
    M_U  = (jobz == MagmaAllVec ? M : K);
    N_VT = (jobz == MagmaAllVec ? N : K);
    ldu = M;
    ldv = N_VT;
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  } else {
    jobz = MagmaNoVec;
    ldu = M;
    ldv = N;
  }
  // query for workspace size (lwork = -1 convention)
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);
  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  // 8*K integer workspace, as required by the sgesdd documentation.
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  s_dims[0] = K;
  if (aesara_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (compute_uv) {
    u_dims[0] = N; u_dims[1] = N_VT;
    if (aesara_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
    // to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
               cudaMemcpyDeviceToDevice);
    vt_dims[0] = M_U; vt_dims[1] = M;
    if (aesara_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
    // to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
               cudaMemcpyDeviceToDevice);
  }
  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  cuda_exit(params->context->ctx);
  return res;
}
#section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 2D max pooling forward pass over a (num, channels, height, width) input
// `x`, writing the pooled (num, channels, pooled_height, pooled_width)
// output `z`. One loop iteration handles one output element.
KERNEL void max_pool2d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
    // Window start may be conceptually negative (padding); hstart + kernel_h
    // still yields the right value under unsigned modular arithmetic.
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    const ga_size offset = (n*channels + c) * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
    for (ga_size h=hstart; h < hend; ++h) {
      for (ga_size w=wstart; w < wend; ++w) {
        // maximum in the region
        if (x_slice[h*width + w] > maxval) {
          maxval = x_slice[h*width + w];
        }
      }
    }
    z[index] = maxval;
  }
}
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D max pooling forward pass over a (num, channels, depth, height, width)
// input `x`; one loop iteration computes one pooled output element of `z`.
KERNEL void max_pool3d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
   const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
   const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, pd, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
    const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
    const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
    // Window bounds; negative starts (padding) still produce correct ends
    // through unsigned modular arithmetic, then get clamped to 0 below.
    ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
    const ga_size dend = min(dstart + kernel_d, depth);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    const ga_size offset = (n*channels + c) * depth * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
    for (ga_size d=dstart; d < dend; ++d) {
      for (ga_size h=hstart; h < hend; ++h) {
        for (ga_size w=wstart; w < wend; ++w) {
          // maximum in the region
          if (x_slice[(d*height + h)*width + w] > maxval) {
            maxval = x_slice[(d*height + h)*width + w];
          }
        }
      }
    }
    z[index] = maxval;
  }
}
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 2D average/sum pooling forward pass. With sum_mode the window sum is
// written; otherwise the sum is divided by the window size, which counts
// padded positions when inc_pad is set and only in-bounds positions
// otherwise.
KERNEL void ave_pool2d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
   const ga_bool inc_pad, const ga_bool sum_mode,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    ga_size hend = min(hstart + kernel_h, height + pad_h);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    ga_size wend = min(wstart + kernel_w, width + pad_w);
    ga_size pool_size;
    // Window size including the padding...
    if (inc_pad) {
      pool_size = (hend - hstart) * (wend - wstart);
    }
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    hend = min(hend, height);
    wend = min(wend, width);
    // ...or only the in-bounds part, after clipping to the image.
    if (!inc_pad) {
      pool_size = (hend - hstart) * (wend - wstart);
    }
    const ga_size offset = (n*channels + c) * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size h=hstart; h < hend; ++h) {
      for (ga_size w=wstart; w < wend; ++w) {
        collector += x_slice[h * width + w];
      }
    }
    if (sum_mode) {
      z[index] = collector;
    }
    else {
      z[index] = collector / pool_size;
    }
  }
}
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D average/sum pooling forward pass; the 3D analogue of
// ave_pool2d_kernel above (same inc_pad/sum_mode semantics).
KERNEL void ave_pool3d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
   const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
   const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
   const ga_bool inc_pad, const ga_bool sum_mode,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // grid stride looping
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, pd, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
    const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
    const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
    ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
    ga_size dend = min(dstart + kernel_d, depth + pad_d);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    ga_size hend = min(hstart + kernel_h, height + pad_h);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    ga_size wend = min(wstart + kernel_w, width + pad_w);
    ga_size pool_size;
    // Window volume including the padding...
    if (inc_pad) {
      pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
    }
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dend = min(dend, depth);
    hend = min(hend, height);
    wend = min(wend, width);
    // ...or only the in-bounds part, after clipping to the volume.
    if (!inc_pad) {
      pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
    }
    const ga_size offset = (n*channels + c) * depth * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size d=dstart; d < dend; ++d) {
      for (ga_size h=hstart; h < hend; ++h) {
        for (ga_size w=wstart; w < wend; ++w) {
          collector += x_slice[(d * height + h) * width + w];
        }
      }
    }
    if (sum_mode) {
      z[index] = collector;
    }
    else {
      z[index] = collector / pool_size;
    }
  }
}
#section support_code
// output shape for a given input padded shape, window shape and stride
// We use ssize_t in the max since this is done to avoid negative results.
// With ignore_border the last (partial) window is dropped; otherwise every
// position that starts inside the input produces an output element.
#define OUTPUT_DIMS(in_dim, ws, st, ignore_border) \
  (ignore_border ? (in_dim - ws)/st + 1 :          \
                   (st > ws ? (in_dim - 1)/st + 1 :      \
                    std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct
/* GpuPool: forward pooling (max / average / sum) over the trailing 2 or 3
 * dimensions of a (batch, channels, ...) input.
 *
 * x      -- input array; must be C-contiguous, rank = len(ws) + 2.
 * ws     -- int64 vector of window sizes per pooled dimension.
 * stride -- int64 vector of strides per pooled dimension.
 * pad    -- int64 vector of paddings per pooled dimension (requires
 *           params->ignore_border when nonzero).
 * z      -- output array, allocated here.
 * params -- op parameters (mode, ignore_border, context).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
                         PyArrayObject *ws,
                         PyArrayObject *stride,
                         PyArrayObject *pad,
                         PyGpuArrayObject **z,
                         PARAMS_TYPE* params) {
  bool max_pool = (params->mode == POOLING_MAX);
  bool inc_pad = (params->mode != POOLING_AVERAGE_COUNT_EXCLUDE_PADDING);
  bool sum_mode = (params->mode == POOLING_SUM);
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuPool: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuPool: rank error");
      return 1;
    }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3]; z_dims[0] = x_dims[0]; z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  for (int i = 0; i < ndims; i++) {
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!params->ignore_border && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuPool: padding works only with ignore_border=True");
    return 1;
  }
  if (aesara_prep_output(z, PyGpuArray_NDIM(x), z_dims,
                         x->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuPool: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;
    if (ndims == 2) {
      // One kernel thread (logical) per output element.
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      if (max_pool) {
        err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                      x_dims[2], x_dims[3],
                                      x->ga.data, x->ga.offset, w[0], w[1], s[0], s[1], p[0], p[1],
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: max_pool2d_kernel %s.",
                       GpuKernel_error(&k_max_pool2d_kernel, err));
          return 1;
        }
      } else {
        err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                      x_dims[2], x_dims[3],
                                      x->ga.data, x->ga.offset,
                                      w[0], w[1], s[0], s[1], p[0], p[1],
                                      inc_pad, sum_mode,
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: ave_pool2d_kernel %s.",
                       GpuKernel_error(&k_ave_pool2d_kernel, err));
          return 1;
        }
      }
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      if (max_pool) {
        err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                      x_dims[2], x_dims[3], x_dims[4],
                                      x->ga.data, x->ga.offset, w[0], w[1], w[2], s[0], s[1], s[2],
                                      p[0], p[1], p[2], (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          // BUGFIX: the error string must come from the 3d kernel handle;
          // this previously queried k_max_pool2d_kernel (copy-paste).
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: max_pool3d_kernel %s.",
                       GpuKernel_error(&k_max_pool3d_kernel, err));
          return 1;
        }
      } else {
        err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                      x_dims[2], x_dims[3], x_dims[4],
                                      x->ga.data, x->ga.offset,
                                      w[0], w[1], w[2], s[0], s[1], s[2],
                                      p[0], p[1], p[2],
                                      inc_pad, sum_mode,
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: ave_pool3d_kernel %s.",
                       GpuKernel_error(&k_ave_pool3d_kernel, err));
          return 1;
        }
      }
    }
  }
  return 0;
}
#section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Gradient of 2D average/sum pooling: each input position accumulates the
// output gradients `gz` of every pooling window that covered it (divided by
// the window size unless sum_mode). `x` and inc_pad are part of the call
// signature but unused in the computation.
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size height,
   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
   GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)gz) + gz_off);
  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // Decompose the flat input index into (n, c, h, w).
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size c = (index / width / height) % channels;
    const ga_size n = (index / width / height / channels);
    // Range of pooled positions whose windows include (h, w).
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
    GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size ph=phstart; ph < phend; ++ph) {
      for (ga_size pw=pwstart; pw < pwend; ++pw) {
        if (sum_mode) {
          // BUGFIX: must read from gz_slice (the (n, c) channel slice),
          // not from gz, matching the non-sum branch below.
          collector += gz_slice[ph*pooled_width + pw];
        } else {
          // figure out the pooling size
          const ga_size hstart = ph * stride_h - pad_h;
          const ga_size wstart = pw * stride_w - pad_w;
          const ga_size hend = min(hstart + kernel_h, height + pad_h);
          const ga_size wend = min(wstart + kernel_w, width + pad_w);
          const ga_size pool_size = (hend - hstart) * (wend - wstart);
          collector += gz_slice[ph*pooled_width + pw] / pool_size;
        }
      }
    }
    gx[index] = collector;
  }
}
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Gradient of 3D average/sum pooling; 3D analogue of
// ave_pool2d_grad_kernel. `x` and inc_pad are part of the call signature
// but unused in the computation.
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size depth,
   const ga_size height, const ga_size width, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
   const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)gz) + gz_off);
  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // Decompose the flat input index into (n, c, d, h, w).
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size d = (index / width / height) % depth;
    const ga_size c = (index / width / height / depth) % channels;
    const ga_size n = (index / width / height / depth / channels);
    // Range of pooled positions whose windows include (d, h, w).
    const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
    const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
    GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size pd=pdstart; pd < pdend; ++pd) {
      for (ga_size ph=phstart; ph < phend; ++ph) {
        for (ga_size pw=pwstart; pw < pwend; ++pw) {
          if (sum_mode) {
            // BUGFIX: read from the (n, c) slice gz_slice with the full 3D
            // index (the depth term pd was missing, and the base pointer
            // lacked the channel offset).
            collector += gz_slice[(pd*pooled_height + ph)*pooled_width + pw];
          } else {
            // figure out the pooling size
            const ga_size dstart = pd * stride_d - pad_d;
            const ga_size hstart = ph * stride_h - pad_h;
            const ga_size wstart = pw * stride_w - pad_w;
            const ga_size dend = min(dstart + kernel_d, depth + pad_d);
            const ga_size hend = min(hstart + kernel_h, height + pad_h);
            const ga_size wend = min(wstart + kernel_w, width + pad_w);
            const ga_size pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
            // BUGFIX: same indexing fix as the sum branch (was
            // gz[ph*pooled_width + pw]).
            collector += gz_slice[(pd*pooled_height + ph)*pooled_width + pw] / pool_size;
          }
        }
      }
    }
    gx[index] = collector;
  }
}
#section support_code_struct
/* GpuAveragePoolGrad: gradient of average/sum pooling with respect to the
 * input, for 2 or 3 pooled dimensions.
 *
 * x      -- forward input; determines the output (gradient) shape.
 * gz     -- gradient of the pooled output.
 * ws, stride, pad -- int64 vectors describing the pooling geometry.
 * gx     -- output gradient w.r.t. x, allocated here.
 * params -- op parameters (mode, context).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PARAMS_TYPE* params) {
  bool inc_pad = (params->mode == POOLING_AVERAGE_COUNT_INCLUDE_PADDING);
  bool sum_mode = (params->mode == POOLING_SUM);
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      // BUGFIX: error messages previously said "GpuMaxPoolGrad" (copied
      // from the max-pool gradient), inconsistent with the
      // "GpuAveragePoolGrad" messages in the kernel error paths below.
      PyErr_Format(PyExc_ValueError,
                   "GpuAveragePoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuAveragePoolGrad: rank error");
      return 1;
    }
  // The gradient has the same shape as the forward input.
  if (aesara_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuAveragePoolGrad: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    for(int i = 0; i < ndims; i++) {
      w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    }
    int err;
    const size_t* z_dims = PyGpuArray_DIMS(gz);
    const size_t* x_dims = PyGpuArray_DIMS(x);
    if (ndims == 2) {
      // One kernel thread (logical) per input element.
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, x->ga.offset,
                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         inc_pad, sum_mode,
                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, x->ga.offset,
                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], inc_pad, sum_mode,
                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
#section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// Second-order gradient of 2D max pooling. For each pooled position, sum
// the entries of `gx` at input positions whose value equals the pooled
// maximum `z` (all ties contribute, matching the forward max selection).
// x  -- forward input, z -- forward pooled output, gx -- gradient w.r.t.
// the first-order gradient; result written to gz (one value per pooled
// position).
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
  gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gx) + gx_off);
  gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gz) + gz_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // Decompose the flat pooled index into (n, c, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    const ga_size offset = (n*channels + c) * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
    DTYPE_OUTPUT_0 gradient = 0;
    for (ga_size h=hstart; h < hend; ++h) {
      for (ga_size w=wstart; w < wend; ++w) {
        // maximum in the region
        if (z[index] == x_slice[h * width + w]) {
          gradient += gx_slice[h * width + w];
        }
      }
    }
    gz[index] = gradient;
  }
}
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// 3D variant of max_pool2d_grad_grad_kernel: one thread per pooled output
// element; sums gx over every in-window input position whose value equals
// the stored maximum z[index] and writes the sum to gz[index].
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{
// Apply the raw byte offsets to the array pointers.
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gx) + gx_off);
gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gz) + gz_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, pd, ph, pw) of the pooled output.
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
// Window bounds per axis; starts may be negative before clamping (padding).
ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// Offset of this (n, c) volume in the flat x / gx buffers.
const ga_size offset = (n*channels + c) * depth * height * width;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (z[index] == x_slice[(d * height + h) * width + w]) {
gradient += gx_slice[(d * height + h)* width + w];
}
}
}
}
gz[index] = gradient;
}
}
#section support_code_struct
// Driver for GpuPoolingGradGrad: validates layouts and ranks, allocates gz
// with the shape/dtype of the pooled output z, then dispatches the 2D or 3D
// max-pool grad-grad kernel.  Returns 0 on success, 1 with a Python
// exception set on failure.
int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *z,
PyGpuArrayObject *gx,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **gz,
PyGpuContextObject *ctx) {
// The kernels index with flat C-order arithmetic, so inputs must be
// C-contiguous.
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&z->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&gx->ga))
{
PyErr_Format(PyExc_ValueError,
"GpuPoolingGradGrad: requires data to be C-contiguous");
return 1;
}
// ndims = number of pooled dimensions (2 or 3); arrays carry two leading
// (batch, channel) dimensions on top of that.
size_t ndims = PyArray_DIM(ws, 0);
if (PyGpuArray_NDIM(x) != ndims + 2
|| PyGpuArray_NDIM(z) != ndims + 2
|| PyGpuArray_NDIM(gx) != ndims + 2)
{
PyErr_SetString(PyExc_ValueError, "GpuPoolingGradGrad: rank error");
return 1;
}
if (aesara_prep_output(gz, PyGpuArray_NDIM(z), PyGpuArray_DIMS(z),
z->ga.typecode, GA_C_ORDER, ctx) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuPoolingGradGrad: failed to allocate memory");
return 1;
}
{
// scope for running kernel
// Per-dimension window shape, stride and padding (stored as npy_int64
// in the host-side arrays).
size_t w[3];
size_t s[3];
size_t p[3];
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
}
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
if (ndims == 2) {
// One thread per element of z (the pooled output).
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
GpuKernel_error(&k_max_pool2d_grad_grad_kernel, err));
return 1;
}
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",
GpuKernel_error(&k_max_pool3d_grad_grad_kernel, err));
return 1;
}
}
}
return 0;
}
#section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Gradient of 2D max pooling.  One thread per INPUT element: find all
// pooled outputs whose window covers this input position and accumulate
// their gz wherever this input equals the recorded maximum z.
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
// Apply the raw byte offsets to the array pointers.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, h, w) of the input.
const ga_size w = index % width;
const ga_size h = (index / width) % height;
const ga_size c = (index / width / height) % channels;
const ga_size n = (index / width / height / channels);
// Range of pooled outputs whose windows contain (h, w).
const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
// Offset of this (n, c) plane in the flat z / gz buffers.
const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size ph=phstart; ph < phend; ++ph) {
for (ga_size pw=pwstart; pw < pwend; ++pw) {
if (x[index] == z_slice[ph * pooled_width + pw]) {
gradient += gz_slice[ph * pooled_width + pw];
}
}
}
gx[index] = gradient;
}
}
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D variant of max_pool2d_grad_kernel: one thread per INPUT element;
// accumulates gz over all pooled outputs whose window covers this input
// position and whose recorded maximum z equals this input value.
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
// Apply the raw byte offsets to the array pointers.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, d, h, w) of the input.
const ga_size w = index % width;
const ga_size h = (index / width) % height;
const ga_size d = (index / width / height) % depth;
const ga_size c = (index / width / height / depth) % channels;
const ga_size n = (index / width / height / depth / channels);
// Range of pooled outputs whose windows contain (d, h, w).
const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
// Offset of this (n, c) volume in the flat z / gz buffers.
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) {
for (ga_size ph=phstart; ph < phend; ++ph) {
for (ga_size pw=pwstart; pw < pwend; ++pw) {
if (x[index] == z_slice[(pd * pooled_height + ph) * pooled_width + pw]) {
gradient += gz_slice[(pd * pooled_height + ph) * pooled_width + pw];
}
}
}
}
gx[index] = gradient;
}
}
#section support_code_struct
// Driver for GpuMaxPoolGrad: validates layouts and ranks, allocates gx with
// the shape/dtype of the input x, then dispatches the 2D or 3D max-pool
// gradient kernel.  Returns 0 on success, 1 with a Python exception set.
int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *z,
PyGpuArrayObject *gz,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **gx,
PyGpuContextObject *ctx) {
// The kernels index with flat C-order arithmetic, so inputs must be
// C-contiguous.
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&z->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
{
PyErr_Format(PyExc_ValueError,
"GpuMaxPoolGrad: requires data to be C-contiguous");
return 1;
}
// ndims = number of pooled dimensions (2 or 3); arrays carry two leading
// (batch, channel) dimensions on top of that.
size_t ndims = PyArray_DIM(ws, 0);
if (PyGpuArray_NDIM(x) != ndims + 2
|| PyGpuArray_NDIM(z) != ndims + 2
|| PyGpuArray_NDIM(gz) != ndims + 2)
{
PyErr_SetString(PyExc_ValueError, "GpuMaxPoolGrad: rank error");
return 1;
}
if (aesara_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
x->ga.typecode, GA_C_ORDER, ctx) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuMaxPoolGrad: failed to allocate memory");
return 1;
}
{
// scope for running kernel
// Per-dimension window shape, stride and padding (stored as npy_int64
// in the host-side arrays).
size_t w[3];
size_t s[3];
size_t p[3];
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
}
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
if (ndims == 2) {
// One thread per element of x (the un-pooled input).
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
GpuKernel_error(&k_max_pool2d_grad_kernel, err));
return 1;
}
} else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
GpuKernel_error(&k_max_pool3d_grad_kernel, err));
return 1;
}
}
}
return 0;
}
#section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// R-op of 2D max pooling.  One thread per pooled output element: track the
// running maximum of x over the pooling window and carry along the matching
// element of ex (the evaluation point); the carried value is written to z.
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
// Apply the raw byte offsets.  Fixed: the casts previously contained a
// stray identifier -- "(GLOBAL_MEM DTYPE_INPUT_0 *x)" -- which is not a
// valid cast and would not compile.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
ex = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, ph, pw) of the pooled output.
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
// C-style casts replace static_cast for consistency with the other
// kernels in this file (cluda kernel sources are not guaranteed C++).
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
// Seed with the first in-window element, then scan the rest.
DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[h*width + w] > maxval) {
maxval = x_slice[h*width + w];
collector = ex_slice[h*width + w];
}
}
}
z[index] = collector;
}
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D variant of max_pool2d_rop_kernel: one thread per pooled output element;
// carries along the ex value of the in-window maximum of x into z.
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
// Fixed: the last parameter was declared "x_off" (a duplicate of the x
// offset), leaving the "z_off" used below undeclared.  Also fixed the
// invalid "(... *x)" casts, as in max_pool2d_rop_kernel.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
ex = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, pd, ph, pw) of the pooled output.
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
// C-style casts replace static_cast for consistency with the other
// kernels in this file (cluda kernel sources are not guaranteed C++).
ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
// Seed with the first in-window element, then scan the rest.
DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[(d*height + h)*width + w] > maxval) {
maxval = x_slice[(d*height + h)*width + w];
collector = ex_slice[(d*height + h)*width + w];
}
}
}
}
z[index] = collector;
}
}
#section support_code
// output shape for a given input padded shape, window shape and stride
// - ignore_border: only complete windows contribute -> (in_dim - ws)/st + 1
// - otherwise an incomplete window at the border adds one more output; the
//   two sub-branches distinguish st > ws (windows with gaps between them)
//   from overlapping/adjacent windows, clamping at 0 for degenerate sizes.
#define OUTPUT_DIMS(in_dim, ws, st, ignore_border) \
(ignore_border ? (in_dim - ws)/st + 1 : \
(st > ws ? (in_dim - 1)/st + 1 : \
std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct
// Driver for GpuMaxPoolRop: validates inputs, computes the pooled output
// shape via OUTPUT_DIMS, allocates z with ex's dtype, and dispatches the
// 2D or 3D R-op kernel.  Returns 0 on success, 1 with a Python exception
// set on failure.
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PARAMS_TYPE* params) {
  // The kernels index with flat C-order arithmetic, so inputs must be
  // C-contiguous.
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolRop: requires data to be C-contiguous");
      return 1;
    }
  // ndims = number of pooled dimensions (2 or 3); arrays carry two leading
  // (batch, channel) dimensions on top of that.
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
      return 1;
    }
  // prepare output: batch/channel dims are copied from x, spatial dims
  // follow OUTPUT_DIMS applied to the padded input.
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3];
  z_dims[0] = x_dims[0];
  z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  for (int i = 0; i < ndims; i++) {
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!params->ignore_border && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }
  if (aesara_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;
    if (ndims == 2) {
      // One thread per element of z (the pooled output).
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
                                        x->ga.data, x->ga.offset,
                                        ex->ga.data, ex->ga.offset,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data, (*z)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, x->ga.offset,
                                        ex->ga.data, ex->ga.offset,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2],
                                        (*z)->ga.data, (*z)->ga.offset);
      if (err != GA_NO_ERROR) {
        // Fixed: this branch previously queried the 2D kernel object
        // (k_max_pool2d_rop_kernel) for the 3D kernel's error string.
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
// modified from pytorch
// https://github.com/pytorch/pytorch/blob/master/torch/lib/THC/THCTensorTopK.cuh
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
// __ldg (load through the read-only data cache) only exists on compute
// capability >= 3.5; fall back to a plain dereference on older devices.
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
#endif
typedef ptrdiff_t ssize_t;
// Lane index of the calling thread within its warp (PTX %laneid register).
// NOTE(review): this asm string escapes laneid with a single '%' while the
// lanemask helpers below use '%%'; confirm it assembles as intended.
__device__ __forceinline__ int lane_id() {
int id;
asm("mov.s32 %0, %laneid;" : "=r"(id) );
return id;
}
// Bitmask of lanes whose id is strictly lower than the calling lane's.
__device__ __forceinline__ unsigned lane_mask_lt() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
return mask;
}
// Bitmask of lanes whose id is lower than or equal to the calling lane's.
__device__ __forceinline__ unsigned lane_mask_le() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
return mask;
}
// Bitmask of lanes whose id is strictly greater than the calling lane's.
__device__ __forceinline__ unsigned lane_mask_gt() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
return mask;
}
// Bitmask of lanes whose id is greater than or equal to the calling lane's.
__device__ __forceinline__ unsigned lane_mask_ge() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
return mask;
}
// Bitfield<T>: thin wrappers over the PTX bit-field extract (bfe) and
// insert (bfi) instructions, specialized for 32- and 64-bit unsigned keys.
template <typename T>
struct Bitfield {};
template <>
struct Bitfield<unsigned int> {
// Extract `len` bits of `val` starting at bit position `pos`.
static __device__ __forceinline__
unsigned int get(unsigned int val, int pos, int len) {
unsigned int ret;
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
return ret;
}
// Return `val` with `len` bits at position `pos` replaced by `toInsert`.
static __device__ __forceinline__
unsigned int set(unsigned int val, unsigned int toInsert, int pos, int len) {
unsigned int ret;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
return ret;
}
};
template <>
struct Bitfield<unsigned long long int> {
// Extract `len` bits of `val` starting at bit position `pos`.
static __device__ __forceinline__
unsigned long long int get(unsigned long long int val, int pos, int len) {
unsigned long long int ret;
asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
return ret;
}
// Return `val` with `len` bits at position `pos` replaced by `toInsert`.
static __device__ __forceinline__
unsigned long long int set(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
unsigned long long int ret;
asm("bfi.b64 %0, %1, %2, %3, %4;" :
"=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
return ret;
}
};
template <typename T>
struct RadixConfig {
  // Converts a type (maybe float) to an integer representation with the same
  // sorting; i.e., for floats f1, f2:
  //   if f1 < f2 then convert(f1) < convert(f2)
  // We use this to enable radix selection of floating-point values.
  // This also gives a relative order for NaNs, but that's ok, as they
  // will all be adjacent
  typedef unsigned int RadixType;
  static inline __device__ RadixType convert(T v) {
    return (RadixType)v;
  }
  // Fixed: the generic deconvert previously declared its return type as
  // `float` instead of `T`, silently converting any type that fell through
  // to this default implementation.
  static inline __device__ T deconvert(RadixType v) {
    return (T)v;
  }
};
// Floats: flip only the sign bit for non-negatives and ALL bits for
// negatives, so the unsigned bit patterns sort in the same order as the
// original float values.
template <>
struct RadixConfig<float> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(float v) {
RadixType x = __float_as_int(v);
RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
return (x ^ mask);
}
static inline __device__ float deconvert(RadixType v) {
RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
return __int_as_float(v ^ mask);
}
};
// Doubles: same sign-flip trick on 64 bits; the mask is all-ones when the
// sign bit is set, otherwise just the sign bit.
template <>
struct RadixConfig<double> {
typedef unsigned long long RadixType;
static inline __device__ RadixType convert(double v) {
RadixType x = __double_as_longlong(v);
RadixType mask = -((x >> 63)) | 0x8000000000000000;
return (x ^ mask);
}
static inline __device__ double deconvert(RadixType v) {
RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
return __longlong_as_double(v ^ mask);
}
};
// Signed char: bias by 128 so the unsigned representation preserves order.
template <>
struct RadixConfig<char> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(char v) {
return 128u + v;
}
static inline __device__ char deconvert(RadixType v) {
return v - 128;
}
};
// g++ makes difference between 'signed char' (ga_byte, int8) and 'char'.
// Same code as for char.
template <>
struct RadixConfig<ga_byte> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(ga_byte v) {
return 128u + v;
}
static inline __device__ ga_byte deconvert(RadixType v) {
return v - 128;
}
};
// short: flip the sign bit (XOR with 2^15) to map onto order-preserving
// unsigned values; undone by subtracting 2^15.
template <>
struct RadixConfig<short> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(short v) {
assert(sizeof(short) == 2);
return 32768u ^ v;
}
static inline __device__ short deconvert(RadixType v) {
return v - 32768;
}
};
// int: bias by 2^31.
template <>
struct RadixConfig<int> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(int v) {
assert(sizeof(int) == 4);
return 2147483648u + v;
}
static inline __device__ int deconvert(RadixType v) {
return v - 2147483648u;
}
};
// long long: bias by 2^63.
template <>
struct RadixConfig<long long> {
typedef unsigned long long RadixType;
static inline __device__ RadixType convert(long long v) {
assert(sizeof(long long) == 8);
return 9223372036854775808ull + v;
}
static inline __device__ long long deconvert(RadixType v) {
return v - 9223372036854775808ull;
}
};
/* NB: This specialization for ga_half does know that ga_half is a struct with only one member of type ga_ushort.
 * So, if ga_half implementation changes, this code should change too.
 * TODO: Maybe should gpuarray provide abstract functions to manipulate ga_half internal structure? e.g:
 * unsigned short ga_half2bits(ga_half value);
 * ga_half ga_bits2half(unsigned short bits);
 */
// Half precision: same sign-flip trick as the float specialization, but on
// the 16-bit pattern stored in ga_half's single `data` member.
template <>
struct RadixConfig<ga_half> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(ga_half v) {
// mask = all-ones if the sign bit (bit 15) is set, else just bit 15.
RadixType mask = -(((RadixType)v.data >> 15)) | 0x8000;
return (v.data ^ mask);
}
static inline __device__ ga_half deconvert(RadixType v) {
RadixType mask = ((v >> 15) - 1) | 0x8000;
ga_half out = {(unsigned short)(v ^ mask)};
return out;
}
};
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
#define NDIM $ndim
#define INPUT_TYPE $inp_t
#define INDEX_TYPE $out_t
#define bitsof(T) (sizeof(T)*8)
#define radix_t RadixConfig<INPUT_TYPE>::RadixType
#define WRITE_VALUE $write_value
#define WRITE_INDEX $write_index
#if RADIX_SIZE > 32
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
// Overload so atomicAdd also accepts signed 64-bit operands: reinterpret
// the bits as unsigned long long and use the built-in unsigned overload.
void __device__ atomicAdd(long long *dst, long long &src) {
atomicAdd(
reinterpret_cast<unsigned long long*>(dst),
reinterpret_cast<unsigned long long&>(src));
}
template <typename T>
static inline __device__ T binary_cumsum(
int idx, int warp_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *no greater than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)
// cumsum within warp
// __ballot packs each lane's boolean into one warp-wide bitmask; popc of
// the bits at or below this lane gives the inclusive warp-local cumsum.
unsigned int warp_bits = __ballot(value);
T warp_sum = __popc(lane_mask_le() & warp_bits);
// Lane 0 of each warp publishes the warp's total to shared memory.
if (lane_id() == 0)
smem[warp_id] = __popc(warp_bits);
local_barrier();
// cumsum across warps in one thread
if (idx == 0) {
T sum = smem[0];
for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
sum += smem[i];
smem[i] = sum;
}
}
local_barrier();
// load the carry from the preceding warp
if (warp_id >= 1) {
warp_sum = warp_sum+smem[warp_id - 1];
}
return warp_sum;
}
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
int idx, int warp_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *less than* the current thread
// binary_cumsum_excl(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// cumsum within warp
// Same structure as binary_cumsum, but lane_mask_lt() excludes the calling
// lane's own bit, making the warp-local scan exclusive.
unsigned int warp_bits = __ballot(value);
T warp_sum = __popc(lane_mask_lt() & warp_bits);
// Lane 0 of each warp publishes the warp's total to shared memory.
if (lane_id() == 0)
smem[warp_id] = __popc(warp_bits);
local_barrier();
// cumsum across warps in one thread
if (idx == 0) {
T sum = smem[0];
for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
sum += smem[i];
smem[i] = sum;
}
}
local_barrier();
// load the carry from the preceding warp
if (warp_id >= 1)
warp_sum += smem[warp_id - 1];
return warp_sum;
}
// apply raw(byte) offset to pointer
template <typename T>
static __device__ inline T* ptr_add(T *ptr, ssize_t offset) {
return (T*)((char*)ptr + offset);
}
// get array element using raw(byte) offset
template <typename T>
static __device__ inline T& ptr_at(T *ptr, ssize_t offset) {
return *((T*)((char*)ptr + offset));
}
// read array element using raw(byte) offset
// (goes through __ldg, i.e. the read-only data cache where available)
template <typename T>
static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
return __ldg(((T*)((char*)ptr + offset)));
}
/* NB: __ldg is not defined for ga_half, so we must specialize ptr_read_cached.
 * To do it, I try to use a built-in type that should have the same size as ga_half.
 * Based on current ga_half implementation (2017/11/27), it should be ga_ushort.
 * This code must be updated every time ga_half implementation size changes,
 * until a better code be provided. */
#define GA_HALF_STD_TYPE ga_ushort
static __device__ inline ga_half ptr_read_cached(ga_half *ptr, ssize_t offset) {
// Compile-time size check: the array below gets a negative size (an error)
// if sizeof(GA_HALF_STD_TYPE) != sizeof(ga_half).
int check_ga_half_std_type[ ( ( sizeof(GA_HALF_STD_TYPE) - sizeof(ga_half) ) ? -1 : 1 ) ];
GA_HALF_STD_TYPE out = __ldg(((GA_HALF_STD_TYPE*)((char*)ptr + offset)));
ga_half real_out;
*(GA_HALF_STD_TYPE*)(&real_out) = out;
return real_out;
}
#undef GA_HALF_STD_TYPE
/* Comparisons involving ga_half and conversions from integers (e.g. 0, 1) to ga_half lead to compilation errors.
* Following functions are provided to bypass these issues. */
template<typename T>
static __device__ inline T aesara_zero() {return 0;}
template<>
__device__ inline ga_half aesara_zero() {return ga_float2half(0);}
template<typename T>
static __device__ inline T aesara_one() {return 1;}
template<>
__device__ inline ga_half aesara_one() {return ga_float2half(1);}
// Comparison helpers: aesara_{eq,ne,lt,gt,le,ge}(a, b).
// The fully generic versions just forward to the built-in operator.  Since
// ga_half supports no comparison operators directly, each comparator also has
// three ga_half overloads (half/other, other/half, half/half) that first
// widen the half operand(s) to float with ga_half2float.
template<typename A, typename B> static __device__ inline bool aesara_eq(const A& a, const B& b) {return a == b;}
template<typename A, typename B> static __device__ inline bool aesara_ne(const A& a, const B& b) {return a != b;}
template<typename A, typename B> static __device__ inline bool aesara_lt(const A& a, const B& b) {return a < b;}
template<typename A, typename B> static __device__ inline bool aesara_gt(const A& a, const B& b) {return a > b;}
template<typename A, typename B> static __device__ inline bool aesara_le(const A& a, const B& b) {return a <= b;}
template<typename A, typename B> static __device__ inline bool aesara_ge(const A& a, const B& b) {return a >= b;}
template<typename T> static __device__ inline bool aesara_eq(const ga_half& a, const T& b) {return ga_half2float(a) == b;}
template<typename T> static __device__ inline bool aesara_ne(const ga_half& a, const T& b) {return ga_half2float(a) != b;}
template<typename T> static __device__ inline bool aesara_lt(const ga_half& a, const T& b) {return ga_half2float(a) < b;}
template<typename T> static __device__ inline bool aesara_gt(const ga_half& a, const T& b) {return ga_half2float(a) > b;}
template<typename T> static __device__ inline bool aesara_le(const ga_half& a, const T& b) {return ga_half2float(a) <= b;}
template<typename T> static __device__ inline bool aesara_ge(const ga_half& a, const T& b) {return ga_half2float(a) >= b;}
template<typename T> static __device__ inline bool aesara_eq(const T& a, const ga_half& b) {return a == ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_ne(const T& a, const ga_half& b) {return a != ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_lt(const T& a, const ga_half& b) {return a < ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_gt(const T& a, const ga_half& b) {return a > ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_le(const T& a, const ga_half& b) {return a <= ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_ge(const T& a, const ga_half& b) {return a >= ga_half2float(b);}
static __device__ inline bool aesara_eq(const ga_half& a, const ga_half& b) {return ga_half2float(a) == ga_half2float(b);}
static __device__ inline bool aesara_ne(const ga_half& a, const ga_half& b) {return ga_half2float(a) != ga_half2float(b);}
static __device__ inline bool aesara_lt(const ga_half& a, const ga_half& b) {return ga_half2float(a) < ga_half2float(b);}
static __device__ inline bool aesara_gt(const ga_half& a, const ga_half& b) {return ga_half2float(a) > ga_half2float(b);}
static __device__ inline bool aesara_le(const ga_half& a, const ga_half& b) {return ga_half2float(a) <= ga_half2float(b);}
static __device__ inline bool aesara_ge(const ga_half& a, const ga_half& b) {return ga_half2float(a) >= ga_half2float(b);}
#define RADIX_BITS 4
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS))
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
// works when length on axis is within max allowed threads in block (1024)
// k_topk_dense: dense top-k selection.  One thread block handles one slice of
// the input along the selected axis, one thread per element, so this kernel
// is only usable when the axis length fits in a single block.  The `$...`
// markers are placeholders substituted by the Python code generator before
// compilation.
extern "C" __global__ void k_topk_dense(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_offset
        // size_t offset
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_offset
        // size_t offset
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        size_t src_offset,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    __shared__ int smem[32 * RADIX_SIZE];
    __shared__ int k2;
    const unsigned int idx = threadIdx.x;
    // is_topk: this thread's element is still a candidate for the top-k set;
    // is_topkth: it is still a candidate for being the k-th (boundary) value.
    bool is_topk= (idx < size);
    bool is_topkth = is_topk;
    size_t out_idx;
    const unsigned char warp_id = idx / GA_WARP_SIZE;
    // 0. get the slice for thread block to work on
    size_t gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i};
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i};
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}
    // get input and its radix friendly form
    const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : aesara_zero<INPUT_TYPE>();
    radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
    // resolve negative k
    // (negative k requests the smallest k; complementing x reverses the radix order)
    if (k<0) { x = ~x; k = -k; }
    if (idx==0)
        k2 = k;
    // 1. filter is_topk and is_topkth using radix select
    // Walk the radix digits from most to least significant; at each step the
    // per-digit histogram determines which bucket holds the k-th value.
    #pragma unroll
    for (int i=bitsof(INPUT_TYPE)-RADIX_BITS; i>=0; i-=RADIX_BITS) {
        const int digit = Bitfield<radix_t>::get(x, i, RADIX_BITS);
        /*int digit = (x>>i) & (RADIX_SIZE-1);*/
        // count within warp
        #pragma unroll
        for (int bin=0; bin<RADIX_SIZE; ++bin) {
            bool vote = (bin == digit) && is_topkth;
            unsigned int votes = __ballot(vote);
            if (lane_id()==0)
                smem[bin + RADIX_SIZE*warp_id] = __popc(votes);
        }
        local_barrier();
        // sum counts across all warps
        if (idx < RADIX_SIZE) {
            int sum = smem[idx];
            #pragma unroll
            for(int w=RADIX_SIZE; w<blockDim.x*RADIX_SIZE / GA_WARP_SIZE; w+=RADIX_SIZE)
                sum += smem[idx + w];
            smem[idx] = sum;
        }
        local_barrier();
        // find the bucket and update k2
        // smem[:RADIX_SIZE:-1] = k2 - cumsum(smem[:RADIX_SIZE-1:-1])
        if (idx == 0) {
            int sum = k2;
            #pragma unroll
            for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
                sum -= smem[bin];
                smem[bin] = sum;
                k2 = (sum > 0) ? sum : k2;
            }
            // sentinel: makes smem[digit+1] well-defined when digit == RADIX_SIZE-1
            smem[RADIX_SIZE] = 1;
        }
        local_barrier();
        if (is_topkth) {
            is_topk &= (smem[digit+1] > 0);
            is_topkth &= (smem[digit] <= 0) && (smem[digit+1] > 0);
        }
        local_barrier();
    }
    // set k2 as number of exceeding values
    if (idx==0) {
        #pragma unroll
        for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
            if (smem[bin] <= 0)
                break;
            k2 = smem[bin];
        }
    }
    local_barrier();
    // 2. find the index of output array, if exists
    if (k2 != 0) {
        // top_kth value may not be unique, so we need to
        // perform binary cumsum on is_topkth to drop exceeding top-kth values
        out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topkth);
        if ((out_idx >= k2) && is_topkth)
            is_topk = false;
        local_barrier();
    }
    // perform binary cumsum on is_topk to determine the indices to put result
    out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topk);
    if (is_topk) {
#if WRITE_VALUE == 1
        ptr_at(dstv, out_idx * dstv_strides_0) = xval;
#endif
#if WRITE_INDEX == 1
        ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
#endif
    }
}
#define RADIX_BITS 2
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
#define COUNT_TYPE $count_t
#define KERNEL_NAME $kname
// if count_t is int, work for array size within [1025, 2^31-1]
// if count_t is long long, work for array size within [2^31, 2^63-1]
// find_pattern: scan `data` (slice_size elements, byte stride `stride`) for
// the unique element whose radix representation matches `known_bits` under
// `known_bits_mask`.  The match is broadcast to all threads of the block via
// shared memory (smem[0] = found flag, smem[1] = value).  Returns 0 when no
// element matches.
template <typename DataType, typename RadixType, typename CountType>
__device__ DataType find_pattern(DataType* smem,
                                 DataType* data,
                                 CountType slice_size,
                                 CountType stride,
                                 RadixType known_bits,
                                 RadixType known_bits_mask) {
    if (threadIdx.x < 32)
        smem[threadIdx.x] = aesara_zero<DataType>();
    local_barrier();
    // All threads participate in the loop, in order to sync on the flag
    for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
        bool in_range = (i < slice_size);
        DataType v = in_range ? ptr_read_cached(data, i*stride) : aesara_zero<DataType>();
        if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
            // There should not be conflicts if we are using find_pattern,
            // since the result is unique
            smem[0] = aesara_one<DataType>();
            smem[1] = v; // can't use val as the flag, since it could be 0
        }
        local_barrier();
        DataType found = smem[0];
        DataType val = smem[1];
        local_barrier();
        // Check to see if a thread found the value
        if (aesara_ne(found, 0))
            return val;
    }
    // Fallback (no match in the slice); when a unique match exists the loop
    // above returns it.
    return aesara_zero<DataType>();
}
// This function counts the distribution of all input values in a
// slice we are selecting by radix digit at `radix_digit_pos`, but only
// those that pass the filter `((v & known_bits_mask) == known_bits)`.
// This produces and broadcasts the seen counts for a single block only.
// `smem` must have at least `RADIX_SIZE` elements.
// On return, counts[j] holds (identically in every thread) the number of
// filtered elements whose digit at `radix_digit_pos` equals j.
template <typename DataType, typename RadixType, typename CountType>
__device__ void count_radix_masked(CountType counts[RADIX_SIZE],
                                   CountType* smem,
                                   RadixType known_bits,
                                   RadixType known_bits_mask,
                                   int radix_digit_pos,
                                   CountType slice_size,
                                   CountType stride,
                                   DataType* data) {
    // Clear out per-thread counts from a previous round
    #pragma unroll
    for (int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = 0;
    if (threadIdx.x < RADIX_SIZE)
        smem[threadIdx.x] = 0;
    local_barrier();
    // Scan over all the data. Upon a read, the warp will accumulate
    // counts per each digit in the radix using warp voting.
    for (CountType i = threadIdx.x; i < slice_size; i += blockDim.x) {
        RadixType val = RadixConfig<DataType>::convert(ptr_read_cached(data, i*stride));
        bool has_val = ((val & known_bits_mask) == known_bits);
        RadixType digit_in_radix = Bitfield<RadixType>::get(val, radix_digit_pos, RADIX_BITS);
        #pragma unroll
        for (int j = 0; j < RADIX_SIZE; ++j) {
            bool vote = has_val && (digit_in_radix == j);
            counts[j] += __popc(__ballot(vote));
        }
    }
    // Now, for each warp, sum values
    // (__ballot gives every lane the same mask, so all lanes of a warp hold
    // identical counts; a single lane accumulates them to avoid double counting)
    if (lane_id() == 0) {
        for (int i=0; i<RADIX_SIZE; ++i)
            atomicAdd(&smem[i], counts[i]);
    }
    /*
    // not sure why, but this just give wrong results
    if (lane_id() < RADIX_SIZE)
        atomicAdd(&smem[lane_id()], counts[lane_id()]);
    */
    local_barrier();
    // For each thread, read in the total counts
    #pragma unroll
    for (unsigned int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = smem[i];
    local_barrier();
}
// radix_select: cooperatively find the k-th largest (order=true) or k-th
// smallest (order=false) element of `data` (slice_size elements, byte stride
// `stride`) and write it to *top_kth.  All threads of the block participate
// and all observe the same result.  `smem` is scratch shared memory, also
// reused (cast to DataType*) by find_pattern.
template <typename DataType, typename RadixType, typename CountType>
__device__ void radix_select(DataType* data,
                             CountType k,
                             bool order,
                             CountType slice_size,
                             CountType stride,
                             CountType* smem,
                             DataType* top_kth) {
    // Per-thread buckets into which we accumulate digit counts in our
    // radix
    // NOTE(review): `register` is deprecated since C++11 and removed in
    // C++17; it has no effect here and could safely be dropped.
    register CountType counts[RADIX_SIZE];
    // We only consider elements x such that (x & known_bits_mask) == known_bits
    // Initially, we consider all elements of the array, so the above
    // statement is true regardless of input.
    RadixType known_bits = 0, known_bits_mask = 0;
    // We are looking for the top k_to_find-th element when iterating over
    // digits; this count gets reduced by elimination when counting
    // successive digits
    CountType k_to_find = abs(k);
    // We start at the most significant digit in our radix, scanning
    // through to the least significant digit
    #pragma unroll
    for (int digit_pos = bitsof(DataType) - RADIX_BITS;
         digit_pos >= 0; digit_pos -= RADIX_BITS) {
        // Count radix distribution for the current position and reduce
        // across all threads
        count_radix_masked<DataType, RadixType, CountType>(
            counts, smem,
            known_bits, known_bits_mask, digit_pos,
            slice_size, stride, data);
        // All threads participate in the comparisons below to know the
        // final result
        #define CHECK_RADIX(i) \\
        int count = counts[i]; \\
        /* All threads have the same value in counts here, so all */ \\
        /* threads will return from the function. */ \\
        if (count == 1 && k_to_find == 1) { \\
            /* There is a unique answer. */ \\
            known_bits = Bitfield<RadixType>::set( \\
                known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The answer is now the unique element v such that: */ \\
            /* (v & known_bits_mask) == known_bits */ \\
            /* However, we do not yet know what the actual element is. We */ \\
            /* need to perform a search through the data to find the */ \\
            /* element that matches this pattern. */ \\
            *top_kth = find_pattern<DataType, RadixType, CountType>( \\
                (DataType*) smem, data, slice_size, \\
                stride, known_bits, known_bits_mask); \\
            return; \\
        } \\
        if (count >= k_to_find) { \\
            known_bits = Bitfield<RadixType>::set(known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The top-Kth element v must now be one such that: */ \\
            /* (v & known_bits_mask == known_bits) */ \\
            /* but we haven't narrowed it down; we must check the next */ \\
            /* least-significant digit */ \\
            break; \\
        } \\
        k_to_find -= count
        // For descending selection scan digit buckets high-to-low, for
        // ascending selection low-to-high.
        if (order) {
            #pragma unroll
            for (int i=RADIX_SIZE - 1; i >= 0; --i) {
                CHECK_RADIX(i);
            }
        } else {
            #pragma unroll
            for (int i=0; i < RADIX_SIZE; ++i) {
                CHECK_RADIX(i);
            }
        }
        #undef CHECK_RADIX
    } // end digit_pos for
    // There is no unique result, but there is a non-unique result
    // matching `known_bits` exactly
    *top_kth = RadixConfig<DataType>::deconvert(known_bits);
}
// Top-k kernel for slices longer than one thread block: first radix-selects
// the k-th value, then streams over the slice writing out qualifying
// elements.  One thread block processes one slice; `$...` markers are
// placeholders substituted by the Python code generator before compilation.
extern "C" __global__ void KERNEL_NAME(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_offset
        // size_t offset
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_offset
        // size_t offset
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        size_t src_offset,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    __shared__ COUNT_TYPE smem[32];
    INPUT_TYPE topkth_value;
    // order==true: largest-k requested; order==false: smallest-k (negative k).
    const bool order = (k>0);
    k = (order ? k : -k);
    const int idx = threadIdx.x;
    const int warp_id = idx / GA_WARP_SIZE;
    // get the slice for thread block to work on
    // size <- the axis to work on
    // dims_1+ <- batched dimensions
    unsigned int gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i});
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i});
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}
    radix_select<INPUT_TYPE, radix_t, COUNT_TYPE>(
        src, k, order, size, src_strides_0,
        smem, &topkth_value);
    // Every value that is strictly less/greater than `pattern`
    // (depending on sort dir) in sorted int format is in the top-K.
    // The top-K value itself might not be unique.
    //
    // Since there are a variable number of elements that we see that
    // are within the top-k, we don't know at what index to write out
    // the resulting values.
    // In order to get this, we perform an exclusive cumsum of
    // `has_topk`. This will return the resulting index into which we
    // need to write the result, if a thread has a result.
    // All threads need to participate in the loop and the cumsum
    // but not necessarily in the load; hence loop bounds being rounded
    // up to a multiple of the block dim.
    COUNT_TYPE iter_bound = size + blockDim.x-1;
    INDEX_TYPE write_base = 0;
    // BUGFIX: the loop counter must be COUNT_TYPE, matching the second loop
    // below.  A plain `int` overflows for slices longer than 2^31-1 elements,
    // which the COUNT_TYPE = long long configuration is documented to support.
    for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : aesara_zero<INPUT_TYPE>();
        bool has_topk;
        if (order) {
            has_topk = in_range && (aesara_gt(v, topkth_value));
        } else {
            has_topk = in_range && (aesara_lt(v, topkth_value));
        }
        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        // carry = number of qualifying elements seen by the whole block this pass
        int carry = smem[blockDim.x / 32 - 1];
        if (has_topk) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }
        write_base += carry;
    }
    // Second pass: fill the remaining output slots with copies of the
    // (possibly non-unique) top-k-th value itself.
    COUNT_TYPE topk_remaining = (k - write_base);
    for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : aesara_zero<INPUT_TYPE>();
        bool has_topk = in_range && (aesara_eq(v, topkth_value));
        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        int carry = smem[blockDim.x / 32 - 1];
        if (has_topk && index < topk_remaining) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }
        if (carry >= topk_remaining)
            break;
        topk_remaining -= carry;
        write_base += carry;
    }
}
import os
import sys
from aesara.configdefaults import config
from aesara.gpuarray import pygpu
from aesara.gpuarray.basic_ops import (
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.elemwise import GpuDimShuffle
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.graph.opt import local_optimizer
from aesara.link.c.op import _NoPythonExternalCOp
from aesara.tensor.basic import as_tensor_variable
from aesara.tensor.basic_opt import register_canonicalize
from aesara.tensor.blas import batched_dot
from aesara.tensor.nnet.ctc import ctc_available
class GpuConnectionistTemporalClassification(_NoPythonExternalCOp):
    """
    GPU wrapper for Baidu CTC loss function.

    The C implementation lives in ``./c_code/ctc_wrapper.c`` and links against
    Baidu's ``warpctc`` library (plus ``gpuarray``).

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """

    __props__ = ("compute_grad",)

    # ExternalCOp bookkeeping: (activations, labels, input_lengths) in;
    # (costs, gradients) out.
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # C source file and entry-point symbol for the external C Op.
    func_file = "./c_code/ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

    # The Op's params object is the GPU context (see get_params).
    params_type = gpu_context_type

    def __init__(self, compute_grad=True):
        """Raise RuntimeError if the warp-ctc library cannot be found."""
        if not ctc_available():
            raise RuntimeError(
                "Baidu CTC is not available and "
                "GpuConnectionistTemporalClassification Op "
                "can not be constructed."
            )
        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0
        super().__init__(self.func_file, self.func_name)

    def c_lib_dirs(self, **kwargs):
        """Directories to search for libwarpctc at link time."""
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_compile_args(self, **kwargs):
        """Embed an rpath to the warp-ctc library so it is found at runtime."""
        if ctc_available.path is not None:
            # Quote the path when it contains spaces (not supported by the
            # macOS linker, hence the platform check).
            if sys.platform != "darwin" and " " in ctc_available.path:
                return ['-Wl,-rpath,"' + ctc_available.path + '"']
            else:
                return ["-Wl,-rpath," + ctc_available.path]
        return []

    def c_libraries(self, **kwargs):
        """Link against warp-ctc and libgpuarray."""
        return ["warpctc", "gpuarray"]

    def c_header_dirs(self, **kwargs):
        """Include dirs: gpuarray helpers, pygpu headers, CUDA, warp-ctc."""
        dirs = [
            gpuarray_helper_inc_dir(),
            pygpu.get_include(),
            config.cuda__include_path,
        ]
        if config.ctc__root != "":
            dirs.append(os.path.join(config.ctc__root, "include"))
        return dirs

    def c_headers(self, **kwargs):
        """Headers required by the C implementation."""
        return [
            "ctc.h",
            "numpy_compat.h",
            "gpuarray/ext_cuda.h",
            "gpuarray_helper.h",
            "gpuarray/types.h",
            "gpuarray_api.h",
            "gpuarray/array.h",
            "gpuarray/util.h",
            "gpuarray/extension.h",
        ]

    def get_params(self, node):
        # Pass the GPU context of the activations input to the C code.
        return node.inputs[0].type.context

    def make_node(self, activations, labels, input_lengths):
        """
        Build the Apply node.

        ``activations`` must be a float32 3-D GPU tensor; ``labels`` an int32
        2-D tensor and ``input_lengths`` an int32 1-D tensor (both kept on the
        CPU).  Outputs are the per-example costs and, when ``compute_grad`` is
        enabled, the gradients w.r.t. the activations.
        """
        context_name = infer_context_name(activations)
        t_activations = as_gpuarray_variable(activations, context_name=context_name)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)
        # Labels and input lengths are always on the CPU
        t_labels = as_tensor_variable(labels)
        t_input_lengths = as_tensor_variable(input_lengths)
        if t_activations.type.dtype != "float32":
            raise TypeError("activations must use the float32 type.")
        if t_activations.ndim != 3:
            raise ValueError("activations must have 3 dimensions.")
        if t_labels.type.dtype != "int32":
            raise TypeError("labels must use the int32 type.")
        if t_labels.ndim != 2:
            raise ValueError("labels must have 2 dimensions.")
        if t_input_lengths.type.dtype != "int32":
            raise TypeError("input_lengths must use the int32 type.")
        if t_input_lengths.ndim != 1:
            raise ValueError("input_lengths must have 1 dimension.")
        costs = GpuArrayType(
            dtype="float32", broadcastable=(False,), context_name=context_name
        )()
        outputs = [costs]
        if self.compute_grad:
            gradients = GpuArrayType(
                dtype="float32",
                broadcastable=(
                    False,
                    False,
                    False,
                ),
                context_name=context_name,
            )()
            outputs += [gradients]
        return Apply(
            self, inputs=[t_activations, t_labels, t_input_lengths], outputs=outputs
        )

    def L_op(self, inputs, outputs, output_grads):
        """Chain-rule composition of the Op-computed CTC gradient."""
        # Gradients computed by Op
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None
        # Gradients of original function, to compose chain rule
        grad_op = output_grads[0]
        # Transpose to (minibatch, time, alphabet) for batched_dot, then back.
        grad_shuffle = GpuDimShuffle(
            input_broadcastable=(
                False,
                False,
                False,
            ),
            new_order=(1, 0, 2),
        )(gradients)
        grad_bdot = batched_dot(grad_op, grad_shuffle)
        grad_shuffle_reverse = GpuDimShuffle(
            input_broadcastable=(
                False,
                False,
                False,
            ),
            new_order=(1, 0, 2),
        )(grad_bdot)
        return [
            grad_shuffle_reverse,
            grad_undefined(self, 1, inputs[1]),
            grad_undefined(self, 2, inputs[2]),
        ]
def gpu_ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function on the GPU.

    Parameters
    ----------
    activations
        Three-dimensional tensor, which has a shape of (t, m, p), where
        t is the time index, m is the minibatch index, and p is the index
        over the probabilities of each symbol in the alphabet. The memory
        layout is assumed to be in C-order, which consists in the slowest
        to the fastest changing dimension, from left to right. In this case,
        p is the fastest changing dimension.
    labels
        A 2-D tensor of all the labels for the minibatch. In each row, there
        is a sequence of target labels. Negative values are assumed to be padding,
        and thus are ignored. Blank symbol is assumed to have index 0 in the
        alphabet.
    input_lengths
        A 1-D tensor with the number of time steps for each sequence in
        the minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_op = GpuConnectionistTemporalClassification()
    return ctc_op(activations, labels, input_lengths)
# Rewrite rule: drop the gradient output when nothing in the graph uses it.
@register_canonicalize("fast_compile")
@local_optimizer([GpuConnectionistTemporalClassification])
def local_gpu_ctc_no_grad(fgraph, node):
    if not isinstance(node.op, GpuConnectionistTemporalClassification):
        return False
    if len(node.outputs) <= 1:
        # Already a cost-only Op; nothing to do.
        return False
    if len(fgraph.clients[node.outputs[1]]) != 0:
        # The gradient output is consumed somewhere; keep it.
        return False
    cost_only = GpuConnectionistTemporalClassification(compute_grad=False)
    return [cost_only(*node.inputs), None]
"""
Declarations of cuDNN types and constants used in Aesara gpuarray DNN module.
For every cuDNN API supported by Aesara, this module defines a class that
provides the set of cuDNN definitions to be used in Aesara Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1*
- v6.0*
- v7.0*
"""
from aesara.link.c.type import CEnumType
# Canonical dtype names used throughout the cuDNN wrappers.
HALF = "float16"
FLOAT = "float32"
DOUBLE = "float64"

# (storage dtype, compute precision) pairs recognized by cuDNN.
TRUE_HALF_CONFIG = (HALF, HALF)
PSEUDO_HALF_CONFIG = (HALF, FLOAT)
FLOAT_CONFIG = (FLOAT, FLOAT)
DOUBLE_CONFIG = (DOUBLE, DOUBLE)


def is_true_half_config(dtype, precision):
    """True when both storage and compute use float16."""
    return (dtype, precision) == TRUE_HALF_CONFIG


def is_pseudo_half_config(dtype, precision):
    """True for float16 storage with float32 compute."""
    return (dtype, precision) == PSEUDO_HALF_CONFIG


def is_float_config(dtype, precision):
    """True when both storage and compute use float32."""
    return (dtype, precision) == FLOAT_CONFIG


def is_double_config(dtype, precision):
    """True when both storage and compute use float64."""
    return (dtype, precision) == DOUBLE_CONFIG
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51:
    """
    cuDNN definitions (enums, supported algorithms, dtype configurations)
    for the cuDNN v5.1 API.  Later API versions subclass this and override
    what changed.
    """

    version = 5

    cudnnConvolutionMode_t = CEnumType(
        ("CUDNN_CONVOLUTION", "conv"),
        ("CUDNN_CROSS_CORRELATION", "cross"),
        ctype="cudnnConvolutionMode_t",
    )

    cudnnDataType_t = CEnumType(
        ("CUDNN_DATA_FLOAT", "float32"),
        ("CUDNN_DATA_DOUBLE", "float64"),
        ("CUDNN_DATA_HALF", "float16"),
        ctype="cudnnDataType_t",
    )

    cudnnConvolutionFwdAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM", "none"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM", "small"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_GEMM", "large"),
        # not implemented:
        # (a bare string registers the constant without an alias, keeping it
        # out of the lists of supported algorithms)
        ("CUDNN_CONVOLUTION_FWD_ALGO_DIRECT"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING", "fft_tiling"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD", "winograd"),
        # TODO: Not yet tested/documented:
        ("CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        ctype="cudnnConvolutionFwdAlgo_t",
    )

    # Forward algorithms usable for 3D (volumetric) convolutions.
    conv3d_fwd_algorithms = ("none", "small", "fft_tiling")

    # Every aliased forward algorithm is treated as deterministic in v5.1.
    deterministic_fwd_algorithms = cudnnConvolutionFwdAlgo_t.get_aliases()

    cudnnConvolutionBwdFilterAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", "none"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1", "deterministic"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3", "small"),
        # TODO: not yet tested/documented:
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        ctype="cudnnConvolutionBwdFilterAlgo_t",
    )

    conv3d_bwd_filter_algorithms = ("none", "small")

    deterministic_bwd_filter_algorithms = ("deterministic", "fft", "winograd_non_fused")

    cudnnConvolutionBwdDataAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_0", "none"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_1", "deterministic"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING", "fft_tiling"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD", "winograd"),
        # TODO: not yet tested/documented:
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        ctype="cudnnConvolutionBwdDataAlgo_t",
    )

    conv3d_bwd_data_algorithms = ("none", "deterministic", "fft_tiling")

    deterministic_bwd_data_algorithms = (
        "deterministic",
        "fft",
        "fft_tiling",
        "winograd",
        "winograd_non_fused",
    )

    cudnnPoolingMode_t = CEnumType(
        ("CUDNN_POOLING_MAX", "max"),
        ("CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", "average_inc_pad"),
        ("CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", "average_exc_pad"),
        ctype="cudnnPoolingMode_t",
    )

    cudnnSoftmaxAlgorithm_t = CEnumType(
        ("CUDNN_SOFTMAX_FAST", "fast"),
        ("CUDNN_SOFTMAX_ACCURATE", "accurate"),
        ("CUDNN_SOFTMAX_LOG", "log"),
        ctype="cudnnSoftmaxAlgorithm_t",
    )

    cudnnSoftmaxMode_t = CEnumType(
        ("CUDNN_SOFTMAX_MODE_INSTANCE", "instance"),
        ("CUDNN_SOFTMAX_MODE_CHANNEL", "channel"),
        ctype="cudnnSoftmaxMode_t",
    )

    cudnnBatchNormMode_t = CEnumType(
        ("CUDNN_BATCHNORM_PER_ACTIVATION", "per-activation"),
        ("CUDNN_BATCHNORM_SPATIAL", "spatial"),
        ctype="cudnnBatchNormMode_t",
    )

    # cudnnReduceTensorOp_t was introduced in cuDNN v6; define it here as an
    # empty enum so that code referencing it does not crash under cuDNN 5.
    cudnnReduceTensorOp_t = CEnumType()

    def get_supported_dtype_configs(self, check_runtime=None):
        """
        Return the tuple of data type configurations supported by this version of cuDNN.

        This is currently convenient for all supported cuDNN versions, as Aesara does not
        yet support new data types (like INT8, INT8x4, etc.).

        ``check_runtime`` may be a function that tests if a data type configuration is supported.::

            is_supported = check_runtime(dtype, precision)

        .. warning::

            From documentation for cudnnConvolutionForward (for both v5.1 and v6):

            .. code-block::

                TRUE_HALF_CONFIG is only supported on architectures with true fp16 support
                (compute capability 5.3 and 6.0)

            This seems to be a general remark about f16 support (not only for FWD).
            It can be checked at runtime only.
        """
        if check_runtime is None or check_runtime(*TRUE_HALF_CONFIG):
            return (TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
        return (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)

    def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """
        Return whether forward algorithm ``algo`` (given by alias) supports
        the (``dtype``, ``precision``) configuration for an ``ndim``-D
        convolution, per the cuDNN v5.1 documentation.
        """
        algorithms = self.cudnnConvolutionFwdAlgo_t
        algo = algorithms.fromalias(algo)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
            return ndim == 2 or not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
            return ndim == 2 and not is_true_half_config(dtype, precision)
        # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
            if ndim == 2:
                return is_pseudo_half_config(dtype, precision) or is_float_config(
                    dtype, precision
                )
            if ndim == 3:
                return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
            # We could not check it before being in C code.
            return ndim == 2 and not is_double_config(dtype, precision)
        return False

    def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """
        Return whether backward-filter algorithm ``algo`` (given by alias)
        supports the (``dtype``, ``precision``) configuration for an
        ``ndim``-D convolution.
        """
        # NB: Aesara does not support float16 precision anymore for backward cuDNN convolutions.
        if is_true_half_config(dtype, precision):
            return False
        algorithms = self.cudnnConvolutionBwdFilterAlgo_t
        algo = algorithms.fromalias(algo)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
            return ndim == 2
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
            # We could not check it before being in C code.
            return ndim == 2 and not is_double_config(dtype, precision)
        return False

    def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """
        Return whether backward-data algorithm ``algo`` (given by alias)
        supports the (``dtype``, ``precision``) configuration for an
        ``ndim``-D convolution.
        """
        # NB: Aesara does not support float16 precision anymore for backward cuDNN convolutions.
        if is_true_half_config(dtype, precision):
            return False
        algorithms = self.cudnnConvolutionBwdDataAlgo_t
        algo = algorithms.fromalias(algo)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
            # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1: all data type configs supported.
            # NB: Let's avoid float16 precision, as some strange errors may be encountered
            # with that precision ( see https://github.com/Theano/Theano/pull/5932/ )
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
            if ndim == 2:
                return is_pseudo_half_config(dtype, precision) or is_float_config(
                    dtype, precision
                )
            if ndim == 3:
                return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
            # We could not check it before being in C code.
            return ndim == 2 and not is_double_config(dtype, precision)
        return False
class CuDNNV6(CuDNNV51):
    """Definitions for cuDNN v6.

    Extends :class:`CuDNNV51` with the data types, pooling mode,
    backward-filter algorithms and tensor-reduction operators introduced
    in cuDNN v6, and widens the dtype-configuration checks for the new
    FFT_TILING algorithms.
    """

    version = 6
    cudnnDataType_t = CEnumType(
        ("CUDNN_DATA_FLOAT", "float32"),
        ("CUDNN_DATA_DOUBLE", "float64"),
        ("CUDNN_DATA_HALF", "float16"),
        # new in v6
        ("CUDNN_DATA_INT8", "int8"),
        ("CUDNN_DATA_INT32", "int32"),
        # ('CUDNN_DATA_INT8X4', 'int8x4'),
        ctype="cudnnDataType_t",
    )
    cudnnPoolingMode_t = CEnumType(
        ("CUDNN_POOLING_MAX", "max"),
        ("CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", "average_inc_pad"),
        ("CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", "average_exc_pad"),
        # new in v6:
        ("CUDNN_POOLING_MAX_DETERMINISTIC", "max_deterministic"),
        ctype="cudnnPoolingMode_t",
    )
    cudnnConvolutionBwdFilterAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", "none"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1", "deterministic"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3", "small"),
        # BUG FIX: this entry was a bare parenthesized string — not a
        # (constant, alias) pair — which is not a valid CEnumType argument.
        # The algorithm is not exposed, so keep it commented out:
        # ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD", "winograd"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        # new in v6:
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING", "fft_tiling"),
        ctype="cudnnConvolutionBwdFilterAlgo_t",
    )
    deterministic_bwd_filter_algorithms = (
        CuDNNV51.deterministic_bwd_filter_algorithms + ("fft_tiling",)
    )
    cudnnReduceTensorOp_t = CEnumType(
        ("CUDNN_REDUCE_TENSOR_ADD", "add"),
        ("CUDNN_REDUCE_TENSOR_MUL", "mul"),
        ("CUDNN_REDUCE_TENSOR_MIN", "minimum"),
        ("CUDNN_REDUCE_TENSOR_MAX", "maximum"),
        ("CUDNN_REDUCE_TENSOR_AMAX", "absmax"),
        ("CUDNN_REDUCE_TENSOR_AVG", "avg"),
        ("CUDNN_REDUCE_TENSOR_NORM1", "norm1"),
        ("CUDNN_REDUCE_TENSOR_NORM2", "norm2"),
        ctype="cudnnReduceTensorOp_t",
    )

    def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """Extend the v5.1 forward-algo check with the v6 FFT_TILING rules."""
        is_supported = super().fwd_algo_supports_dtype_config(
            algo, dtype, precision, ndim
        )
        if not is_supported:
            algorithms = self.cudnnConvolutionFwdAlgo_t
            algo = algorithms.fromalias(algo)
            if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
                # NB: For cuDNN V6:
                # "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
                # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
                # ie, one of the filter dimension, width or height is 1)"
                # Could be checked only in C code. By default, let's allow DOUBLE_CONFIG.
                return ndim == 2 and (
                    is_pseudo_half_config(dtype, precision)
                    or is_float_config(dtype, precision)
                    or is_double_config(dtype, precision)
                )
        return is_supported

    def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """Extend the v5.1 backward-filter check for the new FFT_TILING algo."""
        is_supported = super().bwd_filter_algo_supports_dtype_config(
            algo, dtype, precision, ndim
        )
        if not is_supported:
            algorithms = self.cudnnConvolutionBwdFilterAlgo_t
            algo = algorithms.fromalias(algo)
            if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
                return ndim == 2 and (
                    is_pseudo_half_config(dtype, precision)
                    or is_float_config(dtype, precision)
                    or is_double_config(dtype, precision)
                )
        return is_supported

    def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """Extend the v5.1 backward-data check for the new FFT_TILING algo."""
        is_supported = super().bwd_data_algo_supports_dtype_config(
            algo, dtype, precision, ndim
        )
        if not is_supported:
            algorithms = self.cudnnConvolutionBwdDataAlgo_t
            algo = algorithms.fromalias(algo)
            if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
                # NB: For cuDNN V6:
                # "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
                # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
                # ie, one of the filter dimension, width or height is 1)"
                # Could be checked only in C code. By default, let's allow DOUBLE_CONFIG.
                return ndim == 2 and (
                    is_pseudo_half_config(dtype, precision)
                    or is_float_config(dtype, precision)
                    or is_double_config(dtype, precision)
                )
        return is_supported
class CuDNNV7(CuDNNV6):
    """Definitions for cuDNN v7.

    Adds the math-type (Tensor Core selection) and determinism
    enumerations on top of the v6 definitions.
    """

    version = 7
    cudnnMathType_t = CEnumType(
        ("CUDNN_DEFAULT_MATH", "non_tensor_op"),
        ("CUDNN_TENSOR_OP_MATH", "tensor_op"),
        ctype="cudnnMathType_t",
    )
    cudnnDeterminism_t = CEnumType(
        ("CUDNN_NON_DETERMINISTIC", "non_deterministic"),
        ("CUDNN_DETERMINISTIC", "deterministic"),
        ctype="cudnnDeterminism_t",
    )
def get_definitions(cudnn_version=None):
    """
    Return cuDNN definitions to be used by Aesara for the given cuDNN version.

    ``cudnn_version`` must be None or an integer
    (typically the version returned by :func:`Aesara.gpuarray.dnn.version`).
    If None, return definitions for the most recent supported cuDNN version.
    """
    if cudnn_version is not None:
        major = cudnn_version // 1000
        if major == 5:
            return CuDNNV51()
        if major == 6:
            return CuDNNV6()
    # By default, use definitions for the last supported cuDNN version.
    return CuDNNV7()
# (git-diff viewer artifact) This source diff could not be displayed because it is too large.
import aesara
from aesara.compile import optdb
from aesara.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuArrayType,
as_gpuarray_variable,
gpu_contiguous,
infer_context_name,
)
from aesara.gpuarray.dnn import (
GpuDnnBatchNorm,
GpuDnnBatchNormInference,
GpuDnnConv,
GpuDnnConvDesc,
GpuDnnConvGradI,
GpuDnnConvGradW,
GpuDnnPoolGrad,
GpuDnnReduction,
GpuDnnSoftmax,
GpuDnnSoftmaxGrad,
cudnn,
dnn_available,
dnn_conv,
dnn_conv3d,
dnn_pool,
get_precision,
local_abstractconv3d_cudnn_graph,
local_abstractconv_cudnn_graph,
version,
)
from aesara.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise
from aesara.gpuarray.nnet import GpuSoftmax
from aesara.gpuarray.opt_util import (
alpha_merge,
inplace_allocempty,
op_lifter,
output_merge,
pad_dims,
unpad_dims,
)
from aesara.gpuarray.optdb import (
gpu_seqopt,
pool_db,
pool_db2,
register_inplace,
register_opt,
register_opt2,
)
from aesara.gpuarray.reduction import GpuMaxAndArgmax
from aesara.gpuarray.type import list_contexts
from aesara.graph.opt import GlobalOptimizer, copy_stack_trace, local_optimizer
from aesara.scalar import Log
from aesara.tensor.math import Argmax
from aesara.tensor.nnet.abstract_conv import (
AbstractConv2d,
AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
AbstractConv3d,
AbstractConv3d_gradInputs,
AbstractConv3d_gradWeights,
assert_conv_shape,
get_conv_output_shape,
)
from aesara.tensor.nnet.basic import LogSoftmax, SoftmaxGrad
from aesara.tensor.shape import shape_i_op
from aesara.tensor.signal.pool import AveragePoolGrad, MaxPoolGrad, Pool
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(fgraph, node):
    """Lift an abstract 2D/3D convolution on GPU data to its cuDNN graph."""
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if node.op.unshared:
        return None
    border = node.op.border_mode
    if isinstance(border, tuple) and any(isinstance(p, tuple) for p in border):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(node.op, AbstractConv3d):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return None
    replacement = builder(node.op, context, node.inputs, node.outputs)
    copy_stack_trace(node.outputs, replacement)
    return replacement
@local_optimizer(
    [AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]
)
def local_abstractconv_cudnn_alt(fgraph, node):
    """Alternative cuDNN lifting for 2D abstract convolutions.

    Re-expresses each operation as a different cuDNN convolution:
    the forward pass via a backprop direction hint, the weight gradient
    as a forward convolution over dimshuffled operands, and the input
    gradient as a full-mode forward convolution.  Returns ``None``
    whenever the configuration is unsupported so other lifters can try.
    """
    if not isinstance(
        node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)
    ):
        return
    # Filter dilation in these alternatives requires cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv2d):
        # Forward convolution, computed through a backprop direction hint.
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        # Weight gradient computed as a forward conv over batch-swapped
        # (dimshuffled) image and gradient tensors.
        if (
            border_mode == "valid"
            and subsample == (1, 1)
            and filter_dilation == (1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the "cross" correlation by flipping spatial axes.
                conv = conv[:, :, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        # Input gradient computed as a full-mode forward convolution with
        # channel-swapped kernels and inverted conv mode.
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer(
    [AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs]
)
def local_abstractconv3d_cudnn_alt(fgraph, node):
    """Alternative cuDNN lifting for 3D abstract convolutions.

    3D counterpart of :func:`local_abstractconv_cudnn_alt`: forward conv
    via a backprop direction hint, weight gradient as a forward conv over
    dimshuffled operands, input gradient as a full-mode forward conv.
    Returns ``None`` when the configuration is unsupported.
    """
    if not isinstance(
        node.op, (AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs)
    ):
        return
    # Filter dilation in these alternatives requires cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv3d):
        if border_mode == "half" or subsample != (1, 1, 1) or num_groups > 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        # NOTE(review): dnn_conv3d is called with fgraph as first argument,
        # unlike the 2D path's dnn_conv — confirm the expected signature.
        rval = dnn_conv3d(
            fgraph,
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
        )
    elif isinstance(op, AbstractConv3d_gradWeights):
        # Weight gradient as a forward conv over batch-swapped operands.
        if (
            border_mode == "valid"
            and subsample == (1, 1, 1)
            and filter_dilation == (1, 1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                num_groups=num_groups,
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the "cross" correlation by flipping spatial axes.
                conv = conv[:, :, ::-1, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv3d_gradInputs):
        # Input gradient as a full-mode forward conv with channel-swapped
        # kernels and inverted conv mode.
        if border_mode == "valid" and subsample == (1, 1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                num_groups=num_groups,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(fgraph, node):
    """Lift an abstract conv weight-gradient on GPU data to its cuDNN graph."""
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if node.op.unshared:
        return None
    border = node.op.border_mode
    if isinstance(border, tuple) and any(isinstance(p, tuple) for p in border):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d_gradWeights):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(node.op, AbstractConv3d_gradWeights):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return None
    replacement = builder(node.op, context, node.inputs, node.outputs)
    copy_stack_trace(node.outputs, replacement)
    return replacement
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(fgraph, node):
    """Lift an abstract conv input-gradient on GPU data to its cuDNN graph."""
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if node.op.unshared:
        return None
    border = node.op.border_mode
    if isinstance(border, tuple) and any(isinstance(p, tuple) for p in border):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d_gradInputs):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(node.op, AbstractConv3d_gradInputs):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return None
    replacement = builder(node.op, context, node.inputs, node.outputs)
    copy_stack_trace(node.outputs, replacement)
    return replacement
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    """Use the in-place variant of GpuDnnConv on a freshly allocated output."""
    inplace_op = GpuDnnConv(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Use the in-place variant of GpuDnnConvGradW on a fresh output buffer."""
    inplace_op = GpuDnnConvGradW(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Use the in-place variant of GpuDnnConvGradI on a fresh output buffer."""
    inplace_op = GpuDnnConvGradI(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [inplace_op(*inputs)]
# Register the in-place rewrites for the three cuDNN convolution Ops so the
# buffer produced by GpuAllocEmpty can be written directly.
optdb.register(
    "local_dnna_conv_inplace",
    aesara.graph.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
    position=70.0,
)
@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Fold a surrounding scalar multiply into GpuDnnConv's alpha/beta."""
    merged = GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Fold a surrounding scalar multiply into GpuDnnConvGradW's alpha/beta."""
    merged = GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    """Fold a surrounding scalar multiply into GpuDnnConvGradI's alpha/beta."""
    merged = GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    """Merge an addition with an existing buffer into GpuDnnConv's output."""
    merged_inputs = list(inputs)
    # The output buffer (input index 2) must be contiguous for cuDNN.
    merged_inputs[2] = gpu_contiguous(merged_inputs[2])
    merged = GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*merged_inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    """Merge an addition with an existing buffer into GpuDnnConvGradW's output."""
    merged_inputs = list(inputs)
    # The output buffer (input index 2) must be contiguous for cuDNN.
    merged_inputs[2] = gpu_contiguous(merged_inputs[2])
    merged = GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*merged_inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    """Merge an addition with an existing buffer into GpuDnnConvGradI's output."""
    merged_inputs = list(inputs)
    # The output buffer (input index 2) must be contiguous for cuDNN.
    merged_inputs[2] = gpu_contiguous(merged_inputs[2])
    merged = GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*merged_inputs)]
def local_gpua_pool_dnn_alternative(fgraph, op, ctx_name, inputs, outputs):
    """Lift a Pool Op to cuDNN pooling (ignore_border only, 2D/3D)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    # Otherwise reshape to 4D/5D with 2 leading non-pooling dimensions,
    # pool, then restore the original shape.
    img_padded = pad_dims(img, 2, nd)
    pooled = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
    return unpad_dims(pooled, img, 2, nd)
# Register the cuDNN pooling lifter in both pooling optimizer databases:
# pool_db matches through op_lifter (CPU Pool -> GPU), pool_db2 through a
# plain local optimizer.  position=0 makes cuDNN the first candidate tried.
pool_db.register(
    "local_gpua_pool_dnn_alternative",
    op_lifter([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_alternative",
    local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    """Lift MaxPoolGrad to GpuDnnPoolGrad (ignore_border only, 2D/3D)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    def _prep(v):
        # Move to the target context and make contiguous for cuDNN.
        return gpu_contiguous(as_gpuarray_variable(v, ctx_name))

    inp = _prep(inp)
    out = _prep(out)
    out_grad = _prep(out_grad)
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return grad_op(inp, out, out_grad, ws, stride, pad)
    # Otherwise reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    ret_padded = grad_op(
        pad_dims(inp, 2, nd),
        pad_dims(out, 2, nd),
        pad_dims(out_grad, 2, nd),
        ws,
        stride,
        pad,
    )
    return unpad_dims(ret_padded, inp, 2, nd)
# Register the cuDNN max-pooling gradient lifter in both pooling databases
# (op_lifter form and plain local-optimizer form).
pool_db.register(
    "local_gpua_pool_dnn_grad_stride",
    op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_grad_stride",
    local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_avg_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    """Lift AveragePoolGrad to GpuDnnPoolGrad (ignore_border only, 2D/3D)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return grad_op(inp, out_grad, out_grad, ws, stride, pad)
    # Otherwise reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    inp_padded = pad_dims(inp, 2, nd)
    out_grad_padded = pad_dims(out_grad, 2, nd)
    ret_padded = grad_op(
        inp_padded, out_grad_padded, out_grad_padded, ws, stride, pad
    )
    return unpad_dims(ret_padded, inp, 2, nd)
# Register the cuDNN average-pooling gradient lifter in both pooling
# databases (op_lifter form and plain local-optimizer form).
pool_db.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
@register_opt("cudnn", "fast_compile")
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(fgraph, node):
    """Replace GpuSoftmax with the cuDNN accurate softmax."""
    if not isinstance(node.op, GpuSoftmax):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    # cuDNN softmax operates on 4D tensors: append two broadcastable axes.
    ins = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    out = GpuDnnSoftmax("accurate", "channel")(ins)
    out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
    return [out]
@register_opt("cudnn", "stabilize")
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(fgraph, node):
    """Fuse log(GpuDnnSoftmax(x)) into a single cuDNN log-softmax."""
    # This looks for GpuDnnSoftmax so we know that we have cudnn.
    if not isinstance(node.op, GpuElemwise):
        return
    if not isinstance(node.op.scalar_op, Log):
        return
    softmax_out = node.inputs[0]
    if softmax_out.owner is None:
        return
    if not isinstance(softmax_out.owner.op, GpuDnnSoftmax):
        return
    # Only fuse when the softmax output is not used anywhere else.
    if len(fgraph.clients[softmax_out]) != 1:
        return
    softmax_node = softmax_out.owner
    log_softmax = GpuDnnSoftmax("log", softmax_node.op.mode)
    return [log_softmax(softmax_node.inputs[0])]
@register_opt("cudnn", "fast_compile")
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], "fast_compile", "cudnn")
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    """Lift a 2D LogSoftmax to the cuDNN log-softmax."""
    inp = inputs[0]
    if inp.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return
    # GpuDnnSoftmax wants a 4D tensor: append two broadcastable axes.
    reshaped = inp.dimshuffle(0, 1, "x", "x")
    reshaped.tag.context_name = ctx_name
    # Apply GpuDnnSoftmax and return the 2D result.
    result = GpuDnnSoftmax("log", "channel")(gpu_contiguous(reshaped))
    return [result.dimshuffle(0, 1)]
@register_opt("cudnn", "fast_compile")
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], "cudnn", "fast_compile")
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """Lift a 2D SoftmaxGrad to the cuDNN softmax gradient."""
    if not dnn_available(ctx_name):
        return
    gpu_inputs = [as_gpuarray_variable(n, ctx_name) for n in inputs]
    if any(n.ndim != 2 for n in gpu_inputs):
        return
    # cuDNN wants 4D inputs; spread the two axes as (0, 'x', 1, 'x').
    shuffled = [gpu_contiguous(n.dimshuffle(0, "x", 1, "x")) for n in gpu_inputs]
    out = GpuDnnSoftmaxGrad("accurate", "instance")(shuffled[0], shuffled[1])
    return [out.dimshuffle(0, 2)]
@register_opt("cudnn")
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(fgraph, node):
    """Replace a GpuCAReduceCuda with a cuDNN tensor reduction when possible.

    Requires cuDNN v6+, at most 8 input dimensions and float16/float32/
    float64 data with a compatible accumulator dtype.  ``pre_scalar_op``
    patterns (sqr+add, abs+add, abs+maximum) are mapped onto the dedicated
    norm2/norm1/absmax cuDNN reductions.
    """
    if not isinstance(node.op, GpuCAReduceCuda):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return
    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)
    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return
    if node.inputs[0].dtype not in ("float16", "float32", "float64"):
        return
    # Accumulator dtype must match the input dtype for float64/float32, and
    # may not be float64 for float16 inputs.
    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return
    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return
    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        # Post-processing used when the reduction result needs no fix-up.
        return a

    def _square(a):
        # NOTE(review): presumably cuDNN's norm2 applies a square root, so the
        # result is squared to recover sum(sqr(x)) — confirm against the
        # cudnnReduceTensor documentation.
        return GpuElemwise(aesara.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity
    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, aesara.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(
            node.op.scalar_op, aesara.scalar.basic.ScalarMaximum
        ) and isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Abs):
            scal = "absmax"
        else:
            return
    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return
    ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype, False)(
        node.inputs[0]
    )
    new_out = [post(ret)]
    copy_stack_trace(node.outputs, new_out)
    return new_out
@register_opt("cudnn")
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(fgraph, node):
    """Replace GpuMaxAndArgmax with a cuDNN reduction when supported."""
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    inp = node.inputs[0]
    if not dnn_available(inp.type.context_name):
        return
    # cuDNN reductions require cuDNN v6+ and at most 8 dimensions.
    if version(raises=False) < 6000:
        return
    if inp.ndim > 8:
        return
    if inp.dtype != node.outputs[0].dtype:
        return
    if inp.dtype not in ("float16", "float32", "float64"):
        return
    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return
    out_dtype = node.outputs[0].dtype
    max_out, arg_out = GpuDnnReduction(
        "maximum", node.op.axis, out_dtype, out_dtype, True
    )(inp)
    # cudnn can only return int32 indices
    arg_out = as_gpuarray_variable(
        arg_out.astype("int64"), node.outputs[1].type.context_name
    )
    return (max_out, arg_out)
@register_opt("cudnn", "fast_compile")
@op_lifter([Argmax])
@register_opt2([Argmax], "fast_compile", "cudnn")
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift Argmax to a cuDNN reduction (v6+, ndim <= 8, float dtypes)."""
    if not dnn_available(ctx_name):
        return
    if version(raises=False) < 6000:
        return
    inp = inputs[0]
    if inp.ndim > 8:
        return
    if inp.dtype not in ("float16", "float32", "float64"):
        return
    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    _, indices = GpuDnnReduction(
        "maximum", op.axis, inp.dtype, inp.dtype, True
    )(*inputs)
    # cuDNN only produces int32 indices; upcast to the expected int64.
    return [as_gpuarray_variable(indices.astype("int64"), ctx_name)]
class NoCuDNNRaise(GlobalOptimizer):
    # Global optimizer that fails fast when the "cudnn" tag is enabled but
    # cuDNN cannot be used in one of the registered GPU contexts.
    def apply(self, fgraph):
        """
        Raise an error if cuDNN cannot be used.
        """
        for c in list_contexts():
            if not dnn_available(c):
                # Make an assert error as we want Aesara to fail, not
                # just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Aesara was not able "
                    "to use it for context "
                    + str(c)
                    + ". We got this error: \n"
                    + dnn_available.msg
                )
# Run first (position=0) so an unusable cuDNN setup fails early and loudly.
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), "cudnn", position=0)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(fgraph, node):
    """Make GpuDnnBatchNorm write its main output in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    return GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=op.inplace_running_var,
        inplace_output=True,
    )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(fgraph, node):
    """Make GpuDnnBatchNorm update its running mean in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_mean:
        return
    return GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=True,
        inplace_running_var=op.inplace_running_var,
        inplace_output=op.inplace_output,
    )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(fgraph, node):
    """Make GpuDnnBatchNorm update its running variance in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_var:
        return
    return GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=True,
        inplace_output=op.inplace_output,
    )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(fgraph, node):
    """Switch GpuDnnBatchNormInference to its in-place variant."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return
    return [GpuDnnBatchNormInference(mode=op.mode, inplace=True)(*node.inputs)]
# (git-diff viewer artifact) This source diff could not be displayed because it is too large.
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.tensor.extra_ops import CumOp
try:
from pygpu import gpuarray
except ImportError:
pass
import aesara.scalar as scalar
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
GpuReshape,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.opt import op_lifter, register_opt, register_opt2
from aesara.gpuarray.type import gpu_context_type
from aesara.link.c.params_type import ParamsType
class GpuCumOp(GpuKernelBaseCOp, _NoPythonOp):
"""
Parameters
----------
axis
Can not be None. If you want the array flattened, do it before.
"""
SUPPORTED_NDIMS = 3
__props__ = ("axis", "mode")
params_type = ParamsType(axis=scalar.int32, context=gpu_context_type)
def __init__(self, axis, mode="add"):
assert axis is not None
self.axis = int(axis)
self.mode = mode
def __eq__(self, other):
if type(other) != type(self):
return False
return self.axis == other.axis and self.mode == other.mode
    def __hash__(self):
        # Combine the two identity-defining properties; consistent with __eq__.
        return hash(self.axis) ^ hash(self.mode)
    def c_code_cache_version(self):
        # Bump this tuple whenever the generated C/CUDA code changes so the
        # compilation cache is invalidated.
        return (7,)
    def c_headers(self, **kwargs):
        # Headers required by the generated C code (numpy compatibility,
        # libgpuarray types and the Aesara-side GPU helper header).
        return ["<numpy_compat.h>", "<gpuarray/types.h>", "<gpuarray_helper.h>"]
    def c_header_dirs(self, **kwargs):
        # Directory containing gpuarray_helper.h.
        return [gpuarray_helper_inc_dir()]
    def get_params(self, node):
        # Bundle the axis and the input's GPU context into the ParamsType
        # object consumed by the generated C code.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, x):
assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp"
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
raise NotImplementedError(
"Only cum op on 1D, 2D and\
3D arrays are supported right now!"
)
if self.axis >= x.ndim or self.axis < -x.ndim:
raise ValueError(f"axis(={self.axis}) out of bounds")
return Apply(self, [x], [x.type()])
def gpu_kernels(self, node, nodename):
kernels = []
# cumadd
kname = "k_cumadd"
op = {"mul": "*", "add": "+"}[self.mode]
k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = (
"""#include "cluda.h"
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z;
int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z;
int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
"""
% locals()
)
params = [
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
"intc",
"intc",
"intc",
"intc",
]
kernels.append(
Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
)
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
"int32",
"int32",
gpuarray.GpuArray,
gpuarray.SIZE,
]
code = (
"""#include "cluda.h"
// helper functions
WITHIN_KERNEL
void k_reductionPhase(float* partialCumOp) {
// Traverse down from leaves to root building partial sums at internal nodes in the tree.
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if (index < blockDim.x*2) {
partialCumOp[index] %(op)s= partialCumOp[index - stride];
}
}
}
WITHIN_KERNEL
void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
// blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
partialCumOp[threadIdx.x*2] = input[idx_even];
partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
}
WITHIN_KERNEL
void k_reversePhase(float* partialCumOp) {
// Traverse back up the tree building the scan from the partial sums
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if (index + stride < blockDim.x*2) {
partialCumOp[index + stride] %(op)s= partialCumOp[index];
}
}
}
WITHIN_KERNEL
void k_pushData(float* partialCumOp, float* output, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
local_barrier();
// blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] = partialCumOp[threadIdx.x*2];
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumOp(float* input, ga_size input_offset,
float* output, ga_size output_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
// Check if current thread has data to process.
if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
return;
}
extern __shared__ float partialCumOp[];
// Load data in shared memory
k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
// Use a dichotomy approach to compute the cum op (i.e. balanced binary tree).
// The tree is sweeped from the leaves to the root and from the root to the leaves.
// Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
k_reductionPhase(partialCumOp);
k_reversePhase(partialCumOp);
// Write the final output to global memory
k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
}
}
}
"""
% locals()
)
kernels.append(
Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
)
# k_finalCumOp
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = (
"""#include "cluda.h"
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
if (globalThreadID >= (nbElementsPerCumOp+1)/2)
return;
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] %(op)s= currentBlockSum;
output[idx_odd] %(op)s= currentBlockSum;
}
"""
% locals()
)
params = [
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
"int32",
"int32",
]
kernels.append(
Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
)
return kernels
def c_code(self, node, nodename, inp, out, sub):
    """Return the C code that prepares the output and runs the cum-op.

    The emitted code checks the CUDA context, allocates (or reuses) the
    output array with ``aesara_prep_output``, queries the device launch
    limits and finally calls the ``cumOp_<nodename>`` helper generated by
    :meth:`c_support_code_struct`.
    """
    # The kernels in this Op are written in CUDA C; other gpuarray
    # backends (e.g. OpenCL) are not supported.
    if node.inputs[0].type.context.kind != b"cuda":
        raise NotImplementedError("cuda only")
    # NOTE(review): `needAllocation` below is computed but never read in
    # this template — presumably left over from an older allocation path.
    return """
const size_t* shape = PyGpuArray_DIMS(%(x)s);
bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
int axis = %(params)s->axis;
if (axis < 0) {
// Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(%(x)s);
}
if (aesara_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
%(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
%(fail)s;
}
{ // Namespace for kernel calls //
size_t max_threads_dim0;
size_t max_grid_size1;
size_t max_grid_size2;
int err;
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)s;
}
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)s;
}
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s;
}
if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
%(fail)s;
}
}
""" % dict(
        x=inp[0],
        z=out[0],
        nodename=nodename,
        fail=sub["fail"],
        params=sub["params"],
    )
def c_support_code_struct(self, node, nodename):
    """Return C support code defining the ``cumOp_<nodename>`` driver.

    The generated function normalizes the input to a 3-D view (axis to
    scan mapped onto X), launches ``k_blockCumOp`` over tiles bounded by
    the device grid limits, recursively scans the per-block partial sums
    when more than one block is needed, and handles an odd-length axis
    with a final ``k_cumadd`` call.
    """
    # NOTE(review): the early `return -1` after the `blockCumOp call
    # failed` / `finalCumOp call failed` / `cumadd call failed` errors
    # skip the Py_DECREF of `deviceBlockSum`, leaking a reference; only
    # the recursive-failure path and the success path release it.
    code = (
        """
int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
size_t shape[3] = { 1, 1, 1 };
ssize_t inputStrides_x;
ssize_t inputStrides_y;
ssize_t inputStrides_z;
ssize_t outputStrides_x;
ssize_t outputStrides_y;
ssize_t outputStrides_z;
switch (PyGpuArray_NDIM(input))
{
case 1:
shape[0] = PyGpuArray_DIMS(input)[0];
inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
break;
case 2:
shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = PyGpuArray_DIMS(input)[1];
inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
break;
case 3:
shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = PyGpuArray_DIMS(input)[1];
shape[2] = PyGpuArray_DIMS(input)[2];
inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float);
outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float);
break;
default:
PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
return -1;
}
if (shape[axis] <= 1) {
int err = pygpu_move(output, input);
return err;
}
// Perform cum op on array of even size.
size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2);
// Determine how many elements can be processed in one block.
size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2;
// Determine how many blocks are needed in total.
size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX); // Nb. of blocks needed per cum op.
size_t dimGridY; // Nb. of independent cum ops (width).
size_t dimGridZ; // Nb. of independent cum ops (height).
ssize_t tmp;
switch (axis)
{
case 0:
dimGridY = shape[1];
dimGridZ = shape[2];
break;
case 1:
dimGridY = shape[0];
dimGridZ = shape[2];
tmp = inputStrides_x;
inputStrides_x = inputStrides_y;
inputStrides_y = tmp;
tmp = outputStrides_x;
outputStrides_x = outputStrides_y;
outputStrides_y = tmp;
break;
case 2:
dimGridY = shape[1];
dimGridZ = shape[0];
tmp = inputStrides_x;
inputStrides_x = inputStrides_z;
inputStrides_z = tmp;
tmp = outputStrides_x;
outputStrides_x = outputStrides_z;
outputStrides_z = tmp;
break;
default:
PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
return -1;
}
const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode,
GA_C_ORDER, input->context, Py_None);
if (deviceBlockSum == NULL){
return -1;
}
// Perform `maxGridY`*`maxGridZ` cum ops in parallel.
for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);
for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
return -1;
}
if (dimGridX > 1) {
// Do a cum op over the blockSum (recursive).
if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
Py_DECREF(deviceBlockSum);
return -1;
}
// Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
return -1;
}
}
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != nbElementsPerCumOp){
size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {1, 1, 1};
int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
return -1;
}
}
}
}
Py_XDECREF(deviceBlockSum);
return 0;
}
"""
        % locals()
    )
    return super().c_support_code_struct(node, nodename) + code
# GpuCumsumOp exists only to serve backward compatibility.
# Once an object is created, it will be converted to CumOp object.
class GpuCumsumOp(GpuKernelBaseCOp, _NoPythonOp):
    """Backward-compatibility shim for the old cumsum-only GPU Op.

    Instantiating this class never yields a ``GpuCumsumOp``: ``__new__``
    constructs a ``GpuCumOp`` in "add" mode instead, so unpickling or
    legacy code paths transparently get the generalized Op.
    """

    SUPPORTED_NDIMS = 3
    __props__ = ("axis",)

    def __new__(typ, *args, **kwargs):
        # Build a GpuCumOp rather than an instance of this class and
        # pre-select the additive mode that cumsum corresponds to.
        instance = object.__new__(GpuCumOp, *args, **kwargs)
        instance.mode = "add"
        return instance
@register_opt("fast_compile")
@op_lifter([CumOp])
@register_opt2([CumOp], "fast_compile")
def local_gpua_cumop(op, ctx_name, inputs, outputs):
if inputs[0].dtype != "float32":
return False
axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumOp.SUPPORTED_NDIMS:
return False
x = as_gpuarray_variable(x, ctx_name)
if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumop`` assume array has been flattened if needed.
if axis is None:
axis = 0
return GpuCumOp(axis, op.mode)(x)
import numpy as np
from aesara.gpuarray.basic_ops import (
as_gpuarray_variable,
gpu_contiguous,
infer_context_name,
)
from aesara.gpuarray.opt import op_lifter, register_opt, register_opt2
from aesara.gpuarray.type import GpuArrayType
from aesara.gradient import DisconnectedType
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.tensor.basic import as_tensor_variable
from aesara.tensor.fft import IRFFTOp
from aesara.tensor.math import sqrt
from aesara.tensor.subtensor import set_subtensor
from aesara.tensor.type import integer_dtypes
try:
import pygpu
pygpu_available = True
except ImportError:
pygpu_available = False
try:
import pycuda.driver
pycuda_available = True
except ImportError:
pycuda_available = False
try:
import skcuda
from skcuda import fft
skcuda_available = True
except Exception:
skcuda_available = False
class CuRFFTOp(_NoPythonOp):
    """Forward real-to-complex FFT on the GPU via scikit-cuda/cuFFT.

    Input is a float32 GPU array of shape (m, ..., n); the output adds a
    trailing length-2 axis holding the real/imaginary parts of the
    n//2+1 non-redundant coefficients.
    """

    __props__ = ()

    def output_type(self, inp):
        """Return the output type: same dtype/context, one extra dim."""
        # add one extra dim for real/imag
        return GpuArrayType(
            inp.dtype,
            broadcastable=[False] * (inp.type.ndim + 1),
            context_name=inp.type.context_name,
        )

    def make_node(self, inp, s=None):
        """Build the Apply node; ``s`` is the transform shape (no padding)."""
        # A shape parameter s can be provided as an input. For now this is used to
        # manage odd transform sizes.
        # Later this could be extended to handle padding and trunkation,
        # following numpy's interface. However, cuFFT expects array that match
        # the shape given to the plan, so padding will have to be done in the op.
        # The effect of padding on gradients has yet to be investigated.
        # NOTE(review): the error messages below say "CuFFTOp" although this
        # class is named CuRFFTOp — probably a leftover from an earlier name.
        if not skcuda_available:
            raise RuntimeError("skcuda is needed for CuFFTOp")
        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuFFTOp")
        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuFFTOp")
        inp = gpu_contiguous(as_gpuarray_variable(inp, infer_context_name(inp)))
        # If no shape is provided as input, default to input data shape.
        if s is None:
            s = inp.shape[1:]
        s = as_tensor_variable(s)
        assert inp.dtype == "float32"
        assert s.ndim == 1
        assert s.dtype in integer_dtypes
        return Apply(self, [inp, s], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        """Return a thunk that plans (lazily) and runs the cuFFT forward FFT."""
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        # Initialize cuda context to the input's.
        with node.inputs[0].type.context:
            skcuda.misc.init()
        # One-element lists act as mutable cells shared with the closure,
        # caching the cuFFT plan across calls with the same input shape.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]
            # Since padding is not supported, assert s matches input shape.
            assert (input_shape[1:] == s).all()
            # construct output shape
            output_shape = [input_shape[0]] + list(s)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)
            z = outputs[0]
            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(
                    output_shape, context=inputs[0][0].context, dtype="float32"
                )
            input_pycuda = inputs[0][0]
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out skcuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = z[0]
            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(
                        s, np.float32, np.complex64, batch=input_shape[0]
                    )
                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()
                fft.fft(input_pycuda, output_pycuda, plan[0])
                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk

    def grad(self, inputs, output_grads):
        """Gradient via the inverse transform, with symmetry correction."""
        (gout,) = output_grads
        s = inputs[1]
        # Divide the last dimension of the output gradients by 2, they are
        # double-counted by the real-IFFT due to symmetry, except the first
        # and last elements (for even transforms) which are unique.
        idx = (
            [slice(None)] * (gout.ndim - 2)
            + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
            + [slice(None)]
        )
        gout = set_subtensor(gout[idx], gout[idx] * 0.5)
        return [cuirfft_op(gout, s), DisconnectedType()()]

    def connection_pattern(self, node):
        """The shape input is disconnected from the output."""
        # Specify that shape input parameter has no connection to graph and gradients.
        return [[True], [False]]


# Module-level singleton used by `curfft` and by CuIRFFTOp.grad.
curfft_op = CuRFFTOp()
class CuIRFFTOp(_NoPythonOp):
    """Inverse complex-to-real FFT on the GPU via scikit-cuda/cuFFT.

    Input is a float32 GPU array of shape (m, ..., n//2+1, 2) whose
    trailing axis holds real/imaginary parts; output drops that axis and
    yields a real array of shape (m, ..., n). The result is NOT rescaled
    here (see the note in the thunk); `cuirfft` applies normalization.
    """

    __props__ = ()

    def output_type(self, inp):
        """Return the output type: same dtype/context, one fewer dim."""
        # remove extra dim for real/imag
        return GpuArrayType(
            inp.dtype,
            broadcastable=[False] * (inp.type.ndim - 1),
            context_name=inp.type.context_name,
        )

    def make_node(self, inp, s=None):
        """Build the Apply node; ``s`` is the real output shape (no padding)."""
        # A shape parameter is expected as an input. For now this is used to
        # manage odd transform sizes.
        # Later this could be extended to handle padding and trunkation,
        # following numpy's interface. However, cuFFT expects array that match
        # the shape given to the plan, so padding will have to be done in the op.
        # The effect of padding on gradients has yet to be investigated.
        # NOTE(review): the error messages below say "CuIFFTOp" although this
        # class is named CuIRFFTOp — probably a leftover from an earlier name.
        if not skcuda_available:
            raise RuntimeError("skcuda is needed for CuIFFTOp")
        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuIFFTOp")
        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuIFFTOp")
        inp = gpu_contiguous(as_gpuarray_variable(inp, infer_context_name(inp)))
        # If no shape is provided as input, calculate shape assuming even real transform.
        if s is None:
            s = inp.shape[1:-1]
            s = set_subtensor(s[-1], (s[-1] - 1) * 2)
        s = as_tensor_variable(s)
        assert inp.dtype == "float32"
        assert s.ndim == 1
        return Apply(self, [inp, s], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        """Return a thunk that plans (lazily) and runs the cuFFT inverse FFT."""
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        # Initialize cuda context to the input's.
        with node.inputs[0].type.context:
            skcuda.misc.init()
        # One-element lists act as mutable cells shared with the closure,
        # caching the cuFFT plan across calls with the same input shape.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]
            # Since padding is not supported, assert that last dimension corresponds to
            # input forward transform size.
            assert (input_shape[1:-2] == s[:-1]).all()
            assert ((input_shape[-2] - 1) * 2 + s[-1] % 2 == s[-1]).all()
            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = [input_shape[0]] + list(s)
            output_shape = tuple(output_shape)
            z = outputs[0]
            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(
                    output_shape, context=inputs[0][0].context, dtype="float32"
                )
            input_pycuda = inputs[0][0]
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by skcuda as a complex64
            # array instead.
            output_pycuda = z[0]
            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(
                        s, np.complex64, np.float32, batch=output_shape[0]
                    )
                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()
                fft.ifft(input_pycuda, output_pycuda, plan[0])
                # strangely enough, enabling rescaling here makes it run
                # very, very slowly, so do this rescaling manually
                # afterwards!
                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk

    def grad(self, inputs, output_grads):
        """Gradient via the forward transform, with symmetry correction."""
        (gout,) = output_grads
        s = inputs[1]
        gf = curfft_op(gout, s)
        # Multiply the last dimension of the gradient by 2, they represent
        # both positive and negative frequencies, except the first
        # and last elements (for even transforms) which are unique.
        idx = (
            [slice(None)] * (gf.ndim - 2)
            + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
            + [slice(None)]
        )
        gf = set_subtensor(gf[idx], gf[idx] * 2)
        return [gf, DisconnectedType()()]

    def connection_pattern(self, node):
        """The shape input is disconnected from the output."""
        # Specify that shape input parameter has no connection to graph and gradients.
        return [[True], [False]]


# Module-level singleton used by `cuirfft` and by CuRFFTOp.grad.
cuirfft_op = CuIRFFTOp()
def curfft(inp, norm=None):
    r"""
    Fast Fourier transform of a real-valued input, computed on the GPU.

    The input must be a real-valued float32 variable of dimensions
    (m, ..., n); m FFTs of size (..., n) are performed. The result is a
    GpuArray of dimensions (m, ..., n//2+1, 2): the second-to-last axis
    holds the n//2+1 non-trivial coefficients and the last axis holds
    their real and imaginary parts as float32.

    Parameters
    ----------
    inp
        Real-valued float32 array of size (m, ..., n): m inputs of
        size (..., n).
    norm : {None, 'ortho', 'no_norm'}
        Normalization of the transform. As in numpy, *None* (default)
        normalizes only the inverse transform by n, 'ortho' gives the
        unitary transform (:math:`1/\sqrt n` both ways), and 'no_norm'
        leaves the transform unnormalized.
    """
    transform_shape = inp.shape[1:]
    # 'ortho' is the only mode that scales the forward transform.
    if _unitary(norm) == "ortho":
        scaling = sqrt(transform_shape.prod().astype("float32"))
    else:
        scaling = 1
    return curfft_op(inp, transform_shape) / scaling
def cuirfft(inp, norm=None, is_odd=False):
    r"""
    Inverse fast Fourier transform with real-valued output, on the GPU.

    The input is a variable of dimensions (m, ..., n//2+1, 2) of type
    float32, holding the non-trivial elements of m real-valued Fourier
    transforms of initial size (..., n), real and imaginary parts stored
    as a pair of float32 arrays. The output is a real-valued float32
    variable of dimensions (m, ..., n) giving the m inverse FFTs.

    Parameters
    ----------
    inp
        Array of float32 of size (m, ..., n//2+1, 2), containing m inputs
        with n//2+1 non-trivial elements on the last dimension and real
        and imaginary parts stored as separate arrays.
    norm : {None, 'ortho', 'no_norm'}
        Normalization of transform. Following numpy, default *None* normalizes
        only the inverse transform by n, 'ortho' yields the unitary transform
        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
        the transform unnormalized.
    is_odd : {True, False}
        Set to True to get a real inverse transform output with an odd last
        dimension of length (N-1)*2 + 1 for an input last dimension of
        length N.

    Raises
    ------
    ValueError
        If `is_odd` is not a boolean or `norm` is not a recognized mode.
    """
    # Validate before touching `inp`, so a bad flag fails fast.
    if is_odd not in (True, False):
        # Fixed: the message previously referred to a nonexistent "id_odd".
        raise ValueError(f"Invalid value {is_odd} for is_odd, must be True or False")
    # Recover the real transform length n from the n//2+1 stored coefficients.
    s = inp.shape[1:-1]
    if is_odd:
        s = set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
    else:
        s = set_subtensor(s[-1], (s[-1] - 1) * 2)
    cond_norm = _unitary(norm)
    scaling = 1
    # The kernel does not rescale (see CuIRFFTOp.make_thunk), so apply the
    # numpy-style normalization here.
    if cond_norm is None:
        scaling = s.prod().astype("float32")
    elif cond_norm == "ortho":
        scaling = sqrt(s.prod().astype("float32"))
    return cuirfft_op(inp, s) / scaling
def _unitary(norm):
if norm not in (None, "ortho", "no_norm"):
raise ValueError(
f"Invalid value {norm} for norm, must be None, 'ortho' or 'no norm'"
)
return norm
if skcuda_available:

    @register_opt("fast_compile")
    @op_lifter([IRFFTOp])
    @register_opt2([IRFFTOp], "fast_compile")
    def local_gpua_curfft_op(op, ctx_name, inputs, outputs):
        # NOTE(review): this lifter is registered for IRFFTOp — the very same
        # op as local_gpua_cuirfft_op below — yet it returns the *forward*
        # transform op. It presumably should target RFFTOp instead; confirm
        # before relying on either registration.
        return curfft_op

    @register_opt("fast_compile")
    @op_lifter([IRFFTOp])
    @register_opt2([IRFFTOp], "fast_compile")
    def local_gpua_cuirfft_op(op, ctx_name, inputs, outputs):
        # Replace a host IRFFTOp with its GPU (cuFFT-backed) implementation.
        return cuirfft_op
def work_dtype(dtype):
    """
    Return the data type to use for intermediate (working) memory.

    float16 storage is accumulated in float32; every other dtype is used
    unchanged.
    """
    return "float32" if dtype == "float16" else dtype
def load_w(dtype):
    """
    Return the name of the conversion function to apply when loading data.

    float16 values must be widened with ``ga_half2float``; other dtypes
    need no wrapper, so the empty string is returned.

    Typical usage::

        code = '%s(ival)' % (load_w(input_type),)
    """
    if dtype != "float16":
        return ""
    return "ga_half2float"
def write_w(dtype):
    """
    Return the name of the conversion function to apply when storing data.

    float16 values must be narrowed with ``ga_float2half``; other dtypes
    need no wrapper, so the empty string is returned.

    Typical usage::

        code = 'res = %s(oval)' % (write_w(output_type),)
    """
    if dtype != "float16":
        return ""
    return "ga_float2half"
"""
Helper routines for generating gpu kernels for nvcc.
"""
try:
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
    """
    Return the C source of a kernel function named `name`.

    Parameters
    ----------
    name
        Name given to the generated KERNEL function.
    params
        The parameters to the function as one or more strings; they are
        joined with commas into the signature.
    body
        The [nested] list of statements for the body of the function.
        Nested lists/tuples are flattened one level, and the statements
        are joined with ';' characters.
    """
    signature_args = ", ".join(params)
    # Flatten one level of nesting so callers can group related statements.
    statements = []
    for entry in body:
        if isinstance(entry, (list, tuple)):
            statements.extend(entry)
        else:
            statements.append(entry)
    joined = ";\n".join(statements)
    return (
        '#include "cluda.h"\n'
        f"KERNEL void {name} ({signature_args})\n"
        "{\n"
        f"{joined};\n"
        "}\n"
    )
def code_version(version):
    """
    Decorator attaching a cache-version tuple to a code generator.

    The wrapped function gains a ``code_version`` attribute used by the
    compilation-cache machinery; `version` must be a tuple.
    """
    if isinstance(version, tuple):

        def _attach(fn):
            fn.code_version = version
            return fn

        return _attach
    raise TypeError("version must be tuple", version)
# Empty version tuple: marks generated code that carries no cache version.
UNVERSIONED = ()
@code_version((2,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """
    Return C++ code reducing a contiguous buffer in shared memory.

    The answer is left in position 0 of the buffer; the rest of the
    buffer is trashed.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer (should live in GPU shared memory, it is read many
        times).
    pos
        Index of executing thread.
    count
        Number of executing threads.
    manner_fn
        A function taking two argument strings ``a`` and ``b`` and
        returning the C code combining them — e.g.
        ``lambda a, b: f"{a} + {b}"`` for a sum reduction.
    """
    strided_combine = manner_fn(f"{buf}[{pos}]", f"{buf}[i]")
    tree_combine = manner_fn(f"{buf}[{pos}]", f"{buf}[{pos}+_n]")
    template = """
{
// This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0].
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
"""
    return template % {
        "N": N,
        "buf": buf,
        "pos": pos,
        "loop_line": strided_combine,
        "r_n": tree_combine,
    }
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """Shared-memory reduction taking the elementwise maximum."""
    combine = lambda lhs, rhs: f"max({lhs}, {rhs})"
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """Shared-memory reduction computing the sum."""
    combine = lambda lhs, rhs: f"{lhs} + {rhs}"
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """Shared-memory reduction taking the elementwise minimum."""
    combine = lambda lhs, rhs: f"min({lhs}, {rhs})"
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """Shared-memory reduction computing the product."""
    combine = lambda lhs, rhs: f"{lhs} * {rhs}"
    return inline_reduce(N, buf, pos, count, combine)
@code_version((2,) + inline_reduce_max.code_version + inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
    Generate CUDA statements computing a softmax over a shared buffer.

    On entry, `buf` and `buf2` must hold two identical copies of the
    softmax input. On return, `buf` holds the softmax and `buf2` the
    un-normalized softmax.

    Parameters
    ----------
    N
        Length of the buffer.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    dtype
        Dtype of the softmax's output.

    Notes
    -----
    `buf` and `buf2` should be in GPU shared memory — they are accessed
    many times. The generated code uses ``__i`` as its loop variable.
    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    loop_head = f"for(int __i={threadPos}; __i<{N}; __i+={threadCount}){{"
    # Stage 1: row maximum into buf[0] (trashes the rest of buf).
    statements = [inline_reduce_max(N, buf, threadPos, threadCount)]
    statements += [
        "__syncthreads()",
        f"{ctype} row_max = {buf}[0]",
        "__syncthreads()",
        # Stage 2: exponentiate shifted values.
        loop_head,
        f"{buf}[__i] = exp({buf2}[__i] - row_max)",
        f"{buf}[__i] = {buf}[__i]",
        "}",
        "__syncthreads()",
        # Stage 3: row sum of the exponentials.
        inline_reduce_sum(N, buf, threadPos, threadCount),
        "__syncthreads()",
        f"{ctype} row_sum = {buf}[0]",
        "__syncthreads()",
        # Stage 4: normalize each exp() result by the sum.
        loop_head,
        f"{buf}[__i] = {buf2}[__i] / row_sum",
        "}",
        "__syncthreads()",
    ]
    return statements
@code_version((3,))
def inline_reduce_fixed_shared(
    N,
    buf,
    x,
    stride_x,
    load_x,
    pos,
    count,
    manner_fn,
    manner_init,
    b="",
    stride_b="",
    load_b="",
    dtype="float32",
):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer of size warpSize * sizeof(dtype).
    x
        Input data.
    stride_x
        Input data stride.
    load_x
        Wrapper to read from x.
    pos
        Index of executing thread.
    count
        Number of executing threads.
    manner_fn
        A function that accepts strings of arguments a and b, and
        returns c code for their reduction.
        return "%(a)s + %(b)s"
        for a sum reduction.
    manner_init
        A function that accepts strings of arguments a and return c
        code for its initialization.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the output.

    Notes
    -----
    `buf` should be in gpu shared memory, we access it many times.
    """
    # When a bias is supplied, each loaded element is x[i] + b[i]; both
    # reads go through their dtype-specific load wrappers.
    if b:
        init = manner_init(
            "%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
            " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals()
        )
        loop_line = manner_fn(
            "red",
            manner_init(
                "%(load_x)s(%(x)s[i * %(stride_x)s]) + "
                "%(load_b)s(%(b)s[i * %(stride_b)s])" % locals()
            ),
        )
    else:
        init = manner_init(f"{load_x}({x}[{pos} * {stride_x}])")
        loop_line = manner_fn(
            "red",
            manner_init(f"{load_x}({x}[i * {stride_x}])"),
        )
    # Combining expressions for the shared-memory phases of the reduction.
    loop_line2 = manner_fn(f"{buf}[{pos}]", f"{buf}[i]")
    r_n = manner_fn(f"{buf}[{pos}]", f"{buf}[{pos}+_n]")
    ctype = gpuarray.dtype_to_ctype(dtype)
    return (
        """
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s) {
red = %(loop_line)s;
}
buf[%(pos)s] = red;
__syncthreads();
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) {
%(buf)s[%(pos)s] = %(loop_line2)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
"""
        % locals()
    )
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(
    N,
    buf,
    x,
    stride_x,
    load_x,
    pos,
    count,
    b="",
    stride_b="",
    load_b="",
    dtype="float32",
):
    """Fixed-shared-memory reduction specialized to the maximum.

    Thin wrapper around :func:`inline_reduce_fixed_shared` whose combine
    step is ``max(a, b)`` and whose per-element initialization is the
    identity.
    """
    combine = lambda lhs, rhs: f"max({lhs}, {rhs})"
    identity = lambda expr: expr
    return inline_reduce_fixed_shared(
        N,
        buf,
        x,
        stride_x,
        load_x,
        pos,
        count,
        combine,
        identity,
        b,
        stride_b,
        load_b,
        dtype,
    )
@code_version((2,) + inline_reduce_max.code_version + inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(
    N,
    buf,
    x,
    stride_x,
    load_x,
    sm,
    sm_stride,
    write_sm,
    threadPos,
    threadCount,
    b="",
    stride_b="",
    load_b="",
    dtype="float32",
):
    """
    Generate code to perform softmax with a fixed amount of shared
    memory.

    On entry, `buf` is assumed to be empty. On exit, `buf[0]` holds the
    last reduction result and the softmax rows have been written to `sm`.

    Parameters
    ----------
    N
        Length of the buffer, at least warpSize(32).
    buf
        A shared memory buffer of size warpSize * sizeof(dtype).
    x
        A ptr to the gpu memory where the row is stored.
    stride_x
        The stride between each element in x.
    load_x
        Wrapper to read from x.
    sm
        A ptr to the gpu memory to store the result.
    sm_stride
        The stride between each sm element.
    write_sm
        Wrapper before writing to sm.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the softmax's output if not float32.

    Notes
    -----
    `buf` should be in gpu shared memory, we access it many times.
    We use tx as an int variable in a loop.
    """
    # NOTE(review): the version tuple above references inline_reduce_max /
    # inline_reduce_sum although this helper calls the fixed-shared
    # variants — confirm the cache-version intent.
    ctype = gpuarray.dtype_to_ctype(dtype)
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(
            N,
            buf,
            x,
            stride_x,
            load_x,
            threadPos,
            threadCount,
            b,
            stride_b,
            load_b,
            dtype,
        ),
        "__syncthreads()",
        f"{ctype} row_max = {buf}[0]",
        "__syncthreads()",
        # Sum of exp(x - row_max), re-reading x from global memory.
        inline_reduce_fixed_shared(
            N,
            buf,
            x,
            stride_x,
            load_x,
            threadPos,
            threadCount,
            lambda a, b: f"{a} + {b}",
            lambda a: f"exp({a} - row_max)",
            b,
            stride_b,
            load_b,
            dtype,
        ),
        "__syncthreads()",
        f"{ctype} row_sum = {buf}[0]",
        "__syncthreads()",
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
    ]
    # This set all value correctly
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
            " %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
            " / row_sum)" % locals()
        ]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
            " / row_sum)" % locals()
        ]
    ret += [
        "}",
        "__syncthreads()",
    ]
    return ret
import warnings
import numpy as np
import pkg_resources
from numpy.linalg.linalg import LinAlgError
from aesara.configdefaults import config
from aesara.gpuarray.basic_ops import (
CGpuKernelBase,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
from aesara.graph.basic import Apply
from aesara.graph.op import Op
from aesara.link.c.op import ExternalCOp
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor import basic as at
from aesara.tensor import math as tm
# Optional-dependency probes: each GPU backend (pygpu/libgpuarray, scikit-cuda's
# cusolver and cublas wrappers) may be absent, so availability is recorded in
# module-level flags that the Ops below consult in `make_node`.
try:
    import pygpu
    from pygpu.basic import tril, triu

    pygpu_available = True
except ImportError:
    pygpu_available = False

cusolver_available = False
try:
    import skcuda
    from skcuda import cusolver

    cusolver_available = True
# skcuda can fail in several ways at import time (missing CUDA libs raise
# OSError/RuntimeError, packaging problems raise DistributionNotFound).
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

cublas_available = False
try:
    from skcuda import cublas

    cublas_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass
if cusolver_available:
    # Add cusolver call as it is missing in skcuda
    # SPOTRS
    # Declare the ctypes signature for the single-precision potrs entry point
    # so it can be called through skcuda's loaded cusolver library handle.
    cusolver._libcusolver.cusolverDnSpotrs.restype = int
    cusolver._libcusolver.cusolverDnSpotrs.argtypes = [
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
    ]

    def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo):
        """
        Solve real single precision linear system for hermitian matrices.

        References
        ----------
        `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_

        """
        status = cusolver._libcusolver.cusolverDnSpotrs(
            handle, uplo, n, nrhs, int(A), lda, int(B), ldb, int(devInfo)
        )
        cusolver.cusolverCheckStatus(status)

    # DPOTRS
    # TODO: Are they still missing in skucda?
    cusolver._libcusolver.cusolverDnDpotrs.restype = int
    cusolver._libcusolver.cusolverDnDpotrs.argtypes = [
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
    ]

    def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo):
        """Double precision counterpart of `cusolverDnSpotrs`."""
        status = cusolver._libcusolver.cusolverDnDpotrs(
            handle, uplo, n, nrhs, int(A), lda, int(B), ldb, int(devInfo)
        )
        cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx):
    """Lazily attach a cuSOLVER dense handle to the GPU context *ctx*.

    The handle is created on first use and cached as ``ctx.cusolver_handle``;
    later calls are no-ops.
    """
    if getattr(ctx, "cusolver_handle", None) is None:
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()
def attach_cublas_handle_to_context(ctx):
    """Lazily attach a cuBLAS handle to the GPU context *ctx*.

    The handle is created on first use and cached as ``ctx.cublas_handle``;
    later calls are no-ops.
    """
    if getattr(ctx, "cublas_handle", None) is None:
        with ctx:
            ctx.cublas_handle = cublas.cublasCreate()
# Matrix structures accepted by GpuCusolverSolve / gpu_solve; it is a subset
# of all cases available in slinalg's MATRIX_STRUCTURE.
MATRIX_STRUCTURES_SOLVE = (
    "general",
    "symmetric",
    "lower_triangular",
    "upper_triangular",
)
class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.

    Solves ``op(A) x = b`` on the GPU: a Cholesky factorization (potrf/potrs)
    is used when ``A_structure == "symmetric"``, an LU factorization
    (getrf/getrs) otherwise.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.

    """

    __props__ = ("A_structure", "trans", "inplace")

    def __init__(self, A_structure="general", trans="N", inplace=False):
        self.trans = trans
        self.inplace = inplace
        self.A_structure = A_structure
        if self.inplace:
            # The factorization overwrites input 0 (A) in place.
            self.destroy_map = {0: [0]}
        assert A_structure in MATRIX_STRUCTURES_SOLVE
        super().__init__()

    def make_node(self, inp1, inp2):
        # inp1 is the matrix A, inp2 the right-hand side b; both must be
        # 2-d, contiguous, share a context and a dtype.
        if not cusolver_available:
            raise RuntimeError(
                "CUSOLVER is not available and "
                "GpuCusolverSolve Op can not be constructed."
            )
        if skcuda.__version__ <= "0.5.1":
            warnings.warn(
                "The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8"
            )
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == inp2.dtype

        return Apply(
            self,
            [inp1, inp2],
            [
                GpuArrayType(
                    inp1.dtype,
                    broadcastable=inp1.broadcastable,
                    context_name=context_name,
                )()
            ],
        )

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Ensure the node's GPU context carries a cuSOLVER handle.
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def check_dev_info(self, dev_info):
        # cuSOLVER writes a positive value into devInfo on failure.
        val = np.asarray(dev_info)[0]
        if val > 0:
            raise LinAlgError("A is singular")

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) == 2

        if self.trans in ("T", "C"):
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == "N":
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError("Invalid value for trans")
        if l != n:
            raise ValueError("A must be a square matrix")
        if n != k:
            raise ValueError("A and b must be aligned.")

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order="F")
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags["C_CONTIGUOUS"]:
            trans = 1 - trans

        if A.dtype == "float32":
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == "float64":
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == "symmetric":
            # Cholesky path: factor with potrf, then solve with potrs.
            with context:
                workspace_size = potrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda
                )

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype, context=context)

            dev_info = pygpu.zeros((1,), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle,
                    0,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    workspace_size,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle,
                    0,
                    n,
                    m,
                    A_ptr,
                    lda,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )
        else:
            # general case for A: LU factor with getrf, solve with getrs.
            with context:
                workspace_size = getrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda
                )

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype, context=context)

            pivots = pygpu.zeros(n, dtype="int32", context=context)

            dev_info = pygpu.zeros((1,), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle,
                    n,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    pivots_ptr,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle,
                    trans,
                    n,
                    m,
                    A_ptr,
                    lda,
                    pivots_ptr,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )

        # b was overwritten with the solution by cuSOLVER.
        z[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from aesara/tensor/slinalg.py
        # Gradient of a solve: b_bar solves the transposed system,
        # A_bar = -b_bar c^T.
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]
        # FIXME: triangular structure would use GpuCublasTriangularsolve?
        # no need to handle A_structure like slinalg.py?
        trans_solve_op = GpuCusolverSolve("general")
        b_bar = trans_solve_op(A.T, c_bar)
        A_bar = -tm.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
        return [A_bar, b_bar]
class GpuCublasTriangularSolve(Op):
    """
    CUBLAS GPU Triangular Solve Op.

    Solves ``op(A) x = b`` for a triangular matrix ``A`` with cuBLAS
    trsv (vector right-hand side) or trsm (matrix right-hand side).

    Parameters
    ----------
    lower
        Whether system is lower-triangular (True) or upper-triangular (False).
    trans
        Whether to take the transpose of the input matrix or not.

    """

    __props__ = ("trans", "lower")

    def __init__(self, lower=True, trans="N"):
        self.trans = trans
        self.lower = lower
        super().__init__()

    def make_node(self, inp1, inp2):
        # inp1 is the triangular matrix A (2-d); inp2 the right-hand side b
        # (1-d or 2-d). Both are made contiguous on the same context.
        if not cublas_available:
            raise RuntimeError(
                "CUBLAS is not available and "
                "GpuCublasTriangularSolve Op "
                "can not be constructed."
            )
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim in (1, 2)
        assert inp1.dtype == inp2.dtype

        return Apply(
            self,
            [inp1, inp2],
            [
                GpuArrayType(
                    inp1.dtype,
                    broadcastable=inp2.broadcastable,
                    context_name=context_name,
                )()
            ],
        )

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Ensure the node's GPU context carries a cuBLAS handle.
        ctx = node.inputs[0].type.context
        attach_cublas_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Solution set
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # right hand side
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) in (1, 2)

        # implicitly deal with the difference between C order
        # and fortran order by flipping the trans and lower flags
        lower = not self.lower
        trans = self.trans
        if trans in ("T", "C"):
            trans = "N"
            l, n = A.shape
        elif trans == "N":
            trans = "T"
            n, l = A.shape
        else:
            raise ValueError("Invalid value for trans")

        if b.ndim == 2:
            k, m = b.shape
        else:
            (k,) = b.shape
            m = 1

        if l != n:
            raise ValueError("A must be a square matrix")
        if n != k:
            raise ValueError("A and b must be aligned.")

        lda = max(1, n)
        ldb = max(1, k)

        # solution overwrites right hand side on exit
        b = pygpu.array(b, copy=True, order="F")

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # unit scalar used for multiplication
        alpha = 1.0
        # indicates matrix A is on left of B
        side = "l"
        # set whether upper or lower part of matrix A stored
        uplo = "l" if lower else "u"
        # indicates elements on diagonal of matrix A may not be unity
        diag = "n"

        if A.dtype == "float32":
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm
        elif A.dtype == "float64":
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
        else:
            raise ValueError("Unsupported dtype")

        with ctx:
            if b.ndim == 1:
                # matrix vector solve
                trsv(ctx.cublas_handle, uplo, trans, diag, n, A_ptr, lda, b_ptr, 1)
            else:
                trsm(
                    ctx.cublas_handle,
                    side,
                    uplo,
                    trans,
                    diag,
                    n,
                    m,
                    alpha,
                    A_ptr,
                    lda,
                    b_ptr,
                    ldb,
                )

        x[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from aesara/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]

        # The backward solve works on A.T, hence the flipped `lower`.
        trans_solve_op = GpuCublasTriangularSolve(not self.lower)
        b_bar = trans_solve_op(A.T, c_bar)

        A_bar = -tm.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

        # Only the stored triangle of A contributes to the gradient.
        if self.lower:
            A_bar = at.tril(A_bar)
        else:
            A_bar = at.triu(A_bar)
        return [A_bar, b_bar]
def gpu_solve(A, b, A_structure="general", trans="N"):
    """Solve ``op(A) x = b`` on the GPU, dispatching on A's structure.

    Triangular structures are routed to cuBLAS; every other structure
    goes through the cuSOLVER-based solver.
    """
    triangular_lower = {"lower": True, "upper": False}
    if A_structure in triangular_lower:
        op = GpuCublasTriangularSolve(triangular_lower[A_structure], trans)
        return op(A, b)
    return GpuCusolverSolve(A_structure, trans)(A, b)
def gpu_solve_lower_triangular(A, b, trans="N"):
    """Solve the lower-triangular system ``op(A) x = b`` on the GPU."""
    op = GpuCublasTriangularSolve(True, trans)
    return op(A, b)
def gpu_solve_upper_triangular(A, b, trans="N"):
    """Solve the upper-triangular system ``op(A) x = b`` on the GPU."""
    op = GpuCublasTriangularSolve(False, trans)
    return op(A, b)
class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive definite matrix `A` returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
    if `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.

    """

    __props__ = ("lower", "inplace")

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            # The factorization overwrites the input buffer in place.
            self.destroy_map = {0: [0]}
        super().__init__()

    def clone_inplace(self):
        # Hook for the inplace-optimization pass to substitute a
        # destructive version of this Op.
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError(
                "CUSOLVER is not available and "
                "GpuCholesky Op can not be constructed."
            )
        if skcuda.__version__ <= "0.5.1":
            warnings.warn(
                "The GpuCholesky op requires scikit-cuda > " "0.5.1 to work with CUDA 8"
            )
        if not pygpu_available:
            raise RuntimeError(
                "Missing pygpu or triu/tril functions." "Install or update libgpuarray."
            )
        context_name = infer_context_name(inp)

        inp = as_gpuarray_variable(inp, context_name)

        inp = gpu_contiguous(inp)

        assert inp.ndim == 2

        return Apply(self, [inp], [inp.type()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Ensure the node's GPU context carries a cuSOLVER handle.
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError("A must be a square matrix")

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Aesara) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags["C_CONTIGUOUS"]:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        if A.dtype == "float32":
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
        elif A.dtype == "float64":
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
        else:
            raise ValueError("Unsupported dtype")

        with context:
            workspace_size = potrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda
            )

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype, context=context)

            dev_info = pygpu.zeros((1,), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            potrf(
                context.cusolver_handle,
                l_parameter,
                n,
                L_ptr,
                lda,
                workspace_ptr,
                workspace_size,
                dev_info_ptr,
            )

            # Positive devInfo means the leading minor of that order is
            # not positive definite.
            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError("Cholesky decomposition failed (is A SPD?)")

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L

    def L_op(self, inputs, outputs, gradients):
        # Modified from aesara/tensor/slinalg.py
        # No handling for on_error = 'nan'
        dz = gradients[0]
        chol_x = outputs[0]

        # this is for nan mode
        #
        # ok = ~tm.any(tm.isnan(chol_x))
        # chol_x = at.switch(ok, chol_x, 1)
        # dz = at.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return at.tril(mtx) - at.diag(at.diagonal(mtx) / 2.0)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            return gpu_solve_upper_triangular(
                outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T
            )

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz))
        )

        if self.lower:
            grad = at.tril(s + s.T) - at.diag(at.diagonal(s))
        else:
            grad = at.triu(s + s.T) - at.diag(at.diagonal(s))
        return [grad]
def gpu_cholesky(A, lower=True):
    """Return the Cholesky factor of `A`, computed on the GPU via cuSOLVER."""
    op = GpuCholesky(lower)
    return op(A)
# TODO: add support for float64
class GpuMagmaBase(ExternalCOp):
    """Common plumbing for magma-backed Ops.

    Supplies the headers, include directories, libraries and library search
    paths needed to compile against magma, and makes sure magma is
    initialized exactly once per GPU context before a node runs.
    """

    def c_headers(self, **kwargs):
        headers = [
            "gpuarray/types.h",
            "gpuarray/array.h",
            "gpuarray/ext_cuda.h",
            "gpuarray_helper.h",
            "magma.h",
        ]
        return headers

    def c_header_dirs(self, **kwargs):
        include_dirs = [
            gpuarray_helper_inc_dir(),
            pygpu.get_include(),
            config.cuda__include_path,
        ]
        # The magma include path is optional in the configuration.
        magma_inc = config.magma__include_path
        if magma_inc:
            include_dirs.append(magma_inc)
        return include_dirs

    def c_libraries(self, **kwargs):
        return ["magma"]

    def c_lib_dirs(self, **kwargs):
        magma_lib = config.magma__library_path
        return [magma_lib] if magma_lib else []

    def prepare_node(self, node, storage_map, compute_map, impl):
        from skcuda.magma import magma_init

        # Initialize magma once per context; the flag is cached on the
        # context object itself.
        ctx = node.inputs[0].type.context
        if getattr(ctx, "is_magma_initialized", False):
            return
        with ctx:
            magma_init()
            ctx.is_magma_initialized = True
class GpuMagmaSVD(GpuMagmaBase):
    """Computes the svd of a matrix :math:`A` using magma library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`aesara.gpuarray.linalg.gpu_svd`
        to get them in expected order ``U, S, VT``.

    """

    __props__ = ("full_matrices", "compute_uv")
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(self, ["c_code/magma_svd.c"], "APPLY_SPECIFIC(magma_svd)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return Apply(
                self,
                [A],
                # return S, U, VT
                [
                    GpuArrayType(
                        A.dtype, broadcastable=[False], context_name=ctx_name
                    )(),
                    A.type(),
                    A.type(),
                ],
            )
        else:
            return Apply(
                self,
                [A],
                # return only S
                [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()],
            )

    def prepare_node(self, node, storage_map, compute_map, impl):
        super().prepare_node(node, storage_map, compute_map, impl)
        # Check node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix)
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, (
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) "
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        # S always has length K = min(M, N); U and VT shapes depend on
        # whether the full or reduced factorization was requested.
        (x_shape,) = shapes
        M, N = x_shape
        K = tm.minimum(M, N)
        s_shape = (K,)
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]
def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    The underlying :class:`GpuMagmaSVD` op emits its outputs in ``S, U, VT``
    order; this wrapper reorders them to the conventional ``U, S, VT``.

    Parameters
    ----------
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, V, D : matrices

    """
    result = GpuMagmaSVD(full_matrices, compute_uv)(a)
    if not compute_uv:
        return result
    S, U, VT = result
    return [U, S, VT]
class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using magma library."""

    __props__ = ("inplace",)
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(self, ["c_code/magma_inv.c"], "APPLY_SPECIFIC(magma_inv)")
        self.inplace = inplace
        if self.inplace:
            # The inversion overwrites the input buffer in place.
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        # Hook for the inplace-optimization pass.
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        # The inverse has the same shape as the input.
        return shapes
def gpu_matrix_inverse(a):
    """Compute the inverse of matrix `a` on the GPU via the magma library.

    Returns
    -------
    a_inv: matrix
    """
    op = GpuMagmaMatrixInverse()
    return op(a)
class GpuMagmaCholesky(GpuMagmaBase, CGpuKernelBase):
    """Computes the cholesky decomposition of a matrix :math:`A` using magma
    library.
    """

    __props__ = ("lower", "inplace")
    check_input = False
    params_type = ParamsType(lower=bool_t, inplace=bool_t, context=gpu_context_type)

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(
            self, ["c_code/magma_cholesky.c"], "APPLY_SPECIFIC(magma_cholesky)"
        )
        self.inplace = inplace
        if self.inplace:
            # The factorization overwrites the input buffer in place.
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        # Hook for the inplace-optimization pass.
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        # The factor has the same shape as the input.
        return [shapes[0]]
class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """Computes the qr decomposition of a matrix :math:`A` using magma
    library.

    Parameters
    ----------
    complete : If False, returns only ``R``.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``R, Q``. Use :func:`aesara.gpuarray.linalg.gpu_qr`
        to get them in expected order ``Q, R``.

    """

    __props__ = ("complete",)
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(self, ["c_code/magma_qr.c"], "APPLY_SPECIFIC(magma_qr)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.complete:
            return Apply(
                self,
                [A],
                # return R, Q
                [A.type(), A.type()],
            )
        else:
            return Apply(
                self,
                [A],
                # return R
                [A.type()],
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
def gpu_qr(a, complete=True):
    """
    This function performs the QR on GPU.

    The underlying :class:`GpuMagmaQR` op emits its outputs in ``R, Q``
    order; this wrapper reorders them to the conventional ``Q, R``.

    Parameters
    ----------
    complete : bool, optional
        If `False`, returns only r.

    Returns
    -------
    Q, R : matrices

    """
    result = GpuMagmaQR(complete)(a)
    if not complete:
        return result
    R, Q = result
    return [Q, R]
class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigen decomposition of a symmetric matrix :math:`A` using magma
    library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
           part of matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True`, computes eigenvalues and eigenvectors (`True`,
                default). If `False`, computes only eigenvalues of matrix.
    """

    __props__ = ("lower", "compute_v")
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t, compute_v=bool_t, context=gpu_context_type)

    def __init__(self, UPLO="L", compute_v=True):
        assert UPLO in ("L", "U")
        # Stored as a boolean so it can be passed through params_type.
        self.lower = UPLO == "L"
        self.compute_v = compute_v
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(
            self, ["c_code/magma_eigh.c"], "APPLY_SPECIFIC(magma_eigh)"
        )

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return Apply(
                self,
                [A],
                # return D, V
                [
                    GpuArrayType(
                        A.dtype, broadcastable=[False], context_name=ctx_name
                    )(),
                    A.type(),
                ],
            )
        else:
            return Apply(
                self,
                [A],
                # return D
                [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()],
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
# TODO test dtype != float32
import warnings
try:
import pygpu
except ImportError:
pass
import aesara
import aesara.sandbox.multinomial
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.elemwise import GpuDimShuffle
from aesara.gpuarray.fp16_help import load_w, work_dtype, write_w
from aesara.gpuarray.opt import op_lifter, register_opt, register_opt2
from aesara.gpuarray.type import GpuArrayType
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.scalar import as_scalar
from aesara.tensor.basic import get_scalar_constant_value
from aesara.tensor.exceptions import NotScalarConstantError
class GPUAMultinomialFromUniform(GpuKernelBaseCOp, _NoPythonOp):
    """Draw one multinomial (one-hot) sample per row of `pvals` on the GPU.

    Each row of `pvals` is one categorical distribution; `unis` supplies one
    uniform sample per row.  The result is written transposed
    (``pvals.shape[::-1]``) for write speed — see the kernel comment below.
    """

    __props__ = ("odtype",)
    _f16_ok = True

    def __init__(self, odtype):
        # NOTE(review): `self` is passed as an argument to the parent
        # initializer here, which looks unusual — confirm the base class
        # accepts it before changing.
        super().__init__(self)
        self.odtype = odtype

    def get_params(self, node):
        # The kernel runs in the GPU context of the output.
        return node.outputs[0].type.context

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def make_node(self, pvals, unis):
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
        valid_dtypes = ("float32", "float16", "float64")
        assert pvals.dtype in valid_dtypes
        assert unis.dtype in valid_dtypes
        if pvals.ndim != 2:
            raise NotImplementedError("pvals ndim should be 2", pvals.ndim)
        if unis.ndim != 1:
            raise NotImplementedError("unis ndim should be 1", unis.ndim)
        if self.odtype == "auto":
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        # The output is the transpose of pvals' shape, hence the flipped
        # broadcastable pattern.
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = GpuArrayType(broadcastable=br, dtype=odtype, context_name=ctx_name)()
        return Apply(self, [pvals, unis], [out])

    def gpu_kernels(self, node, name):
        # Resolve the C type names and fp16 load/store wrappers for the
        # actual input/output dtypes of this node.
        out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype)
        pvals_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
        unis_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[1].dtype)
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
        code = """#include "cluda.h"

KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
    GLOBAL_MEM %(pvals_ctype)s *global_pvals,
    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
    GLOBAL_MEM %(unis_ctype)s *global_unis,
    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM %(out_ctype)s *global_outs,
    const ga_size global_outs_offset,
    const ga_ssize outs_row_stride,
    const ga_ssize outs_col_stride
)
{
    global_pvals = (GLOBAL_MEM %(pvals_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
    global_unis = (GLOBAL_MEM %(unis_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
    global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);
    // each thread takes care of one multinomial draw
    int n = LDIM_0*GID_0 + LID_0;
    if (n < nb_multi)
    {
        %(work_ctype)s cummul = 0.;
        bool done = false;
        const %(work_ctype)s unis_n = %(load_in_ctype)s(global_unis[n*unis_stride]);
        for (ga_size m = 0; m < nb_outcomes; ++m)
        {
            %(work_ctype)s current_out = 0;
            if (!done)
            {
                cummul += %(load_in_ctype)s(global_pvals[m * pvals_col_stride + n * pvals_row_stride]);
                if (unis_n < cummul)
                {
                    current_out = 1;
                    done = true;
                }
            }
            //write out transposed for speed.
            global_outs[n * outs_col_stride +
                        m * outs_row_stride] = %(write_out_ctype)s(current_out);
        }
    }
}
""" % dict(
            out_ctype=out_ctype,
            write_out_ctype=write_out_ctype,
            work_ctype=work_ctype,
            pvals_ctype=pvals_ctype,
            unis_ctype=unis_ctype,
            load_in_ctype=load_in_ctype,
        )
        return [
            Kernel(
                code=code,
                name="k_multi_warp_multinomial",
                params=[
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(node.outputs[0].dtype),
                objvar="k_multi_warp_multinomial_" + name,
            )
        ]

    def c_code(self, node, name, inp, outputs, sub):
        pvals, unis = inp
        (out,) = outputs
        fail = sub["fail"]
        ctx = sub["params"]
        kname = self.gpu_kernels(node, name)[0].objvar
        out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        pvals_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        unis_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
        # NOTE(review): inside the C below, "2<<15 - 1" parses as
        # 2 << 14 == 32768 because of C operator precedence; (2<<15)-1 == 65535
        # may have been intended.  Behavior is conservative either way —
        # confirm before changing, as it alters the launch configuration.
        s = (
            """
        PyGpuArrayObject * pvals = %(pvals)s;
        PyGpuArrayObject * unis = %(unis)s;
        PyGpuArrayObject * out = %(out)s;
        size_t dims[2];
        if (PyGpuArray_NDIM(pvals) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s
        }
        if (PyGpuArray_NDIM(unis) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s
        }
        if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s
        }
        dims[0] = PyGpuArray_DIMS(pvals)[1];
        dims[1] = PyGpuArray_DIMS(pvals)[0];
        if (aesara_prep_output(&out, 2, dims, %(out_typecode)s,
                               GA_C_ORDER, %(ctx)s) != 0){
            %(fail)s
        }
        %(out)s = out;
        GpuArray_memset(&(out->ga), 0);
        { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        size_t nb_blocks = max_nb_blocks + 1;
        size_t nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);

        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i",
        //         nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);

        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(
                PyExc_ValueError,
                "Multinomial is not implemented for so many rows in the matrix (%%i)",
                nb_multi);
            %(fail)s
        }
        assert(nb_blocks*nb_threads >= nb_multi);

        int err = k_multi_warp_multinomial_call(
            1, &nb_blocks, &nb_threads, 0,
            PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset,
            PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(pvals_typecode)s),
            PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(pvals_typecode)s),
            unis->ga.data, unis->ga.offset,
            PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(unis_typecode)s), out->ga.data,
            out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
            PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
        if (err != GA_NO_ERROR) {
            PyErr_Format(
                PyExc_RuntimeError,
                "gpuarray error: %%s: %%s.\\n",
                "k_multi_warp_%(name)s",
                GpuKernel_error(&%(kname)s, err));
            %(fail)s;
        }
        } // END NESTED SCOPE
        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        return (7,)
class GPUAChoiceFromUniform(GpuKernelBaseCOp, _NoPythonOp):
    """
    The output is transposed compared to MultinomialWOReplacementFromUniform.
    We must insert a Transpose op after it.
    The optimization that moves it to the gpu does it.
    """

    # odtype: requested output dtype ("auto" resolves to int64 in make_node);
    # replace: whether sampling is done with replacement.
    __props__ = ("odtype", "replace")

    def __init__(self, odtype, replace=False):
        # NOTE(review): `self` is passed as a positional argument to the
        # parent __init__ here — looks suspicious; confirm against
        # GpuKernelBaseCOp's constructor signature.
        super().__init__(self)
        self.odtype = odtype
        self.replace = replace

    def __setstate__(self, state):
        # Backward-compatible unpickling: pickles made before the `replace`
        # attribute existed default to replace=False.
        self.__dict__.update(state)
        if "replace" not in state:
            self.replace = False

    def get_params(self, node):
        # The C code receives the GPU context of the output array.
        return node.outputs[0].type.context

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def make_node(self, pvals, unis, n):
        """Build the Apply node.

        Parameters
        ----------
        pvals
            2-D float32 matrix of per-row outcome probabilities.
        unis
            1-D float32 vector of uniform draws (length checked at runtime
            in c_code to be pvals.shape[0] * n).
        n
            Scalar number of samples to draw per row.
        """
        assert pvals.dtype == "float32"
        assert unis.dtype == "float32"
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
        if pvals.ndim != 2:
            raise NotImplementedError("pvals ndim should be 2", pvals.ndim)
        if unis.ndim != 1:
            raise NotImplementedError("unis ndim should be 1", unis.ndim)
        if self.odtype == "auto":
            odtype = "int64"
        else:
            odtype = self.odtype
        # Only int64 output is implemented (the kernel writes ga_long).
        assert odtype == "int64", odtype
        # Output broadcast pattern is transposed relative to pvals
        # (see class docstring: the kernel writes out transposed).
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = GpuArrayType(broadcastable=br, dtype=odtype, context_name=ctx_name)()
        return Apply(self, [pvals, unis, as_scalar(n)], [out])

    def gpu_kernels(self, node, name):
        # `replace` is baked into the kernel source as a 0/1 constant; when 0,
        # each drawn outcome's probability is zeroed in the row copy and the
        # running pvals_sum is reduced (sampling without replacement).
        replace = int(self.replace)
        code = """#include "cluda.h"

KERNEL void k_multi_warp_multinomial_wor(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
    const ga_size n_samples,
    GLOBAL_MEM float * global_pvals_copy,
    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
    GLOBAL_MEM float * global_unis,
    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM ga_long * global_outs,
    const ga_size global_outs_offset,
    const ga_ssize outs_row_stride,
    const ga_ssize outs_col_stride
)
{
    global_pvals_copy = (GLOBAL_MEM float *)(((GLOBAL_MEM char *)global_pvals_copy) + global_pvals_offset);
    global_unis = (GLOBAL_MEM float *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
    global_outs = (GLOBAL_MEM ga_long *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);

    // each thread takes care of one multinomial-wor n_samples-draw
    int n = LDIM_0*GID_0 + LID_0;
    if (n < nb_multi)
    {
        // Sum of the remaining p_vals in global_pvals_copy[n]
        float pvals_sum = 1.;
        for (int c = 0; c < n_samples; ++c)
        {
            float cummul = 0.;
            const float unis_n = global_unis[(c * nb_multi + n)*unis_stride] * pvals_sum;
            for (ga_size m = 0; m < nb_outcomes; ++m)
            {
                float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride];
                cummul += pvals_nm;
                if (unis_n < cummul)
                {
                    // write out transposed for speed.
                    global_outs[n * outs_col_stride +
                                c * outs_row_stride] = m;
                    if (! %(replace)s )
                    {
                        global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0;
                        pvals_sum -= pvals_nm;
                    }
                    break;
                }
            }
        }
    }
}
""" % {
            "replace": replace
        }
        return [
            Kernel(
                code=code,
                name="k_multi_warp_multinomial_wor",
                params=[
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(node.outputs[0].dtype),
                objvar="k_multi_warp_multinomial_wor_" + name,
            )
        ]

    def c_code(self, node, name, inp, outputs, sub):
        # Validates shapes, copies pvals when sampling without replacement
        # (the kernel mutates its pvals argument), allocates the transposed
        # (n_samples, nb_rows) int64 output, then launches one thread per row.
        # NOTE(review): `2<<15 - 1` below parses as `2 << 14` in C because
        # `-` binds tighter than `<<`; presumably `(2<<15) - 1` was intended
        # — confirm before reuse (the existing TODO hints at this).
        pvals, unis, n = inp
        (out,) = outputs
        replace = int(self.replace)
        fail = sub["fail"]
        ctx = sub["params"]
        kname = self.gpu_kernels(node, name)[0].objvar
        s = (
            """
        PyGpuArrayObject * pvals = %(pvals)s;
        PyGpuArrayObject * unis = %(unis)s;
        const size_t n_samples = %(n)s;
        PyGpuArrayObject * out = %(out)s;

        // create a copy of pvals matrix
        PyGpuArrayObject * pvals_copy = NULL;
        size_t dims[2];

        if (PyGpuArray_NDIM(pvals) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s
        }
        if (PyGpuArray_NDIM(unis) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s
        }
        if ( n_samples > (PyGpuArray_DIMS(pvals)[1]) )
        {
            PyErr_Format(PyExc_ValueError, "Cannot sample without replacement n samples bigger than the size of the distribution.");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0] * n_samples)
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n");
            %(fail)s
        }
        if (! %(replace)s) {
            pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
        } else {
            pvals_copy = pvals;
            Py_INCREF(pvals_copy);
        }
        dims[0] = n_samples;
        dims[1] = PyGpuArray_DIMS(pvals)[0];
        if (aesara_prep_output(&out, 2, dims, GA_LONG,
                               GA_C_ORDER, %(ctx)s) != 0){
            Py_DECREF(pvals_copy);
            %(fail)s
        }
        %(out)s = out;
        { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        size_t nb_blocks = max_nb_blocks + 1;
        size_t nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);

        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(
                PyExc_ValueError,
                "Multinomial is not implemented for so many rows in the matrix (%%i)",
                nb_multi);
            Py_DECREF(pvals_copy);
            %(fail)s
        }
        assert(nb_blocks*nb_threads >= nb_multi);

        int err = k_multi_warp_multinomial_wor_call(1, &nb_blocks, &nb_threads, 0, PyGpuArray_DIMS(pvals)[0], PyGpuArray_DIMS(pvals)[1], n_samples, pvals_copy->ga.data, pvals_copy->ga.offset, PyGpuArray_STRIDES(pvals)[0]/sizeof(float), PyGpuArray_STRIDES(pvals)[1]/sizeof(float), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/sizeof(float), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/8, PyGpuArray_STRIDES(out)[1]/8);
        if (err != GA_NO_ERROR) {
            PyErr_Format(
                PyExc_RuntimeError,
                "gpuarray error: %%s: %%s.\\n",
                "k_multi_warp_%(name)s",
                GpuKernel_error(&%(kname)s, err));
            Py_DECREF(pvals_copy);
            %(fail)s;
        }

        Py_DECREF(pvals_copy);
        } // END NESTED SCOPE
        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        # Bump to invalidate cached compiled modules when the C above changes.
        return (10,)
@register_opt("fast_compile")
@op_lifter([aesara.sandbox.multinomial.MultinomialFromUniform])
@register_opt2([aesara.sandbox.multinomial.MultinomialFromUniform], "fast_compile")
def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function
if len(inputs) == 2:
p, u = inputs
n_samples = 1
else:
p, u, n_samples = inputs
try:
if get_scalar_constant_value(n_samples) != 1:
return None
except NotScalarConstantError:
return None
(m,) = outputs
gpu_op = GPUAMultinomialFromUniform(op.odtype)
return GpuDimShuffle([False, False], [1, 0])(gpu_op(p, u))
@register_opt("fast_compile")
@op_lifter([aesara.sandbox.multinomial.ChoiceFromUniform])
@register_opt2([aesara.sandbox.multinomial.ChoiceFromUniform], "fast_compile")
def local_gpua_multinomial_wor(op, context_name, inputs, outputs):
# TODO : need description for function
p, u, n = inputs
(m,) = outputs
if (p.dtype == u.dtype == "float32") and (m.dtype == "int64"):
gpu_op = GPUAChoiceFromUniform(**op._props_dict())
return GpuDimShuffle([False, False], [1, 0])(gpu_op(p, u, n))
class GPUAMultinomialWOReplacementFromUniform(GPUAChoiceFromUniform):
    # Deprecated alias kept for backward compatibility: behaves exactly like
    # GPUAChoiceFromUniform but emits a DeprecationWarning at construction
    # (stacklevel=2 points the warning at the caller's line).
    def __init__(self, *args, **kwargs):
        warnings.warn(
            "GPUAMultinomialWOReplacementFromUniform is deprecated, "
            "use GPUAChoiceFromUniform instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)
import aesara.tensor as at
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.link.c.params_type import ParamsType
from aesara.tensor.nnet.neighbours import Images2Neibs
from aesara.tensor.type import integer_dtypes
try:
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
infer_context_name,
)
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBaseCOp, Images2Neibs, _NoPythonOp):
    """
    Images2Neibs for the GPU.
    """

    # Runtime parameters passed to the C code: the border-mode enum value
    # and the GPU context the kernels are launched in.
    params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type)

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, ten4, neib_shape, neib_step=None):
        """Build the Apply node.

        Parameters
        ----------
        ten4
            4-D input tensor (moved to the GPU).
        neib_shape
            Length-2 integer vector: patch height and width.
        neib_step
            Length-2 integer vector: step between patches; defaults to
            ``neib_shape`` (non-overlapping patches).
        """
        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
        neib_shape = at.as_tensor_variable(neib_shape)
        if neib_step is None:
            neib_step = neib_shape
        else:
            neib_step = at.as_tensor_variable(neib_step)

        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        assert neib_shape.dtype in integer_dtypes
        assert neib_step.dtype in integer_dtypes

        # Output is a 2-D matrix: one row per extracted patch.
        return Apply(
            self,
            [ten4, neib_shape, neib_step],
            [
                GpuArrayType(
                    broadcastable=(False, False),
                    dtype=ten4.type.dtype,
                    context_name=ten4.type.context_name,
                )()
            ],
        )

    def c_code_cache_version(self):
        return (14,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]

    def gpu_kernels(self, node, nodename):
        # Two kernel variants are generated:
        #   - k_multi_warp_less: one thread per output element within a patch
        #     (usable only when the thread block exactly covers the patch);
        #   - k_multi_warp: general version that loops over the patch.
        # c_code picks between them at launch time.
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
        mode_constants = self.BORDER_MODE.c_support_code()
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """#include "cluda.h"

        // a version that uses less registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                ga_int z_row = b + grid_d*(a + grid_c*
                                           (s + nb_stack*n));
                ga_int i = LID_1;     // loop over c
                {
                    ga_int ten4_2 = i + a * step_x;
                    if(mode == MODE_WRAP_CENTERED) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    } else if (mode == MODE_HALF) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                    } else if (mode == MODE_FULL) {
                        ten4_2 -= c - 1;
                    }
                    ga_int j = LID_0;  // loop over d
                    {
                        ga_int ten4_3 = j + b * step_y;
                        if(mode == MODE_WRAP_CENTERED){
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        } else if (mode == MODE_HALF) {
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                        } else if (mode == MODE_FULL) {
                            ten4_3 -= d - 1;
                        }

                        ga_int z_col = j + d * i;
                        ga_int z_idx = z_col * out_s1 +
                                       z_row * out_s0;
                        if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                            global_out[z_idx] = 0;
                        } else {
                            ga_int ten4_idx = stride3*ten4_3 +
                                              stride2*ten4_2 +
                                              stride1*s + stride0*n;
                            global_out[z_idx] = global_ten4[ten4_idx];
                        }
                    }
                }
            }
        }""" % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """#include "cluda.h"

        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                ga_int z_row = b + grid_d*(a + grid_c*
                                           (s + nb_stack*n));
                // loop over c
                for (ga_int i = LID_1; i < c; i+=LDIM_1)
                {
                    ga_int ten4_2 = i + a * step_x;
                    if(mode == MODE_WRAP_CENTERED) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    } else if (mode == MODE_HALF) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                    } else if (mode == MODE_FULL) {
                        ten4_2 -= c - 1;
                    }
                    // loop over d
                    for (ga_int j = LID_0; j < d; j+=LDIM_0)
                    {
                        ga_int ten4_3 = j + b * step_y;
                        if(mode == MODE_WRAP_CENTERED) {
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        } else if (mode == MODE_HALF) {
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                        } else if (mode == MODE_FULL) {
                            ten4_3 -= d - 1;
                        }

                        ga_int z_col = j + d * i;
                        ga_int z_idx = z_col * out_s1 +
                                       z_row * out_s0;
                        if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                            global_out[z_idx] = 0;
                        } else {
                            ga_int ten4_idx = stride3*ten4_3 +
                                              stride2*ten4_2 +
                                              stride1*s + stride0*n;
                            global_out[z_idx] = global_ten4[ten4_idx];
                        }
                    }
                }
            }
        }
        """ % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        return kernels

    def c_support_code(self, **kwargs):
        # Integer ceiling division helper used when computing the patch grid.
        return """
        template <typename T>
        static T ceil_intdiv(T a, T b)
        {
            return (a/b) + ((a % b) ? 1: 0);
        }
        """

    def c_code(self, node, name, inp, out, sub):
        # Generated C: validate inputs, compute the patch grid (grid_c,
        # grid_d) per border mode, allocate the 2-D output, choose between
        # the two kernel variants, and launch.
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: *fptr: %%s.",
                             GpuKernel_error(fptr, err));
                %(fail)s;
            }
        """ % dict(
            fail=sub["fail"]
        )
        # NB: To reduce C code variability:
        # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
        # For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
        # For typecode_z, I use ten4->ga.typecode (for same reason as above)
        return """
        int grid_c = -1;
        int grid_d = -1;
        size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
        size_t itemsize_z = itemsize_ten4;
        int typecode_z = %(ten4)s->ga.typecode;

        {
            if (PyGpuArray_NDIM(%(ten4)s) != 4)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: pvals wrong rank");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_shape)s) != 1)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: unis wrong rank");
                %(fail)s;
            }

            if (PyArray_DIMS(%(neib_shape)s)[0] != 2)
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuImages2Neibs: neib_shape has to contain two"
                             " elements");
                %(fail)s;
            }

            const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            if (step_x <=0 || step_y <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_step wrong step ; values <= 0. Got %%lld %%lld.",
                             (long long) step_x, (long long) step_y);
                %(fail)s;
            }

            if (c <=0 || d <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_shape values <= 0. Got %%lld %%lld.",
                             (long long)c, (long long)d);
                %(fail)s;
            }

            if (%(params)s->mode == MODE_WRAP_CENTERED) {
                if (c%%2!=1 || d%%2!=1){
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
                    %(fail)s;
                }
                if ( PyGpuArray_DIMS(%(ten4)s)[2] < c ||
                     PyGpuArray_DIMS(%(ten4)s)[3] < d)
                {
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in wrap_centered mode,"
                                 " don't support image shapes smaller then"
                                 " the patch shapes: neib_shape=(%%d,%%d),"
                                 " ten4[2:]=[%%d,%%d]",
                                 c, d, PyGpuArray_DIMS(%(ten4)s)[2],
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
                                     (size_t)step_x);
                grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
                                     (size_t)step_y);

            } else if (%(params)s->mode == MODE_VALID) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_HALF) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
            } else if (%(params)s->mode == MODE_FULL) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%ld, neib_step[0]=%%ld and"
                                 " ten4.shape[2]=%%ld not consistent",
                                 (long int)c, (long int)step_x,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[2]));
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ( (((PyGpuArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%ld, neib_step[1]=%%ld and"
                                 " ten4.shape[3]=%%ld not consistent",
                                 (long int)d, (long int)step_y,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
            } else {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
                %(fail)s;
            }

            // new dimensions for z
            const int z_dim1 = c * d;
            const int z_dim0 =  grid_c
                                * grid_d
                                * PyGpuArray_DIMS(%(ten4)s)[1]
                                * PyGpuArray_DIMS(%(ten4)s)[0];

            if ((NULL == %(z)s)
                || (PyGpuArray_DIMS(%(z)s)[0] != z_dim0)
                || (PyGpuArray_DIMS(%(z)s)[1] != z_dim1))
            {
                Py_XDECREF(%(z)s);
                size_t dims[2];
                dims[0] = z_dim0;
                dims[1] = z_dim1;
                %(z)s = pygpu_empty(2, dims, typecode_z,
                                    GA_C_ORDER, %(params)s->context, Py_None);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
                                    " failed to alloc z output");
                    %(fail)s;
                }
            }
        }

        { // NESTED SCOPE

        const int mode = %(params)s->mode;
        const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
        const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
        const int height = PyGpuArray_DIMS(%(ten4)s)[2];
        const int width = PyGpuArray_DIMS(%(ten4)s)[3];

        const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
        const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
        const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
        const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

        size_t threads_per_block[3] = {d, c, 1};
        //get the max threads per blocks
        size_t max_threads_dim;
        int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
        if (err != GA_NO_ERROR){
            PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
            %(fail)s;
        }
        while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--;
        while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--;

        //Make bigger block to have better memory access pattern and
        //a higher core utilisation. for smaller patch size

        while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
              threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
            threads_per_block[2]++;
        }
        int nb_block;
        if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
            nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
        else
            nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
        size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};

        GpuKernel *fptr;
        if(threads_per_block[0]==d && threads_per_block[1]==c){
            fptr = &k_multi_warp_less_%(name)s;
        }else{
            fptr = &k_multi_warp_%(name)s;
        }
        /*
        printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
               max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
               n_blocks[0], n_blocks[1], n_blocks[2]);
        */
        size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
        size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
        size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
        size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
        size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
        size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
        void *kernel_params[] = {(void *)&mode,
                                 (void *)&nb_batch,
                                 (void *)&nb_stack,
                                 (void *)&height, (void *)&width,
                                 (void *)&c, (void *)&d,
                                 (void *)&step_x, (void *)&step_y,
                                 (void *)&grid_c, (void *)&grid_d,
                                 (void *)&stride_A0,
                                 (void *)&stride_A1,
                                 (void *)&stride_A2,
                                 (void *)&stride_A3,
                                 (void *)%(ten4)s->ga.data,
                                 (void *)&%(ten4)s->ga.offset,
                                 (void *)&stride_Z0,
                                 (void *)&stride_Z1,
                                 (void *)%(z)s->ga.data,
                                 (void *)&%(z)s->ga.offset};
        err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
        %(err_check)s
        } // END NESTED SCOPE
        """ % dict(
            ten4=inp[0],
            neib_shape=inp[1],
            neib_step=inp[2],
            z=out[0],
            dtype_neib_shape=node.inputs[1].dtype,
            dtype_neib_step=node.inputs[2].dtype,
            err_check=err_check,
            name=name,
            params=sub["params"],
            fail=sub["fail"],
        )
from io import StringIO
import numpy as np
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.fp16_help import load_w, work_dtype, write_w
from aesara.gpuarray.type import GpuArrayType
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBaseCOp, _NoPythonOp):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """

    # Three inputs (x, b, y_idx) and three outputs (nll, sm, am).
    nin = 3
    nout = 3
    __props__ = ()
    # Kernel supports float16 inputs via the fp16_help load/write wrappers.
    _f16_ok = True

    def make_node(self, x, b, y_idx):
        """Build the Apply node.

        Parameters
        ----------
        x
            2-D activations (one row per example).
        b
            1-D bias added to each row.
        y_idx
            1-D vector of target class indices.

        Returns an Apply with outputs: nll (per-row negative log-likelihood,
        x's dtype), sm (row-wise softmax, same type as x), am (per-row argmax,
        same type as y_idx).
        """
        ctx_name = infer_context_name(x, b, y_idx)
        x = as_gpuarray_variable(x, ctx_name)
        b = as_gpuarray_variable(b, ctx_name)
        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        nll = GpuArrayType(
            x.type.dtype, y_idx.type.broadcastable, context_name=ctx_name
        )()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>", "gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def gpu_kernels(self, node, nodename):
        # One kernel: each block handles one (or more, grid-strided) rows;
        # threads cooperate through a dynamically-sized shared buffer
        # (per_thread_values) to reduce the row max/argmax and the softmax sum.
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        # `f` selects the float suffix for libm calls (fmaxf/expf/logf).
        # NOTE(review): presumably non-CUDA (OpenCL) backends take the
        # unsuffixed path — confirm against cluda conventions.
        if node.inputs[0].type.context.kind != b"cuda":
            f = ""
        else:
            f = "" if dtype_x == "float64" else "f"
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
        ]
        sio = StringIO()
        print(
            """#include "cluda.h"

        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
            GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
            GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
            GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
            GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
        {
          x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
          b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
          y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
          nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
          sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
          am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);

          for (ga_int row = GID_0; row < M; row += GDIM_0){

            GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
            GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
            GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM ga_int row_max_threadIdx;
            %(work_x)s per_thread_row_max, per_thread_sum;
            ga_int per_thread_row_max_j;

            // COMPUTE ROW MAX AND ARGMAX

            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;

            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[LID_0] = per_thread_row_max;
            local_barrier();

            if (LID_0 == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (ga_int j = 0; j < LDIM_0; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }
            local_barrier();
            // The thread with the highest max writes out which of its
            // values was the winner.
            if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;

            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }
            per_thread_values[LID_0] = per_thread_sum;
            local_barrier();

            if (LID_0 == 0) {
              sum = 0.0;
              for (ga_int j = 0; j < LDIM_0; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }
            local_barrier();

            for (ga_int j = LID_0; j < N; j += LDIM_0) {
              sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
            }

            if (LID_0 == 0) {
              const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N || y_idx < 0)) {
                // raise some suspicion.
                nll_data[row * nlls0] = %(write_x)s(0.0);
              } else {
                nll_data[row * nlls0] = %(write_x)s(
                   - %(load_x)s(x[y_idx * xs1])
                   - %(load_b)s(b[y_idx * bs0])
                   + row_max + log%(f)s(sum));
              }
            }
          }
        }
        """
            % locals(),
            file=sio,
        )
        return [
            Kernel(
                code=sio.getvalue(),
                name=kname,
                params=params,
                flags=flags,
                objvar=k_var,
            )
        ]

    def c_code(self, node, nodename, inp, out, sub):
        # Validate input shapes, prepare the three outputs, and launch the
        # kernel with one block per row (capped) and a shared-memory buffer
        # of one work-dtype slot per thread.
        itemsize_x = np.dtype(node.inputs[0].dtype).itemsize
        worksize_x = np.dtype(work_dtype(node.inputs[0].dtype)).itemsize
        itemsize_b = np.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = np.dtype(node.inputs[2].dtype).itemsize
        itemsize_nll = np.dtype(node.outputs[0].dtype).itemsize
        itemsize_sm = np.dtype(node.outputs[1].dtype).itemsize
        itemsize_am = np.dtype(node.outputs[2].dtype).itemsize
        x, b, y_idx = inp
        nll, sm, am = out
        fail = sub["fail"]
        ctx = sub["params"]
        k_var = f"k_xent_sm_1hot_bias_{nodename}"
        err_check = (
            """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
            """
            % locals()
        )
        sio = StringIO()
        print(
            """
        if (PyGpuArray_DIMS(%(x)s)[0] !=
            PyGpuArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(%(x)s)[1] != PyGpuArray_DIMS(%(b)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if (aesara_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
        if (aesara_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
        if (aesara_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
        {
            size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
            size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
            size_t n_shared = n_threads * %(worksize_x)s;
     //TODO: launch more threads per row and do parallel sum and max reductions
            int err = k_xent_sm_1hot_bias_call(
                1, &n_blocks, &n_threads, n_shared,
                PyGpuArray_DIMS(%(x)s)[0],
                PyGpuArray_DIMS(%(x)s)[1],
                %(x)s->ga.data, %(x)s->ga.offset,
                PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
                PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
                %(b)s->ga.data, %(b)s->ga.offset,
                PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
                %(y_idx)s->ga.data, %(y_idx)s->ga.offset,
                PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
                %(nll)s->ga.data, %(nll)s->ga.offset,
                PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
                %(sm)s->ga.data, %(sm)s->ga.offset,
                PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
                PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
                %(am)s->ga.data, %(am)s->ga.offset,
                PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
            %(err_check)s
        }
        """
            % locals(),
            file=sio,
        )
        return sio.getvalue()

    def c_code_cache_version(self):
        # Bump to invalidate cached compiled modules when the C above changes.
        return (14,)
# Module-level singleton instance of the Op; presumably referenced by the
# GPU graph rewrites elsewhere (mirrors the CPU-side lowercase alias) —
# confirm against the callers.
gpu_crossentropy_softmax_argmax_1hot_with_bias = (
    GpuCrossentropySoftmaxArgmax1HotWithBias()
)
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBaseCOp, _NoPythonOp):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
Gradient wrt x of the CrossentropySoftmax1Hot Op.
"""
nin = 3
nout = 1
__props__ = ()
_f16_ok = True
    def make_node(self, dnll, sm, y_idx):
        """Build the Apply node.

        Parameters
        ----------
        dnll
            Gradient of the cost w.r.t. the per-row NLL (scalar or vector;
            the generated C handles both — see c_code's dims0/strides0 logic).
        sm
            Softmax output of the forward pass.
        y_idx
            Target class indices.

        The single output (dx) has the same type as ``sm``.
        """
        # Move all three inputs into one common GPU context.
        ctx_name = infer_context_name(dnll, sm, y_idx)
        dnll = as_gpuarray_variable(dnll, ctx_name)
        sm = as_gpuarray_variable(sm, ctx_name)
        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        return Apply(self, [dnll, sm, y_idx], [sm.type()])
    def c_code_cache_version(self):
        # Bump to invalidate cached compiled modules when the generated C
        # for this Op changes.
        return (14,)
    def c_headers(self, **kwargs):
        # Headers needed by the generated C (numpy compatibility shims and
        # the libgpuarray type definitions).
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]
def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = np.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = np.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = np.dtype(node.inputs[2].dtype).itemsize
itemsize_dx = np.dtype(node.outputs[0].dtype).itemsize
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(np.intp)
dnll, sm, y_idx = inp
(dx,) = out
fail = sub["fail"]
ctx = sub["params"]
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = (
"""
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
"""
% locals()
)
return (
"""
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element.
const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2)
|| (PyGpuArray_NDIM(%(y_idx)s) != 1))
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if (%(dnll)s_dims0 !=
PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
{
PyErr_Format(PyExc_ValueError,
"dnll.shape[0] == %%i, but sm.shape[0] == %%i",
%(dnll)s_dims0,
PyGpuArray_DIMS(%(sm)s)[0]);
%(fail)s;
}
if (%(dnll)s_dims0 !=
PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
{
PyErr_SetString(PyExc_ValueError,
"dnll.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if (PyGpuArray_DIMS(%(sm)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"sm.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if ((NULL == %(dx)s)
|| (PyGpuArray_DIMS(%(dx)s)[0] !=
PyGpuArray_DIMS(%(sm)s)[0])
|| (PyGpuArray_DIMS(%(dx)s)[1] !=
PyGpuArray_DIMS(%(sm)s)[1]))
{
Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode_dx)s, GA_C_ORDER,
%(ctx)s, Py_None);
if (!%(dx)s) {
%(fail)s
}
}
{
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(dx)s)[0],
(void *)&PyGpuArray_DIMS(%(dx)s)[1],
(void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
(void *)&stride_DNLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
(void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s
}
assert(%(dx)s);
"""
% locals()
)
def gpu_kernels(self, node, nodename):
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
work_dnll = work_dtype(dtype_dnll)
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
wtype_dnll = gpuarray.dtype_to_ctype(work_dnll)
type_dnll = gpuarray.dtype_to_ctype(dtype_dnll)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
kname = "kCrossEntropySoftmax1HotWithBiasDx"
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
params = [
gpuarray.SIZE,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
]
sio = StringIO()
print(
"""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1,
GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0,
GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1)
{
dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll);
sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx);
dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx);
for (ga_int i = GID_0; i < N; i += GDIM_0)
{
%(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (ga_int j = LID_0; j < K; j += LDIM_0)
{
if (y_i == j)
{
dx[i * dx_s0 + j * dx_s1] =
%(write_dx)s(dnll_i *
(%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
}
else
{
dx[i * dx_s0 + j * dx_s1] =
%(write_dx)s(dnll_i *
%(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
}
}
}
}
"""
% locals(),
file=sio,
)
return [
Kernel(
code=sio.getvalue(),
name=kname,
params=params,
flags=flags,
objvar=k_var,
)
]
# Singleton instance used by the GPU graph optimizers.
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuKernelBaseCOp, _NoPythonOp):
    """
    Implement Softmax on the gpu.
    """

    __props__ = ()
    # float16 inputs are accepted; reductions run in the work dtype.
    _f16_ok = True

    def make_node(self, x):
        # Output has the same type as the (GPU-transferred) input.
        x = as_gpuarray_variable(x, infer_context_name(x))
        return Apply(self, [x], [x.type()])

    def infer_shape(self, fgraph, node, shape):
        # Softmax is elementwise over rows: output shape == input shape.
        return shape

    def c_code_cache_version(self):
        return (17,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]

    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
        work_x = work_dtype(dtype_x)
        dtype_z = node.outputs[0].dtype
        itemsize_x = np.dtype(dtype_x).itemsize
        itemsize_z = np.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        (x,) = inp
        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        # fmt_str/msg are set at the kernel-call site so the error message
        # names whichever of the two kernels was actually launched.
        err_check = (
            """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
                %(fail)s;
            }
            """
            % locals()
        )
        return (
            """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == %(z)s) ||
            (PyGpuArray_DIMS(%(z)s)[0] !=
             PyGpuArray_DIMS(%(x)s)[0]) ||
            (PyGpuArray_DIMS(%(z)s)[1] !=
             PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode)s, GA_C_ORDER,
                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
            //TODO, detect the maximum number of thread per block.
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                              2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
            ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
            ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
            const char *fmt_str, *msg;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(x)s)[0],
                (void *)&PyGpuArray_DIMS(%(x)s)[1],
                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
                (void *)&stride_X0, (void *)&stride_X1,
                (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
                (void *)&stride_Z0, (void *)&stride_Z1};
            int err = GA_NO_ERROR;
            if (PyGpuArray_DIMS(%(x)s)[0] > 0)
            {
                //Those numbers are based on not too recent GPU
                //to make them compatible with more GPU.
                //TODO: read the information from the card.
                if(shmem_sz < (32 * 1024 - 500)){
                    err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
                                         n_blocks, threads_per_block, shmem_sz,
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
                }else{
                    err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
                                         n_blocks, threads_per_block,
                                         threads_per_block[0] * sizeof(npy_%(work_x)s),
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
                }
                %(err_check)s
            }
        }
        assert(%(z)s);
        """
            % locals()
        )

    def gpu_kernels(self, node, nodename):
        # Two kernel variants are generated:
        #  - kSoftmax: caches the whole row (2*N work elements) in shared
        #    memory; used when that fits in shared memory.
        #  - kSoftmax_fixed_shared: uses only one work element per thread
        #    of shared memory and re-reads the row from global memory;
        #    fallback for wide rows.
        dtype_x = node.inputs[0].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(dtype_x)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(dtype_sm)
        flags = Kernel.get_flags(dtype_x, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)
        # NOTE(review): ``ctype`` duplicates ``type_acc``; both are kept for
        # the ``% locals()`` substitutions below.
        ctype = gpuarray.dtype_to_ctype(work_sm)
        # Parameter signature: must match the KERNEL declarations below
        # (shared-memory buffer is passed via GA_DECL_SHARED_PARAM).
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmax"
        k_var = "kSoftmax_" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0) {
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0) {
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        kname = "kSoftmax_fixed_shared"
        k_var = "kSoftmax_fixed_shared" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
                GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = max(red, %(load_x)s(x_ptr[i * sx1]));
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) - row_max);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = red + exp(%(load_x)s(x_ptr[i * sx1]) - row_max);
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) - row_max) / row_sum);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        return kernels
# Singleton instance used by the GPU graph optimizers.
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuKernelBaseCOp, _NoPythonOp):
    """
    Implement SoftmaxWithBias on the gpu.
    """

    nin = 2
    nout = 1
    __props__ = ()
    # float16 inputs are accepted; reductions run in the work dtype.
    _f16_ok = True

    def make_node(self, x, b):
        # Move both inputs to a common GPU context; output matches `x`.
        ctx_name = infer_context_name(x, b)
        x = as_gpuarray_variable(x, ctx_name)
        b = as_gpuarray_variable(b, ctx_name)
        return Apply(self, [x, b], [x.type()])

    def infer_shape(self, fgraph, node, shape):
        # Output shape is the shape of `x` (the bias only broadcasts).
        return [shape[0]]

    def c_code_cache_version(self):
        return (16,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]

    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
        work_x = work_dtype(dtype_x)
        itemsize_x = np.dtype(dtype_x).itemsize
        itemsize_b = np.dtype(dtype_b).itemsize
        itemsize_z = np.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        x, b = inp
        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        # fmt_str/msg are set at the kernel-call site so the error message
        # names whichever of the two kernels was actually launched.
        err_check = (
            """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
                %(fail)s;
            }
            """
            % locals()
        )
        return (
            """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error input");
            %(fail)s;
        }
        if (PyGpuArray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
            %(fail)s;
        }
        if ((PyGpuArray_DIMS(%(x)s)[1] !=
            PyGpuArray_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError,
                         "number of columns in x (%%ld)"
                         " does not match length of b (%%ld)",
                         (long int)PyGpuArray_DIMS(%(x)s)[1],
                         (long int)PyGpuArray_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(z)s)
            || (PyGpuArray_DIMS(%(z)s)[0] !=
                PyGpuArray_DIMS(%(x)s)[0])
            || (PyGpuArray_DIMS(%(z)s)[1] !=
                PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode)s, GA_C_ORDER,
                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
            //TODO, detect the maximum number of thread per block.
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                              2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
            ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
            ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
            ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
            const char *fmt_str, *msg;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(x)s)[0],
                (void *)&PyGpuArray_DIMS(%(x)s)[1],
                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
                (void *)&stride_X0, (void *)&stride_X1,
                (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
                (void *)&stride_B0,
                (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
                (void *)&stride_Z0, (void *)&stride_Z1};
            int err = GA_NO_ERROR;
            if (PyGpuArray_DIMS(%(x)s)[0] > 0)
            {
                if(shmem_sz < (32 * 1024 - 500)){
                    err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
                                         n_blocks, threads_per_block, shmem_sz,
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
                }else{
                    err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
                                         3, n_blocks, threads_per_block,
                                         threads_per_block[0] * sizeof(npy_%(work_x)s),
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
                }
                %(err_check)s
            }
        }
        assert(%(z)s);
        """
            % locals()
        )

    def gpu_kernels(self, node, nodename):
        # Same two-variant scheme as GpuSoftmax (full-row shared-memory
        # kernel plus a fixed-shared-memory fallback), with the bias `b`
        # added to each row element before the softmax.
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(node.inputs[0].dtype)
        load_b = load_w(node.inputs[1].dtype)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(node.outputs[0].dtype)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)
        # NOTE(review): ``ctype`` duplicates ``type_acc``; both are kept for
        # the ``% locals()`` substitutions below.
        ctype = gpuarray.dtype_to_ctype(work_sm)
        # Parameter signature: must match the KERNEL declarations below.
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf[tx] += %(load_b)s(b[tx * sb0]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){;
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
                GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]));
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max);
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        return kernels
# Singleton instance used by the GPU graph optimizers.
gpu_softmax_with_bias = GpuSoftmaxWithBias()
This source diff could not be displayed because it is too large. You can view the blob instead.
from functools import wraps
import numpy as np
from aesara import scalar as aes
from aesara.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuFromHost,
GpuReshape,
HostFromGpu,
host_from_gpu,
)
from aesara.gpuarray.elemwise import GpuDimShuffle, GpuElemwise
from aesara.gpuarray.type import GpuArrayType, get_context, move_to_gpu
from aesara.graph.basic import Constant
from aesara.graph.op import Op
from aesara.graph.opt import copy_stack_trace, local_optimizer
from aesara.tensor.basic import as_tensor, cast, get_scalar_constant_value, join
from aesara.tensor.elemwise import DimShuffle
from aesara.tensor.exceptions import NotScalarConstantError
from aesara.tensor.math import prod
from aesara.tensor.shape import shape_padright
from aesara.tensor.type import TensorType
# Define a few operations to use in optimizations,
# in order to avoid introducing new CPU Ops, or useless ones.
def safe_to_gpu(x, ctx_name):
    """Transfer `x` to the GPU context `ctx_name` if it is a CPU tensor.

    Variables that are not of ``TensorType`` (e.g. already on the GPU)
    are returned unchanged.
    """
    if not isinstance(x.type, TensorType):
        return x
    return GpuFromHost(ctx_name)(x)
def safe_to_cpu(x):
    """Move `x` back to the host if it lives on the GPU; otherwise no-op."""
    return x.transfer("cpu") if isinstance(x.type, GpuArrayType) else x
def grab_cpu_scalar(v, nd):
    """
    Get a scalar variable value from the tree at `v`.

    This function will dig through transfers and dimshuffles to get
    the constant value. If no such constant is found, it returns None.

    Parameters
    ----------
    v
        Aesara variable to extract the constant value from.
    nd : int
        Expected number of dimensions for the variable (for
        broadcasted constants).
    """
    node = v.owner
    if node is None:
        # Leaf variable: accept only a constant that broadcasts in all
        # `nd` dimensions, collapsed down to a 0-d scalar.
        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
            return v.dimshuffle(())
        return None
    op = node.op
    if isinstance(op, (GpuDimShuffle, DimShuffle)) and op.new_order == ("x",) * nd:
        # Peel off a pure broadcast dimshuffle and recurse on its input.
        inner = node.inputs[0]
        return grab_cpu_scalar(inner, inner.ndim)
    if isinstance(op, (GpuFromHost, HostFromGpu)):
        # Transfers do not change the value; look through them.
        return grab_cpu_scalar(node.inputs[0], nd)
    return None
def find_node(fgraph, v, cls, ignore_clients=False):
    """
    Find the node that has an op of type `cls` in `v`.

    This digs through possibly redundant transfers to find the node
    that has the type `cls`. If `ignore_clients` is False (the
    default) it will only dig through nodes that have a single client
    to avoid duplicating computations.

    Parameters
    ----------
    v
        The variable to dig through
    cls : Op class
        The type of the node we are looking for
    ignore_clients : bool, optional
        Whether to ignore multiple clients or not.
    """
    if v.owner is not None and (ignore_clients or len(fgraph.clients[v]) == 1):
        if isinstance(v.owner.op, cls):
            return v.owner
        elif (
            # Look through a redundant GpuFromHost(HostFromGpu(x)) pair.
            isinstance(v.owner.op, GpuFromHost)
            and v.owner.inputs[0].owner is not None
            and (ignore_clients or len(fgraph.clients[v.owner.inputs[0]]) == 1)
            and isinstance(v.owner.inputs[0].owner.op, HostFromGpu)
        ):
            # NOTE(review): the recursive call does not forward
            # `ignore_clients`, so it reverts to False after one transfer
            # hop — confirm whether this is intentional.
            return find_node(fgraph, v.owner.inputs[0].owner.inputs[0], cls)
        else:
            return None
def is_equal(var, val):
    """
    Returns True if `var` is always equal to `val`.

    This will only return True if the variable will always be equal to
    the value. If it might not be true in some cases then it returns False.

    Parameters
    ----------
    var
        Variable to compare
    val
        Python value
    """
    try:
        # Only a graph constant can be "always equal" to a value.
        return get_scalar_constant_value(var) == val
    except NotScalarConstantError:
        return False
def alpha_merge(cls, alpha_in, beta_in):
    """
    Decorator to merge multiplication by a scalar on the output.

    This will find a pattern of `aes * <yourop>(some, params, alpha,
    beta)` and update it so that the scalar multiplication happens as
    part of your op.

    The op needs to accept an alpha and a beta scalar which act this way::

        out = Op() * alpha + out_like * beta

    Where out_like is a buffer that has the same size as the output
    and gets added to the "real" output of the operation. An example
    of an operation that respects this pattern is GEMM from blas.

    The decorated function must have this signature::

        maker(node, *inputs)

    The `node` argument you receive is the original apply node that
    contains your op. You should use it to grab relevant properties
    for your op so that the new version performs the same computation.
    The `*inputs` parameters contains the new inputs for your op. You
    MUST use those inputs instead of the ones on `node`. Note that
    this function can be as simple as::

        def maker(node, *inputs):
            return node.op(*inputs)

    Parameters
    ----------
    cls : op class
        The class of the op you want to merge
    alpha_in : int
        The input index for the alpha scalar for your op (in node.inputs).
    beta_in : int
        The input index for the beta scalar for your op (in node.inputs).

    Returns
    -------
    local optimizer
        an unregistered local optimizer that has the same name as the
        decorated function.

    Notes
    -----
    This was factored out since the code to deal with intervening
    transfers and correctness in the presence of different values of
    alpha and beta scaling factors is not trivial.
    """

    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(fgraph, node):
            # Match an elementwise multiplication with exactly two inputs.
            if (
                isinstance(node.op, GpuElemwise)
                and node.op.scalar_op == aes.mul
                and node.nin == 2
            ):
                # One side must be the target op, the other a broadcasted
                # CPU scalar.
                targ = find_node(fgraph, node.inputs[0], cls)
                if targ is None:
                    targ = find_node(fgraph, node.inputs[1], cls)
                    if targ is None:
                        return
                    lr = grab_cpu_scalar(node.inputs[0], nd=targ.outputs[0].ndim)
                else:
                    lr = grab_cpu_scalar(node.inputs[1], nd=targ.outputs[0].ndim)
                if lr is None or lr.dtype != targ.outputs[0].dtype:
                    return None
                inputs = list(targ.inputs)
                try:
                    # Special-case constant multipliers 0 and 1 to avoid
                    # building useless multiplication nodes.
                    c = get_scalar_constant_value(lr)
                    if c == 0:
                        inputs[alpha_in] = lr
                        inputs[beta_in] = lr
                    elif c == 1:
                        inputs[alpha_in] = targ.inputs[alpha_in]
                        inputs[beta_in] = targ.inputs[beta_in]
                    else:
                        inputs[alpha_in] = lr * targ.inputs[alpha_in]
                        inputs[beta_in] = lr * targ.inputs[beta_in]
                except NotScalarConstantError:
                    # Non-constant scalar: scale both alpha and beta.
                    inputs[alpha_in] = lr * targ.inputs[alpha_in]
                    inputs[beta_in] = lr * targ.inputs[beta_in]
                new_out = maker(targ, *inputs)
                copy_stack_trace(node.outputs, new_out)
                return new_out

        return opt

    return wrapper
def output_merge(cls, alpha_in, beta_in, out_in):
    """
    Decorator to merge addition by a value on the output.

    This will find a pattern of `val * <yourop>(some, params, alpha,
    beta, out_like)` and update it so that the addition happens as
    part of your op.

    The op needs to accept an alpha and a beta scalar which act this way::

        out = Op() * alpha + out_like * beta

    Where out_like is a buffer that has the same size as the output
    and gets added to the "real" output of the operation. An example
    of an operation that respects this pattern is GEMM from blas.

    The decorated function must have this signature::

        maker(node, *inputs)

    The `node` argument you receive is the original apply node that
    contains your op. You should use it to grab relevant properties
    for your op so that the new version performs the same computation.
    The `*inputs` parameters contains the new inputs for your op. You
    MUST use those inputs instead of the ones on `node`. Note that
    this function can be as simple as::

        def maker(node, *inputs):
            return node.op(*inputs)

    Parameters
    ----------
    cls : op class
        The class of the op you want to merge
    alpha_in : int
        The input index for the alpha scalar for your op (in node.inputs).
    beta_in : int
        The input index for the beta scalar for your op (in node.inputs).
    out_in : int
        The input index for the out_like input for your op (in node.inputs).

    Returns
    -------
    local optimizer
        an unregistered local optimizer that has the same name as the
        decorated function.

    Notes
    -----
    This was factored out since the code to deal with intervening
    transfers and correctness in the presence of different values of
    alpha and beta scaling factors is not trivial.

    This also correctly handles the case where the added value is
    broadcasted (by not performing the replacement).
    """

    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(fgraph, node):
            # Match an elementwise addition with exactly two inputs.
            if (
                isinstance(node.op, GpuElemwise)
                and node.op.scalar_op == aes.add
                and node.nin == 2
            ):
                targ = find_node(fgraph, node.inputs[0], cls)
                W = node.inputs[1]
                if targ is None:
                    targ = find_node(fgraph, node.inputs[1], cls)
                    W = node.inputs[0]
                if targ is None:
                    return None
                if W.dtype != targ.outputs[0].dtype:
                    return None
                if not is_equal(targ.inputs[beta_in], 0.0):
                    # other cases are too complex for now
                    return None
                if W.broadcastable != targ.inputs[out_in].broadcastable:
                    # Would need to explicitly tile the output to fill
                    # the full shape here. Disable for now.
                    return None
                inputs = list(targ.inputs)
                inputs[out_in] = W
                # out_like is now the added value, so beta becomes 1.
                dtype = inputs[beta_in].dtype
                one = aes.constant(np.asarray(1.0, dtype=dtype))
                inputs[beta_in] = one
                new_out = maker(targ, *inputs)
                copy_stack_trace(node.outputs, new_out)
                return new_out

        return opt

    return wrapper
def inplace_allocempty(op, idx):
    """
    Wrapper to make an inplace optimization that deals with AllocEmpty

    This will duplicate the alloc input if it has more than one client
    to allow the op to work on it inplace.

    The decorated function must have this signature::

        maker(node, inputs)

    The `node` argument you receive is the original apply node that
    contains your op. You should use it to grab relevant properties
    for your op so that the new version performs the same computation.
    You should also switch the op to work inplace. The `inputs`
    parameter contains the new inputs for your op. You MUST use
    those inputs instead of the ones on `node`. Note that this
    function can be as simple as::

        def maker(node, inputs):
            return [node.op.__class__(inplace=True)(*inputs)]

    Parameters
    ----------
    op : op class
        The op class to look for to make inplace
    idx : int
        The index of the (possibly) AllocEmpty input (in node.inputs).

    Returns
    -------
    local optimizer
        an unregistered inplace local optimizer that has the same name
        as the decorated function.
    """

    def wrapper(maker):
        @local_optimizer([op], inplace=True)
        @wraps(maker)
        def opt(fgraph, node):
            if not isinstance(node.op, op) or node.op.inplace:
                return
            inputs = list(node.inputs)
            alloc = inputs[idx]
            if (
                # Duplicate a shared AllocEmpty so the inplace op gets a
                # private buffer it is allowed to destroy.
                alloc.owner
                and isinstance(alloc.owner.op, GpuAllocEmpty)
                and len(fgraph.clients[alloc]) > 1
            ):
                alloc_op = GpuAllocEmpty(
                    alloc.owner.op.dtype, alloc.owner.op.context_name
                )
                inputs[idx] = alloc_op(*alloc.owner.inputs)
            new_out = maker(node, inputs)
            copy_stack_trace(node.outputs, new_out)
            return new_out

        return opt

    return wrapper
def pad_dims(input, leftdims, rightdims):
    """Reshapes the input to a (leftdims + rightdims) tensor

    This helper function is used to convert pooling inputs with arbitrary
    non-pooling dimensions to the correct number of dimensions for the
    GPU pooling ops.

    This reduces or expands the number of dimensions of the input to
    exactly `leftdims`, by adding extra dimensions on the left or by
    combining some existing dimensions on the left of the input.

    Use `unpad_dims` to reshape back to the original dimensions.

    Examples
    --------
    Given input of shape (3, 5, 7), ``pad_dims(input, 2, 2)``
    adds a singleton dimension and reshapes to (1, 3, 5, 7).
    Given that output from pad_dims, ``unpad_dims(output, input, 2, 2)``
    reshapes back to (3, 5, 7).

    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 2)``
    does not reshape and returns output with shape (3, 5, 7, 9).

    Given input of shape (3, 5, 7, 9, 11), ``pad_dims(input, 2, 2)``
    combines the first two dimensions and reshapes to (15, 7, 9, 11).

    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 3)``
    adds a singleton dimension and reshapes to (1, 3, 5, 7, 9).
    """
    assert input.ndim >= rightdims

    target_ndim = leftdims + rightdims
    if input.ndim == target_ndim:
        # Already the right rank: nothing to do.
        return input

    # The trailing `rightdims` axes (the pooled "image" axes) are preserved.
    img_shape = input.shape[-rightdims:]
    non_pool_ndim = input.ndim - rightdims

    if non_pool_ndim < leftdims:
        # Too few leading dimensions: pad on the left with singletons.
        singletons = as_tensor([1] * (leftdims - non_pool_ndim))
        target_shape = join(0, singletons, input.shape[:non_pool_ndim], img_shape)
    else:
        # Too many leading dimensions: fold the extras into one batch axis.
        batched_ndim = non_pool_ndim - leftdims + 1
        collapsed = prod(input.shape[:batched_ndim])
        # `join` needs a vector, so promote the scalar product.
        collapsed = shape_padright(collapsed, 1)
        target_shape = join(
            0, collapsed, input.shape[batched_ndim:non_pool_ndim], img_shape
        )

    # GpuReshape expects an int64 shape vector.
    target_shape = cast(target_shape, "int64")
    return GpuReshape(target_ndim)(input, target_shape)
def unpad_dims(output, input, leftdims, rightdims):
    """Reshapes the output after pad_dims.

    This reverts the padding by `pad_dims`.
    """
    if output.ndim == input.ndim:
        # `pad_dims` did not reshape, so there is nothing to undo.
        return output
    # Rebuild the original leading shape, keeping the (possibly pooled)
    # trailing `rightdims` axes from the output.
    restored_shape = join(0, input.shape[:-rightdims], output.shape[-rightdims:])
    return GpuReshape(input.ndim)(output, restored_shape)
def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))

    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Decorator factory: wraps `maker` into a local optimizer that lifts
    nodes of type `OP` onto the GPU when either an input already lives
    on the GPU or every client transfers the result to the GPU.

    Parameters
    ----------
    OP : Op class
        The op type to track and lift.
    cuda_only : bool
        If True, only lift when the target context is a CUDA context.
    """

    def f(maker):
        def local_opt(fgraph, node):
            if isinstance(node.op, OP):
                # Either one of our inputs is on the gpu or
                # all of our clients are on the gpu
                replace = False
                # TODO: Maybe set context_name with infer_context_name()?
                context_name = None
                # We replace if any input is a host_from_gpu
                for i in node.inputs:
                    if i.owner and i.owner.op == host_from_gpu and move_to_gpu(i):
                        # Take the context of the first GPU input found.
                        context_name = i.owner.inputs[0].type.context_name
                        replace = True
                        break

                if not replace:
                    # We replace if *all* clients are on the GPU
                    clients = [c for o in node.outputs for c in fgraph.clients[o]]
                    replace = len(clients) != 0
                    for c, idx in clients:
                        # "output" is the sentinel fgraph uses for graph outputs.
                        if c == "output" or not isinstance(c.op, GpuFromHost):
                            replace = False
                    # TODO: check that the clients want the same context?
                    if replace:
                        # All clients are GpuFromHost and we have at least one
                        context_name = clients[0][0].op.context_name

                # Check if we should replace
                if (
                    not replace
                    or (cuda_only and get_context(context_name).kind != b"cuda")
                    # Complex dtypes are not supported on the GPU backend.
                    or any("complex" in getattr(i, "dtype", "") for i in node.inputs)
                ):
                    return False

                # tag the inputs with the context in case
                # the context was derived from the outputs
                for i in node.inputs:
                    i.tag.context_name = context_name

                new_op = maker(node.op, context_name, node.inputs, node.outputs)

                # This is needed as sometimes new_op inherits from OP.
                if new_op and new_op != node.op:
                    if isinstance(new_op, Op):
                        new_outputs = new_op(*node.inputs, return_list=True)
                        to_cpu_fn = safe_to_cpu
                    elif isinstance(new_op, (tuple, list)):
                        # `maker` may return ready-made output variables.
                        new_outputs = new_op
                        to_cpu_fn = safe_to_cpu
                    else:  # suppose it is a variable on the GPU
                        new_outputs = [new_op]

                        def to_cpu_fn(x):
                            return x.transfer("cpu")

                    # copy stack traces onto gpu outputs
                    # also copy the stack traces onto HostFromGpu outputs
                    on_cpu = []
                    for old_output, new_output in zip(node.outputs, new_outputs):
                        copy_stack_trace(old_output, new_output)
                        cpu = to_cpu_fn(new_output)
                        on_cpu.append(cpu)
                        copy_stack_trace(old_output, cpu)
                    return on_cpu
            return False

        local_opt.__name__ = maker.__name__
        return local_optimizer(OP)(local_opt)

    return f
import time
from aesara.compile import optdb
from aesara.graph.basic import applys_between
from aesara.graph.opt import LocalOptGroup, TopoOptimizer, local_optimizer
from aesara.graph.optdb import (
EquilibriumDB,
LocalGroupDB,
OptimizationDatabase,
SequenceDB,
)
class GraphToGPULocalOptGroup(LocalOptGroup):
    """This is the equivalent of `LocalOptGroup` for `GraphToGPU`.

    The main difference is the function signature of the local
    optimizer that uses the `GraphToGPU` signature and not the normal
    `LocalOptimizer` signature.

    ``apply_all_opts=True`` is not supported

    """

    def __init__(self, *optimizers, **kwargs):
        super().__init__(*optimizers, **kwargs)
        assert self.apply_all_opts is False

    def transform(self, fgraph, op, context_name, inputs, outputs):
        """Try each tracked optimizer in turn; return the first replacement.

        Returns ``None`` when no optimizer applies.
        """
        if len(self.opts) == 0:
            return
        for opt in self.tracker.get_trackers(op):
            opt_start = time.time()
            new_repl = opt.transform(fgraph, op, context_name, inputs, outputs)
            opt_finish = time.time()
            if self.profile:
                # BUG FIX: accumulate the elapsed time (finish - start).
                # The previous code added ``opt_start - opt_finish``,
                # i.e. a negative duration, corrupting profile output.
                self.time_opts[opt] += opt_finish - opt_start
                self.process_count[opt] += 1
            if not new_repl:
                continue
            if self.profile:
                self.node_created[opt] += len(
                    list(applys_between(fgraph.variables, new_repl))
                )
                self.applied_true[opt] += 1

            return new_repl
# Rewrite databases for the GPU backend.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()

gpu_seqopt = SequenceDB()

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register(
    "gpuarray_opt",
    gpu_seqopt,
    "gpuarray",
    # Run one position before the destroy handler; 49.5 is the fallback
    # when "add_destroy_handler" is absent from the position map.
    position=optdb.__position__.get("add_destroy_handler", 49.5) - 1,
)

# Per-feature local databases.  The *2 variants use the GraphToGPU
# signature (via GraphToGPULocalOptGroup) instead of the plain
# LocalOptimizer signature.
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
abstract_batch_norm_groupopt = LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
def register_opt(*tags, **kwargs):
    """Decorator registering `local_opt` in `gpu_optimizer`.

    The optimizer is registered under the "fast_run" and "gpuarray"
    tags plus any extra `tags`.  An explicit registration name may be
    supplied with the ``name`` keyword; otherwise the optimizer's own
    ``__name__`` is used.
    """

    def f(local_opt):
        # BUG FIX: ``(kwargs and kwargs.pop("name"))`` raised ``KeyError``
        # whenever *other* keyword arguments were passed without ``name``.
        # ``pop`` with a default handles every case.
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.

    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.

    """

    def f(local_opt):
        # BUG FIX: ``(kwargs and kwargs.pop("name"))`` raised ``KeyError``
        # whenever *other* keyword arguments were passed without ``name``.
        name = kwargs.pop("name", None) or local_opt.__name__
        if isinstance(local_opt, OptimizationDatabase):
            # A whole database can be registered as-is.
            opt = local_opt
        else:
            # Wrap the bare function into a LocalOptimizer tracking `tracks`.
            opt = local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_inplace(*tags, **kwargs):
    """Decorator registering `local_opt` as an inplace rewrite in `optdb`.

    The optimizer is wrapped in a `TopoOptimizer` (with the inplace
    failure callback) and registered at position 60 under the
    "fast_run", "inplace" and "gpuarray" tags plus any extra `tags`.
    """

    def f(local_opt):
        # BUG FIX: ``(kwargs and kwargs.pop("name"))`` raised ``KeyError``
        # whenever *other* keyword arguments were passed without ``name``.
        name = kwargs.pop("name", None) or local_opt.__name__
        optdb.register(
            name,
            TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
            "fast_run",
            "inplace",
            "gpuarray",
            *tags,
            position=60,
        )
        return local_opt

    return f
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
# Registered under "fast_compile" in addition to register_opt's default tags.
register_opt("fast_compile")(abstractconv_groupopt)
class GraphToGPUDB(OptimizationDatabase):
    """
    Retrieves the list local optimizers based on the optimizer flag's value
    from EquilibriumOptimizer by calling the method query.

    """

    def query(self, *tags, **kwtags):
        # Local import — presumably to avoid an import cycle with
        # aesara.gpuarray.opt (TODO confirm).
        from aesara.gpuarray.opt import GraphToGPU

        opt = gpu_optimizer2.query(*tags, **kwtags)
        # Build a GraphToGPU pass from the raw local optimizers selected
        # by the query.
        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
import os
import sys
from typing import Set
class PathParser:
    """
    Class that allows to modify system's PATH environment variable
    at runtime. Currently used in ``aesara.gpuarray.dnn`` module
    on Windows only.

    **Examples**:

    ..code-block:: python

        aesara.pathparse.PathParser(pathToAdd1, pathToAdd2, ...)
        # PATH is then automatically updated for this execution.

    ..code-block:: python

        paths = aesara.pathparse.PathParser()
        paths.add(path1)
        paths.add(path2)
        # PATH is updated after each call to ``add()``.
    """

    # Kept for backward compatibility of ``PathParser.paths`` lookups; each
    # instance shadows this with its own set in ``__init__``.
    paths: Set = set()

    def __init__(self, *paths):
        # BUG FIX: ``paths`` used to be a single class-level set shared by
        # every instance, so paths added through one PathParser leaked into
        # all others.  Give each instance its own set.
        self.paths = set()
        self._parse()
        for path in paths:
            self._add(path)
        self._update()

    def _add(self, path):
        """Normalize `path` and record it (does not touch the environment)."""
        path = path.strip()
        if path:
            if sys.platform == "win32":
                # Windows is case-insensitive.
                path = path.lower()
            self.paths.add(os.path.abspath(path))

    def _update(self):
        """Write the recorded paths back to ``os.environ["PATH"]`` (sorted)."""
        os.environ["PATH"] = os.pathsep.join(sorted(self.paths))

    def _parse(self):
        """Seed the set with the entries currently in PATH."""
        for path in os.environ["PATH"].split(os.pathsep):
            self._add(path)

    def add(self, path):
        """Record `path` and immediately update the PATH variable."""
        self._add(path)
        self._update()

    def _debug(self):
        # Debug helper: print the recorded paths in sorted order.
        for path in sorted(self.paths):
            print(path)
import aesara
from aesara.gpuarray.basic_ops import (
CGpuKernelBase,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import gpu_context_type
from aesara.graph.basic import Apply
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor.basic import as_tensor_variable
from aesara.tensor.signal.pool import Pool, PoolingMode_t
from aesara.tensor.type import int_dtypes
try:
import pygpu
except ImportError:
# To make sure aesara is importable
pass
class GpuPool(CGpuKernelBase):
    """
    Implement the max and average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    # C-side parameters: border handling, pooling mode and the GPU context.
    params_type = ParamsType(
        ignore_border=bool_t, mode=PoolingMode_t, context=gpu_context_type
    )

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        # "average" is normalized to the "average_inc_pad" alias.
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool.c"], "APPLY_SPECIFIC(pool)")
        assert PoolingMode_t.has_alias(self.mode)
        assert self.ndim in (2, 3)

    def get_params(self, node):
        # Bind the GPU context of the image input to the C params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` must have ``ndim + 2`` dimensions (leading batch/channel axes
        plus `ndim` pooled axes).  `ws`, `stride` and `pad` are 1-d integer
        vectors; `stride` defaults to `ws` and `pad` to all zeros.
        """
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        nd = self.ndim
        assert inp.ndim == nd + 2
        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            # Early validation is only possible for literal pad/ws values.
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # Delegate to the CPU Pool op's shape computation.
        ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]

    def grad(self, inp, grads):
        img, ws, stride, pad = inp
        (grad,) = grads
        grad = gpu_contiguous(grad)
        # ws/stride/pad are integer hyper-parameters: disconnected from the cost.
        disc = [aesara.gradient.DisconnectedType()() for i in inp[1:]]
        if self.mode == "max":
            # The max-pool gradient needs the pooled output as well.
            out = self(img, ws, stride, pad)
            g_out = GpuMaxPoolGrad(ndim=self.ndim, ignore_border=self.ignore_border)(
                img, out, grad, ws, stride, pad
            )
            return [g_out] + disc
        else:
            g_out = GpuAveragePoolGrad(
                ndim=self.ndim, ignore_border=self.ignore_border, mode=self.mode
            )(img, grad, ws, stride, pad)
            return [g_out] + disc

    def connection_pattern(self, node):
        # Only the image input (index 0) is connected to the output.
        return [[1], [0], [0], [0]]

    def R_op(self, inputs, eval_points):
        if self.mode != "max":
            # Rop for average or sum is simply pooling evaluated at eval point
            eval_inputs = [eval_points[0]] + inputs[1:]
            return [self(*eval_inputs)]

        # R_op can receive None as eval_points.
        # That means there is no differentiable path through that input.
        # If this implies that you cannot compute some outputs,
        # return None for those.
        if eval_points[0] is None:
            return [None]
        z = self(*inputs)
        x, ws, stride, pad = inputs
        return [
            GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode, self.ndim)(
                x, z, eval_points[0], ws, stride, pad
            )
        ]
class GpuMaxPoolGrad(CGpuKernelBase):
    """
    Implement the grad of max pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_max_grad.c"], "APPLY_SPECIFIC(max_pool_grad)"
        )
        # This op only implements the gradient of *max* pooling.
        assert mode == "max"
        assert ndim in (2, 3)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out, out_grad, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` is the pooling input, `out` the pooled output and
        `out_grad` the gradient w.r.t. `out`; all three must have
        ``ndim + 2`` dimensions.  `stride` defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp, out, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out = as_gpuarray_variable(out, ctx_name)
        assert out.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim
        assert inp.ndim == out.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # The gradient has the shape of the original pooling input.
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, maxout, gz, ws, stride, pad = inp
        (ggx,) = grads
        return [
            aesara.tensor.zeros_like(x),
            aesara.tensor.zeros_like(maxout),
            GpuDownsampleFactorMaxGradGrad(
                ndim=self.ndim, ignore_border=self.ignore_border
            )(x, maxout, ggx, ws, stride, pad),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[3:]]

    def connection_pattern(self, node):
        # x, maxout and gz are connected; ws/stride/pad are not.
        return [[1], [1], [1], [0], [0], [0]]
class GpuAveragePoolGrad(CGpuKernelBase):
    """
    Implement the grad of average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="average_inc_pad", ndim=2):
        # BUG FIX: the default used to be mode="max", which always failed
        # the assertion below ("max" is not a valid average-grad mode), so
        # the default was unusable.  "average_inc_pad" matches the alias
        # that "average" maps to.
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_ave_grad.c"], "APPLY_SPECIFIC(ave_pool_grad)"
        )
        assert mode in ("sum", "average_inc_pad", "average_exc_pad")
        assert ndim in (2, 3)

    def get_params(self, node):
        # Bind the GPU context of the image input to the C params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out_grad, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` is the pooling input and `out_grad` the gradient w.r.t. the
        pooled output; both must have ``ndim + 2`` dimensions.  `stride`
        defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and self.mode != "average_exc_pad":
                raise ValueError("Padding must be zero for average_exc_pad")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # The gradient has the shape of the original pooling input.
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, gz, ws, stride, pad = inp
        (ggx,) = grads
        return [
            aesara.tensor.zeros_like(x),
            # The grad of the average-pool grad is average pooling itself.
            GpuPool(ignore_border=self.ignore_border, ndim=self.ndim, mode=self.mode)(
                ggx, ws, stride, pad
            ),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[2:]]

    def connection_pattern(self, node):
        # x and gz are connected; ws/stride/pad are not.
        return [[1], [1], [0], [0], [0]]
class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
    """
    Implement the grad of downsample with max on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_grad_grad.c"], "APPLY_SPECIFIC(pool_grad_grad)"
        )
        # Only max pooling has this grad-of-grad implementation.
        assert self.mode == "max"
        assert self.ndim in (2, 3)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out, out_grad, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` is the pooling input, `out` the pooled output and
        `out_grad` the gradient being propagated; all must have
        ``ndim + 2`` dimensions.  `stride` defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp, out, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out = as_gpuarray_variable(out, ctx_name)
        # BUG FIX: the two assertions below were swapped — `out_grad` was
        # checked before its conversion and `out` only afterwards.  Check
        # each variable right after its own conversion, matching
        # GpuMaxPoolGrad.make_node.
        assert out.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim
        assert inp.ndim == out.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # The result has the shape of the pooled output (input index 1).
        return [in_shapes[1]]

    def grad(self, inp, grads):
        x, maxout, ggx, ws, stride, pad = inp
        (gz,) = grads
        return [
            aesara.tensor.zeros_like(x),
            aesara.tensor.zeros_like(maxout),
            GpuMaxPoolGrad(ignore_border=self.ignore_border, ndim=self.ndim)(
                x, maxout, gz, ws, stride, pad
            ),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[3:]]

    def connection_pattern(self, node):
        # x, maxout and ggx are connected; ws/stride/pad are not.
        return [[1], [1], [1], [0], [0], [0]]
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample operation.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_max_rop.c"], "APPLY_SPECIFIC(max_pool_rop)"
        )
        # Only the max mode has an R-operator kernel.
        assert mode == "max"
        assert ndim in (2, 3)

    def get_params(self, node):
        # Bind the GPU context of the image input to the C params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` and `eval_point` must both have ``ndim + 2`` dimensions.
        `stride` defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert eval_point.ndim == nd + 2

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            # Early validation is only possible for literal pad/ws values.
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # Delegate to the CPU Pool op's shape computation.
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]
from aesara.graph.basic import Apply
from aesara.link.c.op import COp
from aesara.link.c.type import Generic
from .basic_ops import as_gpuarray_variable, gpuarray_helper_inc_dir, infer_context_name
from .type import GpuArrayType
try:
import pygpu
except ImportError:
pass
class GpuMaxAndArgmax(COp):
    """
    GPU version of MaxAndArgmax

    """

    params_type = Generic()
    __props__ = ("axis",)
    # dtype of the argmax output.
    argmax_dtype = "int64"

    def __init__(self, axis):
        assert isinstance(axis, (list, tuple))
        self.axis = tuple(axis)

    def get_params(self, node):
        # The axes tuple is passed to the C code as the params object.
        return self.axis

    def make_node(self, X):
        context_name = infer_context_name(X)
        # We keep the original broadcastable flags for dimensions on which
        # we do not perform the max / argmax.
        all_axes = set(self.axis)
        broadcastable = [
            b for i, b in enumerate(X.type.broadcastable) if i not in all_axes
        ]
        inputs = [as_gpuarray_variable(X, context_name)]
        outputs = [
            GpuArrayType(X.type.dtype, broadcastable, context_name=context_name)(),
            GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name)(),
        ]
        return Apply(self, inputs, outputs)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_code(self, node, name, input_names, output_names, sub):
        """Generate the C implementation (delegates to GpuArray_maxandargmax)."""
        # Recall: X = input_names[0]
        # Recall: axes = sub['params']
        # Recall: max, argmax = output_names
        # Recall: fail = sub['fail']
        max_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        argmax_typecode = pygpu.gpuarray.dtype_to_typecode(self.argmax_dtype)
        ret = """
        #if PY_MAJOR_VERSION >= 3
            #ifndef PyInt_AS_LONG
                #define PyInt_AS_LONG PyLong_AS_LONG
            #endif
        #endif

        int err = 0;

        unsigned %(name)s_redux_len = PyTuple_GET_SIZE(%(axes)s);
        unsigned* %(name)s_axes_to_reduce = (unsigned*)malloc(%(name)s_redux_len * sizeof(unsigned));
        for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
            PyObject* axis_object = PyTuple_GET_ITEM(%(axes)s, i);
            %(name)s_axes_to_reduce[i] = (unsigned) PyInt_AS_LONG(axis_object);
        }

        size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
        size_t %(name)s_output_ndim = %(name)s_input_ndim - %(name)s_redux_len;
        size_t* %(name)s_output_dims = (size_t*)malloc(%(name)s_output_ndim * sizeof(size_t));
        if (%(name)s_redux_len == 1) {
            for (unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
                %(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
            }
            for (unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
                %(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
            }
        } else {
            int64_t current_input_pos = -1;
            int64_t current_output_pos = -1;
            for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
                for (++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
                    %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
                }
            }
            for (++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
                %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
            }
        }

        if (aesara_prep_output(&%(max)s, %(name)s_output_ndim, %(name)s_output_dims, %(max_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare max output.");
            %(fail)s
        }
        if (aesara_prep_output(&%(argmax)s, %(name)s_output_ndim, %(name)s_output_dims, %(argmax_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare argmax output.");
            %(fail)s
        }

        if (%(name)s_input_ndim == 0) {
            /* GpuArray_maxandargmax can't handle a 0-d array
             * because it expects that 1 <= redux_len <= input_ndim.
             * As input_ndim == 0, then 1 <= redux_len <= 0 is false.
             * To handle this case we copy input to max and we set argmax to 0.
             */
            if (GA_NO_ERROR != GpuArray_setarray(&%(max)s->ga, &%(X)s->ga)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to copy input to max when input is a scalar.");
                %(fail)s
            }
            if (GA_NO_ERROR != GpuArray_memset(&%(argmax)s->ga, 0)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to set argmax to 0 when input is a scalar.");
                %(fail)s
            }
        } else if (GA_NO_ERROR != (err =
            GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, &%(X)s->ga, %(name)s_redux_len, %(name)s_axes_to_reduce)
        )) {
            PyErr_Format(PyExc_RuntimeError,
                "GpuMaxAndArgmax: unable to compute gpuarray maxandargmax: error %%d: %%s (%%s).",
                err, gpuarray_error_str(err), GpuArray_error(&%(X)s->ga, err));
            %(fail)s
        }
        """
        return ret % {
            "X": input_names[0],
            "axes": sub["params"],
            "max": output_names[0],
            "argmax": output_names[1],
            "max_typecode": max_typecode,
            "argmax_typecode": argmax_typecode,
            "name": name,
            "fail": sub["fail"],
        }

    def c_code_cleanup(self, node, name, inputs, outputs, sub):
        # Free the buffers malloc'd in c_code.
        return """
        free(%(name)s_output_dims);
        free(%(name)s_axes_to_reduce);
        """ % {
            "name": name,
        }

    def c_code_cache_version(self):
        return (2,)
"""
GPU implementation of MRG31k3p random number generator for Aesara.
Generator code in SSJ package (L'Ecuyer & Simard).
http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
"""
from aesara import tensor as at
from aesara.gpuarray.basic_ops import (
GpuFromHost,
GpuKernelBase,
Kernel,
as_gpuarray_variable,
host_from_gpu,
infer_context_name,
)
from aesara.gpuarray.fp16_help import write_w
from aesara.gpuarray.opt import register_opt, register_opt2
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
from aesara.graph.basic import Apply
from aesara.graph.opt import local_optimizer
from aesara.sandbox.rng_mrg import mrg_uniform, mrg_uniform_base
from aesara.scalar import int32 as int_t
from aesara.tensor import as_tensor_variable, get_vector_length
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
_f16_ok = True
params_type = mrg_uniform_base.params_type.extended(
otypecode=int_t, context=gpu_context_type
)
otypecode = property(lambda self: self.output_type.typecode)
    def make_node(self, rstate, size):
        """Build the Apply node: (rstate, size) -> (new rstate, samples)."""
        # error checking slightly redundant here, since
        # this op should not be called directly.
        #
        # call through MRG_RandomStream instead.
        broad = []
        for i in range(self.output_type.ndim):
            # A dimension is broadcastable when its requested size is the
            # constant 1.
            broad.append(at.extract_constant(size[i]) == 1)
        output_type = self.output_type.clone(broadcastable=broad)()
        rstate = as_gpuarray_variable(rstate, infer_context_name(rstate))
        return Apply(self, [rstate, size], [rstate.type(), output_type])
    def get_params(self, node):
        # Bind the GPU context of the rstate input to the params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        """Alternate constructor: build the op from `dtype`/`ndim`/`size`.

        When `ndim` is None it is inferred from the length of `size`.
        """
        v_size = as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(GpuArrayType(dtype, (False,) * ndim))
        return op(rstate, v_size)
    def c_headers(self, **kwargs):
        # numpy_compat.h is needed in addition to the base kernel headers.
        return super().c_headers(**kwargs) + ["numpy_compat.h"]
    def gpu_kernels(self, node, name):
        """Return the cluda kernel implementing the MRG31k3p sampler.

        The kernel advances 6-word per-stream states and writes uniform
        samples in ``[0, 1)`` scaled by a dtype-specific normalizer.
        """
        write = write_w(self.output_type.dtype)
        if self.output_type.dtype == "float16":
            otype = "ga_half"
            # limit the values of the state that we use.
            mask = "& 0x7fff"
            offset = "+ 1"
            NORM = "3.0458e-05f"  # numpy.float16(1.0/(2**15+33))
            # this was determined by finding the biggest number such that
            # numpy.float16(number * ((M1 & 0x7fff) + 1)) < 1.0
        elif self.output_type.dtype == "float32":
            otype = "float"
            mask = ""
            offset = ""
            NORM = "4.6566126e-10f"  # numpy.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # numpy.float32(number * M1) < 1.0
        elif self.output_type.dtype == "float64":
            otype = "double"
            mask = ""
            offset = ""
            NORM = "4.656612873077392578125e-10"
        else:
            raise ValueError("Unsupported data type for output", self.output_type.dtype)
        code = (
            """#include "cluda.h"

KERNEL void mrg_uniform(
        GLOBAL_MEM %(otype)s *sample_data,
        ga_size sample_offset,
        GLOBAL_MEM ga_int *state_data,
        ga_size state_offset,
        const ga_uint Nsamples,
        const ga_uint Nstreams_used)
{
    sample_data = (GLOBAL_MEM %(otype)s *)(((GLOBAL_MEM char *)sample_data) + sample_offset);
    state_data = (GLOBAL_MEM ga_int *)(((GLOBAL_MEM char *)state_data) + state_offset);
    /*
     * The cluda backend makes sure that ga_int corresponds to
     * a 32 bit signed type on the target device.  It is not a
     * variable width type.
     */
    const ga_int i7 = 7;
    const ga_int i9 = 9;
    const ga_int i15 = 15;
    const ga_int i16 = 16;
    const ga_int i22 = 22;
    const ga_int i24 = 24;

    const ga_int M1 = 2147483647;      //2^31 - 1
    const ga_int M2 = 2147462579;      //2^31 - 21069
    const ga_int MASK12 = 511;         //2^9 - 1
    const ga_int MASK13 = 16777215;    //2^24 - 1
    const ga_int MASK2 = 65535;        //2^16 - 1
    const ga_int MULT2 = 21069;

    const ga_uint idx = GID_0 * LDIM_0 + LID_0;
    ga_int y1, y2, x11, x12, x13, x21, x22, x23;

    if (idx < Nstreams_used)
    {
        x11 = state_data[idx*6+0];
        x12 = state_data[idx*6+1];
        x13 = state_data[idx*6+2];
        x21 = state_data[idx*6+3];
        x22 = state_data[idx*6+4];
        x23 = state_data[idx*6+5];

        for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
        {
            y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
            y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
            y1 += x13;
            y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
            x13 = x12;
            x12 = x11;
            x11 = y1;

            y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
            y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
            y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
            y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
            y2 += x23;
            y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
            y2 += y1;
            y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;

            x23 = x22;
            x22 = x21;
            x21 = y2;

            if (x11 <= x21) {
                sample_data[i] = %(write)s((((x11 - x21 + M1) %(mask)s) %(offset)s) * %(NORM)s);
            }
            else
            {
                sample_data[i] = %(write)s((((x11 - x21) %(mask)s) %(offset)s) * %(NORM)s);
            }
        }

        state_data[idx*6+0]= x11;
        state_data[idx*6+1]= x12;
        state_data[idx*6+2]= x13;
        state_data[idx*6+3]= x21;
        state_data[idx*6+4]= x22;
        state_data[idx*6+5]= x23;
    }
}
"""
            % locals()
        )
        # we shouldn't get to this line if it's about to fail
        from pygpu import gpuarray

        return [
            Kernel(
                code=code,
                name="mrg_uniform",
                params=[
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    "uint32",
                    "uint32",
                ],
                flags=Kernel.get_flags(self.output_type.dtype, "int32"),
            )
        ]
    def c_code(self, node, nodename, inp, out, sub):
        """Emit C code that validates inputs, allocates outputs and launches
        the ``mrg_uniform`` kernel.

        ``inp`` is ``(rstate, size)``, ``out`` is ``(o_rstate, o_sample)``.
        The state is reused in place when ``params->inplace`` is set,
        otherwise it is copied first.  Two failure macros are used:
        ``just_fail`` for failures before ``odims`` could leak, and
        ``fail`` which frees ``odims`` before bailing out.
        """
        return """
        npy_int64 M1 = 2147483647;      //2^31 - 1
        size_t n_elements = 1;
        unsigned int n_streams;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                                 || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
                                 || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
                                 || (PyGpuArray_NDIM(%(o_sample)s) != %(params)s->ndim));
        size_t* odims = (size_t*)malloc(%(params)s->ndim * sizeof(size_t));
        if (odims == NULL) {
          PyErr_NoMemory();
          %(just_fail)s
        }
        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(params)s->ndim)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
                %(params)s->ndim, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
        for (int i = 0; i < %(params)s->ndim; ++i)
        {
            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                                 || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (n_elements > M1)
        {
            PyErr_SetString(
                PyExc_ValueError,
                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
            %(fail)s
        }
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = pygpu_empty(%(params)s->ndim, odims, %(params)s->otypecode, GA_C_ORDER,
                                       %(params)s->context, Py_None);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!pygpu_GpuArray_Check((PyObject*)%(rstate)s))
        {
            PyErr_Format(PyExc_ValueError, "rstate must be gpuarray");
            %(fail)s;
        }
        Py_XDECREF(%(o_rstate)s);
        if (%(params)s->inplace)
        {
            Py_INCREF(%(rstate)s);
            %(o_rstate)s = %(rstate)s;
        }
        else
        {
            %(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
            if (!%(o_rstate)s) {
                %(fail)s
            }
        }
        if (PyGpuArray_NDIM(%(o_rstate)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rstate must be a matrix");
            %(fail)s
        }
        if (PyGpuArray_DIMS(%(o_rstate)s)[1] != 6)
        {
            PyErr_Format(PyExc_ValueError, "rstate must have 6 columns");
            %(fail)s
        }
        if (%(o_rstate)s->ga.typecode != GA_INT) {
            PyErr_Format(PyExc_ValueError, "rstate must be int32");
            %(fail)s
        }
        if (!GpuArray_CHKFLAGS(&%(o_rstate)s->ga, GA_C_CONTIGUOUS)) {
            PyErr_Format(PyExc_ValueError, "rstate must be C contiguous");
            %(fail)s
        }
        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0];
        if (n_streams > n_elements)
          n_streams = n_elements;
        if (n_elements > 0){
            size_t ls = 0, gs = 0;
            int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "GpuKernel_sched: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
            // Make sure we run as many blocks as we need to cover the whole n_streams
            gs = (n_streams + ls - 1)/ls;
            err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_sample)s->ga.offset, %(o_rstate)s->ga.data, %(o_rstate)s->ga.offset, n_elements, n_streams);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
        }
        free(odims);
        """ % dict(
            rstate=inp[0],
            size=inp[1],
            o_rstate=out[0],
            o_sample=out[1],
            kname=self.gpu_kernels(node, nodename)[0].objvar,
            params=sub["params"],
            just_fail=sub["fail"],
            fail="""
        {
            free(odims);
            %(fail)s
        }
        """
            % dict(fail=sub["fail"]),
        )
def c_code_cache_version(self):
return (17,)
@register_opt2([mrg_uniform], "fast_compile")
def local_gpua_mrg_graph(fgraph, op, context_name, inputs, outputs):
    """Rewrite a CPU ``mrg_uniform`` whose state already lives on the GPU
    into its GPU counterpart, transferring the sample back to the host."""
    if not isinstance(op, mrg_uniform):
        return
    state = inputs[0]
    if not isinstance(state.type, GpuArrayType):
        return
    # Skip states that were just transferred from the host; the rewrite
    # only applies to genuinely GPU-resident state.
    if state.owner is not None and isinstance(state.owner.op, GpuFromHost):
        return
    new_outs = GPUA_mrg_uniform.new(
        state, op.output_type.ndim, op.output_type.dtype, inputs[1]
    )
    # Keep the state on the GPU, bring the sample back to the host.
    return [new_outs[0], host_from_gpu(new_outs[1])]
@register_opt("fast_compile")
@local_optimizer([mrg_uniform])
def local_gpua_mrg(fgraph, node):
    """Node-level wrapper: infer the GPU context from the node's inputs and
    delegate to :func:`local_gpua_mrg_graph`."""
    ctx = infer_context_name(*node.inputs)
    return local_gpua_mrg_graph(fgraph, node.op, ctx, node.inputs, node.outputs)
import os
from string import Template
import numpy as np
import aesara
from aesara.graph.basic import Apply
from aesara.tensor import as_tensor_variable
from aesara.tensor.sort import TopKOp
from .basic_ops import (
GpuKernelBase,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType
try:
import pygpu
import pygpu.gpuarray as ga
except ImportError:
# To make sure aesara is importable
pass
# TODO GPU sort / argsort
class GpuTopKOp(GpuKernelBase, TopKOp):
    """Implements TopKOp on gpu
    Currently the output seem sorted, but we do not test it. So as on
    the CPU, we only support sorted=False for now.
    """
    __props__ = TopKOp.__props__
    _f16_ok = True
    def __init__(
        self,
        axis=-1,
        sorted=True,
        idx_dtype="int64",
        return_values=True,
        return_indices=True,
    ):
        # Sorted output is explicitly refused: the kernels may produce
        # sorted-looking output but this is not guaranteed or tested.
        if sorted:
            raise NotImplementedError(
                "GpuTopK currently is not sure to give sorted output even if they look sorted.."
            )
        GpuKernelBase.__init__(self)
        TopKOp.__init__(
            self,
            axis=axis,
            sorted=sorted,
            idx_dtype=idx_dtype,
            return_values=return_values,
            return_indices=return_indices,
        )
    def perform(self, node, inputs, output_storage, params):
        # No Python fallback; only the C/CUDA path is implemented.
        raise NotImplementedError()
    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]
    def c_header_dirs(self, **kwargs):
        return [
            os.path.dirname(__file__),
            gpuarray_helper_inc_dir(),
            pygpu.get_include(),
        ]
    def c_code_cache_version(self):
        return (4,)
    def gpu_kernels(self, node, nodename):
        """Compile the three top-k kernels (dense, large, xlarge) from the
        template sources in ``c_code/``, specialized for this node's
        dtypes, rank and return flags."""
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b"cuda": ".cu", b"opencl": ".cl"}[device_type]
        common_ext = {b"cuda": ".cuh", b"opencl": ".h"}[device_type]
        # prepare "$" macros
        if device_type == b"cuda":
            ndim = node.inputs[0].ndim
            dstv_strides_code = "".join(
                f"ssize_t dstv_strides_{i}, " for i in range(ndim)
            )
            dsti_strides_code = "".join(
                f"ssize_t dsti_strides_{i}, " for i in range(ndim)
            )
            src_strides_code = "".join(
                f"ssize_t src_strides_{i}, " for i in range(ndim)
            )
            # Per-dimension pointer-adjustment snippet; dstv/dsti parts are
            # dropped when the corresponding output is not requested.
            set_slice_code = """
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n""".format(
                dstv="dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)"
                if self.return_values
                else "",
                dsti="dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)"
                if self.return_indices
                else "",
            )
            # Dimension 0 is the top-k axis (axes are reordered in c_code),
            # so only dims 1..ndim-1 get the slice-selection code.
            set_slice_code = "".join(set_slice_code % dict(i=j) for j in range(1, ndim))
            if self.return_values:
                set_slice_code += """
        dstv = ptr_add(dstv, dstv_offset);
        """
            if self.return_indices:
                set_slice_code += """
        dsti = ptr_add(dsti, dsti_offset);
        """
            set_slice_code += """
        src = ptr_add(src, src_offset);
        """
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims="".join(f"size_t dims_{i}, " for i in range(1, ndim)),
                dstv="INPUT_TYPE *dstv," if self.return_values else "",
                dstv_offset="size_t dstv_offset," if self.return_values else "",
                dsti="INDEX_TYPE *dsti," if self.return_indices else "",
                dsti_offset="size_t dsti_offset," if self.return_indices else "",
                dstv_strides=dstv_strides_code if self.return_values else "",
                dsti_strides=dsti_strides_code if self.return_indices else "",
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim),
            )
        elif device_type == b"opencl":
            raise NotImplementedError()
        # setup parameters
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.append(ga.SIZE)  # offset
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        param_types.append(ga.SIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.append(ga.SIZE)  # offset
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size
        # load and compile kernels
        with open(
            os.path.join(
                os.path.dirname(__file__), "c_code", "topk_common" + common_ext
            )
        ) as f:
            common_src = f.read()
        kernels = []
        def build_kernel(fname, kname, subs):
            # Substitute the "$" template variables into common + kernel
            # source and wrap the result in a Kernel object.
            with open(os.path.join(os.path.dirname(__file__), "c_code", fname)) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=(
                    "#include <cluda.h>\n"
                    + Template(common_src + kernel_src).substitute(**subs)
                ),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename,
            )
            return ker
        subs["count_t"] = "int"
        kernels.append(build_kernel("topk_dense" + kernel_ext, "k_topk_dense", subs))
        subs["kname"] = "k_topk_dense_large"
        kernels.append(
            build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_large", subs)
        )
        # The xlarge variant uses a 64-bit counter for very long axes.
        subs["count_t"] = "long long"
        subs["kname"] = "k_topk_dense_xlarge"
        kernels.append(
            build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_xlarge", subs)
        )
        return kernels
    def c_code(self, node, nodename, inps, outs, sub):
        """Emit C code that validates k, allocates the requested outputs and
        dispatches to one of the three kernels depending on axis length."""
        context = node.inputs[0].type.context
        if context.kind != b"cuda":
            raise NotImplementedError(
                f"{self.__class__.__name__}: We only have CUDA implementation so far."
            )
        x, k = inps
        inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
        # Unpack outputs according to the return flags set on the op.
        if not self.return_indices:
            (yv,) = outs
        elif self.return_values:
            yv, yi = outs
        else:
            (yi,) = outs
        out_dtype_s = self.idx_dtype
        out_dtc = ga.dtype_to_typecode(out_dtype_s)
        fail = sub["fail"]
        ctx = sub["params"]
        k_dtype = node.inputs[1].type.dtype_specs()[1]
        # max threads per block
        MAX_TPB = context.maxlsize0
        # max blocks per grid
        MAX_BPG = context.maxgsize0
        WARP_SIZE = 32
        ndim = node.inputs[0].ndim
        # Move the top-k axis to the front; the kernels assume dimension 0
        # is the one being reduced.
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del reordered_axes[axis]
        reordered_axes = [axis] + reordered_axes
        dims = "".join(f"dims[{i}], " for i in reordered_axes[1:])
        prep_output = ""
        if self.return_values:
            def_dvstrides = f"const ssize_t *dvstrides = PyGpuArray_STRIDES({yv})"
            params_dv = f"{yv}->ga.data, {yv}->ga.offset,\n"
            params_dv += "".join(f"dvstrides[{i}], " for i in reordered_axes)
            prep_output += (
                """
    if (0 != aesara_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n"""
                % locals()
            )
        else:
            def_dvstrides = params_dv = ""
        if self.return_indices:
            def_distrides = f"const ssize_t *distrides = PyGpuArray_STRIDES({yi})"
            params_di = f"{yi}->ga.data, {yi}->ga.offset,\n"
            params_di += "".join(f"distrides[{i}], " for i in reordered_axes)
            prep_output += (
                """
    if (0 != aesara_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n"""
                % locals()
            )
        else:
            def_distrides = params_di = ""
        sstrides = ", ".join(f"sstrides[{i}]" for i in reordered_axes)
        code = """
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];
    odims[%(axis)d] = k_>=0 ? k_ : -k_;
    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s
    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;
    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }
    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);
    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_xlarge_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_large_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    } else {
        err = k_topk_dense_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
        """
        return code % locals()
    def make_node(self, inp, kth):
        """Build the apply node; outputs depend on the op's return flags."""
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        kth = as_tensor_variable(kth)
        bcast = inp.type.broadcastable
        outs = []
        if self.return_values:
            outs.append(inp.type())
        if self.return_indices:
            outs.append(
                GpuArrayType(
                    dtype=self.idx_dtype, broadcastable=bcast, context_name=ctx_name
                )()
            )
        return Apply(self, [inp, kth], outs)
    def get_params(self, node):
        # The GPU context is the only runtime parameter the C code needs.
        return node.inputs[0].type.context
class ValuesEqApproxNoOrder:
    """
    Comparator that ignores the ordering of elements along one axis.
    Both operands are sorted along ``axis`` before the approximate
    equality check, so two arrays that differ only by a permutation
    along that axis compare equal.
    """
    def __init__(self, axis):
        self.axis = axis
    def __call__(self, val1, val2):
        sorted_a = np.sort(val1, axis=self.axis)
        sorted_b = np.sort(val2, axis=self.axis)
        return aesara.tensor.type.values_eq_approx(sorted_a, sorted_b)
@register_opt("fast_compile")
@op_lifter([TopKOp], cuda_only=True)
@register_opt2([TopKOp], "fast_compile")
def local_gpua_topkop(op, ctx_name, inputs, outputs):
    """Lift a CPU ``TopKOp`` to :class:`GpuTopKOp` (unsorted variant only)."""
    x, k = inputs
    x = as_gpuarray_variable(x, ctx_name)
    # The GPU op refuses sorted output, so leave sorted nodes on the CPU.
    if op.sorted:
        return
    lifted = GpuTopKOp(
        axis=op.axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=op.return_values,
        return_indices=op.return_indices,
    )
    results = lifted(x, k, return_list=True)
    # GPU top-k output order is not guaranteed; compare ignoring order.
    comparator = ValuesEqApproxNoOrder(op.axis)
    for res in results:
        res.tag.values_eq_approx = comparator
    return results
from io import StringIO
import numpy as np
import aesara.tensor as at
from aesara.gradient import grad_not_implemented
from aesara.graph.basic import Apply
from aesara.graph.op import Op
from aesara.link.c.interface import HideC
from aesara.link.c.op import COp
from aesara.link.c.params_type import ParamsType
from aesara.link.c.type import CType
from aesara.scalar import bool as bool_t
from aesara.scalar import int32 as int_t
from aesara.scalar import uint32 as size_t
from aesara.tensor.basic import AllocDiag
from aesara.tensor.math import clip, minimum
from aesara.tensor.subtensor import (
AdvancedIncSubtensor,
AdvancedSubtensor,
AdvancedSubtensor1,
IncSubtensor,
Subtensor,
get_idx_list,
)
from aesara.tensor.type import integer_dtypes
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.basic_ops import (
GpuKernelBase,
Kernel,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
# Cache of compiled in-place-add GpuElemwise objects, keyed by the operand
# dtypes and the GPU context.
iadd_reg = {}
def get_iadd(a, b):
    """Return (building and memoizing on first use) a ``GpuElemwise``
    computing ``a = a + b`` for the dtypes/context of *a* and *b*."""
    key = (a.type.dtype, b.type.dtype, a.type.context)
    cached = iadd_reg.get(key)
    if cached is None:
        elemwise_args = [
            pygpu.elemwise.arg("a", a.type.dtype, read=True, write=True),
            pygpu.elemwise.arg("b", b.type.dtype, read=True),
        ]
        cached = pygpu.elemwise.GpuElemwise(
            a.type.context, "a = a + b", elemwise_args, convert_f16=True
        )
        iadd_reg[key] = cached
    return cached
class GpuSubtensor(HideC, Subtensor):
    """
    Subtensor on the GPU.
    """
    _f16_ok = True
    def make_node(self, x, *inputs):
        # Reuse the CPU op's node construction for index validation, then
        # retype the output as a GpuArray in x's context.
        ctx_name = infer_context_name(x)
        rval = Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(
            dtype=rval.outputs[0].type.dtype,
            broadcastable=rval.outputs[0].type.broadcastable,
            context_name=ctx_name,
        )
        x = as_gpuarray_variable(x, ctx_name)
        return Apply(self, [x] + rval.inputs[1:], [otype()])
    def perform(self, node, inputs, out_):
        # Python path: delegate the indexing to the array's __getitem__
        # (x is expected to be a GPU array here — pygpu does the slicing).
        (out,) = out_
        x = inputs[0]
        cdata = get_idx_list(inputs, self.idx_list)
        if len(cdata) == 1:
            cdata = cdata[0]
        out[0] = x.__getitem__(cdata)
    def c_support_code(self, **kwargs):
        # C helper normalizing one slice the way Python does: apply step
        # defaults, resolve negative indices, and clamp start/stop to the
        # dimension length.  The *_n flags mean "use the default value".
        return """
        static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step,
                               int start_n, int stop_n, int step_n,
                               size_t len) {
            if (step_n) *step = 1;
            if (*step == 0) {
                PyErr_SetString(PyExc_ValueError, "slice step cannot be zero");
                return -1;
            }
            if (start_n) *start = (*step < 0) ? len-1 : 0;
            else {
                if (*start < 0) *start += len;
                if (*start < 0) *start = (*step < 0) ? -1 : 0;
                if (*start > -1 && *start >= len) {
                    *start = (*step < 0) ? len-1 : len;
                }
            }
            if (stop_n) *stop = (*step < 0) ? -1 : len;
            else {
                if (*stop < 0) *stop += len;
                if (*stop < 0) *stop = (*step < 0) ? -1 : 0;
                if (*stop > -1 && *stop >= len) {
                    *stop = (*step < 0) ? len-1 : len;
                }
            }
            if (*stop < *start && *step > 0)
                *stop = *start;
            return 0;
        }
        """
    def c_code(self, node, name, inputs, outputs, sub):
        """Generate C code that fills starts/stops/steps arrays from the
        (static or symbolic) index list and calls ``pygpu_index``."""
        inp_ndim = node.inputs[0].ndim
        inp = inputs[0]
        indices = inputs[1:]
        # pad out the index list to the same dimension as the input
        idx_list = self.idx_list + ((slice(None),) * (inp_ndim - len(self.idx_list)))
        # This case fails when we use pygpu_index(), so here is some
        # special code
        if len(idx_list) == 0:
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) {
            // Exception already set
            %(fail)s
        }
        """ % dict(
                out=outputs[0], inp=inp, fail=sub["fail"]
            )
        sio = StringIO()
        print(
            """
        ssize_t starts[%(sz)s];
        ssize_t stops[%(sz)s];
        ssize_t steps[%(sz)s];
        ssize_t cur;
        int err;
        if (%(inp)s->ga.nd != %(sz)s) {
            PyErr_SetString(PyExc_IndexError, "invalid index");
            %(fail)s
        }
        """
            % dict(sz=len(idx_list), inp=inp, fail=sub["fail"]),
            file=sio,
        )
        def fix_idx(idx):
            # Map an index-list entry to (C expression, is-default flag);
            # CType entries consume the next symbolic input.
            if idx is None:
                return "0", 1
            elif isinstance(idx, (np.integer, int)):
                return str(idx), 0
            elif isinstance(idx, CType):
                return indices.pop(0), 0
            else:
                assert 0, idx
        for i, idx in enumerate(idx_list):
            if isinstance(idx, slice):
                start, start_n = fix_idx(idx.start)
                stop, stop_n = fix_idx(idx.stop)
                step, step_n = fix_idx(idx.step)
                print(
                    """
        starts[%(i)s] = %(start)s;
        stops[%(i)s] = %(stop)s;
        steps[%(i)s] = %(step)s;
        if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s],
                        %(start_n)s, %(stop_n)s, %(step_n)s,
                        %(inp)s->ga.dimensions[%(i)s]) == -1) {
            %(fail)s
        }
        """
                    % dict(
                        i=i,
                        start=start,
                        stop=stop,
                        step=step,
                        start_n=start_n,
                        stop_n=stop_n,
                        step_n=step_n,
                        fail=sub["fail"],
                        inp=inp,
                    ),
                    file=sio,
                )
            else:
                # Scalar index: step == 0 signals pygpu_index to drop
                # the dimension.
                if isinstance(idx, CType):
                    start = indices.pop(0)
                elif isinstance(idx, (np.integer, int)):
                    start = idx
                else:
                    assert 0, idx
                print(
                    """
        cur = %(start)s;
        if (cur < 0)
            cur += %(inp)s->ga.dimensions[%(i)s];
        starts[%(i)s] = cur;
        steps[%(i)s] = 0;
        """
                    % dict(i=i, start=start, fail=sub["fail"], inp=inp),
                    file=sio,
                )
        print(
            """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_index(%(inp)s, starts, stops, steps);
        if (!%(out)s) { %(fail)s }
        """
            % dict(name=name, fail=sub["fail"], inp=inp, out=outputs[0]),
            file=sio,
        )
        return sio.getvalue()
    def c_code_cache_version(self):
        return (8,)
class GpuIncSubtensor(IncSubtensor):
    """
    Implement IncSubtensor on the gpu.
    Notes
    -----
    The optimization to make this inplace is in tensor/opt.
    The same optimization handles IncSubtensor and GpuIncSubtensor.
    This Op has c_code too; it inherits IncSubtensor's c_code.
    The helper methods like :meth:`do_type_checking`,
    :meth:`copy_of_x`, etc. specialize the c_code for this Op.
    """
    _f16_ok = True
    params_type = gpu_context_type
    def make_node(self, x, y, *inputs):
        # Move both operands to the same GPU context, reuse the parent's
        # node construction, and retype the output after x.
        ctx_name = infer_context_name(x, y)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        rval = IncSubtensor.make_node(self, x, y, *inputs)
        ret = Apply(self, [x, y] + rval.inputs[2:], [x.type()])
        return ret
    def get_params(self, node):
        return node.outputs[0].type.context
    def perform(self, node, inputs, out_, ctx):
        """Python implementation: slice x with the (partly symbolic) index
        list, then either set or in-place-add y into the view."""
        (out,) = out_
        x, y = inputs[:2]
        indices = list(reversed(inputs[2:]))
        def convert(entry):
            # Replace CType placeholders in idx_list with the runtime
            # index values, recursing into slices.
            if isinstance(entry, CType):
                rval = indices.pop()
                return rval
            elif isinstance(entry, slice):
                return slice(
                    convert(entry.start), convert(entry.stop), convert(entry.step)
                )
            else:
                return entry
        cdata = tuple(map(convert, self.idx_list))
        if len(cdata) == 1:
            cdata = cdata[0]
        if not self.inplace:
            x = x.copy()
        sub_x = x.__getitem__(cdata)
        if sub_x.shape:
            # we've sliced out an N-D tensor with N > 0
            if not self.set_instead_of_inc:
                # sub_x += y
                iadd = get_iadd(node.inputs[0], node.inputs[1])
                iadd(sub_x, y)
            else:
                # sub_x[...] = y
                x.__setitem__(cdata, y)
        else:
            # scalar case
            if not self.set_instead_of_inc:
                # x.__setitem__(cdata, sub_x + y)
                tmp = pygpu.elemwise.elemwise2(sub_x, "+", y, sub_x, broadcast=False)
                x.__setitem__(cdata, tmp)
            else:
                x.__setitem__(cdata, y)
        out[0] = x
    def do_type_checking(self, node):
        """
        Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, GpuArrayType):
            raise NotImplementedError()
    def copy_of_x(self, x):
        """
        Parameters
        ----------
        x
            A string giving the name of a C variable pointing to an array.
        Returns
        -------
        str
            C code expression to make a copy of x.
        Notes
        -----
        Base class uses `PyArrayObject *`, subclasses may override for
        different types of arrays.
        """
        return f"""pygpu_copy({x}, GA_ANY_ORDER)"""
    def decl_view(self):
        # C declaration for the view variable used by the parent c_code.
        return "PyGpuArrayObject* zview = NULL;"
    def make_view_array(self, x, view_ndim):
        """
        //TODO
        Parameters
        ----------
        x
            A string identifying an array to be viewed.
        view_ndim
            A string specifying the number of dimensions to have in the view.
            This doesn't need to actually set up the view with the
            right indexing; we'll do that manually later.
        """
        # Builds a zero-copy view over x's buffer; xview_dims/xview_offset/
        # xview_strides are provided by the parent class's helper C code.
        ret = f"""
        size_t dims[{view_ndim}];
        for(int i=0; i<{view_ndim}; i++)
            dims[i] = xview_dims[i];
        zview = pygpu_fromgpudata({x}->ga.data,
                                  {x}->ga.offset + xview_offset,
                                  {x}->ga.typecode,
                                  {view_ndim},
                                  dims,
                                  xview_strides,
                                  {x}->context,
                                  1,
                                  (PyObject *){x},
                                  (PyObject *)&PyGpuArrayType);
        """
        return ret
    def get_helper_c_code_args(self):
        """
        Return a dictionary of arguments to use with helper_c_code.
        """
        return {"c_prefix": "PyGpuArray", "strides_mul": 1}
    def copy_into(self, view, source):
        """
        Parameters
        ----------
        view : string
            C code expression for an array.
        source : string
            C code expression for an array.
        Returns
        -------
        str
            C code expression to copy source into view, and 0 on success.
        """
        return f"""sub_setarray(&{view}->ga, &{source}->ga)"""
    def c_headers(self, **kwargs):
        return [
            "<numpy_compat.h>",
            "<gpuarray/error.h>",
            "<gpuarray/array.h>",
            "<gpuarray/elemwise.h>",
        ]
    def c_support_code(self, **kwargs):
        # Thin wrapper around GpuArray_setarray that converts errors into
        # Python exceptions.
        return """
        int sub_setarray(GpuArray *dst, GpuArray *src) {
          int err;
          err = GpuArray_setarray(dst, src);
          if (err != GA_NO_ERROR)
            PyErr_SetString(PyExc_RuntimeError, GpuArray_error(src, err));
          return err;
        }
        """
    def c_support_code_struct(self, node, nodename):
        # Per-node storage for the compiled in-place-add elemwise.
        return "\nGpuElemwise *iadd;\n"
    def c_init_code_struct(self, node, name, sub):
        # Compile the "a += b" elemwise once per node at struct init time.
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(type1)s;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(type2)s;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
                               2, args, %(nd)s, GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not initialize inplace add support");
          %(fail)s
        }
        """ % dict(
            ctx=sub["params"],
            fail=sub["fail"],
            type1=node.inputs[0].type.typecode,
            type2=node.inputs[1].type.typecode,
            nd=node.inputs[1].ndim,
        )
    def add_to_zview(self, nodename, x, fail):
        # C snippet adding x into the zview view via the compiled iadd,
        # with broadcasting and shape padding enabled.
        return (
            """
        {
          void *args[2];
          args[0] = &zview->ga;
          args[1] = &%(x)s->ga;
          if (GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add");
            Py_DECREF(zview);
            %(fail)s
          }
        }
        """
            % locals()
        )
    def c_code_cache_version(self):
        # Extend the parent's cache version; disable caching if the parent
        # does not version its code.
        parent_version = super().c_code_cache_version()
        if not parent_version:
            return
        return parent_version + (10,)
class GpuAdvancedSubtensor1(HideC, AdvancedSubtensor1):
    """
    AdvancedSubrensor1 on the GPU.
    """
    _f16_ok = True
    def make_node(self, x, ilist):
        """Build the apply node; the index vector is cast to a contiguous
        int64 GPU array and the output keeps x's trailing dimensions."""
        ctx_name = infer_context_name(x, ilist)
        x_ = as_gpuarray_variable(x, ctx_name)
        ilist__ = at.as_tensor_variable(ilist)
        if ilist__.type.dtype not in integer_dtypes:
            raise TypeError("index must be integers")
        if ilist__.type.dtype != "int64":
            ilist__ = at.cast(ilist__, "int64")
        ilist_ = gpu_contiguous(as_gpuarray_variable(ilist__, ctx_name))
        if ilist_.type.dtype != "int64":
            raise TypeError("index must be int64")
        if ilist_.type.ndim != 1:
            raise TypeError("index must be a vector")
        if x_.type.ndim == 0:
            raise TypeError("cannot index into a scalar")
        bcast = ilist_.broadcastable + x_.broadcastable[1:]
        return Apply(
            self,
            [x_, ilist_],
            [GpuArrayType(dtype=x.dtype, context_name=ctx_name, broadcastable=bcast)()],
        )
    def perform(self, node, inp, out_):
        # No Python fallback; only the C path is implemented.
        raise NotImplementedError()
    def c_support_code(self, **kwargs):
        # Returns 1 iff a and v have the same rank and the same shape on
        # every dimension except the first (which the take replaces).
        return """
        int take1_match_dims(GpuArray *a, GpuArray *v) {
          if (a->nd != v->nd) return 0;
          for (unsigned int i = 1; i < v->nd; i++) {
            if (a->dimensions[i] != v->dimensions[i]) return 0;
          }
          return 1;
        }
        """
    def c_code(self, node, name, inputs, outputs, sub):
        """Emit C code that (re)allocates the output if needed and calls
        ``GpuArray_take1`` with bounds checking enabled."""
        return """
        int err;
        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
            %(out)s->ga.dimensions[0] != %(idx)s->ga.dimensions[0] ||
            !take1_match_dims(&%(out)s->ga, &%(v)s->ga)) {
          size_t tmp;
          Py_XDECREF(%(out)s);
          /* This is a dirty hack to avoid an extra alloc */
          tmp = %(v)s->ga.dimensions[0];
          %(v)s->ga.dimensions[0] = %(idx)s->ga.dimensions[0];
          %(out)s = pygpu_empty(%(v)s->ga.nd, %(v)s->ga.dimensions, %(v)s->ga.typecode,
                                GA_C_ORDER, %(v)s->context, Py_None);
          if (%(out)s == NULL) {
            %(fail)s;
          }
          %(v)s->ga.dimensions[0] = tmp; // Don't remove this line
        }
        err = GpuArray_take1(&%(out)s->ga, &%(v)s->ga, &%(idx)s->ga, 1);
        if (err != GA_NO_ERROR) {
          if (err == GA_VALUE_ERROR) {
            PyErr_SetString(PyExc_IndexError, "Index out of bounds.");
          } else {
            PyErr_SetString(PyExc_RuntimeError, GpuArray_error(&%(v)s->ga, err));
          }
          %(fail)s
        }
        """ % dict(
            out=outputs[0], v=inputs[0], idx=inputs[1], fail=sub["fail"]
        )
    def c_code_cache_version(self):
        return (1,)
def check_and_convert_boolean_masks(input, idx_list):
    """
    Validate boolean masks in an index list and expand them into integer
    index arrays.
    Every boolean mask must match the shape of ``input`` over the
    dimensions it covers (NumPy 0.13.0+ semantics); a mismatch raises an
    ``IndexError``.  Masks are expanded with ``nonzero()``; every other
    entry is passed through unchanged.
    """
    converted = []
    dim_pos = 0  # input dimension consumed so far
    for entry in idx_list:
        if entry is np.newaxis:
            # np.newaxis does not consume an input dimension.
            converted.append(entry)
        elif isinstance(entry, np.ndarray) and entry.dtype == "bool":
            for axis in range(entry.ndim):
                if entry.shape[axis] != input.shape[dim_pos + axis]:
                    raise IndexError(
                        "boolean index did not match indexed array "
                        f"along dimension {int(dim_pos + axis)}; dimension is {int(input.shape[dim_pos + axis])} but "
                        f"corresponding boolean dimension is {int(entry.shape[axis])}"
                    )
            dim_pos += entry.ndim
            # An N-d mask expands to N integer index arrays.
            converted.extend(entry.nonzero())
        else:
            dim_pos += 1
            converted.append(entry)
    return converted
class BaseGpuAdvancedSubtensor:
    """Shared Python implementation of advanced indexing on GPU arrays.

    The strategy: bring all array-indexed axes to the front, flatten them,
    translate the index arrays into a single flat ``take1`` lookup, then
    reshape/transpose the result back into NumPy's output layout.
    """
    def perform(self, node, inputs, out_):
        (out,) = out_
        x = inputs[0]
        idx = inputs[1:]
        # convert boolean masks to index arrays
        idx = check_and_convert_boolean_masks(x, idx)
        # detect and transpose array indices
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                # None (newaxis) becomes a length-1 dimension plus a
                # full slice over it.
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x = x.reshape(nshp)
        transp = list(range(x.ndim))
        # number of array-indexed dimensions
        p = 0
        # ap represents the axis in the resulting array where the
        # dimensions indexed by arrays and ints will be inserted.
        # For instance, if all such dimensions are grouped together,
        # it corresponds to the index of the first such dimension in the
        # initial array. If these dimensions are split (with slices
        # between), then the resulting dimensions will be moved to the
        # beginning, and ap will be 0.
        # If no such dimension has been encountered, ap is None.
        ap = None
        # Indicates whether we have already encountered an index (array
        # or number), and then a slice.
        slice_after_idx = False
        for k, i in enumerate(list(nidx)):
            if isinstance(i, np.ndarray) and i.ndim != 0:
                transp.remove(k)
                transp.insert(p, k)
                i = nidx.pop(k)
                nidx.insert(p, i)
                p += 1
                if ap is None:
                    # first non-slice index
                    ap = k
                elif slice_after_idx:
                    # We already encountered at least an array or int, and then
                    # a slice. Array-indexed axes are not grouped,
                    # moving to the beginning
                    ap = 0
            else:
                try:
                    i.__index__()
                    if ap is None:
                        ap = k
                    # indices do not break the contiguity of
                    # array-indexed axes
                except Exception:
                    # If we already encountered an array/int index, it
                    # means future ones will not be grouped.
                    if ap is not None:
                        slice_after_idx = True
        x = x.transpose(*transp)
        idx_ = [slice(None)] * p + nidx[p:]
        x = x.__getitem__(idx_)
        if p == 0:
            assert ap is None
            # The only indexing was through slices and indices.
            # This can happen with symbolic slices for instance.
            # Since no view_map is set, we need to copy the returned value
            out[0] = x.copy()
            return
        # At this point, we should have encountered at least one array
        assert ap is not None
        # flatten the array-indexed dimensions
        shape = (np.prod(x.shape[0:p]),) + x.shape[p:]
        input_flat = x.reshape(shape)
        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        take_idx = sum((i * s for i, s in zip(nidx, strides)))
        out_flat = input_flat.take1(
            pygpu.asarray(take_idx.flatten(), context=x.context)
        )
        # finish up
        out_flat_shp = take_idx.shape + x.shape[p:]
        o = out_flat.reshape(out_flat_shp)
        if ap != 0:
            # Put the resulting indexing at the place that NumPy
            # decided was the right one.
            ntransp = list(range(take_idx.ndim, o.ndim))
            ntransp[ap:ap] = list(range(take_idx.ndim))
            o = o.transpose(*ntransp)
        out[0] = o
class GpuAdvancedSubtensor(HideC, BaseGpuAdvancedSubtensor, AdvancedSubtensor):
    """
    AdvancedSubtensor on the GPU.
    """
    def make_node(self, x, *inputs):
        # Let the CPU op validate and type the node, then retype the
        # output as a GpuArray in x's context.
        context_name = infer_context_name(x)
        base_node = AdvancedSubtensor.make_node(self, x, *inputs)
        out_type = GpuArrayType(
            dtype=base_node.outputs[0].type.dtype,
            broadcastable=base_node.outputs[0].type.broadcastable,
            context_name=context_name,
        )
        x = as_gpuarray_variable(x, context_name)
        return Apply(self, [x] + base_node.inputs[1:], [out_type()])
class BaseGpuAdvancedIncSubtensor:
    # Python fallback implementing advanced set/inc-subtensor on the GPU:
    # array-indexed axes are moved to the front and flattened, then `y` is
    # added (via a GpuElemwise in-place add) or assigned row by row.
    def perform(self, node, inp, out_):
        (out,) = out_
        x = inp[0]
        y = inp[1]
        idx = inp[2:]
        # Never mutate the input buffer; callers may still reference it.
        x = x.copy()
        # Get a handle to the GpuElemwise object that will be called.
        # It is not necessary to have the right number of dimensions,
        # so we just pass symbolic x and y.
        iadd = get_iadd(node.inputs[0], node.inputs[1])
        # convert all indices to np.array
        for i in range(len(idx)):
            if isinstance(idx[i], gpuarray.GpuArray):
                idx[i] = np.asarray(idx[i])
        # convert boolean masks to index arrays
        idx = check_and_convert_boolean_masks(x, idx)
        # Insert axes for None indexing
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x_ = x.reshape(nshp)
        # Bring array indices to front
        transp = []
        nidx_ = []
        p = 0
        for k, i in enumerate(list(nidx)):
            if isinstance(i, np.ndarray) and i.ndim != 0:
                transp.append(k)
                nidx_.append(i)
                p += 1
        for k, i in enumerate(list(nidx)):
            if not (isinstance(i, np.ndarray) and i.ndim != 0):
                transp.append(k)
                nidx_.append(i)
        transp = transp + list(range(len(transp), x_.ndim))
        # rtransp is the inverse permutation of transp; applied at the end
        # so the result has x's original axis order.
        rtransp = [i for i, _ in sorted(enumerate(transp), key=lambda x: x[1])]
        nidx = nidx_
        # transp: order to shuffle axes of x so that single dimension
        # subarrays are extracted first
        # p: number of axes with array indexing
        x_ = x_.transpose(*transp)
        idx_ = [slice(None)] * p + nidx[p:]
        # flatten the array-indexed dimensions
        x_flat = x_.reshape((np.prod(x_.shape[0:p]),) + x_.shape[p:])
        # process y so that last axes are the same
        if y.shape != (1,):
            # Find the longest suffix of y's shape that broadcasts against
            # x_flat's trailing axes; collapse the remaining leading axes.
            y_shape_reverse = []
            for x_s, y_s in zip(x_flat.shape[::-1], y.shape[::-1]):
                if x_s == y_s or y_s == 1:
                    y_shape_reverse.append(y_s)
                else:
                    break
            if np.prod(y_shape_reverse) < np.prod(y.shape):
                if len(y_shape_reverse) > 0:
                    y_shape_reverse.append(
                        int(np.prod(y.shape[0 : -len(y_shape_reverse)]))
                    )
                else:
                    y_shape_reverse.append(int(np.prod(y.shape)))
            y_shape = y_shape_reverse[::-1]
            y_flat = y.reshape(y_shape)
        else:
            y_flat = y[0]
        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x_.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        index = idx_[p:] + [slice(None)] * (len(x_flat.shape) - len(idx_[p:]) - 1)
        take_idx = sum(i * s for i, s in zip(nidx, strides))
        if index == []:
            for j, i in enumerate(take_idx.flatten()):
                if y_flat.shape == ():
                    val = y_flat
                else:
                    val = y_flat[j]
                iadd(x_flat[i], val, broadcast=True)
        else:
            if x_flat.shape[-len(y_flat.shape) :] == y_flat.shape or y_flat.shape == ():
                # y_flat has to be broadcast over axes of x_flat[i]
                for i in take_idx.flatten():
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    iadd(x_flat_sub, y_flat, broadcast=True)
            else:
                # y_flat's first axis corresponds to the first axis of x_flat
                for j, i in enumerate(take_idx.flatten()):
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    iadd(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
        # Undo the flattening and the axis shuffle before returning.
        x_ = x_flat.reshape(x_.shape).transpose(*rtransp)
        out[0] = x_
class GpuAdvancedIncSubtensor(HideC, BaseGpuAdvancedIncSubtensor, AdvancedIncSubtensor):
    """
    `AdvancedIncSubtensor` lifted to the GPU.

    Node construction is delegated to the CPU Op; the output type is then
    rewrapped as a `GpuArrayType` on the context inferred from `x` and `y`.
    """

    def make_node(self, x, y, *inputs):
        context_name = infer_context_name(x, y)
        cpu_node = AdvancedIncSubtensor.make_node(self, x, y, *inputs)
        cpu_out_type = cpu_node.outputs[0].type
        gpu_out_type = GpuArrayType(
            dtype=cpu_out_type.dtype,
            broadcastable=cpu_out_type.broadcastable,
            context_name=context_name,
        )
        gpu_x = as_gpuarray_variable(x, context_name)
        gpu_y = as_gpuarray_variable(y, context_name)
        return Apply(self, [gpu_x, gpu_y] + cpu_node.inputs[2:], [gpu_out_type()])
class GpuAdvancedIncSubtensor1(COp):
    """
    Implement AdvancedIncSubtensor1 on the gpu.

    Performs ``x[ilist] += y`` (or ``x[ilist] = y`` when
    ``set_instead_of_inc`` is True) where ``ilist`` indexes the first
    dimension of ``x``.
    """

    _f16_ok = True
    __props__ = ("inplace", "set_instead_of_inc")
    params_type = ParamsType(
        inplace=bool_t,
        set_instead_of_inc=bool_t,
        context=gpu_context_type,
        # following params are used into c_init_code_struct(),
        # as inputs are not available in that function.
        ndim_input_0=size_t,
        ndim_input_1=size_t,
        typecode_input_0=int_t,
        typecode_input_1=int_t,
    )

    def __init__(self, inplace=False, set_instead_of_inc=False):
        # inplace: destroy the first input instead of copying it.
        # set_instead_of_inc: assign rather than accumulate.
        self.inplace = inplace
        self.set_instead_of_inc = set_instead_of_inc
        if inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        # Same Op but operating destructively on its first input.
        return self.__class__(inplace=True, set_instead_of_inc=self.set_instead_of_inc)

    def make_node(self, x, y, ilist):
        # x and y are moved to the GPU; ilist stays a host tensor
        # (a 1-d vector of integer indices into x's first dimension).
        ctx_name = infer_context_name(x, y)
        x_ = as_gpuarray_variable(x, ctx_name)
        y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = at.as_tensor_variable(ilist)
        assert x_.type.ndim >= y_.type.ndim
        if ilist_.type.dtype not in integer_dtypes:
            raise TypeError("index must be integers")
        if ilist_.type.ndim != 1:
            raise TypeError("index must be vector")
        if x_.type.ndim == 0:
            raise TypeError("cannot index into a scalar")
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = "set"
            else:
                opname = "increment"
            raise TypeError(
                "cannot %s x subtensor with ndim=%s by y with ndim=%s "
                % (opname, x_.type.ndim, y_.type.ndim)
            )
        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def get_params(self, node):
        return self.params_type.get_params(
            self,
            context=node.outputs[0].type.context,
            # following params are used into c_init_code_struct().
            ndim_input_0=node.inputs[0].ndim,
            ndim_input_1=node.inputs[1].ndim,
            typecode_input_0=node.inputs[0].type.typecode,
            typecode_input_1=node.inputs[1].type.typecode,
        )

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_, params=None):
        # TODO opt to make this inplace
        x, y, idx = inp
        (out,) = out_
        if not self.inplace:
            x = x.copy()
        out[0] = x
        if len(idx) == 0:
            return
        # Make sure idx is not a GpuArray otherwise we cannot use its
        # content to index x and y (This is because we serve as
        # fallback for _dev20).
        if isinstance(idx, gpuarray.GpuArray):
            idx = np.asarray(idx)
        # If `y` has as many dimensions as `x`, then we want to iterate
        # jointly on `x` and `y`. Otherwise, it means `y` should be
        # broadcasted to fill all relevant rows of `x`.
        if y.ndim == x.ndim and y.shape[0] != 1:
            assert len(y) == len(idx)
            if self.set_instead_of_inc:
                for (j, i) in enumerate(idx):
                    x[i] = y[j]
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for (j, i) in enumerate(idx):
                    k(x[i], y[j], broadcast=True)
        else:
            if y.ndim == x.ndim:
                # First dim is always 1 in this case.
                reshaped_y = y.reshape(y.shape[1:])
            else:
                nb_dims_to_add = (x.ndim - 1) - y.ndim
                reshaped_y = y.reshape((1,) * nb_dims_to_add + y.shape)
            if self.set_instead_of_inc:
                for i in idx:
                    x[i] = reshaped_y
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for i in idx:
                    k(x[i], reshaped_y, broadcast=True)

    def c_headers(self, **kwargs):
        return [
            "<numpy_compat.h>",
            "<gpuarray/error.h>",
            "<gpuarray/array.h>",
            "<gpuarray/elemwise.h>",
            "gpuarray_helper.h",
        ]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_support_code_struct(self, node, nodename):
        # Per-node handle on the GpuElemwise implementing "a += b".
        return "\nGpuElemwise *iadd;\n"

    def c_init_code_struct(self, node, name, sub):
        # Build the "a += b" GpuElemwise once, at struct initialization.
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(params)s->typecode_input_0;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(params)s->typecode_input_1;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
                               2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not initialize inplace add support");
          %(fail)s
        }
        """ % dict(
            params=sub["params"], fail=sub["fail"]
        )

    def c_code(self, node, name, inputs, outputs, sub):
        if node.inputs[0].ndim != node.inputs[1].ndim:
            raise NotImplementedError("This case does not have C code yet.")

        # Row-by-row loop on the host; each row pair is sliced with
        # pygpu_index and then set (GpuArray_setarray) or accumulated
        # (GpuElemwise_call on iadd).  The custom %(fail)s frees the
        # malloc'd start/step buffers before propagating the error.
        return """
        PyGpuArrayObject *row_x, *row_y;
        size_t nd = %(params)s->ndim_input_0;
        ssize_t *start = NULL, *step = NULL;
        size_t num_indices, j;
        int ret;
        int broadcast_y;

        start = (ssize_t*)malloc(nd * sizeof(ssize_t));
        step = (ssize_t*)malloc(nd * sizeof(ssize_t));
        if (start == NULL || step == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }

        for (j = 0; j < nd; ++j) {
          start[j] = 0;
          step[j] = 1;
        }
        step[0] = 0;
        num_indices = PyArray_SIZE(%(ind)s);
        if (!%(params)s->inplace) {
          %(out)s = aesara_try_copy(%(out)s, %(x)s);
          if (%(out)s == NULL) {
            // Exception already set
            %(fail)s
          }
        } else {
          Py_XDECREF(%(out)s);
          %(out)s = %(x)s;
          Py_INCREF(%(out)s);
        }
        if (num_indices != 0) {
          if ((num_indices - 1) > LONG_MAX) {
            PyErr_Format(PyExc_AssertionError,
                         "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
            %(fail)s
          }
          broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
          for (j = 0; j < num_indices; j++) {
            start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
            if (start[0] < 0)
              start[0] += PyGpuArray_DIM(%(out)s, 0);
            if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) {
              PyErr_SetString(PyExc_IndexError, "index out of bounds");
              %(fail)s;
            }
            row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step);
            if (row_x == NULL)
              %(fail)s;
            if (broadcast_y)
              start[0] = 0;
            else
              start[0] = j;
            row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step);
            if (row_y == NULL) {
              Py_DECREF(row_x);
              %(fail)s;
            }
            if (%(params)s->set_instead_of_inc) {
              ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
            } else {
              void *args[2];
              args[0] = (void *)&row_x->ga;
              args[1] = (void *)&row_y->ga;
              ret = GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE);
            }
            Py_DECREF(row_x);
            Py_DECREF(row_y);
            if (ret != GA_NO_ERROR)
              PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
          }
        }
        free(start);
        free(step);
        """ % dict(
            x=inputs[0],
            y=inputs[1],
            ind=inputs[2],
            out=outputs[0],
            params=sub["params"],
            fail="""
            {
              free(start);
              free(step);
              %(fail)s
            }
            """
            % dict(fail=sub["fail"]),
        )

    def c_code_cache_version(self):
        return (5,)
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, GpuAdvancedIncSubtensor1):
    """
    Implement AdvancedIncSubtensor1 on the gpu with atomics

    Uses a single kernel with atomic add/exchange instead of a host-side
    row loop; the C path only supports the 2-d case (see c_code).
    """

    _f16_ok = True
    params_type = GpuAdvancedIncSubtensor1.params_type
    get_params = GpuAdvancedIncSubtensor1.get_params

    def make_node(self, x, y, ilist):
        """
        It differs from GpuAdvancedIncSubtensor1 in that it makes sure
        the indexes are of type long.
        """
        # Unlike the parent, ilist is also moved to the GPU, and y is
        # cast to x's dtype before the transfer.
        ctx_name = infer_context_name(x, y, ilist)
        x_ = as_gpuarray_variable(x, ctx_name)
        y_ = as_gpuarray_variable(y.astype(x.dtype), ctx_name)
        ilist_ = as_gpuarray_variable(ilist, ctx_name)
        assert x_.type.ndim >= y_.type.ndim
        if ilist_.type.dtype not in integer_dtypes:
            raise TypeError("index must be integers")
        if ilist_.type.ndim != 1:
            raise TypeError("index must be vector")
        if x_.type.ndim == 0:
            raise TypeError("cannot index into a scalar")
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = "set"
            else:
                opname = "increment"
            raise TypeError(
                "cannot %s x subtensor with ndim=%s by y with ndim=%s "
                % (opname, x_.type.ndim, y_.type.ndim)
            )
        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def perform(self, node, inp, out, params):
        # Python fallback: reuse the parent's loop-based implementation.
        return super().perform(node, inp, out)

    def c_code_cache_version(self):
        return (14,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray_helper.h>", "<gpuarray/types.h>"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        # The kernel path only handles matrices (x and y both 2-d).
        if node.inputs[0].ndim != node.inputs[1].ndim or node.inputs[0].ndim != 2:
            raise NotImplementedError("This case does not have C code yet.")

        return """
        int err;
        if (%(params)s->inplace) {
          Py_XDECREF(%(out)s);
          %(out)s = %(x)s;
          Py_INCREF(%(out)s);
        } else {
          %(out)s = aesara_try_copy(%(out)s, %(x)s);
        }
        if (!%(out)s) {
          // Exception already set
          %(fail)s
        }
        if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
          %(fail)s
        }
        """ % dict(
            x=inputs[0],
            y=inputs[1],
            ind=inputs[2],
            out=outputs[0],
            fail=sub["fail"],
            params=sub["params"],
        )

    def gpu_kernels(self, node, nodename):
        # We can't rely on numpy for this, it changes with the OS
        CHARMAP = dict(
            int32="i",
            uint32="I",
            int64="l",
            uint64="L",
            float16="e",
            float32="f",
            float64="d",
        )
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        # One block per index (grid dim 0), one thread per column
        # (local dim 0); out-of-bounds rows raise through *err.
        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const ga_int set_instead_of_inc,
                                      GLOBAL_MEM ga_int *err)
        {
             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);
             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                            Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                           Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(
            type_x=type_x, type_y=type_y, type_ind=type_ind, tc=CHARMAP[dtype_x]
        )
        from pygpu.gpuarray import SIZE, SSIZE

        # Parameter signature, in the same order as the kernel arguments.
        params = [
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            "int32",
            gpuarray.GpuArray,
        ]
        return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]

    def c_support_code_struct(self, node, nodename):
        # Host-side launcher: computes strides in elements, launches the
        # kernel and converts the device-side err flag into IndexError.
        return (
            super().c_support_code_struct(node, nodename)
            + """
        int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                     PyGpuArrayObject* py_other,
                                     PyGpuArrayObject* indices_arr,
                                     const int set_instead_of_inc)
        {
          size_t threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256);
          size_t n_blocks = std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096);
          gpudata *errbuf;
          int err, kerr = 0;
          size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
          size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
          size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
          if (threads_per_block > 0 && n_blocks > 0) {
            err = gpudata_property(py_self->ga.data,
                                   GA_CTX_PROP_ERRBUF, &errbuf);
            if (err != GA_NO_ERROR) {
              PyErr_SetString(PyExc_RuntimeError, "Can't fetch error buffer");
              return 1;
            }
            err = k_vector_add_fast_call(
              1, &n_blocks, &threads_per_block, 0,
              PyGpuArray_DIMS(py_self)[0],
              PyGpuArray_DIMS(py_self)[1],
              PyGpuArray_STRIDES(py_self)[0] / itemsize_x,
              PyGpuArray_STRIDES(py_self)[1] / itemsize_x,
              py_self->ga.data,
              py_self->ga.offset,
              PyGpuArray_DIMS(py_other)[0],
              PyGpuArray_DIMS(py_other)[1],
              PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
              PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
              py_other->ga.data,
              py_other->ga.offset,
              PyGpuArray_DIMS(indices_arr)[0],
              PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind,
              indices_arr->ga.data,
              indices_arr->ga.offset,
              set_instead_of_inc,
              errbuf);
            if (err != GA_NO_ERROR) {
              PyErr_Format(PyExc_RuntimeError,
                           "gpuarray error: %(k_var)s: %%s.",
                           GpuKernel_error(&%(k_var)s, err));
              return 1;
            }
            err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
            if (err != GA_NO_ERROR) {
              PyErr_SetString(PyExc_RuntimeError, "Can't read error buffer");
              return 1;
            }
            if (kerr != 0) {
              PyErr_SetString(PyExc_IndexError, "Index out of bounds");
              kerr = 0;
              gpudata_write(errbuf, 0, &kerr, sizeof(int));
              return 1;
            }
          }
          return 0;
        }
        """
            % dict(k_var="k_vector_add_fast_" + nodename)
        )
class GpuExtractDiag(Op):
    # Extract diagonals of a GPU array along (axis1, axis2); the
    # diagonal becomes the last axis of the output, matching the
    # interface of numpy.diagonal.  With view=True, returns a strided
    # view on the input instead of a copy.
    __props__ = ("offset", "axis1", "axis2", "view")
    _f16_ok = True

    def __init__(self, offset=0, axis1=0, axis2=1, view=False):
        self.view = view
        if self.view:
            self.view_map = {0: [0]}
        self.offset = offset
        self.axis1 = axis1
        self.axis2 = axis2

    def make_node(self, _x):
        ctx_name = infer_context_name(_x)
        x = as_gpuarray_variable(_x, ctx_name)
        if x.ndim < 2:
            raise ValueError("Diagonal needs an input with 2 or more " "dimensions", x)
        # The two diagonal axes are removed and a new (non-broadcastable)
        # diagonal axis is appended at the end.
        axis_small, axis_large = sorted((self.axis1, self.axis2))
        broadcastable = (
            x.broadcastable[:axis_small]
            + x.broadcastable[axis_small + 1 : axis_large]
            + x.broadcastable[axis_large + 1 :]
            + (False,)
        )
        return Apply(self, [x], [x.type.clone(broadcastable=broadcastable)()])

    def perform(self, node, inputs, outputs):
        (x,) = inputs
        (z,) = outputs
        # zero-dimensional matrices ...
        if x.size == 0:
            out_shape = [
                d for i, d in enumerate(x.shape) if i not in (self.axis1, self.axis2)
            ]
            diag_size = np.min((x.shape[self.axis1], x.shape[self.axis2]))
            out_shape.append(diag_size)
            z[0] = node.outputs[0].type.value_zeros(tuple(out_shape))
            return
        # step 1) slicing on axis1 and axis2.
        # For a non-negative offset the diagonal starts at column
        # `offset`, so axis2 is the sliced axis; reversed otherwise.
        if self.offset >= 0:
            stride_axis, slice_axis = self.axis1, self.axis2
        else:
            slice_axis, stride_axis = self.axis1, self.axis2
        small_axis, large_axis = sorted((x.shape[self.axis1], x.shape[self.axis2]))
        if x.shape[stride_axis] < x.shape[slice_axis]:
            # in the bigger triangle
            numstride = small_axis - np.max(
                (0, small_axis + np.abs(self.offset) - large_axis)
            )
        else:
            # in the smaller triangle
            numstride = small_axis - np.abs(self.offset)
        slicer = [
            np.s_[:],
        ] * x.ndim
        slicer[stride_axis] = np.s_[:numstride]
        slicer[slice_axis] = np.abs(self.offset)
        slicer = tuple(slicer)
        # step 2) Swap stride_axis to the last dim because we want the dim on
        # which the diags extracted be listed as the last dim of the tensor.
        # This is also in consistence with the interface of numpy.diagonal.
        if slice_axis < stride_axis:
            stride_axis -= 1
        new_dim_order = list(range(x[slicer].ndim))
        new_dim_order = tuple(
            new_dim_order[:stride_axis]
            + new_dim_order[stride_axis + 1 :]
            + [
                stride_axis,
            ]
        )
        rval = x[slicer].transpose(new_dim_order)
        # step 3) modify the strides in the last axis, such that rval becomes
        # a view on the diagonal.
        other_strides = tuple(
            [d for i, d in enumerate(x.strides) if i not in (self.axis1, self.axis2)]
        )
        # Stepping by (stride1 + stride2) walks down the diagonal.
        rval.strides = other_strides + (x.strides[self.axis1] + x.strides[self.axis2],)
        if self.view:
            z[0] = rval
        else:
            z[0] = rval.copy()

    def grad(self, inputs, gout):
        # Gradient is not implemented for this GPU Op.
        (input_x,) = inputs
        return [grad_not_implemented(self, 0, input_x)]

    def infer_shape(self, fgraph, node, shapes):
        (in_shape,) = shapes
        dim1 = in_shape[self.axis1]
        dim2 = in_shape[self.axis2]
        out_shape = [
            d for i, d in enumerate(in_shape) if i not in (self.axis1, self.axis2)
        ]
        # The following logic is inspired by C code of PyArray_Diagonal().
        offset = self.offset
        if offset > 0:
            diag_size = clip(dim2 - offset, 0, dim1)
        elif offset < 0:
            diag_size = clip(dim1 + offset, 0, dim2)
        else:
            diag_size = minimum(dim1, dim2)
        out_shape.append(diag_size)
        return [tuple(out_shape)]
class GpuAllocDiag(AllocDiag):
    # GPU counterpart of AllocDiag: build a (larger) zero array with the
    # input written along the (offset) diagonal of axes (axis1, axis2).
    __props__ = ("offset", "axis1", "axis2")

    def make_node(self, diag):
        ctx_name = infer_context_name(diag)
        diag = as_gpuarray_variable(diag, ctx_name)
        if diag.type.ndim < 1:
            raise ValueError(
                "AllocDiag needs an input with 1 or more " "dimensions", diag.type
            )
        # Output gains one dimension; nothing is broadcastable.
        return Apply(
            self,
            [diag],
            [
                diag.type.__class__(
                    dtype=diag.dtype, broadcastable=[False] * (diag.ndim + 1)
                )()
            ],
        )

    def perform(self, node, inputs, outputs):
        (x,) = inputs
        (z,) = outputs
        axis1 = np.minimum(self.axis1, self.axis2)
        axis2 = np.maximum(self.axis1, self.axis2)
        offset = self.offset
        # Initialise a buffer the same size as the output
        result_shape = x.shape[:-1] + (x.shape[-1] + abs(offset),) * 2
        result_buffer_shape = (np.prod(x.shape[:-1]).astype(np.int64),) + (
            x.shape[-1] + abs(offset),
        ) * 2
        result_buffer = gpuarray.zeros(
            result_buffer_shape, dtype=x.dtype, context=x.context
        )
        # Slice out a view of the diagonals
        if offset < 0:  # diag in the lower triangle
            diag_view = result_buffer[:, abs(offset) :, 0]
        else:  # diag in the upper triangle
            diag_view = result_buffer[:, : x.shape[-1], abs(offset)]
        # Adding itemsize to the column stride makes the view walk the
        # diagonal of each matrix in the buffer.
        diag_view.strides = (
            diag_view.strides[0],
            diag_view.strides[1] + x.dtype.itemsize,
        )
        # Fill view with flattened array of diagonals
        diag_view[:] = x.reshape(diag_view.shape)[:]
        # Unflatten buffer into output size
        result = result_buffer.reshape(result_shape)
        if len(x.shape) > 1:
            # Re-order axes so they correspond to diagonals at axis1, axis2
            axes = list(range(len(x.shape[:-1])))
            last_idx = axes[-1]
            axes = axes[:axis1] + [last_idx + 1] + axes[axis1:]
            axes = axes[:axis2] + [last_idx + 2] + axes[axis2:]
            result = result.transpose(axes)
        z[0] = result

    def grad(self, inputs, gout):
        # The gradient of alloc-diag is extracting the same diagonal.
        (gz,) = gout
        return [
            GpuExtractDiag(offset=self.offset, axis1=self.axis1, axis2=self.axis2)(gz)
        ]
import copyreg
import os
import sys
import warnings
import numpy as np
import aesara
import aesara.scalar as aes
import aesara.tensor as at
import aesara.tensor.basic
from aesara.compile import SharedVariable
from aesara.configdefaults import config
from aesara.graph.basic import Constant, Variable
from aesara.link.c.type import CType
from aesara.misc.safe_asarray import _asarray
from aesara.tensor.shape import (
register_shape_c_code,
register_shape_i_c_code,
register_specify_shape_c_code,
)
from aesara.tensor.type import TensorType, complex_dtypes, discrete_dtypes
from aesara.tensor.type import values_eq_approx as tensor_values_eq_approx
from aesara.tensor.type import (
values_eq_approx_remove_inf as tensor_values_eq_approx_remove_inf,
)
from aesara.tensor.type import (
values_eq_approx_remove_inf_nan as tensor_values_eq_approx_remove_inf_nan,
)
from aesara.tensor.type import (
values_eq_approx_remove_nan as tensor_values_eq_approx_remove_nan,
)
from aesara.tensor.var import TensorConstantSignature, _tensor_py_operators
# Make sure this is importable even if pygpu is absent
# (it will not work though)
try:
import pygpu
from pygpu import gpuarray
from pygpu.elemwise import compare, elemwise2
except ImportError:
pygpu = None
# Registry mapping context names to GpuContext objects; populated by
# reg_context() and queried by get_context().
_context_reg = {}
def gpu_supported(data):
    """
    Tell whether ``data`` can be stored on the GPU.

    Complex dtypes are the only unsupported ones at the moment.

    Parameters
    ----------
    data : numpy.ndarray or TensorVariable
        Any object exposing ``dtype`` and ``ndim`` attributes.
    """
    dtype_name = str(data.dtype)
    return dtype_name not in complex_dtypes
def move_to_gpu(data):
    """
    Decide whether this computation should be moved to the GPU.

    Complex data has no GPU support and scalars are not worth the
    transfer overhead, so both stay on the CPU.

    Parameters
    ----------
    data : numpy.ndarray or TensorVariable
        Any object exposing ``dtype`` and ``ndim`` attributes.
    """
    return gpu_supported(data) and data.ndim != 0
class ContextNotDefined(ValueError):
    """Raised when looking up a context name that was never registered."""
def reg_context(name, ctx):
    """
    Register a context by mapping it to a name.

    Only one context may be registered per name; registering the same
    name twice raises ``ValueError``.

    Parameters
    ----------
    name : hashable object
        Name to associate the context with (usually a string).
    ctx : GpuContext
        Context instance to register.
    """
    if name in _context_reg:
        raise ValueError(f"context name {name} is already defined")
    if not isinstance(ctx, gpuarray.GpuContext):
        raise TypeError("context is not GpuContext")
    _context_reg[name] = ctx
def get_context(name):
    """
    Retrieve the context associated with a name.

    Returns the context previously registered under ``name`` through
    :func:`reg_context`; raises :class:`ContextNotDefined` otherwise.

    Parameters
    ----------
    name : hashable object
        Name associated with the context we want (usually a string).
    """
    if name in _context_reg:
        return _context_reg[name]
    raise ContextNotDefined(f"context name {name} not defined")
def list_contexts():
    """Return a view over every context name currently registered."""
    return _context_reg.keys()
# Private method
def _name_for_ctx(ctx):
    """Reverse lookup: return the name under which ``ctx`` was registered."""
    for registered_name, registered_ctx in _context_reg.items():
        if registered_ctx == ctx:
            return registered_name
    raise ContextNotDefined("context is not registered")
# This is a private method for use by the tests only
def _unreg_context(name):
    """Drop ``name`` from the context registry (testing helper)."""
    del _context_reg[name]
class GpuArrayType(CType):
"""
The type that represents an array on a gpu.
The `dtype` indicates what scalar data type the elements of
variables of this type will be.
`broadcastable` indicates whether each dimension is broadcastable
or not (to be broadcastable a dimension must always be of length
1).
The `context_name` is the name of the context on will values of
variables of this type will be stored.
Parameters
----------
dtype : str
The name of a numpy dtype
broadcastable : tuple of bools
A tuple that indicates both the number of dimensions (by its
length) and whether those dimensions are broadcastable or not
(by the boolean values).
context_name : str
The name of the context the that this type is attached to
(default: None, which is the context specified by
config.device).
name : string, optional
A name for the type that will be used in printouts.
Attributes
----------
dtype : str
Data type used for scalar elements of variables.
broadcastable : tuple of bools
Indicates whether the dimensions are broadcastable or not.
ndim : int
The number of dimensions
context_name : str
The name of a gpu context on which variables will have their values.
name : str
A string used to print the type if given.
typecode : int
The gpuarray typecode for `dtype`
See Also
--------
aesara.graph.type.Type
"""
def __init__(self, dtype, broadcastable, context_name=None, name=None):
# In case this was not provided and no global value is available
self.dtype = str(dtype)
self.broadcastable = tuple(bool(b) for b in broadcastable)
self.ndim = len(self.broadcastable)
self.name = name
self.context_name = context_name
# This will check that the passed context name is valid and registered.
get_context(self.context_name)
try:
self.typecode = gpuarray.dtype_to_typecode(self.dtype)
except gpuarray.GpuArrayException:
raise TypeError(
f"Unsupported dtype for {self.__class__.__name__}: {self.dtype}"
)
def clone(self, dtype=None, broadcastable=None):
if dtype is None:
dtype = self.dtype
if broadcastable is None:
broadcastable = self.broadcastable
return self.__class__(
dtype=dtype,
broadcastable=broadcastable,
context_name=self.context_name,
name=self.name,
)
    # This is a property to keep the type pickleable
    @property
    def context(self):
        """
        The context object mapped to the type's :attr:`context_name`.
        This is a property (looked up lazily, so only the name needs to
        be pickled).
        """
        return get_context(self.context_name)
def __repr__(self):
# Inspired from TensorType.
if self.name:
return self.name
else:
b = self.broadcastable
named_broadcastable = {
tuple(): "scalar",
(False,): "vector",
(False, True): "col",
(True, False): "row",
(False, False): "matrix",
}
if b in named_broadcastable:
bcast = named_broadcastable[b]
elif any(b):
bcast = str(b)
else:
bcast = f"{len(b)}D"
return f"GpuArrayType<{self.context_name}>({self.dtype}, {bcast})"
    def filter(self, data, strict=False, allow_downcast=None):
        # Validate/convert `data` to this type; same as filter_inplace
        # but without a previous buffer to reuse.
        return self.filter_inplace(
            data, None, strict=strict, allow_downcast=allow_downcast
        )
    def filter_inplace(self, data, old_data, strict=False, allow_downcast=None):
        """
        Convert ``data`` to a GpuArray of this type, reusing the device
        buffer of ``old_data`` when shape/contiguity allow it.

        Parameters
        ----------
        data
            The value to filter; host data is transferred to the GPU.
        old_data : gpuarray.GpuArray or None
            Previous value whose buffer may be overwritten in place.
        strict : bool
            If True, ``data`` must already be a GpuArray with this
            type's exact typecode and context; no conversion is done.
        allow_downcast : bool or None
            Whether a lossy dtype conversion is permitted.

        Raises
        ------
        TypeError
            On dtype/ndim/broadcast mismatch or precision-losing casts.
        """
        if isinstance(data, gpuarray.GpuArray) and data.typecode == self.typecode:
            # This is just to make this condition not enter the
            # following branches
            pass
        elif strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError(f"{self} expected a GpuArray object.", data, type(data))
            if self.typecode != data.typecode:
                raise TypeError(
                    f"{self} expected typecode {int(self.typecode)} (dtype {self.dtype}), "
                    f"got {int(data.typecode)} (dtype {data.dtype})."
                )
            if self.context != data.context:
                raise TypeError("data context does not match type context")
            # fallthrough to ndim check
        elif allow_downcast or (
            allow_downcast is None
            and isinstance(data, float)
            and self.dtype == config.floatX
        ):
            # Downcast permitted: force-convert to this type's dtype.
            if not isinstance(data, gpuarray.GpuArray):
                data = np.array(
                    data, dtype=self.dtype, copy=False, ndmin=len(self.broadcastable)
                )
            else:
                data = gpuarray.array(
                    data,
                    dtype=self.typecode,
                    copy=False,
                    ndmin=len(self.broadcastable),
                    context=self.context,
                )
        else:
            if not hasattr(data, "dtype"):
                converted_data = _asarray(data, self.dtype)
                # We use the `values_eq` static function from TensorType
                # to handle NaN values.
                if TensorType.values_eq(
                    np.asarray(data), converted_data, force_same_dtype=False
                ):
                    data = converted_data
            # Only accept the conversion when it cannot lose precision.
            up_dtype = aes.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
                if not isinstance(data, gpuarray.GpuArray):
                    data = np.array(data, dtype=self.dtype, copy=False)
                else:
                    data = gpuarray.array(data, dtype=self.dtype, copy=False)
            else:
                raise TypeError(
                    f"{self} cannot store a value of dtype {data.dtype} "
                    "without risking loss of precision."
                )
        if self.ndim != data.ndim:
            raise TypeError(
                f"Wrong number of dimensions: expected {self.ndim}, "
                f"got {data.ndim} with shape {data.shape}.",
                data,
            )
        shp = data.shape
        for i, b in enumerate(self.broadcastable):
            if b and shp[i] != 1:
                raise TypeError(
                    "Non-unit value on shape on a broadcastable" " dimension.",
                    shp,
                    self.broadcastable,
                )
        if not isinstance(data, gpuarray.GpuArray):
            if (
                old_data is not None
                and old_data.shape == data.shape
                and (
                    # write() only work if the destination is contiguous.
                    old_data.flags["C_CONTIGUOUS"]
                    or old_data.flags["F_CONTIGUOUS"]
                )
            ):
                # Reuse the existing device buffer instead of allocating.
                old_data.write(data)
                data = old_data
            else:
                data = pygpu.array(data, context=self.context)
        return data
    def filter_variable(self, other, allow_convert=True):
        """
        Convert the symbolic variable ``other`` into a variable of this
        type (same dtype, ndim, broadcast pattern and context), raising
        ``TypeError`` when no safe conversion exists.
        """
        if hasattr(other, "_as_GpuArrayVariable"):
            other = other._as_GpuArrayVariable(self.context_name)
        if not isinstance(other, Variable):
            # Wrap raw data as a constant of this type.
            other = self.constant_type(type=self, data=other)
        if other.type == self:
            return other
        if not isinstance(other.type, (TensorType, GpuArrayType)):
            raise TypeError("Incompatible type", (self, other.type))
        if other.type.dtype != self.dtype:
            raise TypeError("Incompatible dtype", (self.dtype, other.type.dtype))
        if other.type.ndim != self.ndim:
            raise TypeError(
                "Incompatible number of dimensions."
                f" Expected {int(self.ndim)}, got {int(other.ndim)}."
            )
        if other.type.broadcastable != self.broadcastable:
            if allow_convert:
                # Try rebroadcasting to this type's pattern.
                type2 = other.type.clone(broadcastable=self.broadcastable)
                other2 = type2.convert_variable(other)
            else:
                other2 = None
            if other2 is None:
                raise TypeError(
                    "Incompatible broadcastable dimensions."
                    f" Expected {other.type.broadcastable}, got {self.broadcastable}."
                )
            other = other2
        # Finally move the variable onto this type's context.
        return other.transfer(self.context_name)
    @staticmethod
    def values_eq(a, b, force_same_dtype=True):
        # Element-wise equality of two GpuArrays, treating NaNs in the
        # same positions as equal (NaN != NaN element-wise).
        if a.shape != b.shape:
            return False
        if force_same_dtype and a.typecode != b.typecode:
            return False
        a_eq_b = np.asarray(compare(a, "==", b))
        if a_eq_b.all():
            return True
        # maybe the trouble is that there are NaNs
        a = np.asarray(a)
        b = np.asarray(b)
        a_missing = np.isnan(a)
        if a_missing.any():
            b_missing = np.isnan(b)
            # Positions must be equal OR be NaN in both arrays alike.
            return np.all(a_eq_b + (a_missing == b_missing))
        else:
            return False
    @staticmethod
    def values_eq_approx(
        a, b, allow_remove_inf=False, allow_remove_nan=False, rtol=None, atol=None
    ):
        # Delegate to the module-level helper of the same name.
        return values_eq_approx(a, b, allow_remove_inf, allow_remove_nan, rtol, atol)

    @staticmethod
    def may_share_memory(a, b):
        # Only two GpuArrays can possibly overlap in device memory.
        if not isinstance(a, gpuarray.GpuArray) or not isinstance(b, gpuarray.GpuArray):
            return False
        return pygpu.gpuarray.may_share_memory(a, b)

    def value_zeros(self, shape):
        # Allocate a zero-filled GpuArray of this type's dtype on its context.
        return pygpu.gpuarray.zeros(shape, dtype=self.typecode, context=self.context)
def __eq__(self, other):
return (
type(self) == type(other)
and self.typecode == other.typecode
and self.broadcastable == other.broadcastable
and self.context_name == other.context_name
)
    def convert_variable(self, var):
        # Return `var` rebroadcast to this type's pattern when compatible
        # (same class, typecode, ndim, context and each non-broadcastable
        # dim of self matches var); implicitly returns None otherwise.
        vt = var.type
        if (
            isinstance(vt, type(self))
            and self.typecode == vt.typecode
            and self.ndim == vt.ndim
            and self.context_name == vt.context_name
            and all(
                sb == ob or ob for sb, ob in zip(self.broadcastable, vt.broadcastable)
            )
        ):
            return at.patternbroadcast(var, self.broadcastable)
def __hash__(self):
return hash((type(self), self.typecode, self.broadcastable, self.context_name))
def dtype_specs(self):
    """
    Return a tuple (python type, c type, numpy typenum) that corresponds
    to ``self.dtype``.

    This function is used internally as part of C code generation.

    Raises
    ------
    TypeError
        If ``self.dtype`` has no registered specification (e.g. the
        complex dtypes, which are not supported here).
    """
    specs = {
        "float16": (float, "npy_float16", "NPY_FLOAT16"),
        "float32": (float, "npy_float32", "NPY_FLOAT32"),
        "float64": (float, "npy_float64", "NPY_FLOAT64"),
        "bool": (int, "npy_bool", "NPY_BOOL"),
        "uint8": (int, "npy_uint8", "NPY_UINT8"),
        "int8": (int, "npy_int8", "NPY_INT8"),
        "uint16": (int, "npy_uint16", "NPY_UINT16"),
        "int16": (int, "npy_int16", "NPY_INT16"),
        "uint32": (int, "npy_uint32", "NPY_UINT32"),
        "int32": (int, "npy_int32", "NPY_INT32"),
        "uint64": (int, "npy_uint64", "NPY_UINT64"),
        "int64": (int, "npy_int64", "NPY_INT64"),
        # complex64 / complex128 are intentionally unsupported.
    }
    try:
        return specs[self.dtype]
    except KeyError:
        raise TypeError(
            f"Unsupported dtype for {self.__class__.__name__}: {self.dtype}"
        )
def get_shape_info(self, obj):
    """Return the shape of ``obj``; this is the value later consumed by
    :meth:`get_size`."""
    return obj.shape
def get_size(self, shape_info):
    """Return the size in bytes of an array whose shape is ``shape_info``.

    An empty/falsy shape (a scalar) counts as one element.
    """
    itemsize = np.dtype(self.dtype).itemsize
    if not shape_info:
        return itemsize
    return np.prod(shape_info) * itemsize
def c_element_type(self):
    """Return the C scalar type name for ``self.dtype``, as reported by
    pygpu."""
    return pygpu.gpuarray.dtype_to_ctype(self.dtype)
def c_declare(self, name, sub, check_input=True):
    """Return C code declaring the storage variable for this type."""
    return f"""
    PyGpuArrayObject *{name};
    """
def c_init(self, name, sub):
    """Return C code initializing the storage variable to NULL."""
    return f"{name} = NULL;"
def c_extract(self, name, sub, check_input=True, **kwargs):
    """Return C code extracting ``py_<name>`` into a PyGpuArrayObject*.

    Rejects ``None`` and non-GpuArray objects with a ValueError; on
    success takes a new reference to the object.
    """
    # TODO I don't check broadcast stuff for now.
    return """
    %(name)s = NULL;
    if (py_%(name)s == Py_None) {
        PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
        %(fail)s
    }
    /* First check if we are the base type exactly (the most common case),
       then do the full subclass check if needed. */
    if (py_%(name)s->ob_type != &PyGpuArrayType &&
        !PyObject_TypeCheck(py_%(name)s, &PyGpuArrayType)) {
        PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
        %(fail)s
    }
    %(name)s = (PyGpuArrayObject *)py_%(name)s;
    Py_INCREF(%(name)s);
    """ % {
        "name": name,
        "fail": sub["fail"],
    }
def c_cleanup(self, name, sub):
    """Return C code releasing the storage variable's reference."""
    return f"Py_XDECREF({name}); {name} = NULL;"
def c_sync(self, name, sub):
    """Return C code writing the computed value back to ``py_<name>``.

    A NULL storage pointer is synced as ``Py_None``; otherwise the
    Python-level reference is swapped to the new GpuArray.
    """
    return """
    if (!%(name)s) {
        Py_XDECREF(py_%(name)s);
        Py_INCREF(Py_None);
        py_%(name)s = Py_None;
    } else if ((void *)py_%(name)s != (void *)%(name)s) {
        Py_XDECREF(py_%(name)s);
        py_%(name)s = (PyObject *)%(name)s;
        Py_INCREF(py_%(name)s);
    }
    """ % {
        "name": name
    }
def c_init_code(self, **kwargs):
    """Return C snippets executed once at module initialization."""
    # We don't actually need the numpy API except in
    # HostFromGpu and GpuFromHost; those cases will be covered
    # by the TensorType parameter.
    return ["import_pygpu__gpuarray();"]
def c_headers(self, **kwargs):
    """Return the C headers needed by generated code for this type."""
    # We need arrayobject for the PyArrayDescr struct def
    # (even if we just use a pointer to it in a function def).
    return [
        "<gpuarray/array.h>",
        "<gpuarray/kernel.h>",
        "<gpuarray/error.h>",
        "<gpuarray/buffer.h>",
        "<gpuarray/buffer_blas.h>",
        "<numpy/arrayobject.h>",
        "<gpuarray_api.h>",
    ]
def c_header_dirs(self, **kwargs):
    """Return include directories: pygpu's and numpy's, plus any existing
    ``Library/include``/``include`` directories under ``sys.exec_prefix``."""
    candidates = (
        os.path.abspath(os.path.normpath(sys.exec_prefix + "/" + sub_dir))
        for sub_dir in ("Library/include", "include")
    )
    extra_dirs = [d for d in candidates if os.path.exists(d) and os.path.isdir(d)]
    return [pygpu.get_include(), np.get_include()] + extra_dirs
def c_lib_dirs(self, **kwargs):
    """Return existing library directories under ``sys.exec_prefix``
    (``Library/lib`` and ``lib``), if any."""
    candidates = (
        os.path.abspath(os.path.normpath(sys.exec_prefix + "/" + sub_dir))
        for sub_dir in ("Library/lib", "lib")
    )
    return [d for d in candidates if os.path.exists(d) and os.path.isdir(d)]
def c_libraries(self, **kwargs):
    """Return the libraries to link against (libgpuarray)."""
    return ["gpuarray"]
def c_code_cache_version(self):
    """Return the C code cache key: (2, libgpuarray major ABI version)."""
    ver = pygpu.gpuarray.abi_version()
    # We only use the major version since the minor revisions are compatible.
    return (2, ver[0])
def values_eq_approx(
    a, b, allow_remove_inf=False, allow_remove_nan=False, rtol=None, atol=None
):
    """Approximate equality for GpuArrays.

    Discrete dtypes are compared exactly; float dtypes within
    ``atol``/``rtol`` tolerances.  When neither ``allow_remove_inf`` nor
    ``allow_remove_nan`` is requested, a fast GPU-side elementwise check
    is attempted first; otherwise (or if that check fails) the arrays
    are copied to the host and compared with
    ``TensorType.values_eq_approx``.
    """
    if a.shape != b.shape or a.dtype != b.dtype:
        return False
    if str(a.dtype) in discrete_dtypes:
        # Integer/bool data: exact comparison.
        return GpuArrayType.values_eq(a, b)
    else:
        if not (allow_remove_inf or allow_remove_nan):
            atol_, rtol_ = aesara.tensor.math._get_atol_rtol(a, b)
            if rtol is not None:
                rtol_ = rtol
            if atol is not None:
                atol_ = atol
            # The tolerance values atol_/rtol_ are interpolated into the
            # kernel template below via ``% locals()``.
            res = elemwise2(
                a,
                "",
                b,
                a,
                odtype=np.dtype("bool"),
                op_tmpl="res = (fabs(a - b) <"
                "(%(atol_)s + %(rtol_)s * fabs(b)))" % locals(),
            )
            ret = np.asarray(res).all()
            if ret:
                return True
        # Slow path: compare host-side copies, which also supports
        # allow_remove_inf / allow_remove_nan.
        an = np.asarray(a)
        bn = np.asarray(b)
        return TensorType.values_eq_approx(
            an,
            bn,
            allow_remove_inf=allow_remove_inf,
            allow_remove_nan=allow_remove_nan,
            rtol=rtol,
            atol=atol,
        )
def values_eq_approx_remove_inf(a, b):
    """Approximate equality, ignoring infinities."""
    return values_eq_approx(a, b, allow_remove_inf=True)
def values_eq_approx_remove_nan(a, b):
    """Approximate equality, ignoring NaNs."""
    return values_eq_approx(a, b, allow_remove_inf=False, allow_remove_nan=True)
def values_eq_approx_remove_inf_nan(a, b):
    """Approximate equality, ignoring both infinities and NaNs."""
    return values_eq_approx(a, b, allow_remove_inf=True, allow_remove_nan=True)
# This is to map ndarray-specific versions of these functions to the GPU.
EQ_MAP = {
    tensor_values_eq_approx: values_eq_approx,
    tensor_values_eq_approx_remove_inf: values_eq_approx_remove_inf,
    tensor_values_eq_approx_remove_nan: values_eq_approx_remove_nan,
    tensor_values_eq_approx_remove_inf_nan: values_eq_approx_remove_inf_nan,
}
# Add the reverse direction too.  The inverse mapping is built up-front as
# a dict comprehension (not a throwaway list of tuples) so EQ_MAP is never
# mutated while being iterated.
EQ_MAP.update({v: k for k, v in EQ_MAP.items()})
class _operators(_tensor_py_operators):
    """Mixin giving GPU variables the full tensor operator set."""

    def _as_GpuArrayVariable(self, context_name):
        # Already on the requested context: return self unchanged.
        if self.type.context_name == context_name:
            return self
        from .basic_ops import GpuToGpu

        return GpuToGpu(context_name)(self)
@at._as_tensor_variable.register(_operators)
def _as_tensor_operators(x, **kwargs):
    """Dispatch hook: convert a GPU variable to a host tensor variable."""
    from aesara.gpuarray.basic_ops import host_from_gpu

    return host_from_gpu(x)
class GpuArrayVariable(_operators, Variable):
    """
    A variable representing a computation on a certain GPU.

    This supports all the operations that :class:`TensorType` supports.

    See Also
    --------
    Variable

    """

    def __repr_test_value__(self):
        # Override the default: render the test value as a host ndarray.
        test_value = aesara.graph.op.get_test_value(self)
        return repr(np.array(test_value))
# Make GpuArrayType produce GpuArrayVariable instances.
GpuArrayType.variable_type = GpuArrayVariable
class GpuArraySignature(TensorConstantSignature):
    """Signature used to hash/compare GpuArrayConstant values."""

    # might do something better if we can run the sum on the GPU, but
    # for now this will suffice.
    pass
class GpuArrayConstant(_operators, Constant):
    """
    A constant representing a value on a certain GPU.

    This supports all the operations that :class:`TensorType` supports.

    See Also
    --------
    Constant

    """

    def signature(self):
        # Hash/compare through a host-side copy of the data.
        return GpuArraySignature((self.type, np.asarray(self.data)))

    def __str__(self):
        if self.name is not None:
            return self.name
        try:
            shown = np.asarray(self.data)
        except gpuarray.GpuArrayException:
            # The device data may be unreadable; fall back to repr-ish text.
            try:
                shown = str(self.data)
            except Exception:
                shown = "Unknown"
        return "GpuArrayConstant{%s}" % shown
# Make GpuArrayType produce GpuArrayConstant instances.
GpuArrayType.constant_type = GpuArrayConstant
class GpuArraySharedVariable(_operators, SharedVariable):
    """
    A variable representing a shared value on a certain GPU.

    This supports all the operations that :class:`TensorType` supports.

    See Also
    --------
    SharedVariable

    """

    def get_value(self, borrow=False, return_internal_type=False):
        # Host-side ndarray copy unless the caller asked for the GpuArray.
        if not return_internal_type:
            return np.asarray(self.container.value)
        if borrow:
            return self.container.value
        return self.container.value.copy()

    def set_value(self, value, borrow=False):
        if isinstance(value, pygpu.gpuarray.GpuArray):
            # Re-wrap on this variable's context, copying unless borrowed.
            value = pygpu.gpuarray.array(
                value, copy=(not borrow), context=self.type.context
            )
        self.container.value = value

    def __getitem__(self, *args):
        # Defer indexing to the shared operator mixin.
        return _operators.__getitem__(self, *args)
GpuArrayType.SharedVariable = GpuArraySharedVariable
# Sentinel distinguishing "target not passed" from an explicit None.
notset = object()
def gpuarray_shared_constructor(
    value,
    name=None,
    strict=False,
    allow_downcast=None,
    borrow=False,
    broadcastable=None,
    target=notset,
):
    """
    SharedVariable constructor for GpuArrayType.

    See :func:`aesara.shared`.

    Parameters
    ----------
    target : optional
        The device target.  As ``None`` is a valid value and we need to
        distinguish it from "parameter not passed", the default is the
        module-level ``notset`` sentinel.
    """
    if target == "cpu":
        raise TypeError("not for me")
    if not isinstance(value, (np.ndarray, pygpu.gpuarray.GpuArray)):
        raise TypeError("ndarray or GpuArray required")
    if target is notset:
        target = None
    if not gpu_supported(value):
        raise TypeError("The GPU do not support that value.")
    if not move_to_gpu(value):
        raise TypeError("We do not move that data by default to the GPU")
    try:
        get_context(target)
    except ContextNotDefined:
        # Don't make this a hard error if we attempt to make a shared
        # variable while there is no default context.
        if target is None:
            raise TypeError("No default context and no context specified")
        raise
    if broadcastable is None:
        broadcastable = (False,) * value.ndim
    # Renamed from `type`, which shadowed the builtin.
    gpua_type = GpuArrayType(value.dtype, broadcastable, context_name=target)
    deviceval = pygpu.gpuarray.array(
        value, copy=(not borrow), context=gpua_type.context
    )
    return GpuArraySharedVariable(
        type=gpua_type, value=deviceval, name=name, strict=strict
    )
aesara.compile.register_view_op_c_code(
GpuArrayType,
"""
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_XINCREF(%(oname)s);
""",
version=(0,),
)
# Register GpuArrayType C code for Shape Op.
register_shape_c_code(
GpuArrayType,
"""
npy_intp shape[] = {%(iname)s->ga.nd};
if(%(oname)s == NULL || (PyArray_DIMS(%(oname)s)[0] != shape[0]))
{
Py_XDECREF(%(oname)s);
%(oname)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(%(oname)s, i))[0] = %(iname)s->ga.dimensions[i];
}
""",
version=1,
)
register_shape_i_c_code(
GpuArrayType,
"""
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
%(iname)s->ga.dimensions[%(i)s];
""",
"""
if (%(i)s>=%(iname)s->ga.nd){
PyErr_SetString(PyExc_TypeError,
"Number of dimensions lower than expected");
%(fail)s
}
""",
version=(1,),
)
aesara.compile.register_deep_copy_op_c_code(
GpuArrayType,
"""
Py_XDECREF(%(oname)s);
%(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
if (!%(oname)s) { %(fail)s }
""",
version=(5,),
)
aesara.tensor.basic.register_rebroadcast_c_code(
GpuArrayType,
"""
if(%(iname)s->ga.dimensions[%(axis)s] != 1){
PyErr_Format(PyExc_ValueError,
"Dimension %(axis)s in Rebroadcast's input was"
" supposed to be 1 (got %%d instead)",
%(iname)s->ga.dimensions[%(axis)s]);
%(fail)s
}
""",
version=1,
)
register_specify_shape_c_code(
GpuArrayType,
"""
if (PyGpuArray_NDIM(%(iname)s) != PyArray_DIMS(%(shape)s)[0]) {
PyErr_Format(PyExc_AssertionError,
"SpecifyShape: vector of shape has %%d elements,"
" but the input has %%d dimensions.",
PyGpuArray_NDIM(%(iname)s),
PyArray_DIMS(%(shape)s)[0]);
%(fail)s;
}
for(int i = 0; i < PyGpuArray_NDIM(%(iname)s); i++){
dtype_%(shape)s shp = ((dtype_%(shape)s*)PyArray_GETPTR1(%(shape)s,
i))[0];
if (PyGpuArray_DIMS(%(iname)s)[i] != shp) {
PyErr_Format(PyExc_AssertionError,
"SpecifyShape: dim %%d of input has shape %%d,"
" expected %%d.",
i, PyGpuArray_DIMS(%(iname)s)[i],
shp);
%(fail)s;
}
}
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_XINCREF(%(oname)s);
""",
version=1,
c_support_code_apply="#include <numpy_compat.h>",
)
class GpuContextType(CType):
    """
    Minimal type used for passing contexts to nodes.

    This Type is not a complete type and should never be used for
    regular graph operations.
    """

    def filter(self, data, strict=False, allow_downcast=None):
        """Accept only genuine GpuContext objects; no conversion."""
        if not isinstance(data, gpuarray.GpuContext):
            raise TypeError("context is not a GpuContext")
        return data

    def __eq__(self, other):
        # All instances of this type are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    @staticmethod
    def values_eq(a, b):
        return a == b

    def c_declare(self, name, sub, check_input=True):
        """Return C code declaring the context storage variable."""
        return f"PyGpuContextObject *{name};"

    def c_init(self, name, sub):
        return f"{name} = NULL;"

    def c_extract(self, name, sub, check_input=True, **kwargs):
        """Return C code extracting a context pointer, optionally
        type-checking the Python object first."""
        if check_input:
            res = """
        if (!PyObject_TypeCheck(py_%(name)s, &PyGpuContextType)) {
          PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
          %(fail)s
        }
        """ % dict(
                name=name, fail=sub["fail"]
            )
        else:
            res = ""
        return (
            res
            + """
        %(name)s = (PyGpuContextObject *)py_%(name)s;
        Py_INCREF(%(name)s);
        """
            % dict(name=name)
        )

    def c_cleanup(self, name, sub):
        return f"Py_XDECREF({name}); {name} = NULL;"

    def c_sync(self, name, sub):
        # c_sync is intentionally not provided to prevent normal usage.
        raise NotImplementedError("Variables of this type cannot be graph outputs")

    def c_init_code(self, **kwargs):
        return ["import_pygpu__gpuarray();"]

    def c_headers(self, **kwargs):
        return ["<gpuarray_api.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include()]

    def c_code_cache_version(self):
        # Key on the major API version of libgpuarray.
        ver = pygpu.gpuarray.api_version()
        return (0, ver[0])
# Variable, Constant, ... not declared

#: Instance of :class:`GpuContextType` to use for the ``context_type``
#: declaration of an operation.
# (This note used to be a free-floating string literal placed BEFORE the
# assignment, which is a runtime no-op and is not associated with the
# variable by documentation tools; ``#:`` comments above the assignment
# are the form Sphinx recognizes.)
gpu_context_type: GpuContextType = GpuContextType()
# THIS WORKS But GpuArray instances don't compare equal to one
# another, and what about __hash__ ? So the unpickled version doesn't
# equal the pickled version, and the cmodule cache is not happy with
# the situation. The old back-end have this same comment and use the
# same mechanism.
def GpuArray_unpickler(npa, ctx_name):
    """Rebuild a GpuArray from a host ndarray and a context name.

    When ``config.experimental__unpickle_gpu_on_cpu`` is set, the host
    ndarray is returned unchanged instead of being uploaded.
    """
    if config.experimental__unpickle_gpu_on_cpu:
        # directly return numpy array
        warnings.warn(
            "config.experimental__unpickle_gpu_on_cpu is set to True. "
            "Unpickling GpuArray as numpy.ndarray"
        )
        return npa
    if pygpu is None:
        raise ImportError("pygpu not found. Cannot unpickle GpuArray")
    return pygpu.gpuarray.array(npa, copy=True, context=get_context(ctx_name))
# Register the reconstructor so pickles may reference it.
copyreg.constructor(GpuArray_unpickler)
def GpuArray_pickler(cnda):
    """Reduce a GpuArray to ``(unpickler, (host ndarray, context name))``."""
    host_copy = np.asarray(cnda)
    ctx_name = _name_for_ctx(cnda.context)
    return (GpuArray_unpickler, (host_copy, ctx_name))
# Only register the GpuArray pickler when pygpu was actually imported.
if pygpu is not None:
    copyreg.pickle(pygpu.gpuarray.GpuArray, GpuArray_pickler, GpuArray_unpickler)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论