提交 b3ce3640 authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove tests.gpuarray

上级 c803c67e
#section kernels
#kernel eye : *, size, size, size :
#include <cluda.h>
/* The "eye" name above is used to generate the supporting objects.  The
   only one you probably need to care about is the kernel object, which
   will be named 'k_' + <the name above> (k_eye in this case).  This name
   also has to match the kernel function name below. */
KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size a_off, ga_size n, ga_size m) {
  /* Apply the byte offset of the output buffer before indexing it. */
  a = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)a) + a_off);
  /* An n x m matrix has only min(n, m) diagonal entries. */
  ga_size nb = n < m ? n : m;
  /* Strided loop over LID_0/LDIM_0: each work item writes a subset of
     the diagonal of the row-major n x m output. */
  for (ga_size i = LID_0; i < nb; i += LDIM_0) {
    a[i*m + i] = 1;
  }
}
#section support_code_struct
/* Fill *z with an n x m identity-like matrix computed on the GPU by the
 * "eye" kernel declared in the kernels section.
 *
 * n, m:   0-d host arrays holding the requested row/column counts.
 * z:      output GPU array; any previous value is released and replaced.
 * params: provides the output typecode and the GPU context.
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 */
int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
                              PyGpuArrayObject **z, PARAMS_TYPE* params) {
  size_t dims[2] = {0, 0};
  size_t ls, gs;
  int err;

  dims[0] = ((DTYPE_INPUT_0 *)PyArray_DATA(n))[0];
  dims[1] = ((DTYPE_INPUT_1 *)PyArray_DATA(m))[0];

  /* Start from a zeroed buffer so the kernel only needs to write the
     diagonal. */
  Py_XDECREF(*z);
  *z = pygpu_zeros(2, dims,
                   params->typecode,
                   GA_C_ORDER,
                   params->context, Py_None);
  if (*z == NULL)
    return -1;

  ls = 1;
  gs = 256;
  /* The eye_call name comes from the kernel declaration above. */
  err = eye_call(1, &gs, &ls, 0, (*z)->ga.data, (*z)->ga.offset, dims[0], dims[1]);
  if (err != GA_NO_ERROR) {
    /* Message fix: "n%lu" was missing the '=' used for m. */
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error: kEye: %s. n=%lu, m=%lu.",
                 GpuKernel_error(&k_eye, err),
                 (unsigned long)dims[0], (unsigned long)dims[1]);
    return -1;
  }
  return 0;
}
#!/usr/bin/env python
# Without args, this script executes all its tests like `pytest -vs`
# python check_dnn_conv.py
# If there is only one arg `infos`, this script prints some infos about
# supported algorithms and data type configurations for current GPU and cuDNN version.
# python check_dnn_conv.py infos
# If there is only one arg `list`, this script prints all test cases without running them.
# python check_dnn_conv.py list
# Else, any arg will be directly passed to pytest.
# python check_dnn_conv.py -xvs # verbose mode, capture output, exit at first error.
import math
import sys
from itertools import chain, product

import numpy as np
import pytest

import aesara
import tests.unittest_tools as utt
from aesara.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from aesara.gpuarray import cudnn_defs
from aesara.gpuarray.dnn import GpuDnnConv, GpuDnnConvGradI, GpuDnnConvGradW
from aesara.gpuarray.dnn import _dnn_conv as dnn_conv
from aesara.gpuarray.dnn import _dnn_gradinput as dnn_gradinput
from aesara.gpuarray.dnn import _dnn_gradweight as dnn_gradweight
from aesara.gpuarray.dnn import version
from aesara.tensor.nnet.abstract_conv import assert_conv_shape, get_conv_output_shape
from aesara.tensor.nnet.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights
from aesara.tensor.nnet.corr3d import Corr3dMM, Corr3dMMGradInputs, Corr3dMMGradWeights
from aesara.tensor.type import TensorType
from tests.gpuarray.config import mode_with_gpu, ref_cast
def check_dtype_config_support(dtype, precision):
    """Return whether the current GPU supports this (dtype, precision) pair.

    A tiny 2D forward convolution with algo "small"
    (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) is compiled and
    executed; per the cuDNN documentation that algorithm should support all
    configurations on v5.1, v6 and v7, so only a hardware mismatch
    (CUDNN_STATUS_ARCH_MISMATCH) is expected to make it fail.
    """
    image = aesara.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    kernel = aesara.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    graph = dnn_conv(image, kernel, precision=precision, algo="small")
    run = aesara.function([], graph, mode=mode_with_gpu)
    try:
        run()
    except RuntimeError as exc:
        assert "CUDNN_STATUS_ARCH_MISMATCH" in str(exc)
        return False
    return True
cudnn = cudnn_defs.get_definitions(version(raises=False))
class ConvCase:
    """
    Helper class to describe a special test case quickly.

    Only 2D and 3D convolution cases are handled.
    """

    # Case kinds: forward pass, gradient wrt inputs, gradient wrt weights.
    FWD, GRADINPUT, GRADWEIGHT = 0, 1, 2

    def __init__(
        self,
        type,
        inputs_shape,
        filters_shape,
        algo=None,
        dtype=None,
        precision=None,
        subsample=None,
        dilation=None,
        border_mode="valid",
        conv_mode="conv",
        alpha=1,
        beta=0,
        should_fail=False,
    ):
        assert type in (ConvCase.FWD, ConvCase.GRADINPUT, ConvCase.GRADWEIGHT)
        assert len(inputs_shape) == len(filters_shape) in (4, 5)
        ndim = len(inputs_shape) - 2
        # Fall back to the configured float type / unit steps when unspecified.
        dtype = aesara.config.floatX if dtype is None else dtype
        precision = aesara.config.floatX if precision is None else precision
        subsample = (1,) * ndim if subsample is None else subsample
        dilation = (1,) * ndim if dilation is None else dilation
        assert dtype in ("float16", "float32", "float64")
        assert precision in ("float16", "float32", "float64")
        assert len(subsample) == len(dilation) == ndim
        assert border_mode in ("valid", "full", "half") or (
            isinstance(border_mode, (list, tuple)) and len(border_mode) == ndim
        )
        assert conv_mode in ("conv", "cross")
        assert alpha != 0
        self.type = type
        self.ndim = ndim
        self.algo = algo
        self.inputs_shape = inputs_shape
        self.filters_shape = filters_shape
        self.dtype = dtype
        self.precision = precision
        self.subsample = subsample
        self.dilation = dilation
        self.border_mode = border_mode
        self.conv_mode = conv_mode
        self.alpha = alpha
        self.beta = beta
        self.should_fail = bool(should_fail)

    def is_fwd(self):
        """True for a forward-convolution case."""
        return self.type == ConvCase.FWD

    def is_bwd_filter(self):
        """True for a gradient-wrt-weights case."""
        return self.type == ConvCase.GRADWEIGHT

    def is_bwd_data(self):
        """True for a gradient-wrt-inputs case."""
        return self.type == ConvCase.GRADINPUT

    def get_case(self):
        """Return ``(algo, dtype, precision, parameters)`` for the test runners."""
        parameters = (
            self.inputs_shape,
            self.filters_shape,
            self.subsample,
            self.dilation,
            self.border_mode,
            self.conv_mode,
            self.alpha,
            self.beta,
        )
        return (self.algo, self.dtype, self.precision, parameters)

    @staticmethod
    def fwd(*args, **kwargs):
        """Build a forward-convolution case."""
        return ConvCase(ConvCase.FWD, *args, **kwargs)

    @staticmethod
    def bwd_filter(*args, **kwargs):
        """Build a gradient-wrt-weights case."""
        return ConvCase(ConvCase.GRADWEIGHT, *args, **kwargs)

    @staticmethod
    def bwd_data(*args, **kwargs):
        """Build a gradient-wrt-inputs case."""
        return ConvCase(ConvCase.GRADINPUT, *args, **kwargs)
class ConvCaseGenerator:
    """
    Main class used to generate test cases.

    This handles only 2D and 3D cases.
    """

    def _as_tuple_of_tuples(self, iterable):
        # Freeze a sequence of sequences into a tuple of tuples.
        return tuple(tuple(sequence) for sequence in iterable)

    def __init__(
        self,
        ndim,
        alpha=2,
        beta=-3,
        batch_size=2,
        input_channels=3,
        inputs_sizes=None,
        output_channels=2,
        filters_sizes=None,
        subsamples=None,
        dilations=None,
        borders=None,
        with_border_valid=True,
        with_border_half=True,
        with_border_full=True,
    ):
        """
        ndim: number of spatial dimensions (2 or 3).
        alpha, beta: scalar blend coefficients for the generated cases.
        inputs_sizes, filters_sizes, subsamples, dilations, borders:
            optional sequences of ndim-length sequences; defaults are
            generated when omitted (dilations default depends on the
            detected cuDNN version).
        with_border_*: whether to include the named border modes.
        """
        self.ndim = int(ndim)
        self.alpha = float(alpha)
        self.beta = float(beta)
        self.batch_size = int(batch_size)
        self.input_channels = int(input_channels)
        self.output_channels = int(output_channels)
        assert self.ndim in (2, 3)
        assert self.alpha != 0
        assert self.batch_size > 0
        assert self.input_channels > 0
        assert self.output_channels > 0
        # NB: it is quite arbitrary to choose default values for inputs sizes and filters sizes.
        # Here, we just put some values that may generate errors in some cases, but that should be OK for other cases.
        # For instance, input size 300 is > 256, that is a limit for certain algorithms (cf. documentation).
        # Filter size 40 is > 32 and > 16, that are limits for certain algorithms (cf. documentation).
        # We should either manually specify sizes, or give an appropriate filter to this generator
        # before testing values (see `self.get_cases()`).
        if inputs_sizes is None:
            inputs_sizes = ((5,) * self.ndim, (300, 5) + (2,) * (self.ndim - 2))
        if filters_sizes is None:
            filters_sizes = ((4,) * self.ndim, (40, 4) + (2,) * (self.ndim - 2))
        if borders is None:
            borders = ((1,) * self.ndim, tuple(range(1, self.ndim + 1)))
        if subsamples is None:
            subsamples = ((1,) * self.ndim, tuple(range(1, self.ndim + 1)))
        if dilations is None:
            # Dilated cases are only generated from cuDNN v6 on.
            dilations = ((1,) * self.ndim,)
            if cudnn.version >= 6:
                dilations += (tuple(range(1, self.ndim + 1)),)
        for sequence_list in (
            inputs_sizes,
            filters_sizes,
            borders,
            subsamples,
            dilations,
        ):
            assert isinstance(sequence_list, (tuple, list)) and all(
                isinstance(sequence, (tuple, list)) and len(sequence) == self.ndim
                for sequence in sequence_list
            ), (self.ndim, sequence_list)
        self.auto_borders = tuple()
        if with_border_valid:
            self.auto_borders += ("valid",)
        if with_border_half:
            self.auto_borders += ("half",)
        if with_border_full:
            self.auto_borders += ("full",)
        self.inputs_sizes = self._as_tuple_of_tuples(inputs_sizes)
        self.filters_sizes = self._as_tuple_of_tuples(filters_sizes)
        self.borders = self._as_tuple_of_tuples(borders)
        self.subsamples = self._as_tuple_of_tuples(subsamples)
        self.dilations = self._as_tuple_of_tuples(dilations)

    @staticmethod
    def get_if_valid_conv_output_shape(case_tuple):
        # Filter function to keep only cases that produce valid convolution output shapes.
        out_shp = get_conv_output_shape(
            case_tuple[0],  # input shape
            case_tuple[1],  # filter shape
            case_tuple[4],  # border mode
            case_tuple[2],  # subsample
            case_tuple[3],  # dilation
        )
        try:
            return assert_conv_shape(out_shp)
        except ValueError:
            return False

    def get_cases(self, filter=None):
        # Generate an iterator of tuples with format:
        # (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
        # filter may be a callable that gets one tuple (with format specified above) and returns
        # a boolean, so that tuple is kept only if filter(tuple) is True.
        #
        # BUG FIX: the `filter` parameter shadows the builtin of the same
        # name, so the builtin must be fetched explicitly; the previous
        # `return filter(local_filter, ...)` called the parameter itself
        # (TypeError when None, wrong invocation when callable).
        import builtins

        all_batch_sizes = (self.batch_size,)
        all_input_channels = (self.input_channels,)
        all_input_sizes = self.inputs_sizes
        all_output_channels = (self.output_channels,)
        all_filter_sizes = self.filters_sizes
        all_subsamples = self.subsamples
        all_dilations = self.dilations
        all_border_modes = self.auto_borders + self.borders
        all_conv_modes = ("conv", "cross")
        all_alphas = (self.alpha,)
        all_betas = (0,) if self.beta == 0 else (0, self.beta)
        all_input_shapes = (
            (bs, ic) + ins
            for bs in all_batch_sizes
            for ic in all_input_channels
            for ins in all_input_sizes
        )
        all_filter_shapes = (
            (oc, ic) + fis
            for oc in all_output_channels
            for ic in all_input_channels
            for fis in all_filter_sizes
        )
        if callable(filter):

            def local_filter(case_tuple):
                return ConvCaseGenerator.get_if_valid_conv_output_shape(
                    case_tuple
                ) and filter(case_tuple)

        else:
            local_filter = ConvCaseGenerator.get_if_valid_conv_output_shape
        return builtins.filter(
            local_filter,
            product(
                all_input_shapes,
                all_filter_shapes,
                all_subsamples,
                all_dilations,
                all_border_modes,
                all_conv_modes,
                all_alphas,
                all_betas,
            ),
        )
class ConvCaseGeneratorChain:
    """
    Helper class concatenating the cases of several ConvCaseGenerator objects.
    """

    def __init__(self, *conv_case_generators):
        assert all(isinstance(g, ConvCaseGenerator) for g in conv_case_generators)
        self.generators = conv_case_generators

    def get_cases(self, filter=None):
        """Chain the case iterators of every wrapped generator, in order."""
        case_iterators = [g.get_cases(filter) for g in self.generators]
        return chain(*case_iterators)
class CuDNNV51ConvCaseGenerator:
    """
    Helper class to generate specific test cases for every algorithm supported by cuDNN V5.1.
    Same class exists for cuDNN V6.0 (see below).
    This should help avoid test cases that are intended to fail according to cuDNN documentation.
    """

    NONE = "none"
    FFT = "fft"
    FFT_TILING = "fft_tiling"
    WINOGRAD = "winograd"
    WINOGRAD_NON_FUSED = "winograd_non_fused"

    # Protected interface.

    def _dilations(self, ndim):
        # Only unit dilations are generated at this cuDNN level.
        return [(1,) * ndim]

    def _any_case_generator(self, ndim):
        # Fallback generator: default sizes, undilated cases only.
        return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))

    def _fwd_fft(self, ndim):
        return ConvCaseGenerator(
            ndim=ndim,
            inputs_sizes=[(10,) * ndim, (240, 5) + (2,) * (ndim - 2)],
            filters_sizes=[tuple(range(9, 9 - ndim, -1))],
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _fwd_fft_tiling(self, ndim, dtype, precision):
        if ndim == 2:
            kernel_sizes = [(32, 5)]
        elif ndim == 3:
            kernel_sizes = [(16, 5, 5)]
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=kernel_sizes,
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _fwd_winograd(self, ndim):
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=[(3,) * ndim],
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _fwd_winograd_non_fused(self, ndim, dtype, precision):
        kernel_sizes = [(3,) * ndim]
        if not (dtype == precision == "float16"):
            kernel_sizes.append((5,) * ndim)
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=kernel_sizes,
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _gw_fft(self, ndim):
        return self._fwd_fft(ndim)

    def _gw_winograd_non_fused(self, ndim, dtype, precision):
        return self._fwd_winograd_non_fused(ndim, dtype, precision)

    def _gi_fft(self, ndim):
        return self._fwd_fft(ndim)

    def _gi_fft_tiling(self, ndim, dtype, precision):
        return self._fwd_fft_tiling(ndim, dtype, precision)

    def _gi_winograd(self, ndim):
        return self._fwd_winograd(ndim)

    def _gi_winograd_non_fused(self, ndim, dtype, precision):
        return self._fwd_winograd_non_fused(ndim, dtype, precision)

    def _fwd_runtime(self, ndim, dtype, precision):
        return self._any_case_generator(ndim)

    def _gw_runtime(self, ndim, dtype, precision):
        return self._fwd_runtime(ndim, dtype, precision)

    def _gi_runtime(self, ndim, dtype, precision):
        return self._fwd_runtime(ndim, dtype, precision)

    # Public interface.

    def fwd(self, algo, ndim, dtype, precision):
        """Return the case generator for this forward algorithm."""
        dispatch = {
            self.FFT: lambda: self._fwd_fft(ndim),
            self.FFT_TILING: lambda: self._fwd_fft_tiling(ndim, dtype, precision),
            self.WINOGRAD: lambda: self._fwd_winograd(ndim),
            self.WINOGRAD_NON_FUSED: lambda: self._fwd_winograd_non_fused(
                ndim, dtype, precision
            ),
        }
        if algo in dispatch:
            return dispatch[algo]()
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._fwd_runtime(ndim, dtype, precision)
        return self._any_case_generator(ndim)

    def gw(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-weights algorithm."""
        dispatch = {
            self.FFT: lambda: self._gw_fft(ndim),
            self.WINOGRAD_NON_FUSED: lambda: self._gw_winograd_non_fused(
                ndim, dtype, precision
            ),
        }
        if algo in dispatch:
            return dispatch[algo]()
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._gw_runtime(ndim, dtype, precision)
        return self._any_case_generator(ndim)

    def gi(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-inputs algorithm."""
        dispatch = {
            self.FFT: lambda: self._gi_fft(ndim),
            self.FFT_TILING: lambda: self._gi_fft_tiling(ndim, dtype, precision),
            self.WINOGRAD: lambda: self._gi_winograd(ndim),
            self.WINOGRAD_NON_FUSED: lambda: self._gi_winograd_non_fused(
                ndim, dtype, precision
            ),
        }
        if algo in dispatch:
            return dispatch[algo]()
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._gi_runtime(ndim, dtype, precision)
        return self._any_case_generator(ndim)
class CuDNNV6ConvCaseGenerator(CuDNNV51ConvCaseGenerator):
    """Case generator for cuDNN V6.0; extends the V5.1 one with the "none" algo
    and V6-specific fft_tiling / true-half restrictions."""

    def _fwd_none(self, ndim):
        # All dilations allowed.
        return ConvCaseGenerator(ndim=ndim)

    def _fwd_fft_tiling(self, ndim, dtype, precision):
        if ndim == 2:
            # wDesc's filter height must be greater than convDesc's zero-padding height
            # wDesc's filter width must be greater than convDesc's zero-padding width
            shared = dict(
                ndim=ndim,
                dilations=self._dilations(ndim),
                subsamples=[(1, 1)],
            )
            parts = []
            if (dtype, precision) != ("float64", "float64"):
                # Filter sizes with every dimension != 1 is not supported for DOUBLE_CONFIG.
                parts.append(
                    ConvCaseGenerator(
                        filters_sizes=[(32, 5), (10, 10)],
                        borders=[(1, 1), (6, 4)],
                        **shared,
                    )
                )
            parts.append(
                ConvCaseGenerator(
                    filters_sizes=[(256, 1), (5, 1)],
                    borders=[(1, 0), (2, 0)],
                    **shared,
                )
            )
            return ConvCaseGeneratorChain(*parts)
        if ndim == 3:
            return super()._fwd_fft_tiling(ndim, dtype, precision)

    def _gw_none(self, ndim):
        return self._fwd_none(ndim)

    def _gw_fft_tiling(self, ndim):
        return ConvCaseGenerator(
            ndim=ndim,
            inputs_sizes=[(247, 1), (20, 1)],
            filters_sizes=[(3, 1), (10, 1)],
            subsamples=[(1,) * ndim],
            borders=[(1, 0), (2, 0)],
            dilations=self._dilations(ndim),
        )

    def _gi_none(self, ndim):
        return self._fwd_none(ndim)

    def _true_half_2d(self, ndim, dtype, precision):
        # In the 2D true-half configuration, only undilated cases are
        # generated; returns None otherwise so callers can fall back.
        if ndim == 2 and dtype == precision == "float16":
            return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))
        return None

    def _fwd_runtime(self, ndim, dtype, precision):
        special = self._true_half_2d(ndim, dtype, precision)
        return special if special is not None else super()._fwd_runtime(ndim, dtype, precision)

    def _gw_runtime(self, ndim, dtype, precision):
        special = self._true_half_2d(ndim, dtype, precision)
        return special if special is not None else super()._gw_runtime(ndim, dtype, precision)

    def _gi_runtime(self, ndim, dtype, precision):
        special = self._true_half_2d(ndim, dtype, precision)
        return special if special is not None else super()._gi_runtime(ndim, dtype, precision)

    def fwd(self, algo, ndim, dtype, precision):
        """Return the case generator for this forward algorithm."""
        if algo == self.NONE:
            return self._fwd_none(ndim)
        return super().fwd(algo, ndim, dtype, precision)

    def gw(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-weights algorithm."""
        if algo == self.NONE:
            return self._gw_none(ndim)
        if algo == self.FFT_TILING:
            return self._gw_fft_tiling(ndim)
        return super().gw(algo, ndim, dtype, precision)

    def gi(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-inputs algorithm."""
        if algo == self.NONE:
            return self._gi_none(ndim)
        return super().gi(algo, ndim, dtype, precision)
# Pick the case generator matching the cuDNN version detected above.
cudnn_conv_case_generator = (
    CuDNNV51ConvCaseGenerator() if cudnn.version < 6 else CuDNNV6ConvCaseGenerator()
)
class BaseTestDnnConv:
    """
    Base class for exhaustive cuDNN convolution tests. Use its subclasses
    (which define the abstract attributes below) to run actual tests.
    """

    # Abstract attributes (to be defined in subclasses).
    ndim = 2
    fwd_algorithms = None
    bwd_filter_algorithms = None
    bwd_data_algorithms = None
    cpu_conv_class = None
    cpu_gradinput_class = None
    cpu_gradweight_class = None
    special_cases = []  # List of special ConvCases.
    # Tuples with format: (n_times, (inputs_shape, filters_shape)).
    runtime_shapes = []

    # Utility methods.

    def _next_ten_exponent(self, val):
        # Return exponent for the next ten power that follows val.
        # val should be a positive integer.
        # Examples:
        # for 0 to 9, returns 1 (=> 10**1 == 10)
        # for 10 to 99, returns 2 (=> 10**2 == 100)
        ten_exponent = 1
        while val // 10 > 0:
            ten_exponent += 1
            val //= 10
        return ten_exponent

    def scale_numpy_arrays_inplace(self, A, B, alpha):
        """Divide A and B in place by a common factor derived from alpha and
        from their magnitudes, so comparisons use reasonably small values."""
        scale_factor = 1
        # Scale down simultaneously A and B if alpha is not 1.
        if alpha != 1:
            scale_factor *= alpha
        # Normalize A and B simultaneously so that any values in these tensors are in interval [0, 1)
        max_a = math.floor(abs(A.max()))
        max_b = math.floor(abs(B.max()))
        if max_a or max_b:
            m_a = self._next_ten_exponent(max_a)
            m_b = self._next_ten_exponent(max_b)
            max_m = max(m_a, m_b)
            scale_factor *= 10**max_m
        if scale_factor != 1:
            A /= scale_factor
            B /= scale_factor

    def get_atol_rtol(self, algo, dtype, precision):
        """Return (atol, rtol) overrides for utt.assert_allclose, or (None, None)
        to use the default tolerances."""
        if dtype == "float16":
            # Raise tolerance for float16
            return (5e-2, 5e-2)
        if algo == "winograd_non_fused" and dtype == precision == "float32":
            # Raise tolerance for winograd_non_fused in FLOAT_CONFIG.
            return (1e-4, 1e-4)
        return None, None

    def __init__(self):
        # Keep only the dtype/precision configurations the local GPU accepts.
        self.dtype_configs = cudnn.get_supported_dtype_configs(
            check_dtype_config_support
        )

    def array_like_conv_output(
        self, inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
    ):
        # Return a random array with inferred convolution output shape.
        out_shp = get_conv_output_shape(
            inputs_shape, filters_shape, border_mode, subsample, dilation
        )
        out_shp = assert_conv_shape(out_shp)
        return np.random.random(out_shp).astype(dtype)

    def run_conv_fwd(self, algo, dtype, precision, parameters):
        """Run one forward-convolution case on GPU and compare against the
        CPU reference implementation."""
        (
            inputs_shape,
            filters_shape,
            subsample,
            dilation,
            border_mode,
            conv_mode,
            alpha,
            beta,
        ) = parameters
        inputs_val = np.random.random(inputs_shape).astype(dtype)
        filters_val = np.random.random(filters_shape).astype(dtype)
        # Scale down the input values to prevent very large absolute errors
        # due to float rounding
        inputs_val /= 10
        filters_val /= 10
        inputs = aesara.shared(inputs_val)
        filters = aesara.shared(filters_val)
        if beta == 0:
            out = None
        else:
            out = self.array_like_conv_output(
                inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
            )
            out /= 10
        # Compile an Aesara function for the cuDNN implementation
        conv = dnn_conv(
            img=inputs,
            kerns=filters,
            alpha=alpha,
            beta=beta,
            out=out,
            border_mode=border_mode,
            subsample=subsample,
            dilation=dilation,
            conv_mode=conv_mode,
            algo=algo,
            precision=precision,
        )
        f = aesara.function([], conv, mode=mode_with_gpu)
        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == "conv":
            if inputs.ndim == 5:
                flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
        else:
            flipped_filters = filters
        # Compile an Aesara function for the reference implementation
        conv_ref = self.cpu_conv_class(
            border_mode=border_mode, subsample=subsample, filter_dilation=dilation
        )(ref_cast(inputs), flipped_filters)
        f_ref = aesara.function([], conv_ref, mode="FAST_RUN")
        # Compare the results of the two implementations
        res_ref = f_ref()
        res = np.asarray(f())
        if algo in cudnn.deterministic_fwd_algorithms:
            # Deterministic algorithms must return the same output twice.
            utt.assert_allclose(res, np.asarray(f()))
        atol, rtol = self.get_atol_rtol(algo, dtype, precision)
        if beta == 0:
            cpu_res = alpha * res_ref
        else:
            cpu_res = alpha * res_ref + beta * out
        self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
        utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)

    def run_conv_gradinput(self, algo, dtype, precision, parameters):
        """Run one gradient-wrt-inputs case on GPU and compare against the
        CPU reference implementation."""
        (
            inputs_shape,
            filters_shape,
            subsample,
            dilation,
            border_mode,
            conv_mode,
            alpha,
            beta,
        ) = parameters
        if beta == 0:
            inputs_val = None
        else:
            inputs_val = np.random.random(inputs_shape).astype(dtype)
            inputs_val /= 10
        filters_val = np.random.random(filters_shape).astype(dtype)
        topgrad_val = self.array_like_conv_output(
            inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
        )
        # Scale down the input values to prevent absolute errors in utt.assert_allclose.
        filters_val /= 10
        topgrad_val /= 10
        filters = aesara.shared(filters_val)
        topgrad = aesara.shared(topgrad_val)
        # Compile an Aesara function for the cuDNN implementation
        grad_i = dnn_gradinput(
            filters,
            topgrad,
            inputs_shape,
            alpha=alpha,
            beta=beta,
            out=inputs_val,
            border_mode=border_mode,
            subsample=subsample,
            dilation=dilation,
            conv_mode=conv_mode,
            algo=algo,
            precision=precision,
        )
        f = aesara.function([], grad_i, mode=mode_with_gpu)
        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == "conv":
            if filters.ndim == 5:
                flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
        else:
            flipped_filters = filters
        # Compile an Aesara function for the reference implementation
        grad_i_ref = self.cpu_gradinput_class(
            border_mode=border_mode, subsample=subsample, filter_dilation=dilation
        )(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:])
        f_ref = aesara.function([], grad_i_ref, mode="FAST_RUN")
        # Compare the results of the two implementations
        res_ref = f_ref()
        res = np.asarray(f())
        if algo in cudnn.deterministic_bwd_data_algorithms:
            # Deterministic algorithms must return the same output twice.
            utt.assert_allclose(res, np.asarray(f()))
        atol, rtol = self.get_atol_rtol(algo, dtype, precision)
        if beta == 0:
            cpu_res = alpha * res_ref
        else:
            cpu_res = alpha * res_ref + beta * inputs_val
        self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
        utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)

    def run_conv_gradweight(self, algo, dtype, precision, parameters):
        """Run one gradient-wrt-weights case on GPU and compare against the
        CPU reference implementation."""
        (
            inputs_shape,
            filters_shape,
            subsample,
            dilation,
            border_mode,
            conv_mode,
            alpha,
            beta,
        ) = parameters
        inputs_val = np.random.random(inputs_shape).astype(dtype)
        if beta == 0:
            filters_val = None
        else:
            filters_val = np.random.random(filters_shape).astype(dtype)
            filters_val /= 10
        topgrad_val = self.array_like_conv_output(
            inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
        )
        # Scale down the input values to prevent absolute errors in utt.assert_allclose.
        inputs_val /= 10
        topgrad_val /= 10
        inputs = aesara.shared(inputs_val)
        topgrad = aesara.shared(topgrad_val)
        # Compile an Aesara function for the cuDNN implementation
        grad_w = dnn_gradweight(
            inputs,
            topgrad,
            filters_shape,
            alpha=alpha,
            beta=beta,
            out=filters_val,
            border_mode=border_mode,
            subsample=subsample,
            dilation=dilation,
            conv_mode=conv_mode,
            algo=algo,
            precision=precision,
        )
        f = aesara.function([], grad_w, mode=mode_with_gpu)
        # Compile an Aesara function for the reference implementation
        grad_w_ref = self.cpu_gradweight_class(
            border_mode=border_mode, subsample=subsample, filter_dilation=dilation
        )(ref_cast(inputs), ref_cast(topgrad), filters_shape[2:])
        # In 'conv' mode the reference gradient must be flipped back.
        if conv_mode == "conv":
            if inputs.ndim == 5:
                grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1]
            else:
                grad_w_ref = grad_w_ref[:, :, ::-1, ::-1]
        f_ref = aesara.function([], grad_w_ref, mode="FAST_RUN")
        # Compare the results of the two implementations
        res_ref = f_ref()
        res = np.asarray(f())
        if algo in cudnn.deterministic_bwd_filter_algorithms:
            # Deterministic algorithms must return the same output twice.
            utt.assert_allclose(res, np.asarray(f()))
        atol, rtol = self.get_atol_rtol(algo, dtype, precision)
        if beta == 0:
            cpu_res = alpha * res_ref
        else:
            cpu_res = alpha * res_ref + beta * filters_val
        self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
        utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)

    def should_fail(self, function, *args):
        """Assert that function(*args) raises an exception."""
        try:
            print("(should fail)", file=sys.stderr, end=" ")
            function(*args)
        except Exception:
            pass
        else:
            # BUG FIX: this previously referenced the *builtin* `callable`
            # instead of the `function` argument, so the failing case was
            # always reported as "callable".
            raise AssertionError("Should fail", function.__name__, *args)

    def should_fail_fwd(self, *args):
        self.should_fail(self.run_conv_fwd, *args)

    def should_fail_gradinput(self, *args):
        self.should_fail(self.run_conv_gradinput, *args)

    def should_fail_gradweight(self, *args):
        self.should_fail(self.run_conv_gradweight, *args)

    def get_expected_tcount(self):
        """Utility function to get expected test count without actually running pytest.

        NOTE(review): this iterates the test_* methods as if they were
        generators; as plain methods returning None this raises TypeError —
        confirm intended usage before relying on it.
        """
        return (
            sum(1 for t in self.test_fwd())
            + sum(1 for t in self.test_gradweight())
            + sum(1 for t in self.test_gradinput())
            + sum(1 for t in self.test_fwd_runtime_algorithms())
            + sum(1 for t in self.test_gradweight_runtime_algorithms())
            + sum(1 for t in self.test_gradinput_runtime_algorithms())
        )

    # Iterable test methods.

    def test_fwd(self):
        """Exhaustively test forward convolution for every supported algo/config."""
        for dtype, precision in self.dtype_configs:
            algos = [
                algo
                for algo in self.fwd_algorithms
                if cudnn.fwd_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.fwd(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_fwd(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.fwd(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_fwd(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_fwd():
                # BUG FIX: get_case() returns an (algo, dtype, precision,
                # parameters) tuple which must be unpacked into the call.
                if dnn_case.should_fail:
                    self.should_fail_fwd(*dnn_case.get_case())
                else:
                    self.run_conv_fwd(*dnn_case.get_case())

    def test_gradinput(self):
        """Exhaustively test the inputs gradient for every supported algo/config."""
        for dtype, precision in self.dtype_configs:
            algos = [
                algo
                for algo in self.bwd_data_algorithms
                if cudnn.bwd_data_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.gi(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_gradinput(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.gi(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_gradinput(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_bwd_data():
                # BUG FIX: unpack the case tuple (see test_fwd).
                if dnn_case.should_fail:
                    self.should_fail_gradinput(*dnn_case.get_case())
                else:
                    self.run_conv_gradinput(*dnn_case.get_case())

    def test_gradweight(self):
        """Exhaustively test the weights gradient for every supported algo/config."""
        for dtype, precision in self.dtype_configs:
            algos = [
                algo
                for algo in self.bwd_filter_algorithms
                if cudnn.bwd_filter_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.gw(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_gradweight(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.gw(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_gradweight(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_bwd_filter():
                # BUG FIX: unpack the case tuple (see test_fwd).
                if dnn_case.should_fail:
                    self.should_fail_gradweight(*dnn_case.get_case())
                else:
                    self.run_conv_gradweight(*dnn_case.get_case())

    # The 3 following tests are intended to be run with aesara flag `cmodule__debug=True`.
    # The output message should then be analyzed to check if runtime algorithms are
    # reused, reloaded from cache or updated, depending on what we expect from
    # dnn_fwd/dnn_gi/dnn_gw current codes. I currently don't know a better way
    # to efficiently test implemented cuDNN convolution caches.

    def test_fwd_runtime_algorithms(self):
        """Check runtime-chosen forward algorithms across changing shapes."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_fwd_runtime_algorithm(algo):
            inputs = TensorType(dtype, _broadcastable)()
            filters = TensorType(dtype, _broadcastable)()
            # Scale down the input values to prevent very large absolute errors
            # due to float rounding
            lower_inputs = inputs / 10
            lower_filters = filters / 10
            conv = dnn_conv(
                img=lower_inputs,
                kerns=lower_filters,
                algo=algo,
                precision=dtype,
                subsample=unit_shape,
                dilation=unit_shape,
            )
            f = aesara.function([inputs, filters], conv, mode=mode_with_gpu)
            if self.ndim == 3:
                flipped_filters = lower_filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = lower_filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                ref_cast(lower_inputs), flipped_filters
            )
            f_ref = aesara.function([inputs, filters], conv_ref, mode="FAST_RUN")
            runtime_shapes = self.runtime_shapes
            if algo in ("time_once", "guess_once"):
                # "*_once" algorithms select an algorithm a single time:
                # one shape, run several times, is enough.
                runtime_shapes = [list(runtime_shapes[0])]
                runtime_shapes[0][0] = 5
            for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                print("Shapes:", inputs_shape, filters_shape)
                for i in range(ntimes):
                    inputs_val = np.random.random(inputs_shape).astype(dtype)
                    filters_val = np.random.random(filters_shape).astype(dtype)
                    gpu_res = np.asarray(f(inputs_val, filters_val))
                    cpu_res = f_ref(inputs_val, filters_val)
                    self.scale_numpy_arrays_inplace(cpu_res, gpu_res, 1)
                    utt.assert_allclose(cpu_res, gpu_res)

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_fwd_runtime_algorithm(algo)

    def test_gradinput_runtime_algorithms(self):
        """Check runtime-chosen bwd-data algorithms across changing shapes."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradinput_runtime_algorithm(algo):
            # CONSISTENCY FIX: use change_flags (like the grad-weight test
            # below) instead of permanently mutating the global config.
            with aesara.config.change_flags(dnn__conv__algo_bwd_data=algo):
                inputs = TensorType(dtype, _broadcastable)()
                filters = TensorType(dtype, _broadcastable)()
                conv = dnn_conv(
                    img=inputs,
                    kerns=filters,
                    algo=algo,
                    precision=dtype,
                    subsample=unit_shape,
                    dilation=unit_shape,
                )
                grad_i = aesara.gradient.grad(conv.sum(), [inputs])
                f = aesara.function([inputs, filters], grad_i, mode=mode_with_gpu)
                # Exactly one bwd-data node, and no fwd / bwd-filter nodes.
                assert 1 == len(
                    [
                        node
                        for node in f.maker.fgraph.apply_nodes
                        if isinstance(node.op, GpuDnnConvGradI)
                    ]
                )
                assert not any(
                    isinstance(node.op, GpuDnnConv)
                    for node in f.maker.fgraph.apply_nodes
                )
                assert not any(
                    isinstance(node.op, GpuDnnConvGradW)
                    for node in f.maker.fgraph.apply_nodes
                )
                if self.ndim == 3:
                    flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
                else:
                    flipped_filters = filters[:, :, ::-1, ::-1]
                conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                    ref_cast(inputs), flipped_filters
                )
                grad_i_ref = aesara.gradient.grad(conv_ref.sum(), [inputs])
                f_ref = aesara.function([inputs, filters], grad_i_ref, mode="FAST_RUN")
                runtime_shapes = self.runtime_shapes
                if algo in ("time_once", "guess_once"):
                    # "*_once" algorithms: one shape, run several times.
                    runtime_shapes = [list(runtime_shapes[0])]
                    runtime_shapes[0][0] = 5
                for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                    print("Shapes:", inputs_shape, filters_shape)
                    for i in range(ntimes):
                        inputs_val = np.random.random(inputs_shape).astype(dtype)
                        filters_val = np.random.random(filters_shape).astype(dtype)
                        gpu_res = f(inputs_val, filters_val)
                        cpu_res = f_ref(inputs_val, filters_val)
                        utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_gradinput_runtime_algorithm(algo)

    def test_gradweight_runtime_algorithms(self):
        """Check runtime-chosen bwd-filter algorithms across changing shapes."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradweight_runtime_algorithm(algo):
            with aesara.config.change_flags(dnn__conv__algo_bwd_filter=algo):
                inputs = TensorType(dtype, _broadcastable)()
                filters = TensorType(dtype, _broadcastable)()
                conv = dnn_conv(
                    img=inputs,
                    kerns=filters,
                    algo=algo,
                    precision=dtype,
                    subsample=unit_shape,
                    dilation=unit_shape,
                )
                grad_w = aesara.gradient.grad(conv.sum(), [filters])
                f = aesara.function([inputs, filters], grad_w, mode=mode_with_gpu)
                # Exactly one bwd-filter node, and no fwd / bwd-data nodes.
                assert 1 == len(
                    [
                        node
                        for node in f.maker.fgraph.apply_nodes
                        if isinstance(node.op, GpuDnnConvGradW)
                    ]
                )
                assert not any(
                    isinstance(node.op, GpuDnnConv)
                    for node in f.maker.fgraph.apply_nodes
                )
                assert not any(
                    isinstance(node.op, GpuDnnConvGradI)
                    for node in f.maker.fgraph.apply_nodes
                )
                if self.ndim == 3:
                    flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
                else:
                    flipped_filters = filters[:, :, ::-1, ::-1]
                conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                    ref_cast(inputs), flipped_filters
                )
                grad_w_ref = aesara.gradient.grad(conv_ref.sum(), [filters])
                f_ref = aesara.function([inputs, filters], grad_w_ref, mode="FAST_RUN")
                runtime_shapes = self.runtime_shapes
                if algo in ("time_once", "guess_once"):
                    # "*_once" algorithms: one shape, run several times.
                    runtime_shapes = [list(runtime_shapes[0])]
                    runtime_shapes[0][0] = 5
                for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                    print("Shapes:", inputs_shape, filters_shape)
                    for i in range(ntimes):
                        inputs_val = np.random.random(inputs_shape).astype(dtype)
                        filters_val = np.random.random(filters_shape).astype(dtype)
                        gpu_res = f(inputs_val, filters_val)
                        cpu_res = f_ref(inputs_val, filters_val)
                        utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_gradweight_runtime_algorithm(algo)
class TestDnnConv2D(BaseTestDnnConv):
    # Concrete 2-D instantiation of the shared cuDNN convolution test suite.
    ndim = 2
    # Every cuDNN algorithm alias for fwd / bwd-filter / bwd-data in 2-D.
    fwd_algorithms = cudnn.cudnnConvolutionFwdAlgo_t.get_aliases()
    bwd_filter_algorithms = cudnn.cudnnConvolutionBwdFilterAlgo_t.get_aliases()
    bwd_data_algorithms = cudnn.cudnnConvolutionBwdDataAlgo_t.get_aliases()
    # CPU (corrMM) reference ops used to validate GPU results.
    cpu_conv_class = CorrMM
    cpu_gradinput_class = CorrMM_gradInputs
    cpu_gradweight_class = CorrMM_gradWeights
    # Hand-picked shape/algo combinations that exercised known cuDNN issues.
    special_cases = [
        ConvCase.bwd_filter(
            algo="deterministic",
            dtype="float32",
            precision="float32",
            inputs_shape=(1, 1, 541211, 10),
            filters_shape=(50, 1, 3, 10),
            border_mode=(1, 0),
            should_fail=(cudnn.version <= 6),
        ),
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65536, 2, 2, 2),
            filters_shape=(1, 2, 2, 2),
        ),
        # NB: Due to current workaround (see dnn_fwd.c), this test won't fail for cuDNN < v6100.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65537, 2, 2, 2),
            filters_shape=(1, 2, 2, 2),
        ),
    ]
    # (ntimes, [inputs_shape, filters_shape]) pairs fed to the runtime-algo
    # tests; repeated entries are expected to hit the algorithm cache.
    runtime_shapes = [
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]), # cache should be used
        (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]), # cache should be used
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]), # cache should be used
        (1, [(1, 2, 3, 4), (6, 2, 2, 1)]),
    ]
class TestDnnConv3D(BaseTestDnnConv):
    # Concrete 3-D instantiation of the shared cuDNN convolution test suite.
    ndim = 3
    # 3-D-capable algorithm subsets exposed by the cudnn definitions module.
    fwd_algorithms = cudnn.conv3d_fwd_algorithms
    bwd_filter_algorithms = cudnn.conv3d_bwd_filter_algorithms
    bwd_data_algorithms = cudnn.conv3d_bwd_data_algorithms
    # CPU (corr3dMM) reference ops used to validate GPU results.
    cpu_conv_class = Corr3dMM
    cpu_gradinput_class = Corr3dMMGradInputs
    cpu_gradweight_class = Corr3dMMGradWeights
    # Shape/algo combinations that exercised known cuDNN issues.
    special_cases = [
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65536, 2, 2, 2, 2),
            filters_shape=(1, 2, 2, 2, 2),
        ),
        # NB: Due to current workaround (see dnn_fwd.c), this test won't fail for cuDNN < v6100.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65537, 2, 2, 2, 2),
            filters_shape=(1, 2, 2, 2, 2),
        ),
    ]
    # (ntimes, [inputs_shape, filters_shape]) pairs fed to the runtime-algo
    # tests; repeated entries are expected to hit the algorithm cache.
    runtime_shapes = [
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]), # cache should be used
        (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]), # cache should be used
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]), # cache should be used
        (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)]),
    ]
def test_true_half_config_support():
    """Assert-free probe: skip unless the GPU supports cuDNN's
    TRUE_HALF_CONFIG (float16 data with float16 precision).

    For cuDNN V5.1 and V6.0, TRUE_HALF_CONFIG is only supported on
    architectures with true fp16 support (compute capability 5.3 and 6.0).
    """
    supported = check_dtype_config_support("float16", "float16")
    if not supported:
        pytest.skip("FWD: TRUE_HALF_CONFIG not supported on this GPU.")
class CheckDnn:
    """Helpers for the script entry points: pretty-print the supported
    dtype configurations, the per-direction algorithms, and the list of
    generated test cases.
    """

    @staticmethod
    def dtype_config_to_str(dtype_config):
        """Map a ``(dtype, precision)`` pair to its cuDNN config name."""
        dtype, precision = dtype_config
        known = {
            ("float16", "float16"): "TRUE_HALF_CONFIG",
            ("float16", "float32"): "PSEUDO_HALF_CONFIG",
            ("float32", "float32"): "FLOAT_CONFIG",
            ("float64", "float64"): "DOUBLE_CONFIG",
        }
        label = known.get((dtype, precision))
        if label is None:
            raise ValueError("unknown data type configuration", dtype_config)
        return label

    @staticmethod
    def print_infos(count_tests=True):
        """Print supported configurations and algorithms for the current
        GPU/cuDNN, and optionally the number of generated test cases."""
        conv2d = TestDnnConv2D()
        conv3d = TestDnnConv3D()
        print()
        print(
            "Available data type configurations:",
            ", ".join(
                CheckDnn.dtype_config_to_str(d)
                for d in cudnn.get_supported_dtype_configs(check_dtype_config_support)
            ),
        )
        print()
        for header, suite in (("2D algorithms:", conv2d), ("3D algorithms:", conv3d)):
            print(header)
            print("FWD :", ", ".join(suite.fwd_algorithms))
            print("BWD FILTER :", ", ".join(suite.bwd_filter_algorithms))
            print("BWD DATA :", ", ".join(suite.bwd_data_algorithms))
            print()
        if count_tests:
            n2d = conv2d.get_expected_tcount()
            n3d = conv3d.get_expected_tcount()
            print(n2d, "conv2D test cases.")
            print(n3d, "conv3D test cases.")
            print("1 supplementary test.")
            print(n2d + n3d + 1, "total conv tests.")
            print()

    @staticmethod
    def print_tests():
        """List every generated test case without running any of them."""
        for suite in (TestDnnConv2D(), TestDnnConv3D()):
            case_sources = (
                suite.test_fwd,
                suite.test_gradinput,
                suite.test_gradweight,
                suite.test_fwd_runtime_algorithms,
                suite.test_gradinput_runtime_algorithms,
                suite.test_gradweight_runtime_algorithms,
            )
            for source in case_sources:
                for tcase in source():
                    print(tcase[0].__name__, *tcase[1:])
        print(test_true_half_config_support.__name__)
import pytest
import aesara.gpuarray
import aesara.tensor
# Skip this whole test module when pygpu is not importable at all.
if aesara.gpuarray.pygpu is None:
    pytest.skip("pygpu not installed", allow_module_level=True)
init_error = None
# Try to initialize a CUDA context unless a device was already forced;
# remember the failure so the skip message below shows the real cause.
if not aesara.gpuarray.pygpu_activated and not aesara.config.force_device:
    try:
        aesara.gpuarray.init_dev("cuda")
    except Exception as e:
        init_error = e
if not aesara.gpuarray.pygpu_activated:
    if init_error:
        pytest.skip(str(init_error), allow_module_level=True)
    else:
        pytest.skip("pygpu disabled", allow_module_level=True)
# Context name shared by the gpuarray tests (None here).
test_ctx_name = None
# Build GPU / non-GPU compilation modes; when the configured mode is
# FAST_COMPILE, FAST_RUN-based modes are used instead.
if aesara.config.mode == "FAST_COMPILE":
    mode_with_gpu = (
        aesara.compile.mode.get_mode("FAST_RUN").including("gpuarray").excluding("gpu")
    )
    mode_without_gpu = aesara.compile.mode.get_mode("FAST_RUN").excluding("gpuarray")
else:
    mode_with_gpu = (
        aesara.compile.mode.get_default_mode().including("gpuarray").excluding("gpu")
    )
    mode_without_gpu = aesara.compile.mode.get_default_mode().excluding("gpuarray")
# Skip the Python-implementation consistency check on the CPU mode.
mode_without_gpu.check_py_code = False
def ref_cast(x):
    """Return *x* cast to float32 when it is a float16 tensor.

    Reference (CPU) graphs are compared in float32, so float16 inputs are
    upcast; every other dtype passes through unchanged.
    """
    if x.type.dtype == "float16":
        return aesara.tensor.cast(x, "float32")
    return x
import numpy as np
import aesara
from aesara.tensor.math import dot, sigmoid, tanh
class Model:
    """Container tracking layers, their parameters, and extra updates."""

    def __init__(self, name=""):
        self.name = name
        self.layers = []
        self.params = []
        self.other_updates = {}

    def add_layer(self, layer):
        """Register *layer*: collect its params and, when present, its
        ``other_updates`` (an iterable of (variable, update) pairs)."""
        self.layers.append(layer)
        self.params.extend(layer.params)
        if hasattr(layer, "other_updates"):
            for pair in layer.other_updates:
                self.other_updates[pair[0]] = pair[1]

    def get_params(self):
        return self.params
def uniform(stdev, size):
    """Sample a zero-mean uniform array with standard deviation *stdev*
    and shape *size*, cast to aesara's floatX.

    A uniform on [-b, b] has stdev b/sqrt(3), hence the sqrt(3) bound.
    """
    bound = stdev * np.sqrt(3)
    samples = np.random.uniform(low=-bound, high=bound, size=size)
    return samples.astype(aesara.config.floatX)
def linear_transform_weights(input_dim, output_dim, param_list=None, name=""):
    """Create an aesara shared weight matrix of shape (input_dim, output_dim),
    He-style initialized, and append it to *param_list*.

    Raises
    ------
    AssertionError
        If *param_list* is None (the weight must be registered somewhere).
    """
    # Fail fast: validate before allocating the shared variable (the
    # original asserted only after the allocation side effect).
    assert param_list is not None
    weight_initialization = uniform(np.sqrt(2.0 / input_dim), (input_dim, output_dim))
    W = aesara.shared(weight_initialization, name=name)
    param_list.append(W)
    return W
def bias_weights(length, param_list=None, name=""):
    """Create a zero-initialized aesara shared bias of shape *length*,
    registering it in *param_list* when one is given."""
    zeros = np.zeros(length).astype(aesara.config.floatX)
    bias = aesara.shared(zeros, name=name)
    if param_list is not None:
        param_list.append(bias)
    return bias
class Layer:
    """Base class for all layers: a name plus a list of parameters."""

    def __init__(self, name=""):
        self.name = name
        self.params = []

    def get_params(self):
        """Return the layer's registered shared parameters."""
        return self.params
class GRU(Layer):
    # Gated Recurrent Unit layer: builds gate weights as shared variables
    # and scans a step function over the input sequence.
    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
        """Create all gate weights/biases and build the scan graph.

        *input_layer* must expose ``output()`` returning the sequence to
        scan over; *s0* is the initial hidden state passed to ``scan``.
        """
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.params = []
        """Layers weights"""
        """self.params is passed so that any parameters could be appended to it"""
        # Input-to-hidden weights/biases for the reset (r), update (i)
        # and candidate (h) gates.
        self.W_r = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_r"
        )
        self.b_wr = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wr"
        )
        self.W_i = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_i"
        )
        self.b_wi = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wi"
        )
        self.W_h = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_h"
        )
        self.b_wh = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wh"
        )
        # Hidden-to-hidden (recurrent) weights/biases.
        self.R_r = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_r"
        )
        self.b_rr = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rr"
        )
        self.R_i = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_i"
        )
        # NOTE(review): recurrent bias for the update gate is named "b_ru"
        # (not "b_ri"); kept as-is since the step function uses it.
        self.b_ru = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_ru"
        )
        self.R_h = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_h"
        )
        self.b_rh = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rh"
        )
        """step through processed input to create output"""
        def step(inp, s_prev):
            # Update gate (i_t) and reset gate (r_t).
            i_t = sigmoid(
                dot(inp, self.W_i) + dot(s_prev, self.R_i) + self.b_wi + self.b_ru
            )
            r_t = sigmoid(
                dot(inp, self.W_r) + dot(s_prev, self.R_r) + self.b_wr + self.b_rr
            )
            # Candidate state: reset gate scales the recurrent contribution.
            h_hat_t = tanh(
                dot(inp, self.W_h)
                + (r_t * (dot(s_prev, self.R_h) + self.b_rh))
                + self.b_wh
            )
            # Convex combination of candidate and previous state.
            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
            return s_curr
        outputs_info = self.s0
        states, updates = aesara.scan(
            fn=step, sequences=[self.X], outputs_info=outputs_info
        )
        self.Y = states
    def output(self):
        # Sequence of hidden states produced by the scan.
        return self.Y
class LSTM(Layer):
    # Long Short-Term Memory layer: builds gate weights as shared variables
    # and scans a step function over the input sequence.
    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None, name=""):
        """Create all gate weights/biases and build the scan graph.

        *input_layer* must expose ``output()``; *s0* and *c0* are the
        initial hidden and cell states passed to ``scan``.
        """
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.c0 = c0
        self.params = []
        """Layers weights"""
        """self.params is passed so that any parameters could be appended to it"""
        # Input-to-hidden weights/biases: input (i), forget (f),
        # cell candidate (c) and output (o) gates.
        self.W_i = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_i"
        )
        self.b_wi = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wi"
        )
        self.W_f = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_f"
        )
        self.b_wf = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wf"
        )
        self.W_c = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_c"
        )
        self.b_wc = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wc"
        )
        self.W_o = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_o"
        )
        self.b_wo = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wo"
        )
        # Hidden-to-hidden (recurrent) weights/biases for the same gates.
        self.R_i = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_i"
        )
        self.b_ri = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_ri"
        )
        self.R_f = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_f"
        )
        self.b_rf = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rf"
        )
        self.R_c = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_c"
        )
        self.b_rc = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rc"
        )
        self.R_o = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_o"
        )
        self.b_ro = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_ro"
        )
        """step through processed input to create output"""
        def step(x_t, h_tm1, c_tm1):
            # Gate activations from input x_t and previous hidden state.
            i_t = sigmoid(
                dot(x_t, self.W_i) + dot(h_tm1, self.R_i) + self.b_wi + self.b_ri
            )
            f_t = sigmoid(
                dot(x_t, self.W_f) + dot(h_tm1, self.R_f) + self.b_wf + self.b_rf
            )
            o_t = sigmoid(
                dot(x_t, self.W_o) + dot(h_tm1, self.R_o) + self.b_ro + self.b_wo
            )
            c_hat_t = tanh(
                dot(x_t, self.W_c) + dot(h_tm1, self.R_c) + self.b_wc + self.b_rc
            )
            # Cell state: forget old content, add gated candidate.
            c_t = f_t * c_tm1 + i_t * c_hat_t
            h_t = o_t * tanh(c_t)
            return h_t, c_t
        outputs_info = [self.s0, self.c0]
        states, updates = aesara.scan(
            fn=step, sequences=[self.X], outputs_info=outputs_info
        )
        # scan returns (hidden states, cell states) per the two outputs_info.
        self.Y = states[0]
        self.C = states[1]
    def output(self):
        # Sequence of hidden states produced by the scan.
        return self.Y
class FC(Layer):
    """Fully connected layer: output() = dot(X, W) + b."""

    def __init__(self, input_dim, output_dim, input_layer, name=""):
        self.name = name
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.params = []
        self.X = self.input_layer.output()
        # Weight matrix and bias are registered into self.params.
        self.W = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W"
        )
        self.b = bias_weights((output_dim,), param_list=self.params, name=name + ".b")

    def output(self):
        """Affine transform of the wrapped layer's output."""
        return dot(self.X, self.W) + self.b
class WrapperLayer(Layer):
    """Adapts a raw symbolic variable to the layer interface (no params)."""

    def __init__(self, X, name=""):
        self.name = name
        self.X = X
        self.params = []

    def output(self):
        """Return the wrapped variable unchanged."""
        return self.X
# This script allows to run one specific cuDNN convolution test case.
# This script should not be imported, but only used as a program.
# python run_dnn_conv.py --help # Print help.
# python run_dnn_conv.py {fwd|bwd-filter|bwd-data} {2d|3d} -a <algo> -i <inputShape> -f <filterShape> ...
import argparse
import sys
import aesara
from aesara.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from aesara.gpuarray.cudnn_defs import (
DOUBLE,
DOUBLE_CONFIG,
FLOAT,
FLOAT_CONFIG,
HALF,
PSEUDO_HALF_CONFIG,
TRUE_HALF_CONFIG,
)
from aesara.tensor.nnet.abstract_conv import get_conv_output_shape
from tests.gpuarray.check_dnn_conv import CheckDnn, TestDnnConv2D, TestDnnConv3D, cudnn
# Guard: this file is a command-line program only; importing it would
# execute the argument parsing and test run below as a side effect.
if __name__ != "__main__":
    raise ImportError("This script must not be imported.")
class TupleAction(argparse.Action):
    """argparse action turning "1,2,3" into the int tuple (1, 2, 3)."""

    def __call__(self, parser, namespace, values, option_string=None):
        parsed = tuple(int(piece) for piece in values.split(","))
        setattr(namespace, self.dest, parsed)
class BorderAction(TupleAction):
    """Border-mode extractor: keeps the named modes verbatim, otherwise
    falls back to TupleAction's comma-separated-int parsing."""

    def __call__(self, parser, namespace, values, option_string=None):
        if values in ("valid", "full", "half"):
            setattr(namespace, self.dest, values)
        else:
            super().__call__(parser, namespace, values, option_string)
# ---- Command-line interface for the one-off convolution runner. ----
args = sys.argv[1:]
computations = FWD, BWD_FILTER, BWD_DATA = ("fwd", "gradweight", "gradinput")
# Union of every cuDNN algorithm alias (fwd + both bwd directions) plus
# the runtime algorithm selectors, deduplicated and sorted.
algorithms = (
    tuple(
        sorted(
            list(
                set(
                    cudnn.cudnnConvolutionFwdAlgo_t.get_aliases()
                    + cudnn.cudnnConvolutionBwdFilterAlgo_t.get_aliases()
                    + cudnn.cudnnConvolutionBwdDataAlgo_t.get_aliases()
                )
            )
        )
    )
    + SUPPORTED_DNN_CONV_ALGO_RUNTIME
)
types = (HALF, FLOAT, DOUBLE)
# Named (dtype, precision) configurations selectable via -D.
data_type_configurations = dict(
    TRUE_HALF_CONFIG=TRUE_HALF_CONFIG,
    PSEUDO_HALF_CONFIG=PSEUDO_HALF_CONFIG,
    FLOAT_CONFIG=FLOAT_CONFIG,
    DOUBLE_CONFIG=DOUBLE_CONFIG,
)
parser = argparse.ArgumentParser()
parser.add_argument("computation", choices=computations, help="Computation to run.")
parser.add_argument(
    "-a",
    "--algo",
    choices=algorithms,
    required=True,
    help="Algorithm to use for computation.",
)
parser.add_argument(
    "-i",
    "--input-shape",
    action=TupleAction,
    required=True,
    help="Input shape. Comma-separated list of integers (no spaces).",
)
parser.add_argument(
    "-f",
    "--filter-shape",
    action=TupleAction,
    required=True,
    help="Filter shape. Comma-separated list of integers (no spaces).",
)
parser.add_argument(
    "-D",
    "--dtype-config",
    choices=list(sorted(data_type_configurations.keys())),
    default=None,
    help="Data type configuration for (data type; precision). Default (aesara floatX; aesara floatX). "
    "To specify data type configuration, you can either use this option or set data type and "
    'precision separately with "-t" and "-p" options.',
)
parser.add_argument(
    "-t",
    "--dtype",
    choices=types,
    default=None,
    help="Data type (default aesara floatX).",
)
parser.add_argument(
    "-p",
    "--precision",
    choices=types,
    default=None,
    help="Precision (default aesara floatX).",
)
parser.add_argument(
    "-s",
    "--subsample",
    action=TupleAction,
    help="Subsample. Comma-separated list of integers (no spaces). "
    "Default: 1 per dimension.",
)
parser.add_argument(
    "-d",
    "--dilation",
    action=TupleAction,
    help="Dilation. Comma-separated list of integers (no spaces). "
    "Default: 1 per dimension.",
)
parser.add_argument(
    "-b",
    "--border-mode",
    default="valid",
    action=BorderAction,
    help='Border mode. "valid" (default), "full", "half" '
    "or a comma-separated list of integers (no spaces).",
)
parser.add_argument(
    "-c",
    "--conv-mode",
    choices=("conv", "cross"),
    default="conv",
    help="Conv mode (default: conv).",
)
parser.add_argument(
    "-A",
    "--alpha",
    type=float,
    default=1,
    help="alpha (floating), must not be zero. Default 1.",
)
parser.add_argument(
    "-B", "--beta", type=float, default=0, help="beta (floating). Default 0."
)
parser.add_argument(
    "-I",
    "--print-infos",
    action="store_true",
    default=False,
    help="Print some infos before testing.",
)
args = parser.parse_args(args)
test = args.computation
# ---- Validate shapes and derive the convolution dimensionality. ----
if len(args.input_shape) != len(args.filter_shape):
    raise ValueError("Expected same length for input shape and filter shape")
if len(args.input_shape) not in (4, 5):
    raise ValueError("Expected length 4 or 5 for input shape")
# 4-length shapes -> 2-D conv, 5-length shapes -> 3-D conv.
ndim = len(args.input_shape) - 2
if ndim == 2:
    tests = TestDnnConv2D()
elif ndim == 3:
    tests = TestDnnConv3D()
if args.subsample is None:
    args.subsample = (1,) * ndim
if args.dilation is None:
    args.dilation = (1,) * ndim
if not (ndim == len(args.subsample) == len(args.dilation)):
    raise ValueError(f"Expected parameters sized for {int(ndim)} dimensions.")
if isinstance(args.border_mode, tuple) and ndim != len(args.border_mode):
    raise ValueError(f"Expected borders sized for {int(ndim)} dimensions.")
if args.alpha == 0:
    raise ValueError("Nothing could be computed if alpha is 0.")
# ---- Resolve dtype/precision: either -D or (-t, -p), never both. ----
if args.dtype_config is None:
    if args.dtype is None:
        args.dtype = aesara.config.floatX
    if args.precision is None:
        args.precision = aesara.config.floatX
else:
    if args.dtype is not None or args.precision is not None:
        raise ValueError(
            "You must specify either -D <data-type-configuration> "
            "or (-t <data-type> -p <precision>), not both."
        )
    args.dtype, args.precision = data_type_configurations[args.dtype_config]
if (args.dtype, args.precision) not in cudnn.get_supported_dtype_configs():
    raise ValueError(
        f"Unsupported data type configuration {args.dtype} {args.precision}."
    )
# Warn (without aborting) when a specific algorithm does not normally
# support the requested dtype configuration; runtime selectors are exempt.
if args.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
    check_config = False
    if test == FWD:
        check_config = cudnn.fwd_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    elif test == BWD_FILTER:
        check_config = cudnn.bwd_filter_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    elif test == BWD_DATA:
        check_config = cudnn.bwd_data_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    if not check_config:
        print(
            "Warning: %s computation does not normally support configuration (%s, %s) for algo %s."
            % (test, args.dtype, args.precision, args.algo),
            file=sys.stderr,
        )
algo = args.algo
dtype = args.dtype
precision = args.precision
# Positional parameter tuple expected by the run_conv_* test helpers.
parameters = (
    args.input_shape,
    args.filter_shape,
    args.subsample,
    args.dilation,
    args.border_mode,
    args.conv_mode,
    args.alpha,
    args.beta,
)
if args.print_infos:
    CheckDnn.print_infos(count_tests=False)
print("======================")
print("Running", test, algo, dtype, precision, *parameters)
# Dispatch to the requested computation and report its output shape.
if test == FWD:
    tests.run_conv_fwd(algo, dtype, precision, parameters)
    expected_output_shape = get_conv_output_shape(
        args.input_shape,
        args.filter_shape,
        args.border_mode,
        args.subsample,
        args.dilation,
    )
elif test == BWD_FILTER:
    tests.run_conv_gradweight(algo, dtype, precision, parameters)
    expected_output_shape = args.filter_shape
elif test == BWD_DATA:
    tests.run_conv_gradinput(algo, dtype, precision, parameters)
    expected_output_shape = args.input_shape
print("Computed shape:", expected_output_shape)
print("... OK")
import numpy as np
import pytest
pygpu = pytest.importorskip("pygpu")
gpuarray = pygpu.gpuarray
from aesara.gpuarray.blas import (
GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
GpuCorrMM,
GpuCorrMM_gradInputs,
GpuCorrMM_gradWeights,
)
from aesara.gpuarray.dnn import (
GpuDnnConv,
GpuDnnConvGradI,
GpuDnnConvGradW,
dnn_available,
)
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.nnet.test_abstract_conv import (
BaseTestConv2d,
BaseTestConv3d,
TestConv2dTranspose,
TestConvTypes,
)
# Convenience GPU type: float32 4-D tensor with no broadcastable dims.
gpu_ftensor4 = GpuArrayType(dtype="float32", broadcastable=(False,) * 4)
class TestDnnConv2d(BaseTestConv2d):
    """Generic 2-D abstract-conv test suite executed through the cuDNN ops."""

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN implementation
        cls.provide_shape = [False]

    # BUGFIX: the skip condition was `dnn_available(test_ctx_name)`, which
    # skipped exactly when cuDNN *was* available; `dnn_available.msg` is the
    # message explaining why cuDNN is absent. Skip when NOT available.
    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        """Run fwd, gradweight and gradinput for one parameter set, checking
        that the expected GpuDnn op appears in each compiled graph."""
        mode = mode_with_gpu
        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        """Run gradinput only; with `expect_error` the call must raise
        instead of verifying gradients."""
        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        mode = mode_with_gpu
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
class TestDnnConv3d(BaseTestConv3d):
    """Generic 3-D abstract-conv test suite executed through the cuDNN ops."""

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN implementation
        cls.provide_shape = [False]

    # BUGFIX: the skip condition was `dnn_available(test_ctx_name)`, which
    # skipped exactly when cuDNN *was* available; `dnn_available.msg` is the
    # message explaining why cuDNN is absent. Skip when NOT available.
    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        """Run fwd, gradweight and gradinput for one parameter set, checking
        that the expected GpuDnn op appears in each compiled graph."""
        mode = mode_with_gpu
        if fd != (1, 1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False
    ):
        """Run gradinput only; with `expect_error` the call must raise
        instead of verifying gradients."""
        if fd != (1, 1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        mode = mode_with_gpu
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
class TestCorrMMConv2d(BaseTestConv2d):
    # Runs the generic 2-D conv suite through the GpuCorrMM ops; cuDNN is
    # explicitly excluded from the compilation mode.
    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        cls.mode = mode_with_gpu.excluding("cudnn")
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        # Check fwd, gradweight and gradinput in turn; each compiled graph
        # must contain the expected GpuCorrMM op(s).
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=(GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs),
            filter_dilation=fd,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorrMM_gradWeights,
            filter_dilation=fd,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorrMM_gradInputs,
            filter_dilation=fd,
        )
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        # gradinput either succeeds with gradient verification or, when the
        # case is expected to be inconsistent, must raise ValueError.
        mode = self.mode
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuCorrMM_gradInputs,
                filter_dilation=fd,
            )
        else:
            with pytest.raises(ValueError):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuCorrMM_gradInputs,
                    ref=None,
                    filter_dilation=fd,
                )
class TestCorrMMConv3d(BaseTestConv3d):
    # Runs the generic 3-D conv suite through the GpuCorr3dMM ops; cuDNN is
    # explicitly excluded from the compilation mode.
    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        cls.mode = mode_with_gpu.excluding("cudnn")
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        # Check fwd, gradweight and gradinput in turn; each compiled graph
        # must contain the expected GpuCorr3dMM op(s).
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=(GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs),
            filter_dilation=fd,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorr3dMM_gradWeights,
            filter_dilation=fd,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorr3dMM_gradInputs,
            filter_dilation=fd,
        )
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False
    ):
        # gradinput either succeeds with gradient verification or, when the
        # case is expected to be inconsistent, must raise ValueError.
        mode = self.mode
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuCorr3dMM_gradInputs,
                filter_dilation=fd,
            )
        else:
            with pytest.raises(ValueError):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuCorr3dMM_gradInputs,
                    ref=None,
                    filter_dilation=fd,
                )
class TestDnnConvTypes(TestConvTypes):
    # Re-runs the base dtype/gradient-type checks with GPU variables.
    def setup_method(self):
        # Replace the base class's CPU tensors with GPU float32 4-D ones.
        self.input = gpu_ftensor4()
        self.filters = gpu_ftensor4()
        self.topgrad = gpu_ftensor4()
        self.constant_tensor = gpuarray.array(
            np.zeros((3, 5, 7, 11), dtype="float32"), context=get_context(test_ctx_name)
        )
        super().setup_method()
# Intentionally shadows the imported TestConv2dTranspose so the same test
# class is collected again, but compiled with the GPU mode.
class TestConv2dTranspose(TestConv2dTranspose):
    mode = mode_with_gpu
import numpy as np
import pytest
import aesara
import aesara.tensor as at
from aesara.gpuarray.basic_ops import (
GpuAlloc,
GpuAllocEmpty,
GpuContiguous,
GpuEye,
GpuFromHost,
GpuJoin,
GpuReshape,
GpuSplit,
GpuToGpu,
GpuTri,
HostFromGpu,
gpu_contiguous,
gpu_join,
host_from_gpu,
)
from aesara.gpuarray.elemwise import GpuDimShuffle, GpuElemwise
from aesara.gpuarray.subtensor import GpuSubtensor
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from aesara.tensor.basic import Alloc, MakeVector, Split, alloc
from aesara.tensor.shape import Shape, Shape_i
from aesara.tensor.type import TensorType, fmatrix, iscalar, lscalar, matrix
# Don't import test classes otherwise they get tested as part of the file
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, test_ctx_name
from tests.tensor.test_basic import (
TestAlloc,
TestComparison,
TestJoinAndSplit,
TestReshape,
)
from tests.tensor.utils import random, safe_make_node
# Skip the whole module when pygpu is missing.
pygpu = pytest.importorskip("pygpu")
gpuarray = pygpu.gpuarray
# Shared RNG seeded via the unittest tools for reproducible test data.
rng = np.random.default_rng(seed=utt.fetch_seed())
def inplace_func(
    inputs,
    outputs,
    mode=None,
    allow_input_downcast=False,
    on_unused_input="raise",
    name=None,
):
    """Compile an aesara function that accepts in-place graphs, using the
    GPU mode by default when *mode* is not supplied."""
    return aesara.function(
        inputs,
        outputs,
        mode=mode_with_gpu if mode is None else mode,
        allow_input_downcast=allow_input_downcast,
        accept_inplace=True,
        on_unused_input=on_unused_input,
        name=name,
    )
def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
    """Build a shared variable for *value*, trying the GPU constructor
    first, then the generic tensor and scalar constructors.

    Raises
    ------
    TypeError
        If no constructor accepts *value*. (The original silently returned
        ``None`` in that case, deferring the failure to a confusing
        AttributeError at the call site.)
    """
    from aesara.tensor.sharedvar import scalar_constructor, tensor_constructor

    for constructor in (
        gpuarray_shared_constructor,
        tensor_constructor,
        scalar_constructor,
    ):
        try:
            return constructor(
                value, name=name, strict=strict, allow_downcast=allow_downcast, **kwargs
            )
        except TypeError:
            continue
    raise TypeError(f"No shared constructor accepts value of type {type(value)}")
def rand_gpuarray(*shape, **kwargs):
    """Return a pygpu array of the given *shape* with uniform values in
    [-1, 1).

    Recognized keyword options: ``dtype`` (default: aesara floatX) and
    ``cls``; any other keyword raises TypeError.
    """
    r = rng.random(shape) * 2 - 1
    dtype = kwargs.pop("dtype", aesara.config.floatX)
    cls = kwargs.pop("cls", None)
    if kwargs:
        # BUGFIX: the original raised TypeError("Unexpected argument %s", key)
        # with a comma, so the message was never %-formatted.
        raise TypeError(f"Unexpected argument {next(iter(kwargs))}")
    return gpuarray.array(r, dtype=dtype, cls=cls, context=get_context(test_ctx_name))
def makeTester(
    name,
    op,
    gpu_op,
    cases,
    checks=None,
    mode_gpu=mode_with_gpu,
    mode_nogpu=mode_without_gpu,
    skip=False,
    eps=1e-10,
):
    """Build a test-case class comparing a CPU `op` against its GPU `gpu_op`.

    For every entry in `cases` (test name -> list of inputs) the generated
    class compiles the same graph in both `mode_nogpu` and `mode_gpu`,
    checks that the GPU function actually contains `gpu_op`, and compares
    outputs (or raised exception types).  `checks` maps descriptions to
    predicates run on (inputs, outputs).  A truthy `skip` is used as a
    pytest skip reason.  NOTE(review): `eps` appears unused here —
    comparison relies on `TensorType.values_eq_approx`.
    """
    if checks is None:
        checks = {}
    # Rebind under new names so the class body below can expose them as
    # class attributes without shadowing the parameters.
    _op = op
    _gpu_op = gpu_op
    _cases = cases
    _skip = skip
    _checks = checks
    class Checker(utt.OptimizationTestMixin):
        op = staticmethod(_op)
        gpu_op = staticmethod(_gpu_op)
        cases = _cases
        skip = _skip
        checks = _checks
        def setup_method(self):
            # Resolve the (renamed) class through its module path; fails
            # loudly if the generated class was not exported properly.
            eval(self.__class__.__module__ + "." + self.__class__.__name__)
        def test_all(self):
            # NOTE: `skip` and `cases` here are makeTester's closure
            # variables, not the class attributes of the same name.
            if skip:
                pytest.skip(skip)
            for testname, inputs in cases.items():
                # Promote bare Python floats to floatX arrays in place.
                for _ in range(len(inputs)):
                    if type(inputs[_]) is float:
                        inputs[_] = np.asarray(inputs[_], dtype=aesara.config.floatX)
                self.run_case(testname, inputs)
        def run_case(self, testname, inputs):
            # Separate shared variables per mode so in-place GPU ops
            # cannot corrupt the reference computation's inputs.
            inputs_ref = [aesara.shared(inp) for inp in inputs]
            inputs_tst = [aesara.shared(inp) for inp in inputs]
            try:
                node_ref = safe_make_node(self.op, *inputs_ref)
                node_tst = safe_make_node(self.op, *inputs_tst)
            except Exception as exc:
                err_msg = (
                    "Test %s::%s: Error occurred while making " "a node with inputs %s"
                ) % (self.gpu_op, testname, inputs)
                exc.args += (err_msg,)
                raise
            try:
                f_ref = inplace_func([], node_ref.outputs, mode=mode_nogpu)
                f_tst = inplace_func([], node_tst.outputs, mode=mode_gpu)
            except Exception as exc:
                err_msg = (
                    "Test %s::%s: Error occurred while trying to " "make a Function"
                ) % (self.gpu_op, testname)
                exc.args += (err_msg,)
                raise
            # The GPU-compiled graph must contain exactly one gpu_op node.
            self.assertFunctionContains1(f_tst, self.gpu_op)
            ref_e = None
            try:
                expecteds = f_ref()
            except Exception as exc:
                ref_e = exc
            try:
                variables = f_tst()
            except Exception as exc:
                if ref_e is None:
                    err_msg = (
                        "Test %s::%s: exception when calling the " "Function"
                    ) % (self.gpu_op, testname)
                    exc.args += (err_msg,)
                    raise
                else:
                    # if we raised an exception of the same type we're good.
                    if isinstance(exc, type(ref_e)):
                        return
                    else:
                        err_msg = (
                            "Test %s::%s: exception raised during test "
                            "call was not the same as the reference "
                            "call (got: %s, expected %s)"
                            % (self.gpu_op, testname, type(exc), type(ref_e))
                        )
                        exc.args += (err_msg,)
                        raise
            # Element-wise comparison of every output against the CPU
            # reference: dtype, shape and approximate values must match.
            for i, (variable, expected) in enumerate(zip(variables, expecteds)):
                condition = (
                    variable.dtype != expected.dtype
                    or variable.shape != expected.shape
                    or not TensorType.values_eq_approx(variable, expected)
                )
                assert not condition, (
                    "Test %s::%s: Output %s gave the wrong "
                    "value. With inputs %s, expected %s "
                    "(dtype %s), got %s (dtype %s)."
                    % (
                        self.op,
                        testname,
                        i,
                        inputs,
                        expected,
                        expected.dtype,
                        variable,
                        variable.dtype,
                    )
                )
            # User-supplied invariants from the `checks` mapping.
            for description, check in self.checks.items():
                assert check(inputs, variables), (
                    "Test %s::%s: Failed check: %s " "(inputs were %s, outputs were %s)"
                ) % (self.op, testname, description, inputs, variables)
    Checker.__name__ = name
    if hasattr(Checker, "__qualname__"):
        Checker.__qualname__ = name
    return Checker
def test_transfer_cpu_gpu():
    """Round-trip a matrix: host -> GPU, then GPU -> host."""
    host_in = fmatrix("a")
    dev_in = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")
    av = np.asarray(rng.random((5, 4)), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))
    to_gpu = aesara.function([host_in], GpuFromHost(test_ctx_name)(host_in))
    assert GpuArrayType.values_eq(to_gpu(av), gv)
    to_host = aesara.function([dev_in], host_from_gpu(dev_in))
    assert np.all(to_host(gv) == av)
def test_transfer_gpu_gpu():
    """A GpuToGpu transfer must survive compilation and preserve values."""
    g = GpuArrayType(
        dtype="float32", broadcastable=(False, False), context_name=test_ctx_name
    )()
    host_val = np.asarray(rng.random((5, 4)), dtype="float32")
    dev_val = gpuarray.array(host_val, context=get_context(test_ctx_name))
    # Disable the rewrites that would cut the transfer out of the graph.
    mode = mode_with_gpu.excluding(
        "cut_gpua_host_transfers", "local_cut_gpua_host_gpua"
    )
    f = aesara.function([g], GpuToGpu(test_ctx_name)(g), mode=mode)
    nodes = f.maker.fgraph.toposort()
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, GpuToGpu)
    assert GpuArrayType.values_eq(f(dev_val), dev_val)
def test_transfer_strided():
    """Transfers must handle non-contiguous (strided) arrays.

    This is a smoke test only; libgpuarray carries the comprehensive
    correctness suite for strided transfers.
    """
    a = fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")
    av = np.asarray(rng.random((5, 8)), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))
    # Slice every other column so both copies become strided.
    av = av[:, ::2]
    gv = gv[:, ::2]
    uploaded = aesara.function([a], GpuFromHost(test_ctx_name)(a))(av)
    assert GpuArrayType.values_eq(uploaded, gv)
    downloaded = aesara.function([g], host_from_gpu(g))(gv)
    assert np.all(downloaded == av)
def gpu_alloc_expected(x, *shp):
    """Reference for GpuAlloc: a GPU array of shape `shp` filled with `x`."""
    out = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    out[:] = x
    return out
# Checker comparing CPU `alloc` (+1, so the rewrite can lift it to the
# GPU) against GpuAlloc for scalar/vector/matrix fills of various ranks.
TestGpuAlloc = makeTester(
    name="GpuAllocTester",
    # The +1 is there to allow the lift to the GPU.
    op=lambda *args: alloc(*args) + 1,
    gpu_op=GpuAlloc(test_ctx_name),
    cases=dict(
        correct01=(random(), np.int32(7)),
        # just gives a DeepCopyOp with possibly wrong results on the CPU
        # correct01_bcast=(random(1), np.int32(7)),
        correct02=(random(), np.int32(4), np.int32(7)),
        correct12=(random(7), np.int32(4), np.int32(7)),
        correct13=(random(7), np.int32(2), np.int32(4), np.int32(7)),
        correct23=(random(4, 7), np.int32(2), np.int32(4), np.int32(7)),
        bad_shape12=(random(7), np.int32(7), np.int32(5)),
    ),
)
class TestGPUAlloc(TestAlloc):
    """Run the generic TestAlloc suite in GPU mode with GPU shared vars."""
    dtype = "float32"
    mode = mode_with_gpu
    shared = staticmethod(gpuarray_shared_constructor)
    allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), Alloc()]
def test_alloc_empty():
    """GpuAllocEmpty yields the requested shape/dtype; duplicates merge."""
    for dt in ["float32", "int8"]:
        f = aesara.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
        assert len(f.maker.fgraph.apply_nodes) == 1
        result = f()
        assert result.shape == (2, 3)
        assert result.dtype == dt
    # Two identical GpuAllocEmpty applications must be merged into one.
    f = aesara.function(
        [],
        [
            GpuAllocEmpty("uint64", test_ctx_name)(3, 2),
            GpuAllocEmpty("uint64", test_ctx_name)(3, 2),
        ],
    )
    outputs = f()
    for out in outputs:
        assert out.shape == (3, 2)
        assert out.dtype == "uint64"
    empty_nodes = [
        node
        for node in f.maker.fgraph.apply_nodes
        if isinstance(node.op, GpuAllocEmpty)
    ]
    assert len(empty_nodes) == 1
def test_shape():
    """`x.shape` on a GPU variable lowers to Shape_i ops (or one Shape)."""
    x = GpuArrayType(dtype="float32", broadcastable=[False, False, False])()
    v = gpuarray.zeros((3, 4, 5), dtype="float32", context=get_context(test_ctx_name))
    f = aesara.function([x], x.shape)
    nodes = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    if aesara.config.mode != "FAST_COMPILE":
        assert len(nodes) == 4
        for node in nodes[:3]:
            assert isinstance(node.op, Shape_i)
        assert isinstance(nodes[3].op, MakeVector)
    # Without the shape-to-shape_i rewrite, a single Shape node remains.
    mode = mode_with_gpu.excluding("local_shape_to_shape_i")
    f = aesara.function([x], x.shape, mode=mode)
    nodes = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, Shape)
def test_gpu_contiguous():
    """GpuContiguous is inserted and its outputs are C-contiguous."""
    a = fmatrix("a")
    i = iscalar("i")
    # BUG FIX: np.random.random takes a single shape tuple; passing the
    # dimensions as two positional arguments raises a TypeError.
    a_val = np.asarray(np.random.random((4, 5)), dtype="float32")
    # The reshape is needed otherwise we make the subtensor on the CPU
    # to transfer less data.
    f = aesara.function(
        [a, i], gpu_contiguous(a.reshape((5, 4))[::i]), mode=mode_with_gpu
    )
    topo = f.maker.fgraph.toposort()
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert any(isinstance(node.op, GpuContiguous) for node in topo)
    assert f(a_val, 1).flags.c_contiguous
    assert f(a_val, 2).flags.c_contiguous
    assert f(a_val, 2).flags.c_contiguous
class TestGPUReshape(TestReshape):
    """Run the generic reshape tests against GpuReshape."""
    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.op = GpuReshape
        self.mode = mode_with_gpu
        # Ops that may legitimately appear in the compiled graph without
        # failing the suite's topology checks.
        self.ignore_topo = (
            HostFromGpu,
            GpuFromHost,
            aesara.compile.DeepCopyOp,
            GpuDimShuffle,
            GpuElemwise,
            Shape_i,
            MakeVector,
        )
        assert self.op == GpuReshape
class TestGPUComparison(TestComparison):
    """Run the generic comparison-op tests in GPU mode."""
    def setup_method(self):
        self.mode = mode_with_gpu
        self.shared = gpuarray_shared_constructor
        self.dtypes = ["float64", "float32"]
class TestGPUJoinAndSplit(TestJoinAndSplit):
    """Run the generic join/split tests with the GPU ops."""
    def setup_method(self):
        self.mode = mode_with_gpu.excluding("constant_folding")
        self.join_op = GpuJoin()
        self.split_op_class = GpuSplit
        # Use join instead of MakeVector since there is no MakeVector on GPU
        self.make_vector_op = GpuJoin()
        # this is to avoid errors with limited devices
        self.floatX = "float32"
        self.hide_error = aesara.config.mode not in ["DebugMode", "DEBUG_MODE"]
        def shared(x, **kwargs):
            # Pin shared variables to the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)
        self.shared = shared
    def test_gpusplit_opt(self):
        # Test that we move the node to the GPU
        # Also test float16 computation at the same time.
        rng = np.random.default_rng(seed=utt.fetch_seed())
        m = self.shared(rng.random((4, 6)).astype("float16"))
        o = Split(2)(m, 0, [2, 2])
        assert o[0].dtype == "float16"
        f = aesara.function([], o, mode=self.mode)
        assert any(
            [
                isinstance(node.op, self.split_op_class)
                for node in f.maker.fgraph.toposort()
            ]
        )
        o1, o2 = f()
        assert np.allclose(o1, m.get_value(borrow=True)[:2])
        assert np.allclose(o2, m.get_value(borrow=True)[2:])
def test_gpujoin_gpualloc():
    """Join of zeros_like/ones_like must lift Alloc and Join to the GPU."""
    a = fmatrix("a")
    b = fmatrix("b")
    # BUG FIX: np.random.random takes a single shape tuple; passing the
    # dimensions as two positional arguments raises a TypeError.
    a_val = np.asarray(np.random.random((4, 5)), dtype="float32")
    b_val = np.asarray(np.random.random((3, 5)), dtype="float32")
    f = aesara.function(
        [a, b],
        at.join(0, at.zeros_like(a), at.ones_like(b)) + 4,
        mode=mode_without_gpu,
    )
    f_gpu = aesara.function(
        [a, b], at.join(0, at.zeros_like(a), at.ones_like(b)), mode=mode_with_gpu
    )
    f_gpu2 = aesara.function(
        [a, b], at.join(0, at.zeros_like(a), at.ones_like(b)) + 4, mode=mode_with_gpu
    )
    # CPU graph keeps the CPU alloc/join ops...
    assert sum([node.op == at.alloc for node in f.maker.fgraph.toposort()]) == 2
    assert sum([node.op == at.join_ for node in f.maker.fgraph.toposort()]) == 1
    # ...while both GPU graphs must use the GPU equivalents.
    assert (
        sum([isinstance(node.op, GpuAlloc) for node in f_gpu.maker.fgraph.toposort()])
        == 2
    )
    assert sum([node.op == gpu_join for node in f_gpu.maker.fgraph.toposort()]) == 1
    assert (
        sum([isinstance(node.op, GpuAlloc) for node in f_gpu2.maker.fgraph.toposort()])
        == 2
    )
    assert sum([node.op == gpu_join for node in f_gpu2.maker.fgraph.toposort()]) == 1
    assert np.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def test_gpueye():
    """GpuEye must match np.eye over several dtypes, shapes and offsets."""

    def check(dtype, N, M_=None, k=0):
        # Aesara does not accept None as a tensor input, and DebugMode
        # rejects None as well, so substitute a concrete value for M.
        M = N if M_ is None else M_
        n_sym = iscalar()
        m_sym = iscalar()
        k_sym = iscalar()
        one = np.array(1).astype(dtype)
        # The +1/-1 round trip forces the GPU result to be computed.
        out = at.eye(n_sym, m_sym, k_sym, dtype=dtype) + one
        fn = aesara.function([n_sym, m_sym, k_sym], out, mode=mode_with_gpu)
        got = np.asarray(fn(N, M, k)) - one
        assert np.allclose(got, np.eye(N, M_, k, dtype=dtype))
        assert got.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuEye) for node in fn.maker.fgraph.toposort())

    for dtype in ["float32", "int32", "float16"]:
        check(dtype, 3)
        # M != N with k = 0, then every N/M combination with k != 0,
        # including offsets reaching past the matrix itself.
        for N, M, k in [
            (3, 5, 0),
            (5, 3, 0),
            (3, 3, 1),
            (3, 3, -1),
            (3, 5, 1),
            (3, 5, -1),
            (5, 3, 1),
            (5, 3, -1),
            (5, 3, 3),
            (3, 5, 3),
            (5, 3, -3),
            (3, 5, -3),
            (5, 3, 6),
            (3, 5, -6),
        ]:
            check(dtype, N, M, k)
def test_hostfromgpu_shape_i():
    """The shape computation must be lifted over host<->GPU transfers."""
    m = mode_with_gpu.including(
        "local_dot_to_dot22", "local_dot22_to_dot22scalar", "specialize"
    )
    a = fmatrix("a")
    ca = aesara.gpuarray.type.GpuArrayType("float32", (False, False))()
    # BUG FIX: np.random.random takes a single shape tuple; passing the
    # dimensions as two positional arguments raises a TypeError.
    av = np.asarray(np.random.random((5, 4)), dtype="float32")
    cv = gpuarray.asarray(
        np.random.random((5, 4)), dtype="float32", context=get_context(test_ctx_name)
    )
    f = aesara.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
    assert any(isinstance(x.op, GpuFromHost) for x in f.maker.fgraph.toposort())
    # shape-of-transfer is rewritten into Shape_i ops on the host input.
    f = aesara.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, Shape_i)
    assert isinstance(topo[1].op, Shape_i)
    assert isinstance(topo[2].op, MakeVector)
    assert tuple(f(av)) == (5, 4)
    f = aesara.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op for x in f.maker.fgraph.toposort()]
    f = aesara.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, Shape_i)
    assert isinstance(topo[1].op, Shape_i)
    assert isinstance(topo[2].op, MakeVector)
    assert tuple(f(cv)) == (5, 4)
def test_Gpujoin_inplace():
    """GpuJoin should return a view of the sole non-empty input.

    When all inputs but one are empty, the join works in place and the
    output aliases the non-empty element.
    """
    s = lscalar()
    data = np.array([3, 4, 5], dtype=aesara.config.floatX)
    x = gpuarray_shared_constructor(data, borrow=True)
    empty = at.zeros((s,))
    joined = GpuJoin(view=0)(0, x, empty)
    f = aesara.function([s], aesara.Out(joined, borrow=True))
    # DebugMode copies outputs, so the aliasing check only holds elsewhere.
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        assert f(0) is x.get_value(borrow=True, return_internal_type=True)
    assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
    """GPU tril/triu must match NumPy over several shapes and diagonals."""

    def check_l(m, k=0):
        m_symb = matrix(dtype=m.dtype)
        k_symb = iscalar()
        f = aesara.function(
            [m_symb, k_symb], at.tril(m_symb, k_symb), mode=mode_with_gpu
        )
        result = f(m, k)
        assert np.allclose(result, np.tril(m, k))
        # BUG FIX: compare against the input's dtype instead of relying on
        # the `dtype` loop variable leaking in from the caller's scope.
        assert result.dtype == np.dtype(m.dtype)
        assert any(isinstance(node.op, GpuTri) for node in f.maker.fgraph.toposort())

    def check_u(m, k=0):
        m_symb = matrix(dtype=m.dtype)
        k_symb = iscalar()
        f = aesara.function(
            [m_symb, k_symb], at.triu(m_symb, k_symb), mode=mode_with_gpu
        )
        result = f(m, k)
        assert np.allclose(result, np.triu(m, k))
        # BUG FIX: see check_l.
        assert result.dtype == np.dtype(m.dtype)
        assert any(isinstance(node.op, GpuTri) for node in f.maker.fgraph.toposort())

    test_rng = np.random.default_rng(seed=utt.fetch_seed())
    for dtype in ["float64", "float32", "float16"]:
        # (5000, 5000) is the deliberately big case.
        for shape in [(5000, 5000), (10, 10), (10, 5)]:
            m = np.asarray(test_rng.random(shape) * 2 - 1, dtype=dtype)
            for k in (0, 1, -1):
                check_l(m, k)
            for k in (0, 1, -1):
                check_u(m, k)
def test_gputri():
    """GpuTri must agree with np.tri over dtypes, shapes and offsets."""

    def check(dtype, N, M_=None, k=0):
        # Aesara does not accept None as a tensor input, and DebugMode
        # rejects None as well, so substitute a concrete value for M.
        M = N if M_ is None else M_
        n_sym = iscalar()
        m_sym = iscalar()
        k_sym = iscalar()
        one = np.array(1).astype(dtype)
        # The +1/-1 round trip forces the GPU result to be computed.
        out = at.tri(n_sym, m_sym, k_sym, dtype=dtype) + one
        fn = aesara.function([n_sym, m_sym, k_sym], out, mode=mode_with_gpu)
        got = np.asarray(fn(N, M, k)) - one
        assert np.allclose(got, np.tri(N, M_, k, dtype=dtype))
        assert got.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri) for node in fn.maker.fgraph.toposort())

    for dtype in ("float64", "float32", "int32", "float16"):
        # try a big one
        check(dtype, 1000, 1000, 0)
        check(dtype, 1000, 1000, -400)
        check(dtype, 1000, 1000, 400)
        check(dtype, 5)
        # M != N with k = 0, then every N/M combination with k != 0,
        # including offsets reaching past the matrix itself.
        for N, M, k in [
            (3, 5, 0),
            (5, 3, 0),
            (3, 3, 1),
            (3, 3, -1),
            (3, 5, 1),
            (3, 5, -1),
            (5, 3, 1),
            (5, 3, -1),
            (5, 3, 3),
            (3, 5, 3),
            (5, 3, -3),
            (3, 5, -3),
            (5, 3, 6),
            (3, 5, -6),
        ]:
            check(dtype, N, M, k)
import itertools
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import gpuarray_shared_constructor
from aesara.gpuarray.blas import (
GpuGemm,
GpuGer,
gpu_dot22,
gpugemm_inplace,
gpugemm_no_inplace,
gpugemmbatch_inplace,
gpugemv_inplace,
gpugemv_no_inplace,
gpuger_inplace,
gpuger_no_inplace,
)
from aesara.tensor.blas import (
BatchedDot,
_dot22,
batched_dot,
gemm_inplace,
gemv,
gemv_inplace,
)
from aesara.tensor.math import dot
from aesara.tensor.type import matrix, tensor, tensor3, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.gpuarray.test_basic_ops import makeTester, rand
from tests.tensor.test_blas import BaseGemv, TestGer
# Checker comparing CPU gemv_inplace against gpugemv_inplace.  Each case
# is [y, alpha, A, x, beta] computing y <- alpha*dot(A, x) + beta*y.
TestGpuGemv = makeTester(
    "GpuGemvTester",
    op=gemv_inplace,
    gpu_op=gpugemv_inplace,
    # It doesn't support float16
    cases=dict(
        dot_vv=[rand(1), 1.0, rand(1, 2), rand(2), 0.0],
        dot_vm=[rand(3), 1.0, rand(3, 2), rand(2), 0.0],
        float32=[
            rand(3).astype("float32"),
            np.float32(1),
            rand(3, 2).astype("float32"),
            rand(2).astype("float32"),
            np.float32(0),
        ],
        float64=[
            rand(3).astype("float64"),
            np.float64(1),
            rand(3, 2).astype("float64"),
            rand(2).astype("float64"),
            np.float64(0),
        ],
        # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
        # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
        # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
        test_stride=[rand(3)[::-1], 1.0, rand(3, 2)[::-1], rand(2)[::-1], 0.0],
    ),
)
def test_float16():
    """float16 storage with float32 scalars through gemv, gemm and dot22."""
    # gemv (rewritten into a GpuGemm call)
    y_v, alpha_v, A_v, x_v, beta_v = (
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    )
    shared_args = [
        gpuarray_shared_constructor(v, target=test_ctx_name)
        for v in (y_v, alpha_v, A_v, x_v, beta_v)
    ]
    f = aesara.function([], gemv(*shared_args), mode=mode_with_gpu)
    utt.assert_allclose(np.asarray(f()), alpha_v * np.dot(A_v, x_v) + beta_v * y_v)
    assert any(isinstance(n.op, GpuGemm) for n in f.maker.fgraph.toposort())
    # gemm
    z_v, alpha_v, A_v, B_v, beta_v = (
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    )
    shared_args = [
        gpuarray_shared_constructor(v, target=test_ctx_name)
        for v in (z_v, alpha_v, A_v, B_v, beta_v)
    ]
    f = aesara.function([], gpugemm_no_inplace(*shared_args))
    utt.assert_allclose(np.asarray(f()), alpha_v * np.dot(A_v, B_v) + beta_v * z_v)
    # dot22
    lhs_v = rand(3, 3).astype("float16")
    rhs_v = rand(3, 3).astype("float16")
    f = aesara.function(
        [],
        gpu_dot22(
            gpuarray_shared_constructor(lhs_v), gpuarray_shared_constructor(rhs_v)
        ),
    )
    utt.assert_allclose(np.asarray(f()), np.dot(lhs_v, rhs_v))
class TestGpuSgemv(BaseGemv, utt.OptimizationTestMixin):
    """Run the generic gemv tests with the single-precision GPU ops."""
    mode = mode_with_gpu
    dtype = "float32"
    gemv = gpugemv_no_inplace
    gemv_inplace = gpugemv_inplace
    @staticmethod
    def shared(val):
        # Fall back to a CPU shared variable for values the GPU
        # constructor rejects with a TypeError.
        try:
            return gpuarray_shared_constructor(val)
        except TypeError:
            return aesara.shared(val)
# Checker comparing CPU gemm_inplace against gpugemm_inplace.  Each case
# is [z, alpha, x, y, beta] computing z <- alpha*dot(x, y) + beta*z.
TestGpuGemm = makeTester(
    "GpuGemmTester",
    op=gemm_inplace,
    gpu_op=gpugemm_inplace,
    # float16 tested in test_float16
    cases=dict(
        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
        test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
        test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
        test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
        test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
        test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
        test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
        test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
        float32=[
            rand(3, 4).astype("float32"),
            np.float32(-1.0),
            rand(3, 5).astype("float32"),
            rand(5, 4).astype("float32"),
            np.float32(-1.1),
        ],
        float64=[
            rand(3, 4).astype("float64"),
            np.float64(-1.0),
            rand(3, 5).astype("float64"),
            rand(5, 4).astype("float64"),
            np.float64(-1.1),
        ],
        # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
        # test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
        # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
        # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
    ),
)
# Batched-GEMM cases: keys encode (batch, m, k, n); each value is
# [z, alpha, x, y, beta] for z <- alpha * batched_dot(x, y) + beta * z.
gemm_batched_tests = {
    "test_b%im%ik%in%i"
    % (b, m, k, n): [rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()]
    for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)
}
# Explicit fixed-dtype cases in addition to the floatX cases above.
gemm_batched_tests["float16"] = [
    rand(3, 4, 7).astype("float16"),
    rand().astype("float16"),
    rand(3, 4, 4).astype("float16"),
    rand(3, 4, 7).astype("float16"),
    rand().astype("float16"),
]
gemm_batched_tests["float32"] = [
    rand(3, 4, 7).astype("float32"),
    rand().astype("float32"),
    rand(3, 4, 4).astype("float32"),
    rand(3, 4, 7).astype("float32"),
    rand().astype("float32"),
]
gemm_batched_tests["float64"] = [
    rand(3, 4, 7).astype("float64"),
    rand().astype("float64"),
    rand(3, 4, 4).astype("float64"),
    rand(3, 4, 7).astype("float64"),
    rand().astype("float64"),
]
# Checker comparing the CPU batched-dot expression against
# gpugemmbatch_inplace on the cases built above.
TestGpuGemmBatch = makeTester(
    "GpuGemmBatchTester",
    op=lambda z, alpha, x, y, beta: alpha * BatchedDot()(x, y) + beta * z,
    gpu_op=gpugemmbatch_inplace,
    cases=gemm_batched_tests,
)
class TestGpuGemmBatchStrided:
    """Regression test for batched gemm on strided inputs."""
    def test_basic(self):
        # Reported in https://github.com/Theano/Theano/issues/5730
        x = tensor3()
        y = tensor3()
        z = batched_dot(x, y[:, 0, :, np.newaxis])
        f = aesara.function([x, y], z, mode=mode_with_gpu)
        x_num = np.arange(32 * 19 * 600, dtype=config.floatX).reshape((32, 19, 600))
        y_num = np.arange(7 * 32 * 600, dtype=config.floatX).reshape((32, 7, 600))
        f(x_num, y_num)
        # The lifted batched gemm must run in place.
        assert f.maker.fgraph.toposort()[-2].op.inplace
class TestGpuSger(TestGer):
    """Run the generic ger (rank-1 update) tests with the GPU ops."""
    def setup_method(self):
        self.mode = mode_with_gpu
        dtype = self.dtype = "float32"  # optimization isn't dtype-dependent
        self.A = tensor(dtype=dtype, broadcastable=(False, False))
        self.a = tensor(dtype=dtype, broadcastable=())
        self.x = tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor(dtype=dtype, broadcastable=(False,))
        self.ger_destructive = gpuger_inplace
        # data on the gpu make the op always inplace
        self.ger = gpuger_inplace
        self.gemm = gpugemm_inplace
        super().setup_method()
class TestGpuSgerNoTransfer(TestGpuSger):
    """Same as TestGpuSger but with data stored on the GPU (no transfer)."""
    shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(utt.OpContractTestMixin):
    """Check that the GPU ger ops honour the generic Op contract."""
    def setup_method(self):
        self.ops = [gpuger_no_inplace, gpuger_inplace]
    def clone(self, op):
        # Build an equivalent op instance, preserving in-placeness.
        return GpuGer(inplace=op.inplace)
# Checker comparing CPU _dot22 against gpu_dot22 on matrix products,
# including degenerate size-1 dimensions.
TestGpuDot22 = makeTester(
    "GpuDot22Tester",
    op=_dot22,
    gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
        # test5=[rand(0, 4), rand(4, 5)],
        # test6=[rand(3, 0), rand(0, 5)],
        # test7=[rand(3, 4), rand(4, 0)],
        # test8=[rand(0, 4), rand(4, 0)],
        # test9=[rand(0, 0), rand(0, 0)],
    ),
)
def test_gemv_zeros():
    """gemv with a zero-length inner dimension must yield a zero vector."""
    W = matrix()
    v = vector()
    f = aesara.function([W, v], W.dot(v), mode=mode_with_gpu)
    # Degenerate operands: a (1000, 0) matrix and an empty (0,) vector.
    dim = 1000
    empty_mat = np.zeros((dim, 0), dtype=aesara.config.floatX)
    empty_vec = np.zeros((0,), dtype=aesara.config.floatX)
    assert np.allclose(f(empty_mat, empty_vec), np.zeros((dim,)))
def test_gemv_dot_strides():
    """Dot with a reversed (negative-stride) operand; see Theano#6142."""
    xv = rand(5)
    yv = rand(5, 1)
    x_sh = gpuarray_shared_constructor(xv)
    y_sh = gpuarray_shared_constructor(yv, broadcastable=(False, True))
    fn = aesara.function([], dot(x_sh, y_sh[::-1]), mode=mode_with_gpu)
    utt.assert_allclose(fn(), np.dot(xv, yv[::-1]))
import numpy as np
import pytest
import aesara
import tests.unittest_tools as utt
from aesara.gpuarray.blocksparse import (
GpuSparseBlockGemv,
GpuSparseBlockOuter,
gpu_sparse_block_gemv,
gpu_sparse_block_outer,
)
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.type import fmatrix, ftensor3, lmatrix
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.nnet.test_blocksparse import TestBlockSparseGemvAndOuter
class TestBlockSparseGemvAndOuterGPUarray(TestBlockSparseGemvAndOuter):
    """Run the generic block-sparse gemv/outer tests with the GPU ops."""
    def setup_method(self):
        self.mode = mode_with_gpu.excluding("constant_folding")
        self.gemv_op = gpu_sparse_block_gemv
        self.outer_op = gpu_sparse_block_outer
        self.gemv_class = GpuSparseBlockGemv
        self.outer_class = GpuSparseBlockOuter
        super().setup_method()
    @pytest.mark.skip(
        reason="""
    This test is temporarily disabled since we disabled the output_merge
    and alpha_merge optimizations for blocksparse due to brokenness.
    Re-enable when those are re-added.
    """
    )
    def test_blocksparse_grad_merge(self):
        # Check that the learning-rate update is merged into
        # GpuSparseBlockOuter and that results match the unmerged graph.
        b = fmatrix()
        h = ftensor3()
        iIdx = lmatrix()
        oIdx = lmatrix()
        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
        W = gpuarray_shared_constructor(W_val, context=test_ctx_name)
        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        gW = aesara.grad(o.sum(), W)
        lr = np.asarray(0.05, dtype="float32")
        upd = W - lr * gW
        f1 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode_with_gpu)
        # Make sure the lr update was merged.
        assert isinstance(f1.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)
        # Exclude the merge optimizations.
        mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha")
        mode = mode.excluding("local_merge_blocksparse_output")
        f2 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
        # Make sure the lr update is not merged.
        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)
        f2(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()
        # reset the var
        W.set_value(W_val)
        f1(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()
        utt.assert_allclose(W_ref, W_opt)
import numpy as np
import pytest
import aesara
from aesara import config
from aesara import tensor as at
from aesara.gpuarray.basic_ops import CGpuKernelBase
from aesara.gpuarray.type import GpuArrayType, get_context, gpu_context_type
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.link.c.params_type import ParamsType
from aesara.scalar import int32 as int_t
class GpuEye(CGpuKernelBase):
    """Eye for GPU.
    This is an implementation to test that `CGpuKernelBase` works and also
    to use as an example in the docs. It is not used for user graphs.
    """
    __props__ = ("dtype", "context_name")
    # Values handed to the C kernel: the gpuarray typecode of the output
    # dtype, and the GPU context to run on.
    params_type = ParamsType(typecode=int_t, context=gpu_context_type)
    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name
        # The kernel source (tstgpueye) is in c_code/tstgpueye.c.
        super().__init__(["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)")
    def get_params(self, node):
        # Resolve the params lazily so importing this module does not
        # require pygpu; skip at run time if it is missing.
        pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")
        return self.params_type.get_params(
            typecode=pygpu_gpuarray.dtype_to_typecode(self.dtype),
            context=get_context(self.context_name),
        )
    def c_headers(self, **kwargs):
        return ["<gpuarray/types.h>", "<gpuarray/kernel.h>"]
    def make_node(self, n, m):
        # Both dimension arguments must be scalar tensors.
        n = at.as_tensor_variable(n)
        m = at.as_tensor_variable(m)
        assert n.ndim == 0
        assert m.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )
        return Apply(self, [n, m], [otype()])
    def infer_shape(self, fgraph, node, in_shapes):
        # Output shape is exactly (n, m).
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]
    def grad(self, inp, grads):
        # Not differentiable w.r.t. the integer shape inputs.
        return [grad_undefined(self, i, inp[i]) for i in range(2)]
def test_cgpukernelbase():
    """Compile and run the example GpuEye op built on CGpuKernelBase."""
    # Import inside the function to prevent the back-end from being
    # initialized when reloading the GpuEye object from cache.
    from .config import mode_with_gpu, test_ctx_name

    eye_op = GpuEye(dtype="int32", context_name=test_ctx_name)
    fn = aesara.function([], eye_op(4, 5), mode=mode_with_gpu)
    result = fn()
    assert result.dtype == "int32"
    assert np.array_equal(np.asarray(result), np.eye(4, 5, dtype="int32"))
import numpy as np
import pytest
import aesara
import aesara.gpuarray
from aesara.gpuarray.ctc import GpuConnectionistTemporalClassification, gpu_ctc
from aesara.gradient import grad
from aesara.tensor.math import mean
from aesara.tensor.nnet.ctc import (
ConnectionistTemporalClassification,
ctc,
ctc_available,
)
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.tensor.nnet.test_ctc import setup_ctc_case, setup_grad_case, setup_torch_case
@pytest.mark.skipif(
    not ctc_available(), reason="Optional library warp-ctc not available"
)
class TestCTC:
    """End-to-end checks of the GPU CTC op against the CPU implementation."""
    def check_ctc(
        self, activations, labels, input_length, expected_costs, expected_grads
    ):
        """Run the full battery of CTC checks for one test case."""
        # Create symbolic variables
        t_activations = aesara.shared(activations, name="activations")
        t_activation_times = aesara.shared(input_length, name="activation_times")
        t_labels = aesara.shared(labels, name="labels")
        inputs = [t_activations, t_labels, t_activation_times]
        # Execute several tests for each test case
        self.check_expected_values(
            t_activations, t_labels, t_activation_times, expected_costs, expected_grads
        )
        self.compare_gpu_and_cpu_values(*inputs)
        self.check_grads_disabled(*inputs)
        self.run_gpu_optimization_with_grad(*inputs)
        self.run_gpu_optimization_no_grad(*inputs)
    def setup_cpu_op(
        self,
        activations,
        labels,
        input_length,
        compute_grad=True,
        mode=mode_without_gpu,
    ):
        """Compile a CPU CTC function (cost, and optionally its gradient)."""
        cpu_ctc_cost = ctc(activations, labels, input_length)
        outputs = [cpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            cpu_ctc_grad = grad(mean(cpu_ctc_cost), activations)
            outputs += [cpu_ctc_grad]
        return aesara.function([], outputs, mode=mode)
    def setup_gpu_op(self, activations, labels, input_length, compute_grad=True):
        """Compile a GPU CTC function (cost, and optionally its gradient)."""
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        outputs = [gpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            gpu_ctc_grad = grad(mean(gpu_ctc_cost), activations)
            outputs += [gpu_ctc_grad]
        return aesara.function([], outputs, mode=mode_with_gpu)
    def check_expected_values(
        self, activations, labels, input_length, expected_costs, expected_grads
    ):
        """Compare GPU costs/gradients against precomputed expected values."""
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values
        utt.assert_allclose(expected_grads / cost_from_gpu.shape[0], grad_from_gpu)
        utt.assert_allclose(expected_costs, cost_from_gpu)
    def compare_gpu_and_cpu_values(self, activations, labels, input_length):
        """Check the GPU op agrees with the CPU op on the same inputs."""
        cpu_train = self.setup_cpu_op(activations, labels, input_length)
        cpu_cost, cpu_grad = cpu_train()
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values
        utt.assert_allclose(cpu_grad, grad_from_gpu)
        utt.assert_allclose(cpu_cost, cost_from_gpu)
    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        gpu_ctc_function = aesara.function([], [gpu_ctc_cost])
        for node in gpu_ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                assert node.op.compute_grad is False
    def run_gpu_optimization_with_grad(self, activations, labels, input_length):
        """A CPU graph (with grad) must be lifted entirely to the GPU."""
        # Compile CPU function with optimization
        cpu_lifted_train = self.setup_cpu_op(
            activations, labels, input_length, mode=mode_with_gpu
        )
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_train)
    def run_gpu_optimization_no_grad(self, activations, labels, input_length):
        """A CPU graph (no grad) must be lifted and give the same costs."""
        cpu_train = self.setup_cpu_op(
            activations, labels, input_length, compute_grad=False
        )
        cpu_cost = cpu_train()
        # Compile CPU function with optimization
        cpu_lifted_test = self.setup_cpu_op(
            activations, labels, input_length, compute_grad=False, mode=mode_with_gpu
        )
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_test)
        gpu_cost = cpu_lifted_test()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Compare values from CPU and GPU Ops
        utt.assert_allclose(cpu_cost, cost_from_gpu)
    def has_only_gpu_op(self, function):
        """True iff the graph has the GPU CTC op and no CPU CTC op."""
        has_cpu_instance = False
        has_gpu_instance = False
        for node in function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                has_cpu_instance = True
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                has_gpu_instance = True
        return has_gpu_instance and (not has_cpu_instance)
    # Test obtained from Torch tutorial at:
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    def test_torch_case(self):
        (
            activations,
            labels,
            activation_times,
            expected_costs,
            expected_grads,
        ) = setup_torch_case()
        self.check_ctc(
            activations, labels, activation_times, expected_costs, expected_grads
        )
    def test_ctc(self):
        (
            activations,
            labels,
            input_length,
            expected_costs,
            expected_grads,
        ) = setup_ctc_case()
        self.check_ctc(
            activations, labels, input_length, expected_costs, expected_grads
        )
    def test_verify_grad(self):
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = aesara.shared(in_lengths, name="activation_times")
                t_labels = aesara.shared(labels, name="labels")
                return gpu_ctc(acts, t_labels, t_activation_times)
            return wrapper
        activations, labels, activation_times = setup_grad_case()
        ctc_op = ctc_op_functor(labels, activation_times)
        utt.verify_grad(ctc_op, [activations], mode=mode_with_gpu)
# This source diff could not be displayed because it is too large. You can view the blob instead.
import numpy as np
import pytest
import aesara
import aesara.scalar as aes
import aesara.tensor as at
pygpu = pytest.importorskip("pygpu")
gpuarray = pygpu.ndgpuarray
from copy import copy
from aesara.compile.debugmode import DebugMode
from aesara.compile.mode import Mode
from aesara.gpuarray.dnn import GpuDnnReduction
from aesara.gpuarray.elemwise import (
GpuCAReduceCPY,
GpuCAReduceCuda,
GpuDimShuffle,
GpuElemwise,
GpuErfcinv,
GpuErfinv,
)
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from aesara.link.basic import PerformLinker
from aesara.link.c.basic import CLinker
from aesara.tensor.math import erfcinv, erfinv, mul, tanh
from aesara.tensor.type import bvector, float_dtypes, fmatrix, fvector, vector
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, test_ctx_name
from tests.gpuarray.test_basic_ops import rand_gpuarray
from tests.tensor import test_elemwise
from tests.unittest_tools import assert_allclose
# This is actually a test for GpuElemwise
class TestGpuBroadcast(test_elemwise.TestBroadcast):
    """Reuse the CPU broadcast test suite, swapping in the GPU Op and type."""
    cop = GpuElemwise
    ctype = GpuArrayType
    # The order is important
    linkers = [PerformLinker, CLinker]
    def rand_cval(self, shp):
        # Random test data is created directly as a GPU ndarray.
        return rand_gpuarray(*shp, cls=gpuarray)
def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
    all_dtypes = [
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "int8",
        "int16",
        "int32",
        "int64",
        "float16",
        "float32",
        "float64",
    ]
    for base_dtype in all_dtypes:
        for exp_dtype in all_dtypes:
            # Compile a gpu function for this dtype combination.
            base_data = np.random.randint(0, 5, size=10).astype(base_dtype)
            exp_data = np.random.randint(0, 3, size=10).astype(exp_dtype)
            sym_base = vector(dtype=base_dtype)
            shared_exp = gpuarray_shared_constructor(exp_data)
            assert shared_exp.dtype == exp_dtype
            result = sym_base**shared_exp
            fn = aesara.function([sym_base], result, mode=mode_with_gpu)
            # We don't transfer to the GPU when the output dtype is int*
            gpu_node_count = sum(
                isinstance(node.op, GpuElemwise)
                for node in fn.maker.fgraph.apply_nodes
            )
            assert gpu_node_count == (result.dtype in float_dtypes)
            # Call the function to make sure the compiled output is valid.
            assert_allclose(fn(base_data), base_data**exp_data)
class TestMathErrorFunctions:
    """Check the GPU elemwise erfinv/erfcinv Ops against SciPy references.

    ``setup_class`` precomputes, for every tested dtype, one large input array
    together with the expected SciPy outputs, and derives DebugMode-safe
    variants of the global compilation modes (the expected outputs contain
    infinities, so the isfinite check must be disabled under DebugMode).

    The erfinv and erfcinv tests previously duplicated ~40 lines of identical
    logic; it now lives in ``_check_inverse_erf``.
    """

    dtypes = ["float64", "float32", "float16"]
    default_arrays = {}
    expected_erfinv_outputs = {}
    expected_erfcinv_outputs = {}

    @classmethod
    def setup_class(cls):
        scipy_special = pytest.importorskip("scipy.special")
        # NB: erfinv is defined in ]-1;1[, and erfcinv is defined in ]0;2[,
        # so we just take some values in an interval that covers both domains
        # (this will also allow to test some values outside the domains).
        # We take [-5;5[ by default and we concatenate it 1000 times
        # to have the GPU ops run on large data.
        default_array = [x / 10.0 for x in range(-50, 50)] * 1000
        for dtype in cls.dtypes:
            numpy_array = np.asarray(default_array, dtype=dtype)
            cls.default_arrays[dtype] = numpy_array
            cls.expected_erfinv_outputs[dtype] = scipy_special.erfinv(numpy_array)
            cls.expected_erfcinv_outputs[dtype] = scipy_special.erfcinv(numpy_array)
        # Since there are infinite values, we need to disable that check
        # in DebugMode if needed
        if isinstance(mode_with_gpu, DebugMode):
            cls.mode_with_gpu = copy(mode_with_gpu)
            cls.mode_with_gpu.check_isfinite = False
        else:
            cls.mode_with_gpu = mode_with_gpu
        if isinstance(mode_without_gpu, DebugMode):
            cls.mode_without_gpu = copy(mode_without_gpu)
            cls.mode_without_gpu.check_isfinite = False
        else:
            cls.mode_without_gpu = mode_without_gpu

    def check_gpu_scalar_op(self, aesara_function, scalar_optype):
        """Return True iff the graph contains a GpuElemwise wrapping
        ``scalar_optype``; on failure, print the graph for debugging."""
        for node in aesara_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuElemwise) and isinstance(
                node.op.scalar_op, scalar_optype
            ):
                return True
        aesara.printing.debugprint(aesara_function)
        return False

    def _check_inverse_erf(self, graph_fn, fn_name, gpu_scalar_op, expected_outputs):
        # Shared body of test_elemwise_erfinv/test_elemwise_erfcinv: compile
        # the graph for host and GPU, check that the GPU graph uses the
        # expected scalar Op, then compare both results against SciPy.
        for dtype in self.dtypes:
            vec = vector(dtype=dtype)
            output = graph_fn(vec)
            f_host = aesara.function(
                [vec],
                output,
                name="HOST/" + fn_name + "/" + dtype,
                mode=self.mode_without_gpu,
            )
            f_gpu = aesara.function(
                [vec],
                output,
                name="GPU/" + fn_name + "/" + dtype,
                mode=self.mode_with_gpu,
            )
            # The host graph must not contain any GPU elemwise node.
            assert (
                len(
                    [
                        n
                        for n in f_host.maker.fgraph.apply_nodes
                        if isinstance(n.op, GpuElemwise)
                    ]
                )
                == 0
            )
            # The specialized scalar Op is only generated on non-OpenCL devices.
            if not aesara.config.device.startswith("opencl"):
                assert self.check_gpu_scalar_op(
                    f_gpu, gpu_scalar_op
                ), f'Function graph does not contains scalar op "{gpu_scalar_op.__name__}".'
            vector_val = self.default_arrays[dtype]
            # Warm-up calls, then compare host/GPU outputs and the reference.
            f_host(vector_val)
            f_gpu(vector_val)
            out_host = f_host(vector_val)
            out_gpu = f_gpu(vector_val)
            assert_allclose(out_host, out_gpu)
            assert_allclose(expected_outputs[dtype], out_gpu)

    def test_elemwise_erfinv(self):
        self._check_inverse_erf(
            erfinv, "erfinv", GpuErfinv, self.expected_erfinv_outputs
        )

    def test_elemwise_erfcinv(self):
        self._check_inverse_erf(
            erfcinv, "erfcinv", GpuErfcinv, self.expected_erfcinv_outputs
        )
class TestFloat16:
    """Smoke tests for float16 support in GPU elemwise graphs and casts."""

    def test_composite_elemwise_float16(self):
        # Elemwise composites mixing float16 with other dtypes must compile.
        bytes_in = bvector()
        halfs = vector(dtype="float16")
        floats = fvector()
        hidden = tanh(halfs + at.cast(floats, "float16"))
        expr = (
            hidden
            - hidden**2
            + at.cast(halfs, "int16")
            + at.cast(halfs, "float32")
            + at.cast(bytes_in, "float16")
            - at.constant(np.float16(1.0))
        )
        aesara.function([bytes_in, halfs, floats], expr, mode=mode_with_gpu)
        # A switch over a three-way float16 multiply must also compile.
        cond = vector(dtype="uint8")
        h1 = vector(dtype="float16")
        h2 = vector(dtype="float16")
        h3 = vector(dtype="float16")
        h4 = vector(dtype="float16")
        selected = at.switch(cond, mul(h1, h2, h3), h4)
        aesara.function([cond, h1, h2, h3, h4], selected, mode=mode_with_gpu)

    def test_cast_float16(self):
        # Exercise casts to/from float16 inside one compiled function.
        half_in = vector(dtype="float16")
        float_in = fvector()
        int_in = bvector()
        fn = aesara.function(
            [half_in, float_in, int_in],
            [
                half_in.astype("float32"),
                float_in.astype("float16"),
                float_in.astype("float64"),
                half_in.astype("int8"),
                float_in.astype("int8"),
                int_in.astype("float16"),
                int_in.astype("float32"),
            ],
            mode=mode_with_gpu,
        )
        half_data = (np.random.rand(4) * 10).astype("float16")
        float_data = (np.random.rand(5) * 10).astype("float32")
        int_data = (np.random.rand(6) * 10).astype("int8")
        results = fn(half_data, float_data, int_data)
        # Map each output back to the host array matching its input's dtype.
        by_dtype = {"float16": half_data, "float32": float_data}
        for pos, out in enumerate(fn.outputs):
            target_dtype = out.variable.dtype
            assert results[pos].dtype == target_dtype
            source = by_dtype.get(out.variable.owner.inputs[0].dtype, int_data)
            assert_allclose(source.astype(target_dtype), results[pos])
class TestGpuDimShuffle(test_elemwise.TestDimShuffle):
    """Reuse the CPU DimShuffle test suite with the GPU Op."""
    op = GpuDimShuffle
class TestGpuCAReduceCPY(test_elemwise.TestCAReduce):
    """Run the generic CAReduce test suite against the GpuCAReduceCPY Op."""
    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
    op = GpuCAReduceCPY
    reds = [aes.add, aes.mul]
    # Optional scalar Op applied to inputs before reduction (none here).
    pre_scalar_op = None
    mode = mode_with_gpu
    def test_perform(self):
        """Run each dtype/reduction combination with the Python linker."""
        for dtype in self.dtypes + self.bin_dtypes:
            for op in self.reds:
                self.with_mode(
                    Mode(linker="py", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_perform_nan(self):
        """Same as test_perform, but with NaN inputs (float dtypes only)."""
        for dtype in self.dtypes:
            if not dtype.startswith("float"):
                continue
            for op in self.reds:
                self.with_mode(
                    Mode(linker="py", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    test_nan=True,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_c(self):
        """Run each dtype/reduction combination with the C linker."""
        for dtype in self.dtypes + self.bin_dtypes:
            for op in self.reds:
                self.with_mode(
                    Mode(linker="c", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_c_nan(self):
        """Same as test_c, but with NaN inputs (float dtypes only)."""
        for dtype in self.dtypes:
            if not dtype.startswith("float"):
                continue
            for op in self.reds:
                self.with_mode(
                    Mode(linker="c", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    test_nan=True,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_infer_shape(self):
        """Check shape inference for every tested dtype."""
        for dtype in self.dtypes:
            super().test_infer_shape(dtype)
class TestGpuCAReduceCuda(TestGpuCAReduceCPY):
    """CUDA-specific CAReduce tests over an extensive list of shape/axis cases."""
    dtypes = ["float32", "int64"]
    bin_dtypes = ["uint8", "int8"]
    # Each entry is (input_shape, axis_spec); axis_spec None reduces all axes.
    # Trailing comments like "# 10"/"# 0111" name the reduction mask pattern.
    cases = [
        ((5, 6), None),
        ((5, 6), (0, 1)),
        ((5, 6), (0,)),
        ((5, 6), (1,)),
        ((5, 6), (-1,)),
        ((5, 6), (-2,)),
        # ((5, 6), ()), #reduce on no axis(copy) isn't implemented
        # ((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
        # ((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
        ((5, 0), None),
        ((5, 0), (0,)),
        ((5, 0), (1,)),
        # ((5, 0), ()), reduce on no axis isn't implemented
        # ((), None), reduce on no axis isn't implemented
        # ((), ()) reduce on no axis isn't implemented
        # Test all GPU cases implemented
        ((1, 0), (1,)),
        ((0, 1), (1,)),
        ((0, 0), (1,)),
        ((0, 0, 0), (1, 2)),
        ((0, 0, 0, 0), (1, 2, 3)),
        ((2, 1), (1,)),
        ((1, 2), (1,)),
        ((100, 3, 1300), [1]),
        ((0,), [0]),
        ((5,), [0]),
        ((0, 0), [0, 1]),
        ((1, 0), [0, 1]),
        ((5, 4), [0, 1]),
        ((33, 31), [0, 1]),
        ((5, 4), [1]),
        ((5, 4), [0]),  # need something bigger then 32 for some opt test.
        ((5, 4, 3), [0]),
        ((5, 4, 3), [1]),
        ((5, 4, 3), [0, 1]),
        ((5, 4, 3), [2]),
        ((5, 4, 3), [1, 2]),
        ((5, 4, 3), [0, 1, 2]),
        ((0, 0, 0, 0), [0, 1, 2, 3]),
        ((5, 4, 3, 20), [2, 3]),
        ((5, 4, 3, 2), [0, 1, 2, 3]),
        ((5, 4, 3, 2), [0, 2, 3]),
        ((5, 4, 3, 2), [1, 2, 3]),
        # test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
        ((4100, 3), [0]),
        ((3, 4101), [0]),  # 10
        ((1024, 33), [0]),
        ((33, 1024), [0]),  # 10
        ((1025, 33), [0]),
        ((33, 1025), [0]),  # 10
        ((4100, 3), [1]),
        ((3, 4101), [1]),  # 01
        ((1024, 33), [1]),
        ((33, 1024), [1]),  # 01
        ((1025, 33), [1]),
        ((33, 1025), [1]),  # 01
        ((4100, 3), [0, 1]),
        ((3, 4101), [0, 1]),  # 11
        ((1024, 33), [0, 1]),
        ((33, 1024), [0, 1]),  # 01
        ((1025, 33), [0, 1]),
        ((33, 1025), [0, 1]),  # 01
        ((4100, 4, 3), [0]),
        ((5, 4100, 3), [0]),
        ((5, 4, 4100), [0]),
        ((3, 65536, 1), [0]),  # 100
        ((4100, 4, 3), [1]),
        ((5, 4100, 3), [1]),
        ((5, 4, 4100), [1]),  # 010
        ((4100, 4, 3), [2]),
        ((5, 4100, 3), [2]),
        ((5, 4, 4100), [2]),  # 001
        ((4100, 4, 3), [0, 1]),
        ((5, 4100, 3), [0, 1]),
        ((5, 4, 4100), [0, 1]),  # 110
        ((4100, 4, 3), [1, 2]),
        ((5, 4100, 3), [1, 2]),
        ((5, 4, 4100), [1, 2]),  # 011
        ((4100, 4, 3), [0, 2]),
        ((5, 4100, 3), [0, 2]),
        ((5, 4, 4100), [0, 2]),  # 101
        ((4100, 4, 3), [0, 1, 2]),
        ((5, 4100, 3), [0, 1, 2]),
        ((5, 4, 4100), [0, 1, 2]),  # 111
        ((65, 4, 3), [0, 1, 2]),
        ((5, 65, 3), [0, 1, 2]),
        ((5, 4, 65), [0, 1, 2]),  # 111
        # reduce over 2d
        ((4100, 4, 3, 2), [2, 3]),
        ((4, 4100, 3, 2), [2, 3]),
        ((4, 3, 4100, 2), [2, 3]),
        ((4, 3, 2, 4100), [2, 3]),  # 0011
        ((4100, 4, 3, 2), [1, 3]),
        ((4, 4100, 3, 2), [1, 3]),
        ((4, 3, 4100, 2), [1, 3]),
        ((4, 3, 2, 4100), [1, 3]),  # 0101
        # ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110 by reshape
        # ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]), # 1001 by reshape
        # ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]), # 1010 not implemented
        # ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]), # 1100 by reshape
        # reduce over 3d
        # 3d not tested: 1101, 1110, 1111
        # ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]), # 1101 by reshape
        # ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110 by reshape
        ((4100, 4, 3, 2), [0, 2, 3]),
        ((4, 4100, 3, 2), [0, 2, 3]),
        ((4, 3, 4100, 2), [0, 2, 3]),  # ((4,3,2,4100),[0,2,3]), # 1011
        ((4100, 4, 3, 2), [1, 2, 3]),
        ((4, 4100, 3, 2), [1, 2, 3]),
        ((4, 3, 4100, 2), [1, 2, 3]),
        ((4, 3, 2, 4100), [1, 2, 3]),  # 0111
        ((65, 4, 3, 2), [1, 2, 3]),
        ((4, 65, 3, 2), [1, 2, 3]),
        ((4, 3, 65, 2), [1, 2, 3]),
        ((4, 3, 2, 65), [1, 2, 3]),  # 0111
        # reduce over 4d
        ((4100, 2, 3, 4), [0, 1, 2, 3]),
        ((2, 4100, 3, 4), [0, 1, 2, 3]),
        ((2, 3, 4100, 4), [0, 1, 2, 3]),
        ((2, 3, 4, 4100), [0, 1, 2, 3]),
        ((128, 1, 3, 3), [0, 1, 2, 3]),  # 1111
        # test pattern implemented by reshape
        # Skip them as this test the op directly, not the optimization with reshape
        # ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
        # ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
        # ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
        # ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
        # ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
        # ((5,4,3,10,11),[1,2]),
    ]
    op = GpuCAReduceCuda
    reds = [aes.add, aes.mul, aes.scalar_maximum, aes.scalar_minimum]
    pre_scalar_op = None
    # The "py" linker variants inherited from the parent class are disabled:
    # this Op is CUDA-only and has no Python implementation to test.
    def test_perform_noopt(self):
        return
    def test_perform(self):
        return
    def test_perform_nan(self):
        return
    def setup_method(self):
        # These tests only make sense on a CUDA device.
        super().setup_method()
        if get_context(test_ctx_name).kind != b"cuda":
            pytest.skip("Cuda specific tests")
class TestGpuReduceDtype(test_elemwise.TestReduceDtype):
    """Run the CPU reduction-dtype test suite against the GPU reduction Ops."""
    mode = mode_with_gpu.excluding("local_cut_useless_reduce")
    # GpuDnnReduction doesn't cover all cases, but should cover some
    op = (GpuCAReduceCuda, GpuDnnReduction)
    # Currently we don't support reduction on 0 axis
    # NOTE(review): axis 1 appears twice below; the second entry may have been
    # meant as -1 — confirm before relying on this list's coverage.
    axes = [None, 0, 1, 1, [0], [1], [0, 1]]
    # We don't support complex dtype
    dtypes = [
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float32",
        "float64",
    ]
    def setup_method(self):
        # These tests only make sense on a CUDA device.
        if get_context(test_ctx_name).kind != b"cuda":
            pytest.skip("Cuda specific tests")
def speed_reduce10():
    """Rough benchmark helper: column sums of a 1000x1000 float32 matrix,
    both directly and through a transpose."""
    values = np.random.rand(1000, 1000).astype("float32")
    m = fmatrix()
    fn = aesara.function([m], [m.sum(axis=0), m.T.sum(axis=0)], mode=mode_with_gpu)
    fn(values)
from functools import partial
from itertools import product
import numpy as np
import pytest
import aesara
import aesara.tensor.math as tm
from aesara.gpuarray.extra_ops import GpuCumOp
from aesara.gpuarray.type import get_context
from aesara.tensor.extra_ops import CumOp
from aesara.tensor.type import fmatrix, ftensor3, ftensor4, fvector, tensor3
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.test_extra_ops import TestCumOp
class TestGpuCumOp(TestCumOp):
    """Run the CumOp test suite through GpuCumOp on a CUDA device.

    The GPU kernel is limited by the device's block/grid sizes, so several
    tests size their inputs from ``max_threads_dim0``/``max_grid_size1`` to
    exercise the multi-block and recursive code paths.
    """

    # NOTE: the decorators below were previously spelled
    # ``@pytest.mark.parametrized``, which is not a pytest marker; pytest
    # would then fail each test with "fixture 'mode' not found".
    # The correct marker name is ``parametrize``.

    mode = mode_with_gpu

    def setup_method(self):
        super().setup_method()
        test_ctx = get_context(test_ctx_name)
        if test_ctx.kind != b"cuda":
            pytest.skip("Cuda specific tests")
        # Device limits used to build inputs that span multiple blocks/grids.
        self.max_threads_dim0 = test_ctx.maxlsize0
        self.max_grid_size1 = test_ctx.maxgsize2
        self.op_class = CumOp
        # The CPU implementation is not so accurate, which throws out DebugMode.
        # Since propagating .tag.values_eq_approx to the output of every
        # GpuFromHost seems overkill, we just relax the rtol for these tests
        self.old_rtol = tm.float32_rtol
        tm.float32_rtol *= 2

    def teardown_method(self):
        super().teardown_method()
        # Restore rtol
        tm.float32_rtol = self.old_rtol

    @pytest.mark.skipif(
        aesara.config.floatX != "float32",
        reason=f"Gpucumop not implemented for dtype {aesara.config.floatX}",
    )
    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_infer_shape(self, mode):
        """Check shape inference of GpuCumOp over every axis of a 3d input."""
        op_class = partial(self.op_class, mode=mode)
        x = tensor3("x")
        a = np.random.random((3, 5, 2)).astype(aesara.config.floatX)
        for axis in range(-len(a.shape), len(a.shape)):
            self._compile_and_check([x], [op_class(axis=axis)(x)], [a], GpuCumOp)

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides1D(self, mode):
        """Check 1d cumsum/cumprod over strided (sliced) inputs."""
        op_class = partial(self.op_class, mode=mode)
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        x = fvector("x")
        for axis in (0, None, -1):
            a = np.random.random((42,)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )
            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]
            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides2D(self, mode):
        """Check 2d cumsum/cumprod over strided (sliced) inputs."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        x = fmatrix("x")
        for axis in (0, 1, None, -1, -2):
            a = np.random.random((42, 30)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )
            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]
            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides3D(self, mode):
        """Check 3d cumsum/cumprod over strided (sliced) inputs."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        x = ftensor3("x")
        for axis in (0, 1, 2, None, -1, -2, -3):
            a = np.random.random((42, 30, 25)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )
            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]
            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp1D(self, mode):
        """Check 1d results against NumPy, including multi-block and
        recursive kernel paths."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2
        x = fvector("x")
        f = aesara.function([x], op_class(axis=0)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]
        # Extensive testing for the first 1025 sizes
        a = np.random.random(1025).astype("float32")
        for i in range(a.shape[0]):
            utt.assert_allclose(np_func(a[:i]), f(a[:i]))
        # Use multiple GPU threadblocks
        a = np.random.random((block_max_size + 2,)).astype("float32")
        utt.assert_allclose(np_func(a), f(a))
        # Use recursive cumop
        a = np.ones((block_max_size * (block_max_size + 1) + 2,), dtype="float32")
        utt.assert_allclose(np_func(a), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp2D(self, mode):
        """Check 2d results against NumPy over every axis, including
        multi-block, multi-grid and recursive kernel paths."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2
        x = fmatrix("x")
        for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
            f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]
            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.random(a_shape).astype("float32")
            slices = [slice(None), slice(None)]
            for i in range(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
                npa = np_func(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)
            # Use multiple GPU threadblocks
            a_shape = [5, 5]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            # Use multiple GPU gridblocks
            a_shape = [4, 4]
            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a), rtol=5e-5)
            # Use recursive cumop
            a_shape = [3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np_func(a, axis=axis), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp3D(self, mode):
        """Check 3d results against NumPy over every axis, including
        multi-block, multi-grid and recursive kernel paths."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2
        x = ftensor3("x")
        for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]):
            f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]
            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.rand(*a_shape).astype("float32")
            slices = [slice(None), slice(None), slice(None)]
            for i in range(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
                npa = np_func(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)
            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            # Use recursive cumop (along accumulation axis)
            a_shape = [3, 3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np_func(a, axis=axis), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp4D(self, mode):
        """4d inputs are not supported on the GPU; the graph must keep CumOp."""
        op_class = partial(self.op_class, mode=mode)
        # Should not use the GPU version.
        x = ftensor4("x")
        f = aesara.function([x], op_class(axis=1)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, CumOp)]
import numpy as np
import pytest
import aesara
import aesara.gpuarray.fft
from aesara.gpuarray.fft import pycuda_available, pygpu_available, skcuda_available
from aesara.tensor.type import matrix
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu
# Skip tests if pygpu is not available.
# The cuFFT wrappers need pygpu, scikit-cuda and pycuda at import time.
if not pygpu_available: # noqa
    pytest.skip("Optional package pygpu not available", allow_module_level=True)
if not skcuda_available: # noqa
    pytest.skip("Optional package scikit-cuda not available", allow_module_level=True)
if not pycuda_available: # noqa
    pytest.skip("Optional package pycuda not available", allow_module_level=True)
# Transform sizes
# (edge length of every FFT test input in this module)
N = 32
class TestFFT:
    """Compare the GPU cuFFT Ops (curfft/cuirfft) against numpy.fft.

    GPU rFFT outputs store the complex result in a trailing axis of size 2
    (real part at index 0, imaginary part at index 1).
    """
    def test_1Dfft(self):
        """Forward/inverse 1d rFFT round trip plus gradient verification."""
        inputs_val = np.random.random((1, N)).astype("float32")
        x = matrix("x", dtype="float32")
        rfft = aesara.gpuarray.fft.curfft(x)
        f_rfft = aesara.function([x], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft(inputs_val)
        # Rebuild the complex result from the trailing real/imag axis.
        res_rfft_comp = np.asarray(res_rfft[:, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, 1]
        )
        rfft_ref = np.fft.rfft(inputs_val, axis=1)
        utt.assert_allclose(rfft_ref, res_rfft_comp)
        m = rfft.type()
        irfft = aesara.gpuarray.fft.cuirfft(m)
        f_irfft = aesara.function([m], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft(res_rfft)
        utt.assert_allclose(inputs_val, np.asarray(res_irfft))
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp)
        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
    def test_rfft(self):
        """2d forward rFFT against numpy.fft.rfftn."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        rfft = aesara.gpuarray.fft.curfft(inputs)
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        rfft_ref = np.fft.rfftn(inputs_val, axes=(1, 2))
        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
    def test_irfft(self):
        """2d inverse rFFT: round trip and comparison with numpy.fft.irfftn."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        fft = aesara.gpuarray.fft.curfft(inputs)
        f_fft = aesara.function([], fft, mode=mode_with_gpu)
        res_fft = f_fft()
        m = fft.type()
        ifft = aesara.gpuarray.fft.cuirfft(m)
        f_ifft = aesara.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_fft)
        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
        inputs_val = np.random.random((1, N, N, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        irfft = aesara.gpuarray.fft.cuirfft(inputs)
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        inputs_ref = inputs_val[..., 0] + inputs_val[..., 1] * 1j
        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))
        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
    def test_type(self):
        """Non-float32 inputs must be rejected."""
        inputs_val = np.random.random((1, N)).astype("float64")
        inputs = aesara.shared(inputs_val)
        with pytest.raises(AssertionError):
            aesara.gpuarray.fft.curfft(inputs)
        with pytest.raises(AssertionError):
            aesara.gpuarray.fft.cuirfft(inputs)
    def test_norm(self):
        """Check the "ortho" and "no_norm" normalization modes against the
        (default-normalized) NumPy references."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        # Unitary normalization
        rfft = aesara.gpuarray.fft.curfft(inputs, norm="ortho")
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        rfft_ref = np.fft.rfftn(inputs_val, axes=(1, 2))
        utt.assert_allclose(rfft_ref / N, res_rfft_comp, atol=1e-4, rtol=1e-4)
        # No normalization
        # (rfft_ref from above is reused: an unnormalized forward transform
        # matches NumPy's default directly)
        rfft = aesara.gpuarray.fft.curfft(inputs, norm="no_norm")
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
        # Inverse FFT inputs
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
        # Unitary normalization inverse FFT
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="ortho")
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))
        utt.assert_allclose(irfft_ref * N, res_irfft, atol=1e-4, rtol=1e-4)
        # No normalization inverse FFT
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="no_norm")
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        utt.assert_allclose(irfft_ref * N**2, res_irfft, atol=1e-4, rtol=1e-4)
    def test_grad(self):
        """Numerically verify gradients of the 2d transforms in several
        normalization modes."""
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, N, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp)
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp, norm="ortho")
        inputs_val = np.random.random((1, N, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, norm="no_norm")
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
    def test_odd(self):
        """Repeat round-trip, normalization and gradient checks with an odd
        transform size (requires is_odd=True on the inverse)."""
        M = N - 1
        inputs_val = np.random.random((1, M, M)).astype("float32")
        inputs = aesara.shared(inputs_val)
        rfft = aesara.gpuarray.fft.curfft(inputs)
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        rfft_ref = np.fft.rfftn(inputs_val, s=(M, M), axes=(1, 2))
        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
        m = rfft.type()
        ifft = aesara.gpuarray.fft.cuirfft(m, is_odd=True)
        f_ifft = aesara.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_rfft)
        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="ortho", is_odd=True)
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
        irfft_ref = np.fft.irfftn(inputs_ref, s=(M, M), axes=(1, 2)) * M
        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, M, M)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, is_odd=True)
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp, norm="ortho")
        inputs_val = np.random.random((1, M, M)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, norm="no_norm", is_odd=True)
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
    def test_params(self):
        """Invalid norm/is_odd arguments must raise ValueError."""
        inputs_val = np.random.random((1, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        with pytest.raises(ValueError):
            aesara.gpuarray.fft.curfft(inputs, norm=123)
        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        with pytest.raises(ValueError):
            aesara.gpuarray.fft.cuirfft(inputs, norm=123)
        with pytest.raises(ValueError):
            aesara.gpuarray.fft.cuirfft(inputs, is_odd=123)
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray.blas import GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.nnet.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, ref_cast
from tests.tensor.nnet.test_abstract_conv import (
TestAsymmetricPadding,
TestCausalConv,
TestGroupedConvNoOptim,
TestUnsharedConv,
)
class TestCorrMM:
    """Compare the GPU correlation ops (`GpuCorrMM` and its gradient ops)
    against their CPU reference implementations (`CorrMM` family)."""

    def run_conv_valid(
        self,
        inputs_shape,
        filters_shape,
        border_mode="valid",
        filter_dilation=(1, 1),
        subsample=(1, 1),
        unshared=False,
        verify_grad=False,
    ):
        """Run one forward correlation on CPU and GPU and compare outputs.

        Shapes are supplied channels-last and permuted here to the
        channels-first (batch, channels, rows, cols) layout the ops expect.
        With ``verify_grad=True`` the GPU op's gradient is also checked
        numerically.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        if unshared:
            # Unshared filters carry two extra output-position dimensions.
            filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
        else:
            filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # CPU reference graph.
        conv_ref = CorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(ref_cast(inputs), ref_cast(filters))
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        # GPU graph with identical op parameters.
        conv = GpuCorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(inputs, filters)
        f = aesara.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(
                GpuCorrMM(
                    border_mode=border_mode,
                    filter_dilation=filter_dilation,
                    subsample=subsample,
                    unshared=unshared,
                ),
                [inputs_val, filters_val],
                mode=mode_with_gpu,
            )

    def test_valid(self):
        """Forward correlation with several subsampling (stride) settings."""
        self.run_conv_valid(inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1))
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(2, 2)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(3, 3)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(3, 2)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(1, 2)
        )

    def test_border_mode(self):
        """Forward correlation with named and explicit padding modes."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode="valid",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode="half",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode="full",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode=(0, 0),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode=(1, 2),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode=(3, 2),
        )

    def test_filter_dilation(self):
        """Forward correlation with dilated filters across border modes."""
        inputs_shape = [16, 20, 12, 1]
        filters_shape = [10, 6, 5, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ["valid", "half", "full"]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                )

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 1]
        filters_shape = [1, 3, 3, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ["valid", "half", "full", (2, 1)]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                    verify_grad=True,
                )

    def test_unshared(self):
        """Forward correlation with unshared (per-position) filters."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 8, 1, 6, 12, 1),
            subsample=(2, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 5, 1, 6, 12, 1),
            subsample=(3, 3),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 5, 1, 6, 12, 1),
            subsample=(3, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            subsample=(1, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            border_mode="valid",
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 21, 13, 6, 12, 1),
            border_mode="half",
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 25, 23, 6, 12, 1),
            border_mode="full",
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            border_mode=(0, 0),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 17, 5, 6, 12, 1),
            border_mode=(1, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 21, 5, 6, 12, 1),
            border_mode=(3, 2),
            unshared=True,
        )

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1)):
        """Compare the GPU filter-gradient op against the CPU reference.

        When subsampling, the target filter shape must be passed
        explicitly because it cannot be inferred from the output gradient.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))
        if subsample == (1, 1):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH)
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH), shape=shape
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        """Filter gradient for several subsampling settings."""
        self.run_gradweight(
            inputs_shape=(16, 10, 12, 1),
            filters_shape=(10, 6, 12, 1),
            dCdH_shape=(16, 5, 1, 10),
            subsample=(1, 1),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 1),
            filters_shape=(10, 6, 4, 1),
            dCdH_shape=(16, 8, 4, 10),
            subsample=(2, 2),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 1),
            filters_shape=(10, 6, 3, 1),
            dCdH_shape=(16, 5, 3, 10),
            subsample=(3, 3),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            dCdH_shape=(16, 8, 1, 10),
            subsample=(2, 1),
        )

    def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1)):
        """Compare the GPU input-gradient op against the CPU reference.

        ``inputs`` here plays the role of the output gradient (topgrad);
        the original input shape is reconstructed from it and the
        subsampling factors.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Spatial size of the (reconstructed) bottom input.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_shape = gpuarray_shared_constructor(
            np.array([bottom_height, bottom_width])
        )
        if subsample == (1, 1):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs)
            )
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs
            )
        else:
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape
            )
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        """Input gradient for several subsampling settings."""
        self.run_gradinput(inputs_shape=(16, 15, 12, 10), filters_shape=(10, 6, 12, 1))
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 10),
            filters_shape=(10, 6, 12, 1),
            subsample=(2, 2),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 10),
            filters_shape=(10, 6, 12, 1),
            subsample=(3, 3),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 10),
            filters_shape=(10, 6, 12, 1),
            subsample=(3, 1),
        )

    def test_large_input(self):
        # This tests the number-of-threads computation
        # by making (channels * height) > (max_threads_dim ** 2).
        # (See also issue #5165.)
        self.run_conv_valid(
            inputs_shape=(1, 1024, 3, 1024),
            filters_shape=(1, 1, 1, 1024),
            verify_grad=False,
        )
        self.run_gradinput(inputs_shape=(1, 1024, 3, 1), filters_shape=(1, 1, 1, 1024))
class TestGroupGpuCorr2d(TestGroupedConvNoOptim):
    """Grouped-convolution tests re-run with the GPU corrMM ops (cuDNN excluded)."""

    mode = mode_with_gpu.excluding("cudnn")
    conv_op = GpuCorrMM
    conv_gradw_op = GpuCorrMM_gradWeights
    conv_gradi_op = GpuCorrMM_gradInputs
    flip_filter = True
    is_dnn = False
class TestUnsharedGpuCorr2d(TestUnsharedConv):
    """Unshared-convolution tests re-run with the GPU corrMM ops."""

    mode = mode_with_gpu
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
class TestAsymmetricGpu(TestAsymmetricPadding):
    """Asymmetric-padding tests re-run with the GPU corrMM ops."""

    mode = mode_with_gpu
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
class TestCausalGpuCorr(TestCausalConv):
    """Causal-convolution tests re-run in the GPU mode."""

    mode = mode_with_gpu
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray.blas import (
GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
)
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.nnet.corr3d import Corr3dMM, Corr3dMMGradInputs, Corr3dMMGradWeights
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, ref_cast
from tests.tensor.nnet.test_abstract_conv import TestGroupedConv3dNoOptim
class TestCorr3dMM:
    """Compare the GPU 3D correlation ops (`GpuCorr3dMM` and its gradient
    ops) against their CPU references (`Corr3dMM` family)."""

    def run_conv_valid(
        self,
        inputs_shape,
        filters_shape,
        border_mode="valid",
        filter_dilation=(1, 1, 1),
        subsample=(1, 1, 1),
        verify_grad=False,
    ):
        """Run one forward 3D correlation on CPU and GPU and compare.

        Shapes are supplied channels-last and permuted here to the
        channels-first (batch, channels, d0, d1, d2) layout the ops expect.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # CPU reference graph.
        conv_ref = Corr3dMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
        )(ref_cast(inputs), ref_cast(filters))
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        # GPU graph with identical op parameters.
        conv = GpuCorr3dMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
        )(inputs, filters)
        f = aesara.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(
                GpuCorr3dMM(
                    border_mode=border_mode,
                    filter_dilation=filter_dilation,
                    subsample=subsample,
                ),
                [inputs_val, filters_val],
                mode=mode_with_gpu,
            )

    def test_valid(self):
        """Forward 3D correlation with several subsampling settings."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 16, 1), filters_shape=(10, 6, 12, 4, 1)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(2, 2, 2),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(2, 2, 2),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 3, 3),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 3, 3),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 2, 1),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(1, 2, 3),
        )

    def test_border_mode(self):
        """Forward 3D correlation with named and explicit padding modes."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode="valid",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode="half",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode="full",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode=(0, 0, 0),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode=(1, 2, 3),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode=(3, 2, 1),
        )

    def test_filter_dilation(self):
        """Forward 3D correlation with dilated filters across border modes."""
        inputs_shape = [16, 20, 12, 15, 1]
        filters_shape = [10, 6, 5, 4, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ["valid", "half", "full"]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                )

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 6, 1]
        filters_shape = [1, 3, 3, 2, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ["valid", "half", "full", (2, 1, 3)]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                    verify_grad=True,
                )

    def run_gradweight(
        self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1, 1)
    ):
        """Compare the GPU 3D filter-gradient op against the CPU reference.

        When subsampling, the target filter shape is passed explicitly
        because it cannot be inferred from the output gradient.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMMGradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH)
            )
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(inputs, dCdH)
        else:
            conv_ref = Corr3dMMGradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH), shape=shape
            )
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        """3D filter gradient for several subsampling settings."""
        self.run_gradweight(
            inputs_shape=(16, 10, 12, 16, 1),
            filters_shape=(10, 6, 12, 4, 1),
            dCdH_shape=(16, 5, 1, 13, 10),
            subsample=(1, 1, 1),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 16, 1),
            filters_shape=(10, 6, 4, 4, 1),
            dCdH_shape=(16, 8, 4, 7, 10),
            subsample=(2, 2, 2),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 16, 1),
            filters_shape=(10, 6, 3, 4, 1),
            dCdH_shape=(16, 5, 3, 5, 10),
            subsample=(3, 3, 3),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 12, 16, 1),
            filters_shape=(10, 6, 12, 4, 1),
            dCdH_shape=(16, 8, 1, 5, 10),
            subsample=(2, 1, 3),
        )

    def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1, 1)):
        """Compare the GPU 3D input-gradient op against the CPU reference.

        ``inputs`` plays the role of the output gradient (topgrad); the
        original input shape is reconstructed from it and the subsampling.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Spatial size of the (reconstructed) bottom input.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_depth = (inputs_shape[4] - 1) * subsample[2] + filters_shape[4]
        bottom_shape = gpuarray_shared_constructor(
            np.array([bottom_height, bottom_width, bottom_depth])
        )
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMMGradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs)
            )
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs
            )
        else:
            conv_ref = Corr3dMMGradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape
            )
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        """3D input gradient for several subsampling settings."""
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10), filters_shape=(10, 6, 12, 4, 1)
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(2, 2, 2),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 3, 3),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 1, 2),
        )

    def test_large_input(self):
        # This tests the number-of-threads computation
        # by making (channels * height) > (max_threads_dim ** 2).
        # (See also issue #5165.)
        self.run_conv_valid(
            inputs_shape=(1, 1024, 3, 3, 1024),
            filters_shape=(1, 1, 1, 1, 1024),
            verify_grad=False,
        )
        self.run_gradinput(
            inputs_shape=(1, 1024, 3, 3, 1), filters_shape=(1, 1, 1, 1, 1024)
        )
class TestGroupGpuCorr3d(TestGroupedConv3dNoOptim):
    """Grouped 3D-convolution tests re-run with the GPU corrMM ops (cuDNN excluded)."""

    mode = mode_with_gpu.excluding("cudnn")
    conv_op = GpuCorr3dMM
    conv_gradw_op = GpuCorr3dMM_gradWeights
    conv_gradi_op = GpuCorr3dMM_gradInputs
import numpy as np
import pytest
from numpy.linalg.linalg import LinAlgError
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import gpuarray_shared_constructor
from aesara.gpuarray.linalg import (
GpuCholesky,
GpuCublasTriangularSolve,
GpuCusolverSolve,
GpuMagmaCholesky,
GpuMagmaEigh,
GpuMagmaMatrixInverse,
GpuMagmaQR,
GpuMagmaSVD,
cusolver_available,
gpu_cholesky,
gpu_matrix_inverse,
gpu_qr,
gpu_solve,
gpu_solve_lower_triangular,
gpu_svd,
)
from aesara.tensor.nlinalg import SVD, MatrixInverse, QRFull, eigh, matrix_inverse, qr
from aesara.tensor.slinalg import Cholesky, cholesky
from aesara.tensor.type import fmatrix, matrix, tensor3, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.gpuarray.test_basic_ops import random
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestCusolver:
    """Tests for the cusolver-backed GPU linear solver (`gpu_solve`)."""

    def run_gpu_solve(self, A_val, x_val, A_struct=None):
        """Solve ``A x = b`` and ``A.T x = b`` on the GPU; recover ``x_val``.

        The right-hand sides are built from the known solution so the
        solver output can be compared against it directly.
        """
        b_val = np.dot(A_val, x_val)
        b_val_trans = np.dot(A_val.T, x_val)
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        b_trans = matrix("b", dtype="float32")
        if A_struct is None:
            solver = gpu_solve(A, b)
            solver_trans = gpu_solve(A, b_trans, trans="T")
        else:
            solver = gpu_solve(A, b, A_struct)
            solver_trans = gpu_solve(A, b_trans, A_struct, trans="T")
        fn = aesara.function(
            [A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu
        )
        res = fn(A_val, b_val, b_val_trans)
        x_res = np.array(res[0])
        x_res_trans = np.array(res[1])
        utt.assert_allclose(x_val, x_res)
        utt.assert_allclose(x_val, x_res_trans)

    def test_diag_solve(self):
        """Solve with a diagonal system matrix."""
        np.random.seed(1)
        A_val = np.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_bshape_solve(self):
        # Test when shape of b (k, m) is such as m > k
        np.random.seed(1)
        A_val = np.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32")
        x_val = np.random.uniform(
            -0.4, 0.4, (A_val.shape[1], A_val.shape[1] + 1)
        ).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        """Solve with a symmetric system matrix (A_struct='symmetric')."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = np.dot(A_val, A_val.T)
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val, "symmetric")

    def test_orth_solve(self):
        """Solve with an orthogonal system matrix (left singular vectors)."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = np.linalg.svd(A_val)[0]
        x_val = np.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        """Solve with a uniformly random matrix and multiple right-hand sides."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_linalgerrsym_solve(self):
        """A singular symmetric matrix must raise ``LinAlgError``."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        A_val = np.dot(A_val.T, A_val)
        # make A singular
        A_val[:, 2] = A_val[:, 1] + A_val[:, 3]
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        solver = gpu_solve(A, b, "symmetric")
        fn = aesara.function([A, b], [solver], mode=mode_with_gpu)
        with pytest.raises(LinAlgError):
            fn(A_val, x_val)

    def test_linalgerr_solve(self):
        """A singular general matrix must raise ``LinAlgError``."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        # make A singular
        A_val[:, 2] = 0
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        solver = gpu_solve(A, b, trans="T")
        fn = aesara.function([A, b], [solver], mode=mode_with_gpu)
        with pytest.raises(LinAlgError):
            fn(A_val, x_val)

    def verify_solve_grad(self, m, n, A_structure, lower, rng):
        """Numerically verify the gradient of the appropriate solve op.

        ``n is None`` means ``b`` is a vector; otherwise ``b`` is (m, n).
        """
        # ensure diagonal elements of A relatively large to avoid numerical
        # precision issues
        A_val = (rng.normal(size=(m, m)) * 0.5 + np.eye(m)).astype(config.floatX)
        if A_structure == "lower_triangular":
            A_val = np.tril(A_val)
        elif A_structure == "upper_triangular":
            A_val = np.triu(A_val)
        if n is None:
            b_val = rng.normal(size=m).astype(config.floatX)
        else:
            b_val = rng.normal(size=(m, n)).astype(config.floatX)
        eps = None
        if config.floatX == "float64":
            eps = 2e-8
        if A_structure in ("lower_triangular", "upper_triangular"):
            solve_op = GpuCublasTriangularSolve(lower=lower)
        else:
            solve_op = GpuCusolverSolve(A_structure="general")
        utt.verify_grad(solve_op, [A_val, b_val], 3, rng, eps=eps)

    def test_solve_grad(self):
        """Gradient checks for general and triangular solve structures."""
        rng = np.random.default_rng(utt.fetch_seed())
        structures = ["general", "lower_triangular", "upper_triangular"]
        for A_structure in structures:
            lower = A_structure == "lower_triangular"
            # self.verify_solve_grad(5, None, A_structure, lower, rng)
            self.verify_solve_grad(6, 1, A_structure, lower, rng)
            self.verify_solve_grad(4, 3, A_structure, lower, rng)
        # lower should have no effect for A_structure == 'general' so also
        # check lower=True case
        self.verify_solve_grad(4, 3, "general", lower=True, rng=rng)
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestGpuCholesky:
    """Tests for the cusolver-backed `GpuCholesky` op on float32 input."""

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = matrix("A", dtype="float32")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return aesara.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # NumPy always returns the lower factor; transpose for upper.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_gpu_cholesky_opt(self):
        """`cholesky` on float32 should be lifted to `GpuCholesky`."""
        A = matrix("A", dtype="float32")
        fn = aesara.function([A], cholesky(A), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuCholesky) for node in fn.maker.fgraph.toposort()]
        )

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float32")
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(ValueError):
            fn(A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = vector("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = tensor3("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float32") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float32")
                # A = M.dot(M) will be positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            A_val = np.random.normal(size=(3, 3)).astype("float32")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float32")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestGpuCholesky64:
    """Same coverage as `TestGpuCholesky` but with float64 input."""

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = matrix("A", dtype="float64")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return aesara.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # NumPy always returns the lower factor; transpose for upper.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_gpu_cholesky_opt(self):
        """`cholesky` on float64 should be lifted to `GpuCholesky`."""
        A = matrix("A", dtype="float64")
        fn = aesara.function([A], cholesky(A), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuCholesky) for node in fn.maker.fgraph.toposort()]
        )

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float64")
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(ValueError):
            fn(A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = vector("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = tensor3("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float64")
                # A = M.dot(M) will be positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            A_val = np.random.normal(size=(3, 3)).astype("float64")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float64")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)
@pytest.mark.skipif(
not config.magma__enabled, reason="Magma is not enabled, skipping test"
)
class TestMagma:
def test_magma_opt_float16(self):
    """float16 linear-algebra graphs should be rewritten to magma GPU ops."""
    ops_to_gpu = [
        (MatrixInverse(), GpuMagmaMatrixInverse),
        (SVD(), GpuMagmaSVD),
        (QRFull(mode="reduced"), GpuMagmaQR),
        # TODO: add support for float16 to Eigh numpy
        # (Eigh(), GpuMagmaEigh),
        (Cholesky(), GpuMagmaCholesky),
    ]
    for op, gpu_op in ops_to_gpu:
        A = matrix("A", dtype="float16")
        # cusolver is excluded so the magma variant is the only candidate.
        fn = aesara.function([A], op(A), mode=mode_with_gpu.excluding("cusolver"))
        assert any(
            [isinstance(node.op, gpu_op) for node in fn.maker.fgraph.toposort()]
        )
def test_gpu_matrix_inverse(self):
    """Invert a random 1000x1000 matrix on the GPU; check A^-1 @ A ~ I."""
    A = fmatrix("A")
    invert = aesara.function([A], gpu_matrix_inverse(A), mode=mode_with_gpu)
    N = 1000
    test_rng = np.random.default_rng(seed=1)
    # Copied from tests.tensor.utils.random.
    A_val = test_rng.random((N, N)).astype("float32") * 2 - 1
    utt.assert_allclose(np.eye(N), np.dot(invert(A_val), A_val), atol=1e-2)
@utt.assertFailure_fast
def test_gpu_matrix_inverse_inplace(self):
    """With an update on the shared input, the optimizer should pick the
    in-place magma inverse, and the shared variable ends up holding the
    inverse of its original value."""
    N = 1000
    test_rng = np.random.default_rng(seed=1)
    A_val_gpu = gpuarray_shared_constructor(
        test_rng.random((N, N)).astype("float32") * 2 - 1
    )
    # Keep a host-side copy of the original value for the final check.
    A_val_copy = A_val_gpu.get_value()
    A_val_gpu_inv = GpuMagmaMatrixInverse()(A_val_gpu)
    fn = aesara.function(
        [], A_val_gpu_inv, mode=mode_with_gpu, updates=[(A_val_gpu, A_val_gpu_inv)]
    )
    # The update lets the optimizer destroy the input, so an in-place
    # inverse node must appear in the compiled graph.
    assert any(
        [
            node.op.inplace
            for node in fn.maker.fgraph.toposort()
            if isinstance(node.op, GpuMagmaMatrixInverse)
        ]
    )
    fn()
    utt.assert_allclose(
        np.eye(N), np.dot(A_val_gpu.get_value(), A_val_copy), atol=5e-3
    )
@utt.assertFailure_fast
def test_gpu_matrix_inverse_inplace_opt(self):
    """The optimizer should select the in-place magma matrix inverse."""
    A = fmatrix("A")
    fn = aesara.function([A], matrix_inverse(A), mode=mode_with_gpu)
    inverse_ops = [
        node.op
        for node in fn.maker.fgraph.toposort()
        if isinstance(node.op, GpuMagmaMatrixInverse)
    ]
    assert any(op.inplace for op in inverse_ops)
def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
    """Compile `gpu_svd` with the given options and run it on `A_val`."""
    A = fmatrix("A")
    svd_graph = gpu_svd(A, full_matrices=full_matrices, compute_uv=compute_uv)
    f = aesara.function([A], svd_graph, mode=mode_with_gpu)
    return f(A_val)
def assert_column_orthonormal(self, Ot):
    """Check that the columns of `Ot` are orthonormal (Ot.T @ Ot == I)."""
    gram = np.dot(Ot.T, Ot)
    utt.assert_allclose(gram, np.eye(Ot.shape[1]))
def check_svd(self, A, U, S, VT, rtol=None, atol=None):
    """Verify that ``U @ diag(S) @ VT`` reconstructs ``A``."""
    S_full = np.zeros_like(A)
    np.fill_diagonal(S_full, S)
    reconstructed = np.dot(np.dot(U, S_full), VT)
    utt.assert_allclose(reconstructed, A, rtol=rtol, atol=atol)
def test_gpu_svd_wide(self):
    """SVD of a 100x50 matrix, full and reduced factorizations."""
    A = random(100, 50).astype("float32")
    M, N = A.shape
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    # BUG FIX: the original `assert U.shape[1], min(M, N)` used the comma
    # form, which makes `min(M, N)` the assertion *message* and leaves the
    # (always-truthy) shape as the condition, so the check could never
    # fail.  Compare the reduced-factor shapes explicitly.
    assert U.shape[1] == min(M, N)
    self.assert_column_orthonormal(U)
    assert VT.shape[0] == min(M, N)
    self.assert_column_orthonormal(VT.T)
def test_gpu_svd_tall(self):
    """SVD of a 50x100 matrix, full and reduced factorizations."""
    A = random(50, 100).astype("float32")
    M, N = A.shape
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    # BUG FIX: `assert expr, msg` treats the second operand as the
    # assertion message, so the original `assert U.shape[1], min(M, N)`
    # was always true.  Compare the reduced-factor shapes explicitly.
    assert U.shape[1] == min(M, N)
    self.assert_column_orthonormal(U)
    assert VT.shape[0] == min(M, N)
    self.assert_column_orthonormal(VT.T)
def test_gpu_singular_values(self):
    """Singular values only (compute_uv=False) must match CPU vs GPU."""
    A = fmatrix("A")
    f_cpu = aesara.function(
        [A], aesara.tensor.nlinalg.svd(A, compute_uv=False), mode=mode_without_gpu
    )
    f_gpu = aesara.function([A], gpu_svd(A, compute_uv=False), mode=mode_with_gpu)
    # Check both a tall and a wide input.
    for shape in ((50, 100), (100, 50)):
        A_val = random(*shape).astype("float32")
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def run_gpu_cholesky(self, A_val, lower=True):
    """Compile `GpuMagmaCholesky` (cusolver excluded) and run it on `A_val`."""
    A = fmatrix("A")
    chol_graph = GpuMagmaCholesky(lower=lower)(A)
    f = aesara.function([A], chol_graph, mode=mode_with_gpu.excluding("cusolver"))
    return f(A_val)
def rand_symmetric(self, N):
A = random(N, N).astype("float32")
# ensure that eigenvalues are not too small which sometimes results in
# magma cholesky failure due to gpu limited numerical precision
D, W = np.linalg.eigh(A)
D[D < 1] = 1
V_m = np.zeros_like(A)
np.fill_diagonal(V_m, D)
return np.dot(np.dot(W.T, V_m), W)
def check_cholesky(self, N, lower=True, rtol=None, atol=None):
A = self.rand_symmetric(N)
L = self.run_gpu_cholesky(A, lower=lower)
if not lower:
L = L.T
utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)
def test_gpu_cholesky(self):
self.check_cholesky(1000, atol=1e-3)
self.check_cholesky(1000, lower=False, atol=1e-3)
def test_gpu_cholesky_opt(self):
A = matrix("A", dtype="float32")
fn = aesara.function([A], cholesky(A), mode=mode_with_gpu.excluding("cusolver"))
assert any(
[
isinstance(node.op, GpuMagmaCholesky)
for node in fn.maker.fgraph.toposort()
]
)
@utt.assertFailure_fast
def test_gpu_cholesky_inplace(self):
A = self.rand_symmetric(1000)
A_gpu = gpuarray_shared_constructor(A)
A_copy = A_gpu.get_value()
C = GpuMagmaCholesky()(A_gpu)
fn = aesara.function([], C, mode=mode_with_gpu, updates=[(A_gpu, C)])
assert any(
[
node.op.inplace
for node in fn.maker.fgraph.toposort()
if isinstance(node.op, GpuMagmaCholesky)
]
)
fn()
L = A_gpu.get_value()
utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)
@utt.assertFailure_fast
def test_gpu_cholesky_inplace_opt(self):
A = fmatrix("A")
fn = aesara.function([A], GpuMagmaCholesky()(A), mode=mode_with_gpu)
assert any(
[
node.op.inplace
for node in fn.maker.fgraph.toposort()
if isinstance(node.op, GpuMagmaCholesky)
]
)
def run_gpu_qr(self, A_val, complete=True):
A = fmatrix("A")
fn = aesara.function([A], gpu_qr(A, complete=complete), mode=mode_with_gpu)
return fn(A_val)
def check_gpu_qr(self, M, N, complete=True, rtol=None, atol=None):
A = random(M, N).astype("float32")
if complete:
Q_gpu, R_gpu = self.run_gpu_qr(A, complete=complete)
else:
R_gpu = self.run_gpu_qr(A, complete=complete)
Q_np, R_np = np.linalg.qr(A, mode="reduced")
utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
if complete:
utt.assert_allclose(Q_np, Q_gpu, rtol=rtol, atol=atol)
def test_gpu_qr(self):
self.check_gpu_qr(1000, 500, atol=1e-3)
self.check_gpu_qr(1000, 500, complete=False, atol=1e-3)
self.check_gpu_qr(500, 1000, atol=1e-3)
self.check_gpu_qr(500, 1000, complete=False, atol=1e-3)
def test_gpu_qr_opt(self):
A = fmatrix("A")
fn = aesara.function([A], qr(A), mode=mode_with_gpu)
assert any(
[
isinstance(node.op, GpuMagmaQR) and node.op.complete
for node in fn.maker.fgraph.toposort()
]
)
def test_gpu_qr_incomplete_opt(self):
A = fmatrix("A")
fn = aesara.function([A], qr(A, mode="r"), mode=mode_with_gpu)
assert any(
[
isinstance(node.op, GpuMagmaQR) and not node.op.complete
for node in fn.maker.fgraph.toposort()
]
)
def run_gpu_eigh(self, A_val, UPLO="L", compute_v=True):
A = fmatrix("A")
fn = aesara.function(
[A], GpuMagmaEigh(UPLO=UPLO, compute_v=compute_v)(A), mode=mode_with_gpu
)
return fn(A_val)
def check_gpu_eigh(self, N, UPLO="L", compute_v=True, rtol=None, atol=None):
A = random(N, N).astype("float32")
A = np.dot(A.T, A)
d_np, v_np = np.linalg.eigh(A, UPLO=UPLO)
if compute_v:
d_gpu, v_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=compute_v)
else:
d_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=False)
utt.assert_allclose(d_np, d_gpu, rtol=rtol, atol=atol)
if compute_v:
utt.assert_allclose(np.eye(N), np.dot(v_gpu, v_gpu.T), rtol=rtol, atol=atol)
D_m = np.zeros_like(A)
np.fill_diagonal(D_m, d_gpu)
utt.assert_allclose(
A, np.dot(np.dot(v_gpu, D_m), v_gpu.T), rtol=rtol, atol=atol
)
def test_gpu_eigh(self):
self.check_gpu_eigh(1000, UPLO="L", atol=1e-3)
self.check_gpu_eigh(1000, UPLO="U", atol=1e-3)
self.check_gpu_eigh(1000, UPLO="L", compute_v=False, atol=1e-3)
self.check_gpu_eigh(1000, UPLO="U", compute_v=False, atol=1e-3)
def test_gpu_eigh_opt(self):
A = fmatrix("A")
fn = aesara.function([A], eigh(A), mode=mode_with_gpu)
assert any(
[isinstance(node.op, GpuMagmaEigh) for node in fn.maker.fgraph.toposort()]
)
# mostly copied from aesara/tensor/tests/test_slinalg.py
def test_cholesky_grad():
    """Gradient of the GPU Cholesky: default, explicit lower, explicit upper."""
    rng = np.random.default_rng(utt.fetch_seed())
    r = rng.standard_normal((5, 5)).astype(config.floatX)
    # The dots keep the argument symmetric positive definite inside the graph.
    variants = (
        gpu_cholesky,  # check the default
        GpuCholesky(lower=True),  # explicit lower-triangular
        GpuCholesky(lower=False),  # explicit upper-triangular
    )
    for chol_op in variants:
        utt.verify_grad(lambda r, op=chol_op: op(r.dot(r.T)), [r], 3, rng)
def test_cholesky_grad_indef():
    """The Cholesky gradient raises ``LinAlgError`` on an indefinite matrix."""
    x = matrix()
    mat = np.array([[1, 0.2], [0.2, -2]]).astype(config.floatX)
    # Local name chosen so it does not shadow the imported `cholesky`.
    chol_op = GpuCholesky(lower=True)
    chol_f = aesara.function([x], aesara.gradient.grad(chol_op(x).sum(), [x]))
    with pytest.raises(LinAlgError):
        chol_f(mat)
    # cholesky = GpuCholesky(lower=True, on_error='nan')
    # chol_f = function([x], grad(gpu_cholesky(x).sum(), [x]))
    # assert np.all(np.isnan(chol_f(matrix)))
def test_lower_triangular_and_cholesky_grad():
    """Gradient check of a graph chaining Cholesky and a lower-triangular solve."""
    # Random lower triangular system is ill-conditioned.
    #
    # Reference
    # -----------
    # Viswanath, Divakar, and L. N. Trefethen. "Condition numbers of random triangular matrices."
    # SIAM Journal on Matrix Analysis and Applications 19.2 (1998): 564-581.
    #
    # Use smaller number of N when using float32
    if config.floatX == "float64":
        N = 100
    else:
        N = 5
    rng = np.random.default_rng(utt.fetch_seed())
    r = rng.standard_normal((N, N)).astype(config.floatX)
    y = rng.random((N, 1)).astype(config.floatX)

    def f(r, y):
        # Build a PD matrix, factor, solve, and reduce to a scalar loss.
        PD = r.dot(r.T)
        L = gpu_cholesky(PD)
        A = gpu_solve_lower_triangular(L, y)
        AAT = aesara.tensor.dot(A, A.T)
        B = AAT + aesara.tensor.eye(N)
        LB = gpu_cholesky(B)
        return aesara.tensor.sum(aesara.tensor.log(aesara.tensor.diag(LB)))

    utt.verify_grad(f, [r, y], 3, rng)
# Tests that could normally live outside `gpuarray`; they are placed here so
# that all gpuarray tests stay in the same directory.
import numpy as np
import aesara
from aesara.compile.nanguardmode import NanGuardMode
from aesara.tensor.type import vector
from tests.gpuarray.config import mode_with_gpu
def test_nan_guard_mode():
    """``NanGuardMode`` works with the GPU optimizer; also exercises the C code
    of ``abs`` for uint* and bool inputs."""
    for dtype in ("uint8", "int64", "bool"):
        x = vector(dtype=dtype)
        guarded = NanGuardMode(nan_is_error=True, optimizer=mode_with_gpu.optimizer)
        f = aesara.function([x], x + 1, mode=guarded)
        data = np.asarray([23, 7]).astype(dtype)
        assert np.allclose(f(data), data + 1)
import numpy as np
import pytest
import aesara
import tests.unittest_tools as utt
from aesara import function
from aesara.configdefaults import config
from aesara.gpuarray.multinomial import (
GPUAChoiceFromUniform,
GPUAMultinomialFromUniform,
)
from aesara.sandbox import multinomial
from aesara.sandbox.rng_mrg import MRG_RandomStream as RandomStream
from aesara.tensor.type import fmatrix, frow, fvector, iscalar, matrix, vector
from tests.gpuarray.config import mode_with_gpu
def test_multinomial_output_dtype():
    """``MultinomialFromUniform`` is lifted to the GPU for several output
    dtypes and draws the expected samples."""
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.
    p = fmatrix()
    u = fvector()
    for dtype in ["int64", "float32", "float16", "float64", "int32", "auto"]:
        m = aesara.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
        # the m*2 allows the multinomial to reuse output
        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            [
                type(node.op) is GPUAMultinomialFromUniform
                for node in f.maker.fgraph.toposort()
            ]
        )
        # test that both first and second samples can be drawn
        utt.assert_allclose(f([[1, 0], [0, 1]], [0.1, 0.1]), [[2, 0], [0, 2]])
        # test that both second labels can be drawn
        r = f([[0.2, 0.8], [0.3, 0.7]], [0.31, 0.31])
        utt.assert_allclose(r, [[0, 2], [0, 2]])
        # test that both first labels can be drawn
        r = f([[0.2, 0.8], [0.3, 0.7]], [0.21, 0.21])
        utt.assert_allclose(r, [[0, 2], [2, 0]])
        # change the size to make sure output gets reallocated ok
        # and also make sure that the GPU version doesn't screw up the
        # transposed-ness
        r = f([[0.2, 0.8]], [0.25])
        utt.assert_allclose(r, [[0, 2]])
def test_multinomial_input_dtype():
    """``MultinomialFromUniform`` works for all supported combinations of
    input and output dtypes."""
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.
    for idtype in ["float32", "float16", "float64"]:
        for odtype in ["float32", "float16", "float64", "int32"]:
            p = matrix("p", idtype)
            u = vector("u", idtype)
            # p = dmatrix('p')
            # u = dvector('u')
            m = aesara.sandbox.multinomial.MultinomialFromUniform(odtype)(p, u)
            # the m*2 allows the multinomial to reuse output
            f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
            assert any(
                [
                    type(node.op) is GPUAMultinomialFromUniform
                    for node in f.maker.fgraph.toposort()
                ]
            )
            # test that both first and second samples can be drawn
            utt.assert_allclose(f([[1, 0], [0, 1]], [0.1, 0.1]), [[2, 0], [0, 2]])
            # test that both second labels can be drawn
            r = f([[0.2, 0.8], [0.3, 0.7]], [0.31, 0.31])
            utt.assert_allclose(r, [[0, 2], [0, 2]])
            # test that both first labels can be drawn
            r = f([[0.2, 0.8], [0.3, 0.7]], [0.21, 0.21])
            utt.assert_allclose(r, [[0, 2], [2, 0]])
            # change the size to make sure output gets reallocated ok
            # and also make sure that the GPU version doesn't screw up the
            # transposed-ness
            r = f([[0.2, 0.8]], [0.25])
            utt.assert_allclose(r, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
    """Large-input multinomial: output shape and dtype follow the cast policy."""
    # DEBUG_MODE will test this on GPU
    p = fmatrix()
    u = fvector()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(p, u)
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        [
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        ]
    )
    pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)
    assert mval.shape == pval.shape
    if config.cast_policy == "custom":
        assert mval.dtype == pval.dtype
    elif config.cast_policy == "numpy+floatX":
        assert mval.dtype == config.floatX
    elif config.cast_policy == "numpy":
        assert mval.dtype == "float64"
    else:
        raise NotImplementedError(config.cast_policy)
    utt.assert_allclose(mval.sum(axis=1), 2)
    asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
    utt.assert_allclose(mval, asdf)  # broadcast over all rows
def test_gpu_opt_dtypes():
    """The GPU multinomial op returns samples of the requested dtype."""
    for dtype in ("uint32", "float32", "int64", "float64"):
        p = fmatrix()
        u = fvector()
        sampled = aesara.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
        f = function([p, u], sampled, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        )
        pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        samples = f(pval, uval)
        assert samples.dtype == dtype, f"{samples.dtype} != {dtype}"
def test_gpu_opt():
    """``MultinomialFromUniform`` is moved to the GPU when its output is;
    also covers a row (1 x n) input that used to fail."""
    # Does have some overlap with test_multinomial_0
    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = fmatrix()
    u = fvector()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(p, u)
    assert m.dtype == "float32", m.dtype
    f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        [
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        ]
    )
    pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)
    # Test with a row, it was failing in the past.
    r = frow()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(r, u)
    assert m.dtype == "float32", m.dtype
    f = function([r, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        [
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        ]
    )
    pval = np.arange(1 * 4, dtype="float32").reshape((1, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)
class TestOPWor:
    """Tests for the ``ChoiceFromUniform`` op (sampling without replacement)."""

    def test_select_distinct(self):
        # Tests that ChoiceFromUniform always selects distinct elements
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)
        n_elements = 1000
        all_indices = range(n_elements)
        np.random.seed(12345)
        for i in [5, 10, 50, 100, 500, n_elements]:
            uni = np.random.rand(i).astype(config.floatX)
            pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
            pvals /= pvals.sum(1)
            res = f(pvals, uni, i)
            res = np.squeeze(res)
            assert len(res) == i, res
            assert np.all(np.in1d(np.unique(res), all_indices)), res

    def test_fail_select_alot(self):
        # Tests that ChoiceFromUniform fails when asked to sample more
        # elements than the actual number of elements
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 200
        np.random.seed(12345)
        uni = np.random.rand(n_selected).astype(config.floatX)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        with pytest.raises(ValueError):
            f(pvals, uni, n_selected)

    def test_select_proportional_to_weight(self):
        # Tests that ChoiceFromUniform selects elements, on average,
        # proportional to their probabilities
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 10
        mean_rtol = 0.0005
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        avg_pvals = np.zeros((n_elements,), dtype=config.floatX)
        for rep in range(10000):
            uni = np.random.rand(n_selected).astype(config.floatX)
            res = f(pvals, uni, n_selected)
            res = np.squeeze(res)
            avg_pvals[res] += 1
        avg_pvals /= avg_pvals.sum()
        avg_diff = np.mean(abs(avg_pvals - pvals))
        assert avg_diff < mean_rtol, avg_diff
class TestFunctionWor:
    """Tests for ``MRG_RandomStream.multinomial_wo_replacement``."""

    def test_select_distinct(self):
        # Tests that multinomial_wo_replacement always selects distinct elements
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)
        n_elements = 1000
        all_indices = range(n_elements)
        np.random.seed(12345)
        for i in [5, 10, 50, 100, 500, n_elements]:
            pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
            pvals /= pvals.sum(1)
            res = f(pvals, i)
            res = np.squeeze(res)
            assert len(res) == i
            assert np.all(np.in1d(np.unique(res), all_indices)), res

    def test_fail_select_alot(self):
        # Tests that multinomial_wo_replacement fails when asked to sample more
        # elements than the actual number of elements
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 200
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        with pytest.raises(ValueError):
            f(pvals, n_selected)

    def test_select_proportional_to_weight(self):
        # Tests that multinomial_wo_replacement selects elements, on average,
        # proportional to their probabilities
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 10
        mean_rtol = 0.0005
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        avg_pvals = np.zeros((n_elements,), dtype=config.floatX)
        for rep in range(10000):
            res = f(pvals, n_selected)
            res = np.squeeze(res)
            avg_pvals[res] += 1
        avg_pvals /= avg_pvals.sum()
        avg_diff = np.mean(abs(avg_pvals - pvals))
        assert avg_diff < mean_rtol
def test_gpu_opt_wor():
    """``ChoiceFromUniform`` is lifted to ``GPUAChoiceFromUniform`` when its
    output is moved to the GPU, with and without replacement."""
    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = fmatrix()
    u = fvector()
    n = iscalar()
    for replace in [False, True]:
        m = multinomial.ChoiceFromUniform(odtype="auto", replace=replace)(p, u, n)
        assert m.dtype == "int64", m.dtype
        f = function([p, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            [
                type(node.op) is GPUAChoiceFromUniform
                for node in f.maker.fgraph.toposort()
            ]
        )
        n_samples = 3
        pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones(pval.shape[0] * n_samples) * 0.5
        f(pval, uval, n_samples)
        # Test with a row, it was failing in the past.
        r = frow()
        m = multinomial.ChoiceFromUniform("auto", replace=replace)(r, u, n)
        assert m.dtype == "int64", m.dtype
        f = function([r, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            [
                type(node.op) is GPUAChoiceFromUniform
                for node in f.maker.fgraph.toposort()
            ]
        )
        pval = np.arange(1 * 4, dtype="float32").reshape((1, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        f(pval, uval, 1)
from aesara.gpuarray.neighbours import GpuImages2Neibs
from tests.gpuarray.config import mode_with_gpu
from tests.tensor.nnet import test_neighbours
class TestGpuImages2Neibs(test_neighbours.TestImages2Neibs):
    """Run the shared Images2Neibs test suite against the GPU implementation."""

    mode = mode_with_gpu
    op = GpuImages2Neibs
    dtypes = ["int64", "float32", "float64"]
import numpy as np
import aesara
import aesara.tensor as at
import tests.unittest_tools as utt
from aesara.gpuarray.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx,
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuSoftmax,
GpuSoftmaxWithBias,
)
from aesara.gradient import grad
from aesara.tensor.math import argmax, log, mean
from aesara.tensor.nnet import crossentropy_softmax_1hot_with_bias_dx
from aesara.tensor.type import fmatrix, fvector, lvector, matrix, vector
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """Basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that the kernel loops when there are too many threads and that
    CPU and GPU results agree.
    """
    n_in = 1000
    batch_size = 4097
    n_out = 1250
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_in = 4098
        n_out = 4099
    y = lvector("y")
    b = fvector("b")
    # We precompute the dot product with a big shape so that the test of
    # GpuCrossentropySoftmax1HotWithBiasDx does not fail with "the launch
    # timed out and was terminated" on GPU cards that are not powerful
    # enough. The big shape is needed to exercise the corner case.
    dot_result = fmatrix("dot_result")
    xx = np.asarray(np.random.rand(batch_size, n_in), dtype=np.float32)
    yy = np.ones((batch_size,), dtype="int32")
    b_values = np.zeros((n_out,), dtype="float32")
    W_values = np.asarray(np.random.rand(n_in, n_out), dtype="float32")
    dot_value = np.asarray(np.dot(xx, W_values), dtype="float32")
    del W_values
    p_y_given_x = aesara.tensor.nnet.softmax(dot_result + b)
    y_pred = argmax(p_y_given_x, axis=-1)
    loss = -mean(log(p_y_given_x)[at.arange(y.shape[0]), y])
    dW = grad(loss, dot_result)
    classify = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_without_gpu
    )
    classify_gpu = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_with_gpu
    )
    assert any(
        [
            isinstance(
                node.op, aesara.tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias
            )
            for node in classify.maker.fgraph.toposort()
        ]
    )
    assert any(
        [
            isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
            for node in classify_gpu.maker.fgraph.toposort()
        ]
    )
    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)
    assert len(out) == len(gout) == 3
    utt.assert_allclose(out[0], gout[0])
    utt.assert_allclose(out[2], gout[2], atol=3e-6)
    utt.assert_allclose(out[1], gout[1])
def test_GpuCrossentropySoftmax1HotWithBiasDx():
    """Basic test for GpuCrossentropySoftmax1HotWithBiasDx.

    We check that the kernel loops when there are too many threads and that
    CPU and GPU results agree.
    """
    batch_size = 4097
    n_out = 1250
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_out = 4099
    softmax_output_value = np.random.rand(batch_size, n_out).astype("float32")
    dnll_value = np.asarray(np.random.rand(batch_size), dtype="float32")
    y_idx_value = np.random.randint(low=0, high=5, size=batch_size)
    softmax_output = fmatrix()
    # BUG FIX: the per-row sums have length ``shape[0]`` (the batch size), so
    # the reshape target must be ``(shape[0], 1)``.  The original used
    # ``shape[1]``, which is a shape mismatch whenever batch_size != n_out
    # (4097 vs 4099 here).  Note the normalized variable is itself used as the
    # function input below, so the normalization graph is effectively cut off.
    softmax_output /= softmax_output.sum(axis=1).reshape(softmax_output.shape[0], 1)
    op = crossentropy_softmax_1hot_with_bias_dx(dnll_value, softmax_output, y_idx_value)
    cpu_f = aesara.function([softmax_output], op, mode=mode_without_gpu)
    gpu_f = aesara.function([softmax_output], op, mode=mode_with_gpu)
    # aesara.printing.debugprint(cpu_f)
    # aesara.printing.debugprint(gpu_f)
    assert any(
        isinstance(node.op, aesara.tensor.nnet.CrossentropySoftmax1HotWithBiasDx)
        for node in cpu_f.maker.fgraph.toposort()
    )
    assert any(
        isinstance(node.op, GpuCrossentropySoftmax1HotWithBiasDx)
        for node in gpu_f.maker.fgraph.toposort()
    )
    cpu_out = cpu_f(softmax_output_value)
    gpu_out = gpu_f(softmax_output_value)
    rtol = 1e-5
    atol = 1e-6
    utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
def test_softmax_with_bias_float16():
    """GpuSoftmaxWithBias with float16 input/bias (also mixed with float32)."""
    for dtype_in, dtype_b in (
        ("float16", "float32"),
        ("float16", "float16"),
        ("float32", "float16"),
    ):
        softmax_with_bias_unittest_template(dtypeInput=dtype_in, dtypeBias=dtype_b)
def test_softmax_with_bias_float32():
    """GpuSoftmaxWithBias with float32 input and bias."""
    softmax_with_bias_unittest_template(dtypeInput="float32", dtypeBias="float32")
def test_softmax_with_bias_float64():
    """GpuSoftmaxWithBias with float64 input/bias (also mixed with float32)."""
    for dtype_in, dtype_b in (
        ("float32", "float64"),
        ("float64", "float32"),
        ("float64", "float64"),
    ):
        softmax_with_bias_unittest_template(dtypeInput=dtype_in, dtypeBias=dtype_b)
def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
    """Basic test for GpuSoftmaxWithBias with the given input/bias dtypes.

    We check that the kernel loops when there are too many blocks.

    TODO: check that we loop when there are too many threads. (THIS IS
    NOT IMPLEMENTED)
    """
    x = matrix("x", dtype=dtypeInput)
    b = vector("b", dtype=dtypeBias)
    z = aesara.tensor.nnet.softmax_with_bias(x, b)
    f = aesara.function([x, b], z, mode=mode_without_gpu)
    f_gpu = aesara.function([x, b], z, mode=mode_with_gpu)
    assert f.maker.fgraph.toposort()[-1].op == aesara.tensor.nnet.softmax_with_bias
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, GpuSoftmaxWithBias)

    def cmp(n, m):
        # Compare CPU and GPU outputs on a random (n, m) input.
        data = np.random.uniform(1e-7, 1, (n, m)).astype(dtype=dtypeInput)
        b_data = np.random.uniform(1e-7, 1, (m,)).astype(dtype=dtypeBias)
        out = f(data, b_data)
        gout = f_gpu(data, b_data)
        utt.assert_allclose(out, gout)

    cmp(2, 5)
    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # GTX285 don't have enough shared mem for this case.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
def test_softmax_float16():
    """GpuSoftmax on float16 input."""
    softmax_unittest_template(dtypeInput="float16")
def test_softmax_float32():
    """GpuSoftmax on float32 input."""
    softmax_unittest_template(dtypeInput="float32")
def test_softmax_float64():
    """GpuSoftmax on float64 input."""
    softmax_unittest_template(dtypeInput="float64")
def softmax_unittest_template(dtypeInput):
    """Basic test for GpuSoftmax with the given input dtype.

    We check that the kernel loops when there are too many blocks, and that
    slower code is used when there isn't enough shared memory.
    """
    x = matrix("x", dtype=dtypeInput)
    z = aesara.tensor.nnet.softmax(x)
    f = aesara.function([x], z, mode=mode_without_gpu)
    f_gpu = aesara.function([x], z, mode=mode_wo_cudnn)
    assert f.maker.fgraph.toposort()[-1].op == aesara.tensor.nnet.softmax_legacy
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, GpuSoftmax)

    def cmp(n, m):
        # Compare CPU and GPU outputs on a random (n, m) input.
        data = np.random.uniform(0, 1, (n, m)).astype(dtype=dtypeInput)
        out = f(data)
        gout = f_gpu(data)
        utt.assert_allclose(out, gout)

    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # The GTX285 don't have enough shared memory.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
class TestSoftMax:
    """Shared checks for GpuSoftmax (and, via subclassing, GpuDnnSoftmax)."""

    gpu_op = GpuSoftmax
    mode = mode_wo_cudnn

    def _test_softmax(self, x, x_gpu, f_z, f_gpu_z, cmp):
        # This is basic test for GpuSoftmax and GpuDnnSoftmax
        #
        # We check that we loop when there are too many blocks
        # We use slower code when there isn't enough shared memory
        f_z_out = f_z(x)
        f_gpu_z_out = f_gpu_z(x_gpu)
        f = aesara.function([x], f_z_out, mode=mode_without_gpu)
        f_gpu = aesara.function([x_gpu], f_gpu_z_out, mode=self.mode)
        self._check_types(f, f_gpu, aesara.tensor.nnet.Softmax, self.gpu_op)
        # we need to test n>32*1024 to check that we make the block loop.
        cmp(1, 5, f, f_gpu)
        cmp(2, 5, f, f_gpu)
        cmp(10, 5, f, f_gpu)
        cmp(100, 5, f, f_gpu)
        cmp(1000, 5, f, f_gpu)
        cmp(10000, 5, f, f_gpu)
        cmp(4074, 400, f, f_gpu)
        cmp(784, 784, f, f_gpu)
        cmp(4, 1000, f, f_gpu)
        cmp(4, 1024, f, f_gpu)
        cmp(4, 2000, f, f_gpu)
        cmp(4, 2024, f, f_gpu)
        # The GTX285 don't have enough shared memory.
        cmp(4, 4074, f, f_gpu)
        # The GTX580, 680 and kepler don't have enough shared memory.
        cmp(2, 10000, f, f_gpu)
        cmp(128, 16 * 1024, f, f_gpu)
        cmp(128, 64 * 1024, f, f_gpu)
        # cudnn permits no more than 2^15 - 1 rows
        cmp((2 << 15) - 1, 5, f, f_gpu)
        cmp(5, 2 << 15, f, f_gpu)
        return f, f_gpu

    def _cmp(self, n, m, f, f_gpu):
        # Compare CPU and GPU outputs on an (n, m) arange input.
        data = np.arange(n * m, dtype="float32").reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        utt.assert_allclose(out, gout)

    def _check_types(self, graph, graph_gpu, f_type, f_gpu_type):
        # The CPU graph ends with `f_type`; the GPU graph has exactly one
        # node of `f_gpu_type`.
        assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
        assert (
            len(
                [
                    node
                    for node in graph_gpu.maker.fgraph.toposort()
                    if isinstance(node.op, f_gpu_type)
                ]
            )
            == 1
        )

    def test_softmax(self):
        x = fmatrix("x")
        z = aesara.tensor.nnet.softmax_legacy
        f, f_gpu = self._test_softmax(x, x, z, z, self._cmp)
        self._cmp(2 << 15, 5, f, f_gpu)

    def test_softmax_shape_0(self):
        x = fmatrix("x")
        z = aesara.tensor.nnet.softmax_legacy
        f, f_gpu = self._test_softmax(x, x, z, z, self._cmp)
        # Aesara can handle that case, but cudnn can't
        self._cmp(0, 10, f, f_gpu)
import numpy as np
import pytest
import aesara
import aesara.gpuarray
import aesara.tensor.slinalg as slinalg
from aesara import tensor as at
from aesara.breakpoint import PdbBreakpoint
from aesara.configdefaults import config
from aesara.gpuarray import basic_ops, blas, dnn, opt
from aesara.gpuarray.basic_ops import (
GpuAlloc,
GpuAllocEmpty,
GpuFromHost,
GpuReshape,
HostFromGpu,
host_from_gpu,
)
from aesara.gpuarray.blas import GpuGemm
from aesara.gpuarray.dnn import GpuDnnReduction
from aesara.gpuarray.elemwise import (
Elemwise,
GpuCAReduceCPY,
GpuCAReduceCuda,
GpuElemwise,
max_inputs_to_GpuElemwise,
)
from aesara.gpuarray.linalg import GpuCholesky, GpuCusolverSolve, cusolver_available
from aesara.gpuarray.subtensor import GpuSubtensor
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from aesara.graph.opt import check_stack_trace
from aesara.raise_op import Assert, assert_op
from aesara.tensor.basic import Alloc, AllocEmpty, MakeVector, Rebroadcast
from aesara.tensor.blas import batched_dot
from aesara.tensor.math import dot, eq, exp, gt, tanh
from aesara.tensor.nnet import abstract_conv
from aesara.tensor.type import (
TensorType,
bmatrix,
cscalar,
fmatrix,
fscalar,
ftensor4,
iscalar,
ivector,
lscalar,
lvector,
matrix,
scalar,
tensor3,
vector,
)
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, test_ctx_name
from tests.tensor.test_basic import TestSpecifyShape
from tests.test_ifelse import TestIfelse
def _check_stack_trace(thing):
    """Check stack traces while ignoring ops that routinely lack them."""
    from aesara.tensor.shape import Shape, Shape_i

    ignored_ops = (
        Shape_i,
        Shape,
        aesara.compile.ops.DeepCopyOp,
        MakeVector,
        aesara.tensor.subtensor.Subtensor,
        aesara.tensor.elemwise.Elemwise,
        aesara.ifelse.IfElse,
        GpuFromHost,
        HostFromGpu,
    )

    def _ops_to_check(op):
        if not isinstance(op, aesara.graph.op.Op):
            op = op.op  # assume it is an apply node
        return not isinstance(op, ignored_ops)

    return check_stack_trace(thing, ops_to_check=_ops_to_check, bug_print="ignore")
def test_local_assert():
    """``Assert`` stays in the graph and its input is moved to the GPU."""
    x = fmatrix()
    checked = assert_op(x, eq(x, 0).any())
    f = aesara.function([x], checked, mode=mode_with_gpu)
    assert_nodes = [
        n for n in f.maker.fgraph.toposort() if isinstance(n.op, Assert)
    ]
    assert len(assert_nodes) == 1
    assert isinstance(assert_nodes[0].inputs[0].type, GpuArrayType)
def test_local_remove_all_assert():
    """The ``unsafe`` tag controls whether ``Assert`` ops are removed."""
    x = fmatrix()
    a = assert_op(x, eq(x, 0).any())

    def n_asserts(mode):
        # Compile with the given mode and count the remaining Assert nodes.
        f = aesara.function([x], a, mode=mode)
        return len(
            [n for n in f.maker.fgraph.toposort() if isinstance(n.op, Assert)]
        )

    # By default `unsafe` should not be there, so the assert is kept.
    assert n_asserts(mode_with_gpu.excluding("unsafe")) == 1
    # With `unsafe` the assert is removed.
    assert n_asserts(mode_with_gpu.including("unsafe")) == 0
    # Excluding `unsafe` again restores the assert.
    assert n_asserts(mode_with_gpu.excluding("unsafe")) == 1
def test_local_gpu_contiguous_gpu_contiguous():
    """Nested ``gpu_contiguous`` calls collapse into a single op."""
    a = fmatrix()
    once = basic_ops.gpu_contiguous(a)
    twice = basic_ops.gpu_contiguous(once)
    for out in (once, twice):
        f = aesara.function([a], out, mode=mode_with_gpu)
        contiguous_nodes = [
            node
            for node in f.maker.fgraph.toposort()
            if isinstance(node.op, basic_ops.GpuContiguous)
        ]
        assert len(contiguous_nodes) == 1
        assert _check_stack_trace(f)
def test_local_gpu_contiguous():
    """``cpu_contiguous`` is replaced by a single ``GpuContiguous``."""
    a = fmatrix()
    out = aesara.tensor.extra_ops.cpu_contiguous(a)
    f = aesara.function([a], out, mode=mode_with_gpu)
    contiguous_nodes = [
        node
        for node in f.maker.fgraph.toposort()
        if isinstance(node.op, basic_ops.GpuContiguous)
    ]
    assert len(contiguous_nodes) == 1
    f([[2.0]])
    assert _check_stack_trace(f)
def test_flatten():
    """``flatten`` is implemented with ``GpuReshape`` for 1-D and n-D targets."""
    m = fmatrix()
    f = aesara.function([m], m.flatten(), mode=mode_with_gpu)
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)
    # flatten(ndim=2) on a matrix is a no-op reshape; keep the reshape in the
    # graph by disabling the useless-reshape rewrite.
    f = aesara.function(
        [m], m.flatten(ndim=2), mode=mode_with_gpu.excluding("local_useless_reshape")
    )
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val)
    assert res.shape == val.shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)
    m = tensor3()
    f = aesara.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
    val = np.random.rand(10, 11, 12).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.reshape(10, -1))
    assert res.shape == val.reshape(10, -1).shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)
def test_reduce():
    """sum/prod/max/min over axis 0 are lifted to GPU reduction ops
    (except max/min on OpenCL, where they are not supported)."""
    kind = get_context(test_ctx_name).kind
    for method, param in [
        ("sum", dict(acc_dtype="float32")),
        ("prod", dict(acc_dtype="float32")),
        ("max", {}),
        ("min", {}),
    ]:
        m = fmatrix()
        f = aesara.function(
            [m], getattr(m, method)(axis=0, **param), mode=mode_with_gpu
        )
        # assert _check_stack_trace(f) this op is ok but since
        # it is using GpuCAReduceCuda that has an empty stack
        # trace, this assertion gives error.
        val = np.random.rand(10, 11).astype("float32")
        res = f(val)
        utt.assert_allclose(res, getattr(val, method)(axis=0))
        assert res.shape == (11,)
        topo = f.maker.fgraph.toposort()
        ops = [type(node.op) for node in topo]
        if kind == b"opencl" and method in ["max", "min"]:
            assert not (
                GpuCAReduceCuda in ops
                or GpuCAReduceCPY in ops
                or GpuDnnReduction in ops
            )
        else:
            assert (
                GpuCAReduceCuda in ops
                or GpuCAReduceCPY in ops
                or GpuDnnReduction in ops
            )
def test_local_gpualloc_memset_0():
    """Check when `Alloc` is lifted to `GpuAlloc` and when memset is used.

    A CPU `Alloc` whose only client is the output should stay on the CPU;
    one feeding further computation should become a `GpuAlloc`.  The
    ``memset_0`` flag must be set only when the fill value is 0.
    """
    i = iscalar()
    z = np.zeros((1,), dtype="float32")
    o = np.ones((1,), dtype="float32")
    ones = np.ones((2,), dtype="float32")
    # Test with 0 from CPU op.
    # Should not be transferred as the only client is the output
    a = at.alloc(z, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, Alloc)
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)
    # Test with 0 from CPU op.
    # Should be transferred as it is used by another op.
    a = at.alloc(z, i)
    f = aesara.function([i], a.cumsum(), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAlloc)
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)
    # Test with 0
    a = GpuAlloc(test_ctx_name)(z, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    # Filling with 0 should use the fast memset path.
    assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)
    # Test with 1
    a = GpuAlloc(test_ctx_name)(o, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    # Non-zero fill values cannot use memset.
    assert not topo[0].op.memset_0
    assert (np.asarray(f(6)) == 1).all()
    assert _check_stack_trace(f)
    # Test with 1, 1
    a = GpuAlloc(test_ctx_name)(ones, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (np.asarray(f(2)) == 1).all()
    assert _check_stack_trace(f)
def test_local_gpualloc_empty():
    """Check when `AllocEmpty` is lifted to `GpuAllocEmpty`.

    Like `Alloc`, an `AllocEmpty` whose only client is the output should
    stay on the CPU; one feeding further computation should be moved to
    the GPU.  Since the data is uninitialized, only shapes are checked.
    """
    i = iscalar()
    ii = iscalar()
    # Test with vector
    # Should not be moved as the only client is the output
    a = AllocEmpty("float32")(i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, AllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3).shape == (3,)
    assert _check_stack_trace(f)
    # Test with vector
    # Should be moved
    a = AllocEmpty("float32")(i)
    f = aesara.function([i], a.cumsum(), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3).shape == (3,)
    assert _check_stack_trace(f)
    # Test with matrix
    a = AllocEmpty("float32")(i, ii)
    f = aesara.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3, 4).shape == (3, 4)
    assert _check_stack_trace(f)
def test_rebroadcast():
    """A single `Rebroadcast` should remain, operating on GPU-typed data."""
    data = np.random.rand(10, 10).astype("float32")
    x = fmatrix()
    unbroadcasted = at.unbroadcast(x.sum().dimshuffle("x", "x"), 0, 1)
    fn = aesara.function([x], [unbroadcasted], mode=mode_with_gpu)
    fn(data)
    nodes = fn.maker.fgraph.toposort()
    rebroadcast_nodes = [n for n in nodes if isinstance(n.op, Rebroadcast)]
    assert len(rebroadcast_nodes) == 1
    only_rebroadcast = rebroadcast_nodes[0]
    # Both sides of the Rebroadcast must be GPU variables.
    assert isinstance(only_rebroadcast.inputs[0].type, GpuArrayType)
    assert isinstance(only_rebroadcast.outputs[0].type, GpuArrayType)
    assert _check_stack_trace(fn)
class TestSpecifyShape(TestSpecifyShape):
    """Re-run the imported `TestSpecifyShape` suite (which this class
    shadows) in GPU mode with GPU-typed inputs."""

    mode = mode_with_gpu
    input_type = GpuArrayType
class TestGpuIfelse(TestIfelse):
    """Run the inherited `IfElse` tests on the GPU backend."""

    mode = mode_with_gpu

    @staticmethod
    def cast_output(v):
        # The inherited tests compare outputs as GPU variables.
        return basic_ops.as_gpuarray_variable(v, test_ctx_name)

    shared = staticmethod(gpuarray_shared_constructor)

    def get_ifelse(self, n):
        # Use the GPU-enabled, view-returning variant of IfElse.
        return aesara.ifelse.IfElse(n, gpu=True, as_view=True)

    def test_lifter_with_inputs_of_graph(self):
        """The ifelse lifter must work when the condition and branches
        are direct inputs of the graph (both int and float conditions)."""
        x = vector()
        cond = iscalar()
        f = aesara.function(
            [x, cond], aesara.ifelse.ifelse(cond, x.mean(), x.sum()), mode=mode_with_gpu
        )
        assert f(np.float32([1, 2, 3]), 0) == 6
        assert _check_stack_trace(f)
        x = vector()
        cond = scalar()
        f = aesara.function(
            [x, cond], aesara.ifelse.ifelse(cond, x.mean(), x.sum()), mode=mode_with_gpu
        )
        assert f(np.float32([1, 2, 3]), 0) == 6
        assert _check_stack_trace(f)

    def test_lifter_with_shared_var(self):
        """The lifter must handle a GPU shared variable mixed with a CPU
        constant branch without raising an optimization error."""
        x = lscalar("x")
        y = gpuarray_shared_constructor(
            np.asarray(1, dtype="float32"), target=test_ctx_name
        )
        z = at.constant(2.0)
        a = aesara.ifelse.ifelse(x, y, z)
        with config.change_flags(on_opt_error="raise"):
            aesara.function([x], [a], mode=mode_with_gpu)
def test_print_op():
    """Print ops must not prevent the surrounding graph from being moved
    to the GPU: transfer, print, GPU elemwise, transfer back."""
    x = fmatrix()
    fn = aesara.function([x], aesara.printing.Print()(x) * 2, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[0].op, GpuFromHost)
    assert isinstance(nodes[1].op, aesara.printing.Print)
    assert isinstance(nodes[2].op, GpuElemwise)
    assert nodes[3].op == host_from_gpu
    assert _check_stack_trace(fn)
    fn(np.random.random((5, 5)).astype("float32"))
def test_pdbbreakpoint_op():
    """`PdbBreakpoint` ops must not block GPU optimization of the
    computation that follows them."""
    # Test that PdbBreakpoint ops don't block gpu optimization
    b = fmatrix()
    # Create a function composed of a breakpoint followed by
    # some computation
    condition = gt(b.sum(), 0)
    b_monitored = PdbBreakpoint(name="TestBreakpoint")(condition, b)
    output = b_monitored**2
    f = aesara.function([b], output, mode=mode_with_gpu)
    # Ensure that, in the compiled function, the computation following the
    # breakpoint has been moved to the gpu.
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[-2].op, GpuElemwise)
    assert topo[-1].op == host_from_gpu
    assert _check_stack_trace(f)
def test_local_gpu_elemwise_careduce():
    """Check that a scalar op preceding a reduction is fused into
    `GpuCAReduceCuda` via its ``pre_scalar_op`` attribute."""
    # cuDNN is excluded so the CAReduce path is taken.
    mode_with_gpu_no_cudnn = mode_with_gpu.excluding("cudnn")
    x = matrix()

    def fn_sum_square(x, axis):
        # sum(x**2): the square should be absorbed as pre_scalar_op.
        return (x * x).sum(axis=axis)

    def fn_sum_abs(x, axis):
        return abs(x).sum(axis=axis)

    def fn_max_abs(x, axis):
        return abs(x).max(axis=axis)

    for fn, pre_scalar_op in (
        (fn_sum_square, aesara.scalar.sqr),
        (fn_sum_abs, aesara.scalar.abs_),
        (fn_max_abs, aesara.scalar.abs_),
    ):
        for axis in (None, 0, 1):
            o = fn(x, axis)
            f = aesara.function([x], o, mode=mode_with_gpu_no_cudnn)
            topo = f.maker.fgraph.toposort()
            assert len(topo) == 3
            assert isinstance(topo[1].op, GpuCAReduceCuda)
            assert topo[1].op.pre_scalar_op == pre_scalar_op
            assert _check_stack_trace(f)
            data = np.random.rand(3, 4).astype(config.floatX)
            utt.assert_allclose(fn(data, axis), f(data))
def test_local_lift_dot22scalar():
    """`Dot22Scalar` nodes must be lifted to `GpuGemm` on the GPU."""
    x = matrix()
    y = matrix()
    alpha = scalar()
    out = aesara.tensor.blas.Dot22Scalar()(x, y, alpha)
    f_cpu = aesara.function([x, y, alpha], out)
    f_gpu = aesara.function([x, y, alpha], out, mode=mode_with_gpu)
    gpu_nodes = f_gpu.maker.fgraph.apply_nodes
    # The CPU op must be gone from the GPU graph ...
    assert not any(
        isinstance(node.op, aesara.tensor.blas.Dot22Scalar) for node in gpu_nodes
    )
    # ... replaced by a GPU GEMM.
    assert any(isinstance(node.op, GpuGemm) for node in gpu_nodes)
    x_val = np.random.random((2, 3)).astype(config.floatX)
    y_val = np.random.random((3, 4)).astype(config.floatX)
    alpha_val = 0.5
    utt.assert_allclose(
        f_cpu(x_val, y_val, alpha_val), f_gpu(x_val, y_val, alpha_val)
    )
    assert _check_stack_trace(f_gpu)
def test_local_gpu_subtensor():
    """Check when `Subtensor` stays on the CPU vs. moves to the GPU.

    The heuristic: slicing stays on the CPU unless its input has multiple
    clients, in which case slicing on the GPU avoids extra transfers.
    """
    # Test shared forced on CPU.
    t = aesara.shared(np.zeros(20, "float32"))
    f = aesara.function([], t[3:4], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test graph input.
    t = fmatrix()
    f = aesara.function([t], t[3:4], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test multiple use of the input
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = fmatrix()
    f = aesara.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test multiple use of the input + input as output
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = fmatrix()
    f = aesara.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test shared forced on CPU end we do computation on the output of
    # the subtensor.
    t = aesara.shared(np.zeros(20, "float32"))
    f = aesara.function([], t[3:4] + 1, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    # Our optimizer isn't smart enough to move to the GPU Elemwise.
    # If it where just a little bit smarter, it could wrongly move it to the GPU.
    # If it where super smart, it would know it should not move it to the GPU.
    assert any(isinstance(node.op, aesara.tensor.elemwise.Elemwise) for node in topo)
    assert _check_stack_trace(f)
def test_local_gpu_elemwise():
    """Check that mixed-dtype elemwise graphs fuse into one `GpuElemwise`.

    The tail of the test (multiple-output composites) is dead code kept
    behind an early ``return`` until multi-output `GpuElemwise` support
    is implemented.
    """
    # Test local_gpu_elemwise when there is a dtype upcastable to float32
    a = bmatrix()
    b = fmatrix()
    c = fmatrix()
    a_v = (np.random.rand(4, 5) * 10).astype("int8")
    b_v = (np.random.rand(4, 5) * 10).astype("float32")
    c_v = (np.random.rand(4, 5) * 10).astype("float32")
    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = aesara.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
    assert _check_stack_trace(f)
    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = aesara.scalar.int8()
    b_s = aesara.scalar.float32()
    c_s = aesara.scalar.float32()
    out_s = aesara.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
    assert _check_stack_trace(f)
    return  # Not yet implemented
    # Test multiple output
    a_s = aesara.scalar.float32()
    a = fmatrix()
    from aesara.scalar.basic import identity

    out_s = aesara.scalar.Composite(
        [a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)]
    )
    outs_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)
    assert _check_stack_trace(f)
    # Test multiple output
    out_s = aesara.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
    outs_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    # FIX: the composite's second output is ``a_s * b_s``, so the
    # expected value is ``a_v * b_v`` (it was wrongly compared to
    # ``a_v * c_v``).
    utt.assert_allclose(out[1], a_v * b_v)
    assert _check_stack_trace(f)
    # Test non-contiguous input
    c = gpuarray_shared_constructor(np.asarray(c_v, dtype="float32"))
    f = aesara.function([a, b], outs_op(a[::2], b[::2], c[::2]), mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    # FIX: same composite — the second output is the product of the
    # first two inputs, i.e. ``a_v[::2] * b_v[::2]``.
    utt.assert_allclose(out[1], a_v[::2] * b_v[::2])
    assert _check_stack_trace(f)
def test_many_arg_elemwise():
    """Check that ``+`` and ``*`` elemwise ops handle extremely large
    numbers of arguments on the GPU.

    The GPU graph is compared against the CPU graph, and the test makes
    sure both the "fits in one GpuElemwise" and the "exceeds the input
    limit" cases are exercised.
    """
    rng = np.random.default_rng([1, 2, 3])
    nb_of_inputs_overflows = []
    for num_args in [64]:
        for op_to_test in [aesara.tensor.add, aesara.tensor.mul]:
            for nb_dim in [2, 8]:
                shapes = [rng.integers(1, 5) for i in range(nb_dim)]
                # NOTE: ``np.cast`` was removed in NumPy 2.0; use
                # ``.astype`` to build the float32 inputs instead.
                args = [
                    rng.standard_normal(shapes).astype("float32")
                    for arg in range(0, num_args)
                ]
                symb_args = [
                    TensorType("float32", (False,) * nb_dim)()
                    for arg in range(0, num_args)
                ]
                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    # test the optimization local_gpua_elemwise
                    output = op_to_test(*symb_args)
                    f = aesara.function(symb_args, output, mode=mode)
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        # Record by how much the argument count exceeds
                        # (negative) or stays under (non-negative) the
                        # GpuElemwise input limit.
                        nb_of_inputs_overflows.append(
                            max_inputs_to_GpuElemwise(output.owner) - num_args
                        )
                        nodelst = [node for node in f.maker.fgraph.apply_nodes]
                        assert any(isinstance(node.op, GpuElemwise) for node in nodelst)
                        assert not any(
                            isinstance(node.op, Elemwise)
                            for node in nodelst
                            if not isinstance(node.op, GpuElemwise)
                        )
                results_gpu, results_cpu = outputs
                utt.assert_allclose(results_gpu, results_cpu)
    # Make sure we test at least one case with no number of inputs overflow
    assert any(overflow >= 0 for overflow in nb_of_inputs_overflows)
    # Make sure we test at least one case with number of inputs overflow
    assert any(overflow < 0 for overflow in nb_of_inputs_overflows)
def test_not_useless_scalar_gpuelemwise():
    """A scalar elemwise whose result is consumed on the CPU must not be
    moved to the GPU (the transfer would be pure overhead)."""
    # We don't want to move elemwise on scalar on the GPU when the
    # result will not be used on the GPU!
    with config.change_flags(warn_float64="ignore"):
        X = fmatrix()
        x = np.random.standard_normal((32, 32)).astype(np.float32)
        m1 = aesara.shared(np.random.standard_normal((32, 32)).astype(np.float32))
        loss = (X - dot(X, m1)).norm(L=2)
        lr = aesara.shared(np.asarray(0.001, dtype=np.float32))
        grad = aesara.grad(loss, m1)
        train = aesara.function(
            inputs=[X], updates=[(m1, m1 - lr * grad)], mode=mode_with_gpu
        )
        train(x)
        topo = train.maker.fgraph.toposort()
        gemms = [app for app in topo if isinstance(app.op, GpuGemm)]
        assert len(gemms) == 2
        # The scalar `lr * grad` scaling must remain a CPU Elemwise
        # feeding the second GEMM.
        assert isinstance(gemms[1].inputs[1].owner.op, aesara.tensor.elemwise.Elemwise)
def test_local_lift_abstractconv_gpu_shape():
    """The grad-wrt-weights abstract conv must compile under GPU mode
    even when its shape argument is a symbolic vector."""
    with config.change_flags(on_opt_error="raise"):
        shape_vec = ivector()
        img = ftensor4()
        topgrad = ftensor4()
        conv_out = aesara.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(
            img, topgrad, shape_vec
        )
        fn = aesara.function([shape_vec, img, topgrad], conv_out, mode=mode_with_gpu)
        assert _check_stack_trace(fn)
def test_local_assert_no_cpu_op():
    """Check the ``assert_no_cpu_op`` flag: raising when a CPU op remains
    in the graph, and compiling fine when the flag is ignored."""
    rng = np.random.default_rng(utt.fetch_seed())
    m = rng.uniform(-1, 1, (10, 10)).astype("float32")
    ms = gpuarray_shared_constructor(m, name="m_shared")
    out = tanh(ms).dot(ms.T)
    # Excluding the elemwise lifter forces `tanh` to stay on the CPU.
    mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
    mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
    with config.change_flags(assert_no_cpu_op="raise", on_opt_error="ignore"):
        with pytest.raises(AssertionError):
            aesara.function([], out, mode=mode_local_assert)
    with config.change_flags(assert_no_cpu_op="ignore"):
        f = aesara.function([], out, mode=mode_local_assert)
        assert _check_stack_trace(f)
def test_no_complex():
    """Complex-typed intermediates must not break GPU compilation: this
    graph mixes a complex64 scalar with float32 scalars."""
    width = cscalar()
    freq = fscalar()
    signal = fscalar()
    stft_out = exp(width * freq) * signal
    fn = aesara.function([width, freq, signal], stft_out, mode=mode_with_gpu)
    assert _check_stack_trace(fn)
@utt.assertFailure_fast
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_local_lift_solve():
    """Check that `Solve` is lifted to an inplace `GpuCusolverSolve` and
    that the GPU result matches the CPU reference."""
    A = fmatrix()
    b = fmatrix()
    o = slinalg.solve(A, b)
    # FIX: pass the mode by keyword (it was passed positionally), for
    # consistency with every other compilation in this file.
    f_cpu = aesara.function([A, b], o, mode=mode_without_gpu)
    f_gpu = aesara.function([A, b], o, mode=mode_with_gpu)
    # The CPU op must be gone from the GPU graph ...
    assert not any(
        isinstance(n.op, slinalg.Solve) for n in f_gpu.maker.fgraph.apply_nodes
    )
    # ... replaced by an inplace cuSolver solve (the input is not reused
    # elsewhere, so the inplace variant is expected).
    assert any(
        isinstance(n.op, GpuCusolverSolve) and n.op.inplace
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
    assert _check_stack_trace(f_gpu)
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_gpu_solve_not_inplace():
    """When the solve input is reused by another op (`dot`), the lifted
    `GpuCusolverSolve` must NOT be inplace."""
    A = fmatrix()
    b = fmatrix()
    s = slinalg.solve(A, b)
    # Reusing `A` here forbids the inplace variant.
    o = dot(A, s)
    # FIX: pass the mode by keyword (it was passed positionally), for
    # consistency with every other compilation in this file.
    f_cpu = aesara.function([A, b], o, mode=mode_without_gpu)
    f_gpu = aesara.function([A, b], o, mode=mode_with_gpu)
    count_not_inplace = len(
        [
            n.op
            for n in f_gpu.maker.fgraph.apply_nodes
            if isinstance(n.op, GpuCusolverSolve) and not n.op.inplace
        ]
    )
    assert count_not_inplace == 1, count_not_inplace
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
@utt.assertFailure_fast
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_local_lift_cholesky():
    """Check that `Cholesky` is lifted to an inplace `GpuCholesky` and
    that the GPU result matches the CPU reference."""
    A = fmatrix()
    o = slinalg.cholesky(A)
    f_cpu = aesara.function([A], o, mode=mode_without_gpu)
    f_gpu = aesara.function([A], o, mode=mode_with_gpu)
    assert not any(
        isinstance(n.op, slinalg.Cholesky) for n in f_gpu.maker.fgraph.apply_nodes
    )
    # GpuCholesky op in this graph should be inplace (as his input is not reused by other op).
    assert any(
        isinstance(n.op, GpuCholesky) and n.op.inplace
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # A = M.dot(M) will be positive definite for all non-singular M
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_gpu_cholesky_not_inplace():
    """When the Cholesky input is reused by another op, the lifted
    `GpuCholesky` must stay out-of-place."""
    A = fmatrix()
    A_squared = A**2
    # `A_squared` is reused below, so inplace factorization is forbidden.
    B = slinalg.cholesky(A_squared)
    D = B + A_squared
    f_cpu = aesara.function([A], D, mode=mode_without_gpu)
    f_gpu = aesara.function([A], D, mode=mode_with_gpu)
    not_inplace_nodes = [
        node
        for node in f_gpu.maker.fgraph.apply_nodes
        if isinstance(node.op, GpuCholesky) and not node.op.inplace
    ]
    assert len(not_inplace_nodes) == 1, len(not_inplace_nodes)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for any non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_local_gpua_advanced_incsubtensor():
    """Regression test for the advanced inc_subtensor lifter (gh-5589)."""
    # test a corner case reported at gh-5589
    target = ftensor4()
    y = target.dimshuffle(1, 0, 2, 3).flatten(ndim=1)
    w = at.ones_like(y)
    w = aesara.tensor.subtensor.set_subtensor(w[eq(y, 1.0).nonzero()], 100)
    w = aesara.tensor.subtensor.set_subtensor(w[eq(y, -1.0).nonzero()], 0)
    # NOTE(review): compiled with the default mode, unlike sibling tests
    # which pass mode=mode_with_gpu — confirm this is intentional.
    f = aesara.function([target], w)
    assert _check_stack_trace(f)
def test_batched_dot_lifter():
    """Check the `batched_dot` GPU lifter over mixed ranks and dtypes."""
    # The CPU Op accepts 2D and 3D inputs, as well as mixed dtypes.
    # Make sure the lifter adds the appropriate dimshuffles and casts
    rng = np.random.default_rng(utt.fetch_seed())

    def randX(*args):
        # Random array of the given shape in the configured float dtype.
        return rng.random(args).astype(config.floatX)

    cases = [
        (randX(3, 5, 7), randX(3, 7)),
        (randX(3, 5), randX(3, 5, 7)),
        (randX(3, 5), randX(3, 5)),
        (rng.random((3, 5, 7)).astype("float32"), randX(3, 7, 9)),
        (rng.random((3, 5, 7)).astype("float64"), randX(3, 7, 9)),
    ]
    for x_val, y_val in cases:
        x = TensorType(broadcastable=[s == 1 for s in x_val.shape], dtype=x_val.dtype)(
            "x"
        )
        y = TensorType(broadcastable=[s == 1 for s in y_val.shape], dtype=y_val.dtype)(
            "y"
        )
        z = batched_dot(x, y)
        f = aesara.function([x, y], z, mode=mode_with_gpu)
        f(x_val, y_val)
        # NOTE(review): this uses `check_stack_trace` (with ops_to_check)
        # rather than the `_check_stack_trace` helper used elsewhere in
        # this file — confirm both names are in scope and intended.
        assert check_stack_trace(f, ops_to_check="all")
def test_crossentropycategorical1hot_lifter():
    """The categorical cross-entropy op and its gradient must both be
    replaced by GPU equivalents after compilation."""
    rng = np.random.default_rng(utt.fetch_seed())
    probs = matrix()
    labels = lvector()
    loss = aesara.tensor.nnet.crossentropy_categorical_1hot(probs, labels)
    grad_probs = aesara.grad(loss.mean(), probs)
    fn = aesara.function([probs, labels], [loss, grad_probs], mode=mode_with_gpu)
    cpu_op_classes = (
        aesara.tensor.nnet.CrossentropyCategorical1Hot,
        aesara.tensor.nnet.CrossentropyCategorical1HotGrad,
    )
    assert not any(
        isinstance(node.op, cpu_op_classes)
        for node in fn.maker.fgraph.apply_nodes
    )
    fn(
        rng.uniform(0.1, 0.9, (13, 5)).astype(config.floatX),
        rng.integers(5, size=(13,)),
    )
class TestConv_opt:
    def optimizer_2d(
        self,
        input_shapes,
        direction,
        include_tags,
        exclude_tags,
        op,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
        optimiser=None,
    ):
        """Build a 2d abstract convolution and check the meta optimizer.

        Parameters
        ----------
        input_shapes
            Shapes of the two shared inputs; ``input_shapes[2]`` is the
            extra shape fed to the gradient directions.
        direction
            0 = forward conv, 1 = grad wrt weights, 2 = grad wrt inputs.
        include_tags, exclude_tags
            Values for ``metaopt__optimizer_including`` /
            ``metaopt__optimizer_excluding``.
        op
            Op class expected in the optimized graph, or ``None`` to assert
            that ``optimiser.transform`` performs no replacement.
        optimiser
            Local optimizer probed directly when ``op`` is ``None``.
        """
        inp1 = aesara.shared(np.random.random(input_shapes[0]).astype(config.floatX))
        inp2 = aesara.shared(np.random.random(input_shapes[1]).astype(config.floatX))
        if op is None:
            # Probe the optimizer directly, so place inputs on the GPU.
            inp1 = basic_ops.as_gpuarray_variable(inp1, test_ctx_name)
            inp2 = basic_ops.as_gpuarray_variable(inp2, test_ctx_name)
        if direction == 0:
            conv_op = abstract_conv.AbstractConv2d(
                input_shapes[0],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp1, inp2)
        if direction == 1:
            conv_op = abstract_conv.AbstractConv2d_gradWeights(
                imshp=input_shapes[0],
                kshp=input_shapes[2],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp1, inp2, input_shapes[2][-2:])
        if direction == 2:
            conv_op = abstract_conv.AbstractConv2d_gradInputs(
                imshp=input_shapes[2],
                kshp=input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp2, inp1, input_shapes[2][-2:])
        with config.change_flags(
            metaopt__optimizer_including=include_tags,
            metaopt__optimizer_excluding=exclude_tags,
        ):
            mode = (
                mode_with_gpu.including("conv_meta")
                .excluding("conv_dnn")
                .excluding("conv_gemm")
            )
            # All meta optimizer compile a new function. This need to know
            # the current linker, but this information is not available,
            # so it use the default mode.
            if op is None:
                # No convolutions optimization takes place
                assert optimiser.transform(None, conv_op.owner) is None
            else:
                ref_func = aesara.function([], conv_op, mode=mode_with_gpu)
                with config.change_flags(mode=mode):
                    conv_func = aesara.function([], conv_op, mode=mode)
                    assert any(
                        [
                            isinstance(node.op, op)
                            for node in conv_func.maker.fgraph.toposort()
                        ]
                    )
                    utt.assert_allclose(conv_func(), ref_func())
    def optimizer_3d(
        self,
        input_shapes,
        direction,
        include_tags,
        exclude_tags,
        op,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
        optimiser=None,
    ):
        """3d analogue of `optimizer_2d`.

        ``op`` may also be the string ``"conv3d2d"``, in which case the
        conv3d2d path is compiled and only checked for numerical agreement
        with the reference function.
        """
        inp1 = aesara.shared(np.random.random(input_shapes[0]).astype(config.floatX))
        inp2 = aesara.shared(np.random.random(input_shapes[1]).astype(config.floatX))
        if op is None:
            # Probe the optimizer directly, so place inputs on the GPU.
            inp1 = basic_ops.as_gpuarray_variable(inp1, None)
            inp2 = basic_ops.as_gpuarray_variable(inp2, None)
        if direction == 0:
            conv_op = abstract_conv.AbstractConv3d(
                input_shapes[0],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp1, inp2)
        if direction == 1:
            conv_op = abstract_conv.AbstractConv3d_gradWeights(
                input_shapes[0],
                input_shapes[2],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp1, inp2, input_shapes[2][-3:])
        if direction == 2:
            conv_op = abstract_conv.AbstractConv3d_gradInputs(
                input_shapes[2],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp2, inp1, input_shapes[2][-3:])
        with config.change_flags(
            metaopt__optimizer_including=include_tags,
            metaopt__optimizer_excluding=exclude_tags,
        ):
            mode = (
                mode_with_gpu.including("conv_meta")
                .excluding("conv_dnn")
                .excluding("conv_gemm")
            )
            # All meta optimizer compile a new function. This need to know
            # the current linker, but this information is not available,
            # so it use the default mode.
            if op is None:
                # No convolutions optimization takes place
                assert optimiser.transform(None, conv_op.owner) is None
                return
            elif op != "conv3d2d":
                with config.change_flags(mode=mode):
                    conv_func = aesara.function([], conv_op, mode=mode)
                assert any(
                    [
                        isinstance(node.op, op)
                        for node in conv_func.maker.fgraph.toposort()
                    ]
                )
            else:
                with config.change_flags(mode=mode):
                    conv_func = aesara.function(
                        [], conv_op, mode=mode_with_gpu.including("conv_meta")
                    )
            ref_func = aesara.function([], conv_op, mode=mode_with_gpu)
            utt.assert_allclose(conv_func(), ref_func())
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_2d(self):
        """Check the 2d meta-optimizer choice (GEMM vs cuDNN, default vs
        alternative) for all three conv directions."""
        imshp2d = [(2, 3, 5, 5), (2, 2, 5, 7), (2, 1, 3, 3)]
        kshp2d = [(4, 3, 3, 3), (3, 2, 3, 5), (4, 1, 1, 1)]
        tshp2d = [(2, 4, 3, 3), (2, 3, 3, 3), (2, 4, 3, 3)]
        for imshp, kshp, tshp in zip(imshp2d, kshp2d, tshp2d):
            # forward passes
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "conv_dnn:alternative", blas.GpuCorrMM
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM_gradWeights,
            )
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "conv_gemm:alternative", dnn.GpuDnnConv
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConvGradW,
            )
            # backwards wrt weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradWeights,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp], 1, "", "conv_gemm:alternative", dnn.GpuDnnConvGradW
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            # backwards wrt to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradInputs,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp], 2, "", "conv_gemm:alternative", dnn.GpuDnnConvGradI
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_3d(self):
        """Check the 3d meta-optimizer choice (GEMM vs cuDNN vs conv3d2d,
        default vs alternative) for all three conv directions."""
        imshp3d = [(2, 3, 5, 5, 5), (2, 2, 5, 7, 5), (2, 1, 3, 3, 3)]
        kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 5, 3), (4, 1, 1, 1, 1)]
        tshp3d = [(2, 4, 3, 3, 3), (2, 3, 3, 3, 3), (2, 4, 3, 3, 3)]
        for imshp, kshp, tshp in zip(imshp3d, kshp3d, tshp3d):
            # forwards passes
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_dnn:default:conv3d2d",
                blas.GpuCorr3dMM_gradWeights,
            )
            self.optimizer_3d([imshp, kshp, tshp], 0, "conv3d2d", "default", "conv3d2d")
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_gemm:default:conv3d2d",
                dnn.GpuDnnConvGradW,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
            )
            # backward pass wrt weight
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorr3dMM_gradWeights,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp], 1, "", "conv_gemm:alternative", dnn.GpuDnnConvGradW
            )
            # backward pass wrt inputs
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorr3dMM_gradInputs,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp], 2, "", "conv_gemm:alternative", dnn.GpuDnnConvGradI
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_non_default(self):
        """Check the meta-optimizer on non-default configurations:
        ``border_mode="full"`` with dilation, grouped convolutions, and
        unshared 2d convolutions."""
        # conv2d forward pass with Non-default border_mode and filter_dilation
        imshp2d = [(2, 3, 5, 5), (4, 2, 5, 5)]
        kshp2d = [(4, 3, 3, 3), (3, 2, 3, 3)]
        filter_dilation = [(1, 1), (2, 2)]
        for imshp, kshp, fdil in zip(imshp2d, kshp2d, filter_dilation):
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM_gradInputs,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConv,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConvGradI,
                border_mode="full",
                filter_dilation=fdil,
            )
        # conv3d forward pass with Non-default border_mode and filter_dilation
        imshp3d = [(2, 3, 5, 5, 5), (4, 2, 5, 5, 5)]
        kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 3, 3)]
        filter_dilation = [(1, 1, 1), (2, 2, 2)]
        for imshp, kshp, fdil in zip(imshp3d, kshp3d, filter_dilation):
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_dnn:default:conv3d2d",
                blas.GpuCorr3dMM_gradInputs,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_gemm:default:conv3d2d",
                dnn.GpuDnnConvGradI,
                border_mode="full",
                filter_dilation=fdil,
            )
        # test non default num_groups for default optimizers
        imshp2d = [(2, 6, 5, 5), (2, 4, 5, 5)]
        kshp2d = [(3, 2, 3, 3), (2, 2, 3, 3)]
        tshp2d = [(2, 3, 3, 3), (2, 2, 3, 3)]
        num_groups = [3, 2]
        for imshp, kshp, tshp, groups in zip(imshp2d, kshp2d, tshp2d, num_groups):
            # forward pass
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM,
                num_groups=groups,
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConv,
                num_groups=groups,
            )
            # grad with respect to weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradWeights,
                num_groups=groups,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConvGradW,
                num_groups=groups,
            )
            # grad with respect to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradInputs,
                num_groups=groups,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConvGradI,
                num_groups=groups,
            )
        # test unshared for default optimizers
        imshp2d = [(2, 2, 4, 4), (3, 2, 5, 3)]
        kshp2d = [(2, 2, 2, 2, 3, 3), (2, 3, 1, 2, 3, 3)]
        tshp2d = [(2, 2, 2, 2), (3, 2, 3, 1)]
        # NOTE(review): `groups` is never used inside this unshared loop;
        # zipping with `num_groups` here only limits the iteration count.
        for imshp, kshp, tshp, groups in zip(imshp2d, kshp2d, tshp2d, num_groups):
            # forward pass
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "alternative", blas.GpuCorrMM, unshared=True
            )
            # grad with respect to weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "alternative",
                blas.GpuCorrMM_gradWeights,
                unshared=True,
            )
            # grad with respect to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "alternative",
                blas.GpuCorrMM_gradInputs,
                unshared=True,
            )
        imshp3d = [(2, 6, 5, 5, 5), (2, 4, 5, 5, 5)]
        kshp3d = [(3, 2, 3, 3, 3), (2, 2, 3, 3, 3)]
        tshp3d = [(2, 3, 3, 3, 3), (2, 2, 3, 3, 3)]
        num_groups = [3, 2]
        for imshp, kshp, tshp, groups in zip(imshp3d, kshp3d, tshp3d, num_groups):
            # forward pass
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
                num_groups=groups,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
                num_groups=groups,
            )
            # grad with respect to weights
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM_gradWeights,
                num_groups=groups,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConvGradW,
                num_groups=groups,
            )
            # grad with respect to inputs
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM_gradInputs,
                num_groups=groups,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConvGradI,
                num_groups=groups,
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_returns_none_2d(self):
        """Check that the alternative 2d conv optimizers decline (return
        ``None``) on configurations they do not support: non-default
        subsample, num_groups, half padding, dilation (direction 1),
        and unshared convolutions."""
        # values given don't matter since it returns None
        imshp = (2, 3, 5, 5)
        kshp = (4, 3, 3, 3)
        tshp = (2, 4, 3, 3)
        conv_direction = [0, 1, 2]
        optimisers = [
            [opt.local_abstractconv_gemm_alt, opt.local_abstractconv_cudnn_alt],
            [
                opt.local_abstractconv_gemm_gradweights_alt,
                opt.local_abstractconv_cudnn_alt,
            ],
            [
                opt.local_abstractconv_gradinputs_gemm_alt,
                opt.local_abstractconv_cudnn_alt,
            ],
        ]
        # test that non default subsample returns None
        for opt_direction, direction in zip(optimisers, conv_direction):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    [imshp, kshp, tshp],
                    direction,
                    "",
                    "",
                    None,
                    subsample=(2, 2),
                    optimiser=optimiser,
                )
        # test that non default num_groups returns None
        for opt_direction, direction in zip(optimisers, conv_direction):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    [imshp, kshp, tshp],
                    direction,
                    "",
                    "",
                    None,
                    num_groups=3,
                    optimiser=optimiser,
                )
        # test that border_mode=half returns None
        for opt_direction, direction in zip(optimisers, conv_direction):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    [imshp, kshp, tshp],
                    direction,
                    "",
                    "",
                    None,
                    border_mode="half",
                    optimiser=optimiser,
                )
        # test that Non-default filter dilation return None for
        # direction 1
        for optimiser in optimisers[1]:
            self.optimizer_2d(
                [imshp, kshp, tshp],
                1,
                "",
                "",
                None,
                filter_dilation=(2, 2),
                optimiser=optimiser,
            )
        imshp = (2, 2, 4, 4)
        kshp = (2, 2, 2, 2, 3, 3)
        tshp = (2, 2, 2, 2)
        shape_perms = [[imshp, kshp, tshp], [imshp, tshp, kshp], [tshp, kshp, imshp]]
        # test unshared convolution returns None
        for opt_direction, direction, perms in zip(
            optimisers, conv_direction, shape_perms
        ):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    perms, direction, "", "", None, unshared=True, optimiser=optimiser
                )
@pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
def test_returns_none_3d(self):
imshp = (2, 3, 5, 5, 5)
kshp = (4, 3, 3, 3, 3)
tshp = (2, 4, 3, 3, 3)
conv_direction = [0, 1, 2]
optimisers = [
[opt.local_abstractconv3d_alt, opt.local_abstractconv3d_cudnn_alt],
[
opt.local_abstractconv3d_gemm_gradweights_alt,
opt.local_abstractconv3d_cudnn_alt,
],
[
opt.local_abstractconv3d_gradinputs_gemm_alt,
opt.local_abstractconv3d_cudnn_alt,
],
]
# test that non default subsample returns None
for opt_direction, direction in zip(optimisers, conv_direction):
for optimiser in opt_direction:
self.optimizer_3d(
[imshp, kshp, tshp],
direction,
"",
"",
None,
subsample=(2, 2, 2),
optimiser=optimiser,
)
# test that non default num_groups returns None
for opt_direction, direction in zip(optimisers, conv_direction):
for optimiser in opt_direction:
self.optimizer_3d(
[imshp, kshp, tshp],
direction,
"",
"",
None,
num_groups=3,
optimiser=optimiser,
)
# test that border_mode=half returns None
for opt_direction, direction in zip(optimisers, conv_direction):
for optimiser in opt_direction:
self.optimizer_3d(
[imshp, kshp, tshp],
direction,
"",
"",
None,
border_mode="half",
optimiser=optimiser,
)
# test that Non-default filter dilation return None for
# direction 1
for optimiser in optimisers[1]:
self.optimizer_3d(
[imshp, kshp, tshp],
1,
"",
"",
None,
filter_dilation=(2, 2, 2),
optimiser=optimiser,
)
import numpy as np
import pytest
pygpu = pytest.importorskip("pygpu")
from aesara.gpuarray.basic_ops import GpuFromHost, HostFromGpu
from aesara.gpuarray.type import (
GpuArraySharedVariable,
GpuArrayType,
get_context,
gpuarray_shared_constructor,
)
from aesara.misc.pkl_utils import dump, load
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.misc.test_may_share_memory import may_share_memory_core
from tests.tensor import test_opt
class TestFusion(test_opt.TestFusion):
    """Run the CPU elemwise-fusion test suite on the GPU backend."""

    # GPU compilation mode with the cuDNN reduction rewrite disabled.
    mode = mode_with_gpu.excluding("local_dnn_reduction")
    # Shared-variable constructor that places values on the GPU.
    _shared = staticmethod(gpuarray_shared_constructor)
    # Host<->GPU transfer ops are ignored when checking graph topology.
    topo_exclude = (GpuFromHost, HostFromGpu)
def test_may_share_memory():
    """Run the generic may_share_memory checks on two GPU buffers."""
    context = get_context(test_ctx_name)
    first = pygpu.empty((5, 4), context=context)
    second = pygpu.empty((5, 4), context=context)
    may_share_memory_core(first, second)
def test_dump_load():
    """A GPU shared variable must survive a dump/load round trip."""
    shared_var = GpuArraySharedVariable(
        "x",
        GpuArrayType("float32", (1, 1), name="x", context_name=test_ctx_name),
        [[1]],
        False,
    )

    with open("test", "wb") as f:
        dump(shared_var, f)

    with open("test", "rb") as f:
        restored = load(f)

    # Name and value must be preserved across serialization.
    assert restored.name == "x"
    np.testing.assert_allclose(restored.get_value(), [[1]])
"""
Some pickle test when pygpu isn't there. The test when pygpu is
available are in test_type.py.
This is needed as we skip all the test file when pygpu isn't there in
regular test file.
"""
import os
import sys
from pickle import Unpickler
import numpy as np
import pytest
from aesara.configdefaults import config
from aesara.gpuarray.type import ContextNotDefined
# Record whether pygpu is importable; used below to skip tests that only
# make sense when pygpu is absent.
try:
    import pygpu  # noqa: F401

    have_pygpu = True
except ImportError:
    have_pygpu = False
@pytest.mark.skip(reason="These tests relied on saved/versioned pickled files.")
@pytest.mark.skipif(have_pygpu, reason="pygpu active")
def test_unpickle_gpuarray_as_numpy_ndarray_flag1():
    """With ``experimental__unpickle_gpu_on_cpu`` disabled and pygpu missing,
    unpickling a GpuArray must fail rather than silently convert."""
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = False
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            u = Unpickler(fp, encoding="latin1")
            # Without the CPU fallback, loading the GPU array must raise.
            with pytest.raises((ImportError, ContextNotDefined)):
                u.load()
    finally:
        # Always restore the config flag for other tests.
        config.experimental__unpickle_gpu_on_cpu = oldflag
@pytest.mark.skip(reason="These tests relied on saved/versioned pickled files.")
def test_unpickle_gpuarray_as_numpy_ndarray_flag2():
    """With ``experimental__unpickle_gpu_on_cpu`` enabled, a pickled GpuArray
    should unpickle as a plain ``numpy.ndarray`` holding the original data."""
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = True
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            u = Unpickler(fp, encoding="latin1")
            # NOTE: an earlier version wrapped this in a try/except that
            # captured sys.exc_info() into unused locals and re-raised the
            # ImportError on every path (a Windows-workaround remnant).  An
            # ImportError here is a genuine failure, so just let it propagate.
            mat = u.load()
        assert isinstance(mat, np.ndarray)
        assert mat[0] == -42.0
    finally:
        # Always restore the config flag for other tests.
        config.experimental__unpickle_gpu_on_cpu = oldflag
import copy
import itertools
import numpy as np
import pytest
import aesara
from aesara import tensor as at
from aesara.gpuarray.pool import (
GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad,
GpuMaxPoolGrad,
GpuPool,
)
from aesara.gradient import Lop, Rop, grad
from aesara.tensor.signal.pool import (
AveragePoolGrad,
DownsampleFactorMaxGradGrad,
MaxPoolGrad,
Pool,
)
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.gpuarray.test_basic_ops import random
class TestPool:
    """Argument validation for GpuPool via the Python and C interfaces."""

    def test_pool_py_interface(self):
        data = aesara.shared(random(2, 2, 2, 2), "a")
        data = at.as_tensor_variable(data)
        with pytest.raises(ValueError):
            # pad >= ws is invalid
            pool_op = GpuPool(ignore_border=True, ndim=2)
            pool_op(data, [2, 2], pad=[3, 3])
        with pytest.raises(ValueError):
            # pad > 0 requires ignore_border=True
            pool_op = GpuPool(ignore_border=False, ndim=2)
            pool_op(data, [2, 2], pad=[1, 1])

    def test_pool_c_interface(self):
        compile_mode = mode_with_gpu.excluding("cudnn")
        compile_mode.check_py_code = False
        data = aesara.shared(random(2, 2, 2, 2), "a")
        data = at.as_tensor_variable(data)
        with pytest.raises(ValueError):
            # pad > 0 with ignore_border=False must also fail in the C path
            pool_op = GpuPool(ignore_border=False, ndim=2)
            padding = at.as_tensor_variable([1, 1])
            fn = aesara.function(
                [], pool_op(data, [2, 2], pad=padding), mode=compile_mode
            )
            fn()

    def test_pool_big_ws(self):
        # A window larger than the input must still compile and run.
        compile_mode = mode_with_gpu.excluding("cudnn")
        compile_mode.check_py_code = False
        data = aesara.shared(random(2, 2, 2, 2), "a")
        data = at.as_tensor_variable(data)
        pool_op = GpuPool(ignore_border=False, mode="average_exc_pad", ndim=2)
        padding = at.as_tensor_variable([0, 0])
        fn = aesara.function(
            [], pool_op(data, [5, 5], stride=[1, 1], pad=padding), mode=compile_mode
        )
        fn()
def test_pool2d():
    """Compare GpuPool (and its gradient ops) against the CPU Pool ops on a
    sweep of shapes, window sizes, strides, paddings and pooling modes.

    For max pooling, the R-op and grad-of-grad graphs are also compared.
    """
    shps = [
        (1, 12),
        (1, 1, 12),
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
        (3, 2, 16, 16, 16),
        (3, 2, 6, 6, 6, 5),
        (3, 2, 6, 6, 6, 5, 7),
    ]
    # Deterministic shuffle (seeded by unittest_tools) keeps runs
    # reproducible while varying the execution order across test sessions.
    np.random.default_rng(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    test_mode = ["max", "sum", "average_inc_pad", "average_exc_pad"]

    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False

    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Skip windows larger than the (last two) pooled dimensions.
            if ws[0] > shp[-2] or ws[1] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                # pad >= ws and average_exc_pad with padding are invalid.
                if pad[0] >= ws[0] or pad[1] >= ws[1]:
                    continue
                if mode == "average_exc_pad" and (pad[0] > 0 or pad[1] > 0):
                    continue
                # print('test_pool2d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode, ignore_border=ignore_border)

                a = aesara.shared(random(*shp), "a")
                a_pooled = ds_op(at.as_tensor_variable(a), ws, st, pad)

                # Forward pass: GPU op must be used on gpu_mode and the CPU
                # op on ref_mode; values must match.
                f = aesara.function([], a_pooled, mode=gpu_mode)
                f2 = aesara.function([], a_pooled, mode=ref_mode)

                assert any(
                    [isinstance(node.op, GpuPool) for node in f.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, Pool) for node in f2.maker.fgraph.toposort()]
                )
                assert np.allclose(f(), f2()), (shp, ws, st, pad, mode, ignore_border)

                # Gradient: the matching grad op must appear on each backend.
                a_pooled_grad = grad(a_pooled.sum(), a)

                g = aesara.function([], a_pooled_grad, mode=gpu_mode)
                g2 = aesara.function([], a_pooled_grad, mode=ref_mode)

                if mode == "max":
                    gop = GpuMaxPoolGrad
                    gop2 = MaxPoolGrad
                else:
                    gop = GpuAveragePoolGrad
                    gop2 = AveragePoolGrad
                assert any(
                    [isinstance(node.op, gop) for node in g.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, gop2) for node in g2.maker.fgraph.toposort()]
                )
                assert np.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)

                # test rop and grad grad for max pooling
                # for average pooling grad grad is just average pooling grad
                if mode != "max":
                    continue

                ea = aesara.shared(random(*shp), "ea")

                gr = aesara.function([], Rop(a_pooled, a, ea), mode=gpu_mode)
                gr2 = aesara.function([], Rop(a_pooled, a, ea), mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gr.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gr2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)

                ggf = Lop(grad((a_pooled**2).sum(), a), a, a)

                gg = aesara.function([], ggf, mode=gpu_mode)
                gg2 = aesara.function([], ggf, mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gg.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gg2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gg(), gg2()), (shp, ws, st, pad, mode, ignore_border)
def test_pool3d():
    """3d variant of test_pool2d: compare GpuPool (and its gradient ops)
    against the CPU Pool ops on a sweep of shapes, windows, strides,
    paddings and pooling modes.

    For max pooling, the R-op and grad-of-grad graphs are also compared.
    """
    shps = [
        (1, 1, 12),
        (1, 1, 1, 1, 1),
        (1, 1, 1, 1, 1025),
        (1, 1, 2, 2, 2),
        (1, 1, 7, 7, 7),
        (1, 1, 9, 10, 11),
        (1, 6, 18, 18, 18),
        (1, 1, 6, 24, 24),
        (1, 10, 1, 24, 24),
        (1, 10, 6, 24, 24),
        (1, 30, 6, 12, 12),
        (1, 30, 2, 24, 24),
        (1, 30, 6, 24, 24),
        (1, 10, 10, 10, 11),
        (1, 1, 10, 10, 1025),
        (1, 1, 10, 10, 1023),
        (1, 1, 10, 1025, 10),
        (1, 1, 10, 1023, 10),
        (3, 2, 6, 6, 6, 5),
        (3, 2, 6, 6, 6, 5, 7),
    ]
    # Deterministic shuffle (seeded by unittest_tools) keeps runs
    # reproducible while varying the execution order across test sessions.
    np.random.default_rng(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2, 2), (3, 2, 3), (1, 1, 1)
    test_st = (2, 2, 2), (2, 3, 2), (1, 1, 1)
    test_mode = ["max", "sum", "average_inc_pad", "average_exc_pad"]

    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False

    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Skip windows larger than the (last three) pooled dimensions.
            if ws[0] > shp[-3] or ws[1] > shp[-2] or ws[2] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1, 1), (0, 0, 0)]):
                # pad >= ws and average_exc_pad with padding are invalid.
                if pad[0] >= ws[0] or pad[1] >= ws[1] or pad[2] >= ws[2]:
                    continue
                if mode == "average_exc_pad" and (
                    pad[0] > 0 or pad[1] > 0 or pad[2] > 0
                ):
                    continue
                # print('test_pool3d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode, ignore_border=ignore_border)

                a = aesara.shared(random(*shp), "a")
                a_pooled = ds_op(at.as_tensor_variable(a), ws, st, pad)

                # Forward pass: GPU op must be used on gpu_mode and the CPU
                # op on ref_mode; values must match.
                f = aesara.function([], a_pooled, mode=gpu_mode)
                f2 = aesara.function([], a_pooled, mode=ref_mode)

                assert any(
                    [isinstance(node.op, GpuPool) for node in f.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, Pool) for node in f2.maker.fgraph.toposort()]
                )
                assert np.allclose(f(), f2()), (shp, ws, st, pad, mode, ignore_border)

                # Gradient: the matching grad op must appear on each backend.
                a_pooled_grad = grad(a_pooled.sum(), a)

                g = aesara.function([], a_pooled_grad, mode=gpu_mode)
                g2 = aesara.function([], a_pooled_grad, mode=ref_mode)

                if mode == "max":
                    gop = GpuMaxPoolGrad
                    gop2 = MaxPoolGrad
                else:
                    gop = GpuAveragePoolGrad
                    gop2 = AveragePoolGrad
                assert any(
                    [isinstance(node.op, gop) for node in g.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, gop2) for node in g2.maker.fgraph.toposort()]
                )
                assert np.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)

                # test rop and grad grad for max pooling
                # for average pooling grad grad is just average pooling grad
                if mode != "max":
                    continue

                ea = aesara.shared(random(*shp), "ea")

                gr = aesara.function([], Rop(a_pooled, a, ea), mode=gpu_mode)
                gr2 = aesara.function([], Rop(a_pooled, a, ea), mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gr.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gr2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)

                ggf = Lop(grad((a_pooled**2).sum(), a), a, a)

                gg = aesara.function([], ggf, mode=gpu_mode)
                gg2 = aesara.function([], ggf, mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gg.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gg2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gg(), gg2()), (shp, ws, st, pad, mode, ignore_border)
import math
import numpy as np
import pytest
import aesara
import aesara.tensor as at
from aesara.gpuarray import GpuArrayType
from aesara.gpuarray.dnn import GpuDnnReduction
from aesara.gpuarray.reduction import GpuMaxAndArgmax
from aesara.tensor.math import argmax
from aesara.tensor.math import max as at_max
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.gpuarray.test_basic_ops import rand_gpuarray
# Number of values to be used in test tensors (except with 0-D tensors!).
test_size = 10000

# NB: This order of "unsorted axes" is arbitrary and is here
# just to have the same information on profile output
# from one test to another.
unsorted_axes = (2, 4, 0, 3, 1)

# Seed the legacy global NumPy RNG from OS entropy: each run uses fresh data.
np.random.seed()
def numpy_random_array(shapes):
    """Return an array of the given shape, dtype ``floatX``, filled with
    standard-normal samples from the global NumPy RNG."""
    total = 1
    for extent in shapes:
        total *= extent
    flat = np.random.normal(size=total)
    return flat.astype(aesara.config.floatX).reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """NumPy reference for max+argmax over one or several axes.

    Mirrors ``MaxAndArgmax.perform()``: NumPy's argmax only handles a single
    axis, so the reduced axes are moved to the back, collapsed into one
    dimension, and argmax is taken along it.
    """
    if axis is None:
        axis = list(range(X.ndim))
    elif not isinstance(axis, (tuple, list)):
        axis = [int(axis)]
    # Deduplicate and normalize the axis list into a sorted tuple.
    axis = tuple(sorted(set(axis)))
    ref_max = np.max(X, axis=axis)
    # Non-reduced axes first, reduced axes last.
    kept = [dim for dim in range(X.ndim) if dim not in axis]
    rearranged = np.transpose(X, kept + list(axis))
    lead_shape = rearranged.shape[: len(kept)]
    # Collapse all reduced axes into a single trailing dimension.
    collapsed = 1
    for extent in rearranged.shape[len(kept) :]:
        collapsed *= extent
    flat = rearranged.reshape(lead_shape + (collapsed,))
    return (ref_max, np.argmax(flat, axis=-1))
def check_if_gpu_reduce_in_graph(aesara_function):
    """Assert that at least one GPU reduction op appears in the compiled graph."""
    gpu_reduce_ops = (GpuMaxAndArgmax, GpuDnnReduction)
    nodes = aesara_function.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, gpu_reduce_ops) for node in nodes)
def check_if_gpu_reduce_not_in_graph(aesara_function):
    """Assert that no GPU reduction op appears in the compiled graph."""
    gpu_reduce_ops = (GpuMaxAndArgmax, GpuDnnReduction)
    nodes = aesara_function.maker.fgraph.apply_nodes
    assert not any(isinstance(node.op, gpu_reduce_ops) for node in nodes)
class BaseTest:
    """Shared machinery for max+argmax reduction tests.

    Subclasses set ``tensor_size`` (the tensor rank) and optionally a fixed
    ``shape``.  Each test compiles the same max/argmax graph for CPU and GPU
    on identical random data and compares both against a NumPy reference.
    """

    # This attribute must be set in subclasses.
    tensor_size = None
    shape = None
    dtype = aesara.config.floatX

    def get_shape(self):
        # Spread roughly test_size elements evenly over the tensor's rank.
        if self.tensor_size == 0:
            return []
        return [
            int(math.ceil(math.pow(test_size, 1 / self.tensor_size)))
        ] * self.tensor_size

    def setup_method(self):
        # Skip subclasses that don't configure a supported rank.
        if not isinstance(self.tensor_size, int):
            pytest.skip("No tensor ndim defined.")
        if self.tensor_size < 0 or self.tensor_size > 5:
            pytest.skip(
                "We allow from 0 (included) to 5 (included) dimensons for these tests."
            )
        if self.shape is None:
            self.shape = self.get_shape()

    def get_host_tensor(self):
        # Symbolic CPU tensor of the configured rank.
        broadcastable = (False,) * self.tensor_size
        return at.tensor(self.dtype, broadcastable)

    def get_gpu_tensor(self):
        # Symbolic GPU tensor of the configured rank.
        broadcastable = (False,) * self.tensor_size
        return GpuArrayType(self.dtype, broadcastable)()

    def get_host_value(self):
        return numpy_random_array(self.shape)

    def get_gpu_value(self):
        return rand_gpuarray(*self.shape)

    # NB: In compute_host() and compute_gpu(),
    # the first call of the aesara function should be ignored in profiling,
    # with Aesara config flag profiling__ignore_first_call=True.
    def compute_host(self, test_tensor, axis):
        """Compile max/argmax on CPU, check the graph, compare to NumPy."""
        M = self.get_host_tensor()
        f = aesara.function(
            [M],
            [at_max(M, axis=axis), argmax(M, axis=axis)],
            name="shape:" + str(test_tensor.shape) + "/axis:" + str(axis) + "/HOST",
            mode=mode_without_gpu,
        )
        check_if_gpu_reduce_not_in_graph(f)
        # First (warm-up) call is deliberately discarded; see NB above.
        f(test_tensor)
        aesara_max, aesara_argmax = f(test_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
        utt.assert_allclose(ref_max, aesara_max)
        utt.assert_allclose(ref_argmax, aesara_argmax)

    def compute_gpu(self, test_gpu_tensor, test_host_tensor, axis):
        """Compile max/argmax on GPU, check the graph, compare to NumPy."""
        M = self.get_gpu_tensor()
        f = aesara.function(
            [M],
            [at_max(M, axis=axis), argmax(M, axis=axis)],
            name="shape:" + str(test_gpu_tensor.shape) + "/axis:" + str(axis) + "/GPU",
            mode=mode_with_gpu,
        )
        check_if_gpu_reduce_in_graph(f)
        # First (warm-up) call is deliberately discarded; see NB above.
        f(test_gpu_tensor)
        aesara_max, aesara_argmax = f(test_gpu_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
        utt.assert_allclose(ref_max, aesara_max)
        utt.assert_allclose(ref_argmax, aesara_argmax)

    def compute(self, axis=None):
        # We want to run CPU op and GPU op on the same tensor randomly generated.
        test_gpu_tensor = self.get_gpu_value()
        test_host_tensor = np.asarray(test_gpu_tensor)
        self.compute_host(test_host_tensor, axis)
        self.compute_gpu(test_gpu_tensor, test_host_tensor, axis)

    def compute_axis(self, pos):
        # Only run when the axis is valid for this rank (vectors excluded).
        if self.tensor_size != 1 and 0 <= pos < self.tensor_size:
            self.compute(pos)

    def compute_some_axes(self, count):
        # Reduce over `count` axes taken from the arbitrary unsorted order.
        if 0 <= count < self.tensor_size:
            self.compute([i for i in unsorted_axes if i < self.tensor_size][:count])

    # Equivalent to test reduction on all axes.
    def test_none(self):
        self.compute(None)

    def test_axis_1(self):
        self.compute_axis(0)

    def test_axis_2(self):
        self.compute_axis(1)

    def test_axis_3(self):
        self.compute_axis(2)

    def test_axis_4(self):
        self.compute_axis(3)

    def test_axis_5(self):
        self.compute_axis(4)

    # For the tests below, we expect CPU op to run with Python implementation.
    def test_2_axes(self):
        self.compute_some_axes(2)

    def test_3_axes(self):
        self.compute_some_axes(3)

    def test_4_axes(self):
        self.compute_some_axes(4)
class TestScalar(BaseTest):
    """Reduction of a 0-d tensor (scalar)."""

    tensor_size = 0
class TestVector(BaseTest):
    """Reduction of a 1-d tensor (vector)."""

    tensor_size = 1
# Special case
class TestRow(BaseTest):
    """Matrix with a broadcast-like first dimension of size 1 (row)."""

    tensor_size = 2
    shape = [1, test_size]
# Special case
class TestColumn(BaseTest):
    """Matrix with a broadcast-like second dimension of size 1 (column)."""

    tensor_size = 2
    shape = [test_size, 1]
class TestMatrix(BaseTest):
    """Reduction of a 2-d tensor (matrix)."""

    tensor_size = 2
class TestTensor5(BaseTest):
    """Reduction of a 5-d tensor (the maximum supported rank)."""

    tensor_size = 5
import functools
import numpy as np
import aesara
from aesara import tensor as at
from aesara.configdefaults import config
from aesara.gpuarray.rng_mrg import GPUA_mrg_uniform
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.sandbox import rng_mrg
from aesara.sandbox.rng_mrg import MRG_RandomStream
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu as mode
from tests.sandbox.test_rng_mrg import java_samples, rng_mrg_overflow
from tests.sandbox.test_rng_mrg import test_f16_nonzero as cpu_f16_nonzero
def test_consistency_GPUA_serial():
    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    # MRG31k3p state: six 32-bit integers.
    curr_rstate = np.array([seed] * 6, dtype="int32")

    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            # One substream: draw one sample at a time, n_samples times.
            substream_rstate = np.array([stream_rstate.copy()], dtype="int32")
            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)

            new_rstate, sample = GPUA_mrg_uniform.new(
                rstate, ndim=None, dtype="float32", size=(1,)
            )
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStream' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = at.as_tensor_variable(sample)
            f = aesara.function([], cpu_sample, mode=mode)
            for k in range(n_samples):
                s = f()
                samples.append(s)

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    # Compare against the reference values from the Java implementation.
    assert np.allclose(samples, java_samples)
def test_consistency_GPUA_parallel():
    # Verify that the random numbers generated by GPUA_mrg_uniform, in
    # parallel, are the same as the reference (Java) implementation by
    # L'Ecuyer et al.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    curr_rstate = np.array([seed] * 6, dtype="int32")

    for i in range(n_streams):
        stream_samples = []
        # Build the states for all substreams of this stream at once,
        # so the GPU op draws n_substreams values per call.
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = np.asarray(rstate)
        rstate = gpuarray_shared_constructor(rstate)

        new_rstate, sample = GPUA_mrg_uniform.new(
            rstate, ndim=None, dtype="float32", size=(n_substreams,)
        )
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStream' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = at.as_tensor_variable(sample)
        f = aesara.function([], cpu_sample, mode=mode)

        for k in range(n_samples):
            s = f()
            stream_samples.append(s)

        # Transpose so the flattened order matches the serial reference.
        samples.append(np.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert np.allclose(samples, java_samples)
def test_GPUA_full_fill():
    # Make sure the whole sample buffer is filled. Also make sure
    # large samples are consistent with CPU results.

    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)

    R = MRG_RandomStream(234)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = aesara.function([], uni)

    # Re-create the same generator state on the GPU and draw the same sample.
    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
    new_rstate, sample = GPUA_mrg_uniform.new(
        rstate_gpu, ndim=None, dtype="float32", size=size
    )
    rstate_gpu.default_update = new_rstate
    f_gpu = aesara.function([], sample, mode=mode)

    utt.assert_allclose(f_cpu(), f_gpu())
def test_overflow_gpu_new_backend():
    """GPUA_mrg_uniform.new must raise for sample sizes whose element count
    overflows, and accept reasonable sizes — including int32 ones."""
    seed = 12345
    n_substreams = 7
    base_state = np.array([seed] * 6, dtype="int32")
    states = [base_state.copy()]
    for _ in range(1, n_substreams):
        states.append(rng_mrg.ff_2p72(states[-1]))
    shared_state = gpuarray_shared_constructor(np.asarray(states))
    fct = functools.partial(
        GPUA_mrg_uniform.new, shared_state, ndim=None, dtype="float32"
    )
    # (list of sizes, whether an overflow error is expected)
    cases = [
        # should raise error as the size overflows
        ([(2**31,), (2**32,), (2**15, 2**16), (2, 2**15, 2**15)], True),
        # should not raise error
        ([(2**5,), (2**5, 2**5), (2**5, 2**5, 2**5)], False),
        # should support int32 sizes
        (
            [(np.int32(2**10),), (np.int32(2), np.int32(2**10), np.int32(2**10))],
            False,
        ),
    ]
    for sizes, expect_error in cases:
        rng_mrg_overflow(sizes, fct, mode, should_raise_error=expect_error)
def test_validate_input_types_gpuarray_backend():
    """mrg_uniform.new must accept a GPU-backed rstate when
    compute_test_value is set to 'raise'."""
    with config.change_flags(compute_test_value="raise"):
        state = gpuarray_shared_constructor(np.zeros((7, 6), dtype="int32"))
        rng_mrg.mrg_uniform.new(state, ndim=None, dtype="float32", size=(3,))
def test_f16_nonzero():
    """Run the CPU float16 nonzero test against the GPU uniform sampler."""
    try:
        # To have aesara.shared(x) try to move on the GPU
        aesara.compile.shared_constructor(gpuarray_shared_constructor)
        cpu_f16_nonzero(mode=mode, op_to_check=GPUA_mrg_uniform)
    finally:
        # Always unregister so other tests see the default constructor again.
        aesara.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
def test_cpu_target_with_shared_variable():
    """uniform(target="cpu") must stay on the CPU even when the shared
    inputs live on the GPU."""
    srng = MRG_RandomStream()
    s = np.random.rand(2, 3).astype("float32")
    x = gpuarray_shared_constructor(s, name="x")
    try:
        # To have aesara.shared(x) try to move on the GPU
        aesara.compile.shared_constructor(gpuarray_shared_constructor)

        y = srng.uniform(x.shape, target="cpu")
        y.name = "y"

        z = (x * y).sum()
        z.name = "z"

        fz = aesara.function([], z, mode=mode)

        # The sampling op itself must not have been moved to the GPU.
        nodes = fz.maker.fgraph.toposort()
        assert not any(isinstance(node.op, GPUA_mrg_uniform) for node in nodes)
    finally:
        # Always unregister so other tests see the default constructor again.
        aesara.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
import numpy as np
import pytest
import aesara
import aesara.sandbox.rng_mrg
from aesara import gpuarray
from aesara import tensor as at
from aesara.gpuarray.basic_ops import GpuFromHost, HostFromGpu
from aesara.gpuarray.elemwise import GpuElemwise
from aesara.scan.basic import scan
from aesara.scan.checkpoints import scan_checkpoints
from aesara.scan.op import Scan
from aesara.tensor.math import dot
from aesara.tensor.math import sum as at_sum
from aesara.tensor.type import fscalar, ftensor3, fvector, iscalar, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
# NOTE: this previously read importorskip("pygpy.gpuarray") — a typo that
# made pytest skip this whole file unconditionally, even with pygpu installed.
pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")

GpuArrayException = pygpu_gpuarray.GpuArrayException
# Use an optimizing compilation mode even when the configured default is
# FAST_COMPILE.
if aesara.config.mode == "FAST_COMPILE":
    mode_with_opt = aesara.compile.mode.get_mode("FAST_RUN")
else:
    mode_with_opt = aesara.compile.mode.get_default_mode()
# DebugMode is too slow for some of these graphs; fall back to FAST_RUN.
if aesara.config.mode in ("DEBUG_MODE", "DebugMode"):
    mode_nodebug = aesara.compile.mode.get_mode("FAST_RUN")
else:
    mode_nodebug = mode_with_opt
class TestScan:
    """Scan graphs compiled for the GPU: host<->GPU transfer placement and
    mixed-dtype output handling.

    BUG FIX: the graph checks used ``isinstance(node.op, scan.op.Scan)``,
    but ``scan`` here is the scan-building *function* (imported from
    ``aesara.scan.basic``), which has no ``op`` attribute — those checks
    raised AttributeError at runtime.  They now use the ``Scan`` op class
    imported from ``aesara.scan.op``.
    """

    def test_one_sequence_one_output_weights_gpu1(self):
        # Simple RNN step: x_t = u_t * W_in + x_{t-1} * W
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")

        # Excluding InputToGpuOptimizer exercises the first case of the
        # scan GPU optimizer (explicit GpuFromHost on the output below).
        mode = mode_with_gpu.excluding("InputToGpuOptimizer")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode,
        )

        output = GpuFromHost(test_ctx_name)(output)
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        v_u = np.asarray(v_u, dtype="float32")
        v_x0 = np.asarray(v_x0, dtype="float32")
        W = np.asarray(W, dtype="float32")
        W_in = np.asarray(W_in, dtype="float32")

        # compute the expected output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        topo = f2.maker.fgraph.toposort()
        assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 0
        assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    # This second version tests the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")

        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )

        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the expected output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        topo = f2.maker.fgraph.toposort()
        assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 1
        assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W, at.cast(u_t + x_tm1, "int64"))

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")

        output, updates = scan(
            f_rnn,
            u,
            [x0, None],
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )

        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the expected outputs in numpy
        v_out1 = np.zeros((4,))
        v_out2 = np.zeros((4,), dtype="int64")
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in range(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])

        aesara_out1, aesara_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_out1, v_out1)
        utt.assert_allclose(aesara_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        # The scan op itself must have been moved to the GPU.
        assert scan_node.op.gpua

        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    def test_gpu4_gibbs_chain(self):
        """A 10-step binomial Gibbs-style chain must compile and run on GPU."""
        rng = np.random.default_rng(utt.fetch_seed())
        v_vsample = np.array(
            rng.binomial(
                1,
                0.5,
                size=(3, 20),
            ),
            dtype="float32",
        )
        vsample = aesara.shared(v_vsample)
        trng = aesara.sandbox.rng_mrg.MRG_RandomStream(utt.fetch_seed())

        def f(vsample_tm1):
            return (
                trng.binomial(vsample_tm1.shape, n=1, p=0.3, dtype="float32")
                * vsample_tm1
            )

        aesara_vsamples, updates = scan(
            f,
            [],
            vsample,
            [],
            n_steps=10,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )
        my_f = aesara.function(
            [],
            aesara_vsamples[-1],
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # I leave this to tested by debugmode, this test was anyway
        # more of does the graph compile kind of test
        my_f()
class ScanGpuTests:
    """
    This class defines a number of tests for Scan on GPU as well as a few
    helper functions for these tests. The GPU tests defined in this class are
    independent of the GPU backend used. Because of this, a class inheriting
    from ScanGpuTests should define the following attributes and methods to
    make the tests run on a specific backend :
    - self.gpu_backend : Reference to the backend module
    - self.mode_with_opt : Compilation mode to force usage of the gpu backend
    - self.is_scan_on_gpu(node) : Method to determine is a scan node has been
      moved to run on a gpu under the specific
      backend. Returns a boolean.
    """

    def test_one_sequence_one_output_weights_gpu1(self):
        """First case of the GPU transfer optimizer: the scan output is
        explicitly moved to the GPU via ``gpu_from_host``."""
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W
        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        # The following line is needed to have the first case being used
        # Otherwise, it is the second that is tested.
        mode = self.mode_with_gpu.excluding("InputToGpuOptimizer")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode,
        )
        output = self.gpu_backend.gpu_from_host(output)
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()
        v_u = np.asarray(v_u, dtype="float32")
        v_x0 = np.asarray(v_x0, dtype="float32")
        W = np.asarray(W, dtype="float32")
        W_in = np.asarray(W_in, dtype="float32")
        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W
        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)
        # TODO: remove — this toposort/scan-node lookup is duplicated below.
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        topo = f2.maker.fgraph.toposort()
        # No transfer back to the host in the outer graph; exactly one
        # GpuFromHost per input.
        assert (
            sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo])
            == 0
        )
        assert (
            sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo])
            == 4
        )
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()
        # check that there is no gpu transfer in the inner loop.
        assert any(
            [
                isinstance(node.op, self.gpu_backend.GpuElemwise)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.HostFromGpu)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.GpuFromHost)
                for node in scan_node_topo
            ]
        )

    # This second version test the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        """Second case of the GPU transfer optimizer: inputs are moved to
        the GPU by the backend's input-transfer optimization, not by an
        explicit ``gpu_from_host`` call."""
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W
        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()
        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W
        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)
        topo = f2.maker.fgraph.toposort()
        # Exactly one transfer back to the host (the final output), and one
        # GpuFromHost per input.
        assert (
            sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo])
            == 1
        )
        assert (
            sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo])
            == 4
        )
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()
        # check that there is no gpu transfer in the inner loop.
        assert any(
            [
                isinstance(node.op, self.gpu_backend.GpuElemwise)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.HostFromGpu)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.GpuFromHost)
                for node in scan_node_topo
            ]
        )

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        """Scan with float32 and int64 outputs must still be moved to the
        GPU by the backend (checked via ``self.is_scan_on_gpu``)."""
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W, at.cast(u_t + x_tm1, "int64"))
        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            [x0, None],
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()
        # compute the output in numpy
        v_out1 = np.zeros((4,))
        v_out2 = np.zeros((4,), dtype="int64")
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in range(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])
        aesara_out1, aesara_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_out1, v_out1)
        utt.assert_allclose(aesara_out2, v_out2)
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        assert self.is_scan_on_gpu(scan_node)

    def test_gibbs_chain(self):
        """Smoke test: a 10-step Gibbs-style sampling scan with MRG random
        numbers compiles and runs under the GPU mode."""
        rng = np.random.default_rng(utt.fetch_seed())
        v_vsample = np.array(
            rng.binomial(
                1,
                0.5,
                size=(3, 20),
            ),
            dtype="float32",
        )
        vsample = aesara.shared(v_vsample)
        trng = aesara.sandbox.rng_mrg.MRG_RandomStream(utt.fetch_seed())
        def f(vsample_tm1):
            return (
                trng.binomial(vsample_tm1.shape, n=1, p=0.3, dtype="float32")
                * vsample_tm1
            )
        aesara_vsamples, updates = scan(
            f,
            [],
            vsample,
            [],
            n_steps=10,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        my_f = aesara.function(
            [],
            aesara_vsamples[-1],
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # Numerical checking is left to DebugMode; this test was anyway more
        # of a "does the graph compile" kind of test.
        my_f()

    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined aesara
        # function is reasonable when executed on the GPU. It checks for
        # a bug in which one of scan's optimization was not applied which
        # made the scan node compute large and unnecessary outputs which
        # brought memory usage on the GPU to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000
        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200
        # Define input variables
        xin = ftensor3(name="xin")
        yout = ftensor3(name="yout")
        # Initialize the network parameters
        U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"), name="W_xin_to_l1")
        V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"), name="W_l1_to_l1")
        W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"), name="W_l1_to_l2")
        nparams = [U, V, W]
        # Build the forward pass
        l1_base = dot(xin, U)
        def scan_l(baseline, last_step):
            return baseline + dot(last_step, V)
        zero_output = at.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid)
        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )
        l2_out = dot(l1_out, W)
        # Compute the cost and take the gradient wrt params
        cost = at_sum((l2_out - yout) ** 2)
        grads = aesara.grad(cost, nparams)
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))
        # Compile the aesara function
        feval_backprop = aesara.function(
            [xin, yout], cost, updates=updates, mode=self.mode_with_gpu_nodebug
        )
        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]
        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)
        # Call the aesara function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )

    def test_memory_reuse_gpudimshuffle(self):
        # Test the memory pre-allocation feature in scan when one output is
        # the result of a GpuDimshuffle (because an optimization in
        # GpuDimshuffle can cause issues with the memory pre-allocation
        # where it falsely thinks that a pre-allocated memory region has
        # been used when it hasn't).
        def inner_fn(seq1, recurrent_out):
            temp = seq1 + recurrent_out.sum()
            output1 = temp.dimshuffle(1, 0)
            output2 = temp.sum() + recurrent_out
            return output1, output2
        input1 = ftensor3()
        init = ftensor3()
        outputs_info = [None, init]
        out, _ = scan(
            inner_fn,
            sequences=[input1],
            outputs_info=outputs_info,
            mode=self.mode_with_gpu,
        )
        out1 = out[0].flatten()
        out2 = out[1].flatten()
        fct = aesara.function([input1, init], [out1, out2], mode=self.mode_with_gpu)
        output = fct(
            np.ones((2, 1, 1), dtype="float32"), np.ones((1, 1, 1), dtype="float32")
        )
        # Hand-computed expected values for the all-ones inputs above.
        expected_output = (
            np.array([2, 4], dtype="float32"),
            np.array([3, 7], dtype="float32"),
        )
        utt.assert_allclose(output, expected_output)
class TestScanGpuarray(ScanGpuTests):
    """
    This class takes the gpu tests for scan that are defined in
    class ScanGpuTests and runs them using the gpuarray backend.
    """

    def setup_method(self):
        # Bind the backend module so the tests in ScanGpuTests stay
        # backend-agnostic.
        self.gpu_backend = gpuarray

        # This is unfortunate, but required
        def gpu_from_host(v):
            # None selects the default GPU context.
            return gpuarray.GpuFromHost(None)(v)

        self.gpu_backend.gpu_from_host = gpu_from_host
        self.mode_with_gpu = mode_with_opt.including("gpuarray", "scan")
        self.mode_with_gpu_nodebug = mode_nodebug.including("gpuarray", "scan")
        # Skip the test if pygpu is not available
        if not self.gpu_backend.pygpu_activated:
            pytest.skip("Optional package pygpu disabled")

    def is_scan_on_gpu(self, node):
        # Scan ops moved to the gpuarray backend carry a "gpua" flag in
        # their info dict.
        return node.op.info.get("gpua", False)
class TestScanCheckpoint:
    """Compare a plain ``scan`` against ``scan_checkpoints`` (which saves
    intermediate states only every ``save_every_N`` steps, trading compute
    for memory) on the same cumulative-product recurrence."""

    def setup_method(self):
        self.k = iscalar("k")
        self.A = vector("A")
        result, _ = scan(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=at.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
        )
        result_check, _ = scan_checkpoints(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=at.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
            save_every_N=100,
        )
        self.result = result[-1]
        self.result_check = result_check[-1]
        self.grad_A = aesara.grad(self.result.sum(), self.A)
        self.grad_A_check = aesara.grad(self.result_check.sum(), self.A)

    def test_memory(self):
        """The checkpointed scan's gradient must run on an input sized so
        that the un-checkpointed gradient exhausts free GPU memory."""
        from tests.gpuarray.config import mode_with_gpu  # noqa

        f = aesara.function(
            inputs=[self.A, self.k], outputs=self.grad_A, mode=mode_with_gpu
        )
        f_check = aesara.function(
            inputs=[self.A, self.k], outputs=self.grad_A_check, mode=mode_with_gpu
        )
        free_gmem = aesara.gpuarray.type._context_reg[None].free_gmem
        # Input sized relative to free GPU memory so that storing every
        # intermediate state does not fit but the checkpointed version does.
        data = np.ones(free_gmem // 3000, dtype=np.float32)
        # Check that it works with the checkpoints
        size = 1000
        if isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
            size = 100
        f_check(data, size)
        # Check that the basic scan fails in that case
        # Skip that check in DebugMode, as it can fail in different ways
        if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
            with pytest.raises(GpuArrayException):
                f(data, 1000)
from aesara.gpuarray.sort import GpuTopKOp
from tests.gpuarray.config import mode_with_gpu
from tests.tensor.test_sort import TestTopK
class TestGpuTopK(TestTopK):
    """Run the generic top-k test suite against the gpuarray backend op."""

    op_class = GpuTopKOp
    dtype = "float32"
    mode = mode_with_gpu
import numpy as np
import aesara
from aesara.compile import DeepCopyOp
from aesara.gpuarray.basic_ops import GpuContiguous, GpuFromHost, HostFromGpu
from aesara.gpuarray.elemwise import GpuDimShuffle
from aesara.gpuarray.subtensor import (
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedSubtensor,
GpuAdvancedSubtensor1,
GpuAllocDiag,
GpuExtractDiag,
GpuIncSubtensor,
GpuSubtensor,
)
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.basic import AllocDiag, ExtractDiag
from aesara.tensor.math import sum as at_sum
from aesara.tensor.subtensor import advanced_inc_subtensor1, inc_subtensor
from aesara.tensor.type import ivectors, matrix, tensor, tensor4, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.test_basic import TestAllocDiag
from tests.tensor.test_subtensor import TestAdvancedSubtensor, TestSubtensor
class TestGPUSubtensor(TestSubtensor):
    """Run the generic Subtensor test suite with the gpuarray ops (float32)."""

    def setup_method(self):
        def shared(x, **kwargs):
            # Force shared variables onto the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)

        self.shared = shared
        self.sub = GpuSubtensor
        self.inc_sub = GpuIncSubtensor
        self.adv_sub1 = GpuAdvancedSubtensor1
        self.adv_incsub1 = GpuAdvancedIncSubtensor1
        self.adv_sub = GpuAdvancedSubtensor
        self.dimshuffle = GpuDimShuffle
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float32"
        # Transfer/copy ops the base suite should not count as graph content.
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp, GpuContiguous)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
        super().setup_method()
class TestGPUSubtensorF16(TestSubtensor):
    """Same as TestGPUSubtensor but exercising the float16 path."""

    def setup_method(self):
        def shared(x, **kwargs):
            # Force shared variables onto the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)

        self.shared = shared
        self.sub = GpuSubtensor
        self.inc_sub = GpuIncSubtensor
        self.adv_sub1 = GpuAdvancedSubtensor1
        self.adv_incsub1 = GpuAdvancedIncSubtensor1
        self.adv_sub = GpuAdvancedSubtensor
        self.dimshuffle = GpuDimShuffle
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float16"  # use floatX?
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp, GpuContiguous)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
        super().setup_method()
def test_advinc_subtensor1():
    # Test the second case in the opt local_gpu_advanced_incsubtensor1
    for shape in [(3, 3), (3, 3, 3)]:
        base = np.arange(np.prod(shape), dtype="float32").reshape(shape) + 1
        inc = np.full((2,) + shape[1:], 10, dtype="float32")
        x = gpuarray_shared_constructor(base, name="x")
        y = tensor(dtype="float32", broadcastable=(False,) * len(shape), name="y")
        f = aesara.function(
            [y], advanced_inc_subtensor1(x, y, [0, 2]), mode=mode_with_gpu
        )
        # Exactly one GPU advanced-inc-subtensor op in the compiled graph.
        n_gpu_ops = sum(
            isinstance(node.op, GpuAdvancedIncSubtensor1)
            for node in f.maker.fgraph.toposort()
        )
        assert n_gpu_ops == 1
        # Compare against NumPy's buffered in-place add on rows 0 and 2.
        expected = base.copy()
        np.add.at(expected, [0, 2], inc)
        assert np.allclose(f(inc), expected)
def test_advinc_subtensor1_dtype():
    # Test the mixed dtype case
    shp = (3, 4)
    # (target dtype, increment dtype) pairs that should all hit the
    # dev20 GPU op.
    for dtype1, dtype2 in [
        ("float32", "int8"),
        ("float32", "float64"),
        ("uint64", "int8"),
        ("int64", "uint8"),
        ("float16", "int8"),
        ("float16", "float64"),
        ("float16", "float16"),
    ]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        yval = np.empty((2,) + shp[1:], dtype=dtype2)
        yval[:] = 10
        x = shared(xval, name="x")
        y = tensor(dtype=yval.dtype, broadcastable=(False,) * len(yval.shape), name="y")
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        assert (
            sum(
                [
                    isinstance(node.op, GpuAdvancedIncSubtensor1_dev20)
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        # Reference: NumPy's buffered in-place add at rows 0 and 2.
        rep = xval.copy()
        np.add.at(rep, [[0, 2]], yval)
        assert np.allclose(rval, rep)
@aesara.config.change_flags(deterministic="more")
def test_deterministic_flag():
    """With ``deterministic="more"`` the plain GpuAdvancedIncSubtensor1 must
    be selected instead of the dev20 variant that the same dtype pair gets
    in test_advinc_subtensor1_dtype (presumably the dev20 op is
    non-deterministic — TODO confirm)."""
    shp = (3, 4)
    for dtype1, dtype2 in [("float32", "int8")]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        yval = np.empty((2,) + shp[1:], dtype=dtype2)
        yval[:] = 10
        x = shared(xval, name="x")
        y = tensor(dtype=yval.dtype, broadcastable=(False,) * len(yval.shape), name="y")
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        assert (
            sum(
                [
                    isinstance(node.op, GpuAdvancedIncSubtensor1)
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        rep = xval.copy()
        np.add.at(rep, [[0, 2]], yval)
        assert np.allclose(rval, rep)
def test_advinc_subtensor1_vector_scalar():
    # Test the case where x is a vector and y a scalar
    shp = (3,)
    for dtype1, dtype2 in [
        ("float32", "int8"),
        ("float32", "float64"),
        ("float16", "int8"),
        ("float16", "float64"),
        ("float16", "float16"),
        ("int8", "int8"),
        ("int16", "int16"),
    ]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        # 0-d increment; len(yval.shape) == 0 so y is a scalar tensor.
        yval = np.asarray(10, dtype=dtype2)
        x = shared(xval, name="x")
        y = tensor(dtype=yval.dtype, broadcastable=(False,) * len(yval.shape), name="y")
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        # Either GPU variant is acceptable here, but exactly one must appear.
        assert (
            sum(
                [
                    isinstance(
                        node.op,
                        (GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1),
                    )
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        rep = xval.copy()
        rep[[0, 2]] += yval
        assert np.allclose(rval, rep)
def test_incsub_f16():
    """float16 increments through both advanced_inc_subtensor1 and plain
    inc_subtensor must use the GPU ops and match NumPy references."""
    shp = (3, 3)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype="float16").reshape(shp) + 1
    yval = np.empty((2,) + shp[1:], dtype="float16")
    yval[:] = 2
    x = shared(xval, name="x")
    y = tensor(dtype="float16", broadcastable=(False,) * len(shp), name="y")
    # Advanced increment on rows 0 and 2.
    expr = advanced_inc_subtensor1(x, y, [0, 2])
    f = aesara.function([y], expr, mode=mode_with_gpu)
    assert (
        sum(
            [
                isinstance(node.op, GpuAdvancedIncSubtensor1)
                for node in f.maker.fgraph.toposort()
            ]
        )
        == 1
    )
    rval = f(yval)
    rep = xval.copy()
    np.add.at(rep, [[0, 2]], yval)
    assert np.allclose(rval, rep)
    # Basic slice increment on rows 1:.
    expr = inc_subtensor(x[1:], y)
    f = aesara.function([y], expr, mode=mode_with_gpu)
    assert (
        sum(
            [isinstance(node.op, GpuIncSubtensor) for node in f.maker.fgraph.toposort()]
        )
        == 1
    )
    rval = f(yval)
    rep = xval.copy()
    rep[1:] += yval
    assert np.allclose(rval, rep)
def test_incsub_offset():
    """Regression test for https://github.com/Theano/Theano/issues/5670."""
    floatX = aesara.config.floatX
    # A GPU shared variable whose slice ``view`` starts at a non-zero offset.
    base = gpuarray_shared_constructor(np.zeros(5, dtype=floatX))
    view = base[1:]
    # Increment a sub-slice of the offset view.
    inc = vector()
    result = inc_subtensor(view[2:], inc)
    # Route the result through ``updates`` so inc_subtensor may run inplace.
    fn = aesara.function([inc], result, updates={base: result}, mode=mode_with_gpu)
    utt.assert_allclose(fn([1, 2]), np.array([0, 0, 1, 2], dtype=floatX))
class TestGPUAdvancedSubtensor(TestAdvancedSubtensor):
    """Run the generic advanced-subtensor suite with the GPU ops (float32)."""

    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.sub = GpuAdvancedSubtensor
        self.inc_sub = GpuAdvancedIncSubtensor
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float32"  # floatX?
        # Transfer/copy ops the base suite should ignore in graph checks.
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
        super().setup_method()
class TestGPUAdvancedSubtensorF16(TestAdvancedSubtensor):
    """Same as TestGPUAdvancedSubtensor but on the float16 path (read-only:
    no ``inc_sub`` is configured here)."""

    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.sub = GpuAdvancedSubtensor
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float16"  # floatX?
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
        super().setup_method()
def test_adv_subtensor():
    # Test the advancedsubtensor on gpu.
    shp = (2, 3, 4)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype=aesara.config.floatX).reshape(shp)
    idx1, idx2 = ivectors("idx1", "idx2")
    # Mixed indexing: integer vectors, newaxis (None), and a basic slice.
    idxs = [idx1, None, slice(0, 2, 1), idx2, None]
    x = shared(xval, name="x")
    expr = x[idxs]
    f = aesara.function([idx1, idx2], expr, mode=mode_with_gpu)
    assert (
        sum(
            [
                isinstance(node.op, GpuAdvancedSubtensor)
                for node in f.maker.fgraph.toposort()
            ]
        )
        == 1
    )
    idx1_val = [0, 1]
    idx2_val = [0, 1]
    rval = f(idx1_val, idx2_val)
    # NumPy fancy indexing is the reference.
    rep = xval[idx1_val, None, slice(0, 2, 1), idx2_val, None]
    assert np.allclose(rval, rep)
class TestGpuExtractDiag:
    """GPU ExtractDiag: optimizer lifting plus value checks against
    ``numpy.diagonal`` on matrices and 4-d tensors."""

    def test_extractdiag_opt(self):
        # A CPU ExtractDiag should be lifted to GpuExtractDiag by the
        # optimizer under the GPU mode.
        x = matrix()
        fn = aesara.function([x], ExtractDiag()(x), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuExtractDiag) for node in fn.maker.fgraph.toposort()]
        )

    def test_matrix(self):
        # Main diagonal plus positive and negative offsets on a 7x11 matrix.
        x = matrix()
        np_x = np.arange(77).reshape(7, 11).astype(aesara.config.floatX)
        fn = aesara.function([x], GpuExtractDiag()(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal())
        fn = aesara.function([x], GpuExtractDiag(2)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal(2))
        fn = aesara.function([x], GpuExtractDiag(-3)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal(-3))

    def test_tensor(self):
        x = tensor4()
        np_x = np.arange(30107).reshape(7, 11, 17, 23).astype(aesara.config.floatX)
        # (offset, axis1, axis2) combinations covering positive/negative
        # offsets and a variety of axis orderings.
        for offset, axis1, axis2 in [
            (1, 0, 1),
            (-1, 0, 1),
            (0, 1, 0),
            (-2, 1, 0),
            (-3, 1, 0),
            (-2, 2, 0),
            (3, 3, 0),
            (-1, 3, 2),
            (2, 2, 3),
            (-1, 2, 1),
            (1, 3, 1),
            (-1, 1, 3),
        ]:
            assert np.allclose(
                GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
                np_x.diagonal(offset, axis1, axis2),
            )

    def test_tensor_float16(self):
        # Same cases as test_tensor, on the float16 path.
        x = tensor4()
        np_x = np.arange(30107).reshape(7, 11, 17, 23).astype("float16")
        for offset, axis1, axis2 in [
            (1, 0, 1),
            (-1, 0, 1),
            (0, 1, 0),
            (-2, 1, 0),
            (-3, 1, 0),
            (-2, 2, 0),
            (3, 3, 0),
            (-1, 3, 2),
            (2, 2, 3),
            (-1, 2, 1),
            (1, 3, 1),
            (-1, 1, 3),
        ]:
            assert np.allclose(
                GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
                np_x.diagonal(offset, axis1, axis2),
            )
class TestGpuAllocDiag(TestAllocDiag):
    """Run the generic AllocDiag suite with the GPU op, plus GPU-specific
    checks for optimizer lifting, forward values, and gradients."""

    def setup_method(self):
        self.alloc_diag = GpuAllocDiag
        self.mode = mode_with_gpu
        super().setup_method()

    def test_allocdiag_opt(self):
        # A CPU AllocDiag should be lifted to GpuAllocDiag by the optimizer.
        x = vector()
        fn = aesara.function([x], AllocDiag()(x), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuAllocDiag) for node in fn.maker.fgraph.toposort()]
        )

    def test_matrix(self):
        # Forward values must match numpy.diag for zero/positive/negative
        # offsets.
        x = vector()
        np_x = np.arange(7).astype(aesara.config.floatX)
        fn = aesara.function([x], GpuAllocDiag()(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x))
        fn = aesara.function([x], GpuAllocDiag(2)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x, 2))
        fn = aesara.function([x], GpuAllocDiag(-3)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x, -3))

    def _check_grad(self, offset):
        # Shared body for the gradient checks: d(sum)/dx must equal the
        # ``offset`` diagonal of d(sum)/d(GpuAllocDiag(offset)(x)).
        x = vector()
        np_x = np.random.randn(7).astype(aesara.config.floatX)
        mtx_x = GpuAllocDiag(offset)(x)
        sum_mtx_x = at_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)
        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)
        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, offset)
        assert np.allclose(computed_grad_x, true_grad_x)

    def test_grad(self):
        # offset = 0, offset > 0 and offset < 0 cases — previously three
        # verbatim copies of the same 12-line check.
        for offset in (0, 2, -3):
            self._check_grad(offset)
# assert
import os
from pickle import Unpickler
import numpy as np
import pytest
import aesara
from aesara.compile.ops import DeepCopyOp, ViewOp
from aesara.configdefaults import config
from aesara.gpuarray.type import GpuArrayType, gpuarray_shared_constructor
from aesara.tensor.basic import Rebroadcast
from aesara.tensor.shape import specify_shape
from aesara.tensor.type import row
from tests.gpuarray.config import test_ctx_name
from tests.gpuarray.test_basic_ops import rand_gpuarray
pygpu = pytest.importorskip("pygpu")
# Disabled for now
# from tests.tensor.test_sharedvar import makeSharedTester
def test_deep_copy():
    """An identity function on a GPU input compiles to a DeepCopyOp and
    preserves the array's values, for float16 and float32."""
    for dtype in ("float16", "float32"):
        a = rand_gpuarray(20, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        f = aesara.function([g], g)
        assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_view():
    """A ViewOp on a GPU array survives compilation (with the view-removal
    optimization explicitly excluded) and returns equal values."""
    for dtype in ("float16", "float32"):
        a = rand_gpuarray(20, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        # Exclude the opt that would otherwise strip the ViewOp.
        m = aesara.compile.get_default_mode().excluding("local_view_op")
        f = aesara.function([g], ViewOp()(g), mode=m)
        assert isinstance(f.maker.fgraph.toposort()[0].op, ViewOp)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_rebroadcast():
    """Rebroadcast of dimension 0 works on a GPU array and keeps values."""
    for dtype in ("float16", "float32"):
        # Length-1 array so dimension 0 can legally be marked broadcastable.
        a = rand_gpuarray(1, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        f = aesara.function([g], Rebroadcast((0, True))(g))
        assert isinstance(f.maker.fgraph.toposort()[0].op, Rebroadcast)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_values_eq_approx():
    """values_eq_approx is reflexive and sensitive to a single changed element."""
    ref = rand_gpuarray(20, dtype="float32")
    # An array always compares approximately equal to itself.
    assert GpuArrayType.values_eq_approx(ref, ref)
    # Shifting one element breaks approximate equality.
    shifted = ref.copy()
    shifted[0] = np.asarray(shifted[0]) + 1.0
    assert not GpuArrayType.values_eq_approx(ref, shifted)
    # Flipping the sign of one element breaks it as well.
    negated = ref.copy()
    negated[0] = -np.asarray(negated[0])
    assert not GpuArrayType.values_eq_approx(ref, negated)
def test_specify_shape():
    """specify_shape accepts a GPU array whose run-time shape matches."""
    for dtype in ("float16", "float32"):
        a = rand_gpuarray(20, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        f = aesara.function([g], specify_shape(g, [20]))
        f(a)
def test_filter_float():
    """A GPU shared scalar must accept a plain Python float in ``updates``."""
    # Temporarily register the GPU constructor as a default shared
    # constructor; undo it in ``finally`` so other tests are unaffected.
    aesara.compile.shared_constructor(gpuarray_shared_constructor)
    try:
        s = aesara.shared(np.array(0.0, dtype="float32"), target=test_ctx_name)
        aesara.function([], updates=[(s, 0.0)])
    finally:
        del aesara.compile.sharedvalue.shared.constructors[-1]
def test_filter_variable():
    # Test that filter_variable accepts more restrictive broadcast
    gpu_row = GpuArrayType(dtype=aesara.config.floatX, broadcastable=(True, False))
    gpu_matrix = GpuArrayType(dtype=aesara.config.floatX, broadcastable=(False, False))
    r = gpu_row()
    # A row (broadcastable first dim) is accepted where a matrix is expected.
    m = gpu_matrix.filter_variable(r)
    assert m.type == gpu_matrix
    # On CPU as well
    r = row()
    m = gpu_matrix.filter_variable(r)
    assert m.type == gpu_matrix
def test_gpuarray_shared_scalar():
    # By default, we don't put scalar as shared variable on the GPU
    with pytest.raises(TypeError):
        gpuarray_shared_constructor(np.asarray(1, dtype="float32"))
    # But we can force that
    gpuarray_shared_constructor(np.asarray(1, dtype="float32"), target=test_ctx_name)
def test_unpickle_gpuarray_as_numpy_ndarray_flag0():
    """With unpickle-on-CPU disabled, a pickled GpuArray must come back as a
    pygpu GpuArray with its original contents (fixture: GpuArray.pkl next to
    this test file, first element -42.0)."""
    # Test when pygpu isn't there for unpickle are in test_pickle.py
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = False
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            u = Unpickler(fp, encoding="latin1")
            mat = u.load()
        assert isinstance(mat, pygpu.gpuarray.GpuArray)
        assert np.asarray(mat)[0] == -42.0
    finally:
        # Always restore the flag, even if the assertions fail.
        config.experimental__unpickle_gpu_on_cpu = oldflag
# These tests are disabled because they expect the impossible
# @makeSharedTester(
# shared_constructor_=gpuarray_shared_constructor,
# dtype_=aesara.config.floatX,
# get_value_borrow_true_alias_=True,
# shared_borrow_true_alias_=True,
# set_value_borrow_true_alias_=True,
# set_value_inplace_=True,
# set_cast_value_inplace_=False,
# shared_constructor_accept_ndarray_=True,
# internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray),
# test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
# aesara_fct_=aesara.tensor.exp,
# ref_fct_=np.exp,
# cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray))
# class TestSharedOptions(object):
# pass
# @makeSharedTester(
# shared_constructor_=gpuarray_shared_constructor,
# dtype_=aesara.config.floatX,
# get_value_borrow_true_alias_=False,
# shared_borrow_true_alias_=False,
# set_value_borrow_true_alias_=False,
# set_value_inplace_=True,
# set_cast_value_inplace_=True,
# shared_constructor_accept_ndarray_=True,
# internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray),
# test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
# aesara_fct_=aesara.tensor.exp,
# ref_fct_=np.exp,
# cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray))
# class TestSharedOptions2(object):
# pass
def test_set_value_non_contiguous():
    """``set_value(..., borrow=True)`` with a strided (non-contiguous) GPU
    array must work, as must a subsequent plain ``set_value``."""
    s = gpuarray_shared_constructor(np.asarray([[1.0, 2.0], [1.0, 2.0], [5, 6]]))
    # Every-other-row view is non-contiguous.
    s.set_value(s.get_value(borrow=True, return_internal_type=True)[::2], borrow=True)
    assert not s.get_value(borrow=True, return_internal_type=True).flags["C_CONTIGUOUS"]
    # In the past, this failed
    s.set_value([[0, 0], [1, 1]])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论